nn.GRU with mask

Hello,

I'm wondering how to implement the Keras GRU approach below with PyTorch's nn.GRU.

    frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    # Refer to the following tutorial to understand the significance of using `mask`:
    # https://keras.io/api/layers/recurrent_layers/gru/
    x = keras.layers.GRU(16, return_sequences=True)(
        frame_features_input, mask=mask_input
    )

The most important part is the mask definition.

From the Keras docs:

mask: Binary tensor of shape [samples, timesteps] indicating whether a given timestep should be masked (optional). An individual True entry indicates that the corresponding timestep should be utilized, while a False entry indicates that the corresponding timestep should be ignored. Defaults to None.
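
So for a batch of two clips padded to length 5, where the first has 3 valid frames and the second has 5, the mask would look like this (a toy example of mine, not from the tutorial):

import torch

# Keras-style mask: True = use this timestep, False = ignore it
mask = torch.tensor([
    [True, True, True, False, False],   # sequence 0: 3 valid timesteps
    [True, True, True, True,  True],    # sequence 1: fully valid
])
print(mask.shape)  # torch.Size([2, 5]) -> [samples, timesteps]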

Something like this should work:

import torch
import torch.nn as nn

# set variables
features = 16
hidden_dim = 32
seq_len = 128
batch_size = 64

model = nn.GRU(features, hidden_dim, bias=False)

# generate an input
dummy_input = torch.rand((seq_len, batch_size, features))

# generate a mask; note that here True marks a *masked* timestep, which is
# the inverse of the Keras convention (where True means "use this step")
mask = torch.rand((seq_len, batch_size, 1))>0.5 # ~50% masked values
mask = mask.repeat(1,1,features) # repeat the same mask across the feature dimension

# apply the mask
dummy_input_masked = torch.where(mask, 0, dummy_input) # fill masked values with 0
print(dummy_input_masked)

# make the initial hidden state
h0 = torch.rand(1, batch_size, hidden_dim)

# put the masked input and initial hidden state into the model
output, hn = model(dummy_input_masked, h0)

I don't know if your answer covers my needs, because at the end you pass a masked input, whereas what the Keras GRU does (in my opinion) is apply some kind of attention based on the given mask.

My features:

FEATURE
tensor([[[0.6853, 0.7958, 0.0017,  ..., 0.1823, 0.5553, 0.2935],
         [0.6637, 0.8767, 0.0271,  ..., 0.1365, 0.3762, 0.1650],
         [0.5990, 0.7948, 0.0245,  ..., 0.1462, 0.4408, 0.1912],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],

        [[0.3059, 1.4664, 0.0243,  ..., 0.6032, 0.6973, 0.3338],
         [0.3780, 1.1796, 0.1137,  ..., 0.3834, 0.7203, 0.1014],
         [0.2908, 1.2995, 0.1637,  ..., 0.5579, 0.8582, 0.2847],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]]]) torch.Size([2, 112, 2048])
MASK
tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         False, False, False, False, False, False, False, False, False, False,
         False, False],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False]]) torch.Size([2, 112])
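
Since the True entries sit contiguously at the start of each sequence, this mask boils down to per-sequence lengths; a quick check (assuming mask is the [2, 112] tensor printed above):

lengths = mask.sum(dim=1)
print(lengths)  # tensor([100, 61]) -> number of valid frames per video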

My code is based on a Colab implementation for video sequence analysis: they take features from a CNN model (Inception V3), where the features are [SEQ_LEN, 2048], and they want to pass them to a GRU net.

link → keras_colab_video_cls

It looks like the mask was intended to be used to ignore any frames that were missing.

It does not appear to be an attention mask, like in a Transformer/LLM. The mask-building code in that notebook is just a loop that checks whether each frame exists, inserting a 1 if it does and defaulting to 0. You could recreate a similar loop with PyTorch for your dataset, then return the mask from the dataloader and apply it from there.

I did that:

feature_extractor = timm.create_model('inception_v3', pretrained=True, num_classes=0, global_pool='avg')
feature_extractor.eval()


class LoadData(Dataset):
    def __init__(self, data_frame, feature_extractor, transformer=None):
        self.data_frame = data_frame
        self.feature_extractor = feature_extractor
        self.transformer = transformer

    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, index):
        dumped_frames_path = self.data_frame.iloc[index]['dumped_frames_path']
        frames = self.load_frames(dumped_frames_path, transformer=self.transformer)
        features = self.extract_features(frames)  # returns a (frame_features, frame_masks) tuple
        label = int(self.data_frame.iloc[index]['cls_labels'])
        return features, label

    def extract_features(self, frames):
        # Keras-style mask: True where a real frame exists, False for padding
        frame_masks = torch.zeros(size=(1, MAX_SEQ_LENGTH), dtype=torch.bool)
        frame_features = torch.zeros(size=(1, MAX_SEQ_LENGTH, NUM_FEATURES))
        frames = frames[None, ...]  # add a leading batch dimension of 1
        for i, batch in enumerate(frames):
            video_length = batch.shape[0]
            length = min(MAX_SEQ_LENGTH, video_length)
            with torch.no_grad():
                for j in range(length):
                    frame_features[i, j, :] = self.feature_extractor(batch[None, j, :])
                frame_masks[i, :length] = 1
        frame_masks = frame_masks.squeeze()
        frame_features = frame_features.squeeze()
        return frame_features, frame_masks

    @staticmethod
    def load_frames(path, max_frames=0, transformer=None):
        frames = []
        for img in (p for p in os.listdir(path) if p.endswith('.png')):
            frame = Image.open(os.path.join(path, img))
            if transformer is not None:
                frame = transformer(frame)
            frames.append(frame)
            if len(frames) == max_frames:
                break
        if not frames:
            raise FileNotFoundError(f'Cannot list path: {path}')
        return torch.stack(frames)


train_data = LoadData(train[:100], feature_extractor, transformer=transforms)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, pin_memory=True, drop_last=True)
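
With this Dataset, the default collate keeps the nested tuple, so each batch unpacks as ((frame_features, frame_masks), labels); a quick shape check (a sketch against the loader above):

(frame_features, frame_masks), labels = next(iter(train_loader))
print(frame_features.shape)  # torch.Size([32, MAX_SEQ_LENGTH, NUM_FEATURES])
print(frame_masks.shape)     # torch.Size([32, MAX_SEQ_LENGTH])
print(labels.shape)          # torch.Size([32])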

and the model:

class RnnGru(nn.Module):
    def __init__(self, input_dim, hidden_dim, n_layers, num_classes, drop_prob=0.4):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.gru = nn.GRU(input_dim, hidden_dim, n_layers, batch_first=True, dropout=drop_prob)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        h0 = torch.zeros(self.n_layers, x.size(0), self.hidden_dim, device=x.device)

        out, _ = self.gru(x, h0)
        out = self.relu(out)
        out = self.fc(out[:, -1, :])
        return out
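
One caveat with this model: out[:, -1, :] reads the GRU output at the final timestep, which for a padded video is a padding position. A sketch of gathering the output at each sequence's last valid timestep instead (the mask/lengths names are mine; you would pass them into forward alongside x):

import torch

batch_size, seq_len, hidden_dim = 2, 112, 32
out = torch.rand(batch_size, seq_len, hidden_dim)  # GRU output, batch_first
mask = torch.zeros(batch_size, seq_len, dtype=torch.bool)
mask[0, :100] = True  # video 0: 100 valid frames
mask[1, :61] = True   # video 1: 61 valid frames

lengths = mask.sum(dim=1)                                     # tensor([100, 61])
idx = (lengths - 1).view(-1, 1, 1).expand(-1, 1, hidden_dim)  # [B, 1, H]
last_valid = out.gather(1, idx).squeeze(1)                    # [B, H]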

but I still cannot understand what I should pass as the model input.

1. Should I do this during training:

dummy_input_masked = torch.where(frame_masks, 0, frame_features)
output, hn = model(dummy_input_masked)

I just read the mask part, so I assume I have to use packing.


If you are using the packed sequence route, something like this might work:

import torch
import torch.nn as nn

# set variables
features = 16
hidden_dim = 32
seq_len = 128
batch_size = 64

model = nn.GRU(features, hidden_dim, bias=False)

# generate an input
dummy_input = torch.rand((seq_len, batch_size, features))

# generate a mask; as above, True marks a *masked* timestep (the inverse of
# the Keras convention); a random mask is only for demonstration, since
# pack_padded_sequence assumes each sequence's valid steps come first
mask = torch.rand((seq_len, batch_size, 1))>0.5 # ~50% masked values

# get lengths: the number of *valid* (unmasked) timesteps per sequence
lengths = torch.sum(~mask.squeeze(2), 0)

mask = mask.repeat(1,1,features) # repeat the same mask across the feature dimension

# apply the mask
dummy_input_masked = torch.where(mask, 0, dummy_input) # fill masked values with 0

# pack the sequence so the GRU only processes the valid timesteps
dummy_input_packed = torch.nn.utils.rnn.pack_padded_sequence(dummy_input_masked, lengths, enforce_sorted=False)

print(dummy_input_packed)

# make the initial hidden state
h0 = torch.rand(1, batch_size, hidden_dim)

# put the packed input and initial hidden state into the model
output, hn = model(dummy_input_packed, h0)

print(output)
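
If you need a padded tensor back from the packed output, pad_packed_sequence inverts the packing (a sketch continuing from the variables above):

# unpack to [seq_len, batch_size, hidden_dim]; steps beyond each length are zeros
unpacked, out_lengths = torch.nn.utils.rnn.pad_packed_sequence(output, total_length=seq_len)
print(unpacked.shape)  # torch.Size([128, 64, 32])

Note that with a packed input, hn already holds the hidden state at each sequence's last valid step, so for classification you can often just use hn[-1].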