Get frames from human recognition dataset

Hello, I’m trying to convert the videos from KTH dataset (Recognition of human actions) as it is said here (KTH-Dataset/sequences_list.txt at master · tejaskhot/KTH-Dataset · GitHub). I just got the data in the way the authors did, by making it by hand. At this moment, my code is the one below it is working, but not when it gets to the training. Can anyone help me? Thanks in advance.

# Import modules
import torch 
from numpy import random
import matplotlib.pyplot as plt
import torchvision
from torchvision import transforms 
from torchvision import datasets
from torch import optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os

from PIL import Image
from matplotlib import pyplot as plt
import numpy as np
import requests
import torch
from torchvision import models
from torchvision import transforms
import torch.nn as nn
from tqdm import tqdm
import shutil
from urllib.request import urlretrieve
from torch.utils.data import Dataset, DataLoader

#Class labels
classes = {'boxing':0, 'handclapping':1, 'handwaving':2, 'jogging':3, 'running':4, 'walking':5}
#Dataset
from google.colab import drive
drive.mount('/content/drive/', force_remount=True) 
import glob
folders_train = glob.glob('/content/drive/MyDrive/HumanActions_new_data/TRAIN')

count = 0
for i in range(0,len(folders_train)):
  video_data = folders_train[i]
  video = cv2.VideoCapture(video_data)
  success = True
  while success:
    success,image = video.read()
   name = '/content/drive/MyDrive/HumanActions_new_data/TRAIN/Frames'+str(count)+'.jpg'
   if success == True:
      cv2.imwrite(name,image)
      print('Frame {} Extracted Successfully'.format(count))
      count+=1
   else:
      i = i+1  
      i = i+1
print('\n\n\nVideo {} Extracted Successfully\n\n\n'.format(video_data))

train_dataloader = DataLoader('/content/drive/MyDrive/HumanActions_new_data/TRAIN', 
batch_size=16, shuffle=True)

And then I did the exactly same thing for validation and test datasets, and it works. But then I created a RNN and it gives an error

 #Set device
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#Hyperparameters
input_size = 120
sequence_length = 28
num_layers = 2
hidden_size = 256
num_classes = 6
learning_rate = 0.001
batch_size = 64
num_epochs = 2

import torch.nn as nn

class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
         super(RNNModel, self).__init__()
         self.hidden_size = hidden_size
         self.num_layers = num_layers
         self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
         self.fc = nn.Linear(hidden_size*sequence_length, num_classes)
    
    def forward(self, x):
         # Initialize hidden state with zeros
         h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
     
         # One time step
         out, _ = self.rnn(x, h0)
         out = out.reshape(out.shape[0], -1)
         return out

model = RNNModel(input_size, hidden_size, num_layers, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
for data, targets in enumerate(train_dataloader, 1):
    # Get data to cuda if possible
    data = data.to(device)
    targets = targets.to(device=device)

    # Get to correct shape
    data = data.reshape(data.shape[0], -1)

    # Forward
    scores = model(data)
    loss = criterion(scores, targets)

    # Backward
    optimizer.zero_grad()
    loss.backward()

    # Gradient descent/ Adam step
    optimizer.step()

AttributeError Traceback (most recent call last)
in ()
2 for data, targets in enumerate(train_dataloader, 1):
3 # Get data to cuda if possible
----> 4 data = data.to(device)
5 targets = targets.to(device=device)
6

AttributeError: ‘int’ object has no attribute ‘to’

I thought it could be because train_dataset is not a tensor, but I don’t know how to convert it into a tensor, I’ve tried to put frames in a list and then convert it into a tensor, but it takes too long. Thanks in advance.

What is the type() of data and targets here?

I’m not sure it is sufficient to pass a path as the dataset argument to construct a DataLoader, which could be responsible for the unexpected or bogus types of data and targets. I would suggest taking a look at implementing a dataset(torch.utils.data — PyTorch 1.12 documentation) that matches the format/task you are attempting to solve. It might also be useful to see if existing video classification or vision dataset implementations could be adapted to this task: Datasets — Torchvision main documentation.

Thanks, I’ll see that

EDIT: I’ve decided to change my approach, because I wasn’t getting anywhere with the code above. So, I decided to create a custom dataset, but for videos. At this moment the code is this (RNN and training is still the same)

 import torchvision 

class VideoDataset(Dataset):
    def __init__(self, path, transforms): #ids, labels): 
         # ids: "video name"     
         #self.labels = labels
         self.path = path
         self.transforms = transforms
         #self.ids = ids
    def __len__(self):
        return len(self.path)
    def __getitem__(self, idx):
        video = torchvision.io.read_video(self.path[idx])[0] # [T, H, W, C]
        video = np.transpose(video) # [C, T, H, W]
        video.seek(5.0)
        self.frame = next(video)
        for self.frame in video:
            self.frames.append(self.frame['data'])

 train_transformer = transforms.Compose([
        transforms.RandomHorizontalFlip(p=0.5),  
        transforms.RandomAffine(degrees=0, translate=(0.1,0.1)),    
        transforms.ToTensor()
        ]) 

 train = VideoDataset('/content/drive/MyDrive/HumanActions_new_data/TRAIN', 
              transforms=train_transformer)
 val = VideoDataset('/content/drive/MyDrive/HumanActions_new_data/VALIDATION', 
              transforms=train_transformer)
 test = VideoDataset('/content/drive/MyDrive/HumanActions_new_data/TEST', 
              transforms=train_transformer)

 train_dataloader = DataLoader(train, batch_size=16)
 val_dataloader = DataLoader(val, batch_size=16)
 test_dataloader = DataLoader(test, batch_size=16)

But when I’m going to train it appears

ImportError Traceback (most recent call last)
in ()
1 for epoch in range(num_epochs):
----> 2 for data, targets in enumerate(train_dataloader):
3 # Get data to cuda if possible
4 data = data.to(device)
5 targets = targets.to(device=device)

6 frames
/usr/local/lib/python3.7/dist-packages/torchvision/io/video.py in _check_av_available()
40 def _check_av_available() → None:
41 if isinstance(av, Exception):
—> 42 raise av
43
44

ImportError: PyAV is not installed, and is necessary for the video operations in torchvision.
See GitHub - PyAV-Org/PyAV: Pythonic bindings for FFmpeg's libraries. for instructions on how to
install PyAV on your system.

So I installed PyAV through !pip3 install av, but the error persists

Do you think you can help, @eqy ?

Is there some issue with different environments? !pip3 install ... sounds like it is running in some kind of colab/notebook environment which I am not familiar with.