Import video in form of an numpy array in pytorch

I have an video in form of numpy array (Picture Number,x,y). And my mouse events corresponding to the pictures (Number,x,y,click). Now i tried the Approach in my code which oblivious doesn’t work. I couldn’t find anything specific in the internet either. Can someone help me how i import the dataset properly?
My code so far:

import torch
import torchvision
import torchvision.transforms as transforms


from PIL import Image
from CustomDataset import CustomMouseDataset,Rescale


def load_data():
#    transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5,0.5,0.5),(0.5,0.5,0.5))])
#    
#    #Load Recoreded Data
#    with h5py.File('data/video_data_22_7_2018_17_46','r') as data:
#        video = data['video'][()]
#        mouse = data['mouse'][()]
#    video = video[:50]
#    mouse = mouse[:50]    
    transform = transforms.Compose([transforms.ToTensor()])
    train_data = CustomMouseDataset('data/video_data_22_7_2018_17_46',transform)
    
    
    train_loader = torch.utils.data.DataLoader(train_data,batch_size=10,shuffle=True)
    
    return train_loader

import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1,10, 560, 656)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(656, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
net = Net()
train_data = load_data()

import torch.optim as optim
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr = 0.001, momentum = 0.9)

#Train the Network
for epoch in range(2):
    running_loss = 0.0
    for i,data in enumerate(train_data,0):
        inputs = data['frame']
        labels = data['mouse']
        print(type(inputs))
        print(inputs)
        print(inputs.shape)

        #Zero gradients Parameter
        optimizer.zero_grad()
        
        #forward + backward +optimize
        outputs = net(inputs)
        loss = criterion(outputs,labels)
        loss.backward()
        optimizer.step()
        if i % 4 == 1:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 5))
            running_loss = 0.0

print('Finished Training')

And the Dataloader


import h5py
import torchvision
from PIL import Image


def load_data(h5py_file):

        with h5py.File(h5py_file,'r') as data:
            video = data['video'][()]
            mouse = data['mouse'][()]
        video = video[:50]
        mouse = mouse[:50]    

        return video,mouse

class CustomMouseDataset():
    """Dataloader for Custom dataset"""
    
    
    
    def __init__(self, h5py_file,transform = None):
        
        self.video,self.mouse = load_data(h5py_file)
        self.transform = transform
        
    
    def __len__(self):
        
        return len(self.mouse)
    
    def __getitem__(self,idx):
        frame = self.video[idx]
        frame = Image.fromarray(frame)
        frame = self.transform(frame)
        
        mouse = self.mouse[idx]
        
        
        sample= {'frame': frame, 'mouse': mouse}
        
#        if self.transform:
#            sample = self.transform(sample)
        return sample
class Rescale(object):
    
    def __init__(self,output_size):
        assert isinstance(output_size,(int,tuple))
        self.output_size = output_size
    
    def __call__(self,sample):
        frame, mouse = sample['frame'],sample['mouse']
        h,w = frame.shape[:2]
        if isinstance(self.output_size, int):
            if h > w:
                new_h, new_w = self.output_size * h / w, self.output_size
            else:
                new_h, new_w = self.output_size, self.output_size * w / h
        else:
            new_h, new_w = self.output_size

        new_h, new_w = int(new_h), int(new_w)

        fra = transform.resize(frame, (new_h, new_w))
        
        return {'image':fra,'mouse':mouse}

Greeting Losspost

So just to clarify, your input is a video sequence and your output is a classification of where and when there is a click on the video? Do you have multiple videos, or just one video? Are you trying to model the video as a sequence, or individual frames?

The best way for you is to create a Dataset class for your specific data, which will enable you to use PyTorch Dataloader. Furthermore, you can use the tremendous amount of neat tutorials to train a/the model and tests it your data.

The idea is that later on. The Network get an live video as input. And corresponding to the video it should move the mouse and click (A bot for a game). Yes i will have multible videos and the video is stored in a numpy array for the format (Number of Images, width of the image,height of the image).I made a screenshot and the append the nummpy array to get the video.

The primary consideration then is how much previous information is important in your model’s current decision? Say we are at time t, does the state of the video at time t-1, t-2, … matter in your solution?

If not, then this is a simple image classification problem. You can imagine every frame where there is no click to be one in which the outcome is 0 and each from where there is a click to be one where the outcome is 1. When your model predicts 1, you can also predict the x and y coordinates of your outcome.

Now lets say this is your case. What you can do is load up any number of frames and use a standard ResNet to classify each frame as needing a click or not. Unless there is a need for a different type of model, this is where I’d start.

Once you have a model that is converging for predicting which frames need to be clicked, I would then take all frames that need click and use a resnet, this time with a pair of linear outcomes, to predict what the x and y corrdinates of the click need to be.

Now if you need information from previous frames, you’re going to need to look into some sequence modeling type approaches like an RNN. This is a little trickier, but overall not much. There seem to be a good number of examples on the internet that can help with that problem.

Ok Good to know. I would probably need the second idea but i think i just try the first one first. What makes a problem for me right now it to bring this numpy arrays in the right form.

What I would start with is one video loaded into a matrix of shape (num frames, num channels, height, width) as X and an array or size (num frames) with a 1 if there was a click on that frame and a 0 if ther was not.

From there I would follow any pytorch resnet tutorial and use one of their retrained models with the number of output classes equal to 2.

The Data already exist. But loading the data properly is the problem this is what i have so far. But it doesn’t really work


import h5py

def load_data(h5py_file):

        with h5py.File(h5py_file,'r') as data:
            video = data['video'][()]
            mouse = data['mouse'][()]
        video = video[:50]
        mouse = mouse[:50]    

        return video,mouse

class CustomMouseDataset():
    """Dataloader for Custom dataset"""
    
    
    
    def __init__(self, h5py_file,transform = None):
        
        self.video,self.mouse = load_data(h5py_file)
    
    def __len__(self):
        
        return len(self.mouse)
    
    def __getitem(self,idx):
        frame = self.video[idx]
        
        mouse = self.mouse[idx]
        
        sample= {'frame': frame, 'mouse': mouse}
        
        if self.transform:
            sample = self.transform(sample)
        return sample

What error are you getting?

TypeError: ‘CustomMouseDataset’ object does not support indexing

runfile('D:/Nextcloud/Python/Gamebot/model.py', wdir='D:/Nextcloud/Python/Gamebot')
Traceback (most recent call last):

  File "<ipython-input-11-4f1765267e06>", line 1, in <module>
    runfile('D:/Nextcloud/Python/Gamebot/model.py', wdir='D:/Nextcloud/Python/Gamebot')

  File "D:\Programme\Anaconda\envs\Deeplearning\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 678, in runfile
    execfile(filename, namespace)

  File "D:\Programme\Anaconda\envs\Deeplearning\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 106, in execfile
    exec(compile(f.read(), filename, 'exec'), namespace)

  File "D:/Nextcloud/Python/Gamebot/model.py", line 66, in <module>
    for i,data in enumerate(train_data,0):

  File "D:\Programme\Anaconda\envs\Deeplearning\lib\site-packages\torch\utils\data\dataloader.py", line 264, in __next__
    batch = self.collate_fn([self.dataset[i] for i in indices])

  File "D:\Programme\Anaconda\envs\Deeplearning\lib\site-packages\torch\utils\data\dataloader.py", line 264, in <listcomp>
    batch = self.collate_fn([self.dataset[i] for i in indices])

TypeError: 'CustomMouseDataset' object does not support indexing

You need two underscores around both sides of getitem. __getitem__

Ok i changed it but now it complains that it is a string?

TypeError: conv2d(): argument ‘input’ (position 1) must be Tensor, not str
There are no strings in my data

or do i have To Transform it first?

Can you post your whole code?

Ok so far i seem to make progress. The Numpy is apperently no in the right form. But i assume my Network is somehow wrong?
I added my Full code to my original Post.
my error:


<class 'torch.Tensor'>
tensor([[[[ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
        
          [ 0.8745,  0.8745,  0.8745,  ...,  0.8745,  0.8745,  0.8745]]],

          [ 0.8745,  0.8745,  0.8745,  ...,  0.8745,  0.8745,  0.8745]]]])
torch.Size([10, 1, 560, 656])
Traceback (most recent call last):

  File "<ipython-input-2-4f1765267e06>", line 1, in <module>
    runfile('D:/Nextcloud/Python/Gamebot/model.py', wdir='D:/Nextcloud/Python/Gamebot')

  File "D:\Programme\Anaconda\envs\DeepLearning\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 678, in runfile
    execfile(filename, namespace)

  File "D:\Programme\Anaconda\envs\DeepLearning\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 106, in execfile
    exec(compile(f.read(), filename, 'exec'), namespace)

  File "D:/Nextcloud/Python/Gamebot/model.py", line 79, in <module>
    outputs = net(inputs)

  File "D:\Programme\Anaconda\envs\DeepLearning\lib\site-packages\torch\nn\modules\module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)

  File "D:/Nextcloud/Python/Gamebot/model.py", line 50, in forward
    x = self.pool(F.relu(self.conv1(x)))

  File "D:\Programme\Anaconda\envs\DeepLearning\lib\site-packages\torch\nn\modules\module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)

  File "D:\Programme\Anaconda\envs\DeepLearning\lib\site-packages\torch\nn\modules\pooling.py", line 142, in forward
    self.return_indices)

  File "D:\Programme\Anaconda\envs\DeepLearning\lib\site-packages\torch\nn\functional.py", line 360, in max_pool2d
    ret = torch._C._nn.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)

RuntimeError: Given input size: (10x1x1). Calculated output size: (10x0x0). Output size is too small at c:\programdata\miniconda3\conda-bld\pytorch_1524546371102\work\aten\src\thnn\generic/SpatialDilatedMaxPooling.c:67

Is there any reason you are using a custom network and not something off the shelf like ResNet?

That being said, what is the architecture you’re expecting?

I doesnt use a Custom i used the example from the Pytorch Example. And what do you mean with what architecture i am expecting?

Can you link to that example?

Of course:
https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#sphx-glr-beginner-blitz-cifar10-tutorial-py
The Second Part

The class Net there looks a little different than yours -

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

Well yeah because i have another input and output size of my image. Or do i get something wrong?
Because for input i have 10 batches a 1 channel(gray iamge) and then 560x656.
And as output i need a vector of x,y,click