[VGG 11] RuntimeError: shape '[0, -1]' is invalid for input of size 1605632

Hi!

I have a problem with the VGG 11 model and a federated data loader. During training, I get a runtime error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-17-448339389201> in <module>
      1 for epoch in range(1, args.epochs + 1):
----> 2     train(args, model, device, federated_train_loader, optimizer, epoch)

<ipython-input-14-9b8111af22ce> in train(args, model, device, train_loader, optimizer, epoch)
      5         data, target = data.to(device), target.to(device)
      6         optimizer.zero_grad()
----> 7         output = model(data)
      8         loss = F.nll_loss(output, target)
      9         loss.backward()

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    491             result = self._slow_forward(*input, **kwargs)
    492         else:
--> 493             result = self.forward(*input, **kwargs)
    494         for hook in self._forward_hooks.values():
    495             hook_result = hook(self, input, result)

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/torchvision/models/vgg.py in forward(self, x)
     42         x = self.features(x)
     43         x = self.avgpool(x)
---> 44         x = x.view(x.size(0), -1)
     45         x = self.classifier(x)
     46         return x

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/hook/hook.py in overloaded_native_method(self, *args, **kwargs)
    675                 # Send the new command to the appropriate class and get the response
    676                 method = getattr(new_self, method_name)
--> 677                 response = method(*new_args, **new_kwargs)
    678 
    679                 # For inplace methods, just directly return self

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/hook/hook.py in overloaded_pointer_method(self, *args, **kwargs)
    511             command = (attr, self, args, kwargs)
    512 
--> 513             response = owner.send_command(location, command)
    514 
    515             return response

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/workers/base.py in send_command(self, recipient, message, return_ids)
    425 
    426         try:
--> 427             ret_val = self.send_msg(codes.MSGTYPE.CMD, message, location=recipient)
    428         except ResponseSignatureError as e:
    429             ret_val = None

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/workers/base.py in send_msg(self, msg_type, message, location)
    221 
    222         # Step 2: send the message and wait for a response
--> 223         bin_response = self._send_msg(bin_message, location)
    224 
    225         # Step 3: deserialize the response

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/workers/virtual.py in _send_msg(self, message, location)
      8 class VirtualWorker(BaseWorker, FederatedClient):
      9     def _send_msg(self, message: bin, location: BaseWorker) -> bin:
---> 10         return location._recv_msg(message)
     11 
     12     def _recv_msg(self, message: bin) -> bin:

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/workers/virtual.py in _recv_msg(self, message)
     11 
     12     def _recv_msg(self, message: bin) -> bin:
---> 13         return self.recv_msg(message)
     14 
     15     @staticmethod

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/workers/base.py in recv_msg(self, bin_message)
    252             print(f"worker {self} received {sy.codes.code2MSGTYPE[msg_type]} {contents}")
    253         # Step 1: route message to appropriate function
--> 254         response = self._message_router[msg_type](contents)
    255 
    256         # Step 2: Serialize the message to simple python objects

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/workers/base.py in execute_command(self, message)
    363             else:
    364                 try:
--> 365                     response = getattr(_self, command_name)(*args, **kwargs)
    366                 except TypeError:
    367                     # TODO Andrew thinks this is gross, please fix. Instead need to properly deserialize strings

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/hook/hook.py in overloaded_native_method(self, *args, **kwargs)
    661                 except BaseException as e:
    662                     # we can make some errors more descriptive with this method
--> 663                     raise route_method_exception(e, self, args, kwargs)
    664 
    665             else:  # means that there is a wrapper to remove

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/hook/hook.py in overloaded_native_method(self, *args, **kwargs)
    655                 try:
    656                     if isinstance(args, tuple):
--> 657                         response = method(*args, **kwargs)
    658                     else:
    659                         response = method(args, **kwargs)

RuntimeError: shape '[0, -1]' is invalid for input of size 1605632

I use the https://susanqq.github.io/UTKFace/ dataset. Here is what my training data and labels look like (both are NumPy arrays):

Training data (224×224, 3-channel images):

array([[[[ 25.,  32.,  36., ...,  70.,  72.,  74.],
         [ 23.,  30.,  34., ...,  70.,  73.,  74.],
         [ 22.,  26.,  31., ...,  70.,  73.,  74.],
         ...,
         [  5.,   4.,   2., ...,   3.,   2.,   1.],
         [ 15.,  12.,   8., ...,   4.,   2.,   0.],
         [ 22.,  18.,  12., ...,   4.,   3.,   0.]],

        [[  4.,   8.,  12., ...,  44.,  46.,  48.],
         [  2.,   6.,  10., ...,  44.,  47.,  48.],
         [  1.,   5.,   7., ...,  44.,  47.,  48.],
         ...,
         [ 41.,  39.,  37., ...,   3.,   2.,   1.],
         [ 51.,  47.,  43., ...,   4.,   2.,   0.],
         [ 58.,  53.,  47., ...,   4.,   3.,   0.]],

        [[  1.,   6.,  10., ...,  17.,  19.,  21.],
         [  0.,   4.,   6., ...,  17.,  20.,  21.],
         [  0.,   0.,   3., ...,  17.,  20.,  21.],
         ...,
         [ 67.,  67.,  67., ...,   3.,   2.,   1.],
         [ 77.,  75.,  73., ...,   4.,   2.,   0.],
         [ 84.,  81.,  75., ...,   4.,   3.,   0.]]],


       [[[ 68.,  70.,  77., ..., 246., 246., 246.],
         [ 70.,  72.,  79., ..., 246., 246., 246.],
         [ 75.,  77.,  84., ..., 245., 246., 246.],
         ...,
         [194., 194., 194., ..., 238., 238., 238.],
         [195., 194., 194., ..., 238., 238., 238.],
         [195., 195., 194., ..., 238., 238., 238.]],

        [[ 34.,  36.,  43., ..., 246., 246., 246.],
         [ 36.,  38.,  45., ..., 246., 246., 246.],
         [ 41.,  43.,  50., ..., 245., 246., 246.],
         ...,
         [144., 144., 144., ..., 238., 238., 238.],
         [145., 144., 144., ..., 238., 238., 238.],
         [145., 145., 144., ..., 238., 238., 238.]],

        [[ 24.,  26.,  33., ..., 248., 248., 248.],
         [ 26.,  28.,  35., ..., 248., 248., 248.],
         [ 31.,  33.,  40., ..., 247., 248., 248.],
         ...,
         [133., 133., 133., ..., 240., 240., 240.],
         [134., 133., 133., ..., 240., 240., 240.],
         [134., 134., 133., ..., 240., 240., 240.]]],


       [[[ 68.,  76.,  78., ...,  80.,  83.,  85.],
         [ 77.,  85.,  90., ...,  81.,  82.,  84.],
         [ 83.,  90.,  95., ...,  79.,  79.,  80.],
         ...,
         [122., 125., 131., ...,  27.,  29.,  30.],
         [135., 127., 124., ...,  27.,  28.,  30.],
         [135., 121., 117., ...,  27.,  28.,  30.]],

        [[ 57.,  65.,  67., ...,  59.,  62.,  64.],
         [ 66.,  74.,  79., ...,  60.,  61.,  63.],
         [ 72.,  79.,  84., ...,  58.,  58.,  59.],
         ...,
         [115., 118., 124., ...,  27.,  29.,  30.],
         [128., 120., 118., ...,  27.,  28.,  30.],
         [128., 114., 111., ...,  27.,  28.,  30.]],

        [[ 51.,  59.,  61., ...,  38.,  41.,  43.],
         [ 60.,  68.,  73., ...,  39.,  40.,  42.],
         [ 66.,  73.,  78., ...,  37.,  37.,  38.],
         ...,
         [122., 125., 131., ...,  27.,  29.,  30.],
         [135., 127., 122., ...,  27.,  28.,  30.],
         [135., 121., 115., ...,  27.,  28.,  30.]]],


       ...,


       [[[106., 106., 108., ..., 132., 165., 186.],
         [102., 104., 106., ..., 128., 162., 186.],
         [ 99., 101., 103., ..., 123., 158., 184.],
         ...,
         [180., 185., 181., ..., 162., 160., 162.],
         [193., 186., 175., ..., 175., 172., 173.],
         [201., 191., 173., ..., 184., 182., 184.]],

        [[ 32.,  35.,  37., ...,  78., 109., 130.],
         [ 31.,  33.,  35., ...,  74., 106., 130.],
         [ 28.,  29.,  31., ...,  67., 102., 128.],
         ...,
         [134., 137., 133., ..., 140., 138., 140.],
         [147., 140., 127., ..., 153., 150., 151.],
         [155., 145., 127., ..., 162., 160., 162.]],

        [[  3.,   5.,   7., ...,  52.,  82., 103.],
         [  1.,   5.,   7., ...,  48.,  79., 103.],
         [  0.,   4.,   6., ...,  42.,  75., 101.],
         ...,
         [ 98., 101.,  97., ..., 143., 141., 143.],
         [111., 104.,  91., ..., 156., 153., 154.],
         [119., 109.,  93., ..., 165., 163., 165.]]],


       [[[ 28.,  29.,  29., ...,  12.,  11.,  10.],
         [ 30.,  31.,  33., ...,  12.,  11.,  11.],
         [ 34.,  35.,  36., ...,  13.,  12.,  12.],
         ...,
         [ 58.,  55.,  49., ..., 102., 117., 139.],
         [ 60.,  57.,  49., ..., 105., 110., 131.],
         [ 62.,  58.,  50., ...,  68.,  57.,  72.]],

        [[ 29.,  30.,  30., ...,  14.,  13.,  12.],
         [ 31.,  32.,  34., ...,  14.,  13.,  13.],
         [ 35.,  36.,  37., ...,  15.,  14.,  14.],
         ...,
         [ 45.,  45.,  39., ..., 137., 152., 175.],
         [ 47.,  47.,  39., ..., 140., 145., 166.],
         [ 49.,  48.,  40., ..., 103.,  92., 107.]],

        [[ 24.,  25.,  25., ...,  13.,  12.,  11.],
         [ 26.,  27.,  29., ...,  13.,  12.,  12.],
         [ 30.,  31.,  32., ...,  14.,  13.,  13.],
         ...,
         [ 37.,  36.,  30., ..., 175., 206., 237.],
         [ 39.,  38.,  30., ..., 178., 199., 230.],
         [ 41.,  39.,  31., ..., 141., 146., 171.]]],


       [[[252., 252., 253., ...,  57.,  57.,  57.],
         [252., 252., 253., ...,  57.,  57.,  57.],
         [252., 252., 253., ...,  56.,  56.,  56.],
         ...,
         [246., 246., 245., ..., 170., 170., 170.],
         [245., 245., 245., ..., 172., 172., 171.],
         [245., 245., 244., ..., 173., 173., 173.]],

        [[253., 253., 252., ...,  54.,  54.,  54.],
         [253., 253., 252., ...,  54.,  54.,  54.],
         [253., 253., 252., ...,  53.,  53.,  53.],
         ...,
         [239., 239., 238., ..., 123., 123., 123.],
         [238., 238., 238., ..., 125., 125., 124.],
         [238., 238., 237., ..., 126., 126., 126.]],

        [[248., 248., 248., ...,  49.,  49.,  49.],
         [248., 248., 248., ...,  49.,  49.,  49.],
         [248., 248., 248., ...,  48.,  48.,  48.],
         ...,
         [247., 247., 246., ..., 105., 105., 105.],
         [246., 246., 246., ..., 107., 107., 106.],
         [246., 246., 245., ..., 108., 108., 108.]]]], dtype=float32)

train_data.shape ->

(7820, 3, 224, 224)

Labels (gender: 0 = male, 1 = female):

array([0., 1., 0., ..., 1., 1., 1.], dtype=float32)

Federated data-loader ->

base=sy.BaseDataset(torch.from_numpy(train_data),
                    torch.from_numpy(train_labels_after))
base_federated=base.federate((bob, alice))
federated_train_loader = sy.FederatedDataLoader( # <-- this is now a FederatedDataLoader 
                         base_federated,batch_size=args.batch_size)

If you have any ideas about what could help, please advise…

The error points to a shape of 0 in dim0, which is really strange.
I.e., you’ll get the same error if you run the following code:

x = torch.randn(10, 10, 2)
x.view(x.size(0), -1)  # works
x.view(0, -1)  # your error

Could you print the shape of x right before the view operation in your forward method?


Dear ptrblck,

Hmm, I’m not sure how to do that, because I use the existing VGG11 model… Could you please advise how to debug in this case?

import torchvision
model = torchvision.models.vgg11(pretrained=False)

Ah OK, in that case the error should come from the input batch.
Could you isolate the sample which causes this error and print its shape in the training loop?
Something like this should work:

for data, target in federated_train_loader:
    print(data.shape)
    ...
    output = model(data)
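
If you don’t want to touch the torchvision source, a forward hook should also work. Here’s a rough sketch (with syft-wrapped tensors the hook may behave differently, so treat it as a best-effort check):

def print_shape(module, inp, out):
    # runs right after avgpool's forward; `out` is the tensor that
    # VGG.forward reshapes with x.view(x.size(0), -1)
    print('avgpool output shape:', out.shape)

handle = model.avgpool.register_forward_hook(print_shape)
output = model(data)
handle.remove()  # remove the hook again afterwards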

Sure. It seems that the problem comes from the very first batch.
The code I execute:

for data, target in federated_train_loader:
    print(data.shape)
    print(target)
    print(1)
    output = model(data)
    print(2)

The error I get:

torch.Size([64, 3, 224, 224])
(Wrapper)>[PointerTensor | me:36484845200 -> bob:18563538231]
1
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-25-09bbd3d5a518> in <module>
      3         print(target)
      4         print(1)
----> 5         output = model(data)
      6         print(2)
      7 

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    491             result = self._slow_forward(*input, **kwargs)
    492         else:
--> 493             result = self.forward(*input, **kwargs)
    494         for hook in self._forward_hooks.values():
    495             hook_result = hook(self, input, result)

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/torchvision/models/vgg.py in forward(self, x)
     42         x = self.features(x)
     43         x = self.avgpool(x)
---> 44         x = x.view(x.size(0), -1)
     45         x = self.classifier(x)
     46         return x

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/hook/hook.py in overloaded_native_method(self, *args, **kwargs)
    675                 # Send the new command to the appropriate class and get the response
    676                 method = getattr(new_self, method_name)
--> 677                 response = method(*new_args, **new_kwargs)
    678 
    679                 # For inplace methods, just directly return self

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/hook/hook.py in overloaded_pointer_method(self, *args, **kwargs)
    511             command = (attr, self, args, kwargs)
    512 
--> 513             response = owner.send_command(location, command)
    514 
    515             return response

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/workers/base.py in send_command(self, recipient, message, return_ids)
    425 
    426         try:
--> 427             ret_val = self.send_msg(codes.MSGTYPE.CMD, message, location=recipient)
    428         except ResponseSignatureError as e:
    429             ret_val = None

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/workers/base.py in send_msg(self, msg_type, message, location)
    221 
    222         # Step 2: send the message and wait for a response
--> 223         bin_response = self._send_msg(bin_message, location)
    224 
    225         # Step 3: deserialize the response

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/workers/virtual.py in _send_msg(self, message, location)
      8 class VirtualWorker(BaseWorker, FederatedClient):
      9     def _send_msg(self, message: bin, location: BaseWorker) -> bin:
---> 10         return location._recv_msg(message)
     11 
     12     def _recv_msg(self, message: bin) -> bin:

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/workers/virtual.py in _recv_msg(self, message)
     11 
     12     def _recv_msg(self, message: bin) -> bin:
---> 13         return self.recv_msg(message)
     14 
     15     @staticmethod

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/workers/base.py in recv_msg(self, bin_message)
    252             print(f"worker {self} received {sy.codes.code2MSGTYPE[msg_type]} {contents}")
    253         # Step 1: route message to appropriate function
--> 254         response = self._message_router[msg_type](contents)
    255 
    256         # Step 2: Serialize the message to simple python objects

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/workers/base.py in execute_command(self, message)
    363             else:
    364                 try:
--> 365                     response = getattr(_self, command_name)(*args, **kwargs)
    366                 except TypeError:
    367                     # TODO Andrew thinks this is gross, please fix. Instead need to properly deserialize strings

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/hook/hook.py in overloaded_native_method(self, *args, **kwargs)
    661                 except BaseException as e:
    662                     # we can make some errors more descriptive with this method
--> 663                     raise route_method_exception(e, self, args, kwargs)
    664 
    665             else:  # means that there is a wrapper to remove

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/hook/hook.py in overloaded_native_method(self, *args, **kwargs)
    655                 try:
    656                     if isinstance(args, tuple):
--> 657                         response = method(*args, **kwargs)
    658                     else:
    659                         response = method(args, **kwargs)

RuntimeError: shape '[0, -1]' is invalid for input of size 1605632

Could the problem come from the dataset itself?

I assume the syft hooks are somehow returning an empty tensor. Could you explain a bit what is supposed to happen under the hood?

Sure. My idea is to train the existing VGG11 (or any other architecture) on a dataset in a federated setting. I want to check the behaviour, because the tutorials I’ve seen only use small, self-developed networks. So my idea was to take an existing, stable, reasonably big network and a pretty big dataset and train it in a federated setting. After that I would like to try implementing other solutions with the federated learning paradigm. Nothing too fancy, to be honest. 🙂

As far as I know, this would mean each device learns locally using only its local subset of the data and tries to communicate the small changes to a “master model”. Is that a correct understanding of federated learning?

Just by looking at the stack trace it seems the communication somehow messed up the tensor.
Is the same code working with another model?

Yes, that is right. I’ve tried ResNet, and there I get a different kind of problem:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-28-448339389201> in <module>
      1 for epoch in range(1, args.epochs + 1):
----> 2     train(args, model, device, federated_train_loader, optimizer, epoch)

<ipython-input-14-9b8111af22ce> in train(args, model, device, train_loader, optimizer, epoch)
      5         data, target = data.to(device), target.to(device)
      6         optimizer.zero_grad()
----> 7         output = model(data)
      8         loss = F.nll_loss(output, target)
      9         loss.backward()

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    491             result = self._slow_forward(*input, **kwargs)
    492         else:
--> 493             result = self.forward(*input, **kwargs)
    494         for hook in self._forward_hooks.values():
    495             hook_result = hook(self, input, result)

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/torchvision/models/resnet.py in forward(self, x)
    191     def forward(self, x):
    192         x = self.conv1(x)
--> 193         x = self.bn1(x)
    194         x = self.relu(x)
    195         x = self.maxpool(x)

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    491             result = self._slow_forward(*input, **kwargs)
    492         else:
--> 493             result = self.forward(*input, **kwargs)
    494         for hook in self._forward_hooks.values():
    495             hook_result = hook(self, input, result)

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/torch/nn/modules/batchnorm.py in forward(self, input)
     59     @weak_script_method
     60     def forward(self, input):
---> 61         self._check_input_dim(input)
     62 
     63         # exponential_average_factor is self.momentum set to

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/torch/nn/modules/batchnorm.py in _check_input_dim(self, input)
    246     @weak_script_method
    247     def _check_input_dim(self, input):
--> 248         if input.dim() != 4:
    249             raise ValueError('expected 4D input (got {}D input)'
    250                              .format(input.dim()))

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/hook/hook.py in overloaded_native_method(self, *args, **kwargs)
    675                 # Send the new command to the appropriate class and get the response
    676                 method = getattr(new_self, method_name)
--> 677                 response = method(*new_args, **new_kwargs)
    678 
    679                 # For inplace methods, just directly return self

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/pointers/pointer_tensor.py in dim(self)
    194 
    195     def dim(self) -> int:
--> 196         return len(self._shape)
    197 
    198     def share(self, *args, **kwargs):

TypeError: object of type 'NoneType' has no len()

I’m not sure if that’s a different problem, as apparently the input is also empty.
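
As a quick check before the forward pass, you could confirm that the loader still yields correctly shaped batches (just a sketch; with syft the tensors are wrapped pointers, but data.shape worked in your earlier output):

for data, target in federated_train_loader:
    print(type(data))   # should be a syft-wrapped tensor pointing to bob/alice
    print(data.shape)   # earlier this printed torch.Size([64, 3, 224, 224])
    break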

Could you post a reproducible code snippet so that we could have a look?

Hi ptrblck,
Yeap, please find my code snippet below:


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import syft as sy 
from torch.utils.data import Dataset
from torchvision import datasets, transforms
from resizeimage import resizeimage

class Arguments():
    def __init__(self):
        self.batch_size = 64
        self.test_batch_size = 1000
        self.epochs = 10
        self.lr = 0.01
        self.momentum = 0.5
        self.no_cuda = False
        self.seed = 1
        self.log_interval = 10
        self.save_model = False

args = Arguments()

use_cuda = not args.no_cuda and torch.cuda.is_available()

torch.manual_seed(args.seed)

device = torch.device("cuda" if use_cuda else "cpu")

kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}


hook = sy.TorchHook(torch)  # <-- NEW: hook PyTorch ie add extra functionalities to support Federated Learning
bob = sy.VirtualWorker(hook, id="bob")  # <-- NEW: define remote worker bob
alice = sy.VirtualWorker(hook, id="alice")  # <-- NEW: and alice

from scipy.io import loadmat
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.pyplot import imshow
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split

get_ipython().run_line_magic('matplotlib', 'inline')

frame = pd.read_csv('advanced/filename.csv')

ID_GENDER_MAP = {0: 'male', 1: 'female'}
GENDER_ID_MAP = dict((g, i) for i, g in ID_GENDER_MAP.items())
ID_RACE_MAP = {0: 'white', 1: 'black', 2: 'asian', 3: 'indian', 4: 'others'}
RACE_ID_MAP = dict((r, i) for i, r in ID_RACE_MAP.items())

frame['gender'] = frame['gender'].map(lambda gender: GENDER_ID_MAP[gender])
frame['race'] = frame['race'].map(lambda race: RACE_ID_MAP[race])
#print(frame)
count_row = frame.shape[0]

train, test = train_test_split(frame, test_size=0.2) 

train_data = list(map(lambda x: np.asarray(Image.open(x), dtype = 'float32'), train['file']))
train_labels = train.as_matrix(columns=train.columns[1:2])

test_data = list(map(lambda x: np.asarray(Image.open(x), dtype = 'float32'), test['file']))
test_labels = test.as_matrix(columns=test.columns[1:2])


train_data = np.asarray(train_data)
test_data = np.asarray(test_data)

# shape of data check
train_data.shape
test_data.shape


train_labels = np.asarray(train_labels, dtype = 'float32')
train_labels

test_labels = np.asarray(test_labels, dtype = 'float32')
test_labels

train_labels_after = train_labels[:,0]


np.array_equal(test_data, test_labels)


# Transpose the image arrays from NHWC to NCHW (channels-first, as PyTorch expects)
train_data= train_data.transpose((0,3,1,2))
train_data.shape


base=sy.BaseDataset(torch.from_numpy(train_data),
                    torch.from_numpy(train_labels_after))
base_federated=base.federate((bob, alice))
federated_train_loader = sy.FederatedDataLoader( # <-- this is now a FederatedDataLoader 
                         base_federated,batch_size=args.batch_size)


def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(federated_train_loader): # <-- now it is a distributed dataset
        model.send(data.location) # <-- NEW: send the model to the right location
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        model.get() # <-- NEW: get the model back
        if batch_idx % args.log_interval == 0:
            loss = loss.get() # <-- NEW: get the loss back
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * args.batch_size, len(train_loader) * args.batch_size, #batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))



import torchvision
model = torchvision.models.vgg11(pretrained=False)
#model = torchvision.models.resnet101(pretrained=False)


model = model.to(device)
optimizer = optim.SGD(model.parameters(), lr=args.lr) # TODO momentum is not supported at the moment

print(model)
print(optimizer)


for epoch in range(1, args.epochs + 1):
    train(args, model, device, federated_train_loader, optimizer, epoch) 
        

Another strange behaviour: when I “create” VGG myself (rather than simply importing it from torchvision), I get a totally different kind of error:

---------------------------------------------------------------------------
PureTorchTensorFoundError                 Traceback (most recent call last)
~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/tensors/interpreters/native.py in handle_func_command(cls, command)
    259             new_args, new_kwargs, new_type, args_type = syft.frameworks.torch.hook_args.hook_function_args(
--> 260                 cmd, args, kwargs, return_args_type=True
    261             )

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/hook/hook_args.py in hook_function_args(attr, args, kwargs, return_args_type)
    156         # Try running it
--> 157         new_args = hook_args(args)
    158 

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/hook/hook_args.py in <lambda>(x)
    350 
--> 351     return lambda x: f(lambdas, x)
    352 

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/hook/hook_args.py in two_fold(lambdas, args, **kwargs)
    515 def two_fold(lambdas, args, **kwargs):
--> 516     return lambdas[0](args[0], **kwargs), lambdas[1](args[1], **kwargs)
    517 

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/hook/hook_args.py in <lambda>(i)
    328         # Last if not, rule is probably == 1 so use type to return the right transformation.
--> 329         else lambda i: forward_func[type(i)](i)
    330         for a, r in zip(args, rules)  # And do this for all the args / rules provided

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/hook/hook_args.py in <lambda>(i)
     55     if hasattr(i, "child")
---> 56     else (_ for _ in ()).throw(PureTorchTensorFoundError),
     57     torch.nn.Parameter: lambda i: i.child

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/hook/hook_args.py in <genexpr>(.0)
     55     if hasattr(i, "child")
---> 56     else (_ for _ in ()).throw(PureTorchTensorFoundError),
     57     torch.nn.Parameter: lambda i: i.child

PureTorchTensorFoundError: 

During handling of the above exception, another exception occurred:

RuntimeError                              Traceback (most recent call last)
<ipython-input-22-ff894affe93a> in <module>
      1 for epoch in range(1, args.epochs + 1):
----> 2     train(args, model_perso_vgg, device, federated_train_loader, optimizer, epoch)

<ipython-input-18-cab4c7130012> in train(args, model, device, train_loader, optimizer, epoch)
      6         optimizer.zero_grad()
      7         output = model(data)
----> 8         loss = F.nll_loss(output, target)
      9         loss.backward()
     10         optimizer.step()

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/hook/hook.py in overloaded_func(*args, **kwargs)
    715             cmd_name = f"{attr.__module__}.{attr.__name__}"
    716             command = (cmd_name, None, args, kwargs)
--> 717             response = TorchTensor.handle_func_command(command)
    718             return response
    719 

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/tensors/interpreters/native.py in handle_func_command(cls, command)
    268             new_command = (cmd, None, new_args, new_kwargs)
    269             # Send it to the appropriate class and get the response
--> 270             response = new_type.handle_func_command(new_command)
    271             # Put back the wrappers where needed
    272             response = syft.frameworks.torch.hook_args.hook_response(

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/pointers/object_pointer.py in handle_func_command(cls, command)
     86 
     87         # Send the command
---> 88         response = owner.send_command(location, command)
     89 
     90         return response

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/workers/base.py in send_command(self, recipient, message, return_ids)
    425 
    426         try:
--> 427             ret_val = self.send_msg(codes.MSGTYPE.CMD, message, location=recipient)
    428         except ResponseSignatureError as e:
    429             ret_val = None

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/workers/base.py in send_msg(self, msg_type, message, location)
    221 
    222         # Step 2: send the message and wait for a response
--> 223         bin_response = self._send_msg(bin_message, location)
    224 
    225         # Step 3: deserialize the response

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/workers/virtual.py in _send_msg(self, message, location)
      8 class VirtualWorker(BaseWorker, FederatedClient):
      9     def _send_msg(self, message: bin, location: BaseWorker) -> bin:
---> 10         return location._recv_msg(message)
     11 
     12     def _recv_msg(self, message: bin) -> bin:

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/workers/virtual.py in _recv_msg(self, message)
     11 
     12     def _recv_msg(self, message: bin) -> bin:
---> 13         return self.recv_msg(message)
     14 
     15     @staticmethod

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/workers/base.py in recv_msg(self, bin_message)
    252             print(f"worker {self} received {sy.codes.code2MSGTYPE[msg_type]} {contents}")
    253         # Step 1: route message to appropriate function
--> 254         response = self._message_router[msg_type](contents)
    255 
    256         # Step 2: Serialize the message to simple python objects

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/workers/base.py in execute_command(self, message)
    383                 command = getattr(command, path)
    384 
--> 385             response = command(*args, **kwargs)
    386 
    387         # some functions don't return anything (such as .backward())

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/hook/hook.py in overloaded_func(*args, **kwargs)
    715             cmd_name = f"{attr.__module__}.{attr.__name__}"
    716             command = (cmd_name, None, args, kwargs)
--> 717             response = TorchTensor.handle_func_command(command)
    718             return response
    719 

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/syft/frameworks/torch/tensors/interpreters/native.py in handle_func_command(cls, command)
    285             # in the execute_command function
    286             if isinstance(args, tuple):
--> 287                 response = eval(cmd)(*args, **kwargs)
    288             else:
    289                 response = eval(cmd)(args, **kwargs)

~/miniconda3/envs/pysyft/lib/python3.7/site-packages/torch/nn/functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
   1869                          .format(input.size(0), target.size(0)))
   1870     if dim == 2:
-> 1871         ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
   1872     elif dim == 4:
   1873         ret = torch._C._nn.nll_loss2d(input, target, weight, _Reduction.get_enum(reduction), ignore_index)

RuntimeError: Expected object of scalar type Long but got scalar type Float for argument #2 'target'

The full code that generates the problem:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import syft as sy 
from torch.utils.data import Dataset
from torchvision import datasets, transforms
from resizeimage import resizeimage

class Arguments():
    def __init__(self):
        self.batch_size = 64
        self.test_batch_size = 1000
        self.epochs = 10
        self.lr = 0.01
        self.momentum = 0.5
        self.no_cuda = False
        self.seed = 1
        self.log_interval = 10
        self.save_model = False

args = Arguments()

use_cuda = not args.no_cuda and torch.cuda.is_available()

torch.manual_seed(args.seed)

device = torch.device("cuda" if use_cuda else "cpu")

kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

hook = sy.TorchHook(torch)  # <-- NEW: hook PyTorch ie add extra functionalities to support Federated Learning
bob = sy.VirtualWorker(hook, id="bob")  # <-- NEW: define remote worker bob
alice = sy.VirtualWorker(hook, id="alice")  # <-- NEW: and alice

from scipy.io import loadmat
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.pyplot import imshow
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split
from scipy.misc import imread

%matplotlib inline

frame = pd.read_csv('advanced/filename.csv')

ID_GENDER_MAP = {0: 'male', 1: 'female'}
GENDER_ID_MAP = dict((g, i) for i, g in ID_GENDER_MAP.items())
ID_RACE_MAP = {0: 'white', 1: 'black', 2: 'asian', 3: 'indian', 4: 'others'}
RACE_ID_MAP = dict((r, i) for i, r in ID_RACE_MAP.items())

frame['gender'] = frame['gender'].map(lambda gender: GENDER_ID_MAP[gender])
frame['race'] = frame['race'].map(lambda race: RACE_ID_MAP[race])



#print(frame)
count_row = frame.shape[0]

X = torch.tensor([imread(x, flatten=False, mode='RGB') for x in frame['file']],dtype=torch.float).permute(0, 3, 1, 2)
y = torch.tensor(frame.as_matrix(columns=frame.columns[1:2]), dtype=torch.float)

y = y[:,0]
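# note: F.nll_loss expects integer (Long) class targets, so keeping y as
# float here is presumably what triggers the "Expected ... Long but got ...
# Float" error above; casting with y = y.long() would be worth trying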

base=sy.BaseDataset(X[0:100],y[0:100])
base_federated=base.federate((bob, alice))
federated_train_loader = sy.FederatedDataLoader( # <-- this is now a FederatedDataLoader 
                         base_federated,batch_size=args.batch_size)

def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(federated_train_loader): # <-- now it is a distributed dataset
        model.send(data.location) # <-- NEW: send the model to the right location
        data, target = data.to(device=device), target.to(device=device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        model.get() # <-- NEW: get the model back
        if batch_idx % args.log_interval == 0:
            loss = loss.get() # <-- NEW: get the loss back
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * args.batch_size, len(train_loader) * args.batch_size, #batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

class VGG(nn.Module):

    def __init__(self, features, num_classes=1000, init_weights=True):
        super(VGG, self).__init__()
        self.features = features
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )
        if init_weights:
            self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
#         print(x.shape)
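        # unlike torchvision's vgg.py (which does x.view(x.size(0), -1)),
        # this flattens with torch.flatten(x, 1) -- possibly why the earlier
        # view error doesn't show up here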
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)


def make_layers(cfg, batch_norm=False):
    layers = []
    in_channels = 3
    for v in cfg:
        if v == 'M':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
        else:
            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
            if batch_norm:
                layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
            else:
                layers += [conv2d, nn.ReLU(inplace=True)]
            in_channels = v
    return nn.Sequential(*layers)


cfgs = {
    'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M']
}


def _vgg(arch, cfg, batch_norm, pretrained, progress, **kwargs):
    if pretrained:
        kwargs['init_weights'] = False
    model = VGG(make_layers(cfgs[cfg], batch_norm=batch_norm), **kwargs)
    if pretrained:
        state_dict = load_state_dict_from_url(model_urls[arch],
                                              progress=progress)
        model.load_state_dict(state_dict)
    return model


def vgg11(pretrained=False, progress=True, **kwargs):
    r"""VGG 11-layer model (configuration "A") from
    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>'_
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    return _vgg('vgg11', 'A', False, pretrained, progress, **kwargs)

model_perso_vgg = vgg11(pretrained=False)  #.cuda()
model_perso_vgg = model_perso_vgg.to(device=device)
optimizer = optim.SGD(model_perso_vgg.parameters(), lr=args.lr)

for epoch in range(1, args.epochs + 1):
    train(args, model_perso_vgg, device, federated_train_loader, optimizer, epoch)

It seems so strange. Do you have any ideas why this happens?