Discrepancy in output shape

import torch
import torch.nn as nn
from torchvision.models import resnext50_32x4d

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.layer = resnext50_32x4d(pretrained=False)
        self.layer.fc = nn.Linear(2048, 100)
        self.query = nn.Linear(100, 100, bias=False)
        self.key = nn.Linear(100, 100, bias=False)
        self.value = nn.Linear(100, 100, bias=False)
        self.nl = nn.LogSoftmax(dim=1)

    def forward(self, X):
        x = self.layer(X)
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)
        kqdots = torch.matmul(q, torch.transpose(k, 0, 1))
        return kqdots

This is the model I am using, and the batch size is 64.
I get the predictions using

model = Model()
model = torch.nn.DataParallel(model).cuda()
pred = model(data)

The shape of the input batch is

torch.Size([64, 3, 256, 256])

However, the shape of the output I am getting is

torch.Size([200, 32])

Shouldn’t it be

torch.Size([64, 64])?

I am using PyTorch version 1.1.0.
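
My expectation comes from the shape arithmetic on a single device: q and k are both [64, 100], so q @ k.T should be [64, 64]. A quick check with random tensors standing in for the backbone features (just for the shape math, not the real model):

import torch
import torch.nn as nn

# Stand-in for the 100-dim features produced by the ResNeXt backbone (illustrative only).
x = torch.randn(64, 100)
query = nn.Linear(100, 100, bias=False)
key = nn.Linear(100, 100, bias=False)

q = query(x)                                        # [64, 100]
k = key(x)                                          # [64, 100]
kqdots = torch.matmul(q, torch.transpose(k, 0, 1))  # [64, 100] @ [100, 64]
print(kqdots.shape)                                 # torch.Size([64, 64])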

I cannot reproduce this output shape. Using 8 GPUs I get [64, 8], which is expected due to the last line of code in your forward:

kqdots = torch.matmul(q, torch.transpose(k, 0, 1))

nn.DataParallel splits the input batch along dim 0, so each replica only sees batch_size/n_gpu samples. The matrix multiplication is therefore performed between tensors of shape [batch_size/n_gpu, 100] and [100, batch_size/n_gpu], which creates an output of [batch_size/n_gpu, batch_size/n_gpu] on each GPU.
After concatenating the outputs from all GPUs along dim 0, the final output is [batch_size, batch_size/n_gpu], which corresponds to [64, 8] in my case.
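
To see this effect without multiple GPUs, here is a small CPU-only sketch that mimics what nn.DataParallel does to the batch dimension: torch.chunk stands in for the scatter, torch.cat for the gather (the real module additionally replicates the model and moves each chunk to its device):

import torch

batch_size, n_gpu, feat = 64, 8, 100
q = torch.randn(batch_size, feat)
k = torch.randn(batch_size, feat)

# Scatter: the batch is split along dim 0, one chunk per GPU.
q_chunks = torch.chunk(q, n_gpu, dim=0)   # each [8, 100]
k_chunks = torch.chunk(k, n_gpu, dim=0)   # each [8, 100]

# Each replica computes its own matmul: [8, 100] @ [100, 8] -> [8, 8].
outs = [torch.matmul(qc, kc.t()) for qc, kc in zip(q_chunks, k_chunks)]

# Gather: the per-GPU outputs are concatenated along dim 0.
out = torch.cat(outs, dim=0)
print(out.shape)  # torch.Size([64, 8]) == [batch_size, batch_size/n_gpu]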

I am using 2 GPUs. Could you try and check with two, please?

Sure! No difference; the shape is as expected:

CUDA_VISIBLE_DEVICES=0,1 python tmp.py
torch.Size([64, 32]) # [batch_size, batch_size/n_gpu]

using:

import torch
import torch.nn as nn
import torchvision.models as models

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.layer = models.resnext50_32x4d(pretrained=False)
        self.layer.fc = nn.Linear(2048, 100)
        self.query = nn.Linear(100, 100, bias=False)
        self.key = nn.Linear(100, 100, bias=False)
        self.value = nn.Linear(100, 100, bias=False)
        self.nl = nn.LogSoftmax(dim=1)

    def forward(self, X):
        x = self.layer(X)
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)
        kqdots = torch.matmul(q, torch.transpose(k, 0, 1))
        return kqdots

model = Model()
model = nn.DataParallel(model).cuda()
x = torch.randn(64, 3, 256, 256).cuda()
out = model(x)
print(out.shape)
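
If you need the full [64, 64] product, one option would be to compute the matmul after the gather, i.e. return q and k from forward and do the pairwise product outside the data parallel wrapper. A minimal sketch continuing the script above (the Features module name is just for illustration):

class Features(nn.Module):
    def __init__(self):
        super(Features, self).__init__()
        self.layer = models.resnext50_32x4d(pretrained=False)
        self.layer.fc = nn.Linear(2048, 100)
        self.query = nn.Linear(100, 100, bias=False)
        self.key = nn.Linear(100, 100, bias=False)

    def forward(self, X):
        x = self.layer(X)
        # Each replica returns [batch_size/n_gpu, 100] tensors.
        return self.query(x), self.key(x)

feat_model = nn.DataParallel(Features()).cuda()
q, k = feat_model(x)                 # gathered back to [64, 100] each
kqdots = torch.matmul(q, k.t())      # [64, 100] @ [100, 64] -> [64, 64]
print(kqdots.shape)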