RuntimeError: size mismatch, m1: [64 x 512], m2: [8192 x 512] at /pytorch/aten/src/THC/generic/THCTensorMathBlas.cu:290

I have this PyTorch ensemble model but can't figure out what is wrong. I got it from the Packt book Deep Learning with PyTorch; I have included the definition of fit() so you can see the complete loop.

def fit(epoch,model,data_loader,phase='training',volatile=False):
    """Run one pass over ``data_loader`` and report loss and accuracy.

    Relies on the module-level names ``is_cuda``, ``optimizer`` and ``F``.

    Args:
        epoch: Index of the current epoch. Accepted for the caller's
            convenience; it is not used inside the function.
        model: Module called as ``model(data1, data2, data3)``.
        data_loader: Iterable yielding ``(data1, data2, data3, target)``
            batches. ``len(data_loader.dataset)`` is used as the sample count.
        phase: ``'training'`` puts the model in train mode and performs
            optimizer steps; ``'validation'`` puts it in eval mode.
        volatile: Legacy flag kept for backward compatibility. It is forced
            to ``True`` for validation; outside the training phase a true
            value makes the forward pass run with autograd disabled.

    Returns:
        Tuple ``(loss, accuracy)`` of Python floats: the summed cross-entropy
        divided by the dataset size, and the percentage of correct
        predictions.
    """
    # Local import so the function does not depend on a file-level import.
    import torch

    if phase == 'training':
        model.train()
    if phase == 'validation':
        model.eval()
        volatile=True

    # ``Variable(..., volatile=True)`` was removed in PyTorch 0.4; disabling
    # autograd is its replacement. Training always needs gradients.
    grad_enabled = phase == 'training' or not volatile

    running_loss = 0.0
    running_correct = 0
    with torch.set_grad_enabled(grad_enabled):
        for data1,data2,data3,target in data_loader:
            if is_cuda:
                data1,data2,data3,target = data1.cuda(),data2.cuda(),data3.cuda(),target.cuda()

            if phase == 'training':
                optimizer.zero_grad()
            output = model(data1,data2,data3)
            loss = F.cross_entropy(output,target)

            # Summed (not averaged) loss, so the epoch total can be divided by
            # the dataset size. Detached so no second graph is recorded, and
            # ``.item()`` replaces the old ``.data[0]`` scalar access.
            running_loss += F.cross_entropy(output.detach(),target,reduction='sum').item()
            preds = output.detach().max(dim=1,keepdim=True)[1]
            # ``.item()`` keeps the counter a plain int instead of a tensor.
            running_correct += preds.eq(target.view_as(preds)).sum().item()
            if phase == 'training':
                loss.backward()
                optimizer.step()

    loss = running_loss/len(data_loader.dataset)
    accuracy = 100. * running_correct/len(data_loader.dataset)

    print(f'{phase} loss is {loss:{5}.{2}} and {phase} accuracy is {running_correct}/{len(data_loader.dataset)}{accuracy:{10}.{4}}')
    return loss,accuracy

class LayerActivations():
    """Record the outputs of a module through a forward hook.

    Each forward pass of the hooked module adds one flattened CPU row per
    batch element to ``features``.
    """

    # Class-level default; every instance replaces it with its own list.
    features=[]

    def __init__(self,model):
        """Start with an empty feature list and attach the hook to ``model``."""
        self.features = []
        self.hook = model.register_forward_hook(self.hook_fn)

    def hook_fn(self,module,input,output):
        """Flatten ``output`` to ``(batch, -1)`` and store its rows on the CPU."""
        #out = F.avg_pool2d(output, kernel_size=8)
        batch_size = output.size(0)
        flattened = output.view(batch_size,-1)
        # Extending with a tensor iterates over its first dimension.
        self.features.extend(flattened.cpu().data)

    def remove(self):
        """Detach the forward hook from the module."""
        self.hook.remove()


class EnsembleModel(nn.Module):
    """Combine three pre-extracted feature tensors into one classifier.

    Each input goes through dropout and its own linear projection to a shared
    hidden width; the three projections are summed, passed through dropout
    again and mapped to ``out_size`` outputs.

    Args:
        out_size: Number of outputs of the final layer.
        training: Unused; kept so existing calls keep working. Dropout follows
            ``self.training``, which ``train()`` / ``eval()`` toggle.
        in_features: Feature widths expected for ``inp1``, ``inp2`` and
            ``inp3``. Each must equal the size of dimension 1 of the matching
            tensor passed to ``forward``; otherwise the linear layer raises a
            size-mismatch ``RuntimeError``. The defaults are the widths that
            were previously hard-coded.
        hidden_size: Width of the shared hidden representation.
    """

    def __init__(self, out_size, training=True, in_features=(8192, 131072, 82944), hidden_size=512):
        super().__init__()
        in1, in2, in3 = in_features
        self.fc1 = nn.Linear(in1, hidden_size)
        self.fc2 = nn.Linear(in2, hidden_size)
        self.fc3 = nn.Linear(in3, hidden_size)
        self.fc4 = nn.Linear(hidden_size, out_size)

    def forward(self, inp1, inp2, inp3):
        """Return the outputs for one batch of the three feature tensors."""
        out1 = self.fc1(F.dropout(inp1, training=self.training))
        out2 = self.fc2(F.dropout(inp2, training=self.training))
        out3 = self.fc3(F.dropout(inp3, training=self.training))
        # All branches share ``hidden_size``, so they can be added elementwise.
        out = out1 + out2 + out3
        out = self.fc4(F.dropout(out, training=self.training))

        return out


# Build the ensemble with two outputs and move it to the GPU when requested.
em = EnsembleModel(2)
if is_cuda:
    em = em.cuda()

# Per-epoch history of the values returned by fit().
train_losses = []
train_accuracy = []
val_losses = []
val_accuracy = []

# Epochs are numbered 1 through 9.
for epoch in range(1, 10):
    # One training pass followed by one validation pass.
    epoch_loss, epoch_accuracy = fit(epoch, em, trn_feat_loader, phase="training")
    val_epoch_loss, val_epoch_accuracy = fit(epoch, em, val_feat_loader, phase="validation")

    train_losses.append(epoch_loss)
    val_losses.append(val_epoch_loss)
    train_accuracy.append(epoch_accuracy)
    val_accuracy.append(val_epoch_accuracy)

RuntimeError                              Traceback (most recent call last)
    def extra_repr(self):
RuntimeError: size mismatch, m1: [64 x 512], m2: [8192 x 512] at /pytorch/aten/src/THC/generic/THCTensorMathBlas.cu:290

Thanks for your help!

Could you check the shape of data1 before passing it to the model?
It seems the shape mismatch is created using this tensor.

PS: You don’t need Variables anymore, as they are deprecated since PyTorch 0.4.0.

Hi @ptrblck, I have these values

inpt1 → torch.Size([64, 512])
inpt2 → torch.Size([64, 73728])
inpt3 → torch.Size([64, 65536])

if I do this

class EnsembleModel(nn.Module):
    """Debugging variant of the ensemble with the first branch disabled.

    ``forward`` prints the shape of every input before the remaining two
    branches are projected, summed and classified. The ``training`` argument
    is unused; dropout follows ``self.training``.
    """

    def __init__(self,out_size,training=True):
        super().__init__()
#         self.fc1 = nn.Linear(8192,512)
        # Each layer's first argument must equal the feature count
        # (dimension 1) of the tensor it receives.
        self.fc2 = nn.Linear(131072,512)
        self.fc3 = nn.Linear(82944,512)
        self.fc4 = nn.Linear(512,out_size)

    def forward(self,inp1,inp2,inp3):
        """Print the input shapes, then combine the two active branches."""
        print(inp1.shape)
        print(inp2.shape)
        print(inp3.shape)

#         out1 = self.fc1(F.dropout(inp1,training=self.training))
        out2 = self.fc2(F.dropout(inp2,training=self.training))
        out3 = self.fc3(F.dropout(inp3,training=self.training))
        # Only the active branches are summed: ``out1`` does not exist while
        # fc1 is commented out, so referencing it would raise NameError.
        out = out2 + out3
        out = self.fc4(F.dropout(out,training=self.training))
        return out

I get this:
torch.Size([64, 73728])
torch.Size([64, 65536])

RuntimeError: size mismatch, m1: [64 x 73728], m2: [131072 x 512] at /pytorch/aten/src/THC/generic/THCTensorMathBlas.cu:283

no clue what is going on....

The number of input features in your linear layers should match the number of features from your inputs.
This code should work:

class EnsembleModel(nn.Module):
    """Ensemble head whose layer widths match the reported input shapes.

    The first argument of each of fc1-fc3 equals the feature count
    (dimension 1) of the tensor that layer receives. The ``training``
    argument is unused; dropout follows ``self.training``.
    """

    def __init__(self,out_size,training=True):
        super().__init__()
        # inp1 was reported as [64, 512], so fc1 takes 512 features; with
        # 8192 the original "m1: [64 x 512], m2: [8192 x 512]" error returns.
        self.fc1 = nn.Linear(512,512)
        self.fc2 = nn.Linear(73728,512)
        self.fc3 = nn.Linear(65536,512)
        self.fc4 = nn.Linear(512,out_size)

    def forward(self,inp1,inp2,inp3):
        """Project each input to 512 features, sum them and classify."""
        out1 = self.fc1(F.dropout(inp1,training=self.training))
        out2 = self.fc2(F.dropout(inp2,training=self.training))
        out3 = self.fc3(F.dropout(inp3,training=self.training))
        out = out1 + out2 + out3
        out = self.fc4(F.dropout(out,training=self.training))
        return out


# Dummy inputs with the same shapes as the reported feature tensors.
inpt1 = torch.randn([64, 512])
inpt2 = torch.randn([64, 73728])
inpt3 = torch.randn([64, 65536])

model = EnsembleModel(2)
out = model(inpt1, inpt2, inpt3)

class MMGCN(torch.nn.Module):
    """Recommendation model that scores user/item pairs from one GCN branch.

    The constructor and indentation are restored here; in the pasted version
    ``__init__`` had been rendered as ``init`` and the class body had lost its
    indentation, so the class could not be instantiated as written.
    """

    def __init__(self, features, edge_index, batch_size, num_user, num_item, aggr_mode, concate, num_layer, has_id,
                 dim_x):
        """Build the edge index, the feature tensor, the GCN and the embeddings.

        ``features`` is converted to a float tensor and handed to ``GCN``;
        ``dim_x`` is the width of the id embedding. The remaining arguments
        are stored and/or forwarded to ``GCN`` unchanged.
        """
        super(MMGCN, self).__init__()
        self.batch_size = batch_size
        self.num_user = num_user
        self.num_item = num_item
        self.aggr_mode = aggr_mode
        self.concate = concate

        # Transpose to two rows, then append the row-swapped copy so every
        # edge is present in both directions.
        self.edge_index = torch.tensor(edge_index).t().contiguous().cuda()
        self.edge_index = torch.cat((self.edge_index, self.edge_index[[1, 0]]), dim=1)

        t_feat = features
        self.t_feat = torch.tensor(t_feat, dtype=torch.float).cuda()
        self.t_gcn = GCN(self.t_feat, self.edge_index, batch_size, num_user, num_item, dim_x, self.aggr_mode,
                         self.concate, num_layer=num_layer, has_id=has_id, dim_latent=25)

        # NOTE(review): these are plain tensors, not nn.Parameter, so they are
        # not returned by parameters() - confirm whether they should be trained.
        self.id_embedding = nn.init.xavier_normal_(torch.rand((num_user + num_item, dim_x), requires_grad=True)).cuda()
        self.result_embed = nn.init.xavier_normal_(torch.rand((num_user + num_item, dim_x))).cuda()

    def forward(self, user_nodes, pos_item_nodes, neg_item_nodes):
        """Return ``(pos_scores, neg_scores)`` as row-wise dot products.

        The node arguments index rows of the representation produced by the
        GCN; the result is also cached in ``self.result_embed``.
        """
        t_rep = self.t_gcn(self.id_embedding)
        # Dividing by 1 leaves the values unchanged; presumably a leftover from
        # averaging several modalities - confirm.
        representation = (t_rep) / 1
        self.result_embed = representation
        user_tensor = representation[user_nodes]
        pos_item_tensor = representation[pos_item_nodes]
        neg_item_tensor = representation[neg_item_nodes]
        pos_scores = torch.sum(user_tensor * pos_item_tensor, dim=1)
        neg_scores = torch.sum(user_tensor * neg_item_tensor, dim=1)
        return pos_scores, neg_scores

This is my model definition, where I am taking textual features, but I am getting the above-mentioned runtime error. Kindly guide me to debug it.

# Command-line options for training. ``parser`` is assumed to be an
# argparse.ArgumentParser created before this point.
# The option strings use ASCII quotes and a double-hyphen prefix; the pasted
# version had curly quotes and en dashes, which are not valid Python.
parser.add_argument('--no-cuda', action='store_true', default=False, help='Disables CUDA training.')
parser.add_argument('--model_name', default='MMGCN', help='Model name.')
#parser.add_argument('--data_path', default='amazon-book', help='Dataset path')
parser.add_argument('--PATH_weight_load', default=None, help='Loading weight filename.')
parser.add_argument('--PATH_weight_save', default=None, help='Writing weight filename.')
parser.add_argument('--l_r', type=float, default=1e-3, help='Learning rate.')
parser.add_argument('--weight_decay', type=float, default=1e-3, help='Weight decay.')
parser.add_argument('--batch_size', type=int, default=128, help='Batch size.')
parser.add_argument('--dim_latent', type=int, default=25, help='Latent dimension.')
parser.add_argument('--num_epoch', type=int, default=40, help='Epoch number.')
parser.add_argument('--num_workers', type=int, default=5, help='Workers number.')
parser.add_argument('--num_user', type=int, default=300, help='User number.')
parser.add_argument('--num_item', type=int, default=25, help='Item number.')
parser.add_argument('--aggr_mode', default='mean', help='Aggregation mode.')
# NOTE(review): type=bool turns every non-empty string into True, so
# "--concat False" still yields True (same for --has_id below).
parser.add_argument('--concat', type=bool, default=True, help='Concatenation')
parser.add_argument('--num_layer', type=int, default=2, help='Layer number.')
parser.add_argument('--has_id', type=bool, default=True, help='Has id_embedding')

This is the training part, sir. I do not understand where I am going wrong.

Could you post the complete model definition by wrapping it into three backticks ``` please?

This is my model definition where i am just taking the textual features with 100 latent dimensions

class MMGCN(torch.nn.Module):
    """Recommendation model built on a single GCN branch (dim_latent=100)."""

    def __init__(self, features, edge_index, batch_size, num_user, num_item, aggr_mode, concate, num_layer, has_id,
                 dim_x):
        """Build the edge index, the feature tensor, the GCN and the embeddings.

        ``features`` is converted to a float tensor and handed to ``GCN``;
        ``dim_x`` is the width of the id embedding. The remaining arguments
        are stored and/or forwarded to ``GCN`` unchanged.
        """
        super(MMGCN, self).__init__()
        self.batch_size = batch_size
        self.num_user = num_user
        self.num_item = num_item
        self.aggr_mode = aggr_mode
        self.concate = concate

        # Transpose to two rows, then append the row-swapped copy so every
        # edge is present in both directions.
        self.edge_index = torch.tensor(edge_index).t().contiguous().cuda()
        # Plain ``torch.cat`` call; the pasted version wrapped the name in a
        # markdown link, which is a syntax error.
        self.edge_index = torch.cat((self.edge_index, self.edge_index[[1, 0]]), dim=1)

        t_feat = features
        self.t_feat = torch.tensor(t_feat, dtype=torch.float).cuda()
        self.t_gcn = GCN(self.t_feat, self.edge_index, batch_size, num_user, num_item, dim_x, self.aggr_mode,
                         self.concate, num_layer=num_layer, has_id=has_id, dim_latent=100)

        # NOTE(review): these are plain tensors, not nn.Parameter, so they are
        # not returned by parameters() - confirm whether they should be trained.
        self.id_embedding = nn.init.xavier_normal_(torch.rand((num_user + num_item, dim_x), requires_grad=True)).cuda()
        self.result_embed = nn.init.xavier_normal_(torch.rand((num_user + num_item, dim_x))).cuda()

    def forward(self, user_nodes, pos_item_nodes, neg_item_nodes):
        """Compute the node representation from the GCN branch."""
        t_rep = self.t_gcn(self.id_embedding)
        representation = (t_rep) / 1
        # NOTE(review): the snippet as posted stops here, so forward()
        # implicitly returns None.
When I train this, it gives a weight (size) mismatch error.
In training I am using 2 layers with batch size 128, against 55585 users and 5986 items. The error is
5986*

The original MMGCN model had three modalities (acoustic, visual and textual), but I am only using the textual one. The original used 100 latent dimensions for textual, 128 for acoustic and 128 for visual, for 5986 items and 55485 users. I do not understand how I should balance the weights for a single modality.

You haven’t posted the complete model definition, so you would have to dig into your code and search for the offending line of code.
This dummy code snippet raises the same error:

# Minimal reproduction of the size-mismatch error (it raises on purpose).
lin = nn.Linear(5986, 28)  # the layer expects 5986 input features
x = torch.randn(5986, 100)  # 5986 ends up as the batch size, with only 100 features
out = lin(x)  # fails: 100 features given where 5986 are expected

As you can see, I’m passing x in the wrong shape. While 5986 features are expected, I’m using 5986 as the batch size and pass 100 features.

import math
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
from BaseModel import BaseModel

class GCN(torch.nn.Module):
    """Graph-convolution stack applied to one feature tensor.

    Up to three rounds are applied. Each round combines a ``BaseModel``
    convolution over ``edge_index`` with a linear transform of the node
    features (optionally plus the id embedding), and fuses the two with a
    further linear layer.

    The constructor name, the indentation and the ``torch.cat`` calls are
    restored here; the pasted version had ``init`` for ``__init__``, a
    flattened class body and markdown links around ``torch.cat``.
    """

    def __init__(self, features, edge_index, batch_size, num_user, num_item, dim_id, aggr_mode, concate, num_layer,
                 has_id, dim_latent=None):
        """Create the layers.

        Args:
            features: 2-D tensor; ``features.size(1)`` is taken as the feature
                width ``dim_feat``. Presumably one row per item - confirm.
            edge_index: Edge tensor passed to every convolution.
            batch_size, num_user, num_item: Stored; ``num_user`` also sets the
                number of rows of the preference tensor.
            dim_id: Output width of every round.
            aggr_mode: Passed to ``BaseModel`` as ``aggr``.
            concate: If true, the convolution output and the transformed
                features are concatenated before the fusion layer; otherwise
                they are added.
            num_layer: Number of rounds applied in ``forward`` (1 to 3).
            has_id: If true, the id embedding is added in every round.
            dim_latent: If truthy, features are first projected to this width
                by ``self.MLP``.
        """
        super(GCN, self).__init__()
        self.batch_size = batch_size
        self.num_user = num_user
        self.num_item = num_item
        self.dim_id = dim_id
        self.dim_feat = features.size(1)
        self.dim_latent = dim_latent
        self.edge_index = edge_index
        self.features = features
        self.aggr_mode = aggr_mode
        self.concate = concate
        self.num_layer = num_layer
        self.has_id = has_id

        if self.dim_latent:
            # NOTE(review): plain tensor, not nn.Parameter, so it is not
            # returned by parameters() - confirm whether it should be trained.
            self.preference = nn.init.xavier_normal_(torch.rand((num_user, self.dim_latent), requires_grad=True)).cuda()
            # Expects inputs whose last dimension equals features.size(1).
            self.MLP = nn.Linear(self.dim_feat, self.dim_latent)
            self.conv_embed_1 = BaseModel(self.dim_latent, self.dim_latent, aggr=self.aggr_mode)
            nn.init.xavier_normal_(self.conv_embed_1.weight)
            self.linear_layer1 = nn.Linear(self.dim_latent, self.dim_id)
            nn.init.xavier_normal_(self.linear_layer1.weight)
            self.g_layer1 = nn.Linear(self.dim_latent + self.dim_id, self.dim_id) if self.concate else nn.Linear(
                self.dim_latent, self.dim_id)
            nn.init.xavier_normal_(self.g_layer1.weight)

        else:
            # Without a latent projection the first round works on the raw
            # feature width.
            self.preference = nn.init.xavier_normal_(torch.rand((num_user, self.dim_feat), requires_grad=True)).cuda()
            self.conv_embed_1 = BaseModel(self.dim_feat, self.dim_feat, aggr=self.aggr_mode)
            nn.init.xavier_normal_(self.conv_embed_1.weight)
            self.linear_layer1 = nn.Linear(self.dim_feat, self.dim_id)
            nn.init.xavier_normal_(self.linear_layer1.weight)
            self.g_layer1 = nn.Linear(self.dim_feat + self.dim_id, self.dim_id) if self.concate else nn.Linear(
                self.dim_feat, self.dim_id)
            nn.init.xavier_normal_(self.g_layer1.weight)

        self.conv_embed_2 = BaseModel(self.dim_id, self.dim_id, aggr=self.aggr_mode)
        nn.init.xavier_normal_(self.conv_embed_2.weight)
        self.linear_layer2 = nn.Linear(self.dim_id, self.dim_id)
        nn.init.xavier_normal_(self.linear_layer2.weight)
        self.g_layer2 = nn.Linear(self.dim_id + self.dim_id, self.dim_id) if self.concate else nn.Linear(self.dim_id,
                                                                                                         self.dim_id)

        self.conv_embed_3 = BaseModel(self.dim_id, self.dim_id, aggr=self.aggr_mode)
        nn.init.xavier_normal_(self.conv_embed_3.weight)
        self.linear_layer3 = nn.Linear(self.dim_id, self.dim_id)
        nn.init.xavier_normal_(self.linear_layer3.weight)
        self.g_layer3 = nn.Linear(self.dim_id + self.dim_id, self.dim_id) if self.concate else nn.Linear(self.dim_id,
                                                                                                         self.dim_id)

    def forward(self, id_embedding):
        """Return the node representation after ``num_layer`` rounds.

        ``id_embedding`` is added to the transformed features in every round
        when ``has_id`` is true.
        """
        temp_features = self.MLP(self.features) if self.dim_latent else self.features

        # Stack the user preference rows on top of the feature rows.
        x = torch.cat((self.preference, temp_features), dim=0)
        x = F.normalize(x).cuda()

        h = F.leaky_relu(self.conv_embed_1(x, self.edge_index))  # equation 1
        x_hat = F.leaky_relu(self.linear_layer1(x)) + id_embedding if self.has_id else F.leaky_relu(
            self.linear_layer1(x))  # equation 5
        x = F.leaky_relu(self.g_layer1(torch.cat((h, x_hat), dim=1))) if self.concate else F.leaky_relu(
            self.g_layer1(h) + x_hat)

        if self.num_layer > 1:
            h = F.leaky_relu(self.conv_embed_2(x, self.edge_index))  # equation 1
            x_hat = F.leaky_relu(self.linear_layer2(x)) + id_embedding if self.has_id else F.leaky_relu(
                self.linear_layer2(x))  # equation 5
            x = F.leaky_relu(self.g_layer2(torch.cat((h, x_hat), dim=1))) if self.concate else F.leaky_relu(
                self.g_layer2(h) + x_hat)
        if self.num_layer > 2:
            h = F.leaky_relu(self.conv_embed_3(x, self.edge_index))  # equation 1
            x_hat = F.leaky_relu(self.linear_layer3(x)) + id_embedding if self.has_id else F.leaky_relu(
                self.linear_layer3(x))  # equation 5
            x = F.leaky_relu(self.g_layer3(torch.cat((h, x_hat), dim=1))) if self.concate else F.leaky_relu(
                self.g_layer3(h) + x_hat)
        return x

class MMGCN(torch.nn.Module):
    """Recommendation model that scores user/item pairs from one GCN branch.

    The constructor name, the indentation and the ``torch.cat`` calls are
    restored here; the pasted version had ``init`` for ``__init__``, a
    flattened class body and markdown links around ``torch.cat``.
    """

    def __init__(self, features, edge_index, batch_size, num_user, num_item, aggr_mode, concate, num_layer, has_id,
                 dim_x):
        """Build the edge index, the feature tensor, the GCN and the embeddings.

        ``features`` is converted to a float tensor and handed to ``GCN``;
        ``dim_x`` is the width of the id embedding. The remaining arguments
        are stored and/or forwarded to ``GCN`` unchanged.
        """
        super(MMGCN, self).__init__()
        self.batch_size = batch_size
        self.num_user = num_user
        self.num_item = num_item
        self.aggr_mode = aggr_mode
        self.concate = concate

        # Transpose to two rows, then append the row-swapped copy so every
        # edge is present in both directions.
        self.edge_index = torch.tensor(edge_index).t().contiguous().cuda()
        self.edge_index = torch.cat((self.edge_index, self.edge_index[[1, 0]]), dim=1)

        t_feat = features
        self.t_feat = torch.tensor(t_feat, dtype=torch.float).cuda()
        self.t_gcn = GCN(self.t_feat, self.edge_index, batch_size, num_user, num_item, dim_x, self.aggr_mode,
                         self.concate, num_layer=num_layer, has_id=has_id, dim_latent=28)

        # NOTE(review): these are plain tensors, not nn.Parameter, so they are
        # not returned by parameters() - confirm whether they should be trained.
        self.id_embedding = nn.init.xavier_normal_(torch.rand((num_user + num_item, dim_x), requires_grad=True)).cuda()
        self.result_embed = nn.init.xavier_normal_(torch.rand((num_user + num_item, dim_x))).cuda()

    def forward(self, user_nodes, pos_item_nodes, neg_item_nodes):
        """Return ``(pos_scores, neg_scores)`` as row-wise dot products.

        The node arguments index rows of the representation produced by the
        GCN; the result is also cached in ``self.result_embed`` for
        ``accuracy``.
        """
        t_rep = self.t_gcn(self.id_embedding)
        # Dividing by 1 leaves the values unchanged; presumably a leftover from
        # averaging several modalities - confirm.
        representation = (t_rep) / 1
        self.result_embed = representation
        user_tensor = representation[user_nodes]
        pos_item_tensor = representation[pos_item_nodes]
        neg_item_tensor = representation[neg_item_nodes]
        pos_scores = torch.sum(user_tensor * pos_item_tensor, dim=1)
        neg_scores = torch.sum(user_tensor * neg_item_tensor, dim=1)
        return pos_scores, neg_scores

    def loss(self, data):
        """Return the summed ``-log2(sigmoid(pos - neg))`` loss for one batch.

        ``data`` is unpacked as ``(user, pos_items, neg_items)``; each element
        is moved to the GPU before the forward pass.
        """
        user, pos_items, neg_items = data
        pos_scores, neg_scores = self.forward(user.cuda(), pos_items.cuda(), neg_items.cuda())
        loss_value = -torch.sum(torch.log2(torch.sigmoid(pos_scores - neg_scores)))
        return loss_value

    def accuracy(self, dataset, topk=10, neg_num=1000):
        """Return mean precision, recall and NDCG at ``topk``.

        Each row of ``dataset`` is read as ``[user, neg_num negative items,
        positive items...]``. Rows without at least one positive item are
        skipped. Scores come from the representation cached by the last
        ``forward`` call.

        Args:
            dataset: Sized iterable of rows laid out as described above.
            topk: Number of top-scored items considered per user.
            neg_num: Number of negative items per row. It now also drives the
                row slicing, which previously used a hard-coded 1000.

        Returns:
            Tuple ``(precision, recall, ndcg)`` averaged over the evaluated
            rows, or ``(0.0, 0.0, 0.0)`` when no row qualifies.
        """
        # Scores are concatenated negatives-first, so positions below neg_num
        # belong to negative items.
        negative_positions = set(range(neg_num))
        sum_pre = 0.0
        sum_recall = 0.0
        sum_ndcg = 0.0
        sum_item = 0
        bar = tqdm(total=len(dataset))

        for data in dataset:
            bar.update(1)
            # Need the user, neg_num negatives and at least one positive.
            if len(data) < neg_num + 2:
                continue

            sum_item += 1
            user = data[0]
            neg_items = data[1:neg_num + 1]
            pos_items = data[neg_num + 1:]

            batch_user_tensor = torch.tensor(user).cuda()
            batch_pos_tensor = torch.tensor(pos_items).cuda()
            batch_neg_tensor = torch.tensor(neg_items).cuda()

            user_embed = self.result_embed[batch_user_tensor]
            pos_v_embed = self.result_embed[batch_pos_tensor]
            neg_v_embed = self.result_embed[batch_neg_tensor]

            num_pos = len(pos_items)
            pos_score = torch.sum(pos_v_embed * user_embed, dim=1)
            neg_score = torch.sum(neg_v_embed * user_embed, dim=1)

            _, index_of_rank_list = torch.topk(torch.cat((neg_score, pos_score)), topk)
            # One conversion to a Python list serves both the hit count and
            # the rank lookup below.
            rank_list = index_of_rank_list.tolist()
            num_hit = len(set(rank_list).difference(negative_positions))
            sum_pre += float(num_hit / topk)
            sum_recall += float(num_hit / num_pos)
            ndcg_score = 0.0
            for i in range(num_pos):
                label_pos = neg_num + i
                if label_pos in rank_list:
                    index = rank_list.index(label_pos)
                    ndcg_score = ndcg_score + math.log(2) / math.log(index + 2)
            sum_ndcg += ndcg_score / num_pos
        bar.close()

        # Avoid a ZeroDivisionError when every row was skipped.
        if sum_item == 0:
            return 0.0, 0.0, 0.0
        return sum_pre / sum_item, sum_recall / sum_item, sum_ndcg / sum_item

This is the full definition of my model.