Connect an MLP to a Gated Recurrent Unit (GRU)

Hi all, I am having trouble connecting an MLP to a recurrent unit such as a GRU network.

These are my networks:

import torch.cuda
import torch.nn as nn

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from torch import nn

class DeepMLPRegressor(nn.Module):
    """MLP feature extractor mapping `in_features` inputs to 128 features.

    Args:
        in_features: size of each input sample's feature vector.
    """

    def __init__(self, in_features):
        super(DeepMLPRegressor, self).__init__()

        # BUG FIX: the original nn.Sequential(...) call was never closed
        # (missing ')') — a SyntaxError. Also insert ReLU activations:
        # stacked Linear layers with no non-linearity collapse into a
        # single linear map, defeating the point of a "deep" MLP.
        self.model = nn.Sequential(
            nn.Linear(in_features, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
        )

    def forward(self, x):
        # x: (batch, in_features) -> (batch, 128)
        return self.model(x)

class GRUNet(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, drop_prob=0.2):
        super(GRUNet, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.gru = nn.GRU(input_dim, hidden_dim, n_layers, batch_first=True, dropout=drop_prob)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x, h, o):
        out, h = self.gru(x, h, o)
        out = self.fc(self.relu(out[:, -1]))
        return out, h

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        device ="cuda" if torch.cuda.is_available() else "cpu"
        hidden =, batch_size, self.hidden_dim).zero_().to(device)
        return hidden

class final_model(nn.Module):
    """MLP feature extractor followed by a GRU regression head.

    Args:
        input_dim: feature size of each input sample.
        output_dim: number of regression targets.
    """

    def __init__(self, input_dim, output_dim):
        super(final_model, self).__init__()
        self.mlp = DeepMLPRegressor(input_dim)
        # The MLP emits 128 features per sample; the GRU consumes them as
        # 128-dim time steps.
        self.GRU = GRUNet(input_dim=128, hidden_dim=32, output_dim=output_dim, n_layers=1)

    def forward(self, x):
        feats = self.mlp(x)  # (batch, 128)
        # BUG FIX: nn.GRU needs a 3-D (batch, seq, feature) input and an
        # optional hidden-state *tensor*; the original passed the ints
        # 32 and 2 (`self.GRU(x1, 32, 2)`). Give the MLP output a
        # length-1 sequence dimension and a properly shaped h_0.
        seq = feats.unsqueeze(1)  # (batch, 1, 128)
        h0 = self.GRU.init_hidden(x.shape[0])
        out, _ = self.GRU(seq, h0)
        return out

This is my training module, where everything is called from main:

import numpy as np
import torch
import torch.nn as nn
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import StepLR
from import DataLoader
from torchnet.logger import VisdomPlotLogger, VisdomSaver
from torchnet.meter import AverageValueMeter

import net
from dataset import Dataset
from net import *

def train(model, train_loader, valid_loader, exp_name = "MLP121",  lr=0.00001, epochs=1000, wd = 0.000001):

    criterionX = nn.MSELoss()
    criterionZ = nn.MSELoss()
    optimizer = Adam(params=model.parameters(),lr = lr, weight_decay=wd)
    scheduler = StepLR(optimizer, step_size=100, gamma=0.5)#per ogni 100 epochs, lr si divide per due

    # meters
    lossX_meter = AverageValueMeter()
    lossZ_meter = AverageValueMeter()
    lossT_meter = AverageValueMeter()

    # device
    device = "cuda" #if torch.cuda.is_available() else "cpu"
    loader = {"train": train_loader, "test": valid_loader}

    loss_X_logger = VisdomPlotLogger('line', env=exp_name, opts={'title': 'LossX', 'legend': ['train', 'test']})
    loss_Z_logger = VisdomPlotLogger('line', env=exp_name, opts={'title': 'LossZ', 'legend': ['train', 'test']})
    loss_T_logger = VisdomPlotLogger('line', env=exp_name, opts={'title': 'Total_Loss', 'legend': ['train', 'test']})
    visdom_saver = VisdomSaver(envs=[exp_name])

    last_best_loss = np.inf #for early stopping for best model
    for e in range(epochs):
        for mode in ["train", "test"]:

            model.train() if mode == "train" else model.eval()
            with torch.set_grad_enabled(mode == "train"):  # abilitiamo i gradienti in training

                for i, batch in enumerate(loader[mode]):
                    x = batch["Array"].to(device)
                    dx = batch['Movement'][:,0].float().to(device)
                    dz = batch['Movement'][:,1].float().to(device)

                    output = model(x)

                    #out1, out2 = model(x)
                    out1, out2 = output[:,0], output[:,1]

                    #out2 = out2 / torch.sqrt((out2 ** 2).sum(1)).view(-1, 1)

                    l1 = criterionX(out1, dx)
                    l2 = criterionZ(out2, dz)
                    loss = l1+l2

                    if mode == "train":

                        if loss < last_best_loss:
                  , 'Best_%s.pth' % exp_name)
                            last_best_loss = loss

                    n = x.shape[0]  # numero di elementi nel batch

                    lossX_meter.add(l1.item() * n, n)#update meter to ploot
                    lossZ_meter.add(l2.item() * n, n)
                    lossT_meter.add(loss.item()* n, n)

                    if mode == "train":

                        loss_X_logger.log(e + (i + 1) / len(loader[mode]), lossX_meter.value()[0], name=mode)
                        loss_Z_logger.log(e + (i + 1) / len(loader[mode]), lossZ_meter.value()[0], name=mode)
                        loss_T_logger.log(e + (i + 1) / len(loader[mode]), lossT_meter.value()[0], name=mode)

            loss_X_logger.log(e + (i + 1) / len(loader[mode]), lossX_meter.value()[0], name=mode)
            loss_Z_logger.log(e + (i + 1) / len(loader[mode]), lossZ_meter.value()[0], name=mode)
            loss_T_logger.log(e + (i + 1) / len(loader[mode]), lossT_meter.value()[0], name=mode)


        #save visdom environment

        # conserviamo solo l'ultimo modello sovrascrivendo i vecchi, salviamo anche il best di volta in volta, '%s.pth' % exp_name)

    return model

def start_all():

    model = final_model(15, 2)

    #Define train dataset and loader
    train_dataset = Dataset('../Dataset/121/','121_train_sequential.csv', 'raw/')
    valid_dataset = Dataset('../Dataset/121/','121_validation_sequential.csv', 'raw/')

    train_loader = DataLoader(train_dataset, batch_size=16, num_workers=2)
    valid_loader = DataLoader(valid_dataset, batch_size=16, num_workers=2)

    model_trained = train(model, train_loader, valid_loader, exp_name="MLP_GRU", epochs=500)

My dataset class takes three arrays of 5 elements each and passes them as a single tensor.

i have this error:

RuntimeError: mat1 and mat2 shapes cannot be multiplied (16x5 and 15x32)

and I don’t understand why!

Can you help me?

Somewhere in the code you are trying to matrix-multiply two tensors with shapes (16, 5) and (15, 32), which is impossible: to multiply two 2-dimensional tensors you need tensor1.shape[1] == tensor2.shape[0]. Here the 16 is your batch size and the 5 suggests each sample has 5 features, while your first Linear layer expects 15 — check how your dataset concatenates the three 5-element arrays.

It also looks like you are passing incorrect values to nn.GRU: its forward inputs are `input` (a tensor containing the features of the input sequence, shaped (batch, seq, features) with batch_first=True) and `h_0` (the initial hidden state for each element in the batch) — not plain integers.