How is the RNN output calculated in PyTorch?

Hi everyone, I’m trying to create a simple RNN model in PyTorch and I’m at a loss here! How do you specify the output dimensions?
Based on the documentation, the output shape is determined by the hidden size, not the input size: it should be (seq_len, batch, num_directions * hidden_size), or (batch, seq_len, num_directions * hidden_size) with batch_first=True. But even with that in mind, my simple model doesn’t give the output dimension I need, and the loss computation fails!
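
To sanity-check my reading of the docs, here is a tiny standalone shape check (my assumption: a unidirectional, single-layer nn.RNN with batch_first=True, and sizes chosen to match the model below):

import torch
import torch.nn as nn

rnn_check = nn.RNN(input_size=1, hidden_size=32, num_layers=1, batch_first=True)
x = torch.randn(1, 9, 1)    # (batch, seq_len, input_size)
h0 = torch.zeros(1, 1, 32)  # (num_layers * num_directions, batch, hidden_size)
out, hn = rnn_check(x, h0)
print(out.shape)  # torch.Size([1, 9, 32]) -> (batch, seq_len, num_directions * hidden_size)
print(hn.shape)   # torch.Size([1, 1, 32])

So the last dimension of the output is always num_directions * hidden_size; there is no parameter that sets it directly.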

My full, very simple example is given below:

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

# a generator that yields one fresh chunk of a sine wave per call
def dataset_gen(i, sequence_num=10):
    data = torch.linspace(i * np.pi, (i + 1) * np.pi, steps=sequence_num)
    data = torch.sin(data)
    # add batch and feature dimensions -> shape (1, sequence_num, 1)
    yield data.unsqueeze(0).unsqueeze(2)

#%%
# define the model
class rnn(torch.nn.Module):
    def __init__(self, inputsize, hiddensize, outputsize, numlayers):
        super().__init__()
        # note: outputsize is accepted here but never used --
        # nn.RNN itself only knows input_size, hidden_size and num_layers
        self.rnn = nn.RNN(input_size=inputsize, hidden_size=hiddensize, num_layers=numlayers, batch_first=True)

    def forward(self, x, hiddenstate):
        outputs, hiddenstate = self.rnn(x, hiddenstate)
        print('input: ', x.shape)
        print('output: ', outputs.shape)
        print('h: ', hiddenstate.shape)
        outputs = F.sigmoid(outputs)  # deprecated; this triggers the warning below
        return outputs, hiddenstate

inputsize = 1
outputsize = 1
hiddensize = 32
sequence_length = 10 
num_layer = 1
batchsize = 1
# number of directions: 1 for a unidirectional RNN, 2 for bidirectional
direction = 1

model = rnn(inputsize, hiddensize, outputsize, num_layer)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# test
data = next(iter(dataset_gen(0, sequence_length)))
hiddenstate = torch.zeros(direction*num_layer, batchsize, hiddensize)
# (output, hidden) = model(data.cuda(), hiddenstate.cuda())

# training 
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

def training(step, dataset, criterion, optimizer, model):
    model.train()

    hidden = torch.zeros((direction*num_layer, batchsize, hiddensize), device=device)
    for i in range(step):
        data = next(iter(dataset(i, sequence_length)))
        data = data.to(device)

        # one-step-ahead prediction: X is t = 0..n-2, Y is t = 1..n-1
        X = data[:, :-1, :]
        Y = data[:, 1:, :]

        (output, hidden) = model(X, hidden)
        
        hidden = hidden.detach()  # detach so backprop doesn't reach into earlier iterations

        loss = criterion(output, Y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 10 == 0:
            print(f'{i+1}) loss: {loss.item()}')
            plt.plot(X.cpu().numpy().flatten(), 'r.')
            plt.plot(output.data.cpu().numpy().flatten(),'b.')
            plt.show()
    return model


training(3, dataset_gen, criterion, optimizer, model)

Running the code above produces the following output:

input:  torch.Size([1, 9, 1])
output:  torch.Size([1, 9, 32])
h:  torch.Size([1, 1, 32])
1) loss: 0.12164192646741867
input:  torch.Size([1, 9, 1])
output:  torch.Size([1, 9, 32])
h:  torch.Size([1, 1, 32])

input:  torch.Size([1, 9, 1])
output:  torch.Size([1, 9, 32])
h:  torch.Size([1, 1, 32])
C:\Users\Testusr\Anaconda3\lib\site-packages\torch\nn\functional.py:1386: UserWarning: nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.
  warnings.warn("nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.")
C:\Users\Testusr\Anaconda3\lib\site-packages\torch\nn\modules\loss.py:443: UserWarning: Using a target size (torch.Size([1, 9, 1])) that is different to the input size (torch.Size([1, 9, 32])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
  return F.mse_loss(input, target, reduction=self.reduction)
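
If I read the second warning correctly, this is the real problem: nn.MSELoss does not raise an error on mismatched shapes, it silently broadcasts the (1, 9, 1) target against the (1, 9, 32) output. A minimal reproduction with toy tensors (my own, just to isolate the warning):

import torch

crit = torch.nn.MSELoss()
pred = torch.randn(1, 9, 32)   # shaped like what my model returns
target = torch.randn(1, 9, 1)  # shaped like my Y
loss = crit(pred, target)      # emits the same UserWarning, then broadcasts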

So the output of my model has hidden_size (32) features per time step instead of the single value I need, and its shape is completely off from my target!
Can anyone help me understand what is going on here, and how the output size is meant to be controlled?
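
My current guess is that I have to project the hidden_size-dimensional output down to outputsize myself, e.g. with an nn.Linear on top of the RNN. A sketch of what I am considering (the fc layer is my own addition, not something nn.RNN provides):

class rnn(torch.nn.Module):
    def __init__(self, inputsize, hiddensize, outputsize, numlayers):
        super().__init__()
        self.rnn = nn.RNN(input_size=inputsize, hidden_size=hiddensize,
                          num_layers=numlayers, batch_first=True)
        # assumption: a linear layer maps hidden features to the target size
        self.fc = nn.Linear(hiddensize, outputsize)

    def forward(self, x, hiddenstate):
        outputs, hiddenstate = self.rnn(x, hiddenstate)
        # nn.Linear acts on the last dimension, turning
        # (batch, seq_len, hiddensize) into (batch, seq_len, outputsize)
        outputs = torch.sigmoid(self.fc(outputs))
        return outputs, hiddenstate

Is that the right approach, or is there a built-in way to make the RNN emit outputsize features per step?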
Thanks in advance
