LSTM optimizer update error: p.data.add_(-group['lr'], d_p) RuntimeError: invalid argument 3: sizes do not match

Hi, I am new to LSTMs. I created a simple LSTM network:

class LSTMNet(nn.Module):
    def __init__(self):
        super(LSTMNet, self).__init__()
        self.lstm1 = nn.LSTMCell(1, 64) #(self, input_size, hidden_size, bias=True)
        self.lstm2 = nn.LSTMCell(64, 1)

    def forward(self, x):
        length, batch_size, in_channels = x.size()

        # initialisation: note the states are created 3-D, (length, batch_size, hidden)
        h_t1 = Variable(torch.zeros(length, batch_size, 64)).cuda()
        c_t1 = Variable(torch.zeros(length, batch_size, 64)).cuda()
        h_t2 = Variable(torch.zeros(length, batch_size, 1)).cuda()
        c_t2 = Variable(torch.zeros(length, batch_size, 1)).cuda()

        # both cells get the whole sequence and the 3-D states in one call
        h_t1, c_t1 = self.lstm1(x,    (h_t1, c_t1))
        h_t2, c_t2 = self.lstm2(h_t1, (h_t2, c_t2))

        y = h_t2
        return y
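
For reference, my reading of the nn.LSTMCell docs is that the cell consumes one time step at a time: input of shape (batch, input_size) and states of shape (batch, hidden_size). A per-timestep forward() would then look roughly like this sketch (untested; the shapes and the stacking at the end are my assumption from the docs):

    # sketch of forward() stepping the cells one time step at a time;
    # states are 2-D here, (batch_size, hidden_size)
    def forward(self, x):
        length, batch_size, in_channels = x.size()
        h_t1 = Variable(torch.zeros(batch_size, 64)).cuda()
        c_t1 = Variable(torch.zeros(batch_size, 64)).cuda()
        h_t2 = Variable(torch.zeros(batch_size, 1)).cuda()
        c_t2 = Variable(torch.zeros(batch_size, 1)).cuda()
        ys = []
        for t in range(length):
            h_t1, c_t1 = self.lstm1(x[t], (h_t1, c_t1))   # x[t] is (batch_size, 1)
            h_t2, c_t2 = self.lstm2(h_t1, (h_t2, c_t2))
            ys.append(h_t2)
        return torch.stack(ys)   # back to (length, batch_size, 1)

My forward() above instead passes the whole 3-D sequence and 3-D states in a single call; that version gets through loss.backward() before failing, as shown below.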

This is my training code:

def run_lstm_example_sdg():

    out_dir='/root/share/project/drone/results/xxx'
    os.makedirs(out_dir, exist_ok=True)

    # make training/validation set
    length    = 100
    num_train = 200
    num_valid = 5
    in_channels  = 1
    out_channels = 1
    train_input, train_target, valid_input, valid_target = gernerate_data(length, num_train, num_valid, in_channels, out_channels)


    # build the model
    net = LSTMNet()
    net.cuda()

    # optimizer
    optimizer = optim.SGD(net.parameters(), lr=0.01)  # momentum=0.9, weight_decay=0.0001


    #begin to train
    num_iters = 300
    for i in range(num_iters):

        optimizer.zero_grad()
        ys   = net(train_input)
        loss = nn.MSELoss()(ys, train_target)
        loss.backward()
        optimizer.step()

        print('iter=%8d,  train loss:  %0.5f'%(i, loss.data.cpu().numpy()[0]))

My data: the training input has size torch.Size([100, 200, 1]), i.e. sequence length 100, batch size 200, feature dim 1. The training target also has feature dim 1. A quick shape check follows the function below.

def gernerate_data(length, num_train, num_valid, in_channels=1,out_channels=1):
    assert(in_channels ==1)
    assert(out_channels==1)

    T = 20 #period
    num = num_train+num_valid

    data    = np.empty((num, length+1), 'int64')
    data[:] = np.array(range(length+1)) + np.random.randint(-4 * T, 4 * T, num).reshape(num, 1)
    data = np.sin(data / T).astype('float32')

    train_input  = data[:num_train,  :-1].transpose().reshape(length,num_train,in_channels )
    train_target = data[:num_train, 1:  ].transpose().reshape(length,num_train,out_channels)
    valid_input  = data[num_train:,  :-1].transpose().reshape(length,num_valid,in_channels )
    valid_target = data[num_train:, 1:  ].transpose().reshape(length,num_valid,out_channels)
    train_input  = Variable(torch.from_numpy(train_input )).cuda() #torch.Size([100, 200, 1]); print(train_input.size())
    train_target = Variable(torch.from_numpy(train_target)).cuda() #torch.Size([100, 200, 1]); print(train_target.size())
    valid_input  = Variable(torch.from_numpy(valid_input )).cuda() #torch.Size([100, 5,   1]); print(valid_input.size())
    valid_target = Variable(torch.from_numpy(valid_target)).cuda() #torch.Size([100, 5,   1]); print(valid_target.size())

    return train_input, train_target, valid_input, valid_target
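
The shape check (a hypothetical call with the same arguments as in the training code, just to confirm the comments above):

train_input, train_target, valid_input, valid_target = gernerate_data(100, 200, 5)
print(train_input.size())    # torch.Size([100, 200, 1])
print(train_target.size())   # torch.Size([100, 200, 1])
print(valid_input.size())    # torch.Size([100, 5, 1])
print(valid_target.size())   # torch.Size([100, 5, 1])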

The error is raised on the first call to optimizer.step(); the forward and backward passes complete without complaint:

Traceback (most recent call last):
  File "/root/share/project/drone/build/lstm-00/lstm_example-1.py", line 208, in <module>
    run_lstm_example_sdg()
  File "/root/share/project/drone/build/lstm-00/lstm_example-1.py", line 172, in run_lstm_example_sdg
    optimizer.step()
  File "/opt/anaconda3/lib/python3.6/site-packages/torch/optim/sgd.py", line 99, in step
    p.data.add_(-group['lr'], d_p)
RuntimeError: invalid argument 3: sizes do not match at /opt/pytorch/8fbe003/pytorch/torch/lib/THC/generated/../generic/THCTensorMathPointwise.cu:271

Where did I go wrong? Thanks!