ValueError: Expected input batch_size (1240) to match target batch_size (1248)

class myLM(nn.Module):
    def __init__(self, params):
        super(myLM, self).__init__()
        self.vocab_size = params['vocab_size']
        self.d_emb      = params['d_emb']
        self.d_hid      = params['d_hid']
        self.batch_size = params['batch_size']
        self.embedding  = nn.Embedding(self.vocab_size, self.d_emb)
        self.rnn        = nn.RNN(self.d_emb, self.d_hid, num_layers=1, batch_first=True)
        self.ws         = nn.Linear(self.d_hid, self.d_hid)
        self.wq         = nn.Linear(self.d_hid, self.d_hid)
        self.vs         = torch.randn(self.d_hid).to(device)
        self.wc         = nn.Linear(2 * self.d_hid, self.d_hid)
        self.out        = nn.Linear(self.d_hid, self.vocab_size)
        

    def forward(self, batch):
      bsz, seq_len = batch.size()  #32x40
      
      rnn_hiddens = torch.zeros(seq_len, bsz, self.d_hid)

      for idx in range(seq_len):
        #batch[:, idx] -> idx th word of all samples 32
        #self.embedding(batch[:, idx]) -> embs of idx th words -> 32x50
        
        rnn_input = self.embedding(batch[:, idx]).view(bsz, 1, self.d_emb)
        _, hidden = self.rnn(rnn_input)
        
        rnn_hiddens[idx] = hidden.to(device) #1x32x50
      
      new_hiddens = torch.zeros(seq_len, bsz, 2 * self.d_hid)
        
      for t in range(seq_len):
        context = torch.zeros_like(rnn_hiddens[0])
        if t>0:
          context = self.context(rnn_hiddens[t], rnn_hiddens[0:t])
        
        ht_new = torch.tanh(torch.cat((rnn_hiddens[t], context), 1))
        
        new_hiddens[t] = ht_new
      
      new_hiddens = new_hiddens.to(device)
      return self.out(self.wc(new_hiddens))
        
    def context(self, ht, buffer):
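      # build a context vector: weight each previous hidden state in buffer by its score against ht and sum them up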
      ctx = torch.zeros_like(buffer[0]) #32x50
      
      for t, hi in enumerate(buffer):
        scores =  self.score(buffer[t], ht).to(device) #32
        scores = nn.functional.softmax(scores)
        #hi-32x50
        for i in range(ctx.size()[0]):
          ctx[i] += scores[i] * buffer[t][i]
        
      return ctx
        

    def score(self, hi, ht):
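      # additive attention score between a past hidden state hi and the current hidden state ht (one scalar per sample)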
      hi = hi.to(device)
      ht = ht.to(device)
      return torch.tanh(self.ws(hi) + self.wq(ht)).mv(self.vs) #[32]
    
      

params = {}
params['vocab_size'] = len(idx_to_word)
params['d_emb'] = 50
params['d_hid'] = 50
params['batch_size'] = 32
params['epochs'] = 8 
params['learning_rate'] = 0.001 

lm = myLM(params)
lm.cuda()
train_lm(--data--, params, lm)
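
For reference, here is how the raw output of forward can be shape-checked on a dummy batch before any reshaping (the vocab size and data below are placeholders, not my real idx_to_word vocabulary; same device setup as above):

probe_params = dict(params, vocab_size=100)           # made-up vocab size, only for this check
probe = myLM(probe_params).cuda()
dummy_batch = torch.randint(0, 100, (32, 40)).cuda()  # bsz x seq_len, shaped like my real batches
print(probe(dummy_batch).shape)                       # should come out as seq_len x bsz x vocab_size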

The training code is as follows:

def train_lm(dataset, params, net):
    
    # since the first index corresponds to the PAD token, we just ignore it
    # when computing the loss
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])
    num_examples, seq_len = dataset.size()    
    batches = [(start, start + params['batch_size']) for start in\
               range(0, num_examples, params['batch_size'])]
    
    for epoch in range(params['epochs']):
        ep_loss = 0.
        start_time = time.time()
        random.shuffle(batches)
        
        # for each batch, calculate loss and optimize model parameters            
        for b_idx, (start, end) in enumerate(batches):
            print(b_idx)
            
            batch = dataset[start:end]
            print(batch.size())  # ---> 32x40
            preds = net(batch)
            
            preds = preds[:, :-1, :].contiguous().view(-1, net.vocab_size)
            print('pred', preds.shape) #1240xvocab
            targets = batch[:, 1:].contiguous().view(-1)
            print('tgt', targets.shape) #1248
            loss = criterion(preds, targets) #error here
            
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            ep_loss += loss

        print('epoch: %d, loss: %0.2f, time: %0.2f sec, dev perplexity: %0.2f' %\
              (epoch, ep_loss, time.time()-start_time, compute_perplexity(wikitext['dev'], net)))

I am not sure where I am going wrong.

You have mentioned it in your comments already: you are giving 1240 predictions but 1248 targets, hence the error. Ideally the shapes should have been predictions = 1248 x vocab and targets = 1248.
Is there something that's confusing, or did I misunderstand the issue here?
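
In case it helps, here is a minimal, self-contained sketch of where the 1240 comes from (the vocab size below is a placeholder): forward builds new_hiddens as seq_len x bsz x ..., so preds comes back time-major, and slicing dim 1 in train_lm drops a sample rather than a time step.

import torch

seq_len, bsz, vocab = 40, 32, 1000        # vocab is a made-up placeholder
preds = torch.randn(seq_len, bsz, vocab)  # shaped like what forward() currently returns

# slicing dim 1 drops one *sample*: 40 * 31 = 1240 rows
wrong = preds[:, :-1, :].reshape(-1, vocab)

# permuting to batch-first (bsz x seq_len x vocab) makes the same slice drop the
# last *time step* instead: 32 * 39 = 1248 rows, matching the 1248 targets
right = preds.permute(1, 0, 2)[:, :-1, :].reshape(-1, vocab)

print(wrong.shape, right.shape)  # torch.Size([1240, 1000]) torch.Size([1248, 1000])

So one option would be to return self.out(self.wc(new_hiddens)).permute(1, 0, 2) from forward (or to build new_hiddens batch-first); the slicing in train_lm could then stay as it is.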