Mismatch between given batch size and the model output size

In the code below I am running a normal training loop with batch size 128. The model outputs a 128x1 vector as the predicted result, which I compare against a 128x1 target to compute an L1 loss and backpropagate. However, after 1 epoch I get the following warning, followed by a CUDA out-of-memory error:

/home/ec2-user/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/loss.py:91: UserWarning: Using a target size (torch.Size([59336])) that is different to the input size (torch.Size([59336, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
return F.l1_loss(input, target, reduction=self.reduction)

I am not able to understand how the target size becomes 59336 when it should be 128x1. Any help would be appreciated.
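
For context, here is a minimal standalone snippet (the tensor names are just for illustration, not from my actual model) that reproduces the same warning when the loss input has shape [n, 1] while the target has shape [n]:

import torch

criterion = torch.nn.L1Loss()

# small n just for illustration; the warning in my run reports n = 59336
n = 8
prediction = torch.randn(n, 1)   # loss input with shape [n, 1]
target = torch.randn(n)          # loss target with shape [n]

loss = criterion(prediction, target)            # same UserWarning: [n, 1] vs [n]
                                                # broadcast against each other to [n, n]

loss = criterion(prediction.flatten(), target)  # shapes match, no warning

My actual training loop is below.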

for no in range(1):

    model = ModelCombination(count, non_numeric_feats, numeric_feats, emb_dims, latent_dim,
                             int(full_data['price_band_index'].max()) + 1, price_diff, price_max)
    model.to(device)
    learning_rate = 0.001
    criterion = torch.nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), weight_decay=1e-5)
    patience = 20
    best = 1e5
    early_stop_counter = 0
    train_data, valid_data = train_test_split(final_train, shuffle=True, test_size=0.2)
    final_train1 = torch.tensor(train_data[['ASIN_index', 'price_band_index',
                                            'demand', 'price']].to_numpy(), dtype=torch.float32)
    valid_train1 = torch.tensor(valid_data[['ASIN_index', 'price_band_index',
                                            'demand', 'price']].to_numpy(), dtype=torch.float32)
    final_test1 = torch.tensor(final_test[['ASIN_index', 'price_band_index',
                                           'demand', 'price']].to_numpy(), dtype=torch.float32)
    train_dataset = torch.utils.data.TensorDataset(final_train1)               # dataset keyed only on ASIN_index
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128)  # batches of 128 ASIN_index
    valid_dataset = torch.utils.data.TensorDataset(valid_train1)               # dataset keyed only on ASIN_index
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=len(valid_dataset))  # whole validation set in one batch
    for epoch in range(100):
        for i, j in enumerate(train_loader):
            batch_demand = j[0][:, 2]
            batch_price = j[0][:, 3]
            batch_asin = j[0][:, 0].long()
            batch_price_index = j[0][:, 1].long()
            num_fts = numeric_matrix[batch_asin]
            non_num_fts = non_numeric_matrix[batch_asin]
            d_hat = model(batch_asin.to(device), batch_price_index.to(device), batch_demand.to(device),
                          batch_price.to(device), num_fts.to(device), non_num_fts.to(device))

            loss = criterion(d_hat.flatten(), batch_demand.to(device))   # scaling demand by number of sigmoids
            # print(loss)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # print("learning_rate", optimizer.state_dict()["param_groups"][0]['lr'])
            # scheduler.step()   # decrease learning rate
        val_mape, val_loss = evaluate_valid(valid_loader, model)
        # val_mape, val_loss = np.round(val_mape, 5), np.round(val_loss, 5)
        if val_mape.item() < best:
            best = val_mape.item()
            early_stop_counter = 0
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
            }, "MEAN_V3" + str(no) + ".pth")
            print("Best yet", no, best, val_mape)
            fname.write("Best yet" + str(no) + "\t" + str(best) + "\t" + str(val_mape.item()) + "\n")
        early_stop_counter += 1
        print("Train Loss ", no, epoch, loss, val_mape.item(),
              val_loss, np.round(optimizer.state_dict()["param_groups"][0]['lr'], 5))
        if early_stop_counter >= patience:
            print("Breaking Training Loop for Model", no)
            break