RuntimeError in optimizer.step(): Expected object of backend CPU but got backend CUDA

Traceback (most recent call last):
  File "/home/lowen/PycharmProjects/533_CTR/main.py", line 12, in <module>
    trainer.fire(train_data_path='./data/new_merge_onehot/uid_train_new_all_onehot.csv')
  File "/home/lowen/PycharmProjects/533_CTR/trainer/FFM_trainer.py", line 139, in fire
    self.train_eval(train_data_loader, epoch, index)
  File "/home/lowen/PycharmProjects/533_CTR/trainer/FFM_trainer.py", line 61, in train_eval
    self.optimizer.step()
  File "/home/lowen/anaconda3/envs/newEnv/lib/python3.6/site-packages/torch/optim/adagrad.py", line 92, in step
    state['sum'].addcmul_(1, grad, grad)
RuntimeError: Expected object of backend CPU but got backend CUDA for argument #4 'tensor1'

The error is raised at optimizer.step(). I had already put all data and the model on CUDA, and I have no idea what causes it. The forward() pass completes without any exception: I get the logits, compute the loss with BCEWithLogitsLoss, call loss.backward(), and then optimizer.step() throws this error. Judging from the failing line, state['sum'] (the Adagrad accumulator) is a CPU tensor while the gradient (argument #4, 'tensor1') is on CUDA.

My model is a Field-aware Factorization Machine (FFM). The model, the input data, and the criterion are all on the GPU.
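For reference, the logit the forward() below is meant to compute is the standard FFM prediction (x_i are the one-hot feature values, w_i the linear weights, v_{i,j} the latent vector of field i paired with field j, F the number of fields):

\hat{y}(x) = b + \sum_{i=1}^{F} w_i x_i + \sum_{i=1}^{F} \sum_{j=i+1}^{F} \langle v_{i,j}, v_{j,i} \rangle \, x_i x_j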

class FiledFactorizationMachine(nn.Module):
    def __init__(self, n, k, f):
        super(FiledFactorizationMachine, self).__init__()
        self.best_model_path = None
        
        self.n = n  # total one-hot dimension
        self.k = k  # latent dimension K
        self.f = f  # number of fields
        self.linear = nn.Linear(2, 1)  # note: not used in forward()
        
        # linear part: one scalar weight per one-hot index, one embedding table per field
        self.linear_embedding = nn.ModuleList(
            [nn.Embedding(feature_size, 1) for feature_size in constants.new_columns_to_onehot_dim])
        
        # cross_embedding_lists[i][j]: latent vectors of field i used when it is crossed with field j
        self.cross_embedding_lists = nn.ModuleList(
            [nn.ModuleList(
                [nn.Embedding(field_dim, opt.FFM_K) for _ in range(len(constants.new_columns_to_onehot_dim))])
                for field_dim in constants.new_columns_to_onehot_dim])
        self.bias = nn.Parameter(torch.randn(1))

        self.best_auc = 0  

    
    def forward(self, x_index, x_value):
        
        x_value = x_value.float()

        # linear term: w_i * x_i per field, summed over fields -> shape (batch,)
        # squeeze only the trailing dim so a batch of size 1 still works
        linear_part_temp = [emb(x_index[:, i]).squeeze(-1) * x_value[:, i]
                            for i, emb in enumerate(self.linear_embedding)]
        linear_part = sum(linear_part_temp)
       
        # cross_step_1[i][j] = v_{i,j} * x_i, shape (batch, K)
        cross_step_1 = [[torch.mul(embed(x_index[:, i]), x_value[:, i].unsqueeze(dim=1)) for embed in f_embeds]
                        for i, f_embeds in enumerate(self.cross_embedding_lists)]
        
        
        # cross term: <v_{i,j} * x_i, v_{j,i} * x_j> for every field pair i < j
        cross_step_2 = []
        for i in range(len(constants.new_columns_to_onehot_dim)):
            for j in range(i + 1, len(constants.new_columns_to_onehot_dim)):
                cross_step_2.append(torch.sum(torch.mul(cross_step_1[i][j], cross_step_1[j][i]), dim=-1))

        cross_part = sum(cross_step_2)  # shape (batch,)
        logits = linear_part + cross_part + self.bias
        return logits
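One suspicion: as far as I can tell from reading torch/optim/adagrad.py, Adagrad allocates its state['sum'] accumulator when the optimizer is constructed, not lazily at the first step(). If that is right, building the optimizer before moving the model to the GPU would produce exactly this mismatch. A minimal sketch of that scenario (an assumption about the cause, not my actual trainer code):

import torch
import torch.nn as nn

model = nn.Linear(4, 1)                              # parameters start on the CPU
optimizer = torch.optim.Adagrad(model.parameters())  # Adagrad creates state['sum'] on the CPU here
model.cuda()                                         # parameters move to the GPU, state['sum'] does not

out = model(torch.randn(2, 4).cuda())
out.sum().backward()                                 # gradients are now CUDA tensors
optimizer.step()                                     # addcmul_ mixes the CPU state with CUDA grads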

Here is the surrounding training code:

        self.model.train()
        for label, x_index, x_value in tqdm(dataloader):
            self.optimizer.zero_grad()
            self.global_step += 1

            # move the batch to the GPU
            x_index, x_value, label = x_index.long().cuda(), x_value.cuda(), label.cuda()
            
            logits = self.model(x_index, x_value)
            logits = logits.squeeze()
            # no sigmoid here: BCEWithLogitsLoss applies it internally
            loss = self.criterion(logits, label.float())
            if self.global_step % 30 == 1:
                print('epoch %d part %d global_step: %d  --- loss: %f'
                      % (epoch, index, self.global_step, loss.item()))
            loss.backward()
            self.optimizer.step()
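To narrow it down, a quick device check could go right before self.optimizer.step() in the loop above (a debugging sketch):

            # print where each parameter, its gradient, and the Adagrad
            # accumulator live; any CPU/CUDA mix here explains the error
            for p in self.model.parameters():
                state = self.optimizer.state[p]
                print(p.device,
                      p.grad.device if p.grad is not None else None,
                      state['sum'].device if 'sum' in state else None)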