Tensor into variable/Error after many Epochs

BenJ · July 14, 2018, 12:20pm

Hi,

I am desperately trying to figure out what’s wrong with my code. I am trying to get the index of the max log-probability, as shown below. The weird thing is that I get this RuntimeError only after many epochs (sometimes after 7, 12, or 25 …); the first epochs usually run smoothly.

I am quite new to PyTorch, so maybe it’s something obvious, but I just can’t figure out what could cause the problem after the code has already run for that many epochs.

Python 3.5.2
Torch 0.3.1

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
~/SDT_CIFAR/main.py in <module>()
    122 
    123 for epoch in range(1, args.epochs + 1):
--> 124     model.train_(trainloader, epoch)
    125     accuracy = model.test_(testloader, epoch)
    126 save_result(accuracy)

~/SDT_CIFAR/model.py in train_(self, train_loader, epoch)
    224             loss.backward(retain_variables=True)
    225             self.optimizer.step()
--> 226             pred = output.data.max(1)[1] # get the index of the max log-probability
    227             correct += pred.eq(target.data).cpu().sum()
    228             accuracy = 100. * correct / len(data)

~/.local/lib/python3.5/site-packages/torch/tensor.py in data(self)
    405     @property
    406     def data(self):
--> 407         raise RuntimeError('cannot call .data on a torch.Tensor: did you intend to use autograd.Variable?')
    408 
    409     # Numpy array interface, to support `numpy.asarray(tensor) -> ndarray`
RuntimeError: cannot call .data on a torch.Tensor: did you intend to use autograd.Variable?

...

    def train_(self, train_loader, epoch):
        """Run one training pass over ``train_loader`` and log progress.

        For every batch this method flattens the inputs, writes a one-hot
        encoding of the targets into ``self.target_onehot``, calls
        ``self.cal_loss`` to obtain ``(loss, output)``, back-propagates,
        steps ``self.optimizer`` and computes the accuracy of that batch.
        Every ``self.args.log_interval`` batches a progress line is printed.
        After the loop the elapsed time is printed and
        ``self.get_node_weights(epoch)`` is called.

        Args:
            train_loader: iterable yielding ``(data, target)`` batches; it
                must also support ``len()`` and expose ``.dataset`` (both
                are used in the progress line).
            epoch: epoch number, used for logging and forwarded to
                ``self.get_node_weights``.

        Returns:
            None.

        Raises:
            RuntimeError: per the traceback in this post, ``output.data``
                raises when ``output`` is a plain ``torch.Tensor`` instead
                of an ``autograd.Variable``.
                NOTE(review): ``cal_loss`` is not shown here; presumably it
                sometimes returns a raw tensor -- confirm there.
        """
        t = time.time()
        self.train()
        # Prepare the per-batch-size buffers for the configured batch size.
        # NOTE(review): define_extras is not shown; presumably it allocates
        # self.target_onehot among other things -- confirm.
        self.define_extras(self.args.batch_size)
        for batch_idx, (data, target) in enumerate(train_loader):
            # Reset for every batch: the accuracy below is per batch, not
            # cumulative over the epoch.
            correct = 0
            if self.args.cuda:
                data, target = data.cuda(), target.cuda()
            #data = data.view(self.args.batch_size,-1)
            target = Variable(target)
            # Column vector of class indices, used as the scatter_ index below.
            target_ = target.view(-1,1)
            # Use the size of this batch rather than args.batch_size, since a
            # batch can be smaller than configured (handled a few lines down).
            batch_size = target_.size()[0]
            # Flatten each sample into a single row.
            data = data.view(batch_size,-1)
            ##convert int target to one-hot vector
            data = Variable(data)
            if not batch_size == self.args.batch_size: #because we have to initialize parameters for batch_size, tensor not matches with batch size cannot be trained
                self.define_extras(batch_size)
            # Clear the buffer in place, then write 1. at each row's target index.
            self.target_onehot.data.zero_()            
            self.target_onehot.scatter_(1, target_, 1.)
            self.optimizer.zero_grad()

            loss, output = self.cal_loss(data, self.target_onehot)
            # NOTE(review): `retain_variables` is the older spelling of
            # `retain_graph` -- confirm it is accepted by the installed torch.
            loss.backward(retain_variables=True)
            self.optimizer.step()
            # This is the statement the reported traceback points at: `.data`
            # raises RuntimeError when `output` is a torch.Tensor and not a Variable.
            pred = output.data.max(1)[1] # get the index of the max log-probability
            correct += pred.eq(target.data).cpu().sum()
            # data was reshaped to (batch_size, -1), so len(data) is the batch size.
            accuracy = 100. * correct / len(data)

            if batch_idx % self.args.log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}, Accuracy: {}/{} ({:.4f}%)'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.data[0],
                    correct, len(data),
                    accuracy))
            
        elapsed = time.time() - t
        print("elapsed time in this epoch: " + str(elapsed) + " sec")
        self.get_node_weights(epoch)

...

The full code is very similar to the code at this GitHub link.

I would appreciate any help.
Thanks in advance.