What is an inplace operation?

Hello, I am new to PyTorch, but I got this error and I don't know how to fix it…

Warning: Error detected in CudnnRnnBackward. Traceback of forward call that caused the error:
File "D:/GitHubRepos/NLP/biGRU/trainer.py", line 161, in <module>
output, loss = train(target, label)
File "D:/GitHubRepos/NLP/biGRU/trainer.py", line 120, in train
output, hidden = model(target[i], hidden)
File "D:\Python\venv\lib\site-packages\torch\nn\modules\module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "D:\GitHubRepos\NLP\biGRU\GRU.py", line 12, in forward
out, h = self.gru(x, h)
File "D:\Python\venv\lib\site-packages\torch\nn\modules\module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "D:\Python\venv\lib\site-packages\torch\nn\modules\rnn.py", line 727, in forward
self.dropout, self.training, self.bidirectional, self.batch_first)
(print_stack at …\torch\csrc\autograd\python_anomaly_mode.cpp:60)
Traceback (most recent call last):
File "D:/GitHubRepos/NLP/biGRU/trainer.py", line 161, in <module>
output, loss = train(target, label)
File "D:/GitHubRepos/NLP/biGRU/trainer.py", line 138, in train
loss.backward(retain_graph=True)
File "D:\Python\venv\lib\site-packages\torch\tensor.py", line 198, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "D:\Python\venv\lib\site-packages\torch\autograd\__init__.py", line 100, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [7116, 2372]] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

This is the code for my model.

import torch.nn as nn


class GRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(GRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=False)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x, h):
        # x: (seq_len, batch, input_size), h: (num_layers, batch, hidden_size)
        out, h = self.gru(x, h)
        out = self.softmax(out)
        return out, h

    def init_hidden(self, batch_size, device):
        # zero-filled hidden state with the same dtype as the model parameters
        weight = next(self.parameters()).data
        hidden = weight.new(self.num_layers, batch_size, self.hidden_size).zero_().to(device)
        return hidden

I think there's something wrong with out, h = self.gru(x, h), but I don't know how to fix it.

Not sure if this could cause the issue, but could you call an out-of-place zero operation on weight.new() instead of the inplace .zero_()?
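
Something along these lines should keep the same behavior without the inplace call (just a sketch; torch.zeros(..., device=device) would work equally well):

def init_hidden(self, batch_size, device):
    weight = next(self.parameters()).data
    # new_zeros allocates a fresh zero tensor (same dtype as weight) instead of zeroing an existing one inplace
    return weight.new_zeros(self.num_layers, batch_size, self.hidden_size).to(device)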

Inplace operations act on the tensor directly without creating a new result tensor and have a trailing underscore.
This code snippet shows the usage of inplace operations:

import torch

x = torch.zeros(10)
x[0] = 1.
x.sigmoid_()
print(x)

As you can see, I don't need to assign the result of x.sigmoid_() to a new variable, as it applies the sigmoid directly on x.
However, since intermediate activations are often needed to calculate the gradients, inplace operations might create errors during the backward call and should then be removed.
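
Here is a small, hypothetical example of how such an error can be triggered: sigmoid's backward needs the output it produced, so changing that output inplace before calling backward raises the same kind of version error:

import torch

a = torch.randn(4, requires_grad=True)
b = torch.sigmoid(a)   # sigmoid's backward needs its output b
b.add_(1.)             # the inplace op bumps b's version counter
b.sum().backward()     # RuntimeError: one of the variables needed for gradient
                       # computation has been modified by an inplace operation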

EDIT: Seems to have been solved here.

Hi!!! You again. You really helped me a lot.
Actually, when I try to avoid this error, it trains nothing: the hidden states stay the same, so the output (the predicted string in this example) is always the same.
What I did before was
output, hidden = model(target[i], hidden.detach()) and loss.backward().
But somehow, when I do both, I get the first word of the corpus repeated seq_len times as output. I changed the code a little bit, but I really can't find the error.
I know what inplace operations look like, but I swear I didn't use any of them.
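
For reference, the detach-based loop I described above looks roughly like this (a simplified sketch; model, criterion and optimizer are the ones from the full script below):

hidden = model.init_hidden()

for i in range(len(target)):
    # detach the hidden state so the graph does not reach back into earlier steps
    output, hidden = model(target[i], hidden.detach())

    (seq, bat, inp) = output.size()
    loss = criterion(output.reshape(seq, inp, bat), label[i].argmax(2))

    optimizer.zero_grad()
    loss.backward()    # no retain_graph needed once the hidden state is detached
    optimizer.step()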

from data_loader import *
import torch
import torch.nn as nn
from GRU import GRU
import sys
import matplotlib.pyplot as plt
import time
import math

device = None

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("cuda is available!")
    torch.backends.cudnn.benchmark = True
    print('Memory Usage:')
    print('Max Alloc:', round(torch.cuda.max_memory_allocated(0)/1024**3, 1), 'GB')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')
    print('cuDNN:    ', torch.backends.cudnn.version())

else:
    device = torch.device("cpu")


# corpus file path
data_path = "../data/Korean/processed/lyrics/tokenized.pkl"

# word_to_idx path
dictionary_path = "../data/Korean/processed/lyrics/dict.pkl"

word_to_idx = load_data(dictionary_path)
data = load_data(data_path)


# a dictionary that maps an index to word
idx_to_word = {v: k for k, v in word_to_idx.items()}

input_size = len(word_to_idx)

hidden_size = 512

output_size = len(word_to_idx)

num_layers = 1

batch_size = 1

learning_rate = 0.01

model = GRU(input_size, hidden_size, output_size, batch_size, device, num_layers).to(device)

criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

n_iter = 1000

now_epoch = 0

seq_len = 5

def make_batch(docs):
    # slide a seq_len-long window over each document and build one-hot tensors
    # of shape (seq_len, batch_size, input_size); stop as soon as any document runs out of words
    target = []
    now_word = 0
    flag = True

    while flag:
        flag = False
        one_hot_vector = torch.zeros(seq_len, batch_size, input_size).to(device)
        for i, doc in enumerate(docs):
            for j in range(0, seq_len):
                try:
                    word = doc[now_word + j]
                    one_hot_vector[j][i][word_to_idx[word]] = 1
                    flag = True
                except IndexError:
                    return target

        target.append(one_hot_vector)
        now_word += 1

    return target


def time_since(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def print_string(string, j):
    # write the argmax word of each timestep for batch element j
    for k in range(0, seq_len):
        expected = torch.argmax(string[k][j])
        sys.stdout.write(idx_to_word[expected.item()] + " ")


def train(target, label):
    # initializing hidden state
    hidden = model.init_hidden()

    for i in range(len(target)):
        output, hidden = model(target[i], hidden)

        # reshape for loss function
        (seq, bat, inp) = output.size()
        output = output.reshape(seq, inp, bat)
        print(output.size(), label[i].argmax(2).size())
        
        # loss
        loss = criterion(output, label[i].argmax(2)).to(device)

        # backprop
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()

        for j in range(0, batch_size):
            print_string(target[i], j)
            sys.stdout.write(" -> ")
            print_string(output, j)     # generated string
            sys.stdout.write(" / ")
            print_string(label[i], j)
            sys.stdout.write("\n")

    return output, loss.item()


losses = []
cur_loss = 0

print_every = 100
plot_every = 100
start = time.time()

for iter in range(1, n_iter + 1):
    now_epoch = 0

    while now_epoch + batch_size <= len(data):

        target = make_batch(data[now_epoch:now_epoch + batch_size])

        # each label window is the next input window; the last one is the final window
        # shifted by one step and padded with a zero vector
        label = target[1:] + [torch.cat((target[-1][1:seq_len], torch.zeros(1, batch_size, input_size).to(device)), 0)]

        output, loss = train(target, label)
        cur_loss += loss

        now_epoch += batch_size


    if iter % print_every == 0:
        sys.stdout.write("%d %d%% (%s) %.4f\n\n" % (iter, iter/n_iter*100, time_since(start), loss))

    if iter % plot_every == 0:
        losses.append(cur_loss/plot_every)
        cur_loss = 0


plt.figure()
plt.plot(losses)
plt.show()

torch.save(model, "gru.pt")