Hi! I’m running into an error with optimizer.step() in an LSTM I’m trying to implement, where the traceback says this:
Traceback (most recent call last):
File "pipeline_baseline.py", line 259, in <module>
optimizer.step()
File "C:\Users\Mustafa\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\autograd\grad_mode.py", line 26, in decorate_context
return func(*args, **kwargs)
File "C:\Users\Mustafa\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\optim\sgd.py", line 106, in step
buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
RuntimeError: set_indices_and_values_unsafe is not allowed on a Tensor created from .data or .detach().
If your intent is to change the metadata of a Tensor (such as sizes / strides / storage / storage_offset)
without autograd tracking the change, remove the .data / .detach() call and wrap the change in a `with torch.no_grad():` block.
For example, change:
x.data.set_(y)
to:
with torch.no_grad():
x.set_(y)
When I googled the error about set_indices_and_values_unsafe, I couldn’t find much documentation other than this link, and I wasn’t sure how it relates to the issue I’m facing. I’m just trying to get this LSTM working at a very basic level and am not worried about fine-tuning yet. Does anyone know what the issue here might be?
Here’s my code:
# Train one model per target label and score it on the dev and test sets
# after every epoch.
import torch  # harmless if already imported; this excerpt does not show the file's import block

for target_label in labels:
    # Checkpoint path for this label (not used within this excerpt;
    # presumably consumed by code that follows).
    model_file = os.path.join(model_dir, 'checkpoint_{}.mdl'.format(target_label))
    model = models[target_label]
    optimizer = optimizers[target_label]

    # TODO: combine init_dataset() and shuffle_dataset()
    dev_set.init_dataset(target_label)
    test_set.init_dataset(target_label)
    (dev_tids, dev_tokens, dev_labels, dev_lens) = dev_set.get_dataset(
        max_seq_len, volatile=True, gpu=use_gpu)
    (test_tids, test_tokens, test_labels, test_lens) = test_set.get_dataset(
        max_seq_len, volatile=True, gpu=use_gpu)

    best_dev_fscore = 0.0
    best_test_scores = None

    for epoch in range(max_epoch):
        epoch_start_time = current_time()
        epoch_loss = 0.0

        # Evaluation at the end of the previous epoch switched the model to
        # eval mode, so training mode must be restored every epoch.
        model.train()
        train_set.shuffle_dataset(target_label, balance=True)
        batch_num = train_set.batch_num(batch_size)

        for batch_idx in range(batch_num):
            optimizer.zero_grad()
            (batch_tids, batch_tokens, batch_labels, batch_lens) = train_set.get_batch(
                batch_size, gpu=use_gpu)
            # Call the modules rather than .forward() so registered hooks run.
            model_output = model(batch_tokens, batch_lens)
            loss = loss_func(model_output, batch_labels)
            loss.backward()
            # NOTE(review): the RuntimeError quoted above is raised inside
            # SGD's momentum-buffer update (buf.mul_(momentum).add_(...)).
            # set_indices_and_values_unsafe is a sparse-tensor operation, so
            # presumably some parameter receives a sparse gradient (e.g. an
            # nn.Embedding built with sparse=True) -- confirm where
            # word_embedding and the optimizers are constructed. Dense
            # gradients, momentum=0, or a sparse-aware optimizer would avoid it.
            optimizer.step()
            # Running mean of the batch losses for this epoch.
            epoch_loss += 1.0 / batch_num * float(loss)

        epoch_elapsed_time = current_time() - epoch_start_time

        # Evaluate the current model on dev and test sets. eval() disables
        # nn.Dropout layers and no_grad() avoids building autograd graphs,
        # so the scores are deterministic and evaluation uses less memory.
        model.eval()
        with torch.no_grad():
            dev_preds = model(dev_tokens, dev_lens)
            dev_scores = _calc_scores(dev_preds, dev_labels)
            test_preds = model(test_tokens, test_lens)
            test_scores = _calc_scores(test_preds, test_labels)
class LSTM(nn.LSTM):
    """``nn.LSTM`` that consumes padded batches together with their lengths.

    On construction every weight matrix is re-initialized orthogonally and the
    second quarter of every bias vector is filled with ``forget_bias``.
    ``forward`` packs the padded input, runs the recurrent layers, and pads
    the output again.

    Attributes:
        forget_bias: value written into the second quarter of each bias vector.
        output_size: ``hidden_size``, doubled when ``bidirectional`` is true.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, bias=True, batch_first=False, dropout=0, bidirectional=False, forget_bias=0):
        """Build the underlying ``nn.LSTM`` and apply the custom initialization.

        All arguments except ``forget_bias`` are forwarded unchanged to
        ``nn.LSTM``.
        """
        super().__init__(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bias=bias,
            batch_first=batch_first,
            dropout=dropout,
            bidirectional=bidirectional,
        )
        self.forget_bias = forget_bias
        self.output_size = hidden_size * (2 if bidirectional else 1)
        self.initialize()

    def initialize(self):
        """Re-initialize weights orthogonally and set part of each bias."""
        for n, p in self.named_parameters():
            if 'weight' in n:
                # In-place initializer; the un-suffixed ``I.orthogonal`` is a
                # deprecated alias of it.
                I.orthogonal_(p)
            elif 'bias' in n:
                bias_size = p.size(0)
                # Second quarter of the bias vector (the forget-gate slice in
                # nn.LSTM's input/forget/cell/output layout). ``.data`` keeps
                # the in-place fill out of autograd tracking.
                p.data[bias_size // 4:bias_size // 2].fill_(self.forget_bias)

    def forward(self, inputs, lens, hx=None):
        """Run the LSTM over a padded, batch-first batch.

        Args:
            inputs: padded input; always packed with ``batch_first=True``,
                regardless of the ``batch_first`` constructor argument.
            lens: per-sequence lengths, as a tensor or a plain sequence of
                ints. They no longer need to be sorted in decreasing order.
            hx: optional initial state, passed through to ``nn.LSTM.forward``.

        Returns:
            ``(outputs, h)``: the re-padded batch-first outputs and the final
            state returned by ``nn.LSTM.forward``.
        """
        lengths = lens.tolist() if hasattr(lens, 'tolist') else list(lens)
        # enforce_sorted=False lets PyTorch sort the batch internally and
        # restore the caller's order in both the outputs and the final state.
        inputs_packed = R.pack_padded_sequence(
            inputs, lengths, batch_first=True, enforce_sorted=False)
        outputs, h = super().forward(inputs_packed, hx)
        outputs, _ = R.pad_packed_sequence(outputs, batch_first=True)
        return outputs, h
class Classifier(nn.Module):
    """Embedding -> LSTM -> stack of linear layers.

    The final hidden state of the LSTM is fed through ``linears``; dropout is
    applied after the embedding, after the LSTM, and between linear layers.
    All dropout is active only in training mode.
    """

    def __init__(self, word_embedding, lstm, linears, embed_dropout_prob=.5, lstm_dropout_prob=.5, gpu=False):
        """Store the sub-modules and build the dropout layers.

        Args:
            word_embedding: module called on ``tokens`` to produce embeddings.
            lstm: module called as ``lstm(embeddings, lens)`` that returns
                ``(outputs, (last_hidden, last_cell))``.
            linears: sequence of modules applied in order to the last hidden
                state.
            embed_dropout_prob: dropout probability after the embedding.
            lstm_dropout_prob: dropout probability on the last hidden state.
            gpu: stored as ``self.gpu``; not used inside this class.
        """
        super().__init__()
        self.word_embedding = word_embedding
        self.lstm = lstm
        self.linears = nn.ModuleList(linears)
        self.linear_num = len(linears)
        self.embed_dropout = nn.Dropout(p=embed_dropout_prob)
        self.lstm_dropout = nn.Dropout(p=lstm_dropout_prob)
        self.gpu = gpu

    def forward(self, tokens, lens):
        """Return the output of the last linear layer for a batch.

        Args:
            tokens: batch of token ids passed to ``word_embedding``.
            lens: sequence lengths passed to ``lstm`` alongside the embeddings.
        """
        # embedding lookup
        tokens_embed = self.word_embedding(tokens)
        tokens_embed = self.embed_dropout(tokens_embed)

        # lstm layer
        _lstm_outputs, (last_hidden, _last_cell) = self.lstm(tokens_embed, lens)
        # Drops the leading axis only when it has size 1.
        # NOTE(review): presumably assumes a single-layer, unidirectional
        # LSTM; otherwise the leading axis is kept -- confirm against callers.
        last_hidden = last_hidden.squeeze(0)
        last_hidden = self.lstm_dropout(last_hidden)

        # linear layers
        linear_input = last_hidden
        for layer_idx, linear in enumerate(self.linears):
            linear_input = linear(linear_input)
            if layer_idx != self.linear_num - 1:
                # F.dropout defaults to training=True, so the module's mode
                # must be passed explicitly for eval() to switch it off.
                linear_input = F.dropout(linear_input, p=.2, training=self.training)
        return linear_input
The best I can figure out is that, since the error is raised in optimizer.step(), I’m probably not setting up the optimizer correctly. How do I go about debugging this?