Hi,
Indeed, the only way to get something similar with earlier releases is to use autograd.grad() and then populate the .grad fields manually with the gradients it returns.
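For reference, here is a minimal sketch of that workaround (the model and data below are made up purely for illustration):

import torch

model = torch.nn.Linear(4, 2)   # hypothetical model
x = torch.randn(8, 4)
loss = model(x).sum()

params = list(model.parameters())
grads = torch.autograd.grad(loss, params)  # returns the grads, leaves .grad untouched
for p, g in zip(params, grads):
    p.grad = g  # populate .grad manually; optimizer.step() can then use it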
I ran into the same problem: when I pass two inputs through the backbone, the error appears, but with only one input it disappears. May I ask how you solved this?
This is my code:
inputs = inputs.cuda(cfg['GPU'], non_blocking=True)
labels = labels.cuda(cfg['GPU'], non_blocking=True)
inputs_ = inputs_.cuda(cfg['GPU'], non_blocking=True)
labels_ = labels_.cuda(cfg['GPU'], non_blocking=True)
features = backbone(inputs)
features_ = backbone(inputs_)
outputs = head(features, labels)
outputs_ = head(features_, labels_)
lossx = loss(outputs, labels) + loss(outputs_, labels_)
optimizer.zero_grad()
lossx.backward()
optimizer.step()
Thanks! It worked in my code.
But I have a question about this: isn't there any effect from action_loss on value_optimizer? With my limited knowledge, I don't understand why the computational graph of "value_loss → action_loss → value_optimizer → action_optimizer" gives each optimizer the correct update. Could anyone explain this briefly? Thanks.
Thanks for all the wonderful discussions.
I just wanted to confirm that the following two solutions (Solution 1 and Solution 2 below) would give the same, correct optimization results (PyTorch 1.9.0).
In such a pipeline:
G = Model1()
D = Model2()
optim1 = optim.Adam(G.parameters())
optim2 = optim.Adam(D.parameters())
recons, z = G(input)
loss1 = loss_func1(recons)
diff = D(z)
loss2 = loss_func2(diff)
loss3 = loss_func3(diff)
loss_G = loss1 + loss2  # we don't want to update D's parameters here
loss_D = loss3
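# Solution 1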
optim1.zero_grad()
loss_G.backward(retain_graph=True)
optim2.zero_grad()
loss_D.backward()
optim1.step()
optim2.step()
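# Solution 2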
optim1.zero_grad()
loss_G.backward(retain_graph=True, inputs=list(G.parameters()))
optim1.step()
optim2.zero_grad()
loss_D.backward(inputs=list(D.parameters()))
optim2.step()
Both of the solutions come from the previous solutions. Thanks again.
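As a quick sanity check of the second pattern (toy modules here, just for illustration): backward(inputs=...) only accumulates gradients into the listed leaves, so the other model's .grad fields stay untouched.

import torch

G = torch.nn.Linear(3, 3)
D = torch.nn.Linear(3, 1)
z = G(torch.randn(2, 3))
loss_G = z.pow(2).mean() + D(z).mean()
loss_G.backward(inputs=list(G.parameters()))  # only G's .grad fields are filled
print([p.grad is None for p in D.parameters()])  # -> [True, True]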
I am facing the same issue and have been stuck for a day.
Here is my code:
class RNN(nn.Module):
    def __init__(self, input_size, output_size, hidden_size=64):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.xh = nn.Linear(self.input_size, self.hidden_size, bias=False)
        self.hh = nn.Linear(self.hidden_size, self.hidden_size)
        self.hy = nn.Linear(self.hidden_size, self.output_size)
        self.h = torch.zeros(self.hidden_size, requires_grad=True)
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)
        self.sigmoid = nn.Sigmoid()

    def rnn_cell(self, x_t):
        first_h = self.hh(self.h)
        second_x = self.xh(x_t)
        act = second_x + first_h
        self.h = self.tanh(act)
        updated_c = self.sigmoid(self.hy(self.h))
        return updated_c

    def forward(self, inp):
        return self.rnn_cell(inp)
Here is the training code:
def train(train_x, valid_x, lr, epochs, hidden_units, net='RNN'):
    for step, (data, label) in enumerate(train_x):
        inputs = np.array(data)
        break
    if net == 'RNN':
        model = RNN(inputs.shape[1], 1, hidden_units)
    elif net == 'LSTM':
        h = torch.zeros(hidden_units).requires_grad_()
        c = torch.zeros(hidden_units).requires_grad_()
        model = LSTM(inputs.shape[1], 1, hidden_units)
    elif net == 'GRU':
        St_1 = torch.zeros(hidden_units).requires_grad_()
        model = GRUModel(inputs.shape[1], 1, hidden_units)
    model.to(device)
    train_loss, val_loss = [], []
    train_accuracy, val_accuracy = [], []
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    criterion = nn.BCELoss()
    for ep in range(epochs):
        running_loss, correct = 0, 0
        for i, (data, label) in enumerate(train_x):
            data, label = Variable(data), Variable(label)
            data, label = data.to(device), label.to(device)
            optimizer.zero_grad()
            if net == 'RNN':
                net_out = model(data)
            elif net == 'LSTM':
                net_out, h, c = model(data, h, c)
            elif net == 'GRU':
                net_out, St_1 = model(data, St_1)
            label = torch.reshape(label, (label.shape[0], 1))
            net_out = torch.reshape(net_out, (label.shape[0], 1))
            label = label.float()
            loss = criterion(net_out, label)
            loss.backward(retain_graph=True, inputs=list(model.parameters()))
            optimizer.step()
            running_loss += loss.item()
            # pred = torch.argmax(net_out, axis=1)  # get the index of the max log-probability
            # actual = torch.argmax(label, axis=1)
            out = (net_out > 0.5).float()
            correct += out.eq(label).sum()
        print(running_loss)
        print("Epoch:", ep)
        print(correct.item())
        print("Training Accuracy:", 100. * correct.item() / len(train_x.dataset))
        print("Train Loss:", running_loss / len(train_x.dataset))
        train_loss.append(running_loss / len(train_x.dataset))
        train_accuracy.append(correct / len(train_x.dataset))
        # test_loss = 0
        # correct = 0
        # with torch.no_grad():
        #     for batch_idx, (data, target) in enumerate(valid_x):
        #         data, target = Variable(data), Variable(target)
        #         data, target = data.to(device), target.to(device)
        #         # data = data.view(-1, 784)
        #         if net == 'RNN':
        #             net_out, _ = model(data, h)
        #         elif net == 'LSTM':
        #             net_out, _, _ = model(data, h, c)
        #         elif net == 'GRU':
        #             net_out, _ = model(data, St_1)
        #         net_out = torch.reshape(net_out, (net_out.shape[0],))
        #         # sum up batch loss
        #         target = target.float()
        #         test_loss += criterion(net_out, target).item()
        #         # pred = torch.argmax(net_out, axis=1)  # get the index of the max log-probability
        #         # actual = torch.argmax(label, axis=1)
        #         out = (net_out > 0.5).float()
        #         correct += out.eq(target).sum()
        #     val_loss.append(test_loss / len(valid_x.dataset))
        #     val_accuracy.append(correct / len(valid_x.dataset))
        #     print("Validation Accuracy:", 100. * correct.item() / len(valid_x.dataset))
        #     print("Validation Loss:", test_loss / len(valid_x.dataset))
        #     print("----------------------------------------------------------")
    return model, train_loss, train_accuracy, val_loss, val_accuracy
Here is the full error output:
/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/torch/autograd/__init__.py:147: UserWarning: Error detected in MmBackward. Traceback of forward call that caused the error:
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/traitlets/config/application.py", line 845, in launch_instance
app.start()
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 619, in start
self.io_loop.start()
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 199, in start
self.asyncio_loop.run_forever()
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
self._run_once()
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
handle._run()
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/asyncio/events.py", line 81, in _run
self._context.run(self._callback, *self._args)
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/tornado/ioloop.py", line 688, in <lambda>
lambda f: self._run_callback(functools.partial(callback, future))
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/tornado/ioloop.py", line 741, in _run_callback
ret = callback()
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/tornado/gen.py", line 814, in inner
self.ctx_run(self.run)
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/tornado/gen.py", line 775, in run
yielded = self.gen.send(value)
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 358, in process_one
yield gen.maybe_future(dispatch(*args))
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/tornado/gen.py", line 234, in wrapper
yielded = ctx_run(next, result)
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 261, in dispatch_shell
yield gen.maybe_future(handler(stream, idents, msg))
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/tornado/gen.py", line 234, in wrapper
yielded = ctx_run(next, result)
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 536, in execute_request
self.do_execute(
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/tornado/gen.py", line 234, in wrapper
yielded = ctx_run(next, result)
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 302, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/ipykernel/zmqshell.py", line 539, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2898, in run_cell
result = self._run_cell(
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2944, in _run_cell
return runner(coro)
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
coro.send(None)
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3169, in run_cell_async
has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3361, in run_ast_nodes
if (await self.run_code(code, result, async_=asy)):
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3441, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-61-f6fbdf7371e3>", line 2, in <module>
net, train_loss, train_accuracy, val_loss, val_accuracy = train(train_loader, valid_loader, lr=0.0001, epochs=10, hidden_units=64, net='RNN')
File "<ipython-input-60-3bb8b3924f63>", line 35, in train
net_out= model(data)
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "<ipython-input-59-cdbb099f3af3>", line 39, in forward
return self.rnn_cell(inp)
File "<ipython-input-59-cdbb099f3af3>", line 25, in rnn_cell
first_h = self.hh(self.h)
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/torch/nn/modules/linear.py", line 96, in forward
return F.linear(input, self.weight, self.bias)
File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/torch/nn/functional.py", line 1847, in linear
return torch._C._nn.linear(input, weight, bias)
(Triggered internally at /Users/distiller/project/conda/conda-bld/pytorch_1623459044803/work/torch/csrc/autograd/python_anomaly_mode.cpp:104.)
Variable._execution_engine.run_backward(
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-61-f6fbdf7371e3> in <module>
1 torch.autograd.set_detect_anomaly(True)
----> 2 net, train_loss, train_accuracy, val_loss, val_accuracy = train(train_loader, valid_loader, lr=0.0001, epochs=10, hidden_units=64, net='RNN')
<ipython-input-60-3bb8b3924f63> in train(train_x, valid_x, lr, epochs, hidden_units, net)
44 label = label.float()
45 loss = criterion(net_out, label)
---> 46 loss.backward(retain_graph=True, inputs=list(model.parameters()))
47 optimizer.step()
48
~/anaconda3/envs/torch_env/lib/python3.8/site-packages/torch/_tensor.py in backward(self, gradient, retain_graph, create_graph, inputs)
253 create_graph=create_graph,
254 inputs=inputs)
--> 255 torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
256
257 def register_hook(self, hook):
~/anaconda3/envs/torch_env/lib/python3.8/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
145 retain_graph = create_graph
146
--> 147 Variable._execution_engine.run_backward(
148 tensors, grad_tensors_, retain_graph, create_graph, inputs,
149 allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [64, 64]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
Can you please help me, @ptrblck @albanD? I'll be very thankful to all of you.
I guess the error is raised since you are not detaching the hidden state, are retaining the graph, and are updating the parameters in each iteration. This would cause the intermediate forward activations from the previous iterations to become "stale", and the gradient computation would fail.
I don't know what your exact use case is, but you might want to detach() the hidden and cell states in each iteration and remove retain_graph=True as well.
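A minimal sketch of what that could look like, reusing the names from the posted code (so this is an assumption about your training loop, not a drop-in fix):

for i, (data, label) in enumerate(train_x):
    model.h = model.h.detach()  # cut the link to the previous iteration's graph
    optimizer.zero_grad()
    net_out = model(data)
    loss = criterion(net_out.reshape(-1, 1), label.float().reshape(-1, 1))
    loss.backward()  # no retain_graph=True needed anymore
    optimizer.step()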
I solved the problem by removing inplace=True (on PyTorch 1.7).
Please, could you give me more details?
Where did you put inplace=True?
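For context, here is a small self-contained example of how an inplace activation can trigger this error (illustrative only, not taken from the code in this thread):

import torch
import torch.nn as nn

x = torch.randn(4, requires_grad=True)
y = torch.sigmoid(x)            # sigmoid saves its output for the backward pass
out = nn.ReLU(inplace=True)(y)  # overwrites y in place and bumps its version counter
# out.sum().backward()          # would raise the "modified by an inplace operation" error
out = nn.ReLU()(torch.sigmoid(x))  # the out-of-place variant is safe
out.sum().backward()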
Hey, has this issue been solved? I am having the same issue in a very similar scenario. If it is solved, could you please suggest the solution?
Thanks Jeff. It worked for me.
I'm getting this error at loss.backward(). I don't know what this error means. I'm using PyTorch 1.10.1.
Exception has occurred: RuntimeError
one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [48, 2048]], which is output 0 of ReluBackward0, is at version 1; expected version 0 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
File "/home/ma/Downloads/Thesis/main_top_down_baseline.py", line 45, in train
loss.backward().clone()
File "/home/ma/Downloads/Thesis/main_top_down_baseline.py", line 253, in main
train(model, train_loader, dev_loader, optimizer, scheduler, n_epoch, args.output_dir, encoder, args.gpuid, clip_norm, model_name, args.model_saving_name,
File "/home/ma/Downloads/Thesis/main_top_down_baseline.py", line 262, in <module>
main()
This is the model I'm using:
'''
This is the baseline model.
We directly use a top-down VQA-like mechanism for SR.
Modified bottom-up top-down code from https://github.com/hengyuan-hu/bottom-up-attention-vqa and
added normalization from https://github.com/yuzcccc/vqa-mfb
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
import sys

torch.autograd.set_detect_anomaly(True)
sys.path.append('/home/ma/Downloads/Thesis')
from lib.attention import Attention
from lib.classifier import SimpleClassifier
from lib.fc import FCNet
import torchvision as tv


class vgg16_modified(nn.Module):
    def __init__(self):
        super(vgg16_modified, self).__init__()
        vgg = tv.models.vgg16_bn(pretrained=True)
        self.vgg_features = vgg.features

    def forward(self, x):
        features = self.vgg_features(x)
        return features


class Top_Down_Baseline(nn.Module):
    def __init__(self, convnet, role_emb, verb_emb, query_composer, v_att, q_net, v_net, classifier, encoder, Dropout_C):
        super(Top_Down_Baseline, self).__init__()
        self.convnet = convnet
        self.role_emb = role_emb
        self.verb_emb = verb_emb
        self.query_composer = query_composer
        self.v_att = v_att
        self.q_net = q_net
        self.v_net = v_net
        self.classifier = classifier
        self.encoder = encoder
        self.Dropout_C = Dropout_C

    def forward(self, v_org, gt_verb):
        '''
        :param v_org: original image
        :param gt_verb: ground truth verb id
        :return: predicted role label logits
        '''
        img_features = self.convnet(v_org)
        batch_size, n_channel, conv_h, conv_w = img_features.size()
        img_org = img_features.view(batch_size, -1, conv_h * conv_w)
        v = img_org.permute(0, 2, 1)
        batch_size = v.size(0)
        role_idx = self.encoder.get_role_ids_batch(gt_verb)
        if torch.cuda.is_available():
            role_idx = role_idx.to(torch.device('cuda'))
        img = v
        img = img.expand(self.encoder.max_role_count, img.size(0), img.size(1), img.size(2))
        img = img.transpose(0, 1)
        img = img.contiguous().view(batch_size * self.encoder.max_role_count, -1, v.size(2))
        verb_embd = self.verb_emb(gt_verb)
        role_embd = self.role_emb(role_idx)
        verb_embed_expand = verb_embd.expand(self.encoder.max_role_count, verb_embd.size(0), verb_embd.size(1))
        verb_embed_expand = verb_embed_expand.transpose(0, 1)
        # query for image reasoning
        concat_query = torch.cat([verb_embed_expand, role_embd], -1)
        role_verb_embd = concat_query.contiguous().view(-1, role_embd.size(-1) * 2)
        q_emb = self.query_composer(role_verb_embd)
        att = self.v_att(img, q_emb)
        v_emb = (att * img).sum(1)
        v_repr = self.v_net(v_emb)
        q_repr = self.q_net(q_emb)
        mfb_iq_eltwise = torch.mul(q_repr, v_repr)
        mfb_iq_drop = self.Dropout_C(mfb_iq_eltwise)
        # normalization to avoid model convergence to unsatisfactory local minima
        mfb_iq_resh = mfb_iq_drop.view(batch_size * self.encoder.max_role_count, 1, -1, 1)
        # sum pooling can be more useful if there are multiple heads like original MFB.
        # we kept our head count at 1 for the final implementation, but experimented with multiple without considerable improvement.
        mfb_iq_sumpool = torch.sum(mfb_iq_resh, 3, keepdim=True)
        mfb_out = torch.squeeze(mfb_iq_sumpool)
        mfb_sign_sqrt = torch.sqrt(F.relu(mfb_out)) - torch.sqrt(F.relu(-mfb_out))
        mfb_l2 = F.normalize(mfb_sign_sqrt)
        out = mfb_l2
        logits = self.classifier(out)
        role_label_pred = logits.contiguous().view(v.size(0), self.encoder.max_role_count, -1)
        return role_label_pred

    def forward_hiddenrep(self, v_org, gt_verb):
        '''
        :param v_org: original image
        :param gt_verb: ground truth verb id
        :return: hidden representation which is the input to the classifier
        '''
        img_features = self.convnet(v_org)
        batch_size, n_channel, conv_h, conv_w = img_features.size()
        img_org = img_features.view(batch_size, -1, conv_h * conv_w)
        v = img_org.permute(0, 2, 1)
        batch_size = v.size(0)
        role_idx = self.encoder.get_role_ids_batch(gt_verb)
        if torch.cuda.is_available():
            role_idx = role_idx.to(torch.device('cuda'))
        img = v
        img = img.expand(self.encoder.max_role_count, img.size(0), img.size(1), img.size(2))
        img = img.transpose(0, 1)
        img = img.contiguous().view(batch_size * self.encoder.max_role_count, -1, v.size(2))
        verb_embd = self.verb_emb(gt_verb)
        role_embd = self.role_emb(role_idx)
        verb_embed_expand = verb_embd.expand(self.encoder.max_role_count, verb_embd.size(0), verb_embd.size(1))
        verb_embed_expand = verb_embed_expand.transpose(0, 1)
        concat_query = torch.cat([verb_embed_expand, role_embd], -1)
        role_verb_embd = concat_query.contiguous().view(-1, role_embd.size(-1) * 2)
        q_emb = self.query_composer(role_verb_embd)
        att = self.v_att(img, q_emb)
        v_emb = (att * img).sum(1)
        v_repr = self.v_net(v_emb)
        q_repr = self.q_net(q_emb)
        mfb_iq_eltwise = torch.mul(q_repr, v_repr)
        mfb_iq_drop = self.Dropout_C(mfb_iq_eltwise)
        mfb_iq_resh = mfb_iq_drop.view(batch_size * self.encoder.max_role_count, 1, -1, 1)  # N x 1 x 1000 x 5
        mfb_iq_sumpool = torch.sum(mfb_iq_resh, 3, keepdim=True)  # N x 1 x 1000 x 1
        mfb_out = torch.squeeze(mfb_iq_sumpool)  # N x 1000
        mfb_sign_sqrt = torch.sqrt(F.relu(mfb_out)) - torch.sqrt(F.relu(-mfb_out))
        mfb_l2 = F.normalize(mfb_sign_sqrt)
        out = mfb_l2
        return out

    def forward_agentplace_noverb(self, v_org, pred_verb):
        max_role_count = 2
        img_features = self.convnet(v_org)
        batch_size, n_channel, conv_h, conv_w = img_features.size()
        img_org = img_features.view(batch_size, -1, conv_h * conv_w)
        v = img_org.permute(0, 2, 1)
        batch_size = v.size(0)
        role_idx = self.encoder.get_agent_place_ids_batch(batch_size)
        if torch.cuda.is_available():
            role_idx = role_idx.to(torch.device('cuda'))
        img = v
        img = img.expand(max_role_count, img.size(0), img.size(1), img.size(2))
        img = img.transpose(0, 1)
        img = img.contiguous().view(batch_size * max_role_count, -1, v.size(2))
        # verb_embd = torch.sum(self.verb_emb.weight, 0)
        # verb_embd = verb_embd.expand(batch_size, verb_embd.size(-1))
        # verb_embd = torch.zeros(batch_size, 300).cuda()
        verb_embd = self.verb_emb(pred_verb)
        role_embd = self.role_emb(role_idx)
        verb_embed_expand = verb_embd.expand(max_role_count, verb_embd.size(0), verb_embd.size(1))
        verb_embed_expand = verb_embed_expand.transpose(0, 1)
        concat_query = torch.cat([verb_embed_expand, role_embd], -1)
        role_verb_embd = concat_query.contiguous().view(-1, role_embd.size(-1) * 2)
        q_emb = self.query_composer(role_verb_embd)
        att = self.v_att(img, q_emb)
        v_emb = (att * img).sum(1)
        v_repr = self.v_net(v_emb)
        q_repr = self.q_net(q_emb)
        mfb_iq_eltwise = torch.mul(q_repr, v_repr)
        mfb_iq_drop = self.Dropout_C(mfb_iq_eltwise)
        mfb_iq_resh = mfb_iq_drop.view(batch_size * max_role_count, 1, -1, 1)  # N x 1 x 1000 x 5
        mfb_iq_sumpool = torch.sum(mfb_iq_resh, 3, keepdim=True)  # N x 1 x 1000 x 1
        mfb_out = torch.squeeze(mfb_iq_sumpool)  # N x 1000
        mfb_sign_sqrt = torch.sqrt(F.relu(mfb_out)) - torch.sqrt(F.relu(-mfb_out))
        mfb_l2 = F.normalize(mfb_sign_sqrt)
        out = mfb_l2
        logits = self.classifier(out)
        role_label_pred = logits.contiguous().view(v.size(0), max_role_count, -1)
        role_label_rep = v_repr.contiguous().view(v.size(0), max_role_count, -1)
        return role_label_pred, role_label_rep

    def calculate_loss(self, gt_verbs, role_label_pred, gt_labels):
        batch_size = role_label_pred.size()[0]
        criterion = nn.CrossEntropyLoss(ignore_index=self.encoder.get_num_labels())
        gt_label_turned = gt_labels.transpose(1, 2).contiguous().view(batch_size * self.encoder.max_role_count * 3, -1)
        role_label_pred = role_label_pred.contiguous().view(batch_size * self.encoder.max_role_count, -1)
        role_label_pred = role_label_pred.expand(3, role_label_pred.size(0), role_label_pred.size(1))
        role_label_pred = role_label_pred.transpose(0, 1)
        role_label_pred = role_label_pred.contiguous().view(-1, role_label_pred.size(-1))
        loss = criterion(role_label_pred, gt_label_turned.squeeze(1)) * 3
        return loss


def build_top_down_baseline(n_roles, n_verbs, num_ans_classes, encoder):
    hidden_size = 1024
    word_embedding_size = 300
    img_embedding_size = 512
    covnet = vgg16_modified()
    role_emb = nn.Embedding(n_roles + 1, word_embedding_size, padding_idx=n_roles)
    verb_emb = nn.Embedding(n_verbs, word_embedding_size)
    query_composer = FCNet([word_embedding_size * 2, hidden_size])
    v_att = Attention(img_embedding_size, hidden_size, hidden_size)
    q_net = FCNet([hidden_size, hidden_size])
    v_net = FCNet([img_embedding_size, hidden_size])
    classifier = SimpleClassifier(
        hidden_size, 2 * hidden_size, num_ans_classes, 0.5)
    Dropout_C = nn.Dropout(0.1)
    return Top_Down_Baseline(covnet, role_emb, verb_emb, query_composer, v_att, q_net,
                             v_net, classifier, encoder, Dropout_C)
This is the training loop:
import torch
import json
import os
from utils import utils, imsitu_scorer, imsitu_loader, imsitu_encoder
from models import top_down_baseline


def train(model, train_loader, dev_loader, optimizer, scheduler, max_epoch, model_dir, encoder, gpu_mode, clip_norm, model_name, model_saving_name, eval_frequency=4000):
    model.train()
    train_loss = 0
    total_steps = 0
    print_freq = 400
    dev_score_list = []
    if gpu_mode > 0:
        ngpus = 2
        device_array = [i for i in range(0, ngpus)]
        pmodel = torch.nn.DataParallel(model, device_ids=device_array)
    else:
        pmodel = model
    top1 = imsitu_scorer.imsitu_scorer(encoder, 1, 3)
    top5 = imsitu_scorer.imsitu_scorer(encoder, 5, 3)
    for epoch in range(max_epoch):
        for i, (_, img, verb, labels) in enumerate(train_loader):
            total_steps += 1
            if gpu_mode >= 0:
                img = torch.autograd.Variable(img.cuda())
                verb = torch.autograd.Variable(verb.cuda())
                labels = torch.autograd.Variable(labels.cuda())
            else:
                img = torch.autograd.Variable(img)
                verb = torch.autograd.Variable(verb)
                labels = torch.autograd.Variable(labels)
            role_predict = pmodel(img, verb)
            loss = model.calculate_loss(verb, role_predict, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_norm)
            optimizer.step()
            optimizer.zero_grad()
            train_loss += loss.item()
            top1.add_point_noun(verb, role_predict, labels)
            top5.add_point_noun(verb, role_predict, labels)
            if total_steps % print_freq == 0:
                top1_a = top1.get_average_results_nouns()
                top5_a = top5.get_average_results_nouns()
                print("{},{},{}, {} , {}, loss = {:.2f}, avg loss = {:.2f}"
                      .format(total_steps - 1, epoch, i, utils.format_dict(top1_a, "{:.2f}", "1-"),
                              utils.format_dict(top5_a, "{:.2f}", "5-"), loss.item(),
                              train_loss / ((total_steps - 1) % eval_frequency)))
            if total_steps % eval_frequency == 0:
                top1, top5, val_loss = eval(model, dev_loader, encoder, gpu_mode)
                model.train()
                top1_avg = top1.get_average_results_nouns()
                top5_avg = top5.get_average_results_nouns()
                avg_score = top1_avg["verb"] + top1_avg["value"] + top1_avg["value-all"] + top5_avg["verb"] + \
                            top5_avg["value"] + top5_avg["value-all"] + top5_avg["value*"] + top5_avg["value-all*"]
                avg_score /= 8
                print('Dev {} average :{:.2f} {} {}'.format(total_steps - 1, avg_score * 100,
                                                            utils.format_dict(top1_avg, '{:.2f}', '1-'),
                                                            utils.format_dict(top5_avg, '{:.2f}', '5-')))
                dev_score_list.append(avg_score)
                max_score = max(dev_score_list)
                if max_score == dev_score_list[-1]:
                    torch.save(model.state_dict(), model_dir + "/{}_{}.model".format(model_name, model_saving_name))
                    print('New best model saved! {0}'.format(max_score))
                print('current train loss', train_loss)
                train_loss = 0
                top1 = imsitu_scorer.imsitu_scorer(encoder, 1, 3)
                top5 = imsitu_scorer.imsitu_scorer(encoder, 5, 3)
            del role_predict, loss, img, verb, labels
        print('Epoch ', epoch, ' completed!')
        scheduler.step()


def eval(model, dev_loader, encoder, gpu_mode, write_to_file=False):
    model.eval()
    print('evaluating model...')
    top1 = imsitu_scorer.imsitu_scorer(encoder, 1, 3, write_to_file)
    top5 = imsitu_scorer.imsitu_scorer(encoder, 5, 3)
    with torch.no_grad():
        for i, (img_id, img, verb, labels) in enumerate(dev_loader):
            print(img_id[0], encoder.verb2_role_dict[encoder.verb_list[verb[0]]])
            if gpu_mode >= 0:
                img = torch.autograd.Variable(img.cuda())
                verb = torch.autograd.Variable(verb.cuda())
                labels = torch.autograd.Variable(labels.cuda())
            else:
                img = torch.autograd.Variable(img)
                verb = torch.autograd.Variable(verb)
                labels = torch.autograd.Variable(labels)
            role_predict = model(img, verb)
            top1.add_point_noun(verb, role_predict, labels)
            top5.add_point_noun(verb, role_predict, labels)
            del role_predict, img, verb, labels
    return top1, top5, 0


def main():
    import argparse
    parser = argparse.ArgumentParser(description="imsitu VSRL. Training, evaluation and prediction.")
    parser.add_argument("--gpuid", default=0, help="put GPU id > -1 in GPU mode", type=int)
    parser.add_argument('--output_dir', type=str, default='./main-TDA', help='Location to output the model')
    parser.add_argument('--resume_training', action='store_true', help='Resume training from the model [resume_model]')
    parser.add_argument('--resume_model', type=str, default='', help='The model we resume')
    parser.add_argument('--evaluate', action='store_true', help='Only use the testing mode')
    parser.add_argument('--evaluate_visualize', action='store_true', help='Only use the testing mode to visualize')
    parser.add_argument('--evaluate_rare', action='store_true', help='Only use the testing mode')
    parser.add_argument('--test', action='store_true', help='Only use the testing mode')
    parser.add_argument('--dataset_folder', type=str, default='./imSitu', help='Location of annotations')
    parser.add_argument('--imgset_dir', type=str, default='./resized_256', help='Location of original images')
    parser.add_argument('--train_file', default="train_freq2000.json", type=str, help='train file name')
    parser.add_argument('--dev_file', default="dev_freq2000.json", type=str, help='dev file name')
    parser.add_argument('--test_file', default="test_freq2000.json", type=str, help='test file name')
    parser.add_argument('--model_saving_name', type=str, help='saving name of the output model')
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--model', type=str, default='top_down_baseline')
    parser.add_argument('--batch_size', type=int, default=8)
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--clip_norm', type=float, default=0.25)
    parser.add_argument('--num_workers', type=int, default=3)
    args = parser.parse_args()

    n_epoch = args.epochs
    batch_size = args.batch_size
    clip_norm = args.clip_norm
    n_worker = args.num_workers

    dataset_folder = args.dataset_folder
    imgset_folder = args.imgset_dir

    train_set = json.load(open(dataset_folder + '/' + args.train_file))
    encoder = imsitu_encoder.imsitu_encoder(train_set)
    train_set = imsitu_loader.imsitu_loader(imgset_folder, train_set, encoder, 'train', encoder.train_transform)

    constructor = 'build_%s' % args.model
    model = getattr(top_down_baseline, constructor)(encoder.get_num_roles(), encoder.get_num_verbs(), encoder.get_num_labels(), encoder)

    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=n_worker)

    dev_set = json.load(open(dataset_folder + '/' + args.dev_file))
    dev_set = imsitu_loader.imsitu_loader(imgset_folder, dev_set, encoder, 'val', encoder.dev_transform)
    dev_loader = torch.utils.data.DataLoader(dev_set, batch_size=batch_size, shuffle=True, num_workers=n_worker)

    test_set = json.load(open(dataset_folder + '/' + args.test_file))
    test_set = imsitu_loader.imsitu_loader(imgset_folder, test_set, encoder, 'test', encoder.dev_transform)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=True, num_workers=n_worker)

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    torch.manual_seed(args.seed)
    if args.gpuid >= 0:
        model.cuda()
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True

    if args.resume_training:
        print('Resume training from: {}'.format(args.resume_model))
        args.train_all = True
        if len(args.resume_model) == 0:
            raise Exception('[pretrained module] not specified')
        utils.load_net(args.resume_model, [model])
        optimizer = torch.optim.Adamax(model.parameters(), lr=1e-3)
        model_name = 'resume_all'
    else:
        print('Training from scratch.')
        model_name = 'train_full'
        utils.set_trainable(model, True)
        optimizer = torch.optim.Adamax([
            {'params': model.convnet.parameters(), 'lr': 5e-5},
            {'params': model.role_emb.parameters()},
            {'params': model.verb_emb.parameters()},
            {'params': model.query_composer.parameters()},
            {'params': model.v_att.parameters()},
            {'params': model.q_net.parameters()},
            {'params': model.v_net.parameters()},
            {'params': model.classifier.parameters()}
        ], lr=1e-3)

    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

    if args.evaluate:
        top1, top5, val_loss = eval(model, dev_loader, encoder, args.gpuid)
        top1_avg = top1.get_average_results_nouns()
        top5_avg = top5.get_average_results_nouns()
        avg_score = top1_avg["verb"] + top1_avg["value"] + top1_avg["value-all"] + top5_avg["verb"] + \
                    top5_avg["value"] + top5_avg["value-all"] + top5_avg["value*"] + top5_avg["value-all*"]
        avg_score /= 8
        print('Dev average :{:.2f} {} {}'.format(avg_score * 100,
                                                 utils.format_dict(top1_avg, '{:.2f}', '1-'),
                                                 utils.format_dict(top5_avg, '{:.2f}', '5-')))
    elif args.test:
        top1, top5, val_loss = eval(model, test_loader, encoder, args.gpuid)
        top1_avg = top1.get_average_results_nouns()
        top5_avg = top5.get_average_results_nouns()
        avg_score = top1_avg["verb"] + top1_avg["value"] + top1_avg["value-all"] + top5_avg["verb"] + \
                    top5_avg["value"] + top5_avg["value-all"] + top5_avg["value*"] + top5_avg["value-all*"]
        avg_score /= 8
        print('Test average :{:.2f} {} {}'.format(avg_score * 100,
                                                  utils.format_dict(top1_avg, '{:.2f}', '1-'),
                                                  utils.format_dict(top5_avg, '{:.2f}', '5-')))
    else:
        print('Model training started!')
        train(model, train_loader, dev_loader, optimizer, scheduler, n_epoch, args.output_dir, encoder, args.gpuid, clip_norm, model_name, args.model_saving_name)


if __name__ == "__main__":
    main()
The solution provided by Jeff is not working for me. @ptrblck @albanD, can you help me?
Could you reduce the code by removing unnecessary parts and post an executable code snippet to reproduce the issue, please?
Hi, I'm facing a similar issue, but I don't know what causes this error to arise.
This is my model:
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, kernel_size=1, stride=1, padding=0, num_res=2):
        if num_res % 2 != 0:
            raise ValueError(f'num_res={num_res} which is not even, num_res must be even')
        super(ResidualBlock, self).__init__()
        layers = []
        for _ in range(num_res):
            layers.append(nn.Conv2d(in_channels=in_channels,
                                    out_channels=in_channels, kernel_size=kernel_size, stride=stride, padding=padding))
            layers.append(nn.ReLU())
        self.seq = nn.Sequential(*layers)

    def forward(self, x):
        res = torch.clone(x)
        for i in range(len(self.seq)):
            x = self.seq[i].forward(x)
            # print(self.seq[i])
            if (i + 1) % 4 == 0:
                # print('res')
                x += res
                res = torch.clone(x)
        return x


class Resnet16(nn.Module):
    def __init__(self, in_channels=3):
        super(Resnet16, self).__init__()
        self.convnet = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            ResidualBlock(in_channels=64, kernel_size=3, stride=1, padding=1, num_res=4),
            nn.Conv2d(in_channels=64, out_channels=256, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            ResidualBlock(in_channels=256, kernel_size=3, stride=1, padding=1, num_res=4),
            nn.BatchNorm2d(256),
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            ResidualBlock(in_channels=512, kernel_size=3, stride=1, padding=1, num_res=4),
            nn.AvgPool2d(kernel_size=3),
        )
        self.fcnet = nn.Sequential(
            nn.Linear(2048, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 10)
        )

    def forward(self, x):
        x = self.convnet(x)
        bs = x.shape[0]
        x = x.view(bs, -1)
        x = self.fcnet(x)
        return x
And this is the part that sets up and runs the optimizer on my model:
from torch.optim import Adam

# model = VGG()
model = Resnet16(3)
optimizer_params = {
    'params': model.parameters(),
    'lr': 1e-3
}
optimizer = Adam(optimizer_params['params'], optimizer_params['lr'])
trainer_params = {
    'model': model,
    'train_loader': trainloader,
    'valid_loader': testloader,
    'criterion': nn.CrossEntropyLoss(),
    'optimizer': optimizer,
    'epochs': 10,
    'print_every': 2,
    'pth_filename': 'model.pth',
    'trainset_sz': len(trainset),
    'validset_sz': len(testset)
}

...

for data, labels in train_loader:
    # putting data on the working device
    data = data.to(device)
    labels = labels.to(device)
    # forward propagation
    output = model(data)
    loss = criterion(output, labels)
    # backward propagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # logging training loop results
    training_loss += loss.item()
Try to remove all inplace operations, such as x += res, and see if this helps.
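For example, a sketch of how ResidualBlock.forward could look with the out-of-place addition (same logic as the posted code, only the += replaced):

def forward(self, x):
    res = x
    for i, layer in enumerate(self.seq):
        x = layer(x)
        if (i + 1) % 4 == 0:
            x = x + res  # out-of-place; 'x += res' mutated a tensor autograd had saved
            res = x      # the new tensor can be kept directly, no clone() needed
    return x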
It worked, thanks so much!!
Hi guys, I'm facing a similar issue when I call backward(). I really need your help; big thanks in advance. @ptrblck @albanD
Code from the training process:
def train(rho_data, size, train_size, mine_net, optimizer, iteration, input_size, tau):
    criterion = nn.BCEWithLogitsLoss()
    diff_et = torch.tensor(0.0)
    data, test_p0, test_q0, label, train_index, marg_index = recons_data(rho_data, size, train_size)
    for i in range(iteration):
        batch_size = int(len(data) / 4)
        if input_size == 2:
            test_p = torch.FloatTensor(test_p0[:, [0, 2]])
            test_q = torch.FloatTensor(test_q0[:, [0, 2]])
        else:
            test_p = torch.FloatTensor(test_p0)
            test_q = torch.FloatTensor(test_q0)
        train_batch, index1, index2 = sample_batch(data, input_size,
                                                   batch_size=batch_size,
                                                   sample_mode='joint')
        label_batch = label[index1]
        train_batch = torch.autograd.Variable(torch.FloatTensor(train_batch), requires_grad=True)
        label_batch = torch.FloatTensor(label_batch)
        logit = mine_net(train_batch)[0]
        loss = criterion(logit.reshape(-1), label_batch)
        if i < iteration - 1:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        else:
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            optimizer.step()
            train_batch.grad.zero_()
            loss.backward()
            grads = train_batch.grad
        if i >= iteration - 101:
            prob_p = mine_net(test_p)[1]
            rn_est_p = prob_p / (1 - prob_p)
            finp_p = torch.log(torch.abs(rn_est_p))
            prob_q = mine_net(test_q)[1]
            rn_est_q = prob_q / (1 - prob_q)
            a = torch.abs(rn_est_q)
            clip = torch.max(torch.min(a, torch.exp(tau)), torch.exp(-tau))
            diff_et = diff_et + torch.max(torch.mean(finp_p) - torch.log(torch.mean(clip)), torch.tensor(0.0))
    return (diff_et / 100).detach().cpu().numpy(), grads, index1, train_index, marg_index
The error reported by the system:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [130, 1]], which is output 0 of AsStridedBackward0, is at version 1002; expected version 1001 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
Could you explain your use case and, in particular, why you are using retain_graph=True, as this usually yields these kinds of errors (and is often used as a workaround for another error)?
Hi ptrblck, many thanks for your help. I have solved the bug; it was in this block:

else:
    optimizer.zero_grad()
    loss.backward(retain_graph=True)
    optimizer.step()
    train_batch.grad.zero_()
    loss.backward()
    grads = train_batch.grad
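For anyone hitting the same thing, one possible way to avoid the double backward entirely (a sketch assuming the variable names from the code above): a single backward() already fills train_batch.grad, since it was created with requires_grad=True, and reading it before optimizer.step() avoids the stale-graph problem.

optimizer.zero_grad()
train_batch.grad = None  # clear any stale input grads
loss.backward()          # fills the parameter grads and train_batch.grad in one pass
grads = train_batch.grad.clone()
optimizer.step()         # stepping afterwards can no longer invalidate the saved graph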
Hi guys, I ran into the same problem with loss.backward(), as you can see here:
File "train.py", line 360, in train
    loss_adv.backward(retain_graph=True)
File "/usr/local/lib/python3.7/dist-packages/torch/_tensor.py", line 396, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/usr/local/lib/python3.7/dist-packages/torch/autograd/__init__.py", line 175, in backward
    allow_unreachable=True, accumulate_grad=True)  # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [512, 7]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
My code is:
Could you also check why retain_graph is used in your code?