Hello!

I was porting code from 0.3.1 to 0.4.1, but performance was lower than before. I got the same result when I tested with a simple script, shown below.

Is there a mistake in my code?

```
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import time
# ----------------Test Model----------------
class Test(nn.Module):
    """Small MLP benchmark model: 4 -> 128 -> 128 -> 2 with ReLU activations."""

    def __init__(self):
        super(Test, self).__init__()
        hidden = 128
        # Assemble the stack as a list, then splat into Sequential so the
        # submodule indices (layers.0 ... layers.4) match the usual layout.
        stack = [
            nn.Linear(4, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 2),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        # Delegate straight to the sequential stack.
        return self.layers(x)
# ------------------------------------------
# ----------------Make Model----------------
use_cuda = torch.cuda.is_available()


def _version_tuple(version):
    """Parse (major, minor) from a torch version string such as '0.3.1.post2'."""
    return tuple(int(part) for part in version.split('.')[:2])


# Bug fix: the original compared torch.__version__ with == against the exact
# strings '0.4.1' and '0.3.1.post2'; any other version (0.4.0, 1.x, ...) left
# `model` undefined and crashed at the optimizer line with NameError.
# Comparing (major, minor) keeps the original behavior for both versions and
# works for every other release as well.
if _version_tuple(torch.__version__) >= (0, 4):
    # 0.4+ API: modules/tensors are placed with .to(device).
    device = torch.device("cuda" if use_cuda else "cpu")
    model = Test().to(device)
    # criterion = nn.MSELoss()
else:
    # Pre-0.4 API: Variable still exists and .cuda() moves the module.
    import torch.autograd as autograd
    model = Test().cuda() if use_cuda else Test()
# -------------------------------------------
optimizer = optim.Adam(model.parameters())
# -------------------Test--------------------
# -------------------Test--------------------
# NOTE(review): this times a SINGLE backward/step, so it mostly measures
# one-off warm-up cost (CUDA context creation, cuDNN autotuning, allocator
# growth), not steady-state speed. Run a warm-up pass and average many
# iterations before comparing 0.3.1 against 0.4.1.
_major_minor = tuple(int(part) for part in torch.__version__.split('.')[:2])

if _major_minor >= (0, 4):
    # 0.4+ path: plain tensors carry autograd state; no Variable wrapper.
    q_value = model(torch.FloatTensor(np.random.rand(1, 4)).to(device))
    expected_q_value = torch.FloatTensor(np.random.rand(1, 2)).to(device)
else:
    # Pre-0.4 path: inputs must be wrapped in autograd.Variable.
    import torch.autograd as autograd  # re-import is harmless; keeps block self-contained
    a = torch.FloatTensor(np.random.rand(1, 4))
    b = torch.FloatTensor(np.random.rand(1, 2))
    if use_cuda:
        a, b = a.cuda(), b.cuda()
    q_value = model(autograd.Variable(a))
    expected_q_value = autograd.Variable(b)

print(q_value.requires_grad, expected_q_value.requires_grad)
# MSE computed by hand (same math as nn.MSELoss with default reduction).
loss = (q_value - expected_q_value).pow(2).mean()

# Bug fix: torch.cuda.synchronize() raises on CPU-only machines; guard it.
# The timing/optimizer code was also duplicated in both version branches —
# it is identical, so it now appears once.
if use_cuda:
    torch.cuda.synchronize()
s = time.time()
optimizer.zero_grad()
loss.backward()
optimizer.step()
if use_cuda:
    torch.cuda.synchronize()
print(time.time() - s)
```