I noticed a strange slowdown in the training phase when I accumulate the losses using .item() instead of .data[0] (note: I am testing this code on a Google Colab GPU). The network is a relatively simple CNN:

```
import torch
import time
from torch.autograd import Variable
import torchvision
from torchvision import transforms, datasets
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.optim as optim
#these are the per-channel mean and standard deviation of
#the CIFAR10 image database. We will use these to normalize each
#channel to zero mean and unit standard deviation.
mean_CIFAR10 = np.array([0.49139968, 0.48215841, 0.44653091])
std_CIFAR10 = np.array([0.24703223, 0.24348513, 0.26158784])
#this transformation is used to transform the images to 0 mean and 1 std.
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize(mean_CIFAR10, std_CIFAR10)])
#load the CIFAR10 training and test sets
training_set_CIFAR10 = datasets.CIFAR10(root='cifar10/',
                                        transform=transform,
                                        train=True,
                                        download=True)
test_set_CIFAR10 = datasets.CIFAR10(root='cifar10/',
                                    transform=transform,
                                    train=False,
                                    download=True)
print('Number of training examples:', len(training_set_CIFAR10))
print('Number of test examples:', len(test_set_CIFAR10))
#there are ten classes in the CIFAR10 database
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
#DataLoaders are used to iterate over the database images in batches
#rather than one by one, which is expensive in interpreted Python loops
training_loader_CIFAR10 = torch.utils.data.DataLoader(dataset=training_set_CIFAR10,
                                                      batch_size=512,
                                                      shuffle=True)
test_loader_CIFAR10 = torch.utils.data.DataLoader(dataset=test_set_CIFAR10,
                                                  batch_size=512,
                                                  shuffle=False)
#this function is used to test the accuracy of the model
#over the test set. The network cnn is defined later on in the code.
def test():
    print('Started evaluating test accuracy...')
    cnn.eval()
    #calculate the accuracy of our model over the whole test set in batches
    correct = 0
    for x, y in test_loader_CIFAR10:
        x, y = Variable(x).cuda(), y.cuda()
        h = cnn(x)
        pred = h.data.max(1)[1]
        correct += pred.eq(y).sum()
    return correct.item() / len(test_set_CIFAR10)
#Below we define the convolutional network class.
class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        #define the feature extraction layers
        self.conv1 = torch.nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.pool1 = nn.MaxPool2d(2, stride=2)
        self.conv2 = torch.nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.pool2 = nn.MaxPool2d(2, stride=2)
        self.conv3 = torch.nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool3 = nn.MaxPool2d(2, stride=2)
        self.conv4 = torch.nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.pool4 = nn.MaxPool2d(2, stride=2)
        #define the categorization layers
        self.full1 = nn.Linear(512, 512)
        self.full2 = nn.Linear(512, 256)
        self.full3 = nn.Linear(256, 10)
    #define the forward run for the input data x
    def forward(self, x):
        #convolutional feature extraction layers
        x = F.relu(self.conv1(x))
        x = self.pool1(x)
        x = F.relu(self.conv2(x))
        x = self.pool2(x)
        x = F.relu(self.conv3(x))
        x = self.pool3(x)
        x = F.relu(self.conv4(x))
        x = self.pool4(x)
        #fully connected classification layers
        x = x.view(-1, 512)
        x = F.relu(self.full1(x))
        x = F.relu(self.full2(x))
        x = self.full3(x) #no relu here since CrossEntropyLoss applies log-softmax itself
        return x
#this is the training function. cnn is the network defined later;
#cost_criterion and optimizer are passed in as arguments
def train(cycles, cost_criterion, cnn, optimizer):
    average_cost = 0 #accumulated cost over the training set
    acc = 0 #accuracy over the test set
    for e in range(cycles): #cycle through the database many times
        print('Cycle: ', e)
        cnn.train()
        loadt = 0
        cudat = 0
        forwardt = 0
        costt = 0
        stept = 0
        avcostt = 0
        #the following for loop cycles over the training set in batches
        #of batch_size=512 using the training_loader object
        s1 = time.clock()
        t1 = time.clock()
        for i, (x, y) in enumerate(training_loader_CIFAR10, 0):
            s2 = time.clock()
            loadt = loadt + s2 - s1
            #here x, y store data from the training set in batches
            x, y = Variable(x).cuda(), Variable(y).cuda()
            s3 = time.clock()
            cudat = cudat + s3 - s2
            h = cnn(x) #calculate the hypothesis over the batch
            s4 = time.clock()
            forwardt = forwardt + s4 - s3
            cost = cost_criterion(h, y) #calculate the cost of the results
            #print(type(cost))
            s5 = time.clock()
            costt = costt + s5 - s4
            optimizer.zero_grad() #set the gradients to 0
            cost.backward() #calculate derivatives wrt the parameters
            optimizer.step() #update the parameters
            s6 = time.clock()
            stept = stept + s6 - s5
            average_cost += cost.item() #add the batch cost to the running total
            s1 = time.clock()
            avcostt = avcostt + s1 - s6
        t2 = time.clock()
        print('total time %.2f loading time %.2f, cuda transfer time %.2f, forward time: %.2f, cost time %.2f, step time %.2f, average cost time %.2f' % (t2 - t1, loadt, cudat, forwardt, costt, stept, avcostt))
        average_cost = 0
cycles = 50 #number of cycles that the training runs over the database
cost_criterion = torch.nn.CrossEntropyLoss() #cost function
cnn = ConvNet().cuda() #build the initial network (on the GPU)
optimizer = optim.Adam(cnn.parameters(), lr=0.0001)
train(cycles, cost_criterion, cnn, optimizer)
torch.save(cnn.state_dict(), 'cnn_trained')
```

The slowdown happens when I accumulate the losses with

`average_cost += cost.item()`

in PyTorch 0.4 (the full code is shown above). The timing is as follows:

```
Cycle: 0
total time 16.31 loading time 10.85, cuda transfer time 0.11, forward time: 0.37, cost time 0.02, step time 0.80, average cost time 4.17
Cycle: 1
total time 16.32 loading time 10.84, cuda transfer time 0.11, forward time: 0.36, cost time 0.02, step time 0.80, average cost time 4.18
Cycle: 2
total time 16.32 loading time 10.84, cuda transfer time 0.11, forward time: 0.36, cost time 0.02, step time 0.80, average cost time 4.19
Cycle: 3
```

where total time is the time the network takes to optimize over the whole dataset once, and average cost time is the time spent in the accumulation line mentioned above. If I use .data[0] instead, I get

```
Cycle: 0
total time 12.11 loading time 10.80, cuda transfer time 0.11, forward time: 0.38, cost time 0.02, step time 0.80, average cost time 0.01
Cycle: 1
total time 12.05 loading time 10.75, cuda transfer time 0.11, forward time: 0.36, cost time 0.02, step time 0.80, average cost time 0.01
```

Am I making a mistake elsewhere that affects this operation?
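For reference, here is a minimal sketch of how one could time just the accumulation step in isolation (the dummy tensor `t` and the iteration count are made up for illustration; this is not the training code itself):

```
import time
import torch
from torch.autograd import Variable

t = Variable(torch.randn(1)).cuda() #a dummy one-element GPU tensor standing in for the loss

acc = 0.0
start = time.clock()
for _ in range(1000):
    acc += t.item()   #.item() copies the value back to the host as a Python float
print('item():  %.4f s' % (time.clock() - start))

acc = 0.0
start = time.clock()
for _ in range(1000):
    acc += t.data[0]  #.data[0] indexes the tensor without leaving the device
print('data[0]: %.4f s' % (time.clock() - start))
```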

Even weirder is the following. I tried the same code with a more complicated network (a residual network). It shows the same behaviour, but something funny happens: when I replace .item() with .data[0], the time for accumulating the cost decreases, yet the time for transferring the tensors to CUDA increases. The code is below:

```
import torch
import time
from torch.autograd import Variable
import torchvision
from torchvision import transforms, datasets
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.optim as optim
#these are the per-channel mean and standard deviation of
#the CIFAR10 image database. We will use these to normalize each
#channel to zero mean and unit standard deviation.
mean_CIFAR10 = np.array([0.49139968, 0.48215841, 0.44653091])
std_CIFAR10 = np.array([0.24703223, 0.24348513, 0.26158784])
#this transformation is used to transform the images to 0 mean and 1 std.
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize(mean_CIFAR10, std_CIFAR10)])
#load the CIFAR10 training and test sets
training_set_CIFAR10 = datasets.CIFAR10(root='cifar10/',
                                        transform=transform,
                                        train=True,
                                        download=True)
test_set_CIFAR10 = datasets.CIFAR10(root='cifar10/',
                                    transform=transform,
                                    train=False,
                                    download=True)
print('Number of training examples:', len(training_set_CIFAR10))
print('Number of test examples:', len(test_set_CIFAR10))
#there are ten classes in the CIFAR10 database
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
#DataLoaders are used to iterate over the database images in batches
#rather than one by one, which is expensive in interpreted Python loops
training_loader_CIFAR10 = torch.utils.data.DataLoader(dataset=training_set_CIFAR10,
                                                      batch_size=512,
                                                      shuffle=True)
test_loader_CIFAR10 = torch.utils.data.DataLoader(dataset=test_set_CIFAR10,
                                                  batch_size=512,
                                                  shuffle=False)
#this function is used to test the accuracy of the model
#over the test set. The network cnn is defined later on in the code.
def test():
    print('Started evaluating test accuracy...')
    cnn.eval()
    #calculate the accuracy of our model over the whole test set in batches
    correct = 0
    for x, y in test_loader_CIFAR10:
        x, y = Variable(x).cuda(), y.cuda()
        h = cnn(x)
        pred = h.data.max(1)[1]
        correct += pred.eq(y).sum()
    return correct.item() / len(test_set_CIFAR10)
#These are the two types of basic blocks in a residual network. The residual
#network in this code is built by concatenating several such blocks together.
#Basic blocks are of the form x -> D(x) + F(x), where D(x) is x downsampled
#to the same dimensions as F(x) by a single convolution, and F(x) is a
#collection of successive operations involving several convolutions and batchnorms.
class BasicResBlock1(nn.Module):
    def __init__(self, input, output, downsample, stride=1):
        super(BasicResBlock1, self).__init__()
        self.conv1 = torch.nn.Conv2d(input, output, kernel_size=3, stride=stride, padding=1, bias=False)
        self.batchNorm1 = torch.nn.BatchNorm2d(output)
        self.conv2 = torch.nn.Conv2d(output, output, kernel_size=3, padding=1, stride=1, bias=False)
        self.downsample = downsample #applied to the residual to downsample it
    def forward(self, x1):
        residual = self.downsample(x1)
        x2 = self.conv1(x1)
        x2 = self.batchNorm1(x2)
        x2 = F.relu(x2, inplace=True)
        x2 = self.conv2(x2)
        x2 += residual
        return x2
class BasicResBlock2(nn.Module):
    def __init__(self, input, output):
        super(BasicResBlock2, self).__init__()
        self.conv1 = torch.nn.Conv2d(input, output, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchNorm1 = torch.nn.BatchNorm2d(input)
        self.conv2 = torch.nn.Conv2d(output, output, kernel_size=3, padding=1, stride=1, bias=False)
        self.batchNorm2 = torch.nn.BatchNorm2d(output)
        self.batchNorm3 = torch.nn.BatchNorm2d(output)
    def forward(self, x1):
        residual = x1
        x2 = self.batchNorm1(x1)
        x2 = F.relu(x2, inplace=True)
        x2 = self.conv1(x2) #convolve the pre-activated input
        x2 = self.batchNorm2(x2)
        x2 = F.relu(x2, inplace=True)
        x2 = self.conv2(x2)
        x2 += residual
        x2 = self.batchNorm3(x2)
        x2 = F.relu(x2, inplace=True)
        return x2
#Below we define the residual network class
class ResNet(nn.Module):
    def __init__(self, width, number_of_blocks):
        super(ResNet, self).__init__()
        #these are the initial layers applied before the basic blocks
        self.conv1 = torch.nn.Conv2d(3, width, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchNorm1 = torch.nn.BatchNorm2d(width)
        self.relu1 = nn.ReLU(inplace=True)
        #each resLayer is formed by concatenating several basic blocks
        #together, with dimensions increasing between stages
        self.downsample1 = torch.nn.Conv2d(width, 2 * width, kernel_size=1, stride=1, bias=False)
        self.downsample2 = torch.nn.Conv2d(2 * width, 4 * width, kernel_size=1, stride=2, bias=False)
        self.downsample3 = torch.nn.Conv2d(4 * width, 8 * width, kernel_size=1, stride=2, bias=False)
        self.resLayer1 = []
        self.resLayer1.append(BasicResBlock1(width, 2 * width, self.downsample1, 1)) #stage1
        for x in range(0, number_of_blocks[0]):
            self.resLayer1.append(BasicResBlock2(2 * width, 2 * width))
        self.resLayer1 = nn.Sequential(*self.resLayer1)
        self.resLayer2 = []
        self.resLayer2.append(BasicResBlock1(2 * width, 4 * width, self.downsample2, 2)) #stage2
        for x in range(0, number_of_blocks[1]):
            self.resLayer2.append(BasicResBlock2(4 * width, 4 * width))
        self.resLayer2 = nn.Sequential(*self.resLayer2)
        self.resLayer3 = []
        self.resLayer3.append(BasicResBlock1(4 * width, 8 * width, self.downsample3, 2)) #stage3
        for x in range(0, number_of_blocks[2]):
            self.resLayer3.append(BasicResBlock2(8 * width, 8 * width))
        self.resLayer3 = nn.Sequential(*self.resLayer3)
        self.avgpool1 = torch.nn.AvgPool2d(8, stride=1)
        #define the final linear classifier layer
        self.full1 = nn.Linear(8 * width, 10)
        #weight initializations
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                torch.nn.init.kaiming_normal(m.weight, mode='fan_out')
            elif isinstance(m, nn.BatchNorm2d):
                torch.nn.init.constant(m.weight, 1)
                torch.nn.init.constant(m.bias, 0)
            elif isinstance(m, nn.Linear):
                torch.nn.init.kaiming_normal(m.weight, mode='fan_out')
                torch.nn.init.constant(m.bias, 0)
    #define the forward run for the input data x
    def forward(self, x):
        #initial layers before the basic blocks
        x = self.conv1(x)
        x = self.batchNorm1(x)
        x = self.relu1(x)
        #residual layers followed by average pooling
        x = self.resLayer1(x)
        x = self.resLayer2(x)
        x = self.resLayer3(x)
        x = self.avgpool1(x)
        #linear classifier layer (no activation here since
        #CrossEntropyLoss already incorporates log-softmax)
        x = x.view(x.size(0), -1)
        x = self.full1(x)
        return x
#this is the training function. cnn is the network defined later;
#cost_criterion and optimizer are passed in as arguments
def train(cycles, cost_criterion, cnn, optimizer):
    average_cost = 0 #accumulated cost over the training set
    acc = 0 #accuracy over the test set
    for e in range(cycles): #cycle through the database many times
        print('Cycle: ', e)
        cnn.train()
        loadt = 0
        cudat = 0
        forwardt = 0
        costt = 0
        stept = 0
        avcostt = 0
        #the following for loop cycles over the training set in batches
        #of batch_size=512 using the training_loader object
        s1 = time.clock()
        t1 = time.clock()
        for i, (x, y) in enumerate(training_loader_CIFAR10, 0):
            s2 = time.clock()
            loadt = loadt + s2 - s1
            #here x, y store data from the training set in batches
            x, y = Variable(x).cuda(), Variable(y).cuda()
            s3 = time.clock()
            cudat = cudat + s3 - s2
            h = cnn(x) #calculate the hypothesis over the batch
            s4 = time.clock()
            forwardt = forwardt + s4 - s3
            cost = cost_criterion(h, y) #calculate the cost of the results
            #print(type(cost))
            s5 = time.clock()
            costt = costt + s5 - s4
            optimizer.zero_grad() #set the gradients to 0
            cost.backward() #calculate derivatives wrt the parameters
            optimizer.step() #update the parameters
            s6 = time.clock()
            stept = stept + s6 - s5
            average_cost += cost.data[0] #add the batch cost to the running total
            s1 = time.clock()
            avcostt = avcostt + s1 - s6
        t2 = time.clock()
        print('total time %.2f loading time %.2f, cuda transfer time %.2f, forward time: %.2f, cost time %.2f, step time %.2f, average cost time %.2f' % (t2 - t1, loadt, cudat, forwardt, costt, stept, avcostt))
        average_cost = 0
cycles = 50 #number of cycles that the training runs over the database
cost_criterion = torch.nn.CrossEntropyLoss() #cost function
cnn = ResNet(16, [1, 1, 1]).cuda() #build the initial network (on the GPU)
optimizer = optim.Adam(cnn.parameters(), lr=0.0001)
train(cycles, cost_criterion, cnn, optimizer)
torch.save(cnn.state_dict(), 'cnn_trained')
```

In this case, if I use .item() I get

```
Cycle: 1
total time 51.80 loading time 10.91, cuda transfer time 0.10, forward time: 9.27, cost time 0.02, step time 2.63, average cost time 28.87
```

whereas if I use .data[0] I get

```
Cycle: 1
total time 41.51 loading time 10.99, cuda transfer time 18.51, forward time: 9.34, cost time 0.02, step time 2.65, average cost time 0.01
```

I am completely confused. I checked the type of the cost, thinking that maybe it is also a GPU tensor and I am messing something up somewhere, but it appears to be on the CPU.
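
For completeness, this is roughly the kind of check I mean (assuming the PyTorch 0.4 API; `cost` is the loss returned by cost_criterion in the training loop above):

```
#inspect the loss object right after it is computed
print(type(cost))        #<class 'torch.Tensor'> in PyTorch 0.4
print(cost.is_cuda)      #True if the tensor lives on the GPU
print(type(cost.item())) #.item() always returns a plain Python number
```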