I am trying to implement SqueezeNet and train it on the CIFAR-10 dataset. I have the code ready, but there seems to be a problem: my training-set accuracy never increases, even though the loss curve looks reasonable.
In SqueezeNet, the fire module requires us to concatenate the outputs of a 1x1 convolution and a 3x3 convolution; to achieve this I have used the torch.cat function. Below is the code for the fire module. I want to know whether it is right.
class fire(nn.Module):
    """SqueezeNet fire module.

    A 1x1 "squeeze" convolution followed by ReLU feeds two parallel
    "expand" convolutions (1x1 and 3x3 with padding 1). Their outputs are
    concatenated along dimension 1 and passed through a final ReLU, so the
    module emits ``2 * expand_planes`` channels. All convolutions use
    stride 1, and the 3x3 branch is padded by 1, so both branches produce
    the same spatial size.

    Args:
        inplanes: number of channels of the input feature map.
        squeeze_planes: number of output channels of the squeeze convolution.
        expand_planes: number of output channels of EACH expand convolution.
    """

    def __init__(self, inplanes, squeeze_planes, expand_planes):
        super(fire, self).__init__()
        # Squeeze stage.
        self.conv1 = nn.Conv2d(inplanes, squeeze_planes, kernel_size=1, stride=1)
        self.relu1 = nn.ReLU(inplace=True)
        # Expand stage: two parallel branches reading the squeezed features.
        self.conv2 = nn.Conv2d(squeeze_planes, expand_planes, kernel_size=1, stride=1)
        self.conv3 = nn.Conv2d(squeeze_planes, expand_planes, kernel_size=3, stride=1, padding=1)
        self.relu2 = nn.ReLU(inplace=True)

        # MSR initialization: zero-mean normal weights with
        # std = sqrt(2 / (kernel_h * kernel_w * out_channels)).
        # Biases are not touched here.
        conv_layers = [layer for layer in self.modules() if isinstance(layer, nn.Conv2d)]
        for conv in conv_layers:
            kernel_h, kernel_w = conv.kernel_size
            fan = kernel_h * kernel_w * conv.out_channels
            conv.weight.data.normal_(0, math.sqrt(2. / fan))

    def forward(self, x):
        """Return the ReLU of the channel-wise concatenation of both expand branches."""
        squeezed = self.relu1(self.conv1(x))
        expanded = torch.cat([self.conv2(squeezed), self.conv3(squeezed)], 1)
        return self.relu2(expanded)
Now I am trying to use the 55-epoch learning-rate schedule used by @Soumith in his imagenet-multiGPU code, but I am facing a weird issue: it gives me a segfault, whereas when I create the optimizer with a static learning rate it runs fine.
I think the learning rate itself was the issue; it must have been causing a division by zero somewhere. I changed the learning rate and now it seems to be working fine.
Is there a chance they are caused by not zeroing the gradient parameters?
The script that I used for training is:
def paramsforepoch(epoch):
    """Look up the learning rate and weight decay scheduled for ``epoch``.

    Args:
        epoch: epoch number; the schedule starts at epoch 1.

    Returns:
        A dict with keys 'learning_rate' and 'weight_decay' for the regime
        whose inclusive epoch range contains ``epoch``, or an empty dict
        when no regime matches (e.g. ``epoch`` < 1).
    """
    # Each row: (first_epoch, last_epoch, learning_rate, weight_decay),
    # with both epoch bounds inclusive.
    schedule = (
        (1, 18, 1e-3, 5e-4),
        (19, 29, 5e-3, 5e-4),
        (30, 43, 1e-3, 0),
        (44, 52, 5e-4, 0),
        (53, 1e8, 1e-4, 0),
    )
    for first_epoch, last_epoch, lr, wd in schedule:
        if first_epoch <= epoch <= last_epoch:
            return {'learning_rate': lr, 'weight_decay': wd}
    return {}
# Per-batch training losses, appended to by train() and plotted on ax1.
avg_loss = []
# fig1/ax1 hold the loss curve, fig2/ax2 the accuracy plot.
fig1, ax1 = plt.subplots()
fig2, ax2 = plt.subplots()
# train the model
# TODO: Compute training accuracy and test accuracy
# TODO: train it on some data and see if it overfits.
# TODO: train the data on final model
# Temporary optimizer; train() re-configures its learning rate and weight
# decay through adjustlrwd().
optimizer = optim.SGD(
    net.parameters(),
    lr=args.learning_rate,
    momentum=args.momentum,
    weight_decay=0.0005,
)
def adjustlrwd(p):
    """Apply a new learning rate and weight decay to the global ``optimizer``.

    Args:
        p: mapping with 'learning_rate' and 'weight_decay' entries.

    Raises:
        KeyError: if ``p`` lacks either entry.
    """
    # Edit the live ``optimizer.param_groups``. ``optimizer.state_dict()``
    # can return repacked copies of the groups, in which case writing to
    # them never reaches the optimizer and the schedule is silently ignored.
    for param_group in optimizer.param_groups:
        param_group['lr'] = p['learning_rate']
        param_group['weight_decay'] = p['weight_decay']
# train the network
def train(epoch):
    """Train ``net`` for one epoch on a capped prefix of ``train_loader``.

    Re-configures the global optimizer for ``epoch``, runs SGD steps on at
    most ``max_batches`` mini-batches, appends every batch loss to the
    global ``avg_loss`` list, and saves the loss and accuracy figures.

    Args:
        epoch: epoch number, used to select the learning-rate regime, in
            the progress log, and as the x position of the accuracy point.
    """
    # Debug cap: only the first ``max_batches`` mini-batches are used, to
    # check that the model can overfit a small subset of the data.
    max_batches = 100
    n_batches = min(max_batches, len(train_loader))

    # set the optimizer for this epoch
    # (the original ``epoch > 0 or epoch > 18 or ...`` chain reduces to this)
    if epoch > 0:
        p = paramsforepoch(epoch)
        # {:g} keeps small values such as 1e-4 visible; {:.3f} showed 0.000.
        print("Configuring optimizer with lr={:g} and weight_decay={:g}".format(p['learning_rate'], p['weight_decay']))
        adjustlrwd(p)
    ###########################################################################
    global avg_loss
    correct = 0
    seen = 0  # number of training samples actually processed this epoch
    net.train()
    for b_idx, (data, targets) in enumerate(train_loader):
        # trying to overfit a small data
        if b_idx == max_batches:
            break
        if args.cuda:
            # .cuda() returns a copy rather than moving the tensor in place,
            # so the results must be rebound.
            data, targets = data.cuda(), targets.cuda()
        # convert the data and targets into Variable form
        data, targets = Variable(data), Variable(targets)
        # train the network
        optimizer.zero_grad()
        scores = net(data)
        # NOTE(review): F.nll_loss expects log-probabilities; confirm that
        # net ends with a log_softmax.
        loss = F.nll_loss(scores, targets)
        # compute the accuracy
        pred = scores.data.max(1)[1]  # get the index of the max log-probability
        # view_as keeps the comparison element-wise whether or not max()
        # retains the reduced dimension.
        correct += pred.eq(targets.data.view_as(pred)).cpu().sum()
        seen += len(data)
        avg_loss.append(loss.data[0])
        loss.backward()
        optimizer.step()
        if b_idx % args.log_schedule == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, (b_idx+1) * len(data), len(train_loader.dataset),
                100. * (b_idx+1) / n_batches, loss.data[0]))
            # also plot the loss, it should go down exponentially at some point
            ax1.plot(avg_loss)
            fig1.savefig("Squeezenet_loss.jpg")
    # now that the epoch is completed plot the accuracy
    # Divide by the samples actually seen instead of a hard-coded 6400, so
    # the result is right for any batch size or a short final batch.
    accuracy = correct / float(seen) if seen else 0.0
    print("training accuracy ({:.2f}%)".format(100*accuracy))
    # A lone scalar plotted with the default line style is invisible; draw
    # it as a marker at x=epoch so successive epochs build up a curve.
    ax2.plot([epoch], [100*accuracy], 'o')
    fig2.savefig("Training-test-acc.jpg")