Trouble when trying to train a convnet on mnist

cdluminate · July 21, 2017, 7:35am

I wrote a script to train a convnet on MNIST, but I ran into some problems.
The MNIST dataset is from Kaggle (train.csv). The references I followed are
listed at the top of the script.

Problems:

(1) Line 46: without this line, the script raises RuntimeError: expected Double tensor (got Float tensor).
The tutorial and example code don’t do this…

    46	convnet = convnet.double() #RuntimeError: expected Double tensor (got Float tensor)

(2) Line 66: the script crashes if I don’t explicitly assign True to model.trainning. The tutorial and example code don’t do this…

    66	    model.trainning = True #AttributeError: 'MnistConvNet' object has no attribute 'trainning'

(3) The model does not seem to be learning (test accuracy is still 0.11 after 100 iterations), and I can’t figure out why. The test accuracy should rise after just a few iterations (the TF example code on Kaggle does). Is the code wrong?

Thanks in advance!

Complete script:

     1	# http://pytorch.org/tutorials/
     2	# http://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html
     3	# https://github.com/pytorch/examples/blob/master/mnist/main.py
     4	
     5	import sys
     6	import os
     7	
     8	os.putenv('OPENBLAS_NUM_THREADS', '4')
     9	
    10	import torch as th
    11	import torch.nn.functional as thnf
    12	import numpy as np
    13	import pandas as pd
    14	from sklearn.model_selection import train_test_split
    15	print('-> Using TH', th.__version__)
    16	
    17	### Read Train-Val data and split ###
    18	trainval = pd.read_csv("train.csv")
    19	trainval_images = trainval.iloc[:, 1:].div(255)
    20	trainval_labels = trainval.iloc[:, :1]
    21	train_images, val_images, train_labels, val_labels = train_test_split(
    22	        trainval_images, trainval_labels, train_size=0.8, random_state=0)
    23	print('-> train set shape', train_images.shape)
    24	print('-> val   set shape', val_images.shape)
    25	
    26	### Model ###
    27	class MnistConvNet(th.nn.Module):
    28	    def __init__(self):
    29	        super(MnistConvNet, self).__init__()
    30	        self.conv1 = th.nn.Conv2d(1, 10, kernel_size=5)
    31	        self.conv2 = th.nn.Conv2d(10, 20, kernel_size=5)
    32	        self.conv2_drop = th.nn.Dropout2d()
    33	        self.fc1 = th.nn.Linear(320, 50)
    34	        self.fc2 = th.nn.Linear(50, 10)
    35	    def forward(self, x):
    36	        x = thnf.relu(thnf.max_pool2d(self.conv1(x), 2))
    37	        x = thnf.relu(thnf.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
    38	        x = x.view(-1, 320)
    39	        x = thnf.relu(self.fc1(x))
    40	        x = thnf.dropout(x, training=self.trainning)
    41	        x = self.fc2(x)
    42	        return x
    43	        #return thnf.log_softmax(x)
    44	
    45	convnet = MnistConvNet()
    46	convnet = convnet.double() #RuntimeError: expected Double tensor (got Float tensor)
    47	crit = th.nn.CrossEntropyLoss()
    48	optimizer = th.optim.Adam(convnet.parameters(), lr=1e-2)
    49	
    50	### Train and Val ###
    51	
    52	def step_train(model, iteration):
    53	    i = iteration
    54	    batch_images = train_images.iloc[
    55	        (i*50)%33600:
    56	        (i+1)%672==0 and 33600 or ((i+1)*50)%33600].values
    57	    batch_labels = train_labels.iloc[
    58	        (i*50)%33600:
    59	        (i+1)%672==0 and 33600 or ((i+1)*50)%33600].values
    60	    batch_images = th.autograd.Variable(th.from_numpy(batch_images))
    61	    batch_labels = th.autograd.Variable(th.from_numpy(batch_labels))
    62	    batch_images = batch_images.resize(50, 1, 28, 28)
    63	    batch_labels = batch_labels.resize(50)
    64	
    65	    model.train()
    66	    model.trainning = True #AttributeError: 'MnistConvNet' object has no attribute 'trainning'
    67	    optimizer.zero_grad()
    68	    output = model(batch_images)
    69	    loss = crit(output, batch_labels)
    70	    loss.backward()
    71	    optimizer.step()
    72	
    73	    pred = output.data.max(1)[1]
    74	    correct = pred.eq(batch_labels.data).cpu().sum()
    75	    print('-> Iter {:5d} |'.format(i), 'loss {:7.3f} |'.format(loss.data[0]),
    76	            'Bch Train Accu {:.2f}'.format(correct / output.size()[1]))
    77	
    78	def step_eval(model, iteration):
    79	    correct = 0
    80	    total   = val_images.shape[0]
    81	    lossaccum = 0.
    82	    print('-> TEST @ {} |'.format(iteration), end='')
    83	    for i in range(0, val_images.shape[0], 50):
    84	        images = val_images.iloc[i:i+50].values
    85	        labels = val_labels.iloc[i:i+50].values
    86	        images = th.autograd.Variable(th.from_numpy(images))
    87	        labels = th.autograd.Variable(th.from_numpy(labels))
    88	        images = images.resize(50, 1, 28, 28)
    89	        labels = labels.resize(50)
    90	
    91	        model.eval()
    92	        model.trainning = False
    93	        output = model(images)
    94	        loss = thnf.nll_loss(output, labels)
    95	        lossaccum += loss.data[0]
    96	        pred = output.data.max(1)[1]
    97	        correct += pred.eq(labels.data).cpu().sum()
    98	        print('.', end=''); sys.stdout.flush()
    99	    print('|')
   100	    print('-> TEST @ {} |'.format(iteration),
   101	            'Loss {:7.3f} |'.format(lossaccum),
   102	            'Accu {:.2f}|'.format(correct / total))
   103	    exit()
   104	
   105	for i in range(20000):
   106	    step_train(convnet, i)
   107	    if i>0 and i%100==0:
   108	        step_eval(convnet, i)

Part of the output:

-> Using TH 0.1.12
-> train set shape (33600, 784)
-> val   set shape (8400, 784)
-> Iter     0 | loss   2.286 | Bch Train Accu 0.60
-> Iter     1 | loss   2.340 | Bch Train Accu 0.70