I am implementing LeCun’s architecture on his Recursive Networks work, however I don’t see any improvement during the training on the network.
Architecture
- The first layer applies a set of M kernels V of size 8 × 8 × 3. We use “same” convolution
- Then max-pooled within each feature map with non-overlapping 4×4 windows
- All L succeeding hidden layers apply a set of M kernels Wl m of size 3 × 3 ×M (“same” convolution)
- The final hidden layer is subject to pixel-wise L2 normalization and passed into a logistic classifier
Instantiation
- The first-layer kernels Vm are initialized from a zero-mean Gaussian distribution with std=0.1
- The kernels of the higher layers Wl m are initialized to the identity transformation (Kronecker delta)
Implementation
class Conv_Net(nn.Module):
def __init__(self, name:str, layers:int, filters:int=32):
super(Conv_Net, self).__init__()
self.name = name
self.L = layers
self.M = filters
self.act = nn.ReLU(inplace=True)
self.V = nn.Conv2d(3,self.M,8, stride=1, padding=3) # Out: 32x32xM -- Maybe padding = 4?
self.P = nn.MaxPool2d(4, stride=4, padding=2) # Out: 8x8xM -- Check also padding here
self.W = nn.ModuleList( # Out: 8x8xM -- Check also padding here)]
[nn.Conv2d(32,32,3, padding=1) for _ in range(self.L)])
self.fc = nn.Linear(8*8*self.M, 10)
# Custom Initialization
for name, param in self.named_parameters():
# Vm has zero mean and 0.1 std (0.01 var)
if 'V' in name and 'weight' in name:
param.data.normal_(0, 0.1)
# W are initialized with the identity matrix - Kronecker delta
elif 'W' in name and 'weight' in name:
param.data.fill_(0)
for i in range(32):
param.data[i][0][0][0].fill_(1)
def forward(self, x):
x = self.act(self.V(x))
x = self.P(x)
for w in self.W:
x = self.act(w(x))
x = x.view(x.size(0), -1)
return self.fc(x)
Note: I have seen that nn.CrossEntropy
expects the logits and not the probabilities, but I have included the Softmax
in the forward pass because it was the criterion was doing the log
of inf
and then returning nan
for the loss --> After initializing W to Identity Matrix.
[EDIT]: Removed this to avoid confusion.
Training loop
net = Conv_Net('net', layers=16, filters=32)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=1e-3, momentum=0.9, weight_decay=1e-5)
def train(epoch):
net.train()
print('\nEpoch: %d' % epoch)
total = 0
correct = 0
global results
for batch_idx, (inputs, targets) in enumerate(trainloader):
optimizer.zero_grad()
inputs, targets = inputs.to(device), targets.to(device)
outputs = net(inputs)
loss = criterion(outputs, targets)
loss.backward()
optimizer.step()
_, predicted = outputs.max(1)
total += targets.size(0)
correct += predicted.eq(targets).sum().item()
accuracy = 100.*correct/total
results.append_loss(loss.item(), 'train')
results.append_accy(accuracy, 'train')
print('Train :: Loss: {} | Accy: {}'.format(round(loss.item(),2), round(accuracy,2)))
def test(epoch):
net.eval()
total = 0
correct = 0
global results
global best_acc
with torch.no_grad():
for batch_idx, (inputs, targets) in enumerate(testloader):
inputs, targets = inputs.to(device), targets.to(device)
outputs = net(inputs)
loss = criterion(outputs, targets)
_, predicted = outputs.max(1)
total += targets.size(0)
correct += predicted.eq(targets).sum().item()
# Save checkpoint.
acc = 100.*correct/total
results.append_loss(loss.item(), 'valid')
results.append_accy(acc, 'valid')
print('Valid :: Loss: {} | Accy: {}'.format(round(loss.item(),2), round(acc,2)))
if acc > best_acc:
print('Saving..')
state = {
'net': net.state_dict(),
'acc': acc,
'epoch': epoch,
}
if not os.path.isdir('checkpoint'):
os.mkdir('checkpoint')
torch.save(state, './checkpoint/ckpt.t7')
best_acc = acc
Sample Output
Epoch: 0
Train :: Loss: 2.37 | Accy: 10.0
Valid :: Loss: 2.37 | Accy: 10.0
Saving..
Time: 12.98
Epoch: 1
Train :: Loss: 2.29 | Accy: 10.0
Valid :: Loss: 2.37 | Accy: 10.0
Time: 9.18
Epoch: 2
Train :: Loss: 2.42 | Accy: 10.0
Valid :: Loss: 2.37 | Accy: 10.0
Time: 9.56
Epoch: 3
Train :: Loss: 2.31 | Accy: 10.0
Valid :: Loss: 2.37 | Accy: 10.0
Time: 8.95
Epoch: 4
Train :: Loss: 2.35 | Accy: 10.0
Valid :: Loss: 2.37 | Accy: 10.0
Time: 8.89
Epoch: 5
Train :: Loss: 2.4 | Accy: 10.0
Valid :: Loss: 2.37 | Accy: 10.0
Time: 8.99
Epoch: 6
Train :: Loss: 2.39 | Accy: 10.0
Valid :: Loss: 2.37 | Accy: 10.0
Time: 8.93
Epoch: 7
Train :: Loss: 2.34 | Accy: 10.0
Valid :: Loss: 2.37 | Accy: 10.0
Time: 9.04
Epoch: 8
Train :: Loss: 2.32 | Accy: 10.0
Valid :: Loss: 2.37 | Accy: 10.0
Time: 9.14
Epoch: 9
Train :: Loss: 2.36 | Accy: 10.0
Valid :: Loss: 2.37 | Accy: 10.0
Time: 9.05
Epoch: 10
Train :: Loss: 2.37 | Accy: 10.0
Valid :: Loss: 2.37 | Accy: 10.0
Time: 9.66
Epoch: 11
Train :: Loss: 2.39 | Accy: 10.0
Valid :: Loss: 2.37 | Accy: 10.0
Time: 9.33