I’m trying out PyTorch by comparing a model with its Theano equivalent, and I noticed:

- The PyTorch model runs much faster than the Theano version: ~0.9 s vs. ~1.6 s per batch.
- The PyTorch model, however, converges much more slowly: the error rate is ~83% vs. ~62% after 250 batches.

The PyTorch model is defined as:

```
import torch as tr
import torch.nn as nn
import torch.nn.functional as F


class model_5_1(nn.Module):
    def __init__(self, batchsize=None, channel=1, imsize=(256, 256),
                 Nclass=16, kernel_size=3, border_mode='same'):
        super(model_5_1, self).__init__()
        self.batchsize = batchsize
        self.channel = channel
        self.imsize = imsize
        self.Nclass = Nclass
        self.kernel_size = kernel_size
        self.border_mode = border_mode
        # 'same' padding for an odd kernel size
        if border_mode == 'same':
            pad = kernel_size // 2
        else:
            pad = 0
        self.conv0 = nn.Conv2d(channel, 32, kernel_size, padding=pad)
        self.conv1 = nn.Conv2d(32, 64, kernel_size, padding=pad)
        # conv2 takes the concatenation of conv0 and conv1 outputs (32 + 64 = 96 channels)
        self.conv2 = nn.Conv2d(96, 128, kernel_size, padding=pad)
        self.conv3 = nn.Conv2d(128, 128, kernel_size, padding=pad)
        self.conv4 = nn.Conv2d(128, 128, kernel_size, padding=pad)
        # conv5 takes the concatenation of conv3 and conv4 outputs (128 + 128 = 256 channels)
        self.conv5 = nn.Conv2d(256, 512, kernel_size, padding=pad)
        self.bn0 = nn.BatchNorm2d(128)
        self.bn1 = nn.BatchNorm2d(256)
        self.bn2 = nn.BatchNorm2d(512)
        # two stacked bidirectional LSTMs over the flattened spatial sequence
        self.rnn0 = nn.LSTM(input_size=512, hidden_size=100,
                            batch_first=True, bidirectional=True)
        self.rnn1 = nn.LSTM(input_size=200, hidden_size=100,
                            batch_first=True, bidirectional=True)
        self.fc0 = nn.Linear(200, Nclass)

    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv0(x)), (2, 2))
        x1 = F.relu(self.conv1(x))
        x = tr.cat((x, x1), 1)   # skip connection: concatenate along channel dim
        x = F.max_pool2d(x, (2, 2))
        x = F.relu(self.bn0(self.conv2(x)))
        x = F.max_pool2d(F.relu(self.conv3(x)), (2, 2))
        x1 = F.relu(self.conv4(x))
        x = tr.cat((x, x1), 1)   # second skip connection
        x = self.bn1(x)
        x = F.max_pool2d(F.relu(self.conv5(x)), (4, 4))
        x = self.bn2(x)
        # flatten spatial dims into a sequence: (N, C, H, W) -> (N, H*W, C)
        x = x.view(x.size(0), x.size(1), x.size(2) * x.size(3))
        x = tr.transpose(x, 1, 2)
        x, _ = self.rnn0(x)
        x = F.tanh(x)
        x, _ = self.rnn1(x)
        x = x[:, -1, :]          # keep only the last timestep
        x = F.tanh(x)
        x = F.softmax(self.fc0(x))
        return x
```

I use `cross_entropy` for the loss and Adadelta as the optimizer. The Theano version uses `categorical_crossentropy` for the loss and Adadelta with the same parameters.
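For reference, my PyTorch training step looks roughly like this (a minimal sketch; `train_loader` is a placeholder for my data pipeline, and I’m using Adadelta’s default hyperparameters here rather than my exact settings):

```
import torch as tr
import torch.nn.functional as F

model = model_5_1()
optimizer = tr.optim.Adadelta(model.parameters())  # default Adadelta hyperparameters

for inputs, targets in train_loader:           # train_loader is a placeholder
    optimizer.zero_grad()
    outputs = model(inputs)                    # note: forward() already applies softmax
    loss = F.cross_entropy(outputs, targets)   # targets are class indices
    loss.backward()
    optimizer.step()
```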

Does anyone have any thoughts on this problem?