Recently, I decided to move to PyTorch from TensorFlow. So I try to convert my TensorFlow CNN code to PyTorch and create a training loop accordingly to the tutorial. However, the performance of the model drop dramatically with training accuracy only <50% for 15 epochs while TensorFlow achieve almost 93%.
I would like to confirm if my implementation was correct
This is my model definition in both TensorFlow and PyTorch
TensorFlow \w Fit method
def TSNet(Chans = 19, Samples = 1000, output_classes = 4):
input1 = Input(shape = (Chans, Samples, 1),name='input')
block1 = Conv2D(32, (1, 20), strides = (1,1),input_shape = (Chans, Samples, 1), name='conv1')(input1)
block1 = Conv2D(32, (3, 1),strides = (1,1),name='conv2')(block1)
block1 = BatchNormalization(name='bn1')(block1)
block1 = Activation('relu',name='relu1')(block1)
block1 = MaxPooling2D((1, 5), strides = (1,2),name='pool1')(block1)
block2 = Conv2D(64, (1, 20),name='conv3')(block1)
block2 = BatchNormalization(name='bn2')(block2)
block2 = Activation('relu',name='relu2')(block2)
block2 = MaxPooling2D((1, 7), strides = (1,2),name='pool2')(block2)
block3 = Conv2D(64, (1,10),strides = (1,1), name='conv4')(block2)
block3 = BatchNormalization(name='bn3')(block3)
block3 = Activation('relu',name='relu3')(block3)
block3 = MaxPooling2D((1, 5), strides = (1,2),name='pool3')(block3)
flatten = Flatten(name='f1')(block3)
dp1 = Dropout(0.5,name='drop1')(flatten)
dense1 = Dense(32,name='ful1')(dp1)
nrm = BatchNormalization(name='bn7')(dense1)
act = Activation('relu',name='relu7')(nrm)
dp2 = Dropout(0.3, name='drop3')(act)
dense2 = Dense(output_classes,name='out')(dp2)
out = Activation('softmax',name='soft1_out')(dense2)
return Model(inputs=input1, outputs=out)
# TRAIN LOOP------------------------------------------------------------------------------------
x,y = load() #load 540 instances with shape 19x1000 data, label was one-hot
x = np.expand_dims(x, 3) #change shape to 540x19x1000x1
classifier_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
classifier_loss = tf.keras.losses.CategoricalCrossentropy()
clf = TSNet(Chans = 19, Samples = 1000, output_classes = 4)
clf.compile(optimizer = classifier_optimizer, loss= classifier_loss , metrics=['accuracy'])
clf.fit(x, y,
batch_size=8,
epochs = 15,
verbose = True)
PyTorch
class PTNet(torch.nn.Module):
def __init__(self, channels = 19, samples = 1000.0, outputs = 4):
super(PTNet, self).__init__()
#Sequential 1
self.seq1 = torch.nn.Sequential(
torch.nn.Conv2d(in_channels = 1, out_channels = 32,
kernel_size = (1,20), stride = 1),
torch.nn.Conv2d(in_channels = 32, out_channels = 32, kernel_size = (3,1), stride = 1),
torch.nn.BatchNorm2d(32, eps = 0.001, momentum = 0.99),
torch.nn.ReLU(),
torch.nn.MaxPool2d(kernel_size = [1,5], stride = [1,2])
)
#calculate output of sample at each opeartion
samples = (samples - 20) + 1
samples = (samples - 1) + 1
channels = channels - 3 + 1
samples = floor((samples - 5) / 2 + 1)
#Sequential 2
self.seq2 = torch.nn.Sequential(
torch.nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size = (1,20)),
torch.nn.BatchNorm2d(64, eps = 0.001, momentum = 0.99), #tensorflow duplicate
torch.nn.ReLU(),
torch.nn.MaxPool2d(kernel_size = [1,7], stride = [1,2])
)
samples = (samples - 20) + 1
samples = floor((samples- 7) / 2 + 1)
#Sequential 3
self.seq3 = torch.nn.Sequential(
torch.nn.Conv2d(in_channels = 64, out_channels = 64, kernel_size = (1,10)),
torch.nn.BatchNorm2d(64, eps = 0.001, momentum = 0.99),
torch.nn.ReLU(),
torch.nn.MaxPool2d(kernel_size = [1,5], stride = [1,2])
)
samples = (samples - 10) + 1
samples = floor((samples - 5) / 2 + 1)
#fully connect
self.fc = torch.nn.Sequential(
torch.nn.Dropout1d(p = 0.5),
#cal from (initla_ch - 2) * last_layer_fmap * final datapoint from conv
torch.nn.Linear(in_features = channels * 64 * samples, out_features = 32),
torch.nn.BatchNorm1d(32, eps = 0.001, momentum = 0.99),
torch.nn.ReLU(),
torch.nn.Dropout1d(p = 0.3),
torch.nn.Linear(in_features = 32, out_features = outputs),
torch.nn.Softmax()
)
def forward(self, x):
x = self.seq1(x)
x = self.seq2(x)
x = self.seq3(x)
x = torch.flatten(x, start_dim = 1, end_dim = -1)
x = self.fc(x)
return x
# TRAIN LOOP------------------------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PTNet()
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 0.0001)
criterion = torch.nn.CrossEntropyLoss()
criterion.to(device)
x,y = load() #load 540 instances with shape 19x1000 data, label was one-hot
x = np.expand_dims(x, 1) #Change shape to 540 x 1 x 19 x 1000 for PyTorch
train_dat = torch.utils.data.TensorDataset(torch.tensor(x).to(device), torch.tensor(y).to(device))
train_loader = torch.utils.data.DataLoader(train_dat, batch_size = 8, shuffle = True)
for epoch in range(15):
res = 0
#train loop
for i, data in enumerate(train_loader ,0):
inputs, labels = data
inputs, labels = inputs.to(device), labels.to(device)
#zero grad
optimizer.zero_grad()
# forward + backward + optimize
outputs = model(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
_, predicted = outputs.max(1)
_, gt = labels.max(1)
res += predicted.eq(gt).sum().item()
print(f'epoch {epoch + 1} | acc {100 * res / (67*8)}')
The noticeable different is that input shape of TensorFlow is 540 x 19 x 1000 x 1 as last dimension is channel while PyTorch is 540 x 1 x 19 x 1000
I print out model summary and output shape from each layers are almost identical (except for batch normalize layer). I guess is that there will be a performance different but not that high (40% diff).