I am porting a net from Keras to Pytorch, however the training in Pytorch doesn’t seem to learn anything. The net is trying to learn sub-word embeddings in a phrase and classify it among three classes.
The architecture (along with output shapes) of the net is as follows :
- Embedding Layer (batch x 200 x 128)
- Convolution Layer (batch x 198 x 128)
- Max Pooling Layer (batch x 66 x 128)
- LSTM Layer (batch x 66 x 128)
- LSTM Layer (batch x 128)
- Dense Layer (batch x 3)
I have tried with different learning rates, changing the loss from Cross Entropy to Softmax+NLL, however the Pytorch model doesn’t seem to learn.
The input X
is a file of shape (num_samples, 200)
with each value in the range [0,26], and y
are the labels in [0,1,2] for each sample.
Here is the Keras code :
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Embedding, LSTM, GRU, Convolution1D, MaxPooling1D
from tensorflow.keras.utils import to_categorical
#Format conversion for y
y_train = to_categorical(y, 3)
#Train & Validation data splitting
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
embedLayer = Embedding(input_dim=27, output_dim=128, input_length=200)
convLayer = Convolution1D(filters=128, kernel_size=3, activation='relu')
poolLayer = MaxPooling1D(pool_size=3)
lstmLayer1 = LSTM(units=128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)
lstmLayer2 = LSTM(units=128, dropout=0.2, recurrent_dropout=0.2, return_sequences=False)
denseLayer = Dense(units=3)
model = Sequential()
model.add(embedLayer)
model.add(convLayer)
model.add(poolLayer)
model.add(lstmLayer1)
model.add(lstmLayer2)
model.add(denseLayer)
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
optimizer='adamax',
metrics=['accuracy'])
# Training data
model.fit(x=X_train, y=y_train,
batch_size=128,
epochs=50,
validation_data=(X_valid, y_valid))
Here is the PyTorch code :
class RNNModel(nn.Module):
def __init__(self):
super(RNNModel, self).__init__()
# Layers
self.embeddingLayer = nn.Embedding(num_embeddings=27, embedding_dim=128)
self.convLayer = nn.Conv1d(in_channels=128, out_channels=128,
kernel_size=3)
self.lstmLayer1 = nn.LSTM(input_size=128, hidden_size=128)
self.lstmLayer2 = nn.LSTM(input_size=128, hidden_size=128)
self.denseLayer = nn.Linear(in_features=128, out_features=3)
def forward(self, x):
# Forming embeddings
x = self.embeddingLayer(x)
# Convolution and pooling
x = x.view(-1, 128, 200)
x = F.relu(self.convLayer(x))
x = F.max_pool1d(x, kernel_size=3)
# LSTM layers
x = x.view(x.shape[2], x.shape[0], x.shape[1])
x, _ = self.lstmLayer1(x)
x = F.dropout(x, p=0.2)
_, (x, _) = self.lstmLayer2(x)
x = F.dropout(x, p=0.2)
# Dense layer
x = x.view(x.shape[1], x.shape[2])
x = self.denseLayer(x)
return x
device = torch.device('cuda')
model = RNNModel()
model = model.to(device)
lossFunc = nn.CrossEntropyLoss()
optimizer = torch.optim.Adamax(model.parameters())
# trainGen and devGen are generators for training and dev set respectively
for i in range(50):
for X_batch, y_batch in trainGen:
X_batch = X_batch.to(device)
y_batch = y_batch.to(device)
pred_batch = model(X_batch.long())
loss = lossFunc(pred_batch, y_batch.long())
optimizer.zero_grad()
loss.backward()
optimizer.step()
print("\nITERATION ", i+1, "\n Train Loss = ", loss.item())
with torch.no_grad():
for X_dev, y_dev in devGen:
X_dev = X_dev.to(device)
y_dev = y_dev.to(device)
pred_batch_dev = model(X_dev.long())
loss = lossFunc(pred_batch_dev, y_dev.long())
print(" Dev Loss = ", loss.item())
The Keras model does learn something because it brings up the accuracy from around 47% to 63%. However the Pytorch models keeps wavering around the initial loss only. What is the problem here?