import torch
import torch.nn as nn
import torch.optim as optim

class googleNet(nn.Module):
    def __init__(self, latent_dim=512):
        super(googleNet, self).__init__()
        self.model = torch.hub.load('pytorch/vision:v0.10.0', 'googlenet', pretrained=True)
        # freeze the pretrained parameters (trains faster and keeps the ImageNet weights)
        for params in self.model.parameters():
            params.requires_grad = False
        # replace the last fully connected layer
        self.model.fc = nn.Linear(self.model.fc.in_features, latent_dim)

    def forward(self, x):
        output = self.model(x)
        return output

class Lstm(nn.Module):
    def __init__(self, latent_dim=512, hidden_size=256, lstm_layers=2, bidirectional=True):
        super(Lstm, self).__init__()
        self.latent_dim = latent_dim
        self.hidden_size = hidden_size
        self.lstm_layers = lstm_layers
        self.bidirectional = bidirectional
        self.Lstm = nn.LSTM(self.latent_dim, hidden_size=self.hidden_size,
                            num_layers=self.lstm_layers, batch_first=True,
                            bidirectional=self.bidirectional)
        self.hidden_state = None

    def reset_hidden_state(self):
        self.hidden_state = None

    def forward(self, x):
        output, self.hidden_state = self.Lstm(x, self.hidden_state)
        return output

class ConvLstm(nn.Module):
    def __init__(self, google, lstm, n_class=10):
        super(ConvLstm, self).__init__()
        self.modela = google
        self.modelb = lstm
        self.output_layer = nn.Sequential(
            nn.Linear(2 * self.modelb.hidden_size if self.modelb.bidirectional else self.modelb.hidden_size, n_class),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        batch_size, timesteps, channel_x, h_x, w_x = x.shape
        # run the CNN on every frame, then regroup the frames back into sequences
        conv_input = x.view(batch_size * timesteps, channel_x, h_x, w_x)
        conv_output = self.modela(conv_input)
        lstm_input = conv_output.view(batch_size, timesteps, -1)
        lstm_output = self.modelb(lstm_input)
        lstm_output = lstm_output[:, -1, :]  # keep only the last timestep
        output = self.output_layer(lstm_output)
        return output
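The input to ConvLstm is a batch of frame sequences shaped (batch, timesteps, channels, height, width). A quick shape check with a dummy batch (the sizes below are placeholders, not my real data) looks roughly like this:

    # hypothetical smoke test: 4 clips of 16 RGB frames at 224x224
    dummy = torch.randn(4, 16, 3, 224, 224)
    model = ConvLstm(googleNet(), Lstm())
    with torch.no_grad():
        out = model(dummy)
    print(out.shape)  # torch.Size([4, 10]), one value per class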
Above is the NN that I use and the following code is used to train it.
modela = googleNet()
modelb = Lstm()
modelc = ConvLstm(modela, modelb).to(device)

## Loss and optimizer
learning_rate = 5e-4  # I picked this because it seems to be the value used most often
load_model = True
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(modelc.parameters(), lr=learning_rate)  # Adam seems to be the most popular optimizer for deep learning

modelc.train()
for epoch in range(100):  # I decided to train the model for 100 epochs
    loss_ep = 0
    for batch_idx, (data, targets) in enumerate(zip(features_train, labels_train)):
        data = data.to(device)
        targets = targets.to(device)
        ## Forward pass
        optimizer.zero_grad()
        modelc.modelb.reset_hidden_state()
        scores = modelc(data)
        loss = criterion(scores, targets)
        loss.backward()
        optimizer.step()
        loss_ep += loss.item()
    print(f"Loss in epoch {epoch} :::: {loss_ep/len(features_train)}")

    with torch.no_grad():
        num_correct = 0
        num_samples = 0
        # (accuracy check continues here)
The cross-entropy loss stays at 2.301 through all 100 epochs. What is going wrong?
I have read that nn.CrossEntropyLoss already includes a softmax, so I removed the Softmax from the output layer, but the loss still stays at the same value.
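For reference, after removing the Softmax the classifier head in ConvLstm.__init__ is roughly the following (this is the only line I changed):

    # inside ConvLstm.__init__ -- output raw logits only; nn.CrossEntropyLoss applies log-softmax itself
    self.output_layer = nn.Linear(
        2 * self.modelb.hidden_size if self.modelb.bidirectional else self.modelb.hidden_size,
        n_class
    )

I also noticed that 2.301 is almost exactly ln(10) ≈ 2.303, the cross-entropy of a uniform prediction over 10 classes, which suggests the model is predicting essentially the same thing for every sample.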