I’m fairly unfamiliar with pytorch (and ML in general) so please do bear with me. I’ve written a CNN that takes input 1x64x200 to predict between 7 labels (labelled 0-7). My model is as follows:
class Net(nn.Module):
def __init__(self,dropout):
super(Net,self).__init__()
self.conv1=nn.Conv2d(1,32,7,stride=1,padding=3)
self.conv2=nn.Conv2d(32,16,7,stride=1,padding=3)
self.conv3=nn.Conv2d(16,8,5,stride=1,padding=2)
self.conv4=nn.Conv2d(8,16,5,stride=1,padding=2)
self.conv5=nn.Conv2d(16,4,3,stride=1,padding=1)
self.pool1=nn.MaxPool2d(2)
self.pool2=nn.MaxPool2d(4)
self.fc1=nn.Linear(800,7)
self.dropout1=nn.Dropout2d(dropout)
self.batchnorm1=nn.BatchNorm2d(32)
self.batchnorm2=nn.BatchNorm2d(16)
self.batchnorm3=nn.BatchNorm2d(8)
self.batchnorm4=nn.BatchNorm2d(4)
def forward(self,x):
x=self.conv1(x)
x=self.batchnorm1(x)
x=self.dropout1(x)
x=nn.functional.relu(x)
x=self.conv2(x)
x=self.batchnorm2(x)
x=self.dropout1(x)
x=nn.functional.relu(x)
x=self.pool1(x)
x=self.conv3(x)
x=self.batchnorm3(x)
x=self.dropout1(x)
x=nn.functional.relu(x)
x=self.conv4(x)
x=self.batchnorm2(x)
x=self.dropout1(x)
x=nn.functional.relu(x)
x=self.conv5(x)
x=self.batchnorm4(x)
x=self.dropout1(x)
x=nn.functional.relu(x)
x=self.pool2(x)
x=torch.flatten(x,1)
x=self.fc1(x)
x=nn.functional.relu(x)
return x
def train_test(net,epochs,train_loader,test_loader,device):
criterion=nn.CrossEntropyLoss()
optimizer=optim.Adam(net.parameters(),lr=3e-3)
train_acc=[]
train_loss=[]
test_acc=[]
test_loss=[]
net.to(device)
for epoch in tqdm.tqdm(range(epochs)):
net.train()
running_loss=0.0
correct,total=0,0
for i,data in enumerate(train_loader,start=0):
inputs,labels=data
inputs=inputs.to(device).float()
labels=labels.to(device).long()
#train
optimizer.zero_grad()
outputs=net.forward(inputs)
loss=criterion(outputs,labels)
loss.backward()
optimizer.step()
running_loss+=loss.item()
#training accuracy
_,predicted=torch.max(outputs,1)
total+=labels.size(0)
correct+=(predicted==labels).sum()
train_loss.append(running_loss/len(train_loader))
train_acc.append(correct/total)
print(f"epoch {epoch} --> TRAIN loss: {running_loss/len(train_loader):.5f}, TRAIN accuracy: {correct/total:.2f}")
#eval on test
net.eval()
running_loss=0.0
correct,total=0,0
for inputs,labels in test_loader:
inputs,labels=inputs.to(device).float(),labels.to(device).long()
outputs=net.forward(inputs)
loss=criterion(outputs,labels)
running_loss+=loss.item()
#test acc
_,predicted=torch.max(outputs,1)
total+=labels.size(0)
correct+=(predicted==labels).sum()
test_loss.append(running_loss/len(test_loader))
test_acc.append(correct/total)
print(f"epoch {epoch} --> TEST loss: {running_loss/len(train_loader):.2f}, TEST accuracy: {correct/total:.2f}")
return train_loss,train_acc,test_loss,test_acc
batch_size=150
test_data = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_data, batch_size=batch_size,
shuffle=False
)
train_data = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_data,
batch_size=batch_size,
drop_last=False,
shuffle=True
)
Training with a low learning rate (1e-4) leads to training loss oscillating at a high value (around 1.95). Increasing the learning rate, even to 0.1, doesn’t decrease the training loss. I can’t tell if my model is stuck at a local minima or if there something else fundamentally wrong with my model design or code.
Any help would be greatly appreciated. Thank you!
training at learning rate=0.1:
0%| | 0/50 [00:00<?, ?it/s]
epoch 0 --> TRAIN loss: 1.97984, TRAIN accuracy: 0.14
2%|▏ | 1/50 [00:18<14:43, 18.02s/it]
epoch 0 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 1 --> TRAIN loss: 1.94591, TRAIN accuracy: 0.14
4%|▍ | 2/50 [00:36<14:40, 18.33s/it]
epoch 1 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 2 --> TRAIN loss: 1.94591, TRAIN accuracy: 0.14
6%|▌ | 3/50 [00:54<14:17, 18.24s/it]
epoch 2 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 3 --> TRAIN loss: 1.94591, TRAIN accuracy: 0.14
8%|▊ | 4/50 [01:13<14:01, 18.30s/it]
epoch 3 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 4 --> TRAIN loss: 1.94591, TRAIN accuracy: 0.14
10%|█ | 5/50 [01:31<13:40, 18.24s/it]
epoch 4 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 5 --> TRAIN loss: 1.94591, TRAIN accuracy: 0.14
12%|█▏ | 6/50 [01:50<13:31, 18.43s/it]
epoch 5 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 6 --> TRAIN loss: 1.94591, TRAIN accuracy: 0.14
14%|█▍ | 7/50 [02:08<13:15, 18.51s/it]
epoch 6 --> TEST loss: 0.49, TEST accuracy: 0.14
training at learning rate=1e-3:
0%| | 0/50 [00:00<?, ?it/s]
epoch 0 --> TRAIN loss: 1.94748, TRAIN accuracy: 0.15
2%|▏ | 1/50 [00:18<15:06, 18.50s/it]
epoch 0 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 1 --> TRAIN loss: 1.94572, TRAIN accuracy: 0.14
4%|▍ | 2/50 [00:36<14:43, 18.40s/it]
epoch 1 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 2 --> TRAIN loss: 1.94623, TRAIN accuracy: 0.14
6%|▌ | 3/50 [00:54<14:17, 18.25s/it]
epoch 2 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 3 --> TRAIN loss: 1.94588, TRAIN accuracy: 0.14
8%|▊ | 4/50 [01:13<13:59, 18.25s/it]
epoch 3 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 4 --> TRAIN loss: 1.94597, TRAIN accuracy: 0.14
10%|█ | 5/50 [01:31<13:45, 18.35s/it]
epoch 4 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 5 --> TRAIN loss: 1.94591, TRAIN accuracy: 0.14
12%|█▏ | 6/50 [01:50<13:29, 18.40s/it]
epoch 5 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 6 --> TRAIN loss: 1.94586, TRAIN accuracy: 0.14
14%|█▍ | 7/50 [02:08<13:13, 18.46s/it]
epoch 6 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 7 --> TRAIN loss: 1.94593, TRAIN accuracy: 0.14
16%|█▌ | 8/50 [02:27<12:56, 18.49s/it]
epoch 7 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 8 --> TRAIN loss: 1.94591, TRAIN accuracy: 0.14
18%|█▊ | 9/50 [02:45<12:39, 18.53s/it]
epoch 8 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 9 --> TRAIN loss: 1.94592, TRAIN accuracy: 0.14
20%|██ | 10/50 [03:04<12:22, 18.56s/it]
epoch 9 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 10 --> TRAIN loss: 1.94590, TRAIN accuracy: 0.14
22%|██▏ | 11/50 [03:23<12:08, 18.67s/it]
epoch 10 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 11 --> TRAIN loss: 1.94597, TRAIN accuracy: 0.14
24%|██▍ | 12/50 [03:41<11:47, 18.61s/it]
epoch 11 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 12 --> TRAIN loss: 1.94591, TRAIN accuracy: 0.14
26%|██▌ | 13/50 [04:00<11:28, 18.60s/it]
epoch 12 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 13 --> TRAIN loss: 1.94592, TRAIN accuracy: 0.14
28%|██▊ | 14/50 [04:19<11:09, 18.60s/it]
epoch 13 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 14 --> TRAIN loss: 1.94594, TRAIN accuracy: 0.14
30%|███ | 15/50 [04:37<10:50, 18.59s/it]
epoch 14 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 15 --> TRAIN loss: 1.94588, TRAIN accuracy: 0.14
32%|███▏ | 16/50 [04:56<10:29, 18.52s/it]
epoch 15 --> TEST loss: 0.49, TEST accuracy: 0.14
epoch 16 --> TRAIN loss: 1.94593, TRAIN accuracy: 0.14
34%|███▍ | 17/50 [05:14<10:09, 18.48s/it]
epoch 16 --> TEST loss: 0.49, TEST accuracy: 0.14