Hi all,
I am trying to train an object detector in pytorch however even though my gradients are non zero and losses are generated when I use the step function my models parameters do not change.
Here is my code:
import os
from statistics import mode
import torch
from PIL import Image
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection import retinanet_resnet50_fpn
from torchvision import transforms
import pandas as pd
from torchsummary import summary
from tqdm import tqdm
import matplotlib.pyplot as plt
class DataGen(torch.utils.data.Dataset):
def __init__(self, csv_path, images_folder):
df = pd.read_csv(csv_path)
df = df.reset_index()
self.images_folder = images_folder
self.images_name = df['image_name']
self.class_id=pd.to_numeric(df['class_id'])
self.length = len(df)
self.xmin=df['xmin']
self.xmax=df['xmax']
self.ymin=df['ymin']
self.ymax=df['ymax']
def __getitem__(self, index):
# load images
img_path=os.path.join(self.images_folder,
self.images_name[index])
input_image = Image.open(img_path)
preprocess = transforms.Compose([
transforms.ToTensor(),
])
img = preprocess(input_image)
# TODO: Add multiple images into one batch
img = img.reshape([1,img.shape[0],img.shape[1],img.shape[2]])
# TODO: Implement multiple bbox
xmin=self.xmin[index]/img.shape[1]
xmax=self.xmax[index]/img.shape[1]
ymin=self.ymin[index]/img.shape[2]
ymax=self.ymax[index]/img.shape[2]
# convert everything into a torch.Tensor
boxes = torch.as_tensor([[xmin, ymin, xmax, ymax]], dtype=torch.float32)
labels = torch.as_tensor([int(self.class_id[index])], dtype=torch.int64)
target = {}
target["boxes"] = boxes
target["labels"] = labels
return img, target
def __len__(self):
return len(self.images_name)
def train(model, optimizer, lr_scheduler, train_set, val_set, device, epochs, scaler ):
running_training_loss = []
running_validation_loss = []
# TODO: Add in checkpointing
# TODO: Add in early stopping
for epoch in range(epochs):
epoch_loss = 0.0
print("Starting Epoch ", epoch+1, "/",epochs)
model.train()
a = list(model.parameters())[0].clone()
for x in tqdm(range(len(train_set))):
images, target = train_set[x]
images = list(image.to(device) for image in images)
target = {k: v.to(device) for k, v in target.items()}
loss_dict = model(images, [target])
losses = sum(loss for loss in loss_dict.values())
optimizer.zero_grad()
with torch.set_grad_enabled(True):
scaler.scale(losses).backward()
# grads = []
# for param in model.parameters():
# if param.requires_grad:
# grads.append(param.grad.view(-1))
# grads = torch.cat(grads)
# print(grads)
# for param in model.parameters():
# if param.requires_grad:
# print(param.grad.data.sum())
scaler.step(optimizer)
scaler.update()
if lr_scheduler is not None:
lr_scheduler.step()
epoch_loss += losses.item()
b = list(model.parameters())[0].clone()
print(torch.equal(a.data, b.data))
running_training_loss.append(epoch_loss)
print("Total training loss: ", str(epoch_loss))
print("Average training loss: ", str(epoch_loss/len(train_set)))
epoch_loss = 0.0
print("Validating...")
model.eval()
total_correct = 0
with torch.no_grad():
for x in tqdm(range(len(val_set))):
images, target = val_set[x]
images = list(image.to(device) for image in images)
target = {k: v.to(device) for k, v in target.items()}
output = model(images)[0]
correct = False
for x in output['labels']:
if x.item() == target['labels'].item():
correct = True
if correct:
total_correct +=1
# TODO: Implement IOU accuracy
# TODO: Calculate loss
running_validation_loss.append(epoch_loss)
print("Total validation loss: ", str(epoch_loss))
print("Average validation loss: ", str(epoch_loss/len(val_set)))
print("Validation Accuracy: ", str(total_correct/len(val_set)))
if epoch !=0:
if running_validation_loss[epoch-1] > running_validation_loss[epoch]:
torch.save(model.state_dict(), "models/first_try.pt")
return model, running_training_loss, running_validation_loss
@torch.inference_mode() # Dont know why but this makes inference faster
def test(model, test_set, device):
total_correct = 0
model.eval()
with torch.no_grad():
for x in tqdm(range(len(test_set))):
images, target = test_set[x]
images = list(image.to(device) for image in images)
output = model(images)[0]
correct = False
# TODO: Implement IOU accuracy
for x in output['labels']:
if x.item() == target['labels'].item():
correct = True
if correct:
total_correct +=1
return total_correct/len(test_set)
def main():
# train on the GPU or on the CPU, if a GPU is not available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
dataset_train = DataGen('data/Suspension_10/train_small.csv','data/Suspension_10/images' )
dataset_val = DataGen('data/Suspension_10/validation_small.csv','data/Suspension_10/images')
# model = fasterrcnn_resnet50_fpn_v2(weights="DEFAULT", box_score_thresh=0.7)
model = retinanet_resnet50_fpn(weights="DEFAULT")
# in_features = model.roi_heads.box_predictor.cls_score.in_features
# # replace the pre-trained head with a new one
# model.roi_heads.box_predictor = FastRCNNPredictor(in_features, 8)
model.to(device)
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005,
momentum=0.9, weight_decay=0.0005)
# and a learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
step_size=3,
gamma=0.1)
scaler = torch.cuda.amp.GradScaler()
model, train_loss, validation_loss = train(model, optimizer, lr_scheduler, dataset_train, dataset_val, device, 5, scaler)
fig = plt.figure(figsize=(10,8))
plt.plot(range(1,len(train_loss)+1),train_loss, label='Training Loss')
plt.plot(range(1,len(validation_loss)+1),validation_loss,label='Validation Loss')
plt.title("Learning Curve Graph")
plt.xlabel('epochs')
plt.ylabel('loss')
plt.ylim(0, 1) # consistent scale
plt.xlim(0, len(train_loss)+1) # consistent scale
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
fig.savefig('model_training.png', bbox_inches='tight')
dataset_test = DataGen('data/Suspension_10/test.csv','data/Suspension_10/images')
test_accuracy = test(model, dataset_test, device)
print("Test accuracy:", test_accuracy)
main()
Any ideas on what im doing wrong or things I should check to make sure I am training correctly