import os
import configparser
import torchvision
import csv
import os
import os.path as osp
import pickle
from PIL import Image
import numpy as np
import scipy
import torch
import matplotlib.pyplot as plt
from typing import overload
import torchvision.transforms as T
import utils
from dataloader import MOT17ObjDetect
from engine import train_one_epoch, evaluate
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.rpn import AnchorGenerator
import torchvision.models as models
from torchvision.models.feature_extraction import get_graph_node_names, create_feature_extractor
import torchvision.models as models
import torchvision
from torchvision.models.detection.backbone_utils import LastLevelMaxPool
from torchvision.models.detection import FasterRCNN
import torch.nn as nn
from dataloader import MOT17ObjDetect
import torchvision
import random
from torchvision.models.vision_transformer import vit_b_16

# Output locations (directories are created later with os.makedirs).
# The original used curly “smart quotes”, which is a SyntaxError.
OUTPUT_DIR = "outputs"
LOG_RESULTS = "inference"
PREDICTED_IMAGES = "images_with_boxes"
def plot(img, boxes):
    """Save *img* with *boxes* drawn as unfilled rectangles.

    Args:
        img: CHW float tensor with values in [0, 1] (it is scaled by 255
            and converted to bytes before display).
        boxes: iterable of (x1, y1, x2, y2) pixel coordinates.

    The figure is written to ./images_with_boxes/<random>.png.
    """
    # Random stem so successive calls do not overwrite each other.
    # Collisions are unlikely but possible; uuid4 would be safer.
    stem = random.randint(100000, 100000000)
    fig, ax = plt.subplots(1, dpi=96)
    img = img.mul(255).permute(1, 2, 0).cpu().byte().numpy()
    # numpy HWC layout: shape is (height, width, channels).  The original
    # unpacked it as (width, height, _), transposing the saved figure's
    # aspect ratio.
    height, width, _ = img.shape
    ax.imshow(img, cmap='gray')
    fig.set_size_inches(width / 80, height / 80)
    for box in boxes:
        rect = plt.Rectangle(
            (box[0], box[1]),
            box[2] - box[0],
            box[3] - box[1],
            fill=False,
            linewidth=1.0)
        ax.add_patch(rect)
    plt.axis('off')
    plt.savefig("./images_with_boxes/" + str(stem) + ".png")
    # Close the figure: this function is called in a loop, and open
    # matplotlib figures accumulate memory until closed.
    plt.close(fig)
class VITWithFPN(torch.nn.Module):
    """ViT-B/16 backbone wrapped with a Feature Pyramid Network so it can
    serve as a FasterRCNN backbone (exposes ``out_channels``).

    NOTE(review): ``forward`` patch-embeds the image and prepends the
    class token, then feeds that token sequence into a feature extractor
    that was traced on the *whole* model — which would re-apply the patch
    projection.  Also, the encoder output is (n, seq_len, hidden_dim),
    so ``o.shape[1]`` below is the sequence length, not a channel count,
    and the FPN expects 4D spatial maps.  Confirm this wiring against the
    torchvision feature-extraction docs before relying on it.
    """

    def __init__(self):
        # The original defined ``def init`` — Python never calls that as
        # a constructor, so none of these attributes would ever exist.
        super().__init__()
        # Pretrained ViT-B/16 at a fixed 224x224 input resolution.
        self.model = models.vit_b_16(pretrained=True, image_size=224)
        # Extract the encoder output as the feature map(s) for the FPN.
        self.body = create_feature_extractor(
            self.model, return_nodes=['encoder'])
        # Dry-run the extractor once to discover per-map channel counts.
        inp = torch.randn(2, 3, 224, 224)
        with torch.no_grad():
            out = self.body(inp)
        in_channels_list = [o.shape[1] for o in out.values()]
        # Build the FPN; FasterRCNN reads ``out_channels`` from here.
        self.out_channels = self.model.hidden_dim
        self.fpn = torchvision.ops.FeaturePyramidNetwork(
            in_channels_list, out_channels=self.out_channels,
            extra_blocks=LastLevelMaxPool())

    def _process_input(self, x: torch.Tensor) -> torch.Tensor:
        """Patch-embed *x*: (n, c, h, w) -> (n, n_h * n_w, hidden_dim)."""
        n, c, h, w = x.shape
        p = self.model.patch_size
        torch._assert(h == self.model.image_size, "Wrong image height!")
        torch._assert(w == self.model.image_size, "Wrong image width!")
        n_h = h // p
        n_w = w // p
        # (n, c, h, w) -> (n, hidden_dim, n_h, n_w)
        x = self.model.conv_proj(x)
        # (n, hidden_dim, n_h, n_w) -> (n, hidden_dim, n_h * n_w)
        x = x.reshape(n, self.model.hidden_dim, n_h * n_w)
        # The self-attention layer expects (N, S, E): batch, sequence
        # length, embedding dimension.
        x = x.permute(0, 2, 1)
        return x

    def forward(self, x):
        x = self._process_input(x)
        n = x.shape[0]
        # Expand the class token to the full batch and prepend it.
        batch_class_token = self.model.class_token.expand(n, -1, -1)
        x = torch.cat([batch_class_token, x], dim=1)
        x = self.body(x)
        x = self.fpn(x)
        return x
# Build the detector: Faster R-CNN on top of the ViT+FPN backbone,
# with two classes (background + pedestrian).
model = FasterRCNN(VITWithFPN(), num_classes=2)
def get_transform(train):
    """Build the image transform pipeline.

    Args:
        train: when True, add random horizontal flipping for augmentation.

    Returns:
        A ``T.Compose`` that converts a PIL image to a tensor, optionally
        flips it, and resizes it to 224x224 (the ViT input size).

    NOTE(review): the flip is applied to the image only; the dataset must
    mirror the ground-truth boxes itself — confirm MOT17ObjDetect does.
    """
    # Converts the image, a PIL image, into a PyTorch Tensor.
    transforms = [T.ToTensor()]
    if train:
        # During training, randomly flip the training images
        # (and, supposedly, ground truth) for data augmentation.
        transforms.append(T.RandomHorizontalFlip(0.5))
    transforms.append(T.Resize((224, 224)))
    return T.Compose(transforms)
# Use our dataset and defined transformations.
dataset = MOT17ObjDetect('./data/MOT17Det/train', get_transform(train=True))
dataset_no_random = MOT17ObjDetect('./data/MOT17Det/train', get_transform(train=False))
dataset_test = MOT17ObjDetect('./data/MOT17Det/test', get_transform(train=False))

# Fix the RNG so runs are reproducible.
torch.manual_seed(1)

# Define training and validation data loaders.
data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=8, shuffle=True, num_workers=4,
    collate_fn=utils.collate_fn)
data_loader_no_random = torch.utils.data.DataLoader(
    dataset_no_random, batch_size=8, shuffle=False, num_workers=4,
    collate_fn=utils.collate_fn)
data_loader_test = torch.utils.data.DataLoader(
    dataset_test, batch_size=2, shuffle=False, num_workers=4,
    collate_fn=utils.collate_fn)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# The detector was already built above (FasterRCNN over VITWithFPN).
# The original re-assigned ``model = get_detection_model(...)`` here, but
# no such helper exists anywhere in the file — that line raised NameError
# and has been removed.
# Move model to the right device.
model.to(device)

# Construct an optimizer over the trainable parameters only.
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.00125,
                            momentum=0.9, weight_decay=0.0005)
# Learning-rate scheduler: decreases the learning rate 10x every
# 10 epochs (step_size=10 — the original comment claimed every 3).
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=10,
                                               gamma=0.1)

os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(LOG_RESULTS, exist_ok=True)
def evaluate_and_write_result_files(model, data_loader):
    """Run inference over *data_loader* and persist per-image detections.

    Args:
        model: a detection model returning dicts with 'boxes'/'scores'.
        data_loader: yields (imgs, targets); each target carries an
            'image_id' tensor used as the results key.

    Side effects: prints the dataset's evaluation metrics and writes
    result files under OUTPUT_DIR.  Uses the module-level ``device``.
    """
    model.eval()
    results = {}
    for imgs, targets in data_loader:
        imgs = [img.to(device) for img in imgs]
        with torch.no_grad():
            preds = model(imgs)
        for pred, target in zip(preds, targets):
            results[target['image_id'].item()] = {
                'boxes': pred['boxes'].cpu(),
                'scores': pred['scores'].cpu(),
            }
    data_loader.dataset.print_eval(results)
    data_loader.dataset.write_results_files(results, OUTPUT_DIR + "/")
num_epochs = 27
for epoch in range(1, num_epochs + 1):
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=300)
    # Update the learning rate (deliberately disabled in this run).
    # lr_scheduler.step()
    # Evaluate on the held-out loader and checkpoint every 3 epochs.
    if epoch % 3 == 0:
        evaluate_and_write_result_files(model, data_loader_no_random)
        torch.save(model.state_dict(), f"{OUTPUT_DIR}/model_epoch_{epoch}.model")
# Visualize predictions: draw predicted boxes on training images and
# save them under PREDICTED_IMAGES.
os.makedirs(PREDICTED_IMAGES, exist_ok=True)
dataset = MOT17ObjDetect('./data/MOT17Det/train', get_transform(train=False))
data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=20, shuffle=True, num_workers=4,
    collate_fn=utils.collate_fn)
print("---------------Done training------------------")
# eval mode is sticky — set it once instead of every batch.
model.eval()
for imgs, target in data_loader:
    with torch.no_grad():
        # Iterate the actual batch contents.  The original hard-coded
        # range(0, 20), which raises IndexError when the final batch is
        # shorter than batch_size.
        for img in imgs:
            prediction = model([img.to(device)])[0]
            plot(img, prediction['boxes'].cpu())
# @Matias_Vasquez, here is the complete code file.