import torch
import torch.nn as nn
import torchvision.models as models
class SSDWithVGG16(nn.Module):
def init(self, num_classes=2): # 2 object classes: filled and unfilled
super(SSDWithVGG16, self).init()
# Load pretrained VGG16 backbone
vgg16 = models.vgg16(weights='DEFAULT')
self.backbone = nn.Sequential(*list(vgg16.features.children())) # Use up to conv5_3 layer
# Additional convolution layers for SSD300
self.extra_layers = nn.Sequential(
nn.Conv2d(512, 1024, kernel_size=3, padding=1, stride=2), # Conv6
nn.ReLU(),
nn.Conv2d(1024, 1024, kernel_size=1), # Conv7
nn.ReLU(),
nn.Conv2d(1024, 256, kernel_size=1), # Conv8_1
nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2), # Conv8_2
nn.Conv2d(512, 128, kernel_size=1), # Conv9_1
nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2), # Conv9_2
nn.Conv2d(256, 128, kernel_size=1), # Conv10_1
nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2), # Conv10_2
nn.Conv2d(256, 128, kernel_size=1), # Conv11_1
nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2), # Conv11_2
)
# Multibox layers for bounding box predictions and class scores
self.loc_layers = nn.ModuleList([
nn.Conv2d(512, 4 * 4, kernel_size=3, padding=1), # For conv4_3 feature map
nn.Conv2d(512, 6 * 4, kernel_size=3, padding=1), # For conv5_3 feature map
nn.Conv2d(512, 6 * 4, kernel_size=3, padding=1), # For conv8_2 feature map
nn.Conv2d(256, 6 * 4, kernel_size=3, padding=1), # For conv9_2 feature map
nn.Conv2d(256, 4 * 4, kernel_size=3, padding=1), # For conv10_2 feature map
nn.Conv2d(256, 4 * 4, kernel_size=3, padding=1), # For conv11_2 feature map
])
self.conf_layers = nn.ModuleList([
nn.Conv2d(512, 4 * num_classes, kernel_size=3, padding=1), # For conv4_3 feature map
nn.Conv2d(512, 6 * num_classes, kernel_size=3, padding=1), # For conv5_3 feature map
nn.Conv2d(512, 6 * num_classes, kernel_size=3, padding=1), # For conv8_2 feature map
nn.Conv2d(256, 6 * num_classes, kernel_size=3, padding=1), # For conv9_2 feature map
nn.Conv2d(256, 4 * num_classes, kernel_size=3, padding=1), # For conv10_2 feature map
nn.Conv2d(256, 4 * num_classes, kernel_size=3, padding=1), # For conv11_2 feature map
])
def forward(self, x):
# Extract feature maps from VGG16 backbone
conv4_3 = self.backbone[:23](x) # Output from conv4_3 (512 channels)
conv5_3 = self.backbone[23:](x) # Output from conv5_3 (512 channels)
# Pass through additional layers
x = self.extra_layers[0](conv5_3) # Apply Conv6
conv7 = self.extra_layers[1](x) # Apply Conv7
x = self.extra_layers[2](conv7) # Apply Conv8_1
conv8_2 = self.extra_layers[3](x) # Apply Conv8_2
x = self.extra_layers[4](conv8_2) # Apply Conv9_1
conv9_2 = self.extra_layers[5](x) # Apply Conv9_2
x = self.extra_layers[6](conv9_2) # Apply Conv10_1
conv10_2 = self.extra_layers[7](x) # Apply Conv10_2
x = self.extra_layers[8](conv10_2) # Apply Conv11_1
conv11_2 = self.extra_layers[9](x) # Apply Conv11_2
# Generate predictions at different scales
loc_preds, conf_preds = [], []
loc_preds.append(self.loc_layers[0](conv4_3).permute(0, 2, 3, 1).contiguous()) # conv4_3 output
conf_preds.append(self.conf_layers[0](conv4_3).permute(0, 2, 3, 1).contiguous()) # conv4_3 output
loc_preds.append(self.loc_layers[1](conv5_3).permute(0, 2, 3, 1).contiguous()) # conv5_3 output
conf_preds.append(self.conf_layers[1](conv5_3).permute(0, 2, 3, 1).contiguous()) # conv5_3 output
loc_preds.append(self.loc_layers[2](conv8_2).permute(0, 2, 3, 1).contiguous()) # conv8_2 output
conf_preds.append(self.conf_layers[2](conv8_2).permute(0, 2, 3, 1).contiguous()) # conv8_2 output
loc_preds.append(self.loc_layers[3](conv9_2).permute(0, 2, 3, 1).contiguous()) # conv9_2 output
conf_preds.append(self.conf_layers[3](conv9_2).permute(0, 2, 3, 1).contiguous()) # conv9_2 output
loc_preds.append(self.loc_layers[4](conv10_2).permute(0, 2, 3, 1).contiguous()) # conv10_2 output
conf_preds.append(self.conf_layers[4](conv10_2).permute(0, 2, 3, 1).contiguous()) # conv10_2 output
loc_preds.append(self.loc_layers[5](conv11_2).permute(0, 2, 3, 1).contiguous()) # conv11_2 output
conf_preds.append(self.conf_layers[5](conv11_2).permute(0, 2, 3, 1).contiguous()) # conv11_2 output
# Flatten the predictions
loc_preds = torch.cat([p.view(p.size(0), -1) for p in loc_preds], dim=1)
conf_preds = torch.cat([p.view(p.size(0), -1) for p in conf_preds], dim=1)
return loc_preds, conf_preds
Example usage:
if name == “main”:
model = SSDWithVGG16(num_classes=2) # Adjust based on your needs
input_tensor = torch.randn(1, 3, 300, 300) # Example input shape for SSD300
loc_preds, conf_preds = model(input_tensor)
print(“Location Predictions Shape:”, loc_preds.shape)
print(“Confidence Predictions Shape:”, conf_preds.shape)
pls could any one help with this, becoz i struggling with more than a days