Thank you all for your inputs!
So I can reproduce the results of two ResNets I load from PyTorch. However, my own implementation still yields different results compared to the loaded ones, and I’ve checked it multiple times and cannot figure out why it behaves differently. I can reproduce the results of my own implementation as well, so there does not seem to be any weird randomness involved…
Here is my ResNet implementation:
# this type of block is used to build ResNet18 and ResNet34
class BasicBlock(nn.Module):
    """Residual block with two 3x3 convolutions (the ResNet-18/34 variant).

    The block computes ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut)``,
    where the shortcut is ``x`` itself or, when shapes change, the given
    ``downsample`` projection applied to ``x``.
    """

    # Channel multiplier of the block's output (1 for the basic block).
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super().__init__()
        # Both convs are bias-free: each is followed by a BatchNorm whose
        # affine shift makes a conv bias redundant.
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               padding=1, stride=stride, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               padding=1, stride=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        # Optional 1x1-conv projection for the skip path (set by the caller
        # when stride != 1 or the channel count changes).
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Run the residual computation; spatial size shrinks iff stride > 1."""
        shortcut = x if self.downsample is None else self.downsample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += shortcut
        return self.relu(out)
class ResNet(nn.Module):
    """ResNet backbone assembled from a residual-block class.

    Args:
        block: block class; must expose a class attribute ``expansion``
            (channel multiplier of the block's output) and accept
            ``(in_channels, out_channels, stride, downsample)``.
        layers: sequence of four ints — number of blocks per stage.
        num_classes: size of the final fully-connected output.
    """

    def __init__(self, block, layers, num_classes):
        super().__init__()
        self.in_channels = 64
        # Stem: 7x7/2 conv + BN + ReLU + 3x3/2 max-pool (4x spatial reduction).
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=self.in_channels,
                               kernel_size=7, padding=3, stride=2, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, padding=1, stride=2)
        # Four stages; every stage after the first halves the resolution.
        self.layer1 = self._make_layer(block, layers[0], out_channels=64, stride=1)
        self.layer2 = self._make_layer(block, layers[1], out_channels=128, stride=2)
        self.layer3 = self._make_layer(block, layers[2], out_channels=256, stride=2)
        self.layer4 = self._make_layer(block, layers[3], out_channels=512, stride=2)
        # Global average pool to 1x1 makes the head input-size independent.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)
        # He init for convolutions, identity-like affine for norm layers.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, block, num_blocks, out_channels, stride):
        """Build one stage of ``num_blocks`` blocks; only the first block
        may change stride/channels, so only it can need a projection."""
        width = out_channels * block.expansion
        downsample = None
        if stride != 1 or self.in_channels != width:
            # 1x1-conv projection matches the shortcut to the main path.
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, width, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(width),
            )
        blocks = [block(self.in_channels, out_channels, stride, downsample)]
        self.in_channels = width
        blocks.extend(block(self.in_channels, out_channels)
                      for _ in range(1, num_blocks))
        return nn.Sequential(*blocks)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes)."""
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        x = self.avgpool(x)
        return self.fc(torch.flatten(x, 1))
And here how I test it:
def set_seed(seed):
    """Seed every RNG in use (Python, NumPy, Torch CPU and CUDA) and force
    deterministic convolution behavior by disabling cuDNN autotuning.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Deterministic kernels only; no benchmark-driven algorithm selection,
    # and cuDNN switched off entirely.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False
# Reproducibility experiment: two identically-seeded torchvision ResNet-18s
# (modelA, modelB) versus the custom implementation (modelC), all fed the
# same random batch.
set_seed(0)
modelA = PyTorchModels.resnet18()
in_ = modelA.fc.in_features
classes = 10
# Swap in a fresh 10-class head; this samples new Linear weights AFTER the
# backbone was constructed.
modelA.fc = nn.Linear(in_features=in_, out_features=classes)
set_seed(0)
modelB = PyTorchModels.resnet18()
in_ = modelB.fc.in_features
classes = 10
modelB.fc = nn.Linear(in_features=in_, out_features=classes)
set_seed(0)
# NOTE(review): even with the same seed, modelC consumes the RNG stream in a
# different order than modelA/modelB — torchvision first builds its default
# fc and the 10-class head is sampled last, while modelC samples its
# 10-class fc inside __init__ and then re-draws conv weights in its init
# loop. A different sequence of RNG calls yields different weights, which
# presumably explains the differing outputs — verify by comparing
# state_dicts rather than forward outputs.
modelC = ResNet(BasicBlock, [2, 2, 2, 2], 10)
# Single shared input batch (seeded above, so identical across runs).
t = torch.rand(32, 3, 32, 32)
outA = modelA(t)
outB = modelB(t)
outC = modelC(t)
print(outA[0])
print('\n')
print(outB[0])
print('\n')
print(outC[0])
tensor([-0.0160, -0.0413, 0.5379, -0.3654, -0.0620, -0.7079, -0.9632, -0.9346,
1.5941, 1.0369], grad_fn=<SelectBackward>)
tensor([-0.0160, -0.0413, 0.5379, -0.3654, -0.0620, -0.7079, -0.9632, -0.9346,
1.5941, 1.0369], grad_fn=<SelectBackward>)
tensor([ 0.1272, 0.1153, -0.4902, -0.2696, -0.4524, -0.4243, -0.5799, -0.0227,
0.5023, 0.8597], grad_fn=<SelectBackward>)
So the two loaded ResNets behave the same, but differently from my own…