I need to implement a highway network and run it on CIFAR-10. So far, the highway block looks like this:
class HwNetBasicblock(nn.Module):
    """Highway-style basic block: a conv/BN transform path blended with a skip path.

    The output is ``relu(residual * (1 - t) + transform * t)``, where
    ``transform`` is conv3x3 -> BN -> ReLU -> conv3x3 -> BN applied to the
    input and ``t`` is a sigmoid gate computed from the same input.

    Args:
        inplanes: Number of input channels.
        planes: Number of output channels of both convolutions and of the gate.
        stride: Stride of the first convolution and of the gate convolution.
        downsample: Optional module applied to the input to build the skip
            path; when ``None`` the input itself is used.
        gate_bias_init: Constant written into the gate convolution's bias at
            construction time. Defaults to -13, the value that used to be
            hard-coded.
    """

    def __init__(self, inplanes, planes, stride=1, downsample=None, gate_bias_init=-13):
        super(HwNetBasicblock, self).__init__()
        self.conv_a = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn_a = nn.BatchNorm2d(planes)
        self.conv_b = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn_b = nn.BatchNorm2d(planes)
        # The gate is the only convolution in this block that carries a bias.
        self.gate = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=True)
        self.downsample = downsample
        # A negative bias pushes sigmoid(gate) towards 0, so the block starts
        # out passing (almost) only the skip path.
        nn.init.constant_(self.gate.bias, gate_bias_init)

    def forward(self, x):
        """Return the gated mix of the skip path and the transform path for ``x``."""
        transform = self.conv_a(x)
        transform = self.bn_a(transform)
        transform = F.relu(transform, inplace=True)
        transform = self.conv_b(transform)
        transform = self.bn_b(transform)

        # torch.mean without a dim argument reduces over every element, so
        # t_value is one scalar shared by all samples, channels and positions.
        # NOTE(review): a single whole-batch gate makes each sample's output
        # depend on the rest of the batch - confirm this is intended.
        t_value = torch.sigmoid(torch.mean(self.gate(x)))

        residual = x if self.downsample is None else self.downsample(x)
        return F.relu(residual * (1 - t_value) + transform * t_value, inplace=True)
I remember from the highway network paper that if you go deep (50-100 layers), you need to initialize the bias of the transform gate to a strongly negative value (-7, -15, …). I am trying to do this with:
BIAS_INIT = -13
self.gate.bias.data.fill_(BIAS_INIT)
but this is obviously not working. On CIFAR-10, the 50-layer network reaches well above 93% accuracy, while the 100-layer network stalls at around 86%.
How can I correctly initialize the biases?
EDIT:
Would doing this outside the network work?
# initialize the bias
def bias_init(m, bias_value=-13):
    """Fill the bias of a biased ``nn.Conv2d`` module with a constant.

    Written to be passed to ``nn.Module.apply``, which calls it once per
    submodule with the module as the only argument.

    Args:
        m: The module being visited. Anything that is not an ``nn.Conv2d``
            is left untouched.
        bias_value: Constant to write into the bias. Defaults to -13.
    """
    # Convolutions created with bias=False have m.bias set to None; skip them
    # instead of failing with an AttributeError.
    if isinstance(m, nn.Conv2d) and m.bias is not None:
        nn.init.constant_(m.bias, bias_value)
# Build the depth-110 network from highway blocks and run bias_init over every
# submodule; Module.apply returns the module itself, so the call can be chained.
model = CifarResNet(HwNetBasicblock, depth=110, num_classes=10).apply(bias_init)
# Move the parameters to the GPU.
net = model.cuda()
I am using the fact that my normal convolutions have bias=False, and only the gate has bias=True.