model.load_state_dict(checkpoint['model'])
Error(s) in loading state_dict for Cnn14:
Unexpected key(s) in state_dict: "spectrogram_extractor.stft.conv_real.weight", "spectrogram_extractor.stft.conv_imag.weight", "logmel_extractor.melW".
class Cnn14(nn.Module):
    """Cnn14 convolutional trunk without the audio front end or active head.

    This variant defines only ``bn0``, ``conv_block1`` .. ``conv_block6``,
    ``fc1`` and ``fc_audioset``.  It has no ``spectrogram_extractor`` or
    ``logmel_extractor`` sub-modules, so a checkpoint saved from a model that
    does have them contains extra entries such as
    ``spectrogram_extractor.stft.conv_real.weight`` and
    ``logmel_extractor.melW``.  A strict ``load_state_dict`` reports those as
    unexpected keys; load such a checkpoint with
    ``model.load_state_dict(checkpoint['model'], strict=False)`` (or filter
    the extra keys out first) to reuse the matching weights.

    Args:
        classes_num: Output width of ``fc_audioset``.
    """

    def __init__(self, classes_num=527):
        # Zero-argument super() does not look the class up by its global
        # name, so it keeps working if the name ``Cnn14`` is later rebound.
        super().__init__()

        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)

        # Not used by forward() below.
        # NOTE(review): presumably kept so the matching checkpoint entries
        # still have somewhere to load - confirm.
        self.fc1 = nn.Linear(2048, 2048, bias=True)
        self.fc_audioset = nn.Linear(2048, classes_num, bias=True)

        self.init_weight()

    def init_weight(self):
        """Initialise ``bn0``; the linear layers keep nn.Linear's defaults."""
        init_bn(self.bn0)

    def forward(self, input):
        """Run the convolutional trunk and return its feature map.

        Args:
            input: Tensor that becomes (batch_size, 1, time_steps, mel_bins)
                after a channel axis is inserted at dim 1.  ``bn0`` is
                ``BatchNorm2d(64)`` applied over the last axis, so that axis
                is expected to have size 64.

        Returns:
            The tensor produced by ``conv_block6`` followed by dropout,
            annotated in the original code as
            (batch_size, 2048, T/16, mel_bins/16).
        """
        x = input.unsqueeze(1)  # (batch_size, 1, time_steps, mel_bins)

        # Swap dims 1 and 3 so bn0 normalises over the last axis, then
        # swap back.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        # The last two blocks use pool_size=(1, 1).
        x = self.conv_block5(x, pool_size=(1, 1), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)  # (batch_size, 2048, T/16, mel_bins/16)

        # The clip-level head is disabled; the feature map is returned as is.
        # x = torch.mean(x, dim=3)
        # (x1, _) = torch.max(x, dim=2)
        # x2 = torch.mean(x, dim=2)
        # x = x1 + x2
        # x = F.dropout(x, p=0.5, training=self.training)
        # x = F.relu_(self.fc1(x))
        # embedding = F.dropout(x, p=0.5, training=self.training)
        # clipwise_output = torch.sigmoid(self.fc_audioset(x))
        # output_dict = {'clipwise_output': clipwise_output, 'embedding': embedding}

        return x
print(checkpoint['model'])
'spectrogram_extractor.stft.conv_real.weight',
tensor([[[ 0.0000e+00, 9.4124e-06, 3.7649e-05, ..., 8.4709e-05,
3.7649e-05, 9.4124e-06]],
[[ 0.0000e+00, 9.4122e-06, 3.7646e-05, ..., 8.4695e-05,
3.7646e-05, 9.4122e-06]],
[[ 0.0000e+00, 9.4117e-06, 3.7638e-05, ..., 8.4652e-05,
3.7638e-05, 9.4117e-06]],
...,
[[ 0.0000e+00, -9.4117e-06, 3.7638e-05, ..., -8.4652e-05,
3.7638e-05, -9.4117e-06]],
[[ 0.0000e+00, -9.4122e-06, 3.7646e-05, ..., -8.4695e-05,
3.7646e-05, -9.4122e-06]],
[[ 0.0000e+00, -9.4124e-06, 3.7649e-05, ..., -8.4709e-05,
3.7649e-05, -9.4124e-06]]], device='cuda:0')),
('spectrogram_extractor.stft.conv_imag.weight',
I ran into a problem while doing transfer learning. Loading fails with an error because the weights stored in the pre-trained model do not match the weights of the Cnn14 model I want to train. Do the weights have to match exactly when doing transfer learning?
Below is the definition of the pre-trained model whose checkpoint I load with the torch.load command:
class Cnn14(nn.Module):
    """Cnn14 audio tagging model with a built-in log-mel front end.

    The model turns its input into a spectrogram, then into log-mel features,
    runs six convolutional blocks, pools the result and applies two linear
    layers to produce clip-level outputs.

    Args:
        sample_rate: Passed to ``LogmelFilterBank`` as ``sr``.
        window_size: Used as ``n_fft`` and ``win_length`` of the front end.
        hop_size: Passed to ``Spectrogram`` as ``hop_length``.
        mel_bins: Number of mel bins; ``bn0`` is ``BatchNorm2d(64)`` applied
            over the mel axis, so 64 is the expected value.
        fmin: Passed to ``LogmelFilterBank`` as ``fmin``.
        fmax: Passed to ``LogmelFilterBank`` as ``fmax``.
        classes_num: Output width of ``fc_audioset``.
    """

    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
                 fmax, classes_num):
        super(Cnn14, self).__init__()

        # Spectrogram extractor (constructed with freeze_parameters=True).
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True)

        # Log-mel feature extractor (constructed with freeze_parameters=True).
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True)

        # Spec augmenter, only applied in training mode (see forward).
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)

        self.fc1 = nn.Linear(2048, 2048, bias=True)
        self.fc_audioset = nn.Linear(2048, classes_num, bias=True)

        self.init_weight()

    def init_weight(self):
        """Initialise the input batch norm and both linear layers."""
        init_bn(self.bn0)
        for linear in (self.fc1, self.fc_audioset):
            init_layer(linear)

    def forward(self, input, mixup_lambda=None):
        """Compute clip-level outputs for a batch.

        Args:
            input: Tensor of shape (batch_size, data_length).
            mixup_lambda: Optional mixup argument handed to ``do_mixup``;
                only used in training mode.

        Returns:
            Dict with ``'clipwise_output'`` (sigmoid of ``fc_audioset``) and
            ``'embedding'`` (dropout applied to the ``fc1`` activations).
        """
        x = self.spectrogram_extractor(input)  # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)  # (batch_size, 1, time_steps, mel_bins)

        # Swap dims 1 and 3 so bn0 normalises over the mel axis, then swap
        # back.
        x = self.bn0(x.transpose(1, 3)).transpose(1, 3)

        if self.training:
            x = self.spec_augmenter(x)
            # Mixup on spectrogram
            if mixup_lambda is not None:
                x = do_mixup(x, mixup_lambda)

        # Each stage is a conv block followed by dropout; only the pooling
        # size differs between stages.
        stages = (
            (self.conv_block1, (2, 2)),
            (self.conv_block2, (2, 2)),
            (self.conv_block3, (2, 2)),
            (self.conv_block4, (2, 2)),
            (self.conv_block5, (2, 2)),
            (self.conv_block6, (1, 1)),
        )
        for conv_block, pool_size in stages:
            x = conv_block(x, pool_size=pool_size, pool_type='avg')
            x = F.dropout(x, p=0.2, training=self.training)

        # Average over the last axis, then add max- and mean-pooling over
        # dim 2.
        x = torch.mean(x, dim=3)
        pooled_max, _ = torch.max(x, dim=2)
        pooled_mean = torch.mean(x, dim=2)
        x = pooled_max + pooled_mean

        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu_(self.fc1(x))
        embedding = F.dropout(x, p=0.5, training=self.training)
        # The classifier reads the fc1 activations, not the dropped-out
        # embedding.
        clipwise_output = torch.sigmoid(self.fc_audioset(x))

        return {'clipwise_output': clipwise_output, 'embedding': embedding}
Please help me.