Below is my code. I want to use the pretrained audio model VGGish, but the dimensions of my input do not seem to match what the pretrained model expects. How can I modify the code? Thank you!
from torchaudio.datasets import SPEECHCOMMANDS
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio
import sys
import matplotlib.pyplot as plt
from tqdm import tqdm
# Project-local VGGish network definition. The original line ended with a
# dangling comma (`import VGGish,`), which is a SyntaxError.
from VGGish_pytorch.network.vggish import VGGish

# Use the GPU when PyTorch reports one as available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
class SubsetSC(SPEECHCOMMANDS):
    """SPEECHCOMMANDS dataset narrowed to a single split.

    The dataset is stored under ``"Datasets"`` and downloaded on construction.
    ``subset`` selects which entries remain in ``self._walker``:

    * ``"validation"`` -- the paths listed in ``validation_list.txt``
    * ``"testing"``    -- the paths listed in ``testing_list.txt``
    * ``"training"``   -- every entry of the parent's walker that appears in
      neither of the two list files
    * anything else (including ``None``) -- the parent's walker is left as is
    """

    def __init__(self, subset: str = None):
        super().__init__("Datasets", download=True)

        def load_list(filename):
            # Every line of a list file is joined onto self._path and
            # normalised so it can be compared with the walker entries.
            list_path = os.path.join(self._path, filename)
            resolved = []
            with open(list_path) as handle:
                for raw_line in handle:
                    joined = os.path.join(self._path, raw_line.strip())
                    resolved.append(os.path.normpath(joined))
            return resolved

        if subset == "training":
            # Training is "everything that is not held out".
            held_out = set(
                load_list("validation_list.txt") + load_list("testing_list.txt")
            )
            self._walker = [path for path in self._walker if path not in held_out]
        elif subset == "validation":
            self._walker = load_list("validation_list.txt")
        elif subset == "testing":
            self._walker = load_list("testing_list.txt")
# Create training and testing split of the data. We do not use validation in this tutorial.
train_set = SubsetSC("training")  # 84841
test_set = SubsetSC("testing")  # 11003

# A data tuple has the form (waveform, sample_rate, label, ...). Unpack the
# first training sample so that `waveform` and `sample_rate` exist before the
# resampler below uses them (they were previously referenced but never defined).
waveform, sample_rate, *_ = train_set[0]

# Field 2 of each sample is its label; collect the distinct labels in sorted
# order so that a label's index is stable between runs.
# NOTE: this iterates over the whole training set once.
labels = sorted({datapoint[2] for datapoint in train_set})

# Resample the example waveform from its native rate down to 8 kHz.
new_sample_rate = 8000
transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=new_sample_rate)
transformed = transform(waveform)
def label_to_index(word):
    """Return the position of ``word`` in ``labels`` as a tensor.

    Raises ``ValueError`` (from ``list.index``) when ``word`` is not a label.
    """
    position = labels.index(word)
    return torch.tensor(position)
def index_to_label(index):
    """Return the entry of ``labels`` stored at ``index``.

    This is the inverse of ``label_to_index``.
    """
    word = labels[index]
    return word
# Sanity check: encode a word as an index, decode it again, and print each step.
word_start = "yes"
index = label_to_index(word_start)
word_recovered = index_to_label(index)
round_trip = (word_start, "-->", index, "-->", word_recovered)
print(*round_trip)
def pad_sequence(batch):
    """Zero-pad the tensors in ``batch`` to a common length and stack them.

    Each item is transposed so that the axis being padded comes first, the
    items are padded with zeros by ``torch.nn.utils.rnn.pad_sequence`` (batch
    dimension first), and the last two axes are swapped back afterwards.
    For items of shape ``(c, t_i)`` the result has shape ``(len(batch), c, max t_i)``.
    """
    # torch's pad_sequence pads along the first axis, so flip each item first.
    flipped = [clip.t() for clip in batch]
    padded = torch.nn.utils.rnn.pad_sequence(
        flipped, batch_first=True, padding_value=0.
    )
    # Undo the per-item transpose on the stacked result.
    return padded.permute(0, 2, 1)
def collate_fn(batch):
    """Turn a list of dataset samples into ``(waveforms, targets)`` tensors.

    A data tuple has the form:
    waveform, sample_rate, label, speaker_id, utterance_number

    Only the waveform and the label are used. Waveforms are zero-padded to a
    common length by ``pad_sequence``; labels are encoded with
    ``label_to_index`` and stacked into a single tensor.
    """
    waveforms = []
    encoded_labels = []
    for waveform, _sample_rate, label, *_rest in batch:
        waveforms.append(waveform)
        encoded_labels.append(label_to_index(label))

    # Group the per-sample lists into batched tensors.
    return pad_sequence(waveforms), torch.stack(encoded_labels)
batch_size = 256

# `device` is a torch.device object, which does not compare equal to the plain
# string "cuda"; the original `device == "cuda"` therefore always took the CPU
# branch. Checking the device type selects the CUDA settings as intended.
if device.type == "cuda":
    num_workers = 1
    pin_memory = True
else:
    num_workers = 0
    pin_memory = False

# Training batches are reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=num_workers,
    pin_memory=pin_memory,
)
# Test batches keep the dataset order and include the final partial batch.
test_loader = torch.utils.data.DataLoader(
    test_set,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=collate_fn,
    num_workers=num_workers,
    pin_memory=pin_memory,
)
# Load a pre-trained model
model = VGGish()
# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was saved on a GPU still loads when this script falls back to the CPU.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(
    torch.load(
        "/home/Voice2Series-Reprogramming/VGGish_pytorch/model/vggish_model.pt",
        map_location=device,
    )
)
model.eval()  # put the module in evaluation mode
model.to(device)
print(model)
The VGGish class is defined as follows:
import torch.nn as nn
import torch
class VGGish(nn.Module):
    """VGG-style convolutional network that maps an input to a 128-d vector.

    ``features`` is a stack of 3x3 convolutions (padding 1, each followed by
    ReLU) interleaved with four 2x2 max-pools; ``fc`` is three Linear+ReLU
    layers ending in 128 outputs.

    Input size: the first convolution takes 1 channel, so ``forward`` expects
    ``(batch, 1, H, W)``. The first Linear layer takes ``512 * 24`` features,
    so after the four poolings (each halves H and W) the feature map must have
    24 spatial positions, i.e. ``(H // 16) * (W // 16) == 24`` -- for example
    ``H = 96, W = 64``. Other sizes fail inside ``fc`` with a shape mismatch.
    """

    def __init__(self):
        super().__init__()

        # Output channels of each convolution in order; "pool" marks a max-pool.
        layout = (64, "pool", 128, "pool", 256, 256, "pool", 512, 512, "pool")
        stages = []
        in_channels = 1
        for entry in layout:
            if entry == "pool":
                stages.append(nn.MaxPool2d(2, stride=2))
                continue
            stages.append(nn.Conv2d(in_channels, entry, 3, stride=1, padding=1))
            stages.append(nn.ReLU(inplace=True))
            in_channels = entry
        self.features = nn.Sequential(*stages)

        # (in_features, out_features) of each fully connected layer.
        head = []
        for fan_in, fan_out in ((512 * 24, 4096), (4096, 4096), (4096, 128)):
            head.append(nn.Linear(fan_in, fan_out))
            head.append(nn.ReLU(inplace=True))
        self.fc = nn.Sequential(*head)

    def forward(self, x):
        """Run the conv stack, flatten channels-last, and apply ``fc``."""
        feature_map = self.features(x)
        # Move channels to the last axis before flattening, so each sample is
        # flattened in (height, width, channel) order.
        channels_last = feature_map.permute(0, 2, 3, 1).contiguous()
        flat = channels_last.view(channels_last.size(0), -1)
        return self.fc(flat)