Dear community,
I am trying to implement the following paper (github here) in PyTorch.
I must confess I find their code difficult to follow, but I think I got most things right.
To summarize, the whole idea is to predict, from its genomic sequence, which group of viruses a sample belongs to.
Now, what I did is the following:
- I used the data they had already encoded as specified in the paper, meaning each nucleobase (T, G, A, C) is assigned a floating-point value. There are 5 groups, corresponding to the classes (a small sketch of what I mean by this encoding follows the list).
- I did the cross-validation split, standardizing each fold separately, and ended up with 10 folds.
- I ended up using a 1D CNN because I couldn't understand their choice of a 2D CNN. A Conv1d layer seems the more natural choice for a signal of length L, instead of reshaping that length into H x W just to accommodate a Conv2d layer.
- They used L2 regularization, which I tried to reproduce via the optimizer's weight_decay parameter.
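To make the first bullet concrete, this is roughly what I mean by the encoding. The float values and the seq_to_floats helper below are placeholders of mine, not necessarily what the paper or their repo actually uses:

# Hypothetical nucleobase -> float mapping (placeholder values, not the paper's).
BASE_TO_FLOAT = {'T': 0.25, 'G': 0.50, 'A': 0.75, 'C': 1.00}

def seq_to_floats(seq: str, length: int = 31029) -> list:
    '''Encode a genome string as a fixed-length float vector, zero-padded.'''
    encoded = [BASE_TO_FLOAT.get(base, 0.0) for base in seq[:length].upper()]
    return encoded + [0.0] * (length - len(encoded))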
With all that said and done, I can't figure out why the training loss doesn't decrease. I might have missed something because I don't fully understand their approach, but a hint in the right direction would help me figure it out.
Code
Data .py file:
from typing import Dict, Tuple, Union

import torch
from torch import Tensor
from torch.utils.data import Dataset, DataLoader
from pandas import DataFrame
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler

Batch = DataLoader
DataContainer = Tuple[Dict[int, Tuple[Batch, Batch]], Batch]


class GenomeData(Dataset):
    def __init__(self, inputs: DataFrame, labels: DataFrame,
                 bs: int = 16,
                 shuffle: bool = True) -> None:
        self.bs = bs
        self.shuffle = shuffle
        self.inputs = Tensor(inputs.values)
        self.labels = Tensor(labels.values)

    def __len__(self) -> int:
        return len(self.labels)

    def __getitem__(self, i: Union[int, slice]) -> Tuple[Tensor, Tensor]:
        return self.inputs[i], self.labels[i]

    def __call__(self, tr_sz: float = 0.9, ts_sz: float = 0.1,
                 n_folds: int = 10) -> DataContainer:
        # Make use of sklearn for splitting the data
        X, X_ts, y, y_ts = train_test_split(self.inputs,
                                            self.labels,
                                            train_size=tr_sz,
                                            test_size=ts_sz,
                                            shuffle=False)
        # Standardize the testing set
        scaler = StandardScaler().fit(X_ts)
        X_ts_norm = Tensor(scaler.transform(X_ts))
        # Each loader yields a single tensor: features with the label as last column
        testing_set = DataLoader(torch.cat([X_ts_norm, y_ts], dim=1),
                                 batch_size=self.bs,
                                 shuffle=self.shuffle)
        skf = StratifiedKFold(n_splits=n_folds)
        folds = {}
        for i, (tr_i, val_i) in enumerate(skf.split(X, y), 1):
            X_tr, X_val = X[tr_i], X[val_i]
            y_tr, y_val = y[tr_i], y[val_i]
            # Standardize each fold separately, fitting only on its training part
            scaler = StandardScaler().fit(X_tr)
            X_tr_norm = Tensor(scaler.transform(X_tr))
            X_val_norm = Tensor(scaler.transform(X_val))
            training_set = DataLoader(torch.cat([X_tr_norm, y_tr], dim=1),
                                      batch_size=self.bs,
                                      shuffle=self.shuffle)
            validation_set = DataLoader(torch.cat([X_val_norm, y_val], dim=1),
                                        batch_size=self.bs,
                                        shuffle=self.shuffle)
            folds[i] = training_set, validation_set
        return folds, testing_set
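Just to be explicit about what the training loop further down expects: each batch coming out of these DataLoaders is a single (bs, LENGTH + 1) tensor whose last column holds the class label, i.e.

# Peek at one training batch: sequence in the first 31029 columns, class in the last.
for sample in training_set:
    inp, target = sample[:, :-1], sample[:, -1]
    print(inp.shape, target.shape)   # torch.Size([16, 31029]) torch.Size([16])
    break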
Model .py file:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

BATCH_SIZE = 16
CLASSES = 5
IN_CHANNELS = 1
LENGTH = 31029
KERNEL = 4
STRIDE = 4
DROPOUT = .5
# conv1d: N x C x L -> BATCH x CHANNELS x LENGTH
INPUT_SHAPE = (BATCH_SIZE, IN_CHANNELS, LENGTH)


class GenomeConvNet(nn.Module):
    '''
    Paper-based architecture.
    '''

    def __init__(self):
        super(GenomeConvNet, self).__init__()
        self.conv1 = nn.Conv1d(IN_CHANNELS, 32, KERNEL)
        self.conv2 = nn.Conv1d(32, 64, KERNEL)
        self.conv3 = nn.Conv1d(64, 128, KERNEL)
        # 483 = sequence length left after three conv (kernel 4) + pool (4, stride 4) stages
        self.fc1 = nn.Linear(128 * 483, 256)
        self.fc2 = nn.Linear(256, CLASSES)
        self.maxpool = nn.MaxPool1d(KERNEL, STRIDE)

    def forward(self, x: Tensor) -> Tensor:
        x = self.maxpool(F.relu(self.conv1(x)))
        x = self.maxpool(F.relu(self.conv2(x)))
        x = self.maxpool(F.relu(self.conv3(x)))
        x = x.view(-1, 128 * 483)
        # training=self.training so dropout is switched off in eval mode
        x = F.dropout(F.relu(self.fc1(x)), DROPOUT, training=self.training)
        return self.fc2(x)
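In case it helps anyone checking my numbers: the 128*483 feeding fc1 comes from applying three conv (kernel 4, stride 1) + maxpool (kernel 4, stride 4) stages to a length of 31029, and a quick dry run on random data confirms the output shape:

# Sanity check of the flattened size and the final output shape.
net = GenomeConvNet()
dummy = torch.randn(INPUT_SHAPE)   # (16, 1, 31029)
out = net(dummy)
print(out.shape)                   # torch.Size([16, 5])
# lengths: 31029 -> conv1/pool -> 7756 -> conv2/pool -> 1938 -> conv3/pool -> 483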
Training .py file:
from typing import Any, Union

from torch import Tensor
from torch.optim import Adam, SGD

# Batch and INPUT_SHAPE come from the data and model files above


def transform_tensor(x: Tensor, shape: Any) -> Tensor:
    '''
    Prepare tensor for the convolutional layer: (bs, L) -> (bs, 1, L).
    '''
    return x.view(shape)


def train_nn(net: Any,
             data: Batch,
             opt: str,
             criterion: Any,
             lr: float,
             decay: Union[float, None],
             epochs: int = 500) -> None:
    decay = 0.0 if decay is None else decay
    if opt == 'Adam':
        optim = Adam(net.parameters(), lr=lr, weight_decay=decay)
    elif opt == 'SGD':
        optim = SGD(net.parameters(), lr=lr, weight_decay=decay)
    else:
        raise NotImplementedError
    for i in range(epochs):
        net.train()
        running_loss = 0.0   # reset each epoch so the print is a per-epoch average
        for sample in data:
            inp, target = sample[:, :-1], sample[:, -1]
            try:
                inp = transform_tensor(inp, INPUT_SHAPE)
            except Exception:
                # the last batch may be smaller than BATCH_SIZE and cannot be
                # viewed as INPUT_SHAPE, so it is skipped
                break
            target = target.long()
            optim.zero_grad()
            out = net(inp)
            loss = criterion(out, target)
            loss.backward()
            optim.step()
            running_loss += loss.item()
        print(f"{running_loss / len(data)}")
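For completeness, this is roughly how I wire everything together. The CSV file names, the learning rate and the decay value are placeholders of mine; I use CrossEntropyLoss since the model outputs raw logits and the targets are class indices:

# Hypothetical driver script; 'sequences.csv' / 'labels.csv' are placeholder names.
import pandas as pd

inputs = pd.read_csv('sequences.csv')   # one row per genome, 31029 float columns
labels = pd.read_csv('labels.csv')      # one column with the class index (0-4)

dataset = GenomeData(inputs, labels, bs=BATCH_SIZE)
folds, testing_set = dataset(tr_sz=0.9, ts_sz=0.1, n_folds=10)

net = GenomeConvNet()
criterion = nn.CrossEntropyLoss()
training_set, validation_set = folds[1]   # first of the 10 folds
train_nn(net, training_set, opt='Adam', criterion=criterion,
         lr=1e-3, decay=1e-4, epochs=500)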
I would very much appreciate a hint.
Ty