Hello,
I am a student who has some limited experience with keras, and for a new project recently decided to learn how to use pytorch to implement my models. I’m a beginner with both, so apologies in advance for my inexperience, I am doing my best to follow tutorials, but my limited experience combined with most examples being in different uses has resulted in slower comprehension.
I have built a custom dataset class for my data, the X data is genetic sequence 256 bases long (i.e. “AGCTGGAGCT…”), so the resulting array after one-hotting to four channels for each of the four bases looks like [[[1,0,0,0],[0,1,0,0]...], [0,0,1,0], ...]]
and has shape 48,976, 256, 4. I read that Conv1d looks for channels first, so I permuted the channels in the dataset’s tensor to read in that way, resulting in torch.Size([48976, 4, 256])
. The Y data is 2 values for a given sequence of X, ESC and TSC, each numeric values derived from other source data. The dataset code is as follows:
device = "cuda" if torch.cuda.is_available() else "cpu"
def onehotseq(dataset, input_shape):
onehot = np.zeros(input_shape)
for i in range(0, dataset.shape[0]):
seq = dataset.iloc[i,1]
for c in range(0,len(seq)):
if (seq[c] == "A"):
onehot[i,c,:] = [1,0,0,0]
elif (seq[c] == "C"):
onehot[i,c,:] = [0,1,0,0]
elif (seq[c] == "G"):
onehot[i,c,:] = [0,0,1,0]
elif (seq[c] == "T"):
onehot[i,c,:] = [0,0,0,1]
return onehot
class EpiDataset(torch.utils.data.Dataset):
def __init__(self, Seq_filepath="path_to_sequence_data", Y_data_filepath="path_to_output_data"):
self.seq_data = pd.read_csv(Seq_filepath, sep="\t", header=None)
self.seq_data.rename(columns={0:"id", 1:"seq"}, inplace=True)
self.y_data = pd.read_csv(Y_data_filepath, sep="\t", header = 0)
self.y_data["ESC"] = np.log2((self.y_data["ESC.H3K27ac"].values+1)/(self.y_data["ESC.input"].values+1))
self.y_data["TSC"] = np.log2((self.y_data["TSC.H3K27ac"].values+1)/(self.y_data["TSC.input"].values+1))
self.dataset = self.seq_data.merge(self.y_data, on="id")
self.list_IDs = self.dataset["id"]
self.seq = self.dataset["seq"]
self.esc = self.dataset["ESC"]
self.tsc = self.dataset["TSC"]
self.input_shape = (self.dataset.shape[0], 256, 4)
self.onehotseq = onehotseq(self.dataset, self.input_shape)
self.tensorX = torch.from_numpy(self.onehotseq)
self.tensorX = self.tensorX.permute(0, 2, 1)
self.labels = self.dataset[["ESC","TSC"]].to_numpy()
self.tensorY = torch.from_numpy(self.labels)
def __len__(self):
return len(self.list_IDs)
def __getitem__(self, index):
ID = self.list_IDs[index]
seq = self.seq[index]
esc = self.esc[index]
tsc = self.tsc[index]
return {
"ID: ": ID,
"sequence: ": seq,
"ESC: ": esc,
"TSC: ": tsc
}
This all seems to work as intended, and I was able to design a Module class, which also seems to be functionally correct, but I get a type error whenever I try to use the model. The code and error are:
def nin_block(out_channels, kernel_size, padding="same"):
return nn.Sequential(
nn.LazyConv1d(out_channels, kernel_size, padding),
nn.ReLU(),
nn.LazyConv1d(out_channels, kernel_size=1), nn.ReLU(),
nn.LazyConv1d(out_channels, kernel_size=1), nn.ReLU()
)
class NeuralNetwork(nn.Module):
def __init__(self):
super(NeuralNetwork, self).__init__()
self.flatten = nn.Flatten()
self.NiN = nn.Sequential(
nin_block(32, kernel_size=11,padding="same"),
nn.MaxPool1d(3, stride=2),
nin_block(64, kernel_size=4, padding="same"),
nn.MaxPool1d(3, stride=2),
nin_block(128, kernel_size=4, padding="same"),
nn.MaxPool1d(3, stride=2),
nin_block(256, kernel_size=3, padding="same"),
nn.MaxPool1d(3, stride=2),
nn.Dropout(0.4),
nin_block(4, kernel_size=3, padding="same"),
nn.AdaptiveAvgPool1d(2),
nn.Flatten(),
)
def forward(self, x):
x = self.flatten(x)
logits = self.NiN(x)
return logits
Error message, resulting from running model = NeuralNetwork().to(device)
and then
logit = model(x.tensorX)
TypeError: conv1d() received an invalid combination of arguments - got (Tensor, Parameter, Parameter, tuple, tuple, tuple, int), but expected one of:
* (Tensor input, Tensor weight, Tensor bias, tuple of ints stride, tuple of ints padding, tuple of ints dilation, int groups)
didn't match because some of the arguments have invalid types: (Tensor, !Parameter!, !Parameter!, !tuple!, !tuple!, !tuple!, int)
* (Tensor input, Tensor weight, Tensor bias, tuple of ints stride, str padding, tuple of ints dilation, int groups)
didn't match because some of the arguments have invalid types: (Tensor, !Parameter!, !Parameter!, !tuple!, !tuple!, !tuple!, int)
My question is, what am I doing wrong either in building my module or dataset, or am I missing a step? The data loaded in is the prepared data for initial exploration/training of different model architectures.