Sure thing,
the code for loading data is taken from this thread.
What I can’t provide is the actual dataset.
Here’s an example:
class SparseDataset(tud.Dataset):
    """Map-style dataset over row-indexable feature and target containers.

    COO matrices cannot be indexed, so any COO input (including subclasses
    of ``sps.coo_matrix``) is converted to CSR once at construction time.
    Every other input is stored unchanged.

    Args:
        data: Feature container; ``len(dataset)`` is ``data.shape[0]``.
        targets: Target container, indexed with the same index as ``data``.
        transform: Stored on the instance but never applied by this class.
    """

    def __init__(self, data: Union[np.ndarray, sps.coo_matrix, sps.csr_matrix],
                 targets: Union[np.ndarray, sps.coo_matrix, sps.csr_matrix],
                 transform: bool = None):
        self.data = self._as_indexable(data)
        self.targets = self._as_indexable(targets)
        self.transform = transform  # Can be removed

    @staticmethod
    def _as_indexable(matrix):
        """Return ``matrix`` as CSR if it is a COO matrix, else unchanged."""
        # isinstance (not an exact type comparison) so COO subclasses are
        # converted as well instead of failing later in __getitem__.
        if isinstance(matrix, sps.coo_matrix):
            return matrix.tocsr()
        return matrix

    def __getitem__(self, index: int):
        """Return the ``(data[index], targets[index])`` pair."""
        return self.data[index], self.targets[index]

    def __len__(self):
        """Return the number of rows in ``data``."""
        return self.data.shape[0]
def sparse_coo_to_tensor(coo: sps.coo_matrix):
    """Convert a scipy COO matrix to a float32 PyTorch sparse COO tensor.

    Args:
        coo: Matrix exposing ``row``, ``col``, ``data`` and ``shape``.

    Returns:
        A sparse COO tensor with int64 indices, float32 values and the same
        shape as ``coo``.
    """
    # Row and column coordinates stacked into the (2, nnz) layout that
    # torch.sparse_coo_tensor expects.
    indices = torch.as_tensor(np.vstack((coo.row, coo.col)), dtype=torch.long)
    values = torch.as_tensor(coo.data, dtype=torch.float32)
    # torch.sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor
    # constructor; the explicit size keeps trailing all-zero rows/columns.
    return torch.sparse_coo_tensor(indices, values, size=torch.Size(coo.shape))
def _collate_field(items):
    """Stack one field of a batch (all data items or all target items)."""
    if sps.issparse(items[0]):
        # Sparse rows: stack vertically, then convert to a torch sparse tensor.
        return sparse_coo_to_tensor(sps.vstack(items).tocoo())
    # Dense items: build one contiguous array first, which avoids the slow
    # element-by-element conversion of a tuple of arrays.
    return torch.as_tensor(np.asarray(items), dtype=torch.float32)


def sparse_batch_collate(batch: list):
    """Collate ``(data, target)`` samples into a pair of batch tensors.

    Each field is handled independently: scipy sparse items are stacked and
    converted to a PyTorch sparse tensor, anything else becomes a dense
    float32 tensor.

    Args:
        batch: Non-empty list of ``(data, target)`` pairs.

    Returns:
        Tuple ``(data_batch, targets_batch)``.
    """
    data_batch, targets_batch = zip(*batch)
    return _collate_field(data_batch), _collate_field(targets_batch)
class MLP(nn.Module):
    """Stack of linear layers with dropout and two max-pooling steps.

    Feature widths: in_dim -> 2048 -> 1024 -> 512 -(pool)-> 256 -> 128 -> 64
    -(pool)-> 32 -> 16 -> 8 -> num_classes.

    Args:
        in_dim: Size of the last dimension of the input.
        num_classes: Number of output values per sample.
    """

    def __init__(self, in_dim, num_classes=2):
        # The original called super(LogisticRegression, self), which names a
        # class that does not exist here and raised NameError on construction.
        super().__init__()
        self.in_dim = in_dim
        self.num_classes = num_classes
        self.l1 = nn.Linear(in_dim, 2048)
        self.d1 = nn.Dropout(p=0.5)
        self.l2 = nn.Linear(2048, 1024)
        self.l3 = nn.Linear(1024, 512)
        self.m1 = nn.MaxPool1d(kernel_size=2, stride=2)  # 512 -> 256
        self.l4 = nn.Linear(256, 128)
        self.l5 = nn.Linear(128, 64)
        self.m2 = nn.MaxPool1d(kernel_size=2, stride=2)  # 64 -> 32
        self.l6 = nn.Linear(32, 16)
        self.l7 = nn.Linear(16, 8)
        self.l8 = nn.Linear(8, num_classes)

    def forward(self, x):
        """Return raw (un-normalised) outputs for a ``(batch, in_dim)`` input."""
        x = self.l1(x)
        x = self.d1(x)
        x = self.l2(x).relu_()
        x = self.l3(x)
        # Pooling needs a channel axis; add it at dim 1 and remove exactly
        # that axis afterwards. A bare squeeze() would also drop the batch
        # axis when the batch holds a single sample.
        x = self.m1(x.unsqueeze(dim=1)).squeeze(dim=1)
        x = self.l4(x)
        x = self.l5(x)
        x = self.m2(x.unsqueeze(dim=1)).squeeze(dim=1)
        x = self.l6(x)
        x = self.l7(x).relu_()
        x = self.l8(x)
        return x
def criterion(model, x, y):
    """Run ``model`` on ``x`` and compute mean binary cross-entropy against ``y``.

    Column 1 of the model output is treated as the logit of the positive
    class.

    Args:
        model: Callable returning a 2-D tensor with at least two columns.
        x: Input passed to ``model``.
        y: Float target tensor with one value per row of the model output.

    Returns:
        Tuple ``(loss, y_hat)`` where ``y_hat`` is the raw model output.
    """
    y_hat = model(x)
    # The logits-based loss fuses sigmoid and log, so it stays accurate for
    # large-magnitude logits where sigmoid followed by BCELoss saturates.
    loss = nn.functional.binary_cross_entropy_with_logits(
        y_hat[:, 1], y, reduction='mean'
    )
    return loss, y_hat
# Synthetic stand-in for the real dataset: a random sparse feature matrix
# with 80000 rows and 4096 columns, 25% of the entries non-zero.
X = sps.random(80000, 4096, density=0.25)
# One integer label per row of X (0 .. 79999).
# NOTE(review): criterion feeds its targets to a binary cross-entropy loss,
# which expects values in [0, 1] -- presumably placeholder labels; confirm.
y = np.arange(X.shape[0])
dset = SparseDataset(X, y)
# Custom collate function so sparse rows are batched into torch sparse tensors.
loader = tud.DataLoader(
    dset,
    batch_size=512,
    collate_fn=sparse_batch_collate,
)
Training then proceeds as usual.