I am facing this error while running cluster GCN. I am not able to understand why am getting this error and how to resolve it. I tried setting up conda venv for pytorch3d and torch-geometric on my local system to run this code on the CPU but failed multiple times coz of torch-sparse.
I am trying to build a basic code inspired by https://github.com/pyg-team/pytorch_geometric/blob/master/examples/cluster_gcn_ppi.py
All suggestions and help will be appreciated. Thanks in advance!
import os
import torch
import torch.nn.functional as F
from torch_geometric.loader import ClusterData, ClusterLoader, DataLoader
from torch_geometric.nn import BatchNorm, SAGEConv
from torch_geometric.data import Data
class ClusterGCN(torch.nn.Module):
def __init__(self, in_channels, hidden_channels, out_channels,
noise_dim, device, dropout, n_partitions, batch_size,
num_layers, saving_dir, shuffle=False):
super(ClusterGCN, self).__init__()
self.in_channels = in_channels
self.hidden_channels = hidden_channels
self.out_channels = out_channels
self.n_partitions = n_partitions
self.batch_size = batch_size
self.saving_dir = saving_dir
self.device = device
self.num_layers = num_layers
self.shuffle = shuffle
self.conv0 = SAGEConv(in_channels, hidden_channels)
self.conv = SAGEConv(hidden_channels, hidden_channels)
self.convfinal = SAGEConv(hidden_channels, out_channels)
self.bn = BatchNorm(hidden_channels)
self.lin = torch.nn.Linear(hidden_channels + noise_dim, hidden_channels)
self.dropout = torch.nn.Dropout(p=dropout)
self.num_layers = num_layers
def forward(self, x, adj, noise):
# get edge matrix and create the graph
# for clustering and loader
edge_index = adj.coalesce().indices()
train_dataset = Data(x=x, edge_index=edge_index)
# clustering and loading the train_data
clustered_train_dataset = ClusterData(train_dataset,
num_parts=self.n_partitions,
recursive=False,
save_dir=self.saving_dir)
train_dataloader = ClusterLoader(clustered_train_dataset,
batch_size=self.batch_size,
shuffle=self.shuffle,
num_workers=0)
# train the model for the given object mesh
# partitioned into subgraphs
outputs = list()
print("length of the train_dataloader: ", len(train_dataloader))
for idx, data in enumerate(train_dataloader):
print("idx:", idx)
data = data.to(self.device)
x, edge_index = data.x, data.edge_index
x = x.to(self.device)
edge_index = edge_index.to(self.device)
# initial layer
# x: (N, in_channels)
x = self.dropout(F.relu(self.bn(self.conv0(x, edge_index)))) # (N, hidden)
# print("x0 shape: ", x.shape)
# adding noise
# cluster_noise = noise
# cluster_noise = cluster_noise.unsqueeze(dim=0).repeat(x.shape[0],1) # (N,noise_dim)
# print("noise shape: ", cluster_noise.shape)
x = torch.cat((x, cluster_noise), dim=1) # (N, noise_dim + hidden)
x = self.lin(x) # (N, hidden)
# print("x+noise shape after lin: ", x.shape)
# in-between layers
for _ in range(self.num_layers-2):
# x: (N, hidden)
x = self.dropout(F.relu(self.bn(self.conv(x, edge_index))))
# print("x in between shape: ", x.shape)
# final layer
x = self.convfinal(x, edge_index) # (N, out_channels)
outputs.append(x)
texture = torch.cat(outputs, dim=0).to(self.device) # (V, channels)
return texture
Traceback (most recent call last):
File "/home/gauravs/github/arete/arete-realsim/gnn.py", line 213, in <module>
val_fid = get_fid(shapenet_path, netG, val_set, device, dual, out_res, category=category,
File "/home/gauravs/github/arete/arete-realsim/Utils/fid.py", line 178, in get_fid
texture = model(features, adj, noise).to(device).unsqueeze(dim=0)
File "/home/gauravs/miniconda3/envs/pytorch3d_gpu/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/gauravs/github/arete/arete-realsim/Models/clustergcn.py", line 38, in forward
clustered_train_dataset = ClusterData(train_dataset,
File "/home/gauravs/miniconda3/envs/pytorch3d_gpu/lib/python3.9/site-packages/torch_geometric/loader/cluster.py", line 58, in __init__
self.data = self.__permute_data__(data, perm, adj)
File "/home/gauravs/miniconda3/envs/pytorch3d_gpu/lib/python3.9/site-packages/torch_geometric/loader/cluster.py", line 70, in __permute_data__
out[key] = item[node_idx]
RuntimeError: CUDA error: device-side assert triggered