GPU training with GCN sparse matrix bug

class GraphConvolution(nn.Module):

    def __init__(self, in_features, out_features, dropout=0., act=F.relu):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.dropout = dropout
        self.act = act
        self.weight = glorot_init(in_features, out_features)
        self.reset_parameters()

    def reset_parameters(self):
        torch.nn.init.xavier_uniform_(self.weight)

    def forward(self, input, adj):
        input = F.dropout(input, self.dropout, self.training)
        support = torch.mm(input, self.weight)
        output = torch.spmm(adj, support)
        output = self.act(output)
        return output

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + str(self.in_features) + ' -> ' \
               + str(self.out_features) + ')'
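
(glorot_init is a small helper that returns a Glorot-initialized weight parameter; a rough sketch, since its exact definition isn't shown here:)

import torch
import torch.nn as nn

def glorot_init(in_features, out_features):
    # Assumed implementation: a learnable weight matrix with Xavier/Glorot uniform init.
    weight = nn.Parameter(torch.empty(in_features, out_features))
    nn.init.xavier_uniform_(weight)
    return weight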

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def collate(samples):
    g_list, f_list = zip(*samples)
    batched_graph = dgl.batch([g.to(device) for g in g_list])
    features = torch.cat([f.to(device) for f in f_list])
    return batched_graph, features

train_data_loader = DataLoader(training_graphs, batch_size=batch_size, shuffle=True, collate_fn=collate)

model = GVAE(input_feat_dim=156,
             hidden_dim1=vae_d1,
             hidden_dim2=vae_d2,
             emb_weights=embedding_vector,
             dropout=0.0)

model.to(device)

Dear members,

When I train an autoencoder on the GPU using GCN, I run into this bug. Does anyone know how to resolve it?

Thanks

Which PyTorch version are you using? Could you install the nightly binary (in a new virtual environment) and rerun the code?
Also, is the code working on the CPU?

The code runs on "cpu". My PyTorch version is 1.4.0.
As you recommended, I installed the PyTorch 1.7.0 nightly (GPU version) in a new environment and also installed DGL (GPU version), but the error remains the same.

I reimplemented the network in DGL (PyTorch backend) instead of pure PyTorch. The problem is gone, but my CPU is much faster than the GPU. At the end of the day, I would like to have the network in PyTorch.

Could you post the input shapes so that we can reproduce this issue with your initial code snippet?

Hi,

Feature matrix: torch.Size([44, 156])
Adjacency matrix: torch.Size([44, 44])

By the way, I checked all the inputs and outputs: the adjacency matrix wasn't on the cuda:0 device, so I moved it there and it worked. My current problem: why is my GPU computation slower than my CPU, even though I have tried various batch sizes?
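
Concretely, the fix amounts to moving the normalized adjacency onto the same device as the features before the forward pass (a sketch of the relevant lines, using the same names as in the training step below):

adj_norm = preprocess_graph(adj)    # sparse tensor, created on the CPU
adj_norm = adj_norm.to(device)      # move it to cuda:0, matching the features
recovered, mu, logvar = model(features, adj_norm)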

Please see my training step

for epoch in range(num_epoch):
    model.train()

    for (g, features) in train_data_loader:

        adj = g.adjacency_matrix(transpose=False)
        adj = sp.coo_matrix(adj.to_dense())
        n_nodes, feat_dim = features.shape
        nodes = list(g.nodes())

        # Against class imbalance
        adj_norm = preprocess_graph(adj)
        adj_label = adj + sp.eye(adj.shape[0])
        adj_label = torch.FloatTensor(adj_label.toarray()).to(device)

        pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
        pos_weight = torch.from_numpy(np.array(pos_weight))
        norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

        print(features.device, adj_norm.to(device).device, adj_label.device)

        recovered, mu, logvar = model(features, adj_norm.to(device))
        loss_train = loss_function(recovered, adj_label, mu, logvar, n_nodes, norm, pos_weight)
        loss = loss_train
        optimizer.zero_grad()
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()
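
For reference, preprocess_graph is the standard symmetric GCN normalization D^-1/2 (A + I) D^-1/2 from the gae reference code; a minimal sketch, assuming a scipy sparse input and a torch sparse output:

import numpy as np
import scipy.sparse as sp
import torch

def preprocess_graph(adj):
    # Symmetrically normalize A + I and return it as a sparse torch tensor.
    adj_ = adj + sp.eye(adj.shape[0])
    rowsum = np.array(adj_.sum(1))
    deg_inv_sqrt = sp.diags(np.power(rowsum, -0.5).flatten())
    adj_norm = adj_.dot(deg_inv_sqrt).transpose().dot(deg_inv_sqrt).tocoo()
    indices = torch.from_numpy(np.vstack((adj_norm.row, adj_norm.col)).astype(np.int64))
    values = torch.from_numpy(adj_norm.data.astype(np.float32))
    return torch.sparse_coo_tensor(indices, values, torch.Size(adj_norm.shape))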

Additionally, all inputs are on cuda:0, but the CPU takes all the computation. Am I missing anything?

Thank you

I cannot reproduce the slowdown using your (partial) code snippets; I get approx. 0.0152 s/iter on the CPU and 0.00044 s/iter on the GPU using:

import torch
import torch.nn as nn
import torch.nn.functional as F
import time

class GraphConvolution(nn.Module):
    def __init__(self, in_features, out_features, dropout=0., act=F.relu):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.dropout = dropout
        self.act = act
        self.weight = nn.Parameter(torch.randn(in_features, out_features))
        self.reset_parameters()

    def reset_parameters(self):
        torch.nn.init.xavier_uniform_(self.weight)

    def forward(self, input, adj):
        input = F.dropout(input, self.dropout, self.training)
        support = torch.mm(input, self.weight)
        output = torch.spmm(adj, support)
        output = self.act(output)
        return output

device = 'cpu'  # switch to 'cuda:0' for the GPU run
model = GraphConvolution(in_features=156, out_features=44).to(device)
x = torch.randn(44, 156).to(device)
adj = torch.randn(44, 44).to(device)

for _ in range(10):
    out = model(x, adj)
    out.backward(torch.ones_like(out))

nb_iters = 100
torch.cuda.synchronize()
t0 = time.time()
for _ in range(nb_iters):
    out = model(x, adj)
    out.backward(torch.ones_like(out))
torch.cuda.synchronize()
t1 = time.time()
print((t1 - t0)/nb_iters)

Note that I had to change:

  • changed self.weight to an nn.Parameter, since glorot_init is undefined
  • used out_features=44, based on your provided shapes
  • used only GraphConvolution, since GVAE is undefined
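
As a side note, timings for such tiny workloads are dominated by kernel launch overhead; CUDA events give a device-side measurement as an alternative (a sketch, assuming the same model, x, adj, and nb_iters as above and a CUDA build):

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()
start.record()
for _ in range(nb_iters):
    out = model(x, adj)
    out.backward(torch.ones_like(out))
end.record()
torch.cuda.synchronize()
# elapsed_time returns milliseconds between the two recorded events
print(start.elapsed_time(end) / nb_iters, 'ms/iter')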

When I run your code on each device, I get:

cpu --> 0.00017951 s/iter
cuda:0 --> 0.000260612 s/iter

This is confusing; I was expecting results similar to yours.

My hyperparameters:
batch_size = 100 (batching graphs in DGL) --> dgl.batch()
input_feat_dim = 156
hidden_dim1= 32
hidden_dim2= 16

This is the complete model using graph convolutions, in case it helps further. Sorry for sending you the code piecemeal, and thanks a lot for taking the time to look into this issue.

(GraphConvolution is defined exactly as above.)

class GVAE(nn.Module):

    def __init__(self, input_feat_dim, hidden_dim1, hidden_dim2, emb_weights, dropout):
        super(GVAE, self).__init__()

        self.gc1 = GraphConvolution(input_feat_dim, hidden_dim1, dropout, act=F.relu)
        # Import node features
        self.gc1.weight.data = torch.Tensor(emb_weights)
        self.gc1.weight.requires_grad = True
        self.gc2 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x)
        self.gc3 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x)
        self.decoder = InnerProductDecoder(dropout, act=lambda x: x)

    def encode(self, x, adj):
        # x will be our embedding vectors
        hidden1 = self.gc1(x, adj)
        return self.gc2(hidden1, adj), self.gc3(hidden1, adj)

    def reparameterize(self, mu, logvar):
        if self.training:
            std = torch.exp(logvar)
            eps = torch.randn_like(std)
            return eps.mul(std).add_(mu)
        else:
            return mu

    def forward(self, x, adj):
        mu, logvar = self.encode(x, adj)
        z = self.reparameterize(mu, logvar)
        decoded = self.decoder(z)

        return decoded, mu, logvar

class InnerProductDecoder(nn.Module):
    """Decoder using the inner product for prediction."""

    def __init__(self, dropout, act=torch.sigmoid):
        super(InnerProductDecoder, self).__init__()
        self.dropout = dropout
        self.act = act

    def forward(self, z):
        z = F.dropout(z, self.dropout, training=self.training)
        adj = self.act(torch.mm(z, z.t()))
        return adj
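
loss_function isn't included above; it is essentially the standard GVAE objective (norm-weighted BCE on the reconstructed adjacency plus a KL term), roughly along these lines (a sketch, as in the gae reference implementations):

def loss_function(preds, labels, mu, logvar, n_nodes, norm, pos_weight):
    # Weighted reconstruction loss on the logits of the decoded adjacency.
    cost = norm * F.binary_cross_entropy_with_logits(preds, labels, pos_weight=pos_weight)
    # KL divergence to a standard normal; logvar here is log(std),
    # matching std = torch.exp(logvar) in reparameterize above.
    KLD = -0.5 / n_nodes * torch.mean(
        torch.sum(1 + 2 * logvar - mu.pow(2) - logvar.exp().pow(2), dim=1))
    return cost + KLD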

I tried Peter's code and got the same result: the CPU (0.00015 s/iter) is faster than CUDA (0.00026 s/iter) over 1000 iterations. This is perplexing.