Hey!
I am currently working on a project training a gcn where the network parameters are manually changed periodically between adam steps. This seems to be working well, however, after the weights are changed, the first reference to the model parameters takes .3 seconds. This is in contrast to 0.003 seconds for an adam step. For further context I first noted this during a call to optim.step(), however the behavior can be replicated by just printing out the parameters of the network instead.
I am confused what may be causing this. It seems as though there is some shifting of data in the background that I am not aware of since this is a one-time event after changing the weights. Any help is appreciated!
Edit: I have added the code below for anyone else who comes across this. The issue seems to be an interaction between cupy and pyTorchthat is yet to be diagnosed.
import cupy as cp
import numpy as np
import time
import torch
from torch.nn.utils import parameters_to_vector,vector_to_parameters
from torch import optim
from torch.nn import NLLLoss
from torch_geometric.nn import GCNConv
import torch_geometric.datasets as tds
import torch.nn.functional as F
class Net(torch.nn.Module):
def __init__(self, num_features, hidden, num_classes ):
seed = np.random.randint(0,high=999999,dtype=int)
torch.manual_seed(seed)
super(Net, self).__init__()
self.conv1 = GCNConv(num_features, hidden)
self.conv2 = GCNConv(hidden, num_classes)
def reset_parameters(self):
self.conv1.reset_parameters()
self.conv2.reset_parameters()
def forward(self, torchG):
x, edge_index, = torchG.x, torchG.edge_index
x = self.conv1(x, edge_index)
x = F.relu(x)
x = F.dropout(x, training=self.training)
x = self.conv2(x, edge_index)
return F.log_softmax(x, dim=1)
def train_epoch(model, data, optimizer, loss_fn, mask=True):
model.train()
optimizer.zero_grad()
yhat = model(data)
loss = loss_fn(yhat[mask], data.y[mask])
loss.backward()
optimizer.step()
return loss
def cupy_stuff( data, times=False ):
if times:
print("cupy forecast...")
print(" data shape: "+str(data.shape))
# Perform the svd
if times:
start = time.time()
u,s,vt = cp.linalg.svd( data, full_matrices=False )
if times:
print( " svd time: "+str(time.time()-start) )
# Return outs.
return( 0 )
def train_full( model, graph, optimizer, loss_fn, its, track = 20, printem=False ):
# Get the breakdown of our parameter dimensions for flattening.
if track > 0:
flat_params = parameters_to_vector(model.parameters())
param_num = flat_params.size()[0]
appended_param_num = param_num + (32 - param_num%32)
W = cp.zeros((track,appended_param_num))
# Loop through our epochs, performing a cupy svd every 20 steps.
for epoch in range(its):
# Get the start time for our iteration.
start = time.time()
# If the iteration is correct, predict weights.
if track > 0 and epoch%track == 0 and epoch > 0:
# Temporarily reshape W.
W = W.transpose().reshape((appended_param_num//32,32,track))
# Do some garbage cupy operation on W.
cupy_stuff( W )
# Reshape W.
W = W.transpose().reshape(track,appended_param_num)
# Otherwise perform some standard adam steps!
else:
train_epoch(model, graph, optimizer, loss_fn, mask=graph.train_mask)
# Get the parameters and populate W.
if track > 0:
W[epoch%track,0:param_num] = cp.array( torch.tensor(parameters_to_vector(model.parameters()),device="cuda:0"), dtype=cp.float32 )
# Print the end time of our iteration.
if printem:
if epoch%track == 0:
print()
print( "SVD STEP" )
if epoch%track == 1:
print( " vvv THIS IS WHERE THINGS GENERALLY BREAK vvv " )
print( "iteration: "+str(epoch)+" time: "+str(time.time()-start) )
if epoch%track == 1:
print( )
cora = tds.Planetoid(root="tmp/Cora//", name='Cora', split='full')[0].to('cuda')
num_features = cora.x.shape[1]
num_classes = cora.y.unique().shape[0]
model = Net( num_features, 32, num_classes ).to('cuda')
loss = NLLLoss()
optimizer = optim.Adam
optimizer = optimizer( model.parameters(), lr=0.0001 )
train_full( model, cora, optimizer, loss, 30 )
train_full( model, cora, optimizer, loss, 100, printem=True )