Optimizer slow for one step after manually changing weights

Hey!

I am currently working on a project training a GCN in which the network parameters are manually changed periodically between Adam steps. This seems to be working well; however, after the weights are changed, the first reference to the model parameters takes about 0.3 seconds, compared to roughly 0.003 seconds for a normal Adam step. For further context, I first noticed this during a call to optim.step(), but the behavior can be replicated by simply printing out the parameters of the network instead.
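
To make the setup concrete, here is a rough sketch of the pattern I mean, using a toy linear model rather than the actual GCN and a small random perturbation as the manual weight change:

import time

import torch
from torch.nn.utils import parameters_to_vector, vector_to_parameters

model = torch.nn.Linear(128, 128).to("cuda")
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for step in range(10):
    # Ordinary Adam step on a random batch.
    optimizer.zero_grad()
    loss = model(torch.randn(64, 128, device="cuda")).sum()
    loss.backward()
    optimizer.step()

    if step % 5 == 4:
        # Manually overwrite the weights with a perturbed copy.
        flat = parameters_to_vector(model.parameters()).detach()
        vector_to_parameters(flat + 0.01 * torch.randn_like(flat), model.parameters())

        # Time the first reference to the parameters after the change;
        # printing forces the values to actually be read on the host.
        start = time.time()
        print(parameters_to_vector(model.parameters()).sum())
        print("first access after manual change:", time.time() - start)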

I am confused about what may be causing this. It seems as though there is some shifting of data happening in the background that I am not aware of, since this is a one-time event after changing the weights. Any help is appreciated!

Edit: For anyone else who comes across this, I have posted the repro code in my reply below. The issue seems to be an interaction between CuPy and PyTorch that is yet to be diagnosed.

Hi @Christopher_Brissett,

Do you have a minimal reproducible example for your problem?

If you’re changing all of your parameters, you might be syncing your model, which could be what slows it down.

I am currently working on this. It appears that some CuPy operations I am doing may be causing the issue further downstream. I will post it here when I get it working, or rather, broken, I guess.

import cupy as cp
import numpy as np
import time

import torch
from torch.nn.utils import parameters_to_vector,vector_to_parameters
from torch import optim
from torch.nn import NLLLoss
from torch_geometric.nn import GCNConv
import torch_geometric.datasets as tds
import torch.nn.functional as F

class Net(torch.nn.Module):
	def __init__(self, num_features, hidden, num_classes ):

		seed = np.random.randint(0,high=999999,dtype=int)
		torch.manual_seed(seed)
		
		super(Net, self).__init__()
		self.conv1 = GCNConv(num_features, hidden)
		self.conv2 = GCNConv(hidden, num_classes)

	def reset_parameters(self):
		self.conv1.reset_parameters()
		self.conv2.reset_parameters()

	def forward(self, torchG):
		x, edge_index, = torchG.x, torchG.edge_index

		x = self.conv1(x, edge_index)
		x = F.relu(x)
		x = F.dropout(x, training=self.training)
		x = self.conv2(x, edge_index)

		return F.log_softmax(x, dim=1)

def train_epoch(model, data, optimizer, loss_fn, mask=True):
	model.train()
	optimizer.zero_grad()
	yhat = model(data)
	loss = loss_fn(yhat[mask], data.y[mask])
	loss.backward()
	optimizer.step()
	return loss

def cupy_stuff(data, times=False):

    if times:
        print("cupy forecast...")
        print("    data shape: " + str(data.shape))

    # Perform the SVD.
    if times:
        start = time.time()
    u, s, vt = cp.linalg.svd(data, full_matrices=False)
    if times:
        print("    svd time: " + str(time.time() - start))

    # Return outs.
    return 0


def train_full( model, graph, optimizer, loss_fn, its, track = 20, printem=False ):

	# Get the breakdown of our parameter dimensions for flattening.
	if track > 0:
		flat_params = parameters_to_vector(model.parameters())
		param_num = flat_params.size()[0]
		appended_param_num = param_num + (32 - param_num%32)
		W = cp.zeros((track,appended_param_num))

	# Loop through our epochs, performing a cupy svd every 20 steps.
	for epoch in range(its):

		# Get the start time for our iteration.
		start = time.time()

		# If the iteration is correct, predict weights.
		if track > 0 and epoch%track == 0 and epoch > 0:

			# Temporarily reshape W.
			W = W.transpose().reshape((appended_param_num//32,32,track))

			# Do some garbage cupy operation on W.
			cupy_stuff( W )
			
			# Reshape W.
			W = W.transpose().reshape(track,appended_param_num)

		# Otherwise perform some standard adam steps!
		else:
			train_epoch(model, graph, optimizer, loss_fn, mask=graph.train_mask)

		# Get the parameters and populate W.
		if track > 0:
			W[epoch%track,0:param_num] = cp.array( torch.tensor(parameters_to_vector(model.parameters()),device="cuda:0"), dtype=cp.float32 )

		# Print the end time of our iteration.
		if printem:
			if epoch%track == 0:
				print()
				print( "SVD STEP" )
			if epoch%track == 1:
				print( " vvv THIS IS WHERE THINGS GENERALLY BREAK vvv " )
			print( "iteration: "+str(epoch)+" time: "+str(time.time()-start) )
			if epoch%track == 1:
				print( )



cora = tds.Planetoid(root="tmp/Cora//", name='Cora', split='full')[0].to('cuda')

num_features = cora.x.shape[1]
num_classes  = cora.y.unique().shape[0]
model = Net( num_features, 32, num_classes ).to('cuda')

loss = NLLLoss()

optimizer = optim.Adam(model.parameters(), lr=0.0001)

train_full( model, cora, optimizer, loss, 30 )
train_full( model, cora, optimizer, loss, 100, printem=True )

Here is a simple script example of what is happening. Note that the function cupy_stuff will ideally be done in CuPy, because I have noticed that the batched CuPy SVD is up to 10x faster than the batched torch SVD. It should also be noted that the slowdown does not happen when using the torch SVD. This leads me to believe CuPy is “kicking PyTorch out” in some way, but I can’t figure out how, or how to fix it.
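
For reference, the comparison behind the 10x claim was along these lines; the shapes here are just illustrative (roughly the padded weight history above), not a careful benchmark:

import time

import cupy as cp
import torch

# A stack of small matrices, roughly the shape W is reshaped to above.
batch, rows, cols = 700, 32, 20

a_cp = cp.random.rand(batch, rows, cols, dtype=cp.float32)
a_torch = torch.rand(batch, rows, cols, device="cuda")

# Synchronize around each timed region so the host timer sees finished GPU work.
cp.cuda.Device(0).synchronize()
start = time.time()
u, s, vt = cp.linalg.svd(a_cp, full_matrices=False)
cp.cuda.Device(0).synchronize()
print("cupy batched svd: ", time.time() - start)

torch.cuda.synchronize()
start = time.time()
u, s, vt = torch.linalg.svd(a_torch, full_matrices=False)
torch.cuda.synchronize()
print("torch batched svd:", time.time() - start)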

CUDA operations are executed asynchronously, which means you would need to synchronize the code before starting and stopping the host timers.
In your current code you are profiling the dispatching, kernel launches, and random implicit synchronizations, which would output an invalid profile and will not reflect the actual kernel execution times.
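
For example, the per-iteration timing in train_full could be adjusted along these lines (a sketch using the names from the posted script) so that the reported time corresponds to finished GPU work:

# Make sure previously launched GPU work has finished before starting the timer.
torch.cuda.synchronize()
start = time.time()

train_epoch(model, cora, optimizer, loss, mask=cora.train_mask)

# Wait for the work launched in this iteration before stopping the timer.
torch.cuda.synchronize()
print("iteration time:", time.time() - start)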