Extremely slow backward pass: asking for guidance on using autograd for general-purpose computation

Hi,
I’m new to PyTorch, and I’m doing something a little heterodox. I build a very dynamic computational graph with a lot of in-place operations, owing to the complex nature of my subject. I expect it to be slow, since I build the whole ‘network’ from scratch, but the backward pass is just so slow (about 1 minute for the forward pass and 20+ minutes for backward) that I must have done something wrong. So I’m asking for advice on using PyTorch for general-purpose computation. Below is an example of my code:

First, I defined a function to update the state of one node:

def neuronForward(self, InputS, InputR, InputTau, StateTau, endTime, V0, Dv0, G, Tau_m, H, Tau_n, I, Tau_r, Tau_s, G_exc, G_inh ):
	Delta = (Tau_m+G*Tau_n)**2 - 4*Tau_m*Tau_n*(H+G)  # discriminant of the characteristic equation
	deta = Delta.data[0]
	nptr = (-1/Tau_r).data[0]
	npts = (-1/Tau_s).data[0]
	P = G/Tau_m +1/Tau_n
	Q = (H+G)/(Tau_m*Tau_n)
	L = I/(H+G)
	Cr = 1/(Tau_m*Tau_n) - 1/(Tau_m*Tau_r)
	Cs = 1/(Tau_m*Tau_n) - 1/(Tau_m*Tau_s)
	PQR = 1-P*Tau_r+Q*Tau_r**2
	PQS = 1-P*Tau_s+Q*Tau_s**2

	InheritMask = (InputTau == StateTau).float().detach()
	EffectiveMask = (InputTau > 0).float().detach()
	GexcMask = (InputR > 0).float().detach()
	GinhMask = (InputR < 0).float().detach()

	S0 = torch.sum(InheritMask*InputS)
	Kr = EffectiveMask*(GexcMask*G_exc*Tau_r*InputR/(Tau_r-Tau_s) - GinhMask*G_inh*Tau_r*InputR/(Tau_r-Tau_s))
	Ks = EffectiveMask*(InputS - GexcMask*G_exc*Tau_r*InputR/(Tau_r-Tau_s) + GinhMask*G_inh*Tau_r*InputR/(Tau_r-Tau_s))

	if(deta > 0):  # two distinct real roots
		r1 = -0.5/Tau_n -0.5*G/Tau_m +Delta**0.5/(2*Tau_m*Tau_n)
		r2 = -0.5/Tau_n -0.5*G/Tau_m -Delta**0.5/(2*Tau_m*Tau_n)
		if(r1.data[0]!= nptr and r2.data[0]!= nptr):
			Vr = Cr*Kr*Tau_r**2*torch.exp(-InputTau/Tau_r)/PQR
			C1r = Kr*(1/(Tau_m*(r1-r2)) + Cr*(Tau_r+r2*Tau_r**2)/((r1-r2)*PQR))
			C2r = -Cr*Kr*Tau_r**2/PQR-C1r
			Vr = Vr + C1r*torch.exp(r1*InputTau) + C2r*torch.exp(r2*InputTau)
			Dvr = -Cr*Kr*Tau_r*torch.exp(-InputTau/Tau_r)/PQR + C1r*r1*torch.exp(r1*InputTau) + C2r*r2*torch.exp(r2*InputTau)
		else:
			Vr = Cr*Kr*Tau_r*InputTau*torch.exp(-InputTau/Tau_r)/(P*Tau_r-2)
			C1r = Kr*(Tau_r*(P-Cr*Tau_m)-2)/((r1-r2)*(P*Tau_r-2)*Tau_m)
			C2r = -C1r
			Vr = Vr + C1r*torch.exp(r1*InputTau) + C2r*torch.exp(r2*InputTau)
			Dvr = Cr*Kr*Tau_r*torch.exp(-InputTau/Tau_r)/(P*Tau_r - 2) - Cr*Kr*InputTau*torch.exp(-InputTau/Tau_r)/(P*Tau_r - 2) \
				+ C1r*r1*torch.exp(r1*InputTau) + C2r*r2*torch.exp(r2*InputTau)
		if(r1.data[0]!= npts and r2.data[0]!= npts):
			Vs = Cs*Ks*Tau_s ** 2*torch.exp(-InputTau/Tau_s)/PQS
			C1s = Ks*(1/(Tau_m*(r1 - r2)) + Cs*(Tau_s + r2*Tau_s ** 2)/((r1 - r2)*PQS))
			C2s = -Cs*Ks*Tau_s ** 2/PQS - C1s
			Vs = Vs + C1s*torch.exp(r1*InputTau) + C2s*torch.exp(r2*InputTau)
			Dvs = -Cs*Ks*Tau_s*torch.exp(-InputTau/Tau_s)/PQS + C1s*r1*torch.exp(r1*InputTau) + C2s*r2*torch.exp(r2*InputTau)
		else:
			Vs = Cs*Ks*Tau_s*InputTau*torch.exp(-InputTau/Tau_s)/(P*Tau_s-2)
			C1s = Ks*(Tau_s*(P-Cs*Tau_m)-2)/((r1-r2)*(P*Tau_s-2)*Tau_m)
			C2s = -C1s
			Vs = Vs + C1s*torch.exp(r1*InputTau) + C2s*torch.exp(r2*InputTau)
			Dvs = Cs*Ks*Tau_s*torch.exp(-InputTau/Tau_s)/(P*Tau_s - 2) - Cs*Ks*InputTau*torch.exp(-InputTau/Tau_s)/(P*Tau_s - 2) \
					+ C1s*r1*torch.exp(r1*InputTau) + C2s*r2*torch.exp(r2*InputTau)
		C3 = (Dv0-S0/Tau_m+r2*L-r2*V0)/(r1-r2)
		C4 = V0-C3-L
		VI = C3*torch.exp(r1*StateTau)+C4*torch.exp(r2*StateTau)+L
		DvI = C3*r1*torch.exp(r1*StateTau) + C4*r2*torch.exp(r2*StateTau)
	elif(deta < 0):  # complex-conjugate roots: oscillatory solution
		alpha = -0.5/Tau_n -0.5*G/Tau_m
		beta = (-Delta)**0.5/(2*Tau_m*Tau_n)
		Vr = Cr*Kr*Tau_r ** 2*torch.exp(-InputTau/Tau_r)/PQR
		C1r = -Cr*Kr*Tau_r**2/PQR
		C2r = Cr*Kr*Tau_r*(1+alpha*Tau_r)/(beta*PQR)+Kr/(beta*Tau_m)
		Vr = Vr + torch.exp(alpha*InputTau)*(C1r*torch.cos(beta*InputTau)+C2r*torch.sin(beta*InputTau))
		Vs = Cs*Ks*Tau_s ** 2*torch.exp(-InputTau/Tau_s)/PQS
		C1s = -Cs*Ks*Tau_s**2/PQS
		C2s = Cs*Ks*Tau_s*(1+alpha*Tau_s)/(beta*PQS)+Ks/(beta*Tau_m)
		Vs = Vs + torch.exp(alpha*InputTau)*(C1s*torch.cos(beta*InputTau)+C2s*torch.sin(beta*InputTau))
		C3 = V0 - L
		C4 = (Dv0 - S0/Tau_m -alpha*C3)/beta
		VI = torch.exp(alpha*StateTau)*(C3*torch.cos(beta*StateTau)+C4*torch.sin(beta*StateTau)) + L
		Dvr = -Cr*Kr*Tau_r*torch.exp(-InputTau/Tau_r)/PQR + torch.exp(alpha*InputTau)*((alpha*C1r+beta*C2r)*torch.cos(beta*InputTau) +
		                                       (alpha*C2r-beta*C1r)*torch.sin(beta*InputTau))
		Dvs = -Cs*Ks*Tau_s*torch.exp(-InputTau/Tau_s)/PQS + torch.exp(alpha*InputTau)*((alpha*C1s+beta*C2s)*torch.cos(beta*InputTau) +
		                                       (alpha*C2s-beta*C1s)*torch.sin(beta*InputTau))
		DvI = torch.exp(alpha*StateTau)*((alpha*C3 + beta*C4)*torch.cos(beta*StateTau) +
		                                 (alpha*C4 - beta*C3)*torch.sin(beta*StateTau))
	elif(deta == 0):  # repeated real root
		r = -0.5/Tau_n -0.5*G/Tau_m
		if(r.data[0] != nptr):
			Vr = Cr*Kr*Tau_r**2*torch.exp(-InputTau/Tau_r)/PQR
			C1r = -Cr*Kr*Tau_r**2/PQR
			C2r = Cr*Kr*Tau_r*(1+r*Tau_r)/PQR +Kr/Tau_m
			Vr = Vr + (C1r+C2r*InputTau)*torch.exp(r*InputTau)
			Dvr = -Cr*Kr*Tau_r*torch.exp(-InputTau/Tau_r)/PQR + (r*C1r + (r*InputTau + 1)*C2r)*torch.exp(r*InputTau)
		else:
			Vr = Cr*Kr*InputTau**2*torch.exp(-InputTau/Tau_r)/2
			C2r = Kr/Tau_m
			Vr = Vr + C2r*InputTau*torch.exp(r*InputTau)
			Dvr = Cr*Kr*InputTau*torch.exp(-InputTau/Tau_r) - Cr*Kr*InputTau ** 2*torch.exp(-InputTau/Tau_r)/(2*Tau_r)+\
			      (r*InputTau+1)*C2r*torch.exp(r*InputTau)
		if(r.data[0] != npts):
			Vs = Cs*Ks*Tau_s ** 2*torch.exp(-InputTau/Tau_s)/PQS
			C1s = -Cs*Ks*Tau_s**2/PQS
			C2s = Cs*Ks*Tau_s*(1+r*Tau_s)/PQS +Ks/Tau_m
			Vs = Vs + (C1s+C2s*InputTau)*torch.exp(r*InputTau)
			Dvs = -Cs*Ks*Tau_s*torch.exp(-InputTau/Tau_s)/PQS + (r*C1s + (r*InputTau + 1)*C2s)*torch.exp(r*InputTau)
		else:
			Vs = Cs*Ks*InputTau**2*torch.exp(-InputTau/Tau_s)/2
			C2s = Ks/Tau_m
			Vs = Vs + C2s*InputTau*torch.exp(r*InputTau)
			Dvs = Cs*Ks*InputTau*torch.exp(-InputTau/Tau_s) - Cs*Ks*InputTau ** 2*torch.exp(-InputTau/Tau_s)/(2*Tau_s) +\
			      (r*InputTau+1)*C2s*torch.exp(r*InputTau)
		# repeated-root homogeneous solution
		C3 = V0 - L
		C4 = Dv0 - S0/Tau_m - r*C3
		VI = (C3 + C4*StateTau)*torch.exp(r*StateTau) + L
		DvI = (r*C3 + C4 + r*C4*StateTau)*torch.exp(r*StateTau)
	V = torch.sum(Vr+Vs) + VI
	Dv = torch.sum(Dvr + Dvs) + DvI
	endTime = couple(V, endTime, -Dv.data[0])  # couple() is a helper defined elsewhere (not shown)
	theTime = 0*InputTau + endTime
	InputTau = couple(theTime,InputTau,1)
	StateTau = couple(endTime,StateTau,1)

	InputS = Kr*(torch.exp(-InputTau/Tau_r)-torch.exp(-InputTau/Tau_s)) + EffectiveMask*InputS*torch.exp(-InputTau/Tau_s) +\
	         (1-EffectiveMask)*InputS
	InputR = EffectiveMask*InputR*torch.exp(-InputTau/Tau_r) + (1-EffectiveMask)*InputR

	return V, endTime, Dv, InputR, InputS

Then I update the states one by one in the main loop:

for ii in range(StateUpdateNum):
	neuronInd = int(SrIndex[ii])
	StateTau = SrTime[ii] - Time[neuronInd]
	InpTau = SrTime[ii] - PreSynpticTimeList[neuronInd]
	InpTau = torch.min(InpTau, StateTau)
	endTime = Variable(torch.Tensor([SrTime[ii]]), requires_grad=True)
	theV, endTime, theDv, InputR, InputS = \
		neuronForward(S[neuronInd], R[neuronInd], InpTau, StateTau, endTime, V[neuronInd], Dv[neuronInd], theG,
					  Tau_m[neuronInd],
					  H[neuronInd], theTau_n, theI, Tau_r[neuronInd], Tau_s[neuronInd], theG_exc, theG_inh)

	Time[neuronInd] = endTime
	S[neuronInd] = InputS
	R[neuronInd] = InputR
	Dv = Dv.clone()
	Dv[neuronInd] = theDv
	V = V.clone()
	V[neuronInd] = theV.detach()
	State[neuronInd] = SrState[ii]
	if (SrState[ii] == 4):
		if (NetSpkTime is None):
			NetSpkTime = endTime.clone()
		else:
			NetSpkTime = torch.cat((NetSpkTime, endTime))
		for postInd in ForwardLink[neuronInd + InpNeuronNum]:
			if (len(PreSynpticTimeList[postInd]) == 0):
				PreSynpticTimeList[postInd] = preTime + DelaysMatrix[postInd, neuronInd + InpNeuronNum]
			else:
				PreSynpticTimeList[postInd] = torch.cat(
					(PreSynpticTimeList[postInd], preTime + DelaysMatrix[postInd, neuronInd + InpNeuronNum]))
			if len(S[postInd]) == 0:
				S[postInd] = Variable(torch.zeros(1), requires_grad=True).clone()
				R[postInd] = WeightsMatrix[postInd, neuronInd + InpNeuronNum].clone()
			else:
				S[postInd] = torch.cat((S[postInd], Variable(torch.zeros(1), requires_grad=True)))
				R[postInd] = torch.cat((R[postInd], WeightsMatrix[postInd, neuronInd + InpNeuronNum]))

I understand that my ‘network’ is too fragmented and really bad for vectorization, but the ‘network’ is so dynamic that I can’t know each state and the connectivity before the state update. Even so, it shouldn’t be that slow. Can I do something to speed it up? Thanks in advance. And by the way, I can’t find the module torch.autograd.profiler in my PyTorch installation (Linux, Python 2.7, latest conda version); do I need a separate installation?
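
To show what I mean by ‘fragmented’, here is a tiny made-up benchmark (not my real code): the same reduction computed as many one-element operations versus one vectorized operation. Every small op records its own node in the graph, so both the forward and the backward pass pay a fixed per-op overhead:

import time
import torch
from torch.autograd import Variable

# same sum of squares, as N one-element ops vs. one vectorized op
N = 1000
xs = [Variable(torch.randn(1), requires_grad=True) for _ in range(N)]
x = Variable(torch.randn(N), requires_grad=True)

t0 = time.time()
loss = sum((xi ** 2).sum() for xi in xs)  # N tiny graph nodes
loss.backward()
t1 = time.time()

loss2 = (x ** 2).sum()  # a single graph node
loss2.backward()
t2 = time.time()

print('scalar ops: %.4fs  vectorized: %.4fs' % (t1 - t0, t2 - t1))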


Your network is very unusual and interesting.
One thing that will help us understand the problem better is if you can do the following:

1. Install PyTorch from source. Instructions are here: https://github.com/pytorch/pytorch#from-source

2. Use the autograd profiler to take a profile summary: http://pytorch.org/docs/master/autograd.html#profiler
   (The autograd profiler is, for now, only available in the master branch, hence the need to install from source.)

3. Post the profile output here, and we will look further.
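
For reference, here is roughly how to invoke it once you have a source build (a minimal sketch; check the linked docs for the exact API in your build):

import torch
from torch.autograd import Variable, profiler

x = Variable(torch.randn(64), requires_grad=True)

# run the workload under the profiler, then print the per-op table
with profiler.profile() as prof:
    y = ((x * 2 + 1) ** 2).sum()
    y.backward()

print(prof)  # columns: op name, CPU time, CUDA time, calls, totals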

Thanks for the reply. I have tried the profiler and the graph visualization.
The profile of the full network is too long, so I ran a simple net with a single ‘neuron’ (but the output is still too long), and I uploaded an example computational graph instead. Maybe it’s just that I use too many operations, and the op-call overhead dominates. I thought PyTorch could apply optimizations such as op fusion and reuse of repeated subgraphs in the backward pass, but maybe I didn’t use it right.
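
If the raw trace is too long to post, aggregating it by op name makes it much shorter. I think newer profiler builds expose key_averages() for this (I’m not sure it exists in every build, so treat this as a sketch):

import torch
from torch.autograd import Variable, profiler

x = Variable(torch.randn(64), requires_grad=True)
with profiler.profile() as prof:
    ((x * 2 + 1) ** 2).sum().backward()

# one aggregated row per op name instead of one row per call
print(prof.key_averages())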

Here is a small part of the profile output:


Name CPU time CUDA time Calls CPU total CUDA total


Sub 29.851us 0.000us 1 29.000us 0.000us
MulConstant 9.183us 0.000us 1 9.000us 0.000us
Add 5.844us 0.000us 1 5.000us 0.000us
Sub 4.935us 0.000us 1 4.000us 0.000us
MulConstant 4.535us 0.000us 1 4.000us 0.000us
Add 4.226us 0.000us 1 4.000us 0.000us
Sub 4.302us 0.000us 1 4.000us 0.000us
MulConstant 4.089us 0.000us 1 4.000us 0.000us
Add 3.743us 0.000us 1 3.000us 0.000us
Sub 4.033us 0.000us 1 4.000us 0.000us
MulConstant 4.064us 0.000us 1 4.000us 0.000us
Add 3.895us 0.000us 1 3.000us 0.000us
SubConstant 5.761us 0.000us 1 5.000us 0.000us
SubConstant 4.067us 0.000us 1 4.000us 0.000us
SubConstant 3.921us 0.000us 1 3.000us 0.000us
SubConstant 3.819us 0.000us 1 3.000us 0.000us
SubConstant 3.670us 0.000us 1 3.000us 0.000us
SubConstant 5.695us 0.000us 1 5.000us 0.000us
SubConstant 3.909us 0.000us 1 3.000us 0.000us
SubConstant 3.478us 0.000us 1 3.000us 0.000us
SubConstant 3.537us 0.000us 1 3.000us 0.000us
SubConstant 4.440us 0.000us 1 4.000us 0.000us
Clone 8.353us 0.000us 1 8.000us 0.000us
Clone 2.541us 0.000us 1 2.000us 0.000us
Index 7.655us 0.000us 1 7.000us 0.000us
Index 5.591us 0.000us 1 5.000us 0.000us
Add 5.253us 0.000us 1 5.000us 0.000us
Clone 2.849us 0.000us 1 2.000us 0.000us
Index 5.130us 0.000us 1 5.000us 0.000us
Clone 2.460us 0.000us 1 2.000us 0.000us
Index 4.396us 0.000us 1 4.000us 0.000us
Index 4.865us 0.000us 1 4.000us 0.000us
Add 4.246us 0.000us 1 4.000us 0.000us
Concat 10.516us 0.000us 1 10.000us 0.000us
Concat 4.911us 0.000us 1 4.000us 0.000us
Index 7.460us 0.000us 1 7.000us 0.000us
Concat 4.461us 0.000us 1 4.000us 0.000us
Index 4.265us 0.000us 1 4.000us 0.000us
Index 4.793us 0.000us 1 4.000us 0.000us
Add 5.977us 0.000us 1 5.000us 0.000us
Concat 5.696us 0.000us 1 5.000us 0.000us
Concat 4.192us 0.000us 1 4.000us 0.000us
Index 4.850us 0.000us 1 4.000us 0.000us
Concat 4.051us 0.000us 1 4.000us 0.000us
Index 4.810us 0.000us 1 4.000us 0.000us
SubConstant 6.971us 0.000us 1 6.000us 0.000us
SubConstant 4.399us 0.000us 1 4.000us 0.000us
Cmin 30.417us 0.000us 1 30.000us 0.000us
Index 5.044us 0.000us 1 5.000us 0.000us
MulConstant 6.544us 0.000us 1 6.000us 0.000us
Index 4.558us 0.000us 1 4.000us 0.000us
MulConstant 4.072us 0.000us 1 4.000us 0.000us
Add 4.428us 0.000us 1 4.000us 0.000us
Index 4.108us 0.000us 1 4.000us 0.000us
MulConstant 3.575us 0.000us 1 3.000us 0.000us
Add 3.925us 0.000us 1 3.000us 0.000us
Index 4.080us 0.000us 1 4.000us 0.000us
MulConstant 3.702us 0.000us 1 3.000us 0.000us
Add 3.821us 0.000us 1 3.000us 0.000us
Index 4.531us 0.000us 1 4.000us 0.000us
MulConstant 3.560us 0.000us 1 3.000us 0.000us
Add 3.762us 0.000us 1 3.000us 0.000us
Index 5.589us 0.000us 1 5.000us 0.000us
MulConstant 4.795us 0.000us 1 4.000us 0.000us
Index 4.479us 0.000us 1 4.000us 0.000us
MulConstant 3.612us 0.000us 1 3.000us 0.000us
Add 3.786us 0.000us 1 3.000us 0.000us
Index 4.004us 0.000us 1 4.000us 0.000us
MulConstant 3.699us 0.000us 1 3.000us 0.000us
Add 4.701us 0.000us 1 4.000us 0.000us
Index 4.231us 0.000us 1 4.000us 0.000us
MulConstant 3.663us 0.000us 1 3.000us 0.000us
Add 4.016us 0.000us 1 4.000us 0.000us
Index 4.205us 0.000us 1 4.000us 0.000us
MulConstant 3.606us 0.000us 1 3.000us 0.000us
Add 3.613us 0.000us 1 3.000us 0.000us
Index 4.348us 0.000us 1 4.000us 0.000us
MulConstant 4.694us 0.000us 1 4.000us 0.000us
Index 4.649us 0.000us 1 4.000us 0.000us
MulConstant 3.426us 0.000us 1 3.000us 0.000us
Add 3.798us 0.000us 1 3.000us 0.000us
Index 4.251us 0.000us 1 4.000us 0.000us
MulConstant 3.526us 0.000us 1 3.000us 0.000us
Add 164.220us 0.000us 1 164.000us 0.000us
Index 6.363us 0.000us 1 6.000us 0.000us
MulConstant 4.815us 0.000us 1 4.000us 0.000us
Add 4.333us 0.000us 1 4.000us 0.000us
Index 4.850us 0.000us 1 4.000us 0.000us
MulConstant 3.650us 0.000us 1 3.000us 0.000us
Add 3.837us 0.000us 1 3.000us 0.000us
Index 4.279us 0.000us 1 4.000us 0.000us
MulConstant 3.693us 0.000us 1 3.000us 0.000us
Index 4.209us 0.000us 1 4.000us 0.000us
MulConstant 3.543us 0.000us 1 3.000us 0.000us
Add 3.815us 0.000us 1 3.000us 0.000us
Index 4.101us 0.000us 1 4.000us 0.000us
MulConstant 3.645us 0.000us 1 3.000us 0.000us
Add 3.707us 0.000us 1 3.000us 0.000us
Index 4.173us 0.000us 1 4.000us 0.000us
MulConstant 3.631us 0.000us 1 3.000us 0.000us
Add 3.760us 0.000us 1 3.000us 0.000us
Index 4.318us 0.000us 1 4.000us 0.000us
MulConstant 3.546us 0.000us 1 3.000us 0.000us
Add 3.683us 0.000us 1 3.000us 0.000us
Index 4.085us 0.000us 1 4.000us 0.000us
MulConstant 3.684us 0.000us 1 3.000us 0.000us
Index 4.113us 0.000us 1 4.000us 0.000us
MulConstant 3.627us 0.000us 1 3.000us 0.000us
Add 3.818us 0.000us 1 3.000us 0.000us
Index 4.142us 0.000us 1 4.000us 0.000us
MulConstant 3.578us 0.000us 1 3.000us 0.000us
Add 3.733us 0.000us 1 3.000us 0.000us
Index 4.053us 0.000us 1 4.000us 0.000us
MulConstant 3.678us 0.000us 1 3.000us 0.000us
Add 3.715us 0.000us 1 3.000us 0.000us
Index 3.936us 0.000us 1 3.000us 0.000us
MulConstant 3.695us 0.000us 1 3.000us 0.000us
Add 3.708us 0.000us 1 3.000us 0.000us
Index 4.379us 0.000us 1 4.000us 0.000us
Index 5.327us 0.000us 1 5.000us 0.000us
Index 4.225us 0.000us 1 4.000us 0.000us
Index 4.189us 0.000us 1 4.000us 0.000us
Index 4.097us 0.000us 1 4.000us 0.000us
Index 4.061us 0.000us 1 4.000us 0.000us
Mul 6.801us 0.000us 1 6.000us 0.000us
Add 3.770us 0.000us 1 3.000us 0.000us
PowConstant 7.036us 0.000us 1 7.000us 0.000us
MulConstant 4.033us 0.000us 1 4.000us 0.000us
Mul 4.541us 0.000us 1 4.000us 0.000us
Add 3.642us 0.000us 1 3.000us 0.000us
Mul 3.976us 0.000us 1 3.000us 0.000us
Sub 4.136us 0.000us 1 4.000us 0.000us
DivConstant 7.525us 0.000us 1 7.000us 0.000us
DivConstant 5.071us 0.000us 1 5.000us 0.000us
Div 5.913us 0.000us 1 5.000us 0.000us
DivConstant 6.243us 0.000us 1 6.000us 0.000us
Add 3.929us 0.000us 1 3.000us 0.000us
Add 3.748us 0.000us 1 3.000us 0.000us
Mul 4.288us 0.000us 1 4.000us 0.000us
Div 4.352us 0.000us 1 4.000us 0.000us
Add 3.896us 0.000us 1 3.000us 0.000us
Div 4.120us 0.000us 1 4.000us 0.000us
Mul 4.095us 0.000us 1 4.000us 0.000us
DivConstant 5.071us 0.000us 1 5.000us 0.000us
Mul 7.763us 0.000us 1 7.000us 0.000us
DivConstant 4.915us 0.000us 1 4.000us 0.000us
Sub 3.874us 0.000us 1 3.000us 0.000us
Mul 4.318us 0.000us 1 4.000us 0.000us
DivConstant 4.795us 0.000us 1 4.000us 0.000us
Mul 4.951us 0.000us 1 4.000us 0.000us
DivConstant 5.476us 0.000us 1 5.000us 0.000us
Sub 3.845us 0.000us 1 3.000us 0.000us
Mul 4.227us 0.000us 1 4.000us 0.000us
SubConstant 4.658us 0.000us 1 4.000us 0.000us
PowConstant 4.957us 0.000us 1 4.000us 0.000us
Mul 4.069us 0.000us 1 4.000us 0.000us
Add 3.967us 0.000us 1 3.000us 0.000us
Mul 3.851us 0.000us 1 3.000us 0.000us
SubConstant 3.864us 0.000us 1 3.000us 0.000us
PowConstant 4.268us 0.000us 1 4.000us 0.000us
Mul 3.784us 0.000us 1 3.000us 0.000us
Add 3.587us 0.000us 1 3.000us 0.000us
Eq 14.635us 0.000us 1 14.000us 0.000us
Type 11.656us 0.000us 1 11.000us 0.000us
NoGrad 6.236us 0.000us 1 6.000us 0.000us
Gt 6.841us 0.000us 1 6.000us 0.000us
Type 7.187us 0.000us 1 7.000us 0.000us
NoGrad 3.962us 0.000us 1 3.000us 0.000us
Gt 5.028us 0.000us 1 5.000us 0.000us
Type 6.152us 0.000us 1 6.000us 0.000us
NoGrad 3.362us 0.000us 1 3.000us 0.000us
Lt 5.453us 0.000us 1 5.000us 0.000us
Type 5.978us 0.000us 1 5.000us 0.000us
NoGrad 3.166us 0.000us 1 3.000us 0.000us
Mul 4.906us 0.000us 1 4.000us 0.000us
Sum 12.540us 0.000us 1 12.000us 0.000us
Mul 6.646us 0.000us 1 6.000us 0.000us
Mul 4.944us 0.000us 1 4.000us 0.000us
Mul 4.456us 0.000us 1 4.000us 0.000us
Sub 4.104us 0.000us 1 4.000us 0.000us
Div 5.229us 0.000us 1 5.000us 0.000us
Mul 5.140us 0.000us 1 5.000us 0.000us
Mul 4.564us 0.000us 1 4.000us 0.000us
Mul 5.240us 0.000us 1 5.000us 0.000us
Sub 3.861us 0.000us 1 3.000us 0.000us

Nice. Can you give the full profile output? Maybe upload it to https://gist.github.com/ and post a link here?

This is the output of a single-node run. Now I’m thinking of cutting off all the connections, running all the nodes at the same time, and then linking the connections back… I have no idea yet how to link the connections back…
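
A rough sketch of that batching idea (all names, shapes, and the decay update here are hypothetical placeholders): keep the per-node state in flat tensors, update every node that is due in one vectorized step, and scatter the results back by index:

import torch
from torch.autograd import Variable

# hypothetical flat per-node state (shapes made up)
num_nodes = 8
V = Variable(torch.zeros(num_nodes), requires_grad=True).clone()
Tau = Variable(torch.rand(num_nodes) + 0.5, requires_grad=True)

active = torch.LongTensor([1, 3, 5])  # nodes due for an update
dt = 0.1

# one batched decay step instead of a Python loop over nodes
V_new = V[active] * torch.exp(-dt / Tau[active])

# scatter back by index; clone first so the in-place write does not
# invalidate tensors the graph still needs
V = V.clone()
V[active] = V_new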