Extremely slow backward pass: asking for guidance on using autograd for general-purpose computation

Hi,
I’m new to PyTorch, and I’m doing something a little heterodox. I build a very dynamic computational graph with a lot of in-place operations, owing to the complex nature of my subject. I expect it to be slow, since I build the whole ‘network’ from scratch, but the backward pass is just so slow (about 1 minute for the forward pass and 20+ minutes for backward) that I must have done something wrong. So I’m asking for advice on using PyTorch for general-purpose computation. Below is an example of my code:

First, I defined a function to update the state of one node:

def neuronForward(self, InputS, InputR, InputTau, StateTau, endTime, V0, Dv0, G, Tau_m, H, Tau_n, I, Tau_r, Tau_s, G_exc, G_inh ):
	Delta = (Tau_m+G*Tau_n)**2 - 4*Tau_m*Tau_n*(H+G)  # discriminant of the characteristic equation
	deta = Delta.data[0]
	nptr = (-1/Tau_r).data[0]
	npts = (-1/Tau_s).data[0]
	P = G/Tau_m +1/Tau_n
	Q = (H+G)/(Tau_m*Tau_n)
	L = I/(H+G)
	Cr = 1/(Tau_m*Tau_n) - 1/(Tau_m*Tau_r)
	Cs = 1/(Tau_m*Tau_n) - 1/(Tau_m*Tau_s)
	PQR = 1-P*Tau_r+Q*Tau_r**2
	PQS = 1-P*Tau_s+Q*Tau_s**2

	InheritMask = (InputTau == StateTau).float().detach()
	EffectiveMask = (InputTau > 0).float().detach()
	GexcMask = (InputR > 0).float().detach()
	GinhMask = (InputR < 0).float().detach()

	S0 = torch.sum(InheritMask*InputS)
	Kr = EffectiveMask*(GexcMask*G_exc*Tau_r*InputR/(Tau_r-Tau_s) - GinhMask*G_inh*Tau_r*InputR/(Tau_r-Tau_s))
	Ks = EffectiveMask*(InputS - GexcMask*G_exc*Tau_r*InputR/(Tau_r-Tau_s) + GinhMask*G_inh*Tau_r*InputR/(Tau_r-Tau_s))

	if(deta > 0):  # two distinct real roots
		r1 = -0.5/Tau_n -0.5*G/Tau_m +Delta**0.5/(2*Tau_m*Tau_n)
		r2 = -0.5/Tau_n -0.5*G/Tau_m -Delta**0.5/(2*Tau_m*Tau_n)
		if(r1.data[0]!= nptr and r2.data[0]!= nptr):
			Vr = Cr*Kr*Tau_r**2*torch.exp(-InputTau/Tau_r)/PQR
			C1r = Kr*(1/(Tau_m*(r1-r2)) + Cr*(Tau_r+r2*Tau_r**2)/((r1-r2)*PQR))
			C2r = -Cr*Kr*Tau_r**2/PQR-C1r
			Vr = Vr + C1r*torch.exp(r1*InputTau) + C2r*torch.exp(r2*InputTau)
			Dvr = -Cr*Kr*Tau_r*torch.exp(-InputTau/Tau_r)/PQR + C1r*r1*torch.exp(r1*InputTau) + C2r*r2*torch.exp(r2*InputTau)
		else:
			Vr = Cr*Kr*Tau_r*InputTau*torch.exp(-InputTau/Tau_r)/(P*Tau_r-2)
			C1r = Kr*(Tau_r*(P-Cr*Tau_m)-2)/((r1-r2)*(P*Tau_r-2)*Tau_m)
			C2r = -C1r
			Vr = Vr + C1r*torch.exp(r1*InputTau) + C2r*torch.exp(r2*InputTau)
			Dvr = Cr*Kr*Tau_r*torch.exp(-InputTau/Tau_r)/(P*Tau_r - 2) - Cr*Kr*InputTau*torch.exp(-InputTau/Tau_r)/(P*Tau_r - 2) \
				+ C1r*r1*torch.exp(r1*InputTau) + C2r*r2*torch.exp(r2*InputTau)
		if(r1.data[0]!= npts and r2.data[0]!= npts):
			Vs = Cs*Ks*Tau_s ** 2*torch.exp(-InputTau/Tau_s)/PQS
			C1s = Ks*(1/(Tau_m*(r1 - r2)) + Cs*(Tau_s + r2*Tau_s ** 2)/((r1 - r2)*PQS))
			C2s = -Cs*Ks*Tau_s ** 2/PQS - C1s
			Vs = Vs + C1s*torch.exp(r1*InputTau) + C2s*torch.exp(r2*InputTau)
			Dvs = -Cs*Ks*Tau_s*torch.exp(-InputTau/Tau_s)/PQS + C1s*r1*torch.exp(r1*InputTau) + C2s*r2*torch.exp(r2*InputTau)
		else:
			Vs = Cs*Ks*Tau_s*InputTau*torch.exp(-InputTau/Tau_s)/(P*Tau_s-2)
			C1s = Ks*(Tau_s*(P-Cs*Tau_m)-2)/((r1-r2)*(P*Tau_s-2)*Tau_m)
			C2s = -C1s
			Vs = Vs + C1s*torch.exp(r1*InputTau) + C2s*torch.exp(r2*InputTau)
			Dvs = Cs*Ks*Tau_s*torch.exp(-InputTau/Tau_s)/(P*Tau_s - 2) - Cs*Ks*InputTau*torch.exp(-InputTau/Tau_s)/(P*Tau_s - 2) \
					+ C1s*r1*torch.exp(r1*InputTau) + C2s*r2*torch.exp(r2*InputTau)
		C3 = (Dv0-S0/Tau_m+r2*L-r2*V0)/(r1-r2)
		C4 = V0-C3-L
		VI = C3*torch.exp(r1*StateTau)+C4*torch.exp(r2*StateTau)+L
		DvI = C3*r1*torch.exp(r1*StateTau) + C4*r2*torch.exp(r2*StateTau)
	elif(deta < 0):  # complex-conjugate roots: oscillatory solution
		alpha = -0.5/Tau_n -0.5*G/Tau_m
		beta = (-Delta)**0.5/(2*Tau_m*Tau_n)
		Vr = Cr*Kr*Tau_r ** 2*torch.exp(-InputTau/Tau_r)/PQR
		C1r = -Cr*Kr*Tau_r**2/PQR
		C2r = Cr*Kr*Tau_r*(1+alpha*Tau_r)/(beta*PQR)+Kr/(beta*Tau_m)
		Vr = Vr + torch.exp(alpha*InputTau)*(C1r*torch.cos(beta*InputTau)+C2r*torch.sin(beta*InputTau))
		Vs = Cs*Ks*Tau_s ** 2*torch.exp(-InputTau/Tau_s)/PQS
		C1s = -Cs*Ks*Tau_s**2/PQS
		C2s = Cs*Ks*Tau_s*(1+alpha*Tau_s)/(beta*PQS)+Ks/(beta*Tau_m)
		Vs = Vs + torch.exp(alpha*InputTau)*(C1s*torch.cos(beta*InputTau)+C2s*torch.sin(beta*InputTau))
		C3 = V0 - L
		C4 = (Dv0 - S0/Tau_m -alpha*C3)/beta
		VI = torch.exp(alpha*StateTau)*(C3*torch.cos(beta*StateTau)+C4*torch.sin(beta*StateTau)) + L
		Dvr = -Cr*Kr*Tau_r*torch.exp(-InputTau/Tau_r)/PQR + torch.exp(alpha*InputTau)*((alpha*C1r+beta*C2r)*torch.cos(beta*InputTau) +
		                                       (alpha*C2r-beta*C1r)*torch.sin(beta*InputTau))
		Dvs = -Cs*Ks*Tau_s*torch.exp(-InputTau/Tau_s)/PQS + torch.exp(alpha*InputTau)*((alpha*C1s+beta*C2s)*torch.cos(beta*InputTau) +
		                                       (alpha*C2s-beta*C1s)*torch.sin(beta*InputTau))
		DvI = torch.exp(alpha*StateTau)*((alpha*C3 + beta*C4)*torch.cos(beta*StateTau) +
		                                 (alpha*C4 - beta*C3)*torch.sin(beta*StateTau))
	elif(deta == 0):  # repeated real root
		r = -0.5/Tau_n -0.5*G/Tau_m
		if(r.data[0] != nptr):
			Vr = Cr*Kr*Tau_r**2*torch.exp(-InputTau/Tau_r)/PQR
			C1r = -Cr*Kr*Tau_r**2/PQR
			C2r = Cr*Kr*Tau_r*(1+r*Tau_r)/PQR +Kr/Tau_m
			Vr = Vr + (C1r+C2r*InputTau)*torch.exp(r*InputTau)
			Dvr = -Cr*Kr*Tau_r*torch.exp(-InputTau/Tau_r)/PQR + (r*C1r + (r*InputTau + 1)*C2r)*torch.exp(r*InputTau)
		else:
			Vr = Cr*Kr*InputTau**2*torch.exp(-InputTau/Tau_r)/2
			C2r = Kr/Tau_m
			Vr = Vr + C2r*InputTau*torch.exp(r*InputTau)
			Dvr = Cr*Kr*InputTau*torch.exp(-InputTau/Tau_r) - Cr*Kr*InputTau ** 2*torch.exp(-InputTau/Tau_r)/(2*Tau_r)+\
			      (r*InputTau+1)*C2r*torch.exp(r*InputTau)
		if(r.data[0] != npts):
			Vs = Cs*Ks*Tau_s ** 2*torch.exp(-InputTau/Tau_s)/PQS
			C1s = -Cs*Ks*Tau_s**2/PQS
			C2s = Cs*Ks*Tau_s*(1+r*Tau_s)/PQS +Ks/Tau_m
			Vs = Vs + (C1s+C2s*InputTau)*torch.exp(r*InputTau)
			Dvs = -Cs*Ks*Tau_s*torch.exp(-InputTau/Tau_s)/PQS + (r*C1s + (r*InputTau + 1)*C2s)*torch.exp(r*InputTau)
		else:
			Vs = Cs*Ks*InputTau**2*torch.exp(-InputTau/Tau_s)/2
			C2s = Ks/Tau_m
			Vs = Vs + C2s*InputTau*torch.exp(r*InputTau)
			Dvs = Cs*Ks*InputTau*torch.exp(-InputTau/Tau_s) - Cs*Ks*InputTau ** 2*torch.exp(-InputTau/Tau_s)/(2*Tau_s) +\
			      (r*InputTau+1)*C2s*torch.exp(r*InputTau)
		# repeated-root homogeneous solution
		C3 = V0 - L
		C4 = Dv0 - S0/Tau_m - r*C3
		VI = (C3 + C4*StateTau)*torch.exp(r*StateTau) + L
		DvI = (r*C3 + C4 + r*C4*StateTau)*torch.exp(r*StateTau)
	V = torch.sum(Vr+Vs) + VI
	Dv = torch.sum(Dvr + Dvs) + DvI
	endTime = couple(V, endTime, -Dv.data[0])  # couple() is a helper defined elsewhere (not shown)
	theTime = 0*InputTau + endTime
	InputTau = couple(theTime,InputTau,1)
	StateTau = couple(endTime,StateTau,1)

	InputS = Kr*(torch.exp(-InputTau/Tau_r)-torch.exp(-InputTau/Tau_s)) + EffectiveMask*InputS*torch.exp(-InputTau/Tau_s) +\
	         (1-EffectiveMask)*InputS
	InputR = EffectiveMask*InputR*torch.exp(-InputTau/Tau_r) + (1-EffectiveMask)*InputR

	return V, endTime, Dv, InputR, InputS

Then I update the states one by one in the main loop:

for ii in range(StateUpdateNum):
	neuronInd = int(SrIndex[ii])
	StateTau = SrTime[ii] - Time[neuronInd]
	InpTau = SrTime[ii] - PreSynpticTimeList[neuronInd]
	InpTau = torch.min(InpTau, StateTau)
	endTime = Variable(torch.Tensor([SrTime[ii]]), requires_grad=True)
	theV, endTime, theDv, InputR, InputS = \
		neuronForward(S[neuronInd], R[neuronInd], InpTau, StateTau, endTime, V[neuronInd], Dv[neuronInd], theG,
					  Tau_m[neuronInd],
					  H[neuronInd], theTau_n, theI, Tau_r[neuronInd], Tau_s[neuronInd], theG_exc, theG_inh)

	Time[neuronInd] = endTime
	S[neuronInd] = InputS
	R[neuronInd] = InputR
	Dv = Dv.clone()
	Dv[neuronInd] = theDv
	V = V.clone()
	V[neuronInd] = theV.detach()
	State[neuronInd] = SrState[ii]
	if (SrState[ii] == 4):
		if (NetSpkTime is None):
			NetSpkTime = endTime.clone()
		else:
			NetSpkTime = torch.cat((NetSpkTime, endTime))
		for postInd in ForwardLink[neuronInd + InpNeuronNum]:
			if (len(PreSynpticTimeList[postInd]) == 0):
				PreSynpticTimeList[postInd] = preTime + DelaysMatrix[postInd, neuronInd + InpNeuronNum]
			else:
				PreSynpticTimeList[postInd] = torch.cat(
					(PreSynpticTimeList[postInd], preTime + DelaysMatrix[postInd, neuronInd + InpNeuronNum]))
			if len(S[postInd]) == 0:
				S[postInd] = Variable(torch.zeros(1), requires_grad=True).clone()
				R[postInd] = WeightsMatrix[postInd, neuronInd + InpNeuronNum].clone()
			else:
				S[postInd] = torch.cat((S[postInd], Variable(torch.zeros(1), requires_grad=True)))
				R[postInd] = torch.cat((R[postInd], WeightsMatrix[postInd, neuronInd + InpNeuronNum]))

I understand that my ‘network’ is too fragmented and really bad for vectorization, but the ‘network’ is so dynamic that I can’t know each state and the connectivity before the state update. Even so, it shouldn’t be that slow. Can I do something to speed it up? Thanks in advance. And by the way, I can’t find the module torch.autograd.profiler in my PyTorch installation (Linux, Python 2.7, latest conda version); do I need a separate installation?
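
To show what I mean by ‘fragmented’, here is a tiny made-up benchmark (not my real code): the same reduction computed as many one-element operations versus one vectorized operation. Every small op records its own node in the graph, so both the forward and the backward pass pay a fixed per-op overhead:

import time
import torch
from torch.autograd import Variable

# same sum of squares, as N one-element ops vs. one vectorized op
N = 1000
xs = [Variable(torch.randn(1), requires_grad=True) for _ in range(N)]
x = Variable(torch.randn(N), requires_grad=True)

t0 = time.time()
loss = sum((xi ** 2).sum() for xi in xs)  # N tiny graph nodes
loss.backward()
t1 = time.time()

loss2 = (x ** 2).sum()  # a single graph node
loss2.backward()
t2 = time.time()

print('scalar ops: %.4fs  vectorized: %.4fs' % (t1 - t0, t2 - t1))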


Your network is very unusual and interesting.
One thing that will help us understand the problem better is if you can do the following:

1. Install PyTorch from source. Instructions are here: https://github.com/pytorch/pytorch#from-source

2. Use the autograd profiler to take a profile summary: http://pytorch.org/docs/master/autograd.html#profiler
   (The autograd profiler is, for now, only available in the master branch, hence the need to install from source.)

3. Post the profile output here, and we will look further.
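
For reference, here is roughly how to invoke it once you have a source build (a minimal sketch; check the linked docs for the exact API in your build):

import torch
from torch.autograd import Variable, profiler

x = Variable(torch.randn(64), requires_grad=True)

# run the workload under the profiler, then print the per-op table
with profiler.profile() as prof:
    y = ((x * 2 + 1) ** 2).sum()
    y.backward()

print(prof)  # columns: op name, CPU time, CUDA time, calls, totals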

Thanks for the reply. I have tried the profiler and the graph visualization.
The profile of the full network is too long, so I ran a simple net with a single ‘neuron’ (but the output is still too long), and I uploaded an example computational graph instead. Maybe it’s just that I use too many operations, and the op-call overhead dominates. I thought PyTorch could apply optimizations such as op fusion and reuse of repeated subgraphs in the backward pass, but maybe I didn’t use it right.
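
If the raw trace is too long to post, aggregating it by op name makes it much shorter. I think newer profiler builds expose key_averages() for this (I’m not sure it exists in every build, so treat this as a sketch):

import torch
from torch.autograd import Variable, profiler

x = Variable(torch.randn(64), requires_grad=True)
with profiler.profile() as prof:
    ((x * 2 + 1) ** 2).sum().backward()

# one aggregated row per op name instead of one row per call
print(prof.key_averages())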

Here is a small part of the profile output:


Name CPU time CUDA time Calls CPU total CUDA total


Sub 29.851us 0.000us 1 29.000us 0.000us
MulConstant 9.183us 0.000us 1 9.000us 0.000us
Add 5.844us 0.000us 1 5.000us 0.000us
Sub 4.935us 0.000us 1 4.000us 0.000us
MulConstant 4.535us 0.000us 1 4.000us 0.000us
Add 4.226us 0.000us 1 4.000us 0.000us
Sub 4.302us 0.000us 1 4.000us 0.000us
MulConstant 4.089us 0.000us 1 4.000us 0.000us
Add 3.743us 0.000us 1 3.000us 0.000us
Sub 4.033us 0.000us 1 4.000us 0.000us
MulConstant 4.064us 0.000us 1 4.000us 0.000us
Add 3.895us 0.000us 1 3.000us 0.000us
SubConstant 5.761us 0.000us 1 5.000us 0.000us
SubConstant 4.067us 0.000us 1 4.000us 0.000us
SubConstant 3.921us 0.000us 1 3.000us 0.000us
SubConstant 3.819us 0.000us 1 3.000us 0.000us
SubConstant 3.670us 0.000us 1 3.000us 0.000us
SubConstant 5.695us 0.000us 1 5.000us 0.000us
SubConstant 3.909us 0.000us 1 3.000us 0.000us
SubConstant 3.478us 0.000us 1 3.000us 0.000us
SubConstant 3.537us 0.000us 1 3.000us 0.000us
SubConstant 4.440us 0.000us 1 4.000us 0.000us
Clone 8.353us 0.000us 1 8.000us 0.000us
Clone 2.541us 0.000us 1 2.000us 0.000us
Index 7.655us 0.000us 1 7.000us 0.000us
Index 5.591us 0.000us 1 5.000us 0.000us
Add 5.253us 0.000us 1 5.000us 0.000us
Clone 2.849us 0.000us 1 2.000us 0.000us
Index 5.130us 0.000us 1 5.000us 0.000us
Clone 2.460us 0.000us 1 2.000us 0.000us
Index 4.396us 0.000us 1 4.000us 0.000us
Index 4.865us 0.000us 1 4.000us 0.000us
Add 4.246us 0.000us 1 4.000us 0.000us
Concat 10.516us 0.000us 1 10.000us 0.000us
Concat 4.911us 0.000us 1 4.000us 0.000us
Index 7.460us 0.000us 1 7.000us 0.000us
Concat 4.461us 0.000us 1 4.000us 0.000us
Index 4.265us 0.000us 1 4.000us 0.000us
Index 4.793us 0.000us 1 4.000us 0.000us
Add 5.977us 0.000us 1 5.000us 0.000us
Concat 5.696us 0.000us 1 5.000us 0.000us
Concat 4.192us 0.000us 1 4.000us 0.000us
Index 4.850us 0.000us 1 4.000us 0.000us
Concat 4.051us 0.000us 1 4.000us 0.000us
Index 4.810us 0.000us 1 4.000us 0.000us
SubConstant 6.971us 0.000us 1 6.000us 0.000us
SubConstant 4.399us 0.000us 1 4.000us 0.000us
Cmin 30.417us 0.000us 1 30.000us 0.000us
Index 5.044us 0.000us 1 5.000us 0.000us
MulConstant 6.544us 0.000us 1 6.000us 0.000us
Index 4.558us 0.000us 1 4.000us 0.000us
MulConstant 4.072us 0.000us 1 4.000us 0.000us
Add 4.428us 0.000us 1 4.000us 0.000us
Index 4.108us 0.000us 1 4.000us 0.000us
MulConstant 3.575us 0.000us 1 3.000us 0.000us
Add 3.925us 0.000us 1 3.000us 0.000us
Index 4.080us 0.000us 1 4.000us 0.000us
MulConstant 3.702us 0.000us 1 3.000us 0.000us
Add 3.821us 0.000us 1 3.000us 0.000us
Index 4.531us 0.000us 1 4.000us 0.000us
MulConstant 3.560us 0.000us 1 3.000us 0.000us
Add 3.762us 0.000us 1 3.000us 0.000us
Index 5.589us 0.000us 1 5.000us 0.000us
MulConstant 4.795us 0.000us 1 4.000us 0.000us
Index 4.479us 0.000us 1 4.000us 0.000us
MulConstant 3.612us 0.000us 1 3.000us 0.000us
Add 3.786us 0.000us 1 3.000us 0.000us
Index 4.004us 0.000us 1 4.000us 0.000us
MulConstant 3.699us 0.000us 1 3.000us 0.000us
Add 4.701us 0.000us 1 4.000us 0.000us
Index 4.231us 0.000us 1 4.000us 0.000us
MulConstant 3.663us 0.000us 1 3.000us 0.000us
Add 4.016us 0.000us 1 4.000us 0.000us
Index 4.205us 0.000us 1 4.000us 0.000us
MulConstant 3.606us 0.000us 1 3.000us 0.000us
Add 3.613us 0.000us 1 3.000us 0.000us
Index 4.348us 0.000us 1 4.000us 0.000us
MulConstant 4.694us 0.000us 1 4.000us 0.000us
Index 4.649us 0.000us 1 4.000us 0.000us
MulConstant 3.426us 0.000us 1 3.000us 0.000us
Add 3.798us 0.000us 1 3.000us 0.000us
Index 4.251us 0.000us 1 4.000us 0.000us
MulConstant 3.526us 0.000us 1 3.000us 0.000us
Add 164.220us 0.000us 1 164.000us 0.000us
Index 6.363us 0.000us 1 6.000us 0.000us
MulConstant 4.815us 0.000us 1 4.000us 0.000us
Add 4.333us 0.000us 1 4.000us 0.000us
Index 4.850us 0.000us 1 4.000us 0.000us
MulConstant 3.650us 0.000us 1 3.000us 0.000us
Add 3.837us 0.000us 1 3.000us 0.000us
Index 4.279us 0.000us 1 4.000us 0.000us
MulConstant 3.693us 0.000us 1 3.000us 0.000us
Index 4.209us 0.000us 1 4.000us 0.000us
MulConstant 3.543us 0.000us 1 3.000us 0.000us
Add 3.815us 0.000us 1 3.000us 0.000us
Index 4.101us 0.000us 1 4.000us 0.000us
MulConstant 3.645us 0.000us 1 3.000us 0.000us
Add 3.707us 0.000us 1 3.000us 0.000us
Index 4.173us 0.000us 1 4.000us 0.000us
MulConstant 3.631us 0.000us 1 3.000us 0.000us
Add 3.760us 0.000us 1 3.000us 0.000us
Index 4.318us 0.000us 1 4.000us 0.000us
MulConstant 3.546us 0.000us 1 3.000us 0.000us
Add 3.683us 0.000us 1 3.000us 0.000us
Index 4.085us 0.000us 1 4.000us 0.000us
MulConstant 3.684us 0.000us 1 3.000us 0.000us
Index 4.113us 0.000us 1 4.000us 0.000us
MulConstant 3.627us 0.000us 1 3.000us 0.000us
Add 3.818us 0.000us 1 3.000us 0.000us
Index 4.142us 0.000us 1 4.000us 0.000us
MulConstant 3.578us 0.000us 1 3.000us 0.000us
Add 3.733us 0.000us 1 3.000us 0.000us
Index 4.053us 0.000us 1 4.000us 0.000us
MulConstant 3.678us 0.000us 1 3.000us 0.000us
Add 3.715us 0.000us 1 3.000us 0.000us
Index 3.936us 0.000us 1 3.000us 0.000us
MulConstant 3.695us 0.000us 1 3.000us 0.000us
Add 3.708us 0.000us 1 3.000us 0.000us
Index 4.379us 0.000us 1 4.000us 0.000us
Index 5.327us 0.000us 1 5.000us 0.000us
Index 4.225us 0.000us 1 4.000us 0.000us
Index 4.189us 0.000us 1 4.000us 0.000us
Index 4.097us 0.000us 1 4.000us 0.000us
Index 4.061us 0.000us 1 4.000us 0.000us
Mul 6.801us 0.000us 1 6.000us 0.000us
Add 3.770us 0.000us 1 3.000us 0.000us
PowConstant 7.036us 0.000us 1 7.000us 0.000us
MulConstant 4.033us 0.000us 1 4.000us 0.000us
Mul 4.541us 0.000us 1 4.000us 0.000us
Add 3.642us 0.000us 1 3.000us 0.000us
Mul 3.976us 0.000us 1 3.000us 0.000us
Sub 4.136us 0.000us 1 4.000us 0.000us
DivConstant 7.525us 0.000us 1 7.000us 0.000us
DivConstant 5.071us 0.000us 1 5.000us 0.000us
Div 5.913us 0.000us 1 5.000us 0.000us
DivConstant 6.243us 0.000us 1 6.000us 0.000us
Add 3.929us 0.000us 1 3.000us 0.000us
Add 3.748us 0.000us 1 3.000us 0.000us
Mul 4.288us 0.000us 1 4.000us 0.000us
Div 4.352us 0.000us 1 4.000us 0.000us
Add 3.896us 0.000us 1 3.000us 0.000us
Div 4.120us 0.000us 1 4.000us 0.000us
Mul 4.095us 0.000us 1 4.000us 0.000us
DivConstant 5.071us 0.000us 1 5.000us 0.000us
Mul 7.763us 0.000us 1 7.000us 0.000us
DivConstant 4.915us 0.000us 1 4.000us 0.000us
Sub 3.874us 0.000us 1 3.000us 0.000us
Mul 4.318us 0.000us 1 4.000us 0.000us
DivConstant 4.795us 0.000us 1 4.000us 0.000us
Mul 4.951us 0.000us 1 4.000us 0.000us
DivConstant 5.476us 0.000us 1 5.000us 0.000us
Sub 3.845us 0.000us 1 3.000us 0.000us
Mul 4.227us 0.000us 1 4.000us 0.000us
SubConstant 4.658us 0.000us 1 4.000us 0.000us
PowConstant 4.957us 0.000us 1 4.000us 0.000us
Mul 4.069us 0.000us 1 4.000us 0.000us
Add 3.967us 0.000us 1 3.000us 0.000us
Mul 3.851us 0.000us 1 3.000us 0.000us
SubConstant 3.864us 0.000us 1 3.000us 0.000us
PowConstant 4.268us 0.000us 1 4.000us 0.000us
Mul 3.784us 0.000us 1 3.000us 0.000us
Add 3.587us 0.000us 1 3.000us 0.000us
Eq 14.635us 0.000us 1 14.000us 0.000us
Type 11.656us 0.000us 1 11.000us 0.000us
NoGrad 6.236us 0.000us 1 6.000us 0.000us
Gt 6.841us 0.000us 1 6.000us 0.000us
Type 7.187us 0.000us 1 7.000us 0.000us
NoGrad 3.962us 0.000us 1 3.000us 0.000us
Gt 5.028us 0.000us 1 5.000us 0.000us
Type 6.152us 0.000us 1 6.000us 0.000us
NoGrad 3.362us 0.000us 1 3.000us 0.000us
Lt 5.453us 0.000us 1 5.000us 0.000us
Type 5.978us 0.000us 1 5.000us 0.000us
NoGrad 3.166us 0.000us 1 3.000us 0.000us
Mul 4.906us 0.000us 1 4.000us 0.000us
Sum 12.540us 0.000us 1 12.000us 0.000us
Mul 6.646us 0.000us 1 6.000us 0.000us
Mul 4.944us 0.000us 1 4.000us 0.000us
Mul 4.456us 0.000us 1 4.000us 0.000us
Sub 4.104us 0.000us 1 4.000us 0.000us
Div 5.229us 0.000us 1 5.000us 0.000us
Mul 5.140us 0.000us 1 5.000us 0.000us
Mul 4.564us 0.000us 1 4.000us 0.000us
Mul 5.240us 0.000us 1 5.000us 0.000us
Sub 3.861us 0.000us 1 3.000us 0.000us

Nice. Can you give the full profile output? Maybe upload it to https://gist.github.com/ and post a link here?

This is the output of a single-node run. Now I’m thinking of cutting off all the connections, running all the nodes at the same time, and then linking the connections back… I have no idea yet how to link the connections back…
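
A rough sketch of that batching idea (all names, shapes, and the decay update here are hypothetical placeholders): keep the per-node state in flat tensors, update every node that is due in one vectorized step, and scatter the results back by index:

import torch
from torch.autograd import Variable

# hypothetical flat per-node state (shapes made up)
num_nodes = 8
V = Variable(torch.zeros(num_nodes), requires_grad=True).clone()
Tau = Variable(torch.rand(num_nodes) + 0.5, requires_grad=True)

active = torch.LongTensor([1, 3, 5])  # nodes due for an update
dt = 0.1

# one batched decay step instead of a Python loop over nodes
V_new = V[active] * torch.exp(-dt / Tau[active])

# scatter back by index; clone first so the in-place write does not
# invalidate tensors the graph still needs
V = V.clone()
V[active] = V_new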