Hello all,
I’m trying to implement a CUDA extension to do nearest neighborhood search. The forward result is tested to be right. When i do gradcheck, it raise the Jacobian mismatch error.I learned on the forum that change the eps can fix that, so i change the eps from 1e-1 to 1e-7 but it always raise the Jacobian mismatch error whatever the eps is.I wonder if there is something wrong with my cuda code, but i just can not find it .Please help me to see if it is a code preblem or a precision problem of the gradcheck
The error is like this:
Traceback (most recent call last):
File "test_nnd.py", line 21, in <module>
print(torch.autograd.gradcheck(dist.double(), (data1.double(), data2.double()), eps=1e-6))
File "/home/Gilgamesh/anaconda3/envs/pytorch1.3/lib/python3.7/site-packages/torch/autograd/gradcheck.py", line 289, in gradcheck
'numerical:%s\nanalytical:%s\n' % (i, j, n, a))
File "/home/Gilgamesh/anaconda3/envs/pytorch1.3/lib/python3.7/site-packages/torch/autograd/gradcheck.py", line 227, in fail_test
raise RuntimeError(msg)
RuntimeError: Jacobian mismatch for output 0 with respect to input 0,
numerical:tensor([[ 0.1937, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, -0.3576, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.2347, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0447, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.5029, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.4172, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, -0.7376, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.4731, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.6407, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.2049, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[-0.6109, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, -0.5588, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, -0.4247, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, -0.9388, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, -0.4619, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, -0.2682, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, -0.4619, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, -1.0729,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
-0.1267, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.1173],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.9835, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.4992, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.7004,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.2980, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, -0.1118],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, -0.2682, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, -0.0745, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, -0.3725,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
-0.4619, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, -0.3725]], dtype=torch.float64)
analytical:tensor([[ 0.7765, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 1.9917, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 1.6396, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.8206, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[-2.4007, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, -1.8262, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, -1.0693, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.4617],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 3.8508, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, -0.4484],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, -1.0813, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, -1.4701]], dtype=torch.float64)
Here is my cuda code:
#include <stdio.h>
#include <vector>
#include <math.h>
#include "../cuda_utils.cuh"
//[b,c,n], [b,c,m], float[b,n], int[b,n]
__global__ void NmDistanceKernel(int b, int c, int n, int m,
const float *__restrict__ xyz1,const float *__restrict__ xyz2,
float *__restrict__ result,int *__restrict__ result_i){
int batchsize = blockIdx.x;
int index = threadIdx.x;
int stride = blockDim.x;
xyz1 += batchsize*c*n;
xyz2 += batchsize*c*m;
result += batchsize*n;
result_i += batchsize*n;
for(int i=index;i<n;i+=stride)
{
float min_d = -1;
int min_i;
for(int j=0;j<m;j++)
{
float d = 0;
for(int k=0;k<c;k++)
{
d += (xyz1[i + k*n] - xyz2[j + k*m]) * (xyz1[i + k*n] - xyz2[j + k*m]);
}
//float d = sqrt(s_d)
if(min_d == -1 || d < min_d)
{
min_d = d;
min_i = j;
//printf("(%d)",j);
}
}
//printf("(%d)",min_i);
result[i] = min_d;
result_i[i] = min_i;
//printf("%d",min_i);
}
}
//[b,c,n],[b,c,m],float[b,n] int[b,n]
void NmDistance(int b, int c, int n, int m, const float *xyz1_data, const float *xyz2_data,
float *dist1_data, float *dist2_data,
int *idx1_data, int *idx2_data){
NmDistanceKernel<<<b,optimal_num_threads(n)>>>(b,c,n,m,xyz1_data,xyz2_data,dist1_data,idx1_data);
NmDistanceKernel<<<b,optimal_num_threads(m)>>>(b,c,m,n,xyz2_data,xyz1_data,dist2_data,idx2_data);
CUDA_CHECK_ERRORS();
}
//[b,c,n],[b,c,m],float[b,n] int[b,n]
__global__ void NmDistanceGradKernel(int b,int c,int n,int m,
const float *__restrict__ xyz1,const float *__restrict__ xyz2,
const float *__restrict__ grad_dist1,const int *__restrict__ idx1,
float *__restrict__ grad_xyz1,float *__restrict__ grad_xyz2){
int batchsize = blockIdx.x;
int index = threadIdx.x;
int stride = blockDim.x;
xyz1 += batchsize*c*n;
xyz2 += batchsize*c*m;
grad_xyz1 += batchsize*c*n;
grad_xyz2 += batchsize*c*m;
grad_dist1 += batchsize*n;
idx1 += batchsize*n;
for (int i=0;i<n;i+=stride){
float g = grad_dist1[i]*2;
int id = idx1[i];
for (int k=0;k<c;k++)
{
atomicAdd(grad_xyz1 + i + k*n, g*(xyz1[i + k*n]-xyz2[id + k*m]));
atomicAdd(grad_xyz2 + id + k*m, -(g*(xyz1[i + k*n]-xyz2[id + k*m])));
}
}
}
//[b,c,n],[b,c,m],float[b,n] int[b,n]
void NmDistanceGrad(int b,int c,int n,int m,
const float *xyz1_data,const float *xyz2_data,
float *gradxyz1_data,float *gradxyz2_data,
const float *graddist1_data,const float *graddist2_data,
const int *idx1_data,const int *idx2_data){
NmDistanceGradKernel<<<b,optimal_num_threads(n)>>>(b,c,n,m,xyz1_data,xyz2_data,graddist1_data,idx1_data,gradxyz1_data,gradxyz2_data);
NmDistanceGradKernel<<<b,optimal_num_threads(m)>>>(b,c,m,n,xyz2_data,xyz1_data,graddist2_data,idx2_data,gradxyz2_data,gradxyz1_data);
CUDA_CHECK_ERRORS();
}