Hi,
My environment is: an Ubuntu 16.04 docker container, Anaconda Python 3.6.9, and PyTorch 1.3.1. A simplified version of my code is below; it is an implementation of nn.CrossEntropyLoss written as a CUDA extension:
import torch
import torch.nn as nn
import torchvision

import fun_cpp


class CrossEntropyFunctionV2(torch.autograd.Function):

    @staticmethod
    def forward(ctx, logits, labels, ignore_index):
        losses = fun_cpp.lsr_forward(logits, labels, ignore_index)
        ctx.variables = logits, labels, ignore_index
        return losses

    @staticmethod
    def backward(ctx, grad_output):
        logits, labels, ignore_index = ctx.variables
        grad = fun_cpp.lsr_backward(grad_output, logits, labels, ignore_index)
        print('grad2: ', grad[0, :, 0, 0])
        return grad, None, None
class CrossEntropyLossV2(nn.Module):

    def __init__(self, reduction='mean', ignore_index=-100):
        super(CrossEntropyLossV2, self).__init__()
        self.reduction = reduction
        self.ignore_index = ignore_index

    def forward(self, logits, labels):
        losses = CrossEntropyFunctionV2.apply(
            logits, labels, self.ignore_index)
        # losses = losses[labels != self.ignore_index]
        if self.reduction == 'sum':
            losses = losses.sum()
        elif self.reduction == 'mean':
            losses = losses.mean()
        return losses
class Model(nn.Module):

    def __init__(self):
        super(Model, self).__init__()
        net = torchvision.models.resnet18(pretrained=False)
        self.conv1 = net.conv1
        self.bn1 = net.bn1
        self.maxpool = net.maxpool
        self.relu = net.relu
        self.layer1 = net.layer1
        self.layer2 = net.layer2
        self.layer3 = net.layer3
        self.layer4 = net.layer4
        self.out = nn.Conv2d(512, 3, 3, 1, 1)

    def forward(self, x):
        feat1 = self.conv1(x)
        feat2 = self.bn1(feat1)
        feat3 = self.relu(feat2)
        # feat4 = self.maxpool(feat3)
        feat5 = self.layer1(feat3)
        feat6 = self.layer2(feat5)
        feat7 = self.layer3(feat6)
        feat8 = self.layer4(feat7)
        feat9 = self.out(feat8)
        feat7.retain_grad()
        feat7.register_hook(lambda grad: grad * 1000)
        return feat9, feat7
net1 = Model()
net2 = Model()
from copy import deepcopy
net2.load_state_dict(deepcopy(net1.state_dict()))

# criteria1 = CrossEntropyLossV1(reduction='mean', ignore_index=255)
criteria1 = CrossEntropyLossV2(reduction='mean', ignore_index=255)
criteria2 = nn.CrossEntropyLoss(reduction='mean', ignore_index=255)

net1.cuda()
net2.cuda()
net1.train()
net2.train()
criteria1.cuda()
criteria2.cuda()

optim1 = torch.optim.SGD(net1.parameters(), lr=1e-2)
optim2 = torch.optim.SGD(net2.parameters(), lr=1e-2)

bs = 32
for it in range(10):
    inten = torch.randn(bs, 3, 256, 256).cuda()
    lbs = torch.randint(0, 3, (bs, 16, 16)).cuda()
    # net2.load_state_dict(deepcopy(net1.state_dict()))

    optim1.zero_grad()
    logits, feat = net1(inten.clone())
    loss1 = criteria1(logits, lbs.clone())
    loss1.backward()
    print('feat.grad1', feat.grad[0, :4, 0, 0])
    optim1.step()

    logits, feat = net2(inten.clone())
    optim2.zero_grad()
    loss2 = criteria2(logits, lbs.clone())
    loss2.backward()
    print('feat.grad2', feat.grad[0, :4, 0, 0])
    optim2.step()

    print(loss1.item() - loss2.item())
    print()
And the CUDA implementation is as follows:
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THC.h>
#include <THC/THCAtomics.cuh>
#include <THC/THCDeviceUtils.cuh>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cfloat>
#include <iostream>
using std::cout;
using std::endl;
#define BLOCKSIZE 512
// kernel functions for forward and backward
// forward: each block handles one (sample, spatial) position; thread 0 writes
// the negative log-softmax score of the target class (or 0 for ignored positions)
template<typename scalar_t>
__global__ void LSRLossForward(const int n_size,
                               const int dimsize, const int m_size,
                               const scalar_t *log_scores,
                               const int64_t *labels,
                               scalar_t *losses,
                               const int64_t ignore_index) {
    // shared memory
    __shared__ scalar_t sdata[BLOCKSIZE + 2];

    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int samplesize = n_size * m_size;

    for (int i{bid}; i < samplesize; i += gridDim.x) {
        int n_idx = i / m_size;
        int m_idx = i % m_size;
        int64_t lb = labels[i];
        if (lb == ignore_index) {
            if (tid == 0) losses[i] = 0;
            continue;
        }
        int idx = n_idx * dimsize * m_size + lb * m_size + m_idx;
        if (tid == 0) losses[i] = -log_scores[idx];
    }
}
// backward: the gradient w.r.t. a logit is (softmax - 1[class == label])
// scaled by the upstream gradient, and 0 for ignored positions
template<typename scalar_t>
__global__ void LSRLossBackward(const int n_size,
                                const int dimsize, const int m_size,
                                const scalar_t *grad,
                                scalar_t *grad_logits,
                                const scalar_t *scores,
                                const int64_t *labels,
                                const int64_t ignore_index) {
    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int samplesize = n_size * m_size;

    for (int i{bid}; i < samplesize; i += gridDim.x) {
        int n_idx = i / m_size;
        int m_idx = i % m_size;
        int64_t lb{labels[i]};
        for (int j{tid}; j < dimsize; j += blockDim.x) {
            int idx = n_idx * dimsize * m_size + j * m_size + m_idx;
            scalar_t gradval = 0;
            if (lb != ignore_index) {
                gradval = scores[idx];
                if (j == lb) {
                    gradval -= 1.;
                }
            }
            grad_logits[idx] = gradval * grad[i];
        }
    }
}
// cuda forward and backward
at::Tensor LSR_forward_cuda(const at::Tensor &logits,
                            const at::Tensor &labels,
                            const int64_t ignore_index) {
    // CHECK type and shape
    AT_ASSERTM(logits.type().is_cuda(), "logits should be cuda");
    AT_ASSERTM(labels.type().is_cuda(), "labels should be cuda");

    const int n_size = logits.size(0);
    const int dimsize = logits.size(1);
    const int m_size = logits.numel() / (n_size * dimsize);
    const int samplesize = labels.numel();

    // allocate memory and cuda grid/block
    auto losses = torch::zeros_like(labels, logits.options());
    auto log_scores = torch::log_softmax(logits, 1);
    dim3 grid1(std::min(samplesize, (int)4096));
    dim3 block1(std::min(dimsize, (int)BLOCKSIZE));
    if (losses.numel() == 0) {
        THCudaCheck(cudaGetLastError());
        return losses;
    }

    // call kernel
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(losses.scalar_type(), "lsr forward", [&] {
        int shm_size = BLOCKSIZE * sizeof(scalar_t) * 2;
        LSRLossForward<scalar_t><<<grid1, block1, shm_size, at::cuda::getCurrentCUDAStream()>>>(
            n_size, dimsize, m_size,
            log_scores.contiguous().data<scalar_t>(),
            labels.contiguous().data<int64_t>(),
            losses.contiguous().data<scalar_t>(),
            ignore_index
        );
    });
    THCudaCheck(cudaGetLastError());

    return losses;
}
at::Tensor LSR_backward_cuda(const at::Tensor &grad,
                             const at::Tensor &logits,
                             const at::Tensor &labels,
                             const int64_t ignore_index) {
    // CHECK type and shape
    AT_ASSERTM(grad.type().is_cuda(), "grad should be cuda");
    AT_ASSERTM(logits.type().is_cuda(), "logits should be cuda");
    AT_ASSERTM(labels.type().is_cuda(), "labels should be cuda");

    const int n_size = logits.size(0);
    const int dimsize = logits.size(1);
    const int m_size = logits.numel() / (n_size * dimsize);
    const int samplesize = labels.numel();

    // allocate memory and cuda grid/block
    auto grad_logits = torch::empty_like(logits);
    auto scores = torch::softmax(logits, 1);
    dim3 grid(std::min(samplesize, (int)4096));
    dim3 block(std::min(dimsize, (int)BLOCKSIZE));
    if (grad_logits.numel() == 0) {
        THCudaCheck(cudaGetLastError());
        return grad_logits;
    }

    // call kernel
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad_logits.scalar_type(), "lsr backward", [&] {
        int shm_size = BLOCKSIZE * sizeof(scalar_t) * 2;
        LSRLossBackward<scalar_t><<<grid, block, shm_size, at::cuda::getCurrentCUDAStream()>>>(
            n_size, dimsize, m_size,
            grad.contiguous().data<scalar_t>(),
            grad_logits.contiguous().data<scalar_t>(),
            scores.contiguous().data<scalar_t>(),
            labels.contiguous().data<int64_t>(),
            ignore_index
        );
    });
    THCudaCheck(cudaGetLastError());

    return grad_logits;
}
// python interface
at::Tensor LSR_forward(const at::Tensor &logits,
                       const at::Tensor &labels,
                       const int64_t ignore_index) {
    if (!(logits.type().is_cuda() && labels.type().is_cuda())) {
        AT_ERROR("this LSR loss only supports gpu mode\n");
    }
    at::DeviceGuard guard(logits.device());
    return LSR_forward_cuda(logits, labels, ignore_index);
}

at::Tensor LSR_backward(const at::Tensor &grad,
                        const at::Tensor &logits,
                        const at::Tensor &labels,
                        const int64_t ignore_index) {
    // TODO: try AT_ASSERTM
    if (!(logits.type().is_cuda() && labels.type().is_cuda())) {
        AT_ERROR("this LSR loss only supports gpu mode\n");
    }
    at::DeviceGuard guard(logits.device());
    return LSR_backward_cuda(grad, logits, labels, ignore_index);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("lsr_forward", &LSR_forward, "lsr forward");
    m.def("lsr_backward", &LSR_backward, "lsr backward");
}
My problem is that the difference in gradients and loss between the CUDA extension implementation and nn.CrossEntropyLoss becomes too large after about 10 iterations, even though in the first iteration the gradients of the CUDA implementation are identical to those of nn.CrossEntropyLoss. How could I solve this problem, please?
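For reference, a self-contained way to compare just the loss layers, with no network or optimizer involved, would be something like the sketch below (it feeds identical logits to both losses and compares the values and input gradients directly; CrossEntropyLossV2 is the module defined above):

import torch
import torch.nn as nn

# standalone check: same logits/labels into both losses, compare outputs and grads
torch.manual_seed(0)
logits1 = torch.randn(4, 3, 16, 16, device='cuda', requires_grad=True)
logits2 = logits1.detach().clone().requires_grad_(True)
labels = torch.randint(0, 3, (4, 16, 16), device='cuda')

crit1 = CrossEntropyLossV2(reduction='mean', ignore_index=255)
crit2 = nn.CrossEntropyLoss(reduction='mean', ignore_index=255)

loss1 = crit1(logits1, labels)
loss2 = crit2(logits2, labels)
loss1.backward()
loss2.backward()

print('loss diff    :', (loss1 - loss2).abs().item())
print('max grad diff:', (logits1.grad - logits2.grad).abs().max().item())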