I recently noticed that `nn.CrossEntropyLoss` does not back-propagate correctly on the GPU in some cases when the supplied labels are a slice (a non-contiguous view) of a larger tensor. It seems that the GPU implementation assumes the label tensor I provide is contiguous in memory (unit stride). Here’s a minimal example:
import torch
from torch import nn
class BasicNetwork(nn.Module):
    """Minimal network: flatten each sample, then apply one linear layer."""

    def __init__(self, linear_n_feats: int, n_outputs: int):
        """Build the flatten + linear stack.

        Args:
            linear_n_feats: Number of input features of the linear layer,
                i.e. the size of one sample after flattening.
            n_outputs: Number of output features of the linear layer.
        """
        super().__init__()
        self.network = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=linear_n_feats, out_features=n_outputs),
        )

    def forward(self, X):
        """Return the result of passing ``X`` through ``self.network``."""
        return self.network(X)
# Prefer the GPU when one is available: the behaviour reported in this post
# is only observed with device == "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

batch_size = 64
in_channels = 3
temporal_depth = 128

model = BasicNetwork(linear_n_feats=in_channels * temporal_depth, n_outputs=7)
model = model.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

losses = []
for batch_idx in range(300):
    # random input
    X = torch.rand((batch_size, in_channels, temporal_depth), device=device)
    # fixed label: each row is [0, 1, ..., 6], so y has shape (batch_size, 7)
    y = torch.arange(7, dtype=torch.int64, device=device)
    y = y.repeat(batch_size, 1)
    pred = model(X)
    # first column of y is the classification label
    # NOTE: y[:, 0] is a strided (non-contiguous) view into y. It is left
    # as-is on purpose to reproduce the issue; appending .contiguous() or
    # .clone() is the workaround.
    curr_y = y[:, 0]  # .contiguous()  # .clone()
    loss = loss_fn(pred, curr_y)
    # back-propagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    # report the mean loss over the last 10 iterations
    if (batch_idx + 1) % 10 == 0:
        avg_loss = sum(losses[-10:]) / 10
        print(f"{batch_idx+1:>8,d}: Loss = {avg_loss:.5f}")
You can see that `y` is a (64, 7) tensor, and the label I actually use is its first column (`y[:, 0]`). If you run it, you will see that the loss barely goes down. But as soon as you either call `.contiguous()` or `.clone()` on the slice, or just set `device = "cpu"`, the loss starts improving.
I haven’t experimented with other losses or with slices of the prediction tensor, so I can’t say whether all GPU loss implementations expect contiguous tensors.