nn.Embedding weight normalization error in CUDA?

Hi, it seems that the embedding weight renormalized on CPU and on GPU gives different results:

import torch
import numpy as np

np.random.seed(1)

# initialize input and embedding weight
max_norm = 1
x_np = np.random.randint(0, 1000, (128, 20))
weight_np = np.random.randn(1000, 100).astype(np.float32)

# cpu
m_cpu = torch.nn.Embedding(1000, 100, max_norm=max_norm)
m_cpu.weight.data = torch.tensor(weight_np)
y_cpu = m_cpu(torch.tensor(x_np))

# gpu
m_cuda = torch.nn.Embedding(1000, 100, max_norm=max_norm).cuda()
m_cuda.weight.data = torch.tensor(weight_np).cuda()
y_cuda = m_cuda(torch.tensor(x_np).cuda())

# normalize weight manually
indices = np.unique(x_np)
norm = np.linalg.norm(weight_np[indices], ord=2, axis=-1, keepdims=True)  # norm_type=2 (Euclidean norm) is the default of nn.Embedding
rescale = (norm > max_norm)[:, 0]
selection = np.zeros(weight_np.shape[0], dtype=bool)
selection[indices] = rescale
weight_np[selection] *= max_norm / (norm[rescale] + 1e-7)

# max diff
print(torch.abs(m_cpu.weight - m_cuda.weight.cpu()).max())  # max diff between weight renormalized in gpu and cpu
print(torch.abs(m_cpu.weight - torch.tensor(weight_np)).max())  # weight renormalized in cpu is correct

The output is:

tensor(0.3130, grad_fn=<MaxBackward1>)
tensor(8.9407e-08, grad_fn=<MaxBackward1>)
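
One way to narrow this down might be to check whether the rows that actually appear in x end up with norm <= max_norm on both devices, since max_norm only renormalizes rows touched during the forward pass. Just a quick sanity-check sketch (the 1e-6 tolerance is arbitrary):

idx = torch.tensor(np.unique(x_np))                          # rows fetched during the lookup
cpu_norms = m_cpu.weight.detach()[idx].norm(dim=-1)          # row norms after the CPU forward
cuda_norms = m_cuda.weight.detach().cpu()[idx].norm(dim=-1)  # row norms after the GPU forward
print((cpu_norms <= max_norm + 1e-6).all())   # CPU rows should be clipped to max_norm
print((cuda_norms <= max_norm + 1e-6).all())  # do the GPU rows respect max_norm as well?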

Am I doing something wrong? Thanks!

Manipulating the .data attribute is deprecated and can yield unwanted side effects.
Copy the values into the weight parameter (under torch.no_grad()) and load the state_dict instead:

import torch
import numpy as np

np.random.seed(1)

# initialize input and embedding weight
max_norm = 1
x_np = np.random.randint(0, 1000, (128, 20))
weight_np = np.random.randn(1000, 100).astype(np.float32)

# cpu
m_cpu = torch.nn.Embedding(1000, 100, max_norm=max_norm)
with torch.no_grad():
    m_cpu.weight.copy_(torch.tensor(weight_np))
y_cpu = m_cpu(torch.tensor(x_np))

# gpu
m_cuda = torch.nn.Embedding(1000, 100, max_norm=max_norm)  # .cuda() is called below, after loading the state_dict
m_cuda.load_state_dict(m_cpu.state_dict())
m_cuda.cuda()
y_cuda = m_cuda(torch.tensor(x_np).cuda())

# normalize weight manually
indices = np.unique(x_np)
norm = np.linalg.norm(weight_np[indices], ord=2, axis=-1, keepdims=True)  # norm_type=2 (Euclidean norm) is the default of nn.Embedding
rescale = (norm > max_norm)[:, 0]
selection = np.zeros(weight_np.shape[0], dtype=bool)
selection[indices] = rescale
weight_np[selection] *= max_norm / (norm[rescale] + 1e-7)

# max diff
print(torch.abs(m_cpu.weight - m_cuda.weight.cpu()).max())  # max diff between weight renormalized in gpu and cpu
print(torch.abs(m_cpu.weight - torch.tensor(weight_np)).max())  # weight renormalized on CPU still matches the manual computation

Output:

tensor(2.3842e-07, grad_fn=<MaxBackward1>)
tensor(8.9407e-08, grad_fn=<MaxBackward1>)

Hi @ptrblck, suppose I only have one instance of nn.Embedding, i.e. I don't load the state_dict from another instance:

import torch
import numpy as np

np.random.seed(1)

# initialize input and embedding weight
max_norm = 1
x_np = np.random.randint(0, 1000, (128, 20))
weight_np = np.random.randn(1000, 100).astype(np.float32)

# gpu
m = torch.nn.Embedding(1000, 100, max_norm=max_norm)
## replace weight
????????????????????
##
m_cuda = m.cuda()
y_cuda = m_cuda(torch.tensor(x_np).cuda())


# normalize weight manually (reference result)
indices = np.unique(x_np)
norm = np.linalg.norm(weight_np[indices], ord=2, axis=-1, keepdims=True)
rescale = (norm > max_norm)[:, 0]
selection = np.zeros(weight_np.shape[0], dtype=bool)
selection[indices] = rescale
weight_np[selection] *= max_norm / (norm[rescale] + 1e-7)

# max diff
print(torch.abs(torch.tensor(weight_np) - m_cuda.weight.cpu()).max())

I’ve tried to replace the weight with

with torch.no_grad():
    m.weight.copy_(torch.tensor(weight_np))

but the result is wrong, just as before.
Are you suggesting that, to replace the weight, I need to create another nn.Embedding instance, replace its weight, and then load its state_dict?
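
Or, to stay with a single instance, would loading the values through load_state_dict on the same module be what you mean? Something like this (just a sketch; I haven't confirmed it changes the result):

m = torch.nn.Embedding(1000, 100, max_norm=max_norm)
m.load_state_dict({'weight': torch.tensor(weight_np)})  # nn.Embedding's only parameter is named 'weight'
m_cuda = m.cuda()
y_cuda = m_cuda(torch.tensor(x_np).cuda())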

Thanks for your help!