Hi,

I see in the PyTorch 1.6 release notes that torch.cuda.amp should now support sparse gradients. Does that mean we can now use torch.nn.Embedding with sparse=True?

If so, how should the model be trained? I tried to follow the example from the PyTorch docs but got this error message:

RuntimeError: Could not run 'aten::_amp_non_finite_check_and_unscale_' with arguments from the 'SparseCUDA' backend. 'aten::_amp_non_finite_check_and_unscale_' is only available for these backends: [CUDA, Autograd, Profiler, Tracer].

I was using SparseAdam for the embedding and AdamW for the other layers.
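For reference, here is a minimal snippet (a sketch, assuming PyTorch 1.6 and a CUDA device) that reproduces the same error for me without the full model:

import torch
from torch.cuda.amp import autocast, GradScaler

# A single sparse embedding; sparse=True makes backward produce a sparse gradient.
emb = torch.nn.Embedding(10, 4, sparse=True).cuda()
opt = torch.optim.SparseAdam(emb.parameters(), lr=1e-3)
scaler = GradScaler()

with autocast():
    loss = emb(torch.tensor([1, 2], device="cuda")).sum()
scaler.scale(loss).backward()  # emb.weight.grad is now a (scaled) sparse tensor
scaler.step(opt)               # raises the SparseCUDA RuntimeError above
scaler.update()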
# Model class
import torch
from torch import optim
from torch.cuda.amp import autocast, GradScaler
from transformers import AdamW  # AdamW comes from the Hugging Face library

class TextClassifier(torch.nn.Module):
    def __init__(self, num_class=1, vocab_size=1000):
        super().__init__()
        self.embeddings = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=200, sparse=True)
        self.norm = torch.nn.LayerNorm(200)
        self.rnn = torch.nn.GRU(200, 256, bidirectional=True)
        self.cls = torch.nn.Linear(512, num_class)

    def forward(self, x):
        x = self.embeddings(x)
        x = self.norm(x)
        x, _ = self.rnn(x)
        x = x.max(1)[0]
        x = self.cls(x)
        return x
# Creates model and optimizers in default precision
model = TextClassifier(1, 50000).cuda()  # on a CUDA device, as required by torch.cuda.amp
lr = 1e-3  # placeholder value; the actual learning rate is not relevant to the error

embedding_names = {"embeddings.weight"}
dense_named_params = [(n, p) for n, p in model.named_parameters() if n not in embedding_names]
sparse_named_params = [(n, p) for n, p in model.named_parameters() if n in embedding_names]

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
dense_groups = [
    {'params': [p for n, p in dense_named_params if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
    {'params': [p for n, p in dense_named_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
sparse_groups = [
    {'params': [p for n, p in sparse_named_params if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
    {'params': [p for n, p in sparse_named_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

optimizers = [optim.SparseAdam(sparse_groups, lr=lr), AdamW(dense_groups, lr=lr)]
# Creates a GradScaler once at the beginning of training.
scaler = GradScaler()

for epoch in epochs:  # epochs, data and loss_fn are defined elsewhere
    for input, target in data:
        for opt in optimizers:
            opt.zero_grad()

        # Runs the forward pass with autocasting.
        with autocast():
            output = model(input)
            loss = loss_fn(output, target)

        # Scales loss. Calls backward() on scaled loss to create scaled gradients.
        # Backward passes under autocast are not recommended.
        # Backward ops run in the same dtype autocast chose for the corresponding forward ops.
        scaler.scale(loss).backward()

        # scaler.step() first unscales the gradients of the optimizer's assigned params.
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called;
        # otherwise, optimizer.step() is skipped.
        for opt in optimizers:
            scaler.step(opt)  # this is where the SparseCUDA error is raised for SparseAdam

        # Updates the scale for the next iteration.
        scaler.update()
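The only workaround I can think of (not sure it is correct, so treat it as a sketch, under the assumption that dividing the sparse gradient by scaler.get_scale() matches the scaler's own unscaling) is to step AdamW through the scaler as usual and to unscale and step SparseAdam by hand, bypassing the scaler's inf/NaN check:

sparse_opt, dense_opt = optimizers  # SparseAdam first, AdamW second, as built above

scaler.scale(loss).backward()
scaler.step(dense_opt)  # dense grads go through the normal unscale + inf/NaN check

# Manually unscale the sparse embedding gradient; multiplying a sparse tensor
# by a scalar is supported. Caveat: this skips the inf/NaN check, so SparseAdam
# may step on a batch where AdamW was skipped.
inv_scale = 1.0 / scaler.get_scale()
for group in sparse_opt.param_groups:
    for p in group['params']:
        if p.grad is not None:
            p.grad = p.grad * inv_scale
sparse_opt.step()

scaler.update()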
UPDATE: added the model class above. AdamW comes from the Hugging Face library, but even with plain Adam it does not work, which makes me think the failure happens in the gradient unscaling rather than in a specific optimizer.