This works fine on CPU, and yes, I have read the related Stack Overflow and PyTorch Discuss posts about common CUDA errors. No, my inputs do not contain more classes than the model expects. No, my tensor shapes do not mismatch. I am running out of things to try.
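For context, I know CUDA reports device-side asserts asynchronously, so the traceback below may point at a later op than the one that actually failed; the usual first step from those posts is to force synchronous kernel launches. A minimal sketch of that (train.py is a placeholder for my script):

# Force synchronous CUDA kernel launches so the traceback points at
# the kernel that actually fired the assert, not a later op.
# Set this before any CUDA work happens, or export it in the shell:
#   CUDA_LAUNCH_BLOCKING=1 python train.py
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"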
ERROR:
<ipython-input-12-0a931e7c38e6> in forward(self, query, key, value, attention_mask)
74 # print(self.weights_query)
75 # print(self.weights_query(query))
---> 76 query_score = self.weights_query(query).view(batch_size, -1, self.number_of_heads, self.dimension_query).transpose(1,2) # q_s: [batch_size x n_heads x len_q x d_k]
77 key_score = self.weights_key(key).view(batch_size, -1, self.number_of_heads, self.dimension_key).transpose(1,2) # k_s: [batch_size x n_heads x len_k x d_k]
78 value_score = self.weights_value(value).view(batch_size, -1, self.number_of_heads, self.dimension_value).transpose(1,2) # v_s: [batch_size x n_heads x len_k x d_v]
/usr/local/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
487 result = self._slow_forward(*input, **kwargs)
488 else:
--> 489 result = self.forward(*input, **kwargs)
490 for hook in self._forward_hooks.values():
491 hook_result = hook(self, input, result)
/usr/local/lib/python3.7/site-packages/torch/nn/modules/linear.py in forward(self, input)
65 @weak_script_method
66 def forward(self, input):
---> 67 return F.linear(input, self.weight, self.bias)
68
69 def extra_repr(self):
/usr/local/lib/python3.7/site-packages/torch/nn/functional.py in linear(input, weight, bias)
1352 ret = torch.addmm(torch.jit._unwrap_optional(bias), input, weight.t())
1353 else:
-> 1354 output = input.matmul(weight.t())
1355 if bias is not None:
1356 output += torch.jit._unwrap_optional(bias)
RuntimeError: CUDA error: device-side assert triggered
Relevant Code:
import math

import numpy as np
import torch
import torch.nn as nn


def gaussian_error_linear_unit_activation(x):
    # exact (erf-based) GELU: x * Phi(x), where Phi is the standard normal CDF
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
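This is the exact erf-based GELU. A quick sanity check of the formula (output values approximate):

# gelu(0) = 0, gelu(x) ~ x for large x, gelu(x) ~ 0 for very negative x
x = torch.tensor([-5.0, 0.0, 5.0])
print(gaussian_error_linear_unit_activation(x))
# roughly tensor([-1.4e-06, 0.0, 5.0])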
class TokenPositionSegmentEmbedding(nn.Module):
    def __init__(self,
                 vocabulary_size: int,
                 embedding_size: int,
                 maximum_length: int,
                 ):
        super(TokenPositionSegmentEmbedding, self).__init__()
        # embedding for the tokens
        self.token_embedding = nn.Embedding(vocabulary_size, embedding_size)
        # embedding for the corresponding positions
        self.position_embedding = nn.Embedding(maximum_length, embedding_size)
        self.norm = nn.LayerNorm(embedding_size)

    def forward(self, tokens):
        sequence_length = tokens.size(1)
        position = torch.arange(sequence_length, dtype=torch.long).to(tokens.device)
        # (sequence_length,) -> (batch_size, sequence_length)
        position = position.unsqueeze(0).expand_as(tokens)
        embedding = (
            self.token_embedding(tokens)
            + self.position_embedding(position)
        )
        return self.norm(embedding)
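Since out-of-range indices in an embedding lookup are a classic source of exactly this assert (and only surface at a later kernel), here is a sketch of the kind of check I can run on a batch before moving anything to the GPU. tokens, vocabulary_size, and maximum_length refer to the same quantities as in the module above:

# Sketch: validate one batch of token ids on CPU.
assert tokens.min().item() >= 0, 'negative token id'
assert tokens.max().item() < vocabulary_size, 'token id >= vocabulary_size'
assert tokens.size(1) <= maximum_length, 'sequence longer than position table'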
class ScaledDotProductAttention(nn.Module):
    def __init__(self, dimension_key: int):
        super(ScaledDotProductAttention, self).__init__()
        # the key dimension is the same as the query dimension
        self.dimension_key = dimension_key

    def forward(self, query, key, value, attention_mask):
        # scores: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        scores = torch.matmul(query, key.transpose(-1, -2)) / np.sqrt(self.dimension_key)
        # fill masked positions with a large negative value so their
        # softmax weight is effectively zero
        scores.masked_fill_(attention_mask, -1e9)
        attention = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attention, value)
        return context, attention
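For reference, the shapes I expect through the attention, as a minimal CPU-only example (all sizes made up):

# batch_size=2, n_heads=4, len_q=len_k=5, d_k=d_v=8
sdpa = ScaledDotProductAttention(dimension_key=8)
q = torch.randn(2, 4, 5, 8)
k = torch.randn(2, 4, 5, 8)
v = torch.randn(2, 4, 5, 8)
# nothing masked; on very old PyTorch use dtype=torch.uint8 instead
mask = torch.zeros(2, 4, 5, 5, dtype=torch.bool)
context, attention = sdpa(q, k, v, mask)
print(context.shape)    # torch.Size([2, 4, 5, 8])
print(attention.shape)  # torch.Size([2, 4, 5, 5])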
class MultiHeadAttention(nn.Module):
    def __init__(
        self,
        embedding_size: int,
        dimension_query: int,
        dimension_key: int,
        dimension_value: int,
        number_of_heads: int,
        batch_size: int,
    ):
        assert dimension_query == dimension_key, 'query and key do not share the same dimension!'
        super(MultiHeadAttention, self).__init__()
        self.embedding_size = embedding_size
        self.dimension_query = dimension_query
        self.dimension_key = dimension_key
        self.dimension_value = dimension_value
        self.number_of_heads = number_of_heads
        self.batch_size = batch_size
        self.weights_query = nn.Linear(embedding_size, number_of_heads * dimension_query)
        self.weights_key = nn.Linear(embedding_size, number_of_heads * dimension_key)
        self.weights_value = nn.Linear(embedding_size, number_of_heads * dimension_value)
        self.attention = ScaledDotProductAttention(dimension_key=dimension_key)
        # defined here rather than in forward so they are registered as
        # submodules, trained, and moved to the GPU with the rest of the model
        self.output_projection = nn.Linear(number_of_heads * dimension_value, embedding_size)
        self.norm = nn.LayerNorm(embedding_size)

    def forward(self, query, key, value, attention_mask):
        # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]
        residual, batch_size = query, query.size(0)
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        # print(query.shape, query)
        # print(self.embedding_size, self.number_of_heads * self.dimension_query)
        # print(self.weights_query)
        # print(self.weights_query(query))
        query_score = self.weights_query(query).view(batch_size, -1, self.number_of_heads, self.dimension_query).transpose(1, 2)  # q_s: [batch_size x n_heads x len_q x d_k]
        key_score = self.weights_key(key).view(batch_size, -1, self.number_of_heads, self.dimension_key).transpose(1, 2)  # k_s: [batch_size x n_heads x len_k x d_k]
        value_score = self.weights_value(value).view(batch_size, -1, self.number_of_heads, self.dimension_value).transpose(1, 2)  # v_s: [batch_size x n_heads x len_k x d_v]
        attention_mask = attention_mask.unsqueeze(1).repeat(1, self.number_of_heads, 1, 1)  # attn_mask: [batch_size x n_heads x len_q x len_k]
        # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        context, attention = self.attention(query_score, key_score, value_score, attention_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.number_of_heads * self.dimension_value)  # context: [batch_size x len_q x n_heads * d_v]
        output = self.output_projection(context)
        return self.norm(output + residual), attention  # output: [batch_size x len_q x d_model]
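For completeness, this is the kind of minimal run that works for me on CPU (all sizes made up), which is why I say the failure only shows up on CUDA:

# Minimal CPU smoke test with made-up sizes.
vocabulary_size, embedding_size, maximum_length = 100, 32, 16
number_of_heads, d_qkv, batch_size = 4, 8, 2

embedding = TokenPositionSegmentEmbedding(vocabulary_size, embedding_size, maximum_length)
mha = MultiHeadAttention(embedding_size, d_qkv, d_qkv, d_qkv, number_of_heads, batch_size)

tokens = torch.randint(0, vocabulary_size, (batch_size, 10))
x = embedding(tokens)  # [2, 10, 32]
pad_mask = torch.zeros(batch_size, 10, 10, dtype=torch.bool)  # nothing masked
output, attention = mha(x, x, x, pad_mask)
print(output.shape)  # torch.Size([2, 10, 32])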