import pytorch_lightning as pl
import torch
from torch import nn, Tensor

# PositionalEncoder is a custom module defined elsewhere in my code.

class LitModel(pl.LightningModule):
    def __init__(
        self,
        data: Tensor,
        enc_seq_len: int,
        dec_seq_len: int,
        output_seq_len: int,
        batch_first: bool,
        learning_rate: float,
        max_seq_len: int = 5000,
        dim_model: int = 512,
        n_layers: int = 4,
        n_heads: int = 8,
        dropout_encoder: float = 0.2,
        dropout_decoder: float = 0.2,
        dropout_pos_enc: float = 0.1,
        dim_feedforward_encoder: int = 2048,
        dim_feedforward_decoder: int = 2048
    ):
        super().__init__()
        self.data = data
        self.enc_seq_len = enc_seq_len
        self.dec_seq_len = dec_seq_len
        self.output_seq_len = output_seq_len
        self.batch_first = batch_first
        self.learning_rate = learning_rate
        self.loss = nn.CrossEntropyLoss()
        self.batch_size = 32
        self.test_size = 31 * 24

        # Linear layers used for the encoder/decoder input layers and the final linear mapping
        self.encoder_input_layer = nn.Linear(
            in_features=enc_seq_len,
            out_features=dim_model
        )
        self.decoder_input_layer = nn.Linear(
            in_features=dec_seq_len,
            out_features=dim_model
        )
        self.linear_mapping = nn.Linear(
            in_features=dim_model,
            out_features=output_seq_len
        )

        # Create the positional encoder
        self.positional_encoding_layer = PositionalEncoder(
            d_model=dim_model,
            dropout=dropout_pos_enc,
            max_seq_len=max_seq_len,
            batch_first=batch_first
        )

        # Encoder layer exactly as in the original Transformer
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim_model,
            nhead=n_heads,
            dim_feedforward=dim_feedforward_encoder,
            dropout=dropout_encoder,
            batch_first=batch_first
        )
        # Stack the encoder layers in nn.TransformerEncoder
        self.encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layer,
            num_layers=n_layers,
            norm=None
        )

        # Decoder layer
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=dim_model,
            nhead=n_heads,
            dim_feedforward=dim_feedforward_decoder,
            dropout=dropout_decoder,
            batch_first=batch_first
        )
        # Stack the decoder layers
        self.decoder = nn.TransformerDecoder(
            decoder_layer=decoder_layer,
            num_layers=n_layers,
            norm=None
        )

    def forward(self, src: Tensor, tgt: Tensor, src_mask: Tensor = None, tgt_mask: Tensor = None) -> Tensor:
        # Permute from shape [batch size, seq len, num features] to [seq len, batch size, num features]
        if not self.batch_first:
            shape_before = src.size()
            src = src.permute(1, 0, 2)
            print("src shape changed from {} to {}".format(shape_before, src.shape))
            shape_before = tgt.size()
            tgt = tgt.permute(1, 0, 2)
            print("tgt shape changed from {} to {}".format(shape_before, tgt.shape))

        # Pass through the input layer before the encoder
        src = self.encoder_input_layer(src)
        # Pass through the positional encoding layer
        src = self.positional_encoding_layer(src)
        # Pass through all the stacked encoder layers
        src = self.encoder(src=src)

        # Pass the decoder input through the decoder input layer
        decoder_output = self.decoder_input_layer(tgt)
        # Pass through the decoder
        decoder_output = self.decoder(
            tgt=decoder_output,
            memory=src,
            tgt_mask=tgt_mask,
            memory_mask=src_mask
        )
        decoder_output = self.linear_mapping(decoder_output)
        return decoder_output
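For reference, this is the kind of mask that forward() expects, i.e. the standard additive causal mask used by nn.Transformer's attention (a generic sketch with placeholder sequence lengths, not my exact mask-creation code):

import torch
from torch import Tensor

# Generic sketch of an additive causal attention mask; the sizes (48, 168)
# are placeholders, not my real sequence lengths.
def causal_mask(dim1: int, dim2: int) -> Tensor:
    # -inf strictly above the diagonal, 0 on and below it
    return torch.triu(torch.full((dim1, dim2), float("-inf")), diagonal=1)

tgt_mask = causal_mask(48, 48)     # shape [dec seq len, dec seq len], passed as tgt_mask
src_mask = causal_mask(48, 168)    # shape [dec seq len, enc seq len], passed as src_mask (used as memory_mask)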
I am currently implementing a simple transformer on time series data. When executing the above network in Colab on a GPU, I get the following error:
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in _scaled_dot_product_attention(q, k, v, attn_mask, dropout_p)
   4850     # (B, Nt, E) x (B, E, Ns) -> (B, Nt, Ns)
   4851     if attn_mask is not None:
-> 4852         attn = torch.baddbmm(attn_mask, q, k.transpose(-2, -1))
   4853     else:
   4854         attn = torch.bmm(q, k.transpose(-2, -1))
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument batch1 in method wrapper_baddbmm)
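As far as I can tell, the error just means the three tensors going into torch.baddbmm are not on the same device; it can be reproduced in isolation like this (toy shapes, assuming a CUDA device is available):

import torch

# Toy reproduction of the mismatch: q and k live on the GPU,
# but the additive attention mask is still a CPU tensor.
q = torch.randn(8, 10, 64, device="cuda:0")
k = torch.randn(8, 10, 64, device="cuda:0")
attn_mask = torch.zeros(8, 10, 10)                        # created on the CPU
attn = torch.baddbmm(attn_mask, q, k.transpose(-2, -1))   # raises the same RuntimeError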
It seems like it might be happening in the masked attention, since the tgt_mask and memory_mask passed to the decoder call are the only place I use masking.
PyTorch Lightning is supposed to remove the need for explicit .to(device) / .to('cuda') calls, so I am hoping the fix is not simply to sprinkle those in.
I also had this problem before converting the network into a PyTorch Lightning module (which is the reason I converted it in the first place).
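To be explicit about what I mean by that: my understanding is that Lightning moves the module's parameters and the batches coming from the DataLoader onto the GPU, but not tensors created by hand inside a step. A toy sketch of that understanding (not my actual training code):

import torch
import pytorch_lightning as pl
from torch import nn

class Demo(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(4, 4)     # parameters: moved to the GPU by the Trainer

    def training_step(self, batch, batch_idx):
        x, y = batch                     # batch: moved to the GPU by Lightning
        mask = torch.zeros(4, 4)         # created by hand: stays on the CPU by default
        return nn.functional.mse_loss(self.layer(x), y)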
Package versions:
pytorch_lightning 1.7.7
torch 1.12.1+cu113
I appreciate any help.