RuntimeError: Expected attn_mask dtype to be bool or to match query dtype, but got attn_mask.dtype: float and query.dtype: double instead

I am working on time series forecasting with a Transformer in PyTorch. The model builds fine, but I get the error below.


  File C:\ProgramData\anaconda3\Lib\site-packages\torch\nn\modules\activation.py:1241 in forward
    attn_output, attn_output_weights = F.multi_head_attention_forward(

  File C:\ProgramData\anaconda3\Lib\site-packages\torch\nn\functional.py:5440 in multi_head_attention_forward
    attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)

RuntimeError: Expected attn_mask dtype to be bool or to match query dtype, but got attn_mask.dtype: float and  query.dtype: double instead.

Below is the whole procedure.

1 - Preprocessing step

dim_val = 512
n_heads = 4
n_decoder_layers = 2
n_encoder_layers = 2
dec_seq_len = 1 # length of input given to decoder
enc_seq_len = 5 # length of input given to encoder
output_sequence_length = 1 # target sequence length. If hourly data and length = 48, you predict 2 days ahead
window_size = enc_seq_len + output_sequence_length # used to slice data into sub-sequences
step_size = 1 # Step size, i.e. how many time steps does the moving window move at each step
in_features_encoder_linear_layer = 2048
in_features_decoder_linear_layer = 2048
max_seq_len = enc_seq_len
batch_first = True
target_seq_len = 1


def is_ne_in_df(df:pd.DataFrame):
    for col in df.columns:
        true_bool = (df[col] == "n/e")
        if any(true_bool):
            return True
    return False

def to_numeric_and_downcast_data(df: pd.DataFrame):
    fcols = df.select_dtypes('float64').columns
    icols = df.select_dtypes('integer').columns
    df[fcols] = df[fcols].apply(pd.to_numeric, downcast='float')
    df[icols] = df[icols].apply(pd.to_numeric, downcast='integer')
    return df

def process_missing_and_duplicate_timestamps(filepath, train_size=80, val_size=50, verbose=True):
    df = pd.read_csv(filepath)
    df.sort_values('Datetime', inplace=True)
    df.reset_index(drop=True, inplace=True)

    indices_to_remove = []
    rows_to_add = []
    hour_counter = 1
    prev_date = ''

    if verbose:
        print(filepath)

    for index, row in df.iterrows():
        date_str = row['Datetime']

        year_str = date_str[0:4]
        month_str = date_str[5:7]
        day_str = date_str[8:10]
        hour_str = date_str[11:13]
        tail_str = date_str[14:]

        def date_to_str():
            return '-'.join([year_str, month_str, day_str]) + ' ' + ':'.join([hour_str, tail_str])

        def date_with_hour(hour):
            hour = '0' + str(hour) if hour < 10 else str(hour)
            return '-'.join([year_str, month_str, day_str]) + ' ' + ':'.join([hour, tail_str])

        if hour_counter != int(hour_str):
            if prev_date == date_to_str():
                # Duplicate datetime, calculate the average and keep only one
                average = int((df.iat[index, 1] + df.iat[index - 1, 1]) / 2)  # Get the average
                df.iat[index, 1] = average
                indices_to_remove.append(index - 1)
                if verbose:
                    print('Duplicate ' + date_to_str() + ' with average ' + str(average))
            elif hour_counter < 23:
                # Missing datetime, add it using the average of the previous and next for the consumption (MWs)
                average = int((df.iat[index, 1] + df.iat[index - 1, 1]) / 2)
                rows_to_add.append(pd.Series([date_with_hour(hour_counter), average], index=df.columns))
                if verbose:
                    print('Missing ' + date_with_hour(hour_counter) + ' with average ' + str(average))
            else:
                print(date_to_str() + ' and hour_counter ' + str(hour_counter) + " with previous: " + prev_date)

            # Adjust for the missing/duplicate value
            if prev_date < date_to_str():
                hour_counter = (hour_counter + 1) % 24
            else:
                hour_counter = (hour_counter - 1) if hour_counter - 1 > 0 else 0

        # Increment the hour
        hour_counter = (hour_counter + 1) % 24
        prev_date = date_str

    df.drop(indices_to_remove, inplace=True)
    if rows_to_add:
        new_rows = pd.concat(rows_to_add, axis=1).transpose()
        df = pd.concat([df, new_rows], ignore_index=True)  # Concatenating the new rows

    # New rows are added at the end, sort them and also recalculate the indices
    df.sort_values('Datetime', inplace=True)
    df.reset_index(drop=True, inplace=True)
    df = df.set_index("Datetime")
    if is_ne_in_df(df):
        raise ValueError("data frame contains 'n/e' values. These must be handled")
    df = to_numeric_and_downcast_data(df)
    
    train = df[:len(df) * train_size // 100]
    val = df[len(train) : len(train) + ((len(df) - len(train)) * val_size) // 100]
    test = df[len(val) + len(train) : ]
    
    return df , train , val , test
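
For reference, a minimal sketch of how this function could produce the train_set/val_set/test_set used in the windowing step below (the file name and the variable names here are my assumption, not fixed anywhere above):

# Hypothetical usage: split the hourly CSV into train/validation/test frames
df, train_set, val_set, test_set = process_missing_and_duplicate_timestamps(
    "AEP_hourly.csv",   # assumed path to the raw CSV
    train_size=80,      # 80% of the rows go to training
    val_size=50,        # half of the remaining rows go to validation
    verbose=False,
    )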

Windowing technique

def get_indices_entire_sequence(data: pd.DataFrame, window_size: int, step_size: int) -> list:

        stop_position = len(data)-1 # 1- because of 0 indexing
        
        # Start the first sub-sequence at index position 0
        subseq_first_idx = 0
        
        subseq_last_idx = window_size
        
        indices = []
        
        while subseq_last_idx <= stop_position:

            indices.append((subseq_first_idx, subseq_last_idx))
            
            subseq_first_idx += step_size
            
            subseq_last_idx += step_size

        return indices
    
train_indices = get_indices_entire_sequence(data = train_set, window_size = window_size, step_size = step_size)
test_indices = get_indices_entire_sequence(data = test_set, window_size = window_size, step_size = step_size)
val_indices = get_indices_entire_sequence(data = val_set, window_size = window_size, step_size = step_size)

train_set = train_set.astype(np.float64)
test_set = test_set.astype(np.float64)
val_set = val_set.astype(np.float64)

# Fit the scaler on the training split only and reuse it for the test/validation splits
scaler = MinMaxScaler(feature_range=(0, 1))
train_set = scaler.fit_transform(train_set)
test_set = scaler.transform(test_set)
val_set = scaler.transform(val_set)

# MinMaxScaler returns float64 NumPy arrays here, so these tensors are torch.float64 (double)
train_set = torch.tensor(train_set)
test_set = torch.tensor(test_set)
val_set = torch.tensor(val_set)

def generate_square_subsequent_mask(dim1: int, dim2: int):
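    # torch.ones uses the default dtype (float32 unless changed), so this additive -inf mask is float32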
    return torch.triu(torch.ones(dim1, dim2) * float('-inf'), diagonal=1)

class TransformerDataset(Dataset):
    def __init__(self, 
        data: torch.tensor,
        indices: list, 
        enc_seq_len: int, 
        dec_seq_len: int, 
        target_seq_len: int
        ) -> None:
        
        super().__init__()

        self.indices = indices

        self.data = data

        print("From get_src_trg: data size = {}".format(data.size()))

        self.enc_seq_len = enc_seq_len

        self.dec_seq_len = dec_seq_len

        self.target_seq_len = target_seq_len

    def __len__(self):
        
        return len(self.indices)

    def __getitem__(self, index):
        # Get the first element of the i'th tuple in the list self.indices
        start_idx = self.indices[index][0]

        # Get the second (and last) element of the i'th tuple in the list self.indices
        end_idx = self.indices[index][1]

        sequence = self.data[start_idx:end_idx]

        src, trg, trg_y = self.get_src_trg(
            sequence=sequence,
            enc_seq_len=self.enc_seq_len,
            dec_seq_len=self.dec_seq_len,
            target_seq_len=self.target_seq_len
            )

        return src, trg, trg_y
    
    def get_src_trg(
        self,
        sequence: torch.Tensor, 
        enc_seq_len: int, 
        dec_seq_len: int, 
        target_seq_len: int
        ) -> Tuple[torch.tensor, torch.tensor, torch.tensor]:
        assert len(sequence) == enc_seq_len + target_seq_len, "Sequence length does not equal (input length + target length)"

        src = sequence[:enc_seq_len] 

        trg = sequence[enc_seq_len-1:len(sequence)-1]
        
        assert len(trg) == target_seq_len, "Length of trg does not match target sequence length"

        # The target sequence against which the model output will be compared to compute loss
        trg_y = sequence[-target_seq_len:]

        assert len(trg_y) == target_seq_len, "Length of trg_y does not match target sequence length"

        return src, trg, trg_y.squeeze(-1)

Transformer encoder, decoder, and positional encoding layers

class PositionalEncoder(nn.Module):
    def __init__(self, dropout: float=0.1, max_seq_len: int=5000, d_model: int=512, batch_first: bool=True):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(p=dropout)
        self.batch_first = batch_first
        position = torch.arange(max_seq_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        if self.batch_first:
            pe = torch.zeros(1, max_seq_len, d_model)
            pe[0, :, 0::2] = torch.sin(position * div_term)
            pe[0, :, 1::2] = torch.cos(position * div_term)
        else:
            pe = torch.zeros(max_seq_len, 1, d_model)
            pe[:, 0, 0::2] = torch.sin(position * div_term)
            pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)
        
    def forward(self, x: Tensor) -> Tensor:
        if self.batch_first:
            x = x + self.pe[:,:x.size(1)]
        else:
            x = x + self.pe[:x.size(0)]
        return self.dropout(x)

class TimeSeriesTransformer(nn.Module):
    def __init__(self, 
        input_size: int,
        dec_seq_len: int,
        batch_first: bool,
        out_seq_len: int=1,
        dim_val: int=512,  
        n_encoder_layers: int=4,
        n_decoder_layers: int=4,
        n_heads: int=8,
        dropout_encoder = 0.2, 
        dropout_decoder = 0.2,
        dropout_pos_enc = 0.1,
        dim_feedforward_encoder: int=2048,
        dim_feedforward_decoder: int=2048,
        num_predicted_features: int=1
        ): 

        super().__init__() 

        self.dec_seq_len = dec_seq_len

        self.encoder_input_layer = nn.Linear(
            in_features=input_size, 
            out_features=dim_val 
            )

        self.decoder_input_layer = nn.Linear(
            in_features=num_predicted_features,
            out_features=dim_val
            )  
        
        self.linear_mapping = nn.Linear(
            in_features=dim_val, 
            out_features=num_predicted_features
            )

        # Create positional encoder
        self.positional_encoding_layer = PositionalEncoder(
            d_model=dim_val,
            dropout=dropout_pos_enc
            )

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim_val, 
            nhead=n_heads,
            dim_feedforward=dim_feedforward_encoder,
            dropout=dropout_encoder,
            batch_first=batch_first
            )


        self.encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layer,
            num_layers=n_encoder_layers, 
            norm=None
            )

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=dim_val,
            nhead=n_heads,
            dim_feedforward=dim_feedforward_decoder,
            dropout=dropout_decoder,
            batch_first=batch_first
            )

        self.decoder = nn.TransformerDecoder(
            decoder_layer=decoder_layer,
            num_layers=n_decoder_layers, 
            norm=None
            )

    def forward(self, src: Tensor, tgt: Tensor, src_mask: Tensor=None, 
                tgt_mask: Tensor=None) -> Tensor:

        src = self.encoder_input_layer(src) 

        src = self.positional_encoding_layer(src) 
        src = self.encoder(src=src)

        decoder_output = self.decoder_input_layer(tgt) 
        decoder_output = self.decoder(
            tgt=decoder_output,
            memory=src,
            tgt_mask=tgt_mask,
            memory_mask=src_mask
            )

        decoder_output = self.linear_mapping(decoder_output)
        return decoder_output

When I run the model:

model = TimeSeriesTransformer(
    input_size=1,
    dec_seq_len=enc_seq_len,
    batch_first=batch_first,
    num_predicted_features=1
    ).to(torch.float64)

i, batch = next(enumerate(train_loader))
src, trg, trg_y = batch

output = model(
    src=src,
    tgt=trg,
    src_mask=src_mask,
    tgt_mask=tgt_mask
    )

The following error is raised:


  File C:\ProgramData\anaconda3\Lib\site-packages\torch\nn\modules\activation.py:1241 in forward
    attn_output, attn_output_weights = F.multi_head_attention_forward(

  File C:\ProgramData\anaconda3\Lib\site-packages\torch\nn\functional.py:5440 in multi_head_attention_forward
    attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)

RuntimeError: Expected attn_mask dtype to be bool or to match query dtype, but got attn_mask.dtype: float and  query.dtype: double instead.

Did you check the dtypes of the inputs to the failing operation? It seems you are mixing float32 and float64 types.
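
A quick way to check (and align) them before the forward pass could look like the snippet below; the variable names follow the code above, and casting the masks is only one possible fix (keeping both the model and the inputs in the default float32 instead of float64 would avoid it as well):

# Inspect the dtypes that reach the attention call
print(src.dtype, trg.dtype)            # torch.float64 after the .astype(np.float64) step
print(src_mask.dtype, tgt_mask.dtype)  # torch.float32, since torch.ones defaults to float32

# Cast the additive masks to the query dtype so everything matches
src_mask = src_mask.to(src.dtype)
tgt_mask = tgt_mask.to(trg.dtype)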

Thank you, Mr. Ptrblck.

I solved the previous problem, but a new issue has come up that I have been trying to address for the past few days. I have included all the variables and the model I built, which I am running on CUDA. The following error is raised:

/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in multi_head_attention_forward(query, key, value, embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias, bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight, out_proj_bias, training, key_padding_mask, need_weights, attn_mask, use_separate_proj_weight, q_proj_weight, k_proj_weight, v_proj_weight, static_k, static_v, average_attn_weights, is_causal)
   5438         v = v.view(bsz, num_heads, src_len, head_dim)
   5439 
-> 5440         attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
   5441         attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
   5442 

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument attn_bias in method wrapper_CUDA___scaled_dot_product_efficient_attention)

Based on the error message it seems you might have forgotten to move e.g. the input to the GPU.
If you are stuck, post a minimal and executable code snippet to reproduce the issue.
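
One way to narrow it down is to print the device of everything that goes into the model right before the call (names taken from the posted snippet):

# All of these should report the same device, e.g. cuda:0
for name, t in [("src", src), ("trg", trg), ("src_mask", src_mask), ("tgt_mask", tgt_mask)]:
    print(name, t.device)
print("model", next(model.parameters()).device)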

Initializing the device in the Colab environment

device = (
    "mps"
    if getattr(torch, "has_mps", False)
    else "cuda"
    if torch.cuda.is_available()
    else "cpu"
)
print(f"Using device: {device}")

Positional Encoding

class PositionalEncoder(nn.Module):


    def __init__(
        self, 
        dropout: float=0.1, 
        max_seq_len: int=5000, 
        d_model: int=512,
        batch_first: bool=False
        ):


        super().__init__()

        self.d_model = d_model
        
        self.dropout = nn.Dropout(p=dropout)

        self.batch_first = batch_first

        # adapted from PyTorch tutorial
        position = torch.arange(max_seq_len).unsqueeze(1)
        
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        
        if self.batch_first:
            pe = torch.zeros(1, max_seq_len, d_model)
            
            pe[0, :, 0::2] = torch.sin(position * div_term)
            
            pe[0, :, 1::2] = torch.cos(position * div_term)
        else:
            pe = torch.zeros(max_seq_len, 1, d_model)
        
            pe[:, 0, 0::2] = torch.sin(position * div_term)
        
            pe[:, 0, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('pe', pe)
        
    def forward(self, x: Tensor) -> Tensor:

        if self.batch_first:
            x = x + self.pe[:,:x.size(1)]
        else:
            x = x + self.pe[:x.size(0)]

        return self.dropout(x)

Full Transformer encoder and decoder code using PyTorch

class TimeSeriesTransformer(nn.Module):

    def __init__(self, 
        input_size: int,
        dec_seq_len: int,
        batch_first: bool,
        out_seq_len: int=58,
        dim_val: int=512,  
        n_encoder_layers: int=4,
        n_decoder_layers: int=4,
        n_heads: int=8,
        dropout_encoder: float=0.2, 
        dropout_decoder: float=0.2,
        dropout_pos_enc: float=0.1,
        dim_feedforward_encoder: int=2048,
        dim_feedforward_decoder: int=2048,
        num_predicted_features: int=1,
        device = device
        ): 

        super().__init__() 

        self.dec_seq_len = dec_seq_len

        self.encoder_input_layer = nn.Linear(
            in_features=input_size, 
            out_features=dim_val 
            )

        self.decoder_input_layer = nn.Linear(
            in_features=num_predicted_features,
            out_features=dim_val
            )  
        
        self.linear_mapping = nn.Linear(
            in_features=dim_val, 
            out_features=num_predicted_features
            )

        # Create positional encoder
        self.positional_encoding_layer = PositionalEncoder(
            d_model=dim_val,
            dropout=dropout_pos_enc
            )

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim_val, 
            nhead=n_heads,
            dim_feedforward=dim_feedforward_encoder,
            dropout=dropout_encoder,
            batch_first=batch_first
            )

        self.encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layer,
            num_layers=n_encoder_layers, 
            norm=None
            )

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=dim_val,
            nhead=n_heads,
            dim_feedforward=dim_feedforward_decoder,
            dropout=dropout_decoder,
            batch_first=batch_first
            )

        self.decoder = nn.TransformerDecoder(
            decoder_layer=decoder_layer,
            num_layers=n_decoder_layers, 
            norm=None
            )

    def forward(self, src: Tensor, tgt: Tensor, src_mask: Tensor=None, 
                tgt_mask: Tensor=None) -> Tensor:

        src = self.encoder_input_layer(src) # src shape: [batch_size, src length, dim_val] regardless of number of input features
        #print("From model.forward(): Size of src after input layer: {}".format(src.size()))

        # Pass through the positional encoding layer
        src = self.positional_encoding_layer(src) # src shape: [batch_size, src length, dim_val] regardless of number of input features
        
        src = self.encoder( # src shape: [batch_size, enc_seq_len, dim_val]
            src=src
            )

        decoder_output = self.decoder_input_layer(tgt) 
        decoder_output = self.decoder(
            tgt=decoder_output,
            memory=src,
            tgt_mask=tgt_mask,
            memory_mask=src_mask
            )
        decoder_output = self.linear_mapping(decoder_output) 
     
        return decoder_output

Masking process

def generate_square_subsequent_mask(dim1: int, dim2: int , device = device) -> Tensor:

    return torch.triu(torch.ones(dim1, dim2) * float('-inf'), diagonal=1)

Defining the model

model = TimeSeriesTransformer(
    input_size=len(input_variables),
    dec_seq_len=enc_seq_len,
    batch_first=batch_first,
    num_predicted_features=1
    )
src_mask = generate_square_subsequent_mask(
    dim1=output_sequence_length,
    dim2=enc_seq_len
    )

tgt_mask = generate_square_subsequent_mask( 
    dim1=output_sequence_length,
    dim2=output_sequence_length
    )
model = model.to(device)

Tensor Loader

i, batch = next(enumerate(training_data))

src, trg, trg_y = batch

src = src.to(device)
trg = trg.to(device)
trg_y = trg_y.to(device)

Output

output = model(
    src=src,
    tgt=trg,
    src_mask=src_mask,
    tgt_mask=tgt_mask
    )

As a result, the error mentioned in the title is raised:

Your code is unfortunately not executable, so could you post the missing pieces and e.g. initialize the inputs with random data?

First step: importing all necessary packages

import os
import numpy as np
from torch import nn, Tensor
from typing import Optional, Any, Union, Callable, Tuple
import torch
import math
import pandas as pd
from pathlib import Path
import datetime
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler , StandardScaler

Second step: setting up the device in the Google Colab environment

try:
    import google.colab
    COLAB = True
    print("Note: using Google CoLab")
except:
    print("Note: not using Google CoLab")
    COLAB = False
# Make use of a GPU or MPS (Apple) if one is available.  (see module 3.2)
import torch
device = (
    "mps"
    if getattr(torch, "has_mps", False)
    else "cuda"
    if torch.cuda.is_available()
    else "cpu"
)
print(f"Using device: {device}")

Third step: masking method

def generate_square_subsequent_mask(dim1: int, dim2: int , device = device) -> Tensor:

    return torch.triu(torch.ones(dim1, dim2) * float('-inf'), diagonal=1)

Fourth step: entire-sequence indexing

def get_indices_entire_sequence(data: pd.DataFrame, window_size: int, step_size: int) -> list:


        stop_position = len(data)-1 # 1- because of 0 indexing
        
        # Start the first sub-sequence at index position 0
        subseq_first_idx = 0
        
        subseq_last_idx = window_size
        
        indices = []
        
        while subseq_last_idx <= stop_position:

            indices.append((subseq_first_idx, subseq_last_idx))
            
            subseq_first_idx += step_size
            
            subseq_last_idx += step_size

        return indices

Step five: reading the data

def read_data(data_dir: Union[str, Path] = "data",  
    timestamp_col_name: str="timestamp") -> pd.DataFrame:


    # Ensure that `data_dir` is a Path object
    data_dir = Path(data_dir)

    # Read csv file
    csv_files = list(data_dir.glob("*.csv"))
    
    if len(csv_files) > 1:
        raise ValueError("data_dir contains more than 1 csv file. Must only contain 1")
    elif len(csv_files) == 0:
        raise ValueError("data_dir must contain at least 1 csv file.")
        
    data_path = csv_files[0]

    print("Reading file in {}".format(data_path))

    data = pd.read_csv(
        data_path, 
        parse_dates=[timestamp_col_name], 
        index_col=[timestamp_col_name], 
        infer_datetime_format=True,
        low_memory=False
    )

    # Make sure all "n/e" values have been removed from df. 
    if is_ne_in_df(data):
        raise ValueError("data frame contains 'n/e' values. These must be handled")

    data = to_numeric_and_downcast_data(data)

    # Make sure data is in ascending order by timestamp
    data.sort_values(by=[timestamp_col_name], inplace=True)

    return data

def is_ne_in_df(df:pd.DataFrame):

    for col in df.columns:

        true_bool = (df[col] == "n/e")

        if any(true_bool):
            return True

    return False


def to_numeric_and_downcast_data(df: pd.DataFrame):

    fcols = df.select_dtypes('float').columns
    
    icols = df.select_dtypes('integer').columns

    df[fcols] = df[fcols].apply(pd.to_numeric, downcast='float')
    
    df[icols] = df[icols].apply(pd.to_numeric, downcast='integer')

    return df

Creating the dataset and time series windowing

class TransformerDataset(Dataset):
  
    def __init__(self, 
        data: torch.tensor,
        indices: list, 
        enc_seq_len: int, 
        dec_seq_len: int, 
        target_seq_len: int
        ) -> None:
        
        super().__init__()

        self.indices = indices

        self.data = data

        print("From get_src_trg: data size = {}".format(data.size()))

        self.enc_seq_len = enc_seq_len

        self.dec_seq_len = dec_seq_len

        self.target_seq_len = target_seq_len



    def __len__(self):
        
        return len(self.indices)

    def __getitem__(self, index):

        # Get the first element of the i'th tuple in the list self.indices
        start_idx = self.indices[index][0]

        # Get the second (and last) element of the i'th tuple in the list self.indices
        end_idx = self.indices[index][1]

        sequence = self.data[start_idx:end_idx]

        #print("From __getitem__: sequence length = {}".format(len(sequence)))

        src, trg, trg_y = self.get_src_trg(
            sequence=sequence,
            enc_seq_len=self.enc_seq_len,
            dec_seq_len=self.dec_seq_len,
            target_seq_len=self.target_seq_len
            )

        return src, trg, trg_y
    
    def get_src_trg(
        self,
        sequence: torch.Tensor, 
        enc_seq_len: int, 
        dec_seq_len: int, 
        target_seq_len: int
        ) -> Tuple[torch.tensor, torch.tensor, torch.tensor]:

        assert len(sequence) == enc_seq_len + target_seq_len, "Sequence length does not equal (input length + target length)"
        
        # encoder input
        src = sequence[:enc_seq_len] 
 
        trg = sequence[enc_seq_len-1:len(sequence)-1]
        
        assert len(trg) == target_seq_len, "Length of trg does not match target sequence length"

        # The target sequence against which the model output will be compared to compute loss
        trg_y = sequence[-target_seq_len:]

        assert len(trg_y) == target_seq_len, "Length of trg_y does not match target sequence length"

        return src, trg, trg_y.squeeze(-1)

Step six: positional encoding for Transformers

class PositionalEncoder(nn.Module):


    def __init__(
        self, 
        dropout: float=0.1, 
        max_seq_len: int=5000, 
        d_model: int=512,
        batch_first: bool=False
        ):


        super().__init__()

        self.d_model = d_model
        
        self.dropout = nn.Dropout(p=dropout)

        self.batch_first = batch_first

        # adapted from PyTorch tutorial
        position = torch.arange(max_seq_len).unsqueeze(1)
        
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        
        if self.batch_first:
            pe = torch.zeros(1, max_seq_len, d_model)
            
            pe[0, :, 0::2] = torch.sin(position * div_term)
            
            pe[0, :, 1::2] = torch.cos(position * div_term)
        else:
            pe = torch.zeros(max_seq_len, 1, d_model)
        
            pe[:, 0, 0::2] = torch.sin(position * div_term)
        
            pe[:, 0, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('pe', pe)
        
    def forward(self, x: Tensor) -> Tensor:

        if self.batch_first:
            x = x + self.pe[:,:x.size(1)]
        else:
            x = x + self.pe[:x.size(0)]

        return self.dropout(x)

Step seven: hyperparameters

# Hyperparams

test_size = 0.1

batch_size = 128

target_col_name = "AEP_MW"

timestamp_col = "Datetime"

# Only use data from this date and onwards

#cutoff_date = datetime.datetime(2017, 1, 1)

## Params

dim_val = 512

n_heads = 8

n_decoder_layers = 4

n_encoder_layers = 4

dec_seq_len = 92 # length of input given to decoder

enc_seq_len = 153 # length of input given to encoder

output_sequence_length = 48 # target sequence length. If hourly data and length = 48, you predict 2 days ahead

window_size = enc_seq_len + output_sequence_length # used to slice data into sub-sequences

step_size = 1 # Step size, i.e. how many time steps does the moving window move at each step

in_features_encoder_linear_layer = 2048

in_features_decoder_linear_layer = 2048

max_seq_len = enc_seq_len

batch_first = True

# Define input variables

exogenous_vars = [] # should contain strings. Each string must correspond to a column name

input_variables = [target_col_name] + exogenous_vars

target_idx = 0 # index position of target in batched trg_y

input_size = len(input_variables)

Step eight: reading the data using the function defined above (in my case the dataset is stored locally)

data = read_data("/content/",timestamp_col_name=timestamp_col)

training_data = data[:-(round(len(data)*test_size))]

Step nine: normalizing the data, creating the dataset and the PyTorch DataLoader

training_indices = get_indices_entire_sequence(
    data=training_data, 
    window_size=window_size, 
    step_size=step_size)

scaler = MinMaxScaler(feature_range=(0,1))
training_data = training_data["AEP_MW"]
training_data = pd.DataFrame(training_data)

training_data = scaler.fit_transform(training_data.values)

training_data = TransformerDataset(
    data=torch.tensor(training_data).float(),
    indices=training_indices,
    enc_seq_len=enc_seq_len,
    dec_seq_len=dec_seq_len,
    target_seq_len=output_sequence_length
    )

training_data = DataLoader(training_data, batch_size, shuffle=False , drop_last=True)

i, batch = next(enumerate(training_data))

src, trg, trg_y = batch

if batch_first == False:

    shape_before = src.shape
    src = src.permute(1, 0, 2)
    print("src shape changed from {} to {}".format(shape_before, src.shape))

    shape_before = trg.shape
    trg = trg.permute(1, 0, 2)
    print("src shape changed from {} to {}".format(shape_before, src.shape))

Step ten: defining the Transformer encoder-decoder model

class TimeSeriesTransformer(nn.Module):

    def __init__(self, 
        input_size: int,
        dec_seq_len: int,
        batch_first: bool,
        out_seq_len: int=58,
        dim_val: int=512,  
        n_encoder_layers: int=4,
        n_decoder_layers: int=4,
        n_heads: int=8,
        dropout_encoder: float=0.2, 
        dropout_decoder: float=0.2,
        dropout_pos_enc: float=0.1,
        dim_feedforward_encoder: int=2048,
        dim_feedforward_decoder: int=2048,
        num_predicted_features: int=1,
        device = device
        ): 

        super().__init__() 

        self.dec_seq_len = dec_seq_len

        self.encoder_input_layer = nn.Linear(
            in_features=input_size, 
            out_features=dim_val 
            )

        self.decoder_input_layer = nn.Linear(
            in_features=num_predicted_features,
            out_features=dim_val
            )  
        
        self.linear_mapping = nn.Linear(
            in_features=dim_val, 
            out_features=num_predicted_features
            )

        # Create positional encoder
        self.positional_encoding_layer = PositionalEncoder(
            d_model=dim_val,
            dropout=dropout_pos_enc
            )

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim_val, 
            nhead=n_heads,
            dim_feedforward=dim_feedforward_encoder,
            dropout=dropout_encoder,
            batch_first=batch_first
            )

        self.encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layer,
            num_layers=n_encoder_layers, 
            norm=None
            )

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=dim_val,
            nhead=n_heads,
            dim_feedforward=dim_feedforward_decoder,
            dropout=dropout_decoder,
            batch_first=batch_first
            )

        self.decoder = nn.TransformerDecoder(
            decoder_layer=decoder_layer,
            num_layers=n_decoder_layers, 
            norm=None
            )

    def forward(self, src: Tensor, tgt: Tensor, src_mask: Tensor=None, 
                tgt_mask: Tensor=None) -> Tensor:

        src = self.encoder_input_layer(src) # src shape: [batch_size, src length, dim_val] regardless of number of input features
        #print("From model.forward(): Size of src after input layer: {}".format(src.size()))

        # Pass through the positional encoding layer
        src = self.positional_encoding_layer(src) # src shape: [batch_size, src length, dim_val] regardless of number of input features
        
        src = self.encoder( # src shape: [batch_size, enc_seq_len, dim_val]
            src=src
            )

        decoder_output = self.decoder_input_layer(tgt) 
        decoder_output = self.decoder(
            tgt=decoder_output,
            memory=src,
            tgt_mask=tgt_mask,
            memory_mask=src_mask
            )
        decoder_output = self.linear_mapping(decoder_output) 
     
        return decoder_output

Step eleven: defining the model and the masks

model = TimeSeriesTransformer(
    input_size=len(input_variables),
    dec_seq_len=enc_seq_len,
    batch_first=batch_first,
    num_predicted_features=1
    )
src_mask = generate_square_subsequent_mask(
    dim1=output_sequence_length,
    dim2=enc_seq_len
    )

tgt_mask = generate_square_subsequent_mask( 
    dim1=output_sequence_length,
    dim2=output_sequence_length
    )
model = model.to(device)

Step twelve: fetching a batch and moving it to the device

i, batch = next(enumerate(training_data))

src, trg, trg_y = batch

src = src.to(device)
trg = trg.to(device)
trg_y = trg_y.to(device)

Output

output = model(
    src=src,
    tgt=trg,
    src_mask=src_mask,
    tgt_mask=tgt_mask
    )

I am doing time series forecasting using Transformers on the AEP_hourly.csv dataset, purely for educational purposes.

The AEP_hourly.csv dataset needs its own preprocessing step: before using it, running the code below is useful for removing an unnecessary column and duplicate observations.

def is_ne_in_df(df:pd.DataFrame):
    for col in df.columns:
        true_bool = (df[col] == "n/e")
        if any(true_bool):
            return True
    return False

def to_numeric_and_downcast_data(df: pd.DataFrame):
    fcols = df.select_dtypes('float').columns
    icols = df.select_dtypes('integer').columns
    df[fcols] = df[fcols].apply(pd.to_numeric, downcast='float')
    df[icols] = df[icols].apply(pd.to_numeric, downcast='integer')
    return df

def process_missing_and_duplicate_timestamps(filepath, train_size=80, val_size=50, verbose=True):
    df = pd.read_csv(filepath)
    df.sort_values('Datetime', inplace=True)
    df.reset_index(drop=True, inplace=True)

    indices_to_remove = []
    rows_to_add = []
    hour_counter = 1
    prev_date = ''

    if verbose:
        print(filepath)

    for index, row in df.iterrows():
        date_str = row['Datetime']

        year_str = date_str[0:4]
        month_str = date_str[5:7]
        day_str = date_str[8:10]
        hour_str = date_str[11:13]
        tail_str = date_str[14:]

        def date_to_str():
            return '-'.join([year_str, month_str, day_str]) + ' ' + ':'.join([hour_str, tail_str])

        def date_with_hour(hour):
            hour = '0' + str(hour) if hour < 10 else str(hour)
            return '-'.join([year_str, month_str, day_str]) + ' ' + ':'.join([hour, tail_str])

        if hour_counter != int(hour_str):
            if prev_date == date_to_str():
                # Duplicate datetime, calculate the average and keep only one
                average = int((df.iat[index, 1] + df.iat[index - 1, 1]) / 2)  # Get the average
                df.iat[index, 1] = average
                indices_to_remove.append(index - 1)
                if verbose:
                    print('Duplicate ' + date_to_str() + ' with average ' + str(average))
            elif hour_counter < 23:
                # Missing datetime, add it using the average of the previous and next for the consumption (MWs)
                average = int((df.iat[index, 1] + df.iat[index - 1, 1]) / 2)
                rows_to_add.append(pd.Series([date_with_hour(hour_counter), average], index=df.columns))
                if verbose:
                    print('Missing ' + date_with_hour(hour_counter) + ' with average ' + str(average))
            else:
                print(date_to_str() + ' and hour_counter ' + str(hour_counter) + " with previous: " + prev_date)

            # Adjust for the missing/duplicate value
            if prev_date < date_to_str():
                hour_counter = (hour_counter + 1) % 24
            else:
                hour_counter = (hour_counter - 1) if hour_counter - 1 > 0 else 0

        # Increment the hour
        hour_counter = (hour_counter + 1) % 24
        prev_date = date_str

    df.drop(indices_to_remove, inplace=True)
    if rows_to_add:
        new_rows = pd.concat(rows_to_add, axis=1).transpose()
        df = pd.concat([df, new_rows], ignore_index=True)  # Concatenating the new rows

    # New rows are added at the end, sort them and also recalculate the indices
    df.sort_values('Datetime', inplace=True)
    df.reset_index(drop=True, inplace=True)
    df = df.set_index("Datetime")
    if is_ne_in_df(df):
        raise ValueError("data frame contains 'n/e' values. These must be handled")
    df = to_numeric_and_downcast_data(df)
    
    train = df[:len(df) * train_size // 100]
    val = df[len(train) : len(train) + ((len(df) - len(train)) * val_size) // 100]
    test = df[len(val) + len(train) : ]
    
    return df , train , val , test

If you need anything else, I am happy to provide it here.

Running the whole code above raises the error below:

/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in multi_head_attention_forward(query, key, value, embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias, bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight, out_proj_bias, training, key_padding_mask, need_weights, attn_mask, use_separate_proj_weight, q_proj_weight, k_proj_weight, v_proj_weight, static_k, static_v, average_attn_weights, is_causal)
   5438         v = v.view(bsz, num_heads, src_len, head_dim)
   5439 
-> 5440         attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
   5441         attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
   5442 

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument attn_bias in method wrapper_CUDA___scaled_dot_product_efficient_attention)

Please remove all data loading and create random tensors to execute the model without any data dependency.
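
A minimal sketch of such a data-free reproduction, reusing the hyperparameters from step seven and the classes defined above (the shapes assume batch_first=True and a single input feature):

import torch

batch_size, enc_seq_len, output_sequence_length, input_size = 128, 153, 48, 1

src = torch.randn(batch_size, enc_seq_len, input_size, device=device)             # encoder input
trg = torch.randn(batch_size, output_sequence_length, input_size, device=device)  # decoder input
src_mask = generate_square_subsequent_mask(output_sequence_length, enc_seq_len)   # memory mask (tgt_len x src_len)
tgt_mask = generate_square_subsequent_mask(output_sequence_length, output_sequence_length)

model = TimeSeriesTransformer(input_size=input_size, dec_seq_len=enc_seq_len,
                              batch_first=True, num_predicted_features=1).to(device)
output = model(src=src, tgt=trg, src_mask=src_mask, tgt_mask=tgt_mask)  # reproduces the device mismatch when device is cuda
print(output.shape)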

Try changing this to:

def generate_square_subsequent_mask(dim1: int, dim2: int , device = device) -> Tensor:

    return torch.triu(torch.ones(dim1, dim2) * float('-inf'), diagonal=1).bool()

Notice the .bool() at the end. There are different approaches to implementing masks: either with -inf or with True/False values. Which one is correct depends on the implementation of the transformer.
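
For a concrete comparison, both masks below describe the same causal pattern for a 3-step target; the additive version has to share the query's floating-point dtype, while the boolean version is dtype-independent (True marks positions that are not allowed to attend):

import torch

# Additive mask: 0.0 where attention is allowed, -inf where it is blocked
float_mask = torch.triu(torch.ones(3, 3) * float('-inf'), diagonal=1)

# Boolean mask: True where attention is blocked
bool_mask = torch.triu(torch.ones(3, 3, dtype=torch.bool), diagonal=1)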

Thanks. The aforementioned bug was solved, but the new problem is that I still cannot find a solution to the device issue, even though I put both the model and all the variables on the device. If I select the CPU as the device, everything works fine. However, when I switch to the GPU using the same procedure, it does not work. If you have a solution I would be very happy; the whole code is above.

The raised error:

/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in multi_head_attention_forward(query, key, value, embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias, bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight, out_proj_bias, training, key_padding_mask, need_weights, attn_mask, use_separate_proj_weight, q_proj_weight, k_proj_weight, v_proj_weight, static_k, static_v, average_attn_weights, is_causal)
   5438         v = v.view(bsz, num_heads, src_len, head_dim)
   5439 
-> 5440         attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
   5441         attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
   5442 

`RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument attn_bias in method wrapper_CUDA___scaled_dot_product_efficient_attention)`

This problem typically arises if you forget to move some of the tensors to the GPU. Apart from the model itself, you have to make sure that any tensor you create gets moved to the same device.

For example, if you look into this notebook of mine and search for all occurrences of .to(DEVICE), you will see lines such as:

    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

Where the input sequence and the mask get explicitly moved to the device (e.g., the GPU).

You probably just need to check your code for that.
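
In the code posted above, that would presumably mean moving the two masks as well, since generate_square_subsequent_mask builds them on the CPU even though it accepts a device argument it never uses:

src_mask = src_mask.to(device)
tgt_mask = tgt_mask.to(device)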

Thanks, vdw, for your kind assistance. The problem is solved and it works for univariate time series forecasting. However, a new problem has come up with multivariate time series forecasting. I have posted the step-by-step details here (https://discuss.pytorch.org/t/runtimeerror-mat1-and-mat2-shapes-cannot-be-multiplied-64x13056-and-153600x2048/101315).

Once again, I will also post the generated error below:

Traceback (most recent call last):

  Cell In[348], line 1
    output = model(

  File C:\ProgramData\anaconda3\Lib\site-packages\torch\nn\modules\module.py:1518 in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)

  File C:\ProgramData\anaconda3\Lib\site-packages\torch\nn\modules\module.py:1527 in _call_impl
    return forward_call(*args, **kwargs)

  Cell In[344], line 80 in forward
    decoder_output = self.decoder_input_layer(tgt)

  File C:\ProgramData\anaconda3\Lib\site-packages\torch\nn\modules\module.py:1518 in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)

  File C:\ProgramData\anaconda3\Lib\site-packages\torch\nn\modules\module.py:1527 in _call_impl
    return forward_call(*args, **kwargs)

  File C:\ProgramData\anaconda3\Lib\site-packages\torch\nn\modules\linear.py:114 in forward
    return F.linear(input, self.weight, self.bias)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (25x7 and 1x512)
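
For context, the shapes in the message suggest the decoder input now carries 7 features per position while decoder_input_layer was still built with in_features=num_predicted_features=1. A minimal sketch of the same mismatch (the 7 is read off the error message, not from code shown here):

import torch
import torch.nn as nn

decoder_input_layer = nn.Linear(in_features=1, out_features=512)  # weight shape (512, 1)
tgt = torch.randn(25, 7)      # 7 features per row, as in "mat1 ... (25x7)"
decoder_input_layer(tgt)      # RuntimeError: mat1 and mat2 shapes cannot be multiplied (25x7 and 1x512)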

If you have any ideas, please share them with me; I would be very grateful.