Why is the processing speed of two GPUs (GTX 1660 Super and RTX 3090) the same?

Hi, can somebody help me?

I have two GPU workstations with the same CPU. One has a GTX 1660 Super and the other an RTX 3090. The problem is that the training speed on the two workstations is almost identical. The full code is shown below. Where is the problem? Please help me!

from __future__ import print_function
import argparse
import torch
import torch.utils.data
from torch import nn, optim
from torch.nn import functional as F
import torch.autograd.profiler as profiler
import pandas as pd
import numpy as np
from tqdm import tqdm
import random
from torch.utils.data import TensorDataset, DataLoader

from sklearn.model_selection import train_test_split

import matplotlib
from matplotlib import pyplot as plt

import os
PATH = os.getcwd()

# terminal command for convenience
parser = argparse.ArgumentParser(description='AE-based Inverse Rational Control.Stochastic.ver')
parser.add_argument('--batch-size', type=int, default=128, metavar='N',
                    help='input batch size for training (default: 128)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
                    help='number of epochs to train (default: 10)')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='how many batches to wait before logging training status')
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

# Dataset
DrivingData = pd.read_csv("../Trajectory_Dataset/Process_data2.csv")

# Dataset parameter
Seq_length = 10

tmpX_array = []
tmpY_array = []
for k in range(int(len(DrivingData) / 3000)):
    tmp_df = DrivingData[k * 3000:k * 3000 + 3000]
    tmp_X = tmp_df.drop(columns=['Unnamed: 0', 'idx', 's15', 's16', 's17'])
    tmp_Y = tmp_df.iloc[:, 16:19]
    for i in range(3000):
        if len(tmp_df.iloc[i:i + Seq_length].to_numpy()) == Seq_length:
            tmpX_array.append(tmp_X.iloc[i:i + Seq_length, ].to_numpy())
            tmpY_array.append(tmp_Y.iloc[i:i + Seq_length, ].to_numpy())

bs = 64
X_TrainLoader = torch.tensor(tmpX_array, dtype=torch.float32, device=torch.device('cuda'))
Y_TrainLoader = torch.tensor(tmpY_array, dtype=torch.float32, device=torch.device('cuda'))
XY_LOADER = TensorDataset(X_TrainLoader, Y_TrainLoader)
TrainLoader = torch.utils.data.DataLoader(dataset=XY_LOADER, batch_size=bs, shuffle=True)

# X_reshpae = torch.swapaxes(X_TrainLoader, 0, 1)
# Y_reshpae = torch.swapaxes(Y_TrainLoader, 0, 1)
# X_TrainLoader, X_TestLoader = train_test_split(X_TrainLoader, test_size=0.2)

# torch seed
random.seed(args.seed)
np.random.seed(args.seed)
os.environ["PYTHONHASHEED"] = str(args.seed)
torch.manual_seed(args.seed)

device = torch.device("cuda" if args.cuda else "cpu")
print(device)
kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}

loss_list = []

lr = 5e-03
num_obs = 3
batch_size = 64
class LSTMIRC(nn.Module):
    # Load Parameters of Actor
    # weights = np.load('../ModelParameter.csv.npy', allow_pickle=True)

    def __init__(self,
                 input_size1=16,
                 hidden_size1=16,
                 input_size2=16,
                 hidden_size2=16,
                 output_size=3,
                 num_layers=10):
        super(LSTMIRC, self).__init__()

        self.input_size1 = input_size1
        self.hidden_size1 = hidden_size1
        self.input_size2 = input_size2
        self.hidden_size2 = hidden_size2
        self.output_size = output_size
        self.num_layers = num_layers

        self.lstm1 = nn.LSTM(self.input_size1, self.hidden_size1, self.num_layers, batch_first=True,
                            dropout=0.1, bidirectional=False)
        self.lstm2 = nn.LSTM(self.input_size2, self.hidden_size2, self.num_layers, batch_first=True,
                            dropout=0.1, bidirectional=False)
        self.fc = nn.Linear(self.hidden_size2, self.output_size)

    def forward(self, x):
        # Initialize hidden state with zeros
        lstm1_h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size1, device=x.device).requires_grad_()
        lstm2_h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size2, device=x.device).requires_grad_()

        # Initialize cell state
        lstm1_c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size1, device=x.device).requires_grad_()
        lstm2_c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size2, device=x.device).requires_grad_()

        # input x.shape == (batch size, length of sequence, feature dimension)
        # input of LSTM2 == LSTM1 output.shape == (batch size, length of sequence, LSTM1_hidden_size)
        # input of FCnet == LSTM2 output.shape == (batch size, length of sequence, LSTM2_hidden_size)
        # output of FCnet == (batch size, length of sequence, output_size)
        with profiler.record_function("Forward"):
            output, (hidden1, cell1) = self.lstm1(x, (lstm1_h0, lstm1_c0))
            output, (hidden2, cell2) = self.lstm2(output, (lstm2_h0, lstm2_c0))
            #decode
            pred = F.relu(self.fc(output))

        return pred

model = LSTMIRC().cuda()
opt = optim.Adam(model.parameters(), lr=lr, eps=1e-08)

def loss_function(rc_x, x):
    # Reconstruction error: MSE
    MSE = F.mse_loss(rc_x, x, reduction='mean')
    return MSE

def train(epoch):
    model.train()
    train_loss = 0

    for batch_idx, (data, target) in enumerate(tqdm(TrainLoader)):
        # data and target were created directly on the GPU, so no .to(device) call is needed here
        # Clear the gradients accumulated in the previous step
        opt.zero_grad()
        Pred = model(data)

        loss = loss_function(Pred, target)

        # Backpropagation
        loss.backward(retain_graph=True)
        loss_list.append(loss.clone().detach())
        train_loss += loss.item()

        # Optimizer and Scheduler
        opt.step()
        # sched.step()

    print('====> Epoch: {} Average loss: {:.4f}'.format(
        epoch, train_loss / len(TrainLoader)))

if __name__ == "__main__":
    # print(device)
    for epoch in range(1, args.epochs + 1):
        train(epoch)

    torch.save(model.state_dict(), os.path.join(PATH, 'model.pt'))

    plt.plot(loss_list)
    plt.xlabel('number of the sample')
    plt.ylabel('loss')
    plt.show()

As general advice, take a look at the performance guide to get the best performance and to avoid bottlenecks.
Once that is done, you could profile your code with the PyTorch profiler or e.g. Nsight Systems to see where the bottlenecks in your code are.
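Something like this minimal sketch could show whether the GPU kernels or the CPU-side work dominates (the function name, num_steps, and the sort key are just placeholder choices, not part of your script):

# Minimal profiling sketch: run a few training iterations under torch.profiler
# and print a summary table sorted by CUDA time.
from torch.profiler import profile, record_function, ProfilerActivity

def profile_training_steps(model, loader, opt, loss_fn, num_steps=10):
    model.train()
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                 record_shapes=True) as prof:
        for step, (data, target) in enumerate(loader):
            if step >= num_steps:
                break
            with record_function("train_step"):
                opt.zero_grad()
                pred = model(data)
                loss = loss_fn(pred, target)
                loss.backward()
                opt.step()
    # If the CPU-side launch overhead dominates, it will show up here
    # rather than in the CUDA kernel times.
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))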

Thank you @ptrblck!
I checked the profiling information using the PyTorch profiler, and I found something weird. On the RTX 3090 setup, almost all of the time is spent in cudaLaunchKernel (approximately 25% CPU utilization). Do you know anything about this?
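For reference, a rough microbenchmark like the following (the matrix size and iteration count are arbitrary) could be used to check whether the two GPUs differ at all once they are actually saturated:

# Rough microbenchmark sketch: time a batch of large matmuls on the GPU.
# torch.cuda.synchronize() is required, otherwise only the asynchronous
# kernel launches are timed rather than the kernels themselves.
import time
import torch

def matmul_benchmark(size=4096, iters=50):
    a = torch.randn(size, size, device='cuda')
    b = torch.randn(size, size, device='cuda')
    # Warm-up so one-time CUDA initialization is not measured
    for _ in range(5):
        torch.mm(a, b)
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(iters):
        torch.mm(a, b)
    torch.cuda.synchronize()
    print(f"{iters} matmuls of size {size}: {time.time() - start:.3f} s")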