Hi, is there somebody who can help me?
I have two GPU workstations with the same CPU. One has a GTX 1660 Super, and the other has an RTX 3090. The problem is that the processing speed of the two workstations is almost equal. Everything is shown in the code below; where is the problem? Please help me!
from __future__ import print_function
import argparse
import torch
import torch.utils.data
from torch import nn, optim
from torch.nn import functional as F
import torch.autograd.profiler as profiler
import pandas as pd
import numpy as np
from tqdm import tqdm
import random
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import matplotlib
from matplotlib import pyplot as plt
import os
# Directory the script was launched from.
PATH = os.getcwd()

# Command-line interface: every hyper-parameter below can be overridden
# from the terminal.
parser = argparse.ArgumentParser(
    description='AE-based Inverse Rational Control.Stochastic.ver',
)
parser.add_argument(
    '--batch-size',
    type=int,
    default=128,
    metavar='N',
    help='input batch size for training (default: 128)',
)
parser.add_argument(
    '--epochs',
    type=int,
    default=10,
    metavar='N',
    help='number of epochs to train (default: 10)',
)
parser.add_argument(
    '--no-cuda',
    action='store_true',
    default=False,
    help='disables CUDA training',
)
parser.add_argument(
    '--seed',
    type=int,
    default=1,
    metavar='S',
    help='random seed (default: 1)',
)
parser.add_argument(
    '--log-interval',
    type=int,
    default=10,
    metavar='N',
    help='how many batches to wait before logging training status',
)
args = parser.parse_args()

# CUDA is requested only when the user did not opt out; availability is
# probed only in that case.
args.cuda = False if args.no_cuda else torch.cuda.is_available()
# Dataset
DrivingData = pd.read_csv("../Trajectory_Dataset/Process_data2.csv")

# Dataset parameter
Seq_length = 10  # number of consecutive rows in one training window
tmpX_array = []
tmpY_array = []

# The table is consumed in consecutive chunks of 3000 rows; rows left over
# after the last full chunk are ignored.  Windows never cross a chunk border.
for k in range(int(len(DrivingData) / 3000)):
    tmp_df = DrivingData[k * 3000:k * 3000 + 3000]
    tmp_X = tmp_df.drop(columns=['Unnamed: 0', 'idx', 's15', 's16', 's17'])
    # NOTE(review): positional columns 16..18 are presumably the s15/s16/s17
    # columns dropped from the inputs above -- verify against the CSV header.
    tmp_Y = tmp_df.iloc[:, 16:19]

    # Convert each chunk to NumPy once instead of materialising a DataFrame
    # slice (and a throw-away array for the length check) per window.
    x_values = tmp_X.to_numpy()
    y_values = tmp_Y.to_numpy()

    # Stride-1 sliding windows; only start positions that yield a full
    # Seq_length rows are generated, so no per-window length test is needed.
    for i in range(len(tmp_df) - Seq_length + 1):
        tmpX_array.append(x_values[i:i + Seq_length])
        tmpY_array.append(y_values[i:i + Seq_length])

bs = 64

# Stack the windows into one contiguous ndarray before handing them to
# torch: building a tensor from a Python list of ndarrays is very slow.
# NOTE(review): the tensors are placed on CUDA unconditionally, so the
# --no-cuda flag is not honoured here; the model is also created with
# .cuda(), so both must be changed together.
X_TrainLoader = torch.tensor(np.asarray(tmpX_array, dtype=np.float32),
                             dtype=torch.float32, device=torch.device('cuda'))
Y_TrainLoader = torch.tensor(np.asarray(tmpY_array, dtype=np.float32),
                             dtype=torch.float32, device=torch.device('cuda'))
XY_LOADER = TensorDataset(X_TrainLoader, Y_TrainLoader)
TrainLoader = torch.utils.data.DataLoader(dataset=XY_LOADER, batch_size=bs, shuffle=True)
# X_reshpae = torch.swapaxes(X_TrainLoader, 0, 1)
# Y_reshpae = torch.swapaxes(Y_TrainLoader, 0, 1)
# X_TrainLoader, X_TestLoader = train_test_split(X_TrainLoader, test_size=0.2)
# Seed every random number generator in use so runs are repeatable.
random.seed(args.seed)
np.random.seed(args.seed)
# The variable name is PYTHONHASHSEED.  Python reads it at interpreter
# start-up, so setting it here only affects child processes started later,
# not the hashing of the current process.
os.environ["PYTHONHASHSEED"] = str(args.seed)
torch.manual_seed(args.seed)

# Device selected from the command-line flags and hardware availability.
device = torch.device("cuda" if args.cuda else "cpu")
print(device)

# DataLoader options intended for CUDA runs.
kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}

loss_list = []   # per-batch training losses collected by train()
lr = 5e-03       # optimiser learning rate
num_obs = 3
batch_size = 64
class LSTMIRC(nn.Module):
    """Two stacked multi-layer LSTMs followed by a linear read-out.

    The input sequence is passed through ``lstm1``, its output sequence
    through ``lstm2``, and every time step of the result is projected to
    ``output_size`` values and passed through a ReLU.

    Args:
        input_size1: Feature dimension of the input sequence.
        hidden_size1: Hidden size of the first LSTM.
        input_size2: Input size of the second LSTM; it receives the output
            of the first LSTM, so it has to equal ``hidden_size1``.
        hidden_size2: Hidden size of the second LSTM.
        output_size: Number of values predicted per time step.
        num_layers: Number of stacked layers inside each LSTM.
    """

    # Load Parameters of Actor
    # weights = np.load('../ModelParameter.csv.npy', allow_pickle=True)

    def __init__(self,
                 input_size1=16,
                 hidden_size1=16,
                 input_size2=16,
                 hidden_size2=16,
                 output_size=3,
                 num_layers=10):
        super().__init__()
        self.input_size1 = input_size1
        self.hidden_size1 = hidden_size1
        self.input_size2 = input_size2
        self.hidden_size2 = hidden_size2
        self.output_size = output_size
        self.num_layers = num_layers
        self.lstm1 = nn.LSTM(self.input_size1, self.hidden_size1, self.num_layers, batch_first=True,
                             dropout=0.1, bidirectional=False)
        self.lstm2 = nn.LSTM(self.input_size2, self.hidden_size2, self.num_layers, batch_first=True,
                             dropout=0.1, bidirectional=False)
        self.fc = nn.Linear(self.hidden_size2, self.output_size)

    def forward(self, x):
        """Predict ``output_size`` non-negative values for every time step.

        Args:
            x: Tensor of shape (batch size, length of sequence, input_size1).

        Returns:
            Tensor of shape (batch size, length of sequence, output_size).
        """
        # input x.shape == (batch size, length of sequence, feature dimension)
        # input of LSTM2 == LSTM1 output.shape == (batch size, length of sequence, LSTM1_hidden_size)
        # input of FCnet == LSTM2 output.shape == (batch size, length of sequence, LSTM2_hidden_size)
        # output of FCnet == (batch size, length of sequence, output_size)
        with profiler.record_function("Forward"):
            # nn.LSTM starts from zero hidden and cell states when none are
            # passed, on the input's device and dtype.  That replaces four
            # explicit zero tensors (with gradient tracking) per call and
            # keeps the module usable after .double()/.half().
            output, _ = self.lstm1(x)
            output, _ = self.lstm2(output)
            # decode
            pred = F.relu(self.fc(output))
        return pred
# Build the network with its default sizes and move its parameters to the
# current CUDA device (Module.cuda() works in place and returns the module).
model = LSTMIRC()
model.cuda()

# Adam over all model parameters, using the module-level learning rate.
opt = optim.Adam(params=model.parameters(), lr=lr, eps=1e-08)
def loss_function(rc_x, x):
    """Return the mean squared error between ``rc_x`` and ``x``.

    Args:
        rc_x: Predicted (reconstructed) tensor.
        x: Reference tensor to compare against.

    Returns:
        Scalar tensor: squared differences averaged over all elements.
    """
    # Reconstruction error: MSE
    return F.mse_loss(input=rc_x, target=x, reduction='mean')
def train(epoch):
    """Run one optimisation pass over ``TrainLoader`` and report the mean loss.

    For every mini-batch the gradients are reset, the model prediction is
    scored with ``loss_function``, the loss is back-propagated and the
    optimiser takes one step.  Each batch loss is appended to the
    module-level ``loss_list`` as a plain Python float.

    Args:
        epoch: Epoch number, used only in the printed summary line.
    """
    model.train()
    train_loss = 0.0
    for data, target in tqdm(TrainLoader):
        # Initialize the gradients calculated previous step; Clears old parameters
        opt.zero_grad()
        Pred = model(data)
        loss = loss_function(Pred, target)
        # Backpropagation.  The graph is traversed exactly once per batch,
        # so it is not retained: retain_graph=True would only keep the
        # intermediate buffers alive for longer.
        loss.backward()
        # Optimizer and Scheduler
        opt.step()
        # sched.step()

        # Read the scalar once and store a float: a list of device tensors
        # cannot be converted by matplotlib when they live on the GPU.
        batch_loss = loss.item()
        loss_list.append(batch_loss)
        train_loss += batch_loss

    # Average over the number of batches actually processed; the guard
    # avoids a division by zero when the loader yields nothing.
    num_batches = max(len(TrainLoader), 1)
    print('====> Epoch: {} Average loss: {:.4f}'.format(
        epoch, train_loss / num_batches))
if __name__ == "__main__":
    # print(device)
    # Train for the requested number of epochs (1-based epoch numbers).
    for epoch in range(1, args.epochs + 1):
        train(epoch)

    # Join with a path separator so the weights land inside PATH; plain
    # string concatenation produced "<PATH>model.pt" next to the directory.
    torch.save(model.state_dict(), os.path.join(PATH, 'model.pt'))

    # float() accepts both Python numbers and 0-dim tensors on any device,
    # so the curve can be drawn whatever was stored in loss_list.
    plt.plot([float(value) for value in loss_list])
    plt.xlabel('number of the sample')
    plt.ylabel('loss')
    plt.show()