LSTM and RNN causing CPU spike

I new to Pytorch - so please excuse my tender knowledge :grinning:

The issue I have is with a LSTM model - I am using a GPU (Nvidia 1060 6GB) - and all models are tasked to use the GPU

However - while running a LSTM or a RNN model the CPU usage jumps to 50+% and stays there for the duration of the epochs.

This is not the case with GRU or CNN models (CPU steady at 10-12%)

I have checked all the steps and all are running on the GPU (bar the optimiser.step()

It it normal for LSTM and RNN models to cause a CPU spike like this?

I think (pretty sure) the CPU spike is caused by the optimizer.step() call.

Thanks in advance.

Are you using any other CPU operations with your LSTM model (accuracy calculation, logging etc.)?
Also, are you using the same setup for your DataLoader?

Hi Ptrblck

Thanks for replying.

I have included the code I am using below to assist in identifying issue. This code is generic to all 4 models I am running (CNN, RNN, LSTM and GRU). The problem with the CPU only occurs on the RNN and LSTM models.

PS I am using the UNSW-NB15 dataset.

Thanks in advance for your help.

Gerry
#######################################################################
###Code Listing#####################

import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from torchvision import datasets,transforms
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset

import torch.optim as optim

from timeit import default_timer as timer
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
import torch.nn.functional as F

import dataModels as dm
import utilities as ut
import os
from datetime import datetime
from collections import OrderedDict
from collections import namedtuple
from itertools import product

class RunBuilder():
@staticmethod
def get_runs(params):

    Run = namedtuple('Run', params.keys())

    runs = []
    for v in product(*params.values()):
        runs.append(Run(*v))

    return runs

#load training data
def loadTrainingData(dataSet,batch_size):

try:
    np.set_printoptions(threshold=np.inf)

    if dataSet==1:
        
        fileName="c:\data\Train15All.csv"
    #elif dataSet==2:
        
    #    fileName="c:\data\B1520000.csv"
    #elif dataSet==3:
        
    #    fileName="c:\data\B1530000.csv"
    #elif dataSet==4:
        
    #    fileName="c:\data\B1540000.csv"
    #else:
    #    fileName="c:\data\B15train2.csv"
    
        
    dataImported=pd.read_csv(fileName) 

    
        
    x=dataImported.to_numpy()

    #balance dataset for processing
    if len(x)>10000:
        x=x[0:82000]

    #print(len(x))
    #normalise data to range
    #b = x / np.linalg.norm(x, ord=1)
   
    #l2=Euclidean norm
    ##l2-normalization, “unit norm” essentially means that if we squared each element in the vector, and summed them, it would equal 1.
    # https://kawahara.ca/how-to-normalize-vectors-to-unit-norm-in-python/
    b = preprocessing.normalize(x, norm='l2')
    
    #verification of normalisation
    #X_squared = b ** 2
    ##print(X_squared)
    #X_sum_squared = np.sum(X_squared, axis=1)
    #print(X_sum_squared)
    #print(X_sum_squared.size)


    #extract dataset first 42 columns
    data=b[:,0:41]
   
    #get data
    d=data

   

    #extract labels set
    labels = b[:, [42]]

     #classify labels
    l=np.ceil(labels)

   
    #convert to tensor
    allDataTensor = torch.from_numpy(d).float()

    #rint(allDataTensor)

    allLabelsTensor = torch.from_numpy(l).long()
    allLabelsTensor=allLabelsTensor.squeeze(1)
    
    #print(allLabelsTensor)

    #calculate number unique elements in dataset for future embedding layer vector generation
    uniqueElements=np.unique(d)

   
    combinedDataLabelTensor = TensorDataset(allDataTensor, allLabelsTensor)

    train_loader = DataLoader(combinedDataLabelTensor, shuffle=True, batch_size=batch_size)

    return train_loader


except Exception as e:
    print("Error Generating Data Sets - generateDataSets module\n\n" + str(e))

#run model
def runModel(model,train_loader,num_epochs,lr2,optim,momentum2,modelNumber,batch_size):

#declare loss function
criterion = nn.CrossEntropyLoss()


#set optopmiser
#optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
if optim==1:
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr2)
    
else:
    optimizer = torch.optim.SGD(model.parameters(), lr=lr2, momentum=momentum2)
   

iter = 0
r_total=0

start = timer()
optomiserSteps=0

for epoch in range(num_epochs):

    total=0
    correct=0
    accuracy=0

    #adustment for GRU model
    if modelNumber==3:               
       h = model.init_hidden(batch_size)
       h=h.to(device)


    for i, (data, labels) in enumerate(train_loader):

        #gpu assign data
        data = data.requires_grad_().to(device)
        labels = labels.to(device)

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        #outputs = model(images)

        #determine if standard or GRU model
        if modelNumber !=3:
            outputs = model(data) 
        else:
            #initialisation for GRU model
            h = h.data
            # forward pass
            outputs, h = model(data, h) 


        #adjust label for classification
        #labels=labels.squeeze(1)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()


        #optomiserSteps+=1
        #print("O Steps = ",optomiserSteps)
        
        iter += 1

      
        # Get predictions from the maximum value
        _, predicted = torch.max(outputs.data, 1)

        # Total number of labels
        total += labels.size(0)
        
        
        # Total correct predictions
        if torch.cuda.is_available():
            correct += (predicted.cpu() == labels.cpu()).sum()
        else:
            correct += (predicted == labels).sum()

        
        #correct += (predicted.to(device) == labels).sum()

    accuracy = 100 * correct.item() / total
    r_total+=total

    # Print Loss
    print('Epoch\t%d\tNIP:\t%d\tLoss:\t%.3f\tAccuracy:\t%.2f\t%d\tsamples' % (epoch+1,r_total,loss.item(), accuracy,total))

end=timer()      

print('\n\nModel Training Time (secs): %d' % (end - start))

#define model and assign to gpu
def defineModel(modelNo):

#model params
input_dim = 41 
hidden_dim = 100
output_dim = 2
layer_dim = 2

if modelNo==1:
    model = dm.CNNModel()
elif modelNo==2:
    model = dm.LSTMModel(input_dim,hidden_dim,layer_dim,output_dim)
elif modelNo==3:
    model = dm.GRUModel(input_dim,hidden_dim,output_dim,layer_dim)
else:
    model = dm.RNNModel(input_dim,hidden_dim,layer_dim,output_dim)

return model

def setDevice():

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

return device

params = OrderedDict(
ep =[40], #number epochs
ml =[2], #modelNumber=2 #1=CNN 2=LSTM 3=GRU 4=RNN
lr =[0.00146],
mm =[0.2,0.3,0.4,0.5] #momentum

)
#lr=0.00146 #0.005 and 0.00146 and 0.001 https://medium.com/octavian-ai/which-optimizer-and-learning-rate-should-i-use-for-deep-learning-5acb418f9b2

#main
try:

#optomise cuda
torch.backends.cudnn.benchmark = True

#path to save model train/validate/test results to
filename = os.path.join(os.environ['USERPROFILE'],"Desktop\PytorchModel\TestResults-%s.txt" % (datetime.now().strftime("%Y%m%d-%H%M%S")))

classes=('OK','Intrusion')

#model hyperparamaters
batch_size = 1000

#epochs=30
optimizer=1  #1=Adam 2= SGD
trainingDataset=1    #1=10000 2=20000 3= 30000 4=40000 5=82000
momentum=0.3

#build run parameters
runs = RunBuilder.get_runs(params)        
#print(runs)


#load dataset
train_loader=loadTrainingData(trainingDataset,batch_size)

#start writing to file
ut.startWritingFile(filename)

#print date
todaysDate=datetime.now().strftime("%b %d %Y %H:%M:%S")
    

for run in runs:

    ########### MODEL SETUP #########################################################
    model=defineModel(run.ml)      
    device=setDevice()
    model.to(device)
    #######################################################################

   
    print()
    if run.ml==1:
        print("CNN Model -  Adam: "+str(run) +" "+str(todaysDate))
    elif run.ml==2:
        print("LSTM Model - Adam: "+str(run) +" "+str(todaysDate))
    elif run.ml==3:
        print("GRU Model - Adam: "+str(run) +" "+str(todaysDate))
    else:
        print("RNN Model -  Adam: "+str(run) +" "+str(todaysDate))        
    print()

    #run this model
    runModel(model,train_loader,run.ep,run.lr,optimizer,run.mm,run.ml,batch_size)
    print()
    print()

      

ut.stopWritingFile()

except Exception as e :
print(“Error Training Network - Main Module code\n\n” + str(e))

###################################################################
####MODELS#####################################

#training template model
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import os

class CNNModel(nn.Module):
def init(self):
super(CNNModel, self).init()

    # Convolution 1
    self.cnn1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=1, stride=1, padding=0)
    self.relu1 = nn.ReLU()

    # Max pool 1
    self.maxpool1 = nn.MaxPool2d(kernel_size=2)

    # Convolution 2
    self.cnn2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=1, stride=1, padding=0)
    self.relu2 = nn.ReLU()

    # Max pool 2
    self.maxpool2 = nn.MaxPool2d(kernel_size=2)

    # Fully connected 1 (readout)
    #self.fc1 = nn.Linear(1312, 656) 
    #self.fc2 = nn.Linear(656, 328) 
    #self.fc3 = nn.Linear(328, 2) 
    #2 conv layers
    self.fc1=nn.Linear(1312,2) 
    #self.fc1=nn.Linear(1312,656)
    #self.fc2=nn.Linear(656,328)
    #self.fc3=nn.Linear(328,2)

def forward(self, x):
    # Convolution 1
    #out = self.cnn1(x)
    #out = self.relu1(out)

    ## Max pool 1
    #out = self.maxpool1(out)
    x=x.unsqueeze(1)
    x=x.unsqueeze(1)

    out=(self.relu1(self.cnn1(x)))
    # Convolution 2 
    #out = self.cnn2(out)
    #out = self.relu2(out)

    ## Max pool 2 
    #out = self.maxpool2(out)

    out=(self.relu2(self.cnn2(out)))

    # Resize
    # Original size: (100, 32, 7, 7)
    # out.size(0): 100
    # New out size: (100, 32*7*7)
    out = out.view(out.size(0), -1)

    # Linear function (readout)
    #out = F.relu(self.fc1(out))
    #out = F.relu(self.fc2(out))
    #out = self.fc3(out)

    out=self.fc1(out)

    return out

class RNNModel(nn.Module):
def init(self, input_dim, hidden_dim, layer_dim, output_dim):
super(RNNModel, self).init()

    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 

    # Hidden dimensions
    self.hidden_dim = hidden_dim

    # Number of hidden layers
    self.layer_dim = layer_dim

    # Building your RNN
    # batch_first=True causes input/output tensors to be of shape
    # (batch_dim, seq_dim, feature_dim)
    self.rnn = nn.RNN(input_dim, hidden_dim, layer_dim, batch_first=True, nonlinearity='tanh')

    # Readout layer
    self.fc = nn.Linear(hidden_dim, output_dim)

def forward(self, x):
    # Initialize hidden state with zeros
    #######################
    #  USE GPU FOR MODEL  #
    #######################
    x = x.unsqueeze(1)

    h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).to(self.device)

    # One time step
    # We need to detach the hidden state to prevent exploding/vanishing gradients
    # This is part of truncated backpropagation through time (BPTT)
    out, hn = self.rnn(x, h0.detach())

    # Index hidden state of last time step
    # out.size() --> 100, 28, 100
    # out[:, -1, :] --> 100, 100 --> just want last time step hidden states! 
    out = self.fc(out[:, -1, :]) 
    # out.size() --> 100, 10
    return out

class LSTMModel(nn.Module):
def init(self, input_dim, hidden_dim, layer_dim, output_dim):
super(LSTMModel, self).init()

    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 

    #self.embed = nn.Embedding(256,8)

    # Number of hidden layers
    self.hidden_dim = hidden_dim
   
    self.layer_dim = layer_dim
    self.input_dim = input_dim
    
    # Building your LSTM
    # batch_first=True causes input/output tensors to be of shape
    #[65536 100 3]
    self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.layer_dim, batch_first=True)
    

    # Readout layer
    self.fc = nn.Linear(hidden_dim, output_dim)

    i=0
def forward(self, x):

    #shape data with embedding layer and reshape
    #x = self.embed(x)

    #[1,1,65536 (8192 - data input *8 from embedding layer)]
    x = x.reshape(x.size(0),1,-1)
  
    # Initialize hidden state with zeros
    #[1,1,100]
    h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_().to(self.device)

    #[1,1,100]
    # Initialize cell state
    c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_().to(self.device)
    
    # One time step
    # We need to detach as we are doing truncated backpropagation through
    # time (BPTT) # If we don't, we'll backprop all the way to the start even after going
    # through another batch
    out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))

    # Index hidden state of last time step
    # out.size() --> [1,2]

    out = self.fc(out[:, -1, :])

    return out

class GRUModel(nn.Module):
def init(self, input_dim, hidden_dim, output_dim, n_layers, drop_prob=0.2):
super(GRUModel, self).init()

    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 

    self.hidden_dim = hidden_dim
    self.n_layers = n_layers
    #self.embed = nn.Embedding(256,8)
    self.gru = nn.GRU(input_dim, hidden_dim, n_layers, batch_first=True, dropout=drop_prob)
    self.fc = nn.Linear(hidden_dim, output_dim)
    self.relu = nn.ReLU()
    
    
def forward(self, x, h):
    #x = self.embed(x)
    #x=x.unsqueeze(0)
    x=x.reshape(x.size(0),1,-1)
    out, h = self.gru(x, h)
    out = self.fc(self.relu(out[:,-1]))
    return out, h

def init_hidden(self, batch_size):
    weight = next(self.parameters()).data
    hidden = weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(self.device)
    return hidden

Could you remove the complete data loading and use dummy input data for each model?
Also, which shapes are used for which model?

Patrick

As I understand it (newbie) the data loading is the same for all models – so if it is not an issue for GRU and CNN models it should not be impacting RNN /LSTM models.

The shapes for the LSTM model is as follows during each stage in the model in the forward code are shown below

def forward(self, x):

    #shape data with embedding layer and reshape

    #x = self.embed(x)

    #[1,1,65536 (8192 - data input *8 from embedding layer)]

    x = x.reshape(x.size(0),1,-1)

    # Initialize hidden state with zeros

    #[1,1,100]

    h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_().to(self.device)

    #[1,1,100]

    # Initialize cell state

    c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_().to(self.device)

    # One time step

    # We need to detach as we are doing truncated backpropagation through

    # time (BPTT) # If we don't, we'll backprop all the way to the start even after going

    # through another batch

    out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))

    # Index hidden state of last time step

    # out.size() --> [1,2]

    out = self.fc(out[:, -1, :])

    return out

Model Shapes – step by step

x ->[1000,41] (representing batch size 1000 and 41 distinct values)

x = x.reshape(x.size(0),1,-1)->[1000,1,41]

h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_().to(self.device)->[2,1000,100]

c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_().to(self.device)->[2,1000,100]

out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))->[1000,1,100]

out = self.fc(out[:, -1, :])->[1000,2]

Thank you again.

Gerry

Yes, the loading itself should be equal. However I wanted to check, if you are applying any preprocessing on the input data, which might be different for the models.

Patrick

All pre processing operations for all models are identical…as shown in original code posted.

I have also been investigating and when you halve the sample size to 40000, the CPU load on the LSTM and RNN models drops to 30%…so there is a relationship between sample size and CPU load for these LSTM and RNN…but not for the CNN or GRU models. Strange.

Thanks.

Gerry