I am running Python code on a Jupyter server on an AWS EC2 instance of type g5.4xlarge
with 24 GB of GPU memory and a Linux Amazon Machine Image (AMI). I only have 1 Jupyter notebook kernel. I am using Pytorch version torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl
. My dataset shape right now is tiny: (43615, 28). I’ve checked and found that this dataset occupies the following amount of space:
memory_usage = stack_demo.memory_usage(deep=True) # get memory usage of each column in bytes
memory_usage_mb = memory_usage / 1024 / 1024 # convert bytes to megabytes for each column
total_memory_usage = memory_usage.sum() / 1024 / 1024 # convert total memory usage to megabytes
print(f"Memory usage per column (in MB):\n{memory_usage_mb}")
print(f"Total memory usage (in MB): {total_memory_usage:.2f} MB")
which gives Total memory usage (in MB): 8.69 MB
and about 0.166378
to 0.332756
MB in usage per column. I originally had a version of this code adapted to tensorflow and had similar memory issues but read that PyTorch is generally easier to debug. I’ve spent a couple weeks researching but finding no solution that has worked. FYI I adjusted batch_size = 1 (in my Tensorflow version) but that did not resolve things. Any ideas on things to check or try would be appreciated.
The error is as follows:
---------------------------------------------------------------------------
OutOfMemoryError Traceback (most recent call last)
Cell In[168], line 98
96 model.train()
97 optimizer.zero_grad()
---> 98 outputs = model(X_train)
99 loss = criterion(outputs, y_train)
100 loss.backward()
File /opt/tensorflow/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
Cell In[168], line 84, in LSTMModel.forward(self, x)
83 def forward(self, x):
---> 84 out, _ = self.lstm(x)
85 out = self.dropout(out)
86 out = self.linear(out[:, -1, :]) # Use only the last output in the sequence
File /opt/tensorflow/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /opt/tensorflow/lib/python3.10/site-packages/torch/nn/modules/rnn.py:812, in LSTM.forward(self, input, hx)
810 self.check_forward_args(input, hx, batch_sizes)
811 if batch_sizes is None:
--> 812 result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,
813 self.dropout, self.training, self.bidirectional, self.batch_first)
814 else:
815 result = _VF.lstm(input, batch_sizes, hx, self._flat_weights, self.bias,
816 self.num_layers, self.dropout, self.training, self.bidirectional)
OutOfMemoryError: CUDA out of memory. Tried to allocate 142.00 MiB (GPU 0; 22.04 GiB total capacity; 420.19 MiB already allocated; 119.19 MiB free; 482.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
Selection deleted
My code is as follows:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold
from torch.optim import Adam
from torch.nn import HuberLoss # formerly, "from torch.nn.modules.loss import Huber"
import pandas as pd
scaler_price = MinMaxScaler(feature_range=(0, 1))
scaler_features = MinMaxScaler(feature_range=(0, 1))
scaled_price = scaler_price.fit_transform(stack_demo[['col25']])
scaled_features = scaler_features.fit_transform(stack_demo[[
'col2', 'col3','col4','col5','col6','col7','col8','col9','col10','col11','col12','col13','col14',
'col15','col16','col17','col18','col19','col20','col21','col22','col23','col24'
]])
scaled_data = np.concatenate([scaled_price, scaled_features], axis=1)
n_epochs = 1
n_batch_size = 1 # smaller batch = slower training but better generalization
def create_dataset(dataset, look_back=1):
dataX, dataY = [], []
for i in range(len(dataset) - look_back - 1):
a = dataset[i:(i + look_back), :]
dataX.append(a)
dataY.append(dataset[i + look_back, 0]) # Target is the 'PRICE'
return np.array(dataX), np.array(dataY)
look_back = 24
X, y = create_dataset(scaled_data, look_back)
k = 10 # number of folds
random_seed = 13 # set the random seed value
kf = KFold(n_splits=k, shuffle=True, random_state=random_seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_scores = []
test_scores = []
for train_index, test_index in kf.split(X):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train = torch.tensor(y_train, dtype=torch.float32).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test = torch.tensor(y_test, dtype=torch.float32).to(device)
class LSTMModel(nn.Module):
def __init__(self, input_size, hidden_size, num_layers):
super(LSTMModel, self).__init__()
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
self.dropout = nn.Dropout(0.2)
self.linear = nn.Linear(hidden_size, 1)
def forward(self, x):
out, _ = self.lstm(x)
out = self.dropout(out)
out = self.linear(out[:, -1, :]) # Use only the last output in the sequence
return out
model = LSTMModel(input_size = X_train.shape[2],
hidden_size = 1,
num_layers = 1).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())
for epoch in range(n_epochs):
model.train()
optimizer.zero_grad()
outputs = model(X_train)
loss = criterion(outputs, y_train)
loss.backward()
optimizer.step()
model.eval()
train_predict = model(X_train).cpu().detach().numpy()
test_predict = model(X_test).cpu().detach().numpy()
# calculate average scores across folds
avg_train_score = np.mean(train_scores)
avg_test_score = np.mean(test_scores)
print(f'Average Train Score: {avg_train_score:.2f} RMSE')
print(f'Average Test Score: {avg_test_score:.2f} RMSE')