Only a few steps into the forward propagation of my LSTM, I received this error:
THCudaCheck FAIL file=/home/soumith/local/builder/wheel/pytorch-src/torch/lib/THC/generic/THCStorage.cu line=66 error=2 : out of memory
Traceback (most recent call last):
File "external_script.py", line 1002, in <module>
final_loss = run()
File "external_script.py", line 584, in run
optimizer_iter_num, feature, x_dim)
File "/home/ubuntu/lstm_special/rnn.py", line 74, in lstm_forward
output, hn = self.net.rnn(input, (self.h, self.c))
File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 210, in __call__
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/rnn.py", line 79, in forward
return func(input, self.all_weights, hx)
File "/usr/local/lib/python2.7/dist-packages/torch/nn/_functions/rnn.py", line 228, in forward
return func(input, *fargs, **fkwargs)
File "/usr/local/lib/python2.7/dist-packages/torch/autograd/function.py", line 202, in _do_forward
flat_output = super(NestedIOFunction, self)._do_forward(*flat_input)
File "/usr/local/lib/python2.7/dist-packages/torch/autograd/function.py", line 218, in forward
result = self.forward_extended(*nested_tensors)
File "/usr/local/lib/python2.7/dist-packages/torch/nn/_functions/rnn.py", line 180, in forward_extended
cudnn.rnn.forward(self, input, hx, weight, output, hy)
File "/usr/local/lib/python2.7/dist-packages/torch/backends/cudnn/rnn.py", line 257, in forward
fn.workspace = torch.cuda.ByteTensor(workspace_size.value)
RuntimeError: cuda runtime error (2) : out of memory at /home/soumith/local/builder/wheel/pytorch-src/torch/lib/THC/generic/THCStorage.cu:66
I have a 12 GB NVIDIA Tesla K80 GPU, so memory should not be an issue; I believe it has something to do with how the variables are handled.
Checking that CUDA is working:
>>> torch.cuda.is_available()
True
>>> torch.cuda.current_stream()
<torch.cuda.Stream device=0 cuda_stream=0x0>
>>> torch.cuda.device_count()
1L
NVIDIA SMI
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 367.57 Driver Version: 367.57 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla K80 Off | 0000:00:1E.0 Off | 0 |
| N/A 51C P8 27W / 149W | 2MiB / 11439MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
Script
from torch.autograd import Variable
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib as mpl  # NOTE(review): mpl and np are unused in the code shown
import numpy as np
# Fix the RNG seed so weight initialization and the random h0/c0 states
# below are reproducible across runs.
torch.manual_seed(0)
class Net(nn.Module):
    """Thin ``nn.Module`` wrapper that exposes a multi-layer LSTM as ``self.rnn``.

    All constructor arguments are forwarded verbatim to ``nn.LSTM``.
    """

    def __init__(self, input_size, hidden_size, num_layers, bias, dropout):
        super(Net, self).__init__()
        # Collect the LSTM configuration once, then unpack it into the module.
        lstm_config = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bias=bias,
            dropout=dropout,
        )
        self.rnn = nn.LSTM(**lstm_config)
class lstmModule:
    """Stateful driver around an LSTM for step-by-step forward calls.

    Holds the network, the Adam optimizer, and the running hidden/cell
    state between calls to ``lstm_forward``; ``lstm_update`` accumulates a
    pseudo-loss per hidden unit and performs one optimizer step.

    NOTE(review): the likely cause of the reported CUDA out-of-memory error
    is that ``self.h``/``self.c`` and ``self.output_all`` are carried across
    calls *without* being detached from the autograd graph. Every call to
    ``lstm_forward`` therefore extends one ever-growing computation graph
    that is kept alive on the GPU, so memory usage grows with the number of
    forward steps. Detaching the carried state (e.g. via ``.detach()`` or,
    in this PyTorch version, repackaging the ``.data`` into new Variables)
    between updates would bound the graph — TODO confirm against the
    intended backpropagation-through-time horizon.
    """

    def __init__(self, input_size, hidden_size, num_layers, bias, dropout,
                 seq_len, batch_size, meta_lr, n_meta_iter):
        # Hyper-parameters. NOTE(review): seq_len and n_meta_iter are stored
        # but never read elsewhere in this class as shown.
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bias = bias
        self.dropout = dropout
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.meta_lr = meta_lr
        self.n_meta_iter = n_meta_iter
        self.net = Net(input_size=input_size,
                       hidden_size=hidden_size,
                       num_layers=num_layers,
                       bias=bias,
                       dropout=dropout)
        # Move all LSTM parameters to the GPU.
        self.net.cuda()
        # Initial hidden/cell states, shape (num_layers, batch, hidden_size).
        # NOTE(review): ``Variable(...).cuda()`` makes the *result* a non-leaf
        # GPU Variable; harmless here since these are not optimized.
        self.h0 = Variable(torch.randn(self.num_layers,
                                       self.batch_size,
                                       self.hidden_size)).cuda()
        self.c0 = Variable(torch.randn(self.num_layers,
                                       self.batch_size,
                                       self.hidden_size)).cuda()
        self.optimizer = optim.Adam(self.net.rnn.parameters(), lr=self.meta_lr)
        # Running loss histories for the first two hidden units (see
        # lstm_update); hidden_size must therefore be >= 2.
        self.loss_lst = []
        self.loss_lst2 = []

    def lstm_forward(self, seq_num, inp, x_dim):
        # Flatten the 2-D numpy input into a single (1, 1, features) step.
        # NOTE(review): x_dim is accepted but unused in the code shown.
        inp = inp.reshape(1, 1, inp.shape[0]*inp.shape[1])
        inp = torch.from_numpy(inp).float()
        input = Variable(inp).cuda()
        if seq_num == 0:
            # Start of a sequence: clear gradients and per-unit loss
            # accumulators, and run from the fixed initial state (h0, c0).
            # Ensure clear gradient buffer
            self.optimizer.zero_grad()
            self.loss_tot = [0 for i in range(self.hidden_size)]
            # LSTM
            output, hn = self.net.rnn(input, (self.h0, self.c0))
            output = torch.abs(2 * output)
            # One (seq, batch) slice per hidden unit.
            op = [output[:, :, i] for i in range(self.hidden_size)]
            self.output_all = op
            # Carried state still holds autograd history — see class note.
            self.h, self.c = hn
            return output.cpu().data.numpy()
        else:
            # Continue the sequence from the carried (h, c) state.
            output, hn = self.net.rnn(input, (self.h, self.c))
            output = torch.abs(2 * output)
            op = [output[:, :, i] for i in range(self.hidden_size)]
            self.h, self.c = hn
            # Concatenating graph-connected Variables across steps keeps the
            # whole history alive on the GPU — see class note on OOM.
            self.output_all = [torch.cat((self.output_all[i], op[i]), 0) for i in range(self.hidden_size)]
            return output.cpu().data.numpy()

    def lstm_update(self, lab):
        def pseudo_loss(output, label):
            # Mean of the elementwise product of outputs and labels.
            # print('output size', output.size())
            # print('label size', label.size())
            # return torch.mean(torch.abs(output*label))
            return torch.mean(output * label)
        lab = torch.from_numpy(lab).float()
        self.label = Variable(lab).cuda()
        # Get loss: add this step's pseudo-loss for every hidden unit.
        self.loss_tot = [
            self.loss_tot[i] + pseudo_loss(self.output_all[i], self.label[:, i]) for i in range(self.hidden_size)]
        # Append loss histories for units 0 and 1 (requires hidden_size >= 2).
        self.loss_lst.append(self.loss_tot[0].cpu().data.numpy()[0])
        self.loss_lst2.append(self.loss_tot[1].cpu().data.numpy()[0])
        # Backprop through the summed per-unit losses.
        # NOTE(review): backward() frees the graph; a second lstm_update on
        # the same carried state would presumably fail — TODO confirm.
        sum(self.loss_tot).backward()
        # Update optimizer
        self.optimizer.step()
        return self.loss_lst, self.loss_lst2
I have an external script that calls `lstm_forward`, feeding it a tensor of size (1, 1, 7520).
This out-of-memory error should not occur (it did not with the CPU build of PyTorch), and the forward pass itself is really fast.