RuntimeError due to tensors being on different devices

device = "cuda" if torch.cuda.is_available() else "cpu"
class LSTM(nn.Module):
    def __init__(self, n_hidden = 128,):
        super(LSTM, self).__init__()
        self.n_hidden = n_hidden
        #self.linearIn = nn.Linear(6, 128)
        self.lstm1 = nn.LSTMCell(1, self.n_hidden,)
        self.lstm2 = nn.LSTMCell(self.n_hidden, self.n_hidden)
        self.linear = nn.Linear(self.n_hidden, 1)
    
    def forward(self, input, future=0):
        outputs = []
        h_t = torch.zeros(input.size(0), 128, dtype=torch.float32)
        c_t = torch.zeros(input.size(0), 128, dtype=torch.float32)
        h_t2 = torch.zeros(input.size(0), 128, dtype=torch.float32)
        c_t2 = torch.zeros(input.size(0), 128, dtype=torch.float32)
        
        for input_t in input.split(1, dim=1):
            h_t, c_t = self.lstm1(input_t, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t)
            outputs += [output]
        for i in range(future):# if we should predict the future
            h_t, c_t = self.lstm1(output, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t)
            outputs += [output]
        outputs = torch.cat(outputs, dim=1)
        return outputs
xy = np.loadtxt('bk18dt.csv', delimiter=',', skiprows=1, usecols=(1,2,3,4,5,6,7),)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_Scaled = scaler.fit_transform(xy)
X = torch.from_numpy(X_Scaled).float().to(device)
for i, val in enumerate(X):
    X = X.to(device)
train_data = X[3:, :-1]
target_data = X[3:, 1:]
test_data = X[:3, :-1]
test_target = X[:3, 1:]
model = LSTM().to(device)

criterion = nn.MSELoss()
optimizer = optim.LBFGS(model.parameters(), lr = 0.03)

for i in range(128):
    print('STEP: ', i)
    def closure():
        optimizer.zero_grad()
        out = model(train_data).to(device)
        loss = criterion(out, target_data)
        print('loss:', loss.item())
        loss.backward()
        return loss
    optimizer.step(closure)
    # begin to predict, no need to track gradient here
    with torch.no_grad():
        future = 1000
        pred = model(test_data, future=future).to(device)
        loss = criterion(pred[:, :-future], test_target)
        print('test loss:', loss.item())
        yPred = pred.detach().numpy()

So this is my code. I want to train the LSTM on the GPU.

Details of my GPU:

nvidia-smi
Tue Mar 16 14:39:52 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.39       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  GeForce RTX 207...  Off  | 00000000:01:00.0  On |                  N/A |
| 39%   40C    P8    18W / 215W |   1996MiB /  7979MiB |      2%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    0   N/A  N/A      1623      G   /usr/lib/xorg/Xorg                 59MiB |
|    0   N/A  N/A      2738      G   /usr/lib/xorg/Xorg                451MiB |
|    0   N/A  N/A      2884      G   /usr/bin/gnome-shell               42MiB |
|    0   N/A  N/A      3958      G   ...AAAAAAAAA= --shared-files      343MiB |
|    0   N/A  N/A      7184      G   /usr/lib/firefox/firefox            2MiB |
|    0   N/A  N/A     13860      C   ...ath/miniconda3/bin/python     1069MiB |
+-----------------------------------------------------------------------------+

It tells me that ‘out’ is on the CPU, even though I have pushed it to the GPU. Any help would be awesome!

Error:

RuntimeError: Tensor for 'out' is on CPU, Tensor for argument #1 'self' is on CPU, but expected them to be on GPU (while checking arguments for addmm)
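
For reference, a quick way to check which device the model and the data actually live on (using the variable names from the code above):

print(next(model.parameters()).device)  # device of the LSTM weights
print(train_data.device)                # device of the training inputs
print(target_data.device)               # device of the targets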

This is happening because you did not put the input data and the hidden and cell states onto the GPU. You need to put both of those on the GPU, and then it should work.
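
A minimal sketch of what I mean, using the names from the code above — create the states on the same device as the input inside forward, and make sure the data itself is on the GPU too:

# inside LSTM.forward: build the states on the same device as the incoming batch,
# so they match the LSTMCell weights after model.to(device)
h_t  = torch.zeros(input.size(0), self.n_hidden, device=input.device)
c_t  = torch.zeros(input.size(0), self.n_hidden, device=input.device)
h_t2 = torch.zeros(input.size(0), self.n_hidden, device=input.device)
c_t2 = torch.zeros(input.size(0), self.n_hidden, device=input.device)

# ...and outside the model, move the data as well
train_data = train_data.to(device)
target_data = target_data.to(device)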

I tried that too! But it’s still the same. And by input data you mean the training samples, correct? If that’s the case, I have added them to the GPU.

Yes, that is what I mean.

Does it give you the same error, or a different one?

Is this the correct way to push to the GPU?

For all the hidden states, right?

Yes, all the hidden and cell states.

It’s the same error.

I did it, but no luck.

Can you send your new code?

device = "cuda" if torch.cuda.is_available() else "cpu"
class LSTM(nn.Module):
    def __init__(self, n_hidden = 128,):
        super(LSTM, self).__init__()
        self.n_hidden = n_hidden
        #self.linearIn = nn.Linear(6, 128)
        self.lstm1 = nn.LSTMCell(1, self.n_hidden,)
        self.lstm2 = nn.LSTMCell(self.n_hidden, self.n_hidden)
        self.linear = nn.Linear(self.n_hidden, 1)
    
    def forward(self, input, future=0):
        outputs = []
        h_t = torch.zeros(input.size(0), 128, dtype=torch.float32).to(device)
        c_t = torch.zeros(input.size(0), 128, dtype=torch.float32).to(device)
        h_t2 = torch.zeros(input.size(0), 128, dtype=torch.float32).to(device)
        c_t2 = torch.zeros(input.size(0), 128, dtype=torch.float32).to(device)
        
        for input_t in input.split(1, dim=1):
            h_t, c_t = self.lstm1(input_t, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t).to(device)
            outputs += [output]
        for i in range(future):# if we should predict the future
            h_t, c_t = self.lstm1(output, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t)
            outputs += [output]
        outputs = torch.cat(outputs, dim=1)
        return outputs

New code; I added it to c_t as well.

Did you put the test data and train data on the GPU like this?

train_data = train_data.to(device)
test_data = test_data.to(device)

Can you send your train loop too?

Okay, I thought just doing X.to(device) was enough.

X = torch.from_numpy(X_Scaled).float().to(device)
for i, val in enumerate(X):
    X = X.to(device)

#before
train_data = X[3:, :-1]
target_data = X[3:, 1:]
test_data = X[:3, :-1]
test_target = X[:3, 1:]

#after

train_data = X[3:, :-1].to(device)
target_data = X[3:, 1:].to(device)
test_data = X[:3, :-1].to(device)
test_target = X[:3, 1:].to(device)

Oh no, sorry, I should have clarified that you have to redefine the variable. Did it work?
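
To be clear, Tensor.to() is not an in-place operation: it returns a new tensor, so the result has to be assigned back, whereas model.to(device) moves the module's parameters in place. A minimal illustration:

x = torch.zeros(3)
x.to(device)      # returns a moved copy that is discarded; x itself stays on the CPU
x = x.to(device)  # reassign, so x now actually lives on `device`

model = LSTM()
model.to(device)  # nn.Module.to() modifies the module in place, no reassignment needed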

Thanks mate! Yes it worked!
