[Resolved] RuntimeError: expected device cpu and dtype Float but got device cuda:0 and dtype Float

What error do you get?

Segmentation fault (core dumped)

Could you post a code snippet which creates this error?
I’m not sure if it’s related to the parameter/buffer issue or if it might be something else.

#!/usr/bin/env python
# coding: utf-8

from torchvision import datasets, transforms, models
import torch 
from torch import nn, optim, utils, device as device_, cuda
import numpy as np

# Architecture Hyper-Parameters
NUM_INPUT      = 28
TIME_STEPS     = 28
NUM_CLASS      = 10
NUM_HIDDEN     = 28

BATCH_SIZE     = 64
EPOCH          = 64
LEARNING_RATE  = 0.01
WEIGHT_DECAY   = 0.01

def one_hot_embedding (y, length):
  out = torch.zeros(length)
  out[y] = 1.0

def q_sel (self):
  for index in range(NUM_HIDDEN):
    self.sel[index] = torch.sigmoid(self.w_x[index] * self.x[index] + self.w_h[index] * self.h[index])

def mux (self):
  for index in range(NUM_HIDDEN):
    self.h[index] = self.sel[index] * self.h[index] + (1.0 - self.sel[index]) * self.x[index]

def q_layer (self):
  q_sel(self)
  mux(self)

def fw_prop (self):
  q_layer(self)

def fc (num_index_a, num_index_b, w, b, x):

  out = torch.zeros(num_index_b)
  for index_b in range(num_index_b):
    out[index_b] = b[index_b]
    for index_a in range(num_index_a):
      out[index_b] += w[index_a][index_b]*x[index_a][index_b]
  
  out = torch.sigmoid(out)
  return out

def fc2 (num_index_a, num_index_b, w, b, x):

  out = torch.zeros(num_index_b)
  for index_b in range(num_index_b):
    out[index_b] = b[index_b]
    for index_a in range(num_index_a):
      out[index_b] += w[index_a][index_b]*x[index_a]

dataset_train = datasets.MNIST(
    '~/mnist', 
    train=True, 
    download=True, 
    transform=transforms.ToTensor())

dataloader_train = utils.data.DataLoader(dataset_train,
                                          batch_size=BATCH_SIZE,
                                          shuffle=True,
                                          num_workers=4)

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        
        # Gate-Weight
        self.w_x   = nn.ParameterList([nn.Parameter(torch.randn(NUM_INPUT)) for _ in range(NUM_HIDDEN)])
        self.w_h   = nn.ParameterList([nn.Parameter(torch.randn(NUM_INPUT)) for _ in range(NUM_HIDDEN)])
    
        # Gate-Selector
        self.register_buffer('sel', torch.stack([nn.Parameter(torch.zeros(NUM_INPUT)) for _ in range(NUM_HIDDEN)]))

        # Input Vector
        self.register_buffer('x', torch.stack([nn.Parameter(torch.zeros(NUM_INPUT)) for _ in range(NUM_HIDDEN)]))

        # Output Vector
        self.register_buffer('h', torch.stack([nn.Parameter(torch.zeros(NUM_INPUT)) for _ in range(NUM_HIDDEN)]))

        self.fc_w1 = nn.ParameterList([nn.Parameter(torch.randn(NUM_INPUT)) for _ in range(NUM_HIDDEN)])
        self.fc_b1 = nn.ParameterList([nn.Parameter(torch.randn(1))         for _ in range(NUM_HIDDEN)])
        
        self.fc_w2 = nn.ParameterList([nn.Parameter(torch.randn(NUM_CLASS)) for _ in range(NUM_INPUT)])
        self.fc_b2 = nn.ParameterList([nn.Parameter(torch.randn(1))         for _ in range(NUM_INPUT)])

    def forward(self):
      fw_prop(self)
      out1 = fc(NUM_HIDDEN, NUM_INPUT, self.fc_w1, self.fc_b1, self.h)
      out2 = fc2(NUM_INPUT, NUM_CLASS, self.fc_w2, self.fc_b2, out1)

      return out2

model = Model()
model.cuda()
optimizer = optim.SGD(list(model.parameters()), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
criterion = nn.BCEWithLogitsLoss()

print(model.parameters())
for name, param in model.named_parameters():
    if param.device.type != 'cuda':
        print('param {}, not on GPU'.format(name))

model.train()

losses = []
count = 0

for epoch in range(EPOCH):
  for x, t in dataloader_train:

    y = one_hot_embedding(t, NUM_CLASS)
    for time in range(TIME_STEPS):

      for index in range(NUM_HIDDEN-1, 0, -1):
        model.x[index] = model.x[index - 1]
        model.h[index] = model.h[index - 1]
      
      model.x[0] = torch.cuda.FloatTensor(x[0][0][time])
      model.h[0] = torch.cuda.FloatTensor(torch.zeros(NUM_INPUT))
      
      model.zero_grad()
      out = model()
      loss = criterion(out, y)
      loss.backward(retain_graph = True)
      optimizer.step()
  
      losses.append(loss.cpu().data)
      epoch_loss = np.array([np.mean(losses)])

    print("{:6d}: Epoch:{} Loss:{:.9f}".format(count, epoch, np.mean(losses)))
    count += 1


There are still several issues in the code:

  • one_hot_embedding and fc2 do not return anything, so you might want to add return out
  • use
model.x[0] = x[0][0][time].cuda()
model.h[0] = torch.zeros(NUM_INPUT).cuda()

to get rid of the seg fault. We’ll look into what’s happening.

  • if you are initializing a tensor in your forward pass, stick to @spanev’s suggestion and pass the device argument, e.g. out = torch.zeros(num_index_b, device=x.device) in fc
  • once this is fixed, you’ll run into RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
  • change the in-place addition in fc and fc2 to out[index_b] = out[index_b] + ..., and wrap the manipulation of self.sel in q_sel in a torch.no_grad() block; otherwise self.sel will require gradients after the operation (a stand-alone repro of the in-place error follows this list)
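
The in-place error from the last bullet can be reproduced in isolation. The following is a minimal, self-contained sketch of the autograd mechanics, independent of the model above:

import torch

x = torch.randn(3, requires_grad=True)
y = torch.sigmoid(x)    # sigmoid saves its output for the backward pass
y += 1                  # in-place write bumps the version counter of that saved tensor
# y.sum().backward()    # would raise: one of the variables needed for gradient
                        # computation has been modified by an inplace operation

# The out-of-place version of the same update keeps the saved tensor intact:
y2 = torch.sigmoid(x)
y2 = y2 + 1
y2.sum().backward()     # works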

This code should work:

from torchvision import datasets, transforms, models
import torch 
from torch import nn, optim, utils, device as device_, cuda
import numpy as np

# Architecture Hyper-Parameters
NUM_INPUT      = 28
TIME_STEPS     = 28
NUM_CLASS      = 10
NUM_HIDDEN     = 28

BATCH_SIZE     = 64
EPOCH          = 64
LEARNING_RATE  = 0.01
WEIGHT_DECAY   = 0.01

def one_hot_embedding (y, length):
  out = torch.zeros(length, device=y.device)
  out[y] = 1.0
  return out

def q_sel (self):
  with torch.no_grad():
    for index in range(NUM_HIDDEN):
      self.sel[index] = torch.sigmoid(self.w_x[index] * self.x[index] + self.w_h[index] * self.h[index])

def mux (self):
  for index in range(NUM_HIDDEN):
    self.h[index] = self.sel[index] * self.h[index] + (1.0 - self.sel[index]) * self.x[index]

def q_layer (self):
  q_sel(self)
  mux(self)

def fw_prop (self):
  q_layer(self)

def fc (num_index_a, num_index_b, w, b, x):

  out = torch.zeros(num_index_b, device=x.device)
  for index_b in range(num_index_b):
    out[index_b] = b[index_b]
    for index_a in range(num_index_a):
      out[index_b] = out[index_b] + w[index_a][index_b]*x[index_a][index_b]
  
  out = torch.sigmoid(out)
  return out

def fc2 (num_index_a, num_index_b, w, b, x):

  out = torch.zeros(num_index_b, device=x.device)
  for index_b in range(num_index_b):
    out[index_b] = b[index_b]
    for index_a in range(num_index_a):
      out[index_b] = out[index_b] + w[index_a][index_b]*x[index_a]
  return out

dataset_train = datasets.MNIST(
    '/home/ptrblck/python/data', 
    train=True, 
    download=False, 
    transform=transforms.ToTensor())

dataloader_train = utils.data.DataLoader(dataset_train,
                                          batch_size=BATCH_SIZE,
                                          shuffle=True,
                                          num_workers=4)

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        
        # Gate-Weight
        self.w_x   = nn.ParameterList([nn.Parameter(torch.randn(NUM_INPUT)) for _ in range(NUM_HIDDEN)])
        self.w_h   = nn.ParameterList([nn.Parameter(torch.randn(NUM_INPUT)) for _ in range(NUM_HIDDEN)])
    
        # Gate-Selector
        self.register_buffer('sel', torch.stack([torch.zeros(NUM_INPUT) for _ in range(NUM_HIDDEN)]))

        # Input Vector
        self.register_buffer('x', torch.stack([torch.zeros(NUM_INPUT) for _ in range(NUM_HIDDEN)]))

        # Output Vector
        self.register_buffer('h', torch.stack([torch.zeros(NUM_INPUT) for _ in range(NUM_HIDDEN)]))

        self.fc_w1 = nn.ParameterList([nn.Parameter(torch.randn(NUM_INPUT)) for _ in range(NUM_HIDDEN)])
        self.fc_b1 = nn.ParameterList([nn.Parameter(torch.randn(1))         for _ in range(NUM_HIDDEN)])
        
        self.fc_w2 = nn.ParameterList([nn.Parameter(torch.randn(NUM_CLASS)) for _ in range(NUM_INPUT)])
        self.fc_b2 = nn.ParameterList([nn.Parameter(torch.randn(1))         for _ in range(NUM_INPUT)])

    def forward(self):
      fw_prop(self)
      out1 = fc(NUM_HIDDEN, NUM_INPUT, self.fc_w1, self.fc_b1, self.h)
      out2 = fc2(NUM_INPUT, NUM_CLASS, self.fc_w2, self.fc_b2, out1)

      return out2

model = Model()
model.cuda()
optimizer = optim.SGD(list(model.parameters()), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
criterion = nn.BCEWithLogitsLoss()

print(model.parameters())
for name, param in model.named_parameters():
    if param.device.type != 'cuda':
        print('param {}, not on GPU'.format(name))

model.train()

losses = []
count = 0

for epoch in range(EPOCH):
  for x, t in dataloader_train:

    y = one_hot_embedding(t.cuda(), NUM_CLASS)
    for time in range(TIME_STEPS):

      for index in range(NUM_HIDDEN-1, 0, -1):
        model.x[index] = model.x[index - 1]
        model.h[index] = model.h[index - 1]
      
      model.x[0] = x[0][0][time].cuda()
      model.h[0] = torch.zeros(NUM_INPUT).cuda()
      
      model.zero_grad()
      out = model()
      loss = criterion(out, y)
      loss.backward(retain_graph = True)
      optimizer.step()
  
      losses.append(loss.cpu().data)
      epoch_loss = np.array([np.mean(losses)])

    print("{:6d}: Epoch:{} Loss:{:.9f}".format(count, epoch, np.mean(losses)))
    count += 1

@ptrblck-san

That was a mis-paste from my original code.

It works fine now.
But wait, why did the code from before this topic work on Colab?
And I would like to know how you found my coding mistakes (apart from the missing return), because I don’t want to make the same mistakes in the future.

I’ve debugged the code step by step.
Creating a torch.cuda.FloatTensor directly is not the recommended way, so I changed it to pushing x[0][0][time] to the device instead. However, this line of code should not create a segfault, so I’ve posted some debug information in the issue you created.
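
For reference, the change replaces the legacy tensor constructor with an explicit device transfer. A minimal sketch, where row is just a stand-in for the x[0][0][time] slice:

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

row = torch.rand(28)        # stand-in for the x[0][0][time] slice

# Legacy pattern from the original code (avoid):
# gpu_row = torch.cuda.FloatTensor(row)

# Recommended: move the existing tensor to the target device
gpu_row = row.to(device)    # or row.cuda() when CUDA is guaranteed to be available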

Once this was done, the error messages helped in debugging the other issues, e.g. by enabling anomaly detection.
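
Anomaly detection can be enabled like this; a minimal sketch, where model, criterion, and y refer to the definitions in the listing above:

import torch

# Option 1: enable it globally; backward errors will then point at the
# forward operation that produced them (adds overhead, so use it only for debugging)
torch.autograd.set_detect_anomaly(True)

# Option 2: limit the check to a single forward/backward pass
with torch.autograd.detect_anomaly():
    out = model()
    loss = criterion(out, y)
    loss.backward(retain_graph=True)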

I see, thank you very very much for your valuable advice.