Best way to move parameters to GPU

Hi,

Here is a snippet of the code I have implemented.

class BiDAF(nn.Module):
    """BiDAF model as first posted; only the constructor is shown here.

    The constructor stores the two embedding lookups and builds highway
    sub-modules, four Conv2d layers, four bidirectional LSTMs and four
    hand-made weight tensors.
    """

    def __init__(self,char_embeddings,glove_model):
        """Store the embedding lookups and create the layers.

        Args:
            char_embeddings: kept as ``self.char_embedding_dict``.
            glove_model: kept as ``self.glove_model``.
        """
        super(BiDAF,self).__init__()
        # Highway sub-modules (defined elsewhere), one for the context and
        # one for the question.
        self.context_highway = Context_Highway()
        self.question_highway = Question_Highway()
        self.char_embedding_dict = char_embeddings
        self.glove_model = glove_model
        self.tanh = nn.Tanh()
        # Plain tensors created with requires_grad=True; they are not wrapped
        # in nn.Parameter.
        # NOTE(review): presumably this is why they stay on the CPU after
        # bidaf.to(device) and are missing from bidaf.parameters() — confirm.
        self.train_w = torch.randn(420,requires_grad = True)
        self.w_p1 = torch.randn(700,requires_grad = True)
        self.w_p2 = torch.randn(700,requires_grad = True)
        self.bias = torch.randn(1,requires_grad = True)
        # Four Conv2d layers with 1 input channel and kernel height 50;
        # kernel widths are 3/4/2/3 and output channels 5/5/3/7.
        self.conv1 = nn.Conv2d(1,5,kernel_size = (50,3))
        self.conv2 = nn.Conv2d(1,5,kernel_size = (50,4))
        self.conv3 = nn.Conv2d(1,3,kernel_size = (50,2))
        self.conv4 = nn.Conv2d(1,7,kernel_size = (50,3))
        # Bidirectional LSTMs, all with hidden size 70; input sizes differ.
        self.context_bilstm = nn.LSTM(70, 70,bidirectional = True)
        self.question_bilstm = nn.LSTM(70, 70,bidirectional = True)
        self.bilstm_m1 = nn.LSTM(560, 70,bidirectional = True)
        self.bilstm_m2 = nn.LSTM(140, 70,bidirectional = True)
        ..................................

# Build the model, loss and optimizer, then train on randomly sampled examples.
bidaf = BiDAF(char_embeddings, glove_model)
bidaf = bidaf.to(device)
criterion = nn.CrossEntropyLoss()
lr = 0.01
optimizer = torch.optim.Adam(bidaf.parameters(), lr=lr)
max_data = len(dataset['data'])
epochs = 2
iterations_per_epochs = 10000
sum_loss = 0
for epoch in range(epochs):
    for iteration in range(1, iterations_per_epochs + 1):
        # randrange excludes its upper bound. random.randint(0, max_data) is
        # inclusive and could return max_data, indexing one past the end.
        index = random.randrange(max_data)
        example = dataset['data'][index]
        context = example['context']
        question = example['question']
        # Create the target indices directly on the training device.
        start_index = torch.tensor([example['start_index']], dtype=torch.int64, device=device)
        end_index = torch.tensor([example['end_index']], dtype=torch.int64, device=device)
        # Call the module itself rather than .forward() so that any
        # registered hooks are run as well.
        pred = bidaf(context, question)
        start_pred = pred[0]
        end_pred = pred[1]
        # One cross-entropy term for the start position and one for the end.
        loss1 = criterion(start_pred.expand(1, 810), start_index.expand(1))
        loss2 = criterion(end_pred.expand(1, 810), end_index.expand(1))
        loss = loss1 + loss2
        optimizer.zero_grad()
        loss.backward()
        sum_loss += loss.item()
        optimizer.step()
    print("Epoch---{}  Loss---{}".format(epoch, sum_loss))
    # Reset the running total so each epoch reports its own summed loss.
    sum_loss = 0

The above code works fine when I run it on the CPU. When I run it on the GPU, it raises a runtime error: expected a torch.FloatTensor but found type torch.cuda.FloatTensor.
So I had to edit the code to look like this:

class BiDAF(nn.Module):
    """BiDAF model, second version: the device is passed to the constructor.

    Differs from the first version only in that the highway sub-modules and
    the hand-made weight tensors are moved to ``device`` inside ``__init__``.
    """

    def __init__(self,char_embeddings,glove_model,device):
        """Store the embedding lookups and create the layers on ``device``.

        Args:
            char_embeddings: kept as ``self.char_embedding_dict``.
            glove_model: kept as ``self.glove_model``.
            device: kept as ``self.device`` and used for the explicit
                ``.to(device)`` calls below.
        """
        super(BiDAF,self).__init__()
        self.device = device
        self.context_highway = Context_Highway().to(device)
        self.question_highway = Question_Highway().to(device)
        self.char_embedding_dict = char_embeddings
        self.glove_model = glove_model
        self.tanh = nn.Tanh()
        # Still plain tensors (not nn.Parameter), now moved by hand.
        # NOTE(review): presumably these are still absent from
        # bidaf.parameters(), so the optimizer would not update them —
        # confirm; wrapping them in nn.Parameter would register them.
        self.train_w = torch.randn(420,requires_grad = True).to(device)
        self.w_p1 = torch.randn(700,requires_grad = True).to(device)
        self.w_p2 = torch.randn(700,requires_grad = True).to(device)
        self.bias = torch.randn(1,requires_grad = True).to(device)
        # Four Conv2d layers with 1 input channel and kernel height 50;
        # kernel widths are 3/4/2/3 and output channels 5/5/3/7.
        self.conv1 = nn.Conv2d(1,5,kernel_size = (50,3))
        self.conv2 = nn.Conv2d(1,5,kernel_size = (50,4))
        self.conv3 = nn.Conv2d(1,3,kernel_size = (50,2))
        self.conv4 = nn.Conv2d(1,7,kernel_size = (50,3))
        # Bidirectional LSTMs, all with hidden size 70; input sizes differ.
        self.context_bilstm = nn.LSTM(70, 70,bidirectional = True)
        self.question_bilstm = nn.LSTM(70, 70,bidirectional = True)
        self.bilstm_m1 = nn.LSTM(560, 70,bidirectional = True)
        self.bilstm_m2 = nn.LSTM(140, 70,bidirectional = True)

       ...................................

# Hyper-parameter and loss first, then the model (moved to the target device)
# and an Adam optimizer over its registered parameters.
lr = 0.01
criterion = nn.CrossEntropyLoss()
bidaf = BiDAF(char_embeddings, glove_model, device).to(device)
optimizer = torch.optim.Adam(bidaf.parameters(), lr=lr)

Besides the variables inside the constructor, I also had to move every local tensor I create inside the methods to the GPU. While running the code, I noticed that the GPU version is slower than the CPU version. Is there a better way to move all the tensors to the GPU? And why is it slower than on the CPU?

I know I have only posted a portion of the code; I didn’t want to make the post unnecessarily long. I hope this is sufficient. Thanks in advance for any help.

You should register the trainable tensors as nn.Parameters; the module will then automatically move them to the specified device when you call .to(device) on it.
Change:

self.train_w = torch.randn(420,requires_grad = True)

to

self.train_w = nn.Parameter(torch.randn(420))
1 Like

Thanks, this solved one of the problems I had.
But the snippet below now throws an error, because self.get_convolved_word_embeddings(char_tensor) expects a CUDA tensor while char_tensor is a (CPU) FloatTensor. Is there a way to solve this? I tried char_tensor.cuda() and char_tensor.to(device). Both fix that line, but the same error is then thrown on the next line.

def create_question_char_embeddings(self,text):
    """Build the character-level embedding matrix for a question.

    Each token of ``text`` is turned into a 50x45 matrix of per-character
    embeddings (one column per character), passed through
    ``self.get_convolved_word_embeddings``, and written into one column of
    the result.

    Args:
        text: the question string; tokenised with ``nlp``.

    Returns:
        A ``torch.zeros(20, 60)`` tensor whose column ``index`` holds the
        convolved embedding of token ``index``. Columns for missing tokens
        stay zero. (Presumably 20 is the maximum question length in tokens
        and 60 the feature size — TODO confirm.)
    """
    doc = nlp(text)
    embed = torch.zeros(20,60)
    for index,token in enumerate(doc):
        char_tensor = torch.zeros(50,45)
        for i,char in enumerate(token.text):
            try:
                char_tensor[:,i] = torch.tensor(char_embeddings[char])
            except (KeyError, IndexError):
                # Best effort: a character without an embedding (KeyError)
                # or beyond the 45th column (IndexError) leaves zeros.
                # Anything else is a real error and is no longer hidden.
                continue
        char_embedding = self.get_convolved_word_embeddings(char_tensor)
        embed[:,index] = char_embedding.squeeze(0).squeeze(1)
    return embed

Sorry if this is a silly question.

If you need to create these tensors inside create_question_char_embeddings, you could use the device attribute of one of the model's parameters and create the new tensors on that same device:

char_tensor = torch.zeros(50, 45, device=self.parameter.device)

Thanks, this solved my errors. But training is still slower than on the CPU. Any ideas why?

Tiny workloads can be faster on the CPU than on a GPU.
I’m not sure how all submodules are defined or how they are used in your model, so I can’t really guess what might be the reason besides the workload.