Pytorch freezes computer while training


(Sam Xox) #1

I am using a GeForce GTX 1060 6GB/PCIe/SSE2 and an AMD® Ryzen 5 1600 six-core processor × 12.

I am trying to train a simple model on the flower photos dataset.

My code hangs my whole PC when training starts.

Here's my code:

import torch
from torchvision import datasets, transforms, models
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from train import train
from collections import OrderedDict
import matplotlib.pyplot as plt
import numpy as np
import helper
from fashion_mnist_conv import FeatureNet

# Pretrained ResNet-34: the alternative network that can be passed to
# train() in place of FeatureNet.
model = models.resnet34(pretrained=True)

# Training-time augmentation; every sample ends up as a 224x224 tensor.
train_transform = transforms.Compose([
    transforms.RandomRotation(30),
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
])

# Resize takes the target size as ONE argument; a second positional value
# is interpreted as the interpolation mode, so Resize(124, 124) was wrong.
# 224x224 matches the training crop and is the only size FeatureNet's
# fully connected layer accepts (1348320 = 30 * 212 * 212 inputs).
test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

testset = datasets.ImageFolder(root='flowers/test', transform=test_transform)
trainset = datasets.ImageFolder(root='flowers/train', transform=train_transform)

testloader = DataLoader(testset, batch_size=1, shuffle=True)
trainloader = DataLoader(trainset, batch_size=1, num_workers=1, shuffle=True)

# Train on the GPU whenever one is usable; `cuda` was previously passed to
# train() without ever being defined.
cuda = torch.cuda.is_available()

# One-off sanity check: push a single batch through FeatureNet on the CPU.
# no_grad() keeps this throwaway forward pass from building an autograd graph.
image, label = next(iter(trainloader))
featnet = FeatureNet()
with torch.no_grad():
    output = featnet(image)

#for param in model.parameters():
#    param.requires_grad = False

# Replacement head for the ResNet: 512 features in, 4 log-probabilities out
# (presumably one per flower class -- verify against the dataset folders).
classifier = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(512, 300)),
    ('relu', nn.ReLU()),
    ('fc2', nn.Linear(300, 4)),
    ('output', nn.LogSoftmax(dim=1)),
]))

model.fc = classifier

print("getting in cuda..")
if cuda:
    # train() moves every batch to the GPU, so the weights must be there too.
    featnet = featnet.cuda()

# NLLLoss pairs with the LogSoftmax head on `model`; CrossEntropyLoss pairs
# with the raw scores that FeatureNet returns.
criterion = nn.NLLLoss()
criterion2 = nn.CrossEntropyLoss()

train(10, featnet, criterion2, trainloader, testloader, cuda)

If you replace featnet with model (resnet34), the same thing happens.

Here's FeatureNet:

import torch
import torch.nn as nn


class FeatureNet(nn.Module):
    """Small three-layer convolutional classifier for RGB images.

    Three unpadded 5x5 convolutions (3 -> 8 -> 20 -> 30 channels), each
    followed by a max-pool, ReLU and dropout, feed one fully connected layer.

    Args:
        num_classes: Number of output scores per image (default 10).
        input_size: Spatial size of the input images, either a single int
            for square images or a ``(height, width)`` pair. The default
            of 224 yields the original 1348320-feature linear layer.

    Raises:
        ValueError: If ``input_size`` is too small to survive the three
            convolutions.
    """

    def __init__(self, num_classes=10, input_size=224):
        super().__init__()

        self.conv1 = nn.Conv2d(3, 8, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(8, 20, kernel_size=5)
        self.conv3 = nn.Conv2d(20, 30, kernel_size=5)
        self.dropout = nn.Dropout(p=0.5)

        # NOTE: a pooling window of 1 leaves the spatial size unchanged, so
        # the feature map stays large and the linear layer below is huge.
        self.mp = nn.MaxPool2d(1)

        if isinstance(input_size, int):
            height = width = input_size
        else:
            height, width = input_size

        # Each unpadded 5x5 convolution trims 4 pixels per dimension;
        # three of them trim 12 in total.
        out_height, out_width = height - 12, width - 12
        if out_height <= 0 or out_width <= 0:
            raise ValueError(
                "input_size {!r} is too small; each side must exceed 12".format(
                    input_size
                )
            )

        self.fc = nn.Linear(30 * out_height * out_width, num_classes)

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)`` for ``x``.

        ``x`` must be a ``(batch, 3, height, width)`` tensor whose spatial
        size matches the ``input_size`` given at construction.
        """
        x = self.dropout(torch.relu(self.mp(self.conv1(x))))
        x = self.dropout(torch.relu(self.mp(self.conv2(x))))
        x = self.dropout(torch.relu(self.mp(self.conv3(x))))
        # Flatten everything except the batch dimension for the linear layer.
        x = x.view(x.shape[0], -1)

        x = self.fc(x)

        return x

And here's my train.py:

import torch
import torch.optim as optim

def test(model, testloader, criterion, cuda):
    equals = 0
    n = 0
    running_loss = 0
    with torch.no_grad():
        for x, y in testloader:
            if cuda:
               x,y = x.cuda(), y.cuda()
            model.eval()
            y_hat = model(x)
            test_loss = criterion(y_hat, y)
            running_loss += test_loss
            ps = torch.softmax(y_hat, dim=1)
            predictions = ps.topk(1, dim=1)[1].view(1,-1)[0]
            equals += (predictions == y).sum().item()
            n += len(y)
    test_loss = running_loss/len(testloader)
    accuracy = "Accuracy: {}%\n".format(equals/n*100)
    model.train()
    return (accuracy, test_loss.item())



def train(epoch, model, criterion, trainloader, testloader, cuda):
    """Train ``model`` with Adam (lr=0.001) for ``epoch`` passes.

    After each pass over ``trainloader`` the model is evaluated with
    ``test`` on ``testloader`` and the epoch's losses and accuracy are
    printed.

    Args:
        epoch: Number of full passes over ``trainloader``.
        model: Module to optimise; its parameters are updated in place.
        criterion: Loss callable invoked as ``criterion(model(x), y)``.
        trainloader: Iterable yielding ``(x, y)`` training batches.
        testloader: Iterable of evaluation batches, forwarded to ``test``.
        cuda: If truthy, the model and every batch are moved to the GPU.

    Returns:
        A tuple ``(train_loss_list, test_loss_list)`` with one entry per
        epoch; each training entry is the mean per-batch loss.

    Raises:
        ValueError: If ``trainloader`` yields no batches.
    """
    if cuda:
        # Batches are sent to the GPU below, so the weights have to live
        # there as well. Do this before the optimizer captures the parameters.
        model.cuda()

    train_loss_list = []
    test_loss_list = []

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for _ in range(epoch):
        running_loss = 0.0
        num_batches = 0
        for x, y in trainloader:
            if cuda:
                x, y = x.cuda(), y.cuda()

            optimizer.zero_grad()
            y_hat = model(x)

            loss = criterion(y_hat, y)
            loss.backward()
            optimizer.step()
            # .item() extracts a Python float, so the running total does
            # not hold on to the autograd graph.
            running_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("trainloader yielded no batches")

        accuracy, test_loss = test(model, testloader, criterion, cuda)
        train_loss = running_loss / num_batches

        train_loss_list.append(train_loss)
        test_loss_list.append(test_loss)

        message = "train-loss: {}    test_loss:{}".format(train_loss, test_loss)
        print(message)
        print(accuracy)
    return (train_loss_list, test_loss_list)

My machine completely stops while training.


(Simon Wang) #2

Hmm, you usually don’t want to train deep convnets on the CPU. May I ask why you did that?


(Sam Xox) #3

I actually trained on the GPU, and the same thing happens.

Look what it does to my memory

However, I have used fastai’s resnet to train on huge datasets, and this does not happen with those models.
And fastai is a wrapper around PyTorch.
So the problem surely is in my code.


#4

Could you print the device of a parameter of your model just to make sure it’s actually running on the GPU?

print(model.conv1.weight.type())

should be enough.


(Sam Xox) #5

It spits out torch.cuda.FloatTensor.
I solved the problem: as you can see, I had a lot of Firefox windows open. Hundreds of tabs had already pushed my memory usage to 60%, and once training started PyTorch needed an extra 30%, which is why my computer crashed.
Sorry to bother you
Thanks for the replies