Process exits with -1073741819 (0xC0000005) when calling backward()

The console window shows no traceback at all, just:

Process finished with exit code -1073741819 (0xC0000005)

How does this happen?

It seems to be an access violation on a Windows system.
Could you post a code snippet reproducing this error?
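
In the meantime, the standard-library faulthandler module can sometimes turn a silent 0xC0000005 exit into a Python-level traceback, which narrows down where the crash happens. A minimal sketch, added at the very top of the script:

import faulthandler

# Print the Python traceback to stderr if the interpreter dies from a fatal
# error such as an access violation, instead of exiting silently.
faulthandler.enable()

# ... the rest of the training script runs unchanged ...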

I have run into the same problem.
My code is the following:

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
import pickle
from torch.utils.data import Dataset,DataLoader

class ResNet18ResultDataset(Dataset):

    def __init__(self, dataFile, transform, h):
        filename = dataFile + r'\\' + 'NormResNetData' + str(h)
        with open(filename, 'rb') as f:
            # self.dataNumpy = torch.from_numpy(pickle.load(f)).float()
            self.dataNumpy = pickle.load(f)
        self.transform = transform

    def __len__(self):
        return self.dataNumpy.shape[0]

    def __getitem__(self, idx):
        sample = self.dataNumpy[idx, :].reshape((3, 55, 55))
        sample = torch.from_numpy(sample).float()
        return sample

class Generator(nn.Module):

    def __init__(self):
        super(Generator, self).__init__()
        nz = int(16)
        ngf = int(64)
        ndf = int(64)
        nc = 3
        self.main = nn.Sequential(
            # input is Z, going into a convolution
            nn.ConvTranspose2d(nz, ngf * 8, 4, 1, 0, bias=False),
            nn.BatchNorm2d(ngf * 8),
            nn.ReLU(True),
            # state size. (ngf*8) x 4 x 4
            nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),
            # state size. (ngf*4) x 8 x 8
            nn.ConvTranspose2d(ngf * 4, ngf * 2, 3, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),
            # state size. (ngf*2) x 16 x 16
            nn.ConvTranspose2d(ngf * 2, ngf, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            # state size. (ngf) x 32 x 32
            nn.ConvTranspose2d(ngf, nc, 3, 2, 1, bias=False),
            nn.Tanh()
            # state size. (nc) x 64 x 64
        )

    def forward(self, input):
        output = self.main(input)
        return output

class Discriminator(nn.Module):

    def __init__(self):
        super(Discriminator, self).__init__()
        ndf = int(64)
        nc = 3
        self.main = nn.Sequential(
            nn.Conv2d(nc, ndf, 3, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf) x 32 x 32
            nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*2) x 16 x 16
            nn.Conv2d(ndf * 2, ndf * 4, 3, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 4),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*4) x 8 x 8
            nn.Conv2d(ndf * 4, ndf * 8, 3, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 8),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*8) x 4 x 4
            nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
            nn.Sigmoid()
        )

    def forward(self, input):
        output = self.main(input)
        return output.view(-1, 1).squeeze(1)

if __name__ == '__main__':
    nz = int(16)
    ngf = int(64)
    ndf = int(64)
    nc = 3
    batchSize = 128
    lr = 0.0002
    niter = 40
    beta1 = 0.5
    fakeNum = 5
    inputNoise = nz
    fakeRate = 10
    init = True
    lossListG = []
    lossListD = []
    outf = r'C:\YANG Luoxiao\Model\GAN'
    dataFile = r'C:\YANG Luoxiao\Data\FengChang\pickle\10\23Train2'

    device = torch.device("cuda")
    transform = None

    device = torch.device("cpu")
    netD = Discriminator().cuda()
    netD.train()
    # netD.apply(weights_init)
    netG = Generator().cuda()
    # netG.apply(weights_init)
    netG.train()
    criterion = nn.BCELoss()

    real_label = 1
    fake_label = 0
    start_epoch = 0
    # setup optimizer
    optimizerD = optim.Adam(netD.parameters(), lr=lr, betas=(beta1, 0.999))
    optimizerG = optim.Adam(netG.parameters(), lr=lr, betas=(beta1, 0.999))
    if init:
        loadModel = False
    else:
        loadModel = True
    if loadModel:
        checkpoint = torch.load(r'C:\YANG Luoxiao\Model\GAN\DDGAN3.pth')  # largeNew5 Large5
        netG.load_state_dict(checkpoint['netG'])
        netD.load_state_dict(checkpoint['netD'])
        # netD.load_state_dict(checkpoint['netD'])
        optimizerG.load_state_dict(checkpoint['optimizerG'])
        optimizerD.load_state_dict(checkpoint['optimizerD'])

        # checkpoint['optimizer']['param_groups'][0]['weight_decay']=weight_decay
        # optmizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch'] + 1
        lossListG = checkpoint['lossListG']
        lossListD = checkpoint['lossListD']
        # lrList = checkpoint['lrList']

    for epoch in range(start_epoch, start_epoch + niter):
        windDateset = ResNet18ResultDataset(dataFile, transform, epoch % 18)
        dataloader = torch.utils.data.DataLoader(windDateset, batch_size=batchSize,
                                                 shuffle=True, num_workers=int(0))
        for i, data in enumerate(dataloader):
            ############################
            # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
            ###########################
            # train with real
            netD.zero_grad()
            real_cpu = data.cuda()
            batch_size = real_cpu.size(0)
            # label = torch.full((batch_size,), real_label).cuda()
            if epoch < fakeNum:
                Reallabel = np.ones([batch_size, ]) - 0.1
                Reallabel[np.random.randint(batch_size, size=int(batch_size / fakeRate)), ] = 0
                Reallabel = torch.from_numpy(Reallabel).float().cuda()
            else:
                Reallabel = np.ones([batch_size, ])
                # Reallabel[np.random.randint(batch_size, int(batch_size / 10)), :] = 0
                Reallabel = torch.from_numpy(Reallabel).float().cuda()
            output = netD(real_cpu)
            errD_real = criterion(output, Reallabel)
            errD_real.backward()
            D_x = output.mean().item()

            # train with fake
            # noise = torch.randn(batch_size, 16, device=device)

            noise = np.random.random_sample((batch_size, inputNoise, 1, 1))
            noise = torch.from_numpy(noise).float().cuda()
            fake = netG(noise)
            if epoch < fakeNum:
                fake_label = np.zeros([batch_size, ]) + 0.1
                fake_label[np.random.randint(batch_size, size=int(batch_size / fakeRate)), ] = 1
                fake_label = torch.from_numpy(fake_label).float().cuda()
            else:
                fake_label = np.zeros([batch_size, ])
                # Reallabel[np.random.randint(batch_size, int(batch_size / 10)), :] = 0
                fake_label = torch.from_numpy(fake_label).float().cuda()
            # label.fill_(fake_label)
            output = netD(fake.detach())
            errD_fake = criterion(output, fake_label)
            errD_fake.backward()
            D_G_z1 = output.mean().item()
            errD = errD_real + errD_fake
            optimizerD.step()

            ############################
            # (2) Update G network: maximize log(D(G(z)))
            ###########################
            Reallabel = np.ones([batch_size, ])
            # Reallabel[np.random.randint(batch_size, int(batch_size / 10)), :] = 0
            Reallabel = torch.from_numpy(Reallabel).float().cuda()
            netG.zero_grad()
            # label.fill_(real_label)  # fake labels are real for generator cost
            output = netD(fake)
            errG = criterion(output, Reallabel)
            errG.backward()
            D_G_z2 = output.mean().item()
            optimizerG.step()
            if i % 50 == 0:
                print('[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f'
                      % (epoch, niter, i, len(dataloader),
                         errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))
                lossListG.append(errG.item())
                lossListD.append(errD.item())
        # do checkpointing
        torch.save(netG.state_dict(), '%s/netG.pth' % (outf))
        torch.save(netD.state_dict(), '%s/netD.pth' % (outf))
        if epoch % 1 == 0:
            state = {'netG': netG.state_dict(), 'netD': netD.state_dict(), 'optimizerG': optimizerG.state_dict(), 'epoch': epoch,
                     'lossListG': lossListG, 'lossListD': lossListD, 'optimizerD': optimizerD.state_dict()}
            torch.save(state, '%s/DDGAN3EPOCH%d.pth' % (outf, epoch))

I am running into this same issue when running:

model.load_state_dict(torch.load('previous_dict.pt'))

I train a model for a few epochs, then try to continue training, and the crash happens with the same exit code.
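
For context, the loading step is essentially the minimal pattern below; the model class and device handling here are placeholders, not my exact code:

import torch

model = MyModel()  # placeholder for the actual model class
state_dict = torch.load('previous_dict.pt', map_location='cpu')  # load the weights onto the CPU first
model.load_state_dict(state_dict)
model.to('cuda')   # move back to the GPU before resuming training
model.train()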