Hi, I am going to use Apex to train my model since I am working on 3D medical images and I am frequently going OOM. I read the documentation of Apex API and I wanted to try using apex on a simple model first to get better intuition and then try it for my main model. Below is my code, but I do not know why I get the error
CUDA error: an illegal memory access was encountered
I will be thankful if you can help me.
mnist = torchvision.datasets.MNIST('./data',train= True,download=True,transform =transforms.ToTensor())
data_loader = DataLoader(mnist,batch_size=20,num_workers=2,shuffle=True)
class Model(nn.Module):
# Our model
def __init__(self):
super(Model, self).__init__()
self.fc1 = nn.Conv2d(1,10,3)
self.bn1 = nn.BatchNorm2d(10)
self.fc2= nn.Conv2d(10,20,3)
self.bn2 = nn.BatchNorm2d(20)
self.fc3= nn.Linear(11520,10)
def forward(self,x):
x = F.relu(self.fc1(x))
x = self.bn1(x)
x = F.relu(self.fc2(x))
x = self.bn2(x)
x = x.view(x.size(0),-1)
x = self.fc3(x)
return(x)
device = torch.device('cuda:6' if torch.cuda.is_available() else 'cpu')
model = Model().to(device)
optimizer = optim.Adam(model.parameters(),lr=0.1)
lr_sch = lr_scheduler.StepLR(optimizer,step_size=2,gamma=0.1)
criterion = nn.CrossEntropyLoss()
model,optimizer = amp.initialize(model,optimizer,opt_level="O2",keep_batchnorm_fp32=True,loss_scale="dynamic")
def train(epoch):
model.train()
t_loss = 0
for X,y in data_loader:
X= X.to(device)
y = y.long().to(device)
pred = model(X)
loss = criterion(pred,y)
t_loss+= loss.item()
optimizer.zero_grad()
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
optimizer.step()
return(t_loss/len(data_loader.dataset))
num_epochs = 20
train_loss = []
cudnn.benchmark = True
for epoch in range(num_epochs):
t_loss = train(epoch)
train_loss.append(t_loss)