Hi all — I don't understand why I run out of memory (on an NVIDIA GeForce 1080 Ti with 11 GiB).
The module with my network is this:
import torch.nn as nn
from torchvision.models import resnet50
class Encoder(nn.Module):
    """ResNet-50 backbone whose classifier head is replaced by an MLP.

    The pretrained ResNet-50 normally ends in a 2048 -> 1000 fully
    connected layer; here that head is swapped for a funnel
    2048 -> 1024 -> 512 -> 256 -> 128 -> 64 with a ReLU after every
    linear layer, so the forward pass yields a 64-dimensional code.
    """

    def __init__(self):
        super(Encoder, self).__init__()
        # NOTE(review): pretrained=True downloads ImageNet weights on
        # first construction.
        self.model = resnet50(pretrained=True)
        widths = [self.model.fc.in_features, 1024, 512, 256, 128, 64]
        head = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            head.append(nn.Linear(n_in, n_out))
            head.append(nn.ReLU())
        self.model.fc = nn.Sequential(*head)

    def forward(self, x):
        """Return the 64-d code for a batch of input images."""
        return self.model(x)
# Module-level singleton: a pretrained Encoder instance, shared by final_model below.
encoder = Encoder()
class AutoEncoder(nn.Module):
    """Autoencoder: a given encoder plus a convolutional decoder.

    The encoder's 64-d code is reshaped into a (B, 1, 8, 8) map, then
    upsampled 8 -> 40 -> 200 -> 500 pixels per side while the channel
    count grows up to 2048 before projecting back to 3 channels.

    WARNING(review): the late decoder layers hold activations of up to
    2048 channels at 500x500 resolution (~2 GiB each in fp32, before
    counting the gradients autograd keeps for the backward pass).  This
    architecture, not the batch size, is the likely source of the CUDA
    out-of-memory error even at batch_size=1.

    Args:
        enc: a module mapping input images to a 64-d code per sample.
    """

    def __init__(self, enc):
        super(AutoEncoder, self).__init__()
        self.encoder = enc
        self.decoder = nn.Sequential(
            nn.Conv2d(1, 64, 3, padding=1),
            nn.Upsample(scale_factor=5, mode='bicubic'),
            nn.ReLU(),
            nn.Conv2d(64, 128, 3, padding=1),
            nn.Upsample(scale_factor=5, mode='bicubic'),
            nn.ReLU(),
            nn.Conv2d(128, 256, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 512, 3, padding=1),
            nn.Upsample(scale_factor=2.5, mode='bicubic'),
            nn.ReLU(),
            nn.Conv2d(512, 1024, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(1024, 2048, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(2048, 3, 3, padding=1)
        )

    def forward(self, x):
        """Return (code, reconstruction) for input batch x."""
        code = self.encoder(x)
        # Bug fix: was code.view(1, 1, 8, 8), which hard-coded batch
        # size 1 and broke for any larger batch; -1 lets the batch
        # dimension follow the input.
        code2 = code.view(-1, 1, 8, 8)
        reconstructed = self.decoder(code2)
        return code, reconstructed
class MovementOrientation(nn.Module):
    """Regression head: predicts 4 values (2 movement + 2 orientation)
    from the 64-dimensional encoder code."""

    def __init__(self):
        super(MovementOrientation, self).__init__()
        # Single linear layer; a trailing ReLU was deliberately left out
        # so the head can emit negative movement/orientation values.
        self.mov_orie = nn.Sequential(nn.Linear(64, 4))

    def forward(self, x):
        """Map a (B, 64) code batch to (B, 4) predictions."""
        return self.mov_orie(x)
class final_model(nn.Module):
    """Multi-head model: an autoencoder (reconstruction) plus a
    movement/orientation regression head fed by the shared code."""

    def __init__(self):
        super(final_model, self).__init__()
        # Reuses the module-level pretrained `encoder` singleton.
        self.AutoEncoder = AutoEncoder(encoder)
        self.MovementOrientation = MovementOrientation()

    def forward(self, x):
        """Return (code, reconstruction, movement/orientation prediction)."""
        code, reconstruction = self.AutoEncoder(x)
        prediction = self.MovementOrientation(code)
        return code, reconstruction, prediction
My training module is this:
import torch
import torch.nn as nn
from torch.optim import Adam, SGD
import numpy as np
from dataset import Dataset
from torch.utils.data import DataLoader
from torchnet.meter import AverageValueMeter
from torchnet.logger import VisdomPlotLogger, VisdomSaver
from nets import final_model
def train(model, train_loader, valid_loader, exp_name="prova", lr=0.000001, epochs=1000, wd=0.00001):
    """Train `model` with a joint MSE loss (reconstruction + movement + orientation).

    Logs per-batch training curves and per-epoch curves to Visdom,
    checkpoints the best validation model, and saves the final weights.

    Args:
        model: a final_model returning (code, reconstruction, prediction).
        train_loader / valid_loader: DataLoaders yielding dicts with keys
            "image", "Movement" (>=4 columns), "image_intensity".
        exp_name: Visdom environment and checkpoint file prefix.
        lr, wd: Adam learning rate and weight decay.
        epochs: number of epochs.

    Returns:
        The trained model.
    """
    # Training loss
    criterion = nn.MSELoss()
    optimizer = Adam(model.parameters(), lr=lr, weight_decay=wd)

    # Meters (running averages over each epoch)
    lossAE_meter = AverageValueMeter()
    lossXZ_meter = AverageValueMeter()
    lossUV_meter = AverageValueMeter()
    total_loss_meter = AverageValueMeter()

    # Device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(device)
    model.to(device)

    loader = {"train": train_loader, "test": valid_loader}

    loss_AE_logger = VisdomPlotLogger('line', env=exp_name, opts={'title': 'LossAE', 'legend': ['train', 'test']})
    loss_XZ_logger = VisdomPlotLogger('line', env=exp_name, opts={'title': 'LossXZ', 'legend': ['train', 'test']})
    loss_UV_logger = VisdomPlotLogger('line', env=exp_name, opts={'title': 'LossUV', 'legend': ['train', 'test']})
    total_loss_logger = VisdomPlotLogger('line', env=exp_name, opts={'title': 'Total Loss', 'legend': ['train', 'test']})
    visdom_saver = VisdomSaver(envs=[exp_name])

    last_best_loss = np.inf
    for e in range(epochs):
        for mode in ["train", "test"]:
            lossAE_meter.reset()
            lossXZ_meter.reset()
            lossUV_meter.reset()
            total_loss_meter.reset()
            model.train() if mode == "train" else model.eval()
            # Enable gradients only during training.
            with torch.set_grad_enabled(mode == "train"):
                for i, batch in enumerate(loader[mode]):
                    x = batch["image"].to(device)
                    Movement = batch['Movement'][:, :2].float().to(device)
                    Orientation = batch['Movement'][:, 2:4].float().to(device)
                    target = batch["image_intensity"].to(device)  # image to reconstruct
                    code, reconstructed, MovementOrientation = model(x)
                    out1, out2 = MovementOrientation[:, :2], MovementOrientation[:, 2:4]
                    lossAE = criterion(reconstructed, target)
                    lossXZ = criterion(out1, Movement)
                    lossUV = criterion(out2, Orientation)
                    l = lossAE + lossXZ + lossUV
                    if mode == "train":
                        optimizer.zero_grad()
                        l.backward()
                        optimizer.step()
                    else:
                        # Bug fix: the original stored the loss *tensor*
                        # (`last_best_loss = l`), which keeps the tensor --
                        # and any graph attached to it -- alive across
                        # iterations.  Compare and store a plain float.
                        current = l.item()
                        if current < last_best_loss:
                            torch.save(model.state_dict(), 'Best_ %s.pth' % exp_name)
                            last_best_loss = current
                    n = x.shape[0]  # number of elements in the batch
                    lossAE_meter.add(lossAE.item() * n, n)
                    lossXZ_meter.add(lossXZ.item() * n, n)
                    lossUV_meter.add(lossUV.item() * n, n)
                    # Bug fix: total_loss_meter was reset but never fed.
                    total_loss_meter.add(l.item() * n, n)
                    if mode == "train":
                        step = e + (i + 1) / len(loader[mode])
                        loss_AE_logger.log(step, lossAE_meter.value()[0], name=mode)
                        loss_XZ_logger.log(step, lossXZ_meter.value()[0], name=mode)
                        loss_UV_logger.log(step, lossUV_meter.value()[0], name=mode)
                        total_loss_logger.log(step, total_loss_meter.value()[0], name=mode)
            # Log once per epoch for the current mode.  The original
            # repeated the per-batch logger calls unconditionally, which
            # double-logged the train curve and never drew the test curve.
            loss_AE_logger.log(e + 1, lossAE_meter.value()[0], name=mode)
            loss_XZ_logger.log(e + 1, lossXZ_meter.value()[0], name=mode)
            loss_UV_logger.log(e + 1, lossUV_meter.value()[0], name=mode)
            total_loss_logger.log(e + 1, total_loss_meter.value()[0], name=mode)
        visdom_saver.save()
    torch.save(model.state_dict(), '%s.pth' % exp_name)
    return model
def start_all():
    """Build the datasets/loaders and train the multi-head model.

    Returns:
        The trained model (bug fix: the original assigned the result of
        train() to `model_trained` and then silently discarded it).
    """
    torch.set_num_threads(4)
    model = final_model()
    # Define train and validation datasets and loaders.
    train_dataset = Dataset('./Dataset/Input', './Dataset/ToRec', './Dataset/Train.csv')
    valid_dataset = Dataset('./Dataset/Input', './Dataset/ToRec', './Dataset/Validation.csv')
    train_loader = DataLoader(train_dataset, batch_size=1, num_workers=2)
    valid_loader = DataLoader(valid_dataset, batch_size=1, num_workers=2)
    model_trained = train(model, train_loader, valid_loader, exp_name="MultiHead", epochs=300)
    return model_trained
If my batch size is one, how is it possible to get a CUDA out-of-memory error?
To be precise, the error is:
RuntimeError: CUDA out of memory. Tried to allocate 1.91 GiB (GPU 0; 10.92 GiB total capacity; 7.65 GiB already allocated; 698.12 MiB free; 1.82 GiB cached)