I am training a small network on ca CPU. It trains by reading through .csv files in a folder consecutively. Here is my training function:
def train(epoch):
model.train()
train_loss = 0
file_number = 0
dataset_idx = [idx for idx in range(num_folders)]
for dataset in dataset_idx:
data, _ = data_folder.__getitem__(dataset)
train_loader = torch.utils.data.DataLoader(data(),
batch_size=64,
shuffle=False,
num_workers=4)
for batch_idx, data in enumerate(train_loader):
#scale the data
scaler = MinMaxScaler()
data = scaler.fit_transform(data)
data = torch.from_numpy(data)
data = data.to(device)
optimizer.zero_grad()
output = model(data)
data = data.unsqueeze(dim=1)
loss_func = loss(output.float(), data.float())
loss_func.backward()
train_loss += loss_func
optimizer.step()
if batch_idx % log_interval == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader),
loss_func / len(data)))
print('Training from the ', file_number, " file finished.")
file_number += 1
print('====> Epoch: {} Average loss: {:.4f}'.format(
epoch, train_loss / len(train_loader.dataset)))
Training from the first .csv file works fine but then it uses an enormous amount of memory and I get:
OSError: [Errno 12] Cannot allocate memory
I understand my code is leaking memory, I just don’t see where. Furthermore, after getting the error, jupyter notebook keeps the ram occupied until I kill the kernel. Any ideas?