In creating a simplified version of my code that recreates the problem, I think I have it narrowed down to some issue with my Dataset class.
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn.init as weight_init
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
class Model(nn.Module):
def __init__(self):
super(Model, self).__init__()
channels = 100
layers=1
# AutoEncoder for onehot representation
self.autoencoder_head = nn.Linear(4, channels)
self.autoencoder = nn.ModuleList([nn.Linear(channels, channels) for i in range(layers)])
self.autoencoder.append(nn.Linear(channels, 2))
self.encoded_map = nn.Linear(2, channels)
self.vel_map = nn.Linear(2, channels)
self.predictor = nn.ModuleList(
[nn.Linear(channels, channels) for i in range(layers)])
self.predictor.append(nn.Linear(channels, 1))
for m in self.modules():
if isinstance(m, (nn.Linear)):
weight_init.xavier_normal(m.weight.data)
def embedding(self, onehot):
x = self.autoencoder_head(onehot)
for layer in self.autoencoder:
x = F.relu(x)
x = layer(x)
return x
def forward(self, x):
material = x[:, :4]
vel_stress = x[:, 4:]
x = self.embedding(material)
x = self.encoded_map(x) + self.vel_map(vel_stress)
for layer in self.predictor:
x = F.relu(x)
x = layer(x)
return x
class TestDataset1(Dataset):
def __len__(self):
return 40000
def __getitem__(self, idx):
xy, material = divmod(idx, 4)
x, y = divmod(xy, 100)
x = x/100
y = y/100
if material == 0:
return np.array([1.0, 0, 0, 0, x, y]), np.array([.2 * x*y + .1 * x**2 - 10*x + 5])
elif material == 1:
return np.array([0, 1.0, 0, 0, x, y]), np.array([.3 * x*y - .1 * x**2 - 10*x + 5])
elif material == 2:
return np.array([0, 0, 1.0, 0, x, y]), np.array([-.1 * x*y - .1 * x**2 + .01 * y**2 - 10*x + 5])
else:
return np.array([0, 0, 0, 1.0, x, y]), np.array([.1 * x*y + .1 * x**2 - .1 * y**2 - 10*x + 5])
class TestDataset2(Dataset):
def __init__(self):
x, y = np.meshgrid(np.linspace(0, 1, 100), np.linspace(0, 1, 100))
self.m1 = .2 * x*y + .1 * x**2 - 10*x + 5
self.m2 = .3 * x*y - .1 * x**2 - 10*x + 5
self.m3 = -.1 * x*y - .1 * x**2 + .01 * y**2 - 10*x + 5
self.m4 = .1 * x*y + .1 * x**2 - .1 * y**2 - 10*x + 5
def __len__(self):
return 40000
def __getitem__(self, idx):
xy, material = divmod(idx, 4)
x_idx, y_idx = divmod(xy, 100)
x = x_idx/100
y = y_idx/100
if material == 0:
return np.array([1.0, 0, 0, 0, x, y]), self.m1[x_idx, y_idx]
elif material == 1:
return np.array([0, 1.0, 0, 0, x, y]), self.m2[x_idx, y_idx]
elif material == 2:
return np.array([0, 0, 1.0, 0, x, y]), self.m3[x_idx, y_idx]
else:
return np.array([0, 0, 0, 1.0, x, y]), self.m4[x_idx, y_idx]
def train(dataloader, model, optimizer, epoch, custom_loss=False):
model.train()
cum_loss = 0
for batch_idx, (data, target) in enumerate(dataloader):
dtype = torch.cuda.FloatTensor
data, target = data.type(dtype), target.type(dtype)
data = Variable(data)
target = Variable(target)
optimizer.zero_grad()
predicted = model(data)
loss_fn = nn.MSELoss()
if custom_loss:
loss = torch.mean((predicted - target)**2)
else:
loss = loss_fn(predicted, target)
# Update Model
loss.backward()
optimizer.step()
cum_loss = cum_loss + loss.data[0]
if epoch % 10 == 0:
print('Train epoch: {}\tLoss: {:.6f}'.format(epoch, cum_loss / batch_idx))
Training using TestDataset1 works as expected, with either loss function. However, when I use TestDataset2, it does not train properly with the custom loss function:
dataloader = DataLoader(TestDataset2(), batch_size=100, shuffle=True, pin_memory=True)
model = Model().float().cuda()
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
for epoch in range(41):
train(dataloader, model, optimizer, epoch, custom_loss=False)
Train epoch: 0 Loss: 6.635968
Train epoch: 10 Loss: 0.002086
Train epoch: 20 Loss: 0.000487
Train epoch: 30 Loss: 0.000299
Train epoch: 40 Loss: 0.000168
dataloader = DataLoader(TestDataset2(), batch_size=100, shuffle=True, pin_memory=True)
model = Model().float().cuda()
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
for epoch in range(41):
train(dataloader, model, optimizer, epoch, custom_loss=True)
Train epoch: 0 Loss: 8.424415
Train epoch: 10 Loss: 8.422460
Train epoch: 20 Loss: 8.423582
Train epoch: 30 Loss: 8.423183
Train epoch: 40 Loss: 8.422752
There seems to be some problem with using fixed numpy arrays to hold the training data, but it is not clear to me what I am doing wrong.
Thanks