Hello all,

I am training an autoencoder on tabular data with 272 features, most of which are sparse one-hot encodings. Training is incredibly slow; in fact, it appears to train faster on the CPU than on the GPU. **With a batch size of 5, one epoch takes 5 minutes.**

I’m hoping someone can provide some insight into why this is happening. Thank you in advance.

My setup is a very basic fully connected network:

```
class FCNetwork(nn.Module):
    """Fully Connected Network Class.

    Data flows through an input ``nn.Linear``, a hidden ``nn.Sequential``
    stack built by :meth:`init_hidden`, and an output ``nn.Linear``.
    """

    def __init__(self, n_input, layers, n_output, act=('relu', nn.ReLU()),
                 dropout=0.0):
        """
        :param n_input: Integer. Size of input vector.
        :param layers: Tuple containing the desired hidden layer widths,
            e.g. ``(128, 64)``. Must contain at least one entry.
        :param n_output: Integer. Size of output vector.
        :param act: Tuple ('name', act_func). The first element should be a
            string describing the activation function (used to name the
            sub-modules). The second element is the activation module itself.
            Default is ``('relu', nn.ReLU())``. The same module instance is
            inserted at every activation position, and the default instance
            is created once and shared by every network that relies on it,
            so pass a fresh instance for activations that hold parameters.
        :param dropout: Float. Dropout probability used by the ``nn.Dropout``
            layers of the hidden stack. Default is ``0.0``.
        """
        super().__init__()
        self.input = nn.Linear(n_input, layers[0])
        self.hidden = self.init_hidden(layers, activation=act, dropout=dropout)
        self.output = nn.Linear(layers[-1], n_output)

    def init_hidden(self, layers, activation, dropout=0.0):
        """
        Build the hidden stack.

        Layout: activation, then ``Linear -> activation -> Dropout`` for each
        consecutive pair of widths in ``layers``, then a closing activation.

        :param layers: Tuple of hidden layer widths.
        :param activation: Tuple ('name', act_func), see ``__init__``.
        :param dropout: Float. Probability passed to every ``nn.Dropout``.
        :return: ``nn.Sequential`` holding the named sub-modules.
        """
        a_name = activation[0]
        a_func = activation[1]

        modules = OrderedDict()
        # Activation applied to the output of the input layer.
        modules[f'{a_name}_in'] = a_func
        for i, (n_in, n_out) in enumerate(zip(layers[:-1], layers[1:])):
            modules[f'fc{i}'] = nn.Linear(n_in, n_out)
            modules[f'{a_name}{i}'] = a_func
            modules[f'drop{i}'] = nn.Dropout(p=dropout)
        # Closing activation. When the loop ran at least once this follows an
        # activation + dropout pair, i.e. the activation is applied twice at
        # the end of the stack; kept so existing models behave the same.
        modules[f'{a_name}_out'] = a_func
        return nn.Sequential(modules)

    def forward(self, x):
        """Cast ``x`` to float and run it through input, hidden and output layers."""
        x = x.float()
        x = self.input(x)
        x = self.hidden(x)
        return self.output(x)
```

And this is my training method:

```
def train_encoder(model, trainload, epochs, criterion=nn.MSELoss(), optimizer=optim.Adam, lr=1e-4, testload=None):
    """
    Train auto-encoder reconstruction for given number of epochs.

    Each batch is used as both the input and the reconstruction target.

    :param model: The network to train. It is moved to the GPU when one is
        available, otherwise to the CPU.
    :param trainload: a DataLoader object yielding the batches used for training.
    :param epochs: Number of times the network will view the entire data set
    :param criterion: Loss function. Default nn.MSELoss()
    :param optimizer: Learning method (optimizer class). Default optim.Adam
    :param lr: Learning Rate. Default 1e-4
    :param testload: Optional. a DataLoader containing the validation set. If included, both training and
        validation loss are tracked.
    :return: Tuple ``(train_loss, valid_loss)``: lists with one mean batch loss
        per epoch. ``valid_loss`` is empty when ``testload`` is None.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'Training model on {device}')
    model = model.to(device)
    opt = optimizer(model.parameters(), lr)
    train_loss = []
    valid_loss = []
    for _ in tqdm(range(epochs)):
        # Make sure every epoch (including the first) trains in train mode,
        # whatever mode the caller left the model in.
        model.train()
        # Accumulate on the device and read the value back once per epoch:
        # calling .item() on every batch forces a host/device sync per step.
        # float64 keeps the sum identical to summing Python floats.
        running_tl = torch.zeros((), dtype=torch.float64, device=device)
        for x in trainload:
            x = x.to(device).float()
            opt.zero_grad()
            loss = criterion(model(x), x)
            loss.backward()
            opt.step()
            running_tl += loss.detach()
        train_loss.append(running_tl.item() / len(trainload))

        if testload is not None:
            running_vl = torch.zeros((), dtype=torch.float64, device=device)
            model.eval()
            with torch.no_grad():
                for x in testload:
                    x = x.to(device).float()
                    running_vl += criterion(model(x), x)
            valid_loss.append(running_vl.item() / len(testload))
            model.train()
    return train_loss, valid_loss
```