I am running a PyTorch ANN model (for a classification task) and I am using sklearn's GridSearchCV with a skorch NeuralNetClassifier to search for the optimal hyperparameters.
When I run GridSearchCV using n_jobs=1, it runs really slowly.
When I set n_jobs greater than 1, I get a memory blow-out error. So I am now trying to see whether I can use PyTorch's DataLoader to split the dataset into batches and avoid the memory blow-out when n_jobs is greater than 1. According to another PyTorch Forum question (How to use Skorch for data that does not fit into memory?), it appears we could use SliceDataset.
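As I understand the suggestion in that thread, SliceDataset wraps a torch Dataset and exposes it with array-like indexing so that sklearn's GridSearchCV can split it. Roughly, the pattern looks like this (a minimal sketch, with X_tensor and y_tensor as placeholder names rather than my actual variables):

from torch.utils.data import TensorDataset
from skorch.helper import SliceDataset

# Wrap the features and labels in a Dataset, then expose each part through
# SliceDataset so that GridSearchCV can index/slice it like an array
ds = TensorDataset(X_tensor, y_tensor)
X_sl = SliceDataset(ds, idx=0)  # features
y_sl = SliceDataset(ds, idx=1)  # labels

Since I also want the data to be read in batches, my attempt, which additionally wraps a DataLoader, is as below: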
import torch
import torch.nn as nn

# Setting up artificial neural net model
class TabularModel(nn.Module):

    # Initialize parameters embeds, emb_drop, bn_cont and layers
    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        # Create empty list for the layers of the neural net
        layerlist = []
        # Total number of embedding columns for the categorical features
        n_emb = sum(nf for ni, nf in emb_szs)
        # Number of inputs for the first layer
        n_in = n_emb + n_cont

        for i in layers:
            # Set the linear function for the weights and biases, wX + b
            layerlist.append(nn.Linear(n_in, i))
            # Use the ReLU activation function
            layerlist.append(nn.ReLU(inplace=True))
            # Normalise the activation function output values
            layerlist.append(nn.BatchNorm1d(i))
            # Set some of the normalised activation function output values to zero
            layerlist.append(nn.Dropout(p))
            # Reassign number of inputs for the next layer
            n_in = i

        # Append the last layer
        layerlist.append(nn.Linear(layers[-1], out_sz))
        # Create sequential layers
        self.layers = nn.Sequential(*layerlist)

    # Function for the feedforward pass
    def forward(self, x_cat_cont):
        x_cat = x_cat_cont[:, 0:cat_train.shape[1]].type(torch.int64)
        x_cont = x_cat_cont[:, cat_train.shape[1]:].type(torch.float32)

        # Create empty list for the embedded categorical features
        embeddings = []
        # Embed categorical features
        for i, e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:, i]))
        # Concatenate embedded categorical features
        x = torch.cat(embeddings, 1)
        # Apply dropout to the embedded categorical features
        x = self.emb_drop(x)

        # Batch normalise continuous features
        x_cont = self.bn_cont(x_cont)
        # Concatenate categorical and continuous features
        x = torch.cat([x, x_cont], 1)
        # Feed categorical and continuous features into the neural net layers
        x = self.layers(x)
        return x
# Use cross entropy loss function since this is a classification problem
# Assign class weights to the loss function
criterion_skorch = nn.CrossEntropyLoss
# Use Adam solver with learning rate 0.001
optimizer_skorch = torch.optim.Adam
from skorch import NeuralNetClassifier
# Random seed chosen to ensure results are reproducible by using the same initial random weights and biases,
# and applying dropout rates to the same random embedded categorical features and neurons in the hidden layers
torch.manual_seed(0)
net = NeuralNetClassifier(module=TabularModel,
                          module__emb_szs=emb_szs,
                          module__n_cont=con_train.shape[1],
                          module__out_sz=2,
                          module__layers=[30],
                          module__p=0.0,
                          criterion=criterion_skorch,
                          criterion__weight=cls_wgt,
                          optimizer=optimizer_skorch,
                          optimizer__lr=0.001,
                          max_epochs=150,
                          device='cuda'
                          )
from sklearn.model_selection import GridSearchCV
param_grid = {'module__layers': [[30], [50, 20]],
              'module__p': [0.0],
              'max_epochs': [150, 175]
              }
from torch.utils.data import TensorDataset, DataLoader
from skorch.helper import SliceDataset
# cat_con_train and y_train are PyTorch tensors
tsr_ds = TensorDataset(cat_con_train.cpu(), y_train.cpu())
torch.manual_seed(0)  # Set random seed so that the shuffling results are reproducible
d_loader = DataLoader(tsr_ds, batch_size=100000, shuffle=True)
d_loader_slice_X = SliceDataset(d_loader, idx=0)
d_loader_slice_y = SliceDataset(d_loader, idx=1)
models = GridSearchCV(net, param_grid, scoring='roc_auc', n_jobs=2).fit(d_loader_slice_X, d_loader_slice_y)
However, when I ran this code, I got the following error message:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-47-df3fc792ad5e> in <module>()
104
--> 105 models = GridSearchCV(net, param_grid, scoring='roc_auc', n_jobs=2).fit(d_loader_slice_X, d_loader_slice_y)
106
6 frames
/usr/local/lib/python3.6/dist-packages/skorch/helper.py in __getitem__(self, i)
230 def __getitem__(self, i):
231 if isinstance(i, (int, np.integer)):
--> 232 Xn = self.dataset[self.indices_[i]]
233 Xi = self._select_item(Xn)
234 return self.transform(Xi)
TypeError: 'DataLoader' object does not support indexing
I am now confused because, in the PyTorch Forum question I mentioned above, someone said that SliceDataset could work with a DataLoader, yet I am getting the error above. What is wrong, and/or how do I fix this?
Many many thanks in advance!