I want to do multi-class classification (4 classes) and have adapted the code below. My table of features has shape (1046, 41) and the target has shape (1046,). How can I prepare the data loader for such a problem? As soon as I set batch_size > 1 I get an error, because my sequences vary in length from class to class:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-199-529330b2e568> in <module>()
2 model = ActionClassificationLSTM(input_features=41,
3 hidden_dim=50) #.double()
----> 4 sample = next(iter(valid_loader))
5 x, y = sample["sequences"], sample["label"]
6 print("input shape:{},target shape:{}".format(x.shape,y.shape))
5 frames
/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/collate.py in default_collate(batch)
54 storage = elem.storage()._new_shared(numel)
55 out = elem.new(storage)
---> 56 return torch.stack(batch, 0, out=out)
57 elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
58 and elem_type.__name__ != 'string_':
RuntimeError: stack expects each tensor to be equal size, but got [330, 41] at entry 0 and [451, 41] at entry 1
My setup:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
import torchmetrics

# We have 4 classes (labels must be 0-indexed, i.e. 0..3, for cross_entropy)
TOT_ACTION_CLASSES = 4
# LSTM classifier definition
class ActionClassificationLSTM(pl.LightningModule):
    def __init__(self, input_features, hidden_dim, learning_rate=0.001):
        super().__init__()
        # save hyperparameters
        self.save_hyperparameters()
        # The LSTM takes the feature sequences as inputs and outputs hidden
        # states with dimensionality hidden_dim.
        self.lstm = nn.LSTM(input_features, hidden_dim, batch_first=True)
        # The linear layer that maps from hidden state space to classes
        self.linear = nn.Linear(hidden_dim, TOT_ACTION_CLASSES)

    def forward(self, x):
        # run the LSTM over the whole sequence
        lstm_out, (ht, ct) = self.lstm(x)
        # classify from the final hidden state of the last layer
        return self.linear(ht[-1])

    def training_step(self, batch, batch_idx):
        # get data and labels from batch
        x, y = batch["sequences"], batch["label"]
        # reduce dimension
        # y = torch.squeeze(y)
        # convert to long
        y = y.long()
        # get prediction
        y_pred = self(x)
        # calculate loss
        loss = F.cross_entropy(y_pred, y)
        # get probability scores using softmax
        prob = F.softmax(y_pred, dim=1)
        # get the index of the max probability
        pred = torch.argmax(prob, dim=1)
        # calculate accuracy
        acc = torchmetrics.functional.accuracy(pred, y)
        dic = {
            'batch_train_loss': loss,
            'batch_train_acc': acc
        }
        # log the metrics for the PyTorch Lightning progress bar or any other operations
        self.log('batch_train_loss', loss, prog_bar=True)
        self.log('batch_train_acc', acc, prog_bar=True)
        # return loss and dict
        return {'loss': loss, 'result': dic}

    def training_epoch_end(self, training_step_outputs):
        # average training loss at the end of the epoch
        avg_train_loss = torch.tensor([x['result']['batch_train_loss'] for x in training_step_outputs]).mean()
        # average training accuracy at the end of the epoch
        avg_train_acc = torch.tensor([x['result']['batch_train_acc'] for x in training_step_outputs]).mean()
        # log the metrics for the PyTorch Lightning progress bar and any further processing
        self.log('train_loss', avg_train_loss, prog_bar=True)
        self.log('train_acc', avg_train_acc, prog_bar=True)

    def validation_step(self, batch, batch_idx):
        # get data and labels from batch
        x, y = batch["sequences"], batch["label"]
        # reduce dimension
        # y = torch.squeeze(y)
        # convert to long
        y = y.long()
        # get prediction
        y_pred = self(x)
        # calculate loss
        loss = F.cross_entropy(y_pred, y)
        # get probability scores using softmax
        prob = F.softmax(y_pred, dim=1)
        # get the index of the max probability
        pred = torch.argmax(prob, dim=1)
        # calculate accuracy
        acc = torchmetrics.functional.accuracy(pred, y)
        dic = {
            'batch_val_loss': loss,
            'batch_val_acc': acc
        }
        # log the metrics for the PyTorch Lightning progress bar and any further processing
        self.log('batch_val_loss', loss, prog_bar=True)
        self.log('batch_val_acc', acc, prog_bar=True)
        # return dict
        return dic

    def validation_epoch_end(self, validation_step_outputs):
        # average validation loss at the end of the epoch
        avg_val_loss = torch.tensor([x['batch_val_loss'] for x in validation_step_outputs]).mean()
        # average validation accuracy at the end of the epoch
        avg_val_acc = torch.tensor([x['batch_val_acc'] for x in validation_step_outputs]).mean()
        # log the metrics for the PyTorch Lightning progress bar and any further processing
        self.log('val_loss', avg_val_loss, prog_bar=True)
        self.log('val_acc', avg_val_acc, prog_bar=True)

    def configure_optimizers(self):
        # Adam optimiser
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
        # scheduler reduces the learning rate when the val_loss metric plateaus
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.5, patience=10, min_lr=1e-15, verbose=True)
        return {"optimizer": optimizer,
                "lr_scheduler": {"scheduler": scheduler, "interval": "epoch",
                                 "frequency": 1, "monitor": "val_loss"}}
from sklearn.model_selection import train_test_split

feature_columns = df2.columns[1:-1]
sequences = []
for score, group in df2[1:].groupby("score"):
    sequence_features = group[feature_columns]
    label = score
    sequences.append((sequence_features, label))

train_sequences, test_sequences = train_test_split(sequences, test_size=0.2)
len(train_sequences), len(test_sequences)
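(Post_data just wraps these (features, label) tuples; a minimal sketch of what it boils down to, assuming each item is returned as a dict with the "sequences" and "label" keys used in the training steps:)

import torch
from torch.utils.data import Dataset

class Post_data(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        features, label = self.sequences[idx]
        return {
            "sequences": torch.tensor(features.to_numpy(), dtype=torch.float32),
            "label": torch.tensor(label, dtype=torch.long),
        }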
train_set = Post_data(train_sequences)
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=1,
    shuffle=True,
)
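I suspect what I'm missing is a custom collate_fn that pads each batch to its longest sequence before stacking. Something like this sketch is what I have in mind (pad_collate is just a name I made up, and batch_size=4 is arbitrary), but I'm not sure it's the right approach:

import torch
from torch.nn.utils.rnn import pad_sequence

def pad_collate(batch):
    # batch is a list of dataset items, each {"sequences": (seq_len_i, 41), "label": scalar}
    seqs = [item["sequences"] for item in batch]
    labels = torch.stack([item["label"] for item in batch])
    # pad every sequence to the longest one in this batch -> (batch, max_len, 41)
    padded = pad_sequence(seqs, batch_first=True)
    return {"sequences": padded, "label": labels}

train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=4,
    shuffle=True,
    collate_fn=pad_collate,
)

I'm aware the LSTM would then also run over the padded time steps; presumably torch.nn.utils.rnn.pack_padded_sequence is the cleaner way to handle that, but padding alone should at least make the loader stop crashing. Is this the right way to prepare the data loader, or is there a better approach?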