I am using pytorch lightning and have defined my training epoch end and validation epoch end as follows:
def training_step(self, batch, batch_index):
x, y, w = batch
return {"loss": some_loss}
def training_epoch_end(self, outputs: EPOCH_OUTPUT) -> None:
loss = torch.stack([x["loss"] for x in outputs]).mean()
self.log("train/loss", loss)
def validation_step(self, batch, batch_index):
x, y, w = batch
return {"val/loss": some_loss}
def validation_epoch_end(self, outputs: EPOCH_OUTPUT) -> None:
loss = torch.stack([x["val/loss"] for x in outputs]).mean()
self.log("val/loss", loss)
Now, I have data loaders like:
def train_dataloader(self):
weighted_sampler = WeightedRandomSampler(...)
return DataLoader(
self.data_train,
shuffle=False,
sampler=weighted_sampler,
batch_size=self.batch_size, #64
num_workers=self.num_workers,
pin_memory=True,
persistent_workers=True,
)
def val_dataloader(self):
if self.data_val is None:
return None
weighted_sampler = WeightedRandomSampler(...)
return DataLoader(
self.data_val,
shuffle=False,
sampler=weighted_sampler,
batch_size=self.batch_size, #64
num_workers=self.num_workers,
pin_memory=True,
persistent_workers=True,
)
The issue is the training loop is fine but when I run the validation loop, it comes back with:
loss = torch.stack([x["loss"] for x in outputs]).mean()
RuntimeError: stack expects each tensor to be equal size, but got [64, 1] at entry 0 and [35, 1] at entry 1658
Do you think this is because of the batch_size
which is set to 64 and maybe gets truncated at the last slice? How can I solve this?