Error Thrown During Training. Please Help!

cjcanaday · January 6, 2019, 4:24pm

I am trying to train a Convolutional Neural Network to classify Cat and Dog Images and it is throwing an error that I can’t seem to solve.

Here is the whole error:

  File "train.py", line 70, in <module>
    main(config, args.resume)
  File "train.py", line 44, in main
    trainer.train()
  File "...\Coding\ConvNet\base\base_trainer.py", line 85, in train
    result = self._train_epoch(epoch)
  File "...Coding\ConvNet\trainer\trainer.py", line 52, in _train_epoch
    loss = self.loss(output, target)
  File "...\Coding\ConvNet\model\loss.py", line 4, in ces_loss
    return nn.CrossEntropyLoss(output,target)
  File "...\Anaconda3\envs\ConvNet\lib\site-packages\torch\nn\modules\loss.py", line 898, in __init__
    super(CrossEntropyLoss, self).__init__(weight, size_average, reduce, reduction)
  File "...\Anaconda3\envs\ConvNet\lib\site-packages\torch\nn\modules\loss.py", line 23, in __init__
    super(_WeightedLoss, self).__init__(size_average, reduce, reduction)
  File "...\Anaconda3\envs\ConvNet\lib\site-packages\torch\nn\modules\loss.py", line 16, in __init__
    self.reduction = _Reduction.legacy_get_string(size_average, reduce)
  File "...\Anaconda3\envs\ConvNet\lib\site-packages\torch\nn\_reduction.py", line 42, in legacy_get_string
    if size_average and reduce:
RuntimeError: bool value of Tensor with more than one value is ambiguous

I am relatively new to PyTorch, so I am using the project layout found in the GitHub repo listed below. I changed the train loader so that it uses ImageFolder.

From what I’ve seen previously, I assumed that I needed to add a check like:

if not None

but to no avail.

Here is my DataLoader Script:

from torchvision import datasets, transforms
from base import BaseDataLoader


class CatDogDataLoader(BaseDataLoader):
    """
    Cat and Dog image loader built on top of BaseDataLoader.

    Images are read from ``data_dir`` through torchvision's ``ImageFolder``.
    Each image is resized to ``IMG_SIZE`` x ``IMG_SIZE``, converted to a
    one-channel grayscale image and then turned into a tensor.

    Note:
        ``training`` is accepted for signature compatibility but is not
        used by this loader.
    """
    def __init__(self, data_dir, batch_size, shuffle, validation_split, num_workers, training=True, IMG_SIZE=50):
        self.IMG_SIZE = IMG_SIZE
        side = self.IMG_SIZE

        # Pre-processing applied to every image, in this order.
        preprocessing_steps = [
            transforms.Resize((side, side)),
            transforms.Grayscale(1),
            transforms.ToTensor(),
        ]
        pipeline = transforms.Compose(preprocessing_steps)

        self.data_dir = data_dir
        self.dataset = datasets.ImageFolder(self.data_dir, transform=pipeline)

        # Batching, shuffling and the validation split are delegated to the base class.
        super().__init__(self.dataset, batch_size, shuffle, validation_split, num_workers)

And here is my Trainer:

import numpy as np
import torch
from torchvision.utils import make_grid
from base import BaseTrainer


class Trainer(BaseTrainer):
    """
    Trainer that runs one training pass, and optionally one validation
    pass, per epoch.

    Note:
        Inherited from BaseTrainer. This class reads several attributes it
        does not set itself (``model``, ``loss``, ``metrics``, ``optimizer``,
        ``device``, ``writer``, ``logger``, ``verbosity``); they are presumably
        provided by BaseTrainer -- confirm against that class.
    """
    def __init__(self, model, loss, metrics, optimizer, resume, config,
                 data_loader, valid_data_loader=None, lr_scheduler=None, train_logger=None):
        super().__init__(model, loss, metrics, optimizer, resume, config, train_logger)
        self.config = config
        self.data_loader = data_loader
        self.valid_data_loader = valid_data_loader
        # Validation only runs when a validation loader was supplied.
        self.do_validation = self.valid_data_loader is not None
        self.lr_scheduler = lr_scheduler
        # Progress is logged every int(sqrt(batch_size)) batches.
        self.log_step = int(np.sqrt(data_loader.batch_size))

    def _eval_metrics(self, output, target):
        """
        Evaluate every configured metric on one batch.

        Each value is also written to ``self.writer`` under the metric
        function's ``__name__``.

        :param output: Model output for the batch.
        :param target: Targets for the batch.
        :return: Array with one value per metric, in ``self.metrics`` order.
        """
        scores = np.zeros(len(self.metrics))
        for position, metric_fn in enumerate(self.metrics):
            scores[position] += metric_fn(output, target)
            self.writer.add_scalar(metric_fn.__name__, scores[position])
        return scores

    def _train_epoch(self, epoch):
        """
        Run the training loop for a single epoch.

        :param epoch: Current training epoch.
        :return: A log dict with the mean loss and mean metrics over the
            epoch, merged with the validation log when validation is enabled.

        Note:
            To record additional information, for example:
                > additional_log = {"x": x, "y": y}
            merge it with the log before returning, i.e.
                > log = {**log, **additional_log}
                > return log
            The metrics in the log must have the key 'metrics'.
        """
        self.model.train()

        running_loss = 0
        running_metrics = np.zeros(len(self.metrics))
        for step, (inputs, labels) in enumerate(self.data_loader):
            inputs = inputs.to(self.device)
            labels = labels.to(self.device)

            # Standard optimisation step: clear grads, forward, backward, update.
            self.optimizer.zero_grad()
            predictions = self.model(inputs)
            batch_loss = self.loss(predictions, labels)
            batch_loss.backward()
            self.optimizer.step()

            # Global step = batches seen in earlier epochs + index in this one.
            self.writer.set_step((epoch - 1) * len(self.data_loader) + step)
            loss_value = batch_loss.item()
            self.writer.add_scalar('loss', loss_value)
            running_loss += loss_value
            running_metrics += self._eval_metrics(predictions, labels)

            should_log = self.verbosity >= 2 and step % self.log_step == 0
            if not should_log:
                continue

            self.logger.info('Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format(
                epoch,
                step * self.data_loader.batch_size,
                self.data_loader.n_samples,
                100.0 * step / len(self.data_loader),
                loss_value))
            self.writer.add_image('input', make_grid(inputs.cpu(), nrow=8, normalize=True))

        epoch_log = {
            'loss': running_loss / len(self.data_loader),
            'metrics': (running_metrics / len(self.data_loader)).tolist()
        }

        if self.do_validation:
            epoch_log.update(self._valid_epoch(epoch))

        if self.lr_scheduler is not None:
            self.lr_scheduler.step()

        return epoch_log

    def _valid_epoch(self, epoch):
        """
        Run one pass over the validation loader after a training epoch.

        :param epoch: Current training epoch (used to compute the writer step).
        :return: A log dict with the mean validation loss and metrics.

        Note:
            The validation metrics in the log must have the key 'val_metrics'.
        """
        self.model.eval()

        running_loss = 0
        running_metrics = np.zeros(len(self.metrics))
        # No gradients are needed while validating.
        with torch.no_grad():
            for step, (inputs, labels) in enumerate(self.valid_data_loader):
                inputs = inputs.to(self.device)
                labels = labels.to(self.device)

                predictions = self.model(inputs)
                batch_loss = self.loss(predictions, labels)

                self.writer.set_step((epoch - 1) * len(self.valid_data_loader) + step, 'valid')
                loss_value = batch_loss.item()
                self.writer.add_scalar('loss', loss_value)
                running_loss += loss_value
                running_metrics += self._eval_metrics(predictions, labels)
                self.writer.add_image('input', make_grid(inputs.cpu(), nrow=8, normalize=True))

        n_val_batches = len(self.valid_data_loader)
        return {
            'val_loss': running_loss / n_val_batches,
            'val_metrics': (running_metrics / n_val_batches).tolist()
        }

I can include some more code if needed.

Thanks.

alepack · January 6, 2019, 11:33pm

Hi

Looking at the error log, I think the problem is in the file “loss.py” when the line

return nn.CrossEntropyLoss(output, target)

is executed.
It seems that you’re returning an nn.CrossEntropyLoss module which is initialized with two arguments, “output” and “target”.
But the module constructor expects a weight tensor and some boolean flags (size_average and reduce), so “target” ends up in the size_average slot, and when the constructor evaluates that Tensor as a boolean it gives you that error.
I can’t see the code but I’m assuming you want to compute the loss given the model outputs and the targets without instantiating any modules.
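If this is the case, you can try torch.nn.functional.cross_entropy, e.g. return torch.nn.functional.cross_entropy(output, target). Alternatively, instantiate the module first and then call it: return nn.CrossEntropyLoss()(output, target).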

Hope this helps in finding the problem

stavnahum · July 1, 2019, 1:56pm

Hey @cjcanaday!
Did you manage to solve this issue?
I’m encountering the same problem and I’m not sure how to progress from here.

Cheers