Different batch sizes give different test accuracies

I am trying to test my model with different batch sizes and I am getting different accuracies for different batch sizes. here is my test snippet. (one_hot is False!)

    for idx, data in enumerate(test_loader):
        # if idx == 1:
        #     break
        # print(model.training)
        test_x, label = data['input'], data['label']
        # print(test_x)
        # print(test_x.shape)
        # this = test_x.numpy().squeeze(0).transpose(1,2,0)
        # print(this.shape, np.min(this), np.max(this))
        if cuda:
            test_x = test_x.cuda(device=device)
            label = label.cuda(device=device)
        # forward
        out_x, pred = model.forward(test_x)
        loss = criterion(out_x, label)
        un_confusion_meter.add(predicted=pred, target=label)
        confusion_meter.add(predicted=pred, target=label)

        ###############################
        # pred = pred.view(-1)
        # pred = pred.cpu().numpy()
        # label = label.cpu().numpy()
        # print(pred.shape, label.shape)
        ###############################

        # get accuracy metric
        # correct_count += np.sum((pred == label))
        # print(pred, label)
        # get accuracy metric
        if 'one_hot' in kwargs.keys():
            if kwargs['one_hot']:
                batch_correct = (torch.argmax(label, dim=1).eq(pred.long())).double().sum().item()
        else:
            batch_correct = (label.eq(pred.long())).double().sum().item()
        correct_count += batch_correct
        # print(batch_correct)
        total_count += np.float(batch_size)
        net_loss.append(loss.item())
        if idx % log_after == 0:
            print('log: on {}'.format(idx))

        #################################
    mean_loss = np.asarray(net_loss).mean()
    mean_accuracy = correct_count * 100 / total_count
    print(correct_count, total_count)
    print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
    print('log: test:: total loss = {:.5f}, total accuracy = {:.5f}%'.format(mean_loss, mean_accuracy))
    print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')

I have tried to fix all sorts of problems. Model is in .eval() mode and the function is decorated by torch.no_grad() as well. I can’t find any solution to this. Thanks

Do you have any random operations using the functional API, i.e. F.dropout()?
If so, could you check to set the training parameter accordingly, as it might not be set by model.eval().

How are you calculating pred inside your model?

As a small side note: you shouldn’t use model.forward() but rather call the model directly model(test_x).

1 Like

Thank you so much for such a quick reply ptrblck! this is my model definition

class VGG_5(nn.Module):
    """
            The following is an implementation of the lasagne based binarized VGG network, but with floating point weights
    """
    def __init__(self):
        super(VGG_5, self).__init__()
        # need some pretrained help!
        graph = models.vgg11(pretrained=True)
        graph_layers = list(graph.features)
        for i, layer in enumerate(graph_layers):
            print('{}.'.format(i), layer)
        drop_rate = 0.5
        activator = nn.Tanh()
        self.feauture_exctractor = nn.Sequential(
            nn.Conv2d(in_channels=5, out_channels=64, kernel_size=3, padding=1),
            nn.MaxPool2d(kernel_size=2),
            nn.BatchNorm2d(num_features=64, eps=1e-4, momentum=0.2),
            # nn.ReLU(),
            activator,

            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1),
            nn.MaxPool2d(kernel_size=2),
            nn.BatchNorm2d(num_features=64, eps=1e-4, momentum=0.2),
            # nn.ReLU(),
            activator,
            nn.Dropout2d(drop_rate),

            # nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
            graph_layers[3], # pretrained on imagenet
            nn.BatchNorm2d(num_features=128, eps=1e-4, momentum=0.2),
            # nn.ReLU(),
            activator,

            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1),
            nn.MaxPool2d(kernel_size=2),
            nn.BatchNorm2d(num_features=128, eps=1e-4, momentum=0.2),
            # nn.ReLU(),
            activator,
            nn.Dropout(drop_rate),

            # nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1),
            graph_layers[6], # pretrained on imagenet
            nn.MaxPool2d(kernel_size=2),
            nn.BatchNorm2d(num_features=256, eps=1e-4, momentum=0.2),
            # nn.ReLU(),
            activator,

            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1),
            nn.MaxPool2d(kernel_size=2),
            nn.BatchNorm2d(num_features=256, eps=1e-4, momentum=0.2),
            # nn.ReLU(),
            activator,
            nn.Dropout2d(drop_rate),
        )
        self.fc = nn.Sequential(
            nn.Linear(in_features=256 * 2 * 2, out_features=512),
            nn.BatchNorm1d(num_features=512, eps=1e-4, momentum=0.2),
            # nn.ReLU(),
            activator,
            nn.Dropout(drop_rate),
            nn.Linear(in_features=512, out_features=512),
            nn.BatchNorm1d(num_features=512, eps=1e-4, momentum=0.2),
            # nn.ReLU(),
            activator,
            nn.Linear(in_features=512, out_features=10),
            # nn.BatchNorm1d(num_features=10),
        )
        pass

    def forward(self, *input):
        x, = input
        x = self.feauture_exctractor(x)
        x = x.view(-1, 256*2*2)
        x = self.fc(x)
        return x, torch.argmax(input=x, dim=1)

so no F.anything() and here is my entire testing code (the one that runs here is the outermost else)

@torch.no_grad()
def eval_net(**kwargs):
    model = kwargs['model']
    cuda = kwargs['cuda']
    device = kwargs['device']
    if cuda:
        model.cuda(device=device)
    if 'criterion' in kwargs.keys():
        writer = kwargs['writer']
        val_loader = kwargs['val_loader']
        criterion = kwargs['criterion']
        global_step = kwargs['global_step']
        correct_count, total_count = 0, 0
        net_loss = []
        model.eval()  # put in eval mode first ############################
        print('evaluating with batch size = 1')
        for idx, data in enumerate(val_loader):
            test_x, label = data['input'], data['label']
            if cuda:
                test_x = test_x.cuda(device=device)
                label = label.cuda(device=device)
            # forward
            out_x, pred = model.forward(test_x)
            loss = criterion(out_x, label)
            net_loss.append(loss.item())

            # get accuracy metric
            if kwargs['one_hot']:
                batch_correct = (torch.argmax(label, dim=1).eq(pred.long())).double().sum().item()
            else:
                batch_correct = (label.eq(pred.long())).double().sum().item()
            correct_count += batch_correct
            total_count += np.float(pred.size(0))
        #################################
        mean_accuracy = correct_count / total_count * 100
        mean_loss = np.asarray(net_loss).mean()
        # summarize mean accuracy
        writer.add_scalar(tag='val. loss', scalar_value=mean_loss, global_step=global_step)
        writer.add_scalar(tag='val. over_all accuracy', scalar_value=mean_accuracy, global_step=global_step)
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        print('log: validation:: total loss = {:.5f}, total accuracy = {:.5f}%'.format(mean_loss, mean_accuracy))
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')

    else:
        # model, images, labels, pre_model, save_dir, sum_dir, batch_size, lr, log_after, cuda
        pre_model = kwargs['pre_model']
        base_folder = kwargs['base_folder']
        batch_size = kwargs['batch_size']
        log_after = kwargs['log_after']
        criterion = nn.CrossEntropyLoss()
        un_confusion_meter = tnt.meter.ConfusionMeter(10, normalized=False)
        confusion_meter = tnt.meter.ConfusionMeter(10, normalized=True)
        model.load_state_dict(torch.load(pre_model))
        print('log: resumed model {} successfully!'.format(pre_model))
        _, _, test_loader  = get_dataloaders(base_folder=base_folder, batch_size=batch_size)
        net_accuracy, net_loss = [], []
        correct_count = 0
        total_count = 0
        print('batch size = {}'.format(batch_size))
        model.eval()  # put in eval mode first
        for idx, data in enumerate(test_loader):
            # if idx == 1:
            #     break
            # print(model.training)
            test_x, label = data['input'], data['label']
            # print(test_x)
            # print(test_x.shape)
            # this = test_x.numpy().squeeze(0).transpose(1,2,0)
            # print(this.shape, np.min(this), np.max(this))
            if cuda:
                test_x = test_x.cuda(device=device)
                label = label.cuda(device=device)
            # forward
            out_x, pred = model(test_x)
            loss = criterion(out_x, label)
            un_confusion_meter.add(predicted=pred, target=label)
            confusion_meter.add(predicted=pred, target=label)

            ###############################
            # pred = pred.view(-1)
            # pred = pred.cpu().numpy()
            # label = label.cpu().numpy()
            # print(pred.shape, label.shape)
            ###############################

            # get accuracy metric
            # correct_count += np.sum((pred == label))
            # print(pred, label)
            # get accuracy metric
            if 'one_hot' in kwargs.keys():
                if kwargs['one_hot']:
                    batch_correct = (torch.argmax(label, dim=1).eq(pred.long())).double().sum().item()
            else:
                batch_correct = (label.eq(pred.long())).sum().item()
            # print(label.shape, pred.shape)
            # break
            correct_count += batch_correct
            # print(batch_correct)
            total_count += np.float(batch_size)
            net_loss.append(loss.item())
            if idx % log_after == 0:
                print('log: on {}'.format(idx))

            #################################
        mean_loss = np.asarray(net_loss).mean()
        mean_accuracy = correct_count * 100 / total_count
        print(correct_count, total_count)
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        print('log: test:: total loss = {:.5f}, total accuracy = {:.5f}%'.format(mean_loss, mean_accuracy))
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')

        with open('normalized.pkl', 'wb') as this:
            pkl.dump(confusion_meter.value(), this, protocol=pkl.HIGHEST_PROTOCOL)
        with open('un_normalized.pkl', 'wb') as this:
            pkl.dump(un_confusion_meter.value(), this, protocol=pkl.HIGHEST_PROTOCOL)
        pass
    pass

I can’t see any problem with this thing.

and btw, my accuracy keeps jumping with different batch sizes. from 93% to 98.31% for different batch sizes. I trained it with batch size of 256 and testing it with 256, 257, 200, 1, 300, 512 and all give somewhat different results while 1, 200, 300 give 98.31%. Strange… (and I fixed it to call model() directly rather than its forward function as well)

Could you please tell me the input shape?
I’m currently trying to reproduce the issue and apparently [batch_size, 5, 128, 128] is working.
Is it the right shape as the output looks strange.

It’s apparently [batch_size, 5, 64, 64]?

EDIT:
I assume you are using the else branch in your code.
Could you add the cast to batch_correct like in the upper branch:

# Change
batch_correct = (label.eq(pred.long())).sum().item()
# to
batch_correct = (label.eq(pred.long())).double().sum().item() # or .float()

yes, I have five channels in my images (batch_size, 5, 64, 64) because they are coming from sentinel satellite.

Actually I have already used double() before because I read on this forum that equating pytorch tensors return a byte tensor so it is a good idea to cast them to double. But it still doesn’t work.
Just checked it now with batch size 1 (98.31%)
image
and batch size 512 (94.26%).
image

and I asked a similar question sometime ago. but the problem was different back then.

Input shape -> (batch_size, 5, 64, 64)
fixed the .double() thing too. Still doesn’t work

OK, I see. The float/double cast was only necessary in older versions, but I assumed it could be the mistake as I’ve found it in the other branch.

Regarding the other thread, it looks like you are performing some random transformations on your dataset.
Are you performing the same random transforms on the eval set currently?

Also, could you check, if this code snippet returns True?

model = VGG_5()
model.eval()
x = torch.randn(10, 5, 64, 64)
output_all, pred_all = model(x)
output_1, pred_1 = model(x[:5])
output_2, pred_2 = model(x[5:])
output_stacked = torch.cat((output_1, output_2), dim=0)

print(torch.allclose(output_all, output_stacked))

If so, the issue is probably located in the data not the model.

2 Likes

okay. So I have three sets, training, validation, test, and those transformations are only applied on the training set. Before dataloading I am saying

random.seed(74)

so that I get the same test, train split every time because I am reading from a folder directly without having train/test split image filenames declared

def get_dataloaders(base_folder, batch_size, one_hot=False):
    print('inside dataloading code...')

    class dataset(Dataset):
        def __init__(self, data_dictionary, bands, mode='train'):
            super(dataset, self).__init__()
            self.example_dictionary = data_dictionary
            # with open(mode+'.txt', 'wb') as this:
            #     this.write(json.dumps(self.example_dictionary))
            self.bands = bands # bands are a list bands to use as data, pass them as a list []
            self.mode = mode
            self.max = 0
            pass

        def __getitem__(self, k):
            example_path, label_name = self.example_dictionary[k]
            # print(example_path, label_name)
            # example is a tiff image, need to use gdal
            this_example = gdal.Open(example_path)
            this_label = all_labels[label_name]
            if one_hot:
                label_arr = np.zeros(10)
                label_arr[this_label] = 1
            # print(this_label, label_arr)
            example_array = this_example.GetRasterBand(self.bands[0]).ReadAsArray()
            for i in self.bands[1:]:
                example_array = np.dstack((example_array,
                                           this_example.GetRasterBand(i).ReadAsArray())).astype(np.int16)

            # transforms
            if self.mode == 'train':
                example_array = np.squeeze(seq.augment_images(
                    (np.expand_dims(example_array, axis=0))), axis=0)
                pass

            # range of vals = [0,1]
            example_array = np.clip((example_array.astype(np.float)/4096), a_min=0, a_max=1) # just to bring those values down
            # range of vals = [-1,1]
            example_array = 2*example_array-1

            # max value in test set is 28000
            # this_max = example_array.max()
            # if this_max > self.max:
            #     self.max = this_max
            # print(example_array.max(), example_array.min(), example_array.mean())

            example_array = toTensor(image=example_array)
            if one_hot:
                return {'input': example_array, 'label': torch.LongTensor(label_arr)}
            return {'input': example_array, 'label': this_label}


        def __len__(self):
            return len(self.example_dictionary)

    # create training set examples dictionary
    all_examples = {}
    for folder in sorted(os.listdir(base_folder)):
        # each folder name is a label itself
        # new folder, new dictionary!
        class_examples = []
        inner_path = os.path.join(base_folder, folder)
        for image in [x for x in os.listdir(inner_path) if x.endswith('.tif')]:
            image_path = os.path.join(inner_path, image)
            # for each index as key, we want to have its path and label as its items
            class_examples.append(image_path)
        all_examples[folder] = class_examples

    # split them into train and test
    train_dictionary, val_dictionary, test_dictionary = {}, {}, {}
    for class_name in all_examples.keys():
        class_examples = all_examples[class_name]
        # print(class_examples)
        random.shuffle(class_examples)

        total = len(class_examples)
        train_count = int(total * 0.8); train_ = class_examples[:train_count]
        test = class_examples[train_count:]

        total = len(train_)
        train_count = int(total * 0.9); train = train_[:train_count]
        validation = train_[train_count:]

        for example in train:
            train_dictionary[len(train_dictionary)] = (example, class_name)
        for example in test:
            test_dictionary[len(test_dictionary)] = (example, class_name)
        for example in validation:
            val_dictionary[len(val_dictionary)] = (example, class_name)


    # create dataset class instances
    bands = [4, 3, 2, 5, 8] # these are [NIR, Vegetation Red Edge, Red, Green, Blue] bands
    train_data = dataset(data_dictionary=train_dictionary, bands=bands, mode='train')
    val_data = dataset(data_dictionary=val_dictionary, bands=bands, mode='eval')
    test_data = dataset(data_dictionary=test_dictionary, bands=bands, mode='test')
    print('train examples =', len(train_dictionary), 'val examples =', len(val_dictionary),
          'test examples =', len(test_dictionary))

    train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size,
                                  shuffle=True, num_workers=4)
    val_dataloader = DataLoader(dataset=val_data, batch_size=batch_size,
                                shuffle=True, num_workers=4)
    test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size,
                                 shuffle=True, num_workers=4)

    return train_dataloader, val_dataloader, test_dataloader

I check for self.mode and only apply those transformations to the training set. I don’t think it’s creating any problems. And I ran your snippet and it returns True!

and I checked all my dataloaders before passing, I also viewed them with the corresponding labels and it was all okay. let me check again.

If the code snippet returns True, it might show that your model is working OK.
Is the shuffling of the dataset somehow seeded, i.e. are you getting the same split every time?

okay, so I found a huuge mistake but fixing it still doesn’t solve my problem. My data split was different every time I found out by comparing those dictionaries, but now I have fixed it like this

def get_dataloaders(base_folder, batch_size, one_hot=False):
    print('inside dataloading code...')

    class dataset(Dataset):
        def __init__(self, data_dictionary, bands, mode='train'):
            super(dataset, self).__init__()
            self.example_dictionary = data_dictionary
            # with open(mode+'.txt', 'wb') as this:
            #     this.write(json.dumps(self.example_dictionary))
            self.bands = bands # bands are a list bands to use as data, pass them as a list []
            self.mode = mode
            self.max = 0
            pass

        def __getitem__(self, k):
            example_path, label_name = self.example_dictionary[k]
            # print(example_path, label_name)
            # example is a tiff image, need to use gdal
            this_example = gdal.Open(example_path)
            this_label = all_labels[label_name]
            if one_hot:
                label_arr = np.zeros(10)
                label_arr[this_label] = 1
            # print(this_label, label_arr)
            example_array = this_example.GetRasterBand(self.bands[0]).ReadAsArray()
            for i in self.bands[1:]:
                example_array = np.dstack((example_array,
                                           this_example.GetRasterBand(i).ReadAsArray())).astype(np.int16)

            # transforms
            if self.mode == 'train':
                example_array = np.squeeze(seq.augment_images(
                    (np.expand_dims(example_array, axis=0))), axis=0)
                pass

            # range of vals = [0,1]
            example_array = np.clip((example_array.astype(np.float)/4096), a_min=0, a_max=1) # just to bring those values down
            # range of vals = [-1,1]
            example_array = 2*example_array-1

            # max value in test set is 28000
            # this_max = example_array.max()
            # if this_max > self.max:
            #     self.max = this_max
            # print(example_array.max(), example_array.min(), example_array.mean())

            example_array = toTensor(image=example_array)
            if one_hot:
                return {'input': example_array, 'label': torch.LongTensor(label_arr)}
            return {'input': example_array, 'label': this_label}


        def __len__(self):
            return len(self.example_dictionary)

    # create training set examples dictionary
    all_examples = {}
    for folder in sorted(os.listdir(base_folder)):
        # each folder name is a label itself
        # new folder, new dictionary!
        class_examples = []
        inner_path = os.path.join(base_folder, folder)
        #####################################3 this was a problem for a long time now.. because of not sorting it
        all_images_of_current_class = [x for x in os.listdir(inner_path) if x.endswith('.tif')]
        all_images_of_current_class.sort(key=lambda f: int(filter(str.isdigit, f)))
        # if folder == 'Forest':
        #     print(all_images_of_current_class)
        for image in all_images_of_current_class:
            # dirFiles.sort(key=lambda f: int(filter(str.isdigit, f)))
            # print(image)
            image_path = os.path.join(inner_path, image)
            # for each index as key, we want to have its path and label as its items
            class_examples.append(image_path)
        all_examples[folder] = class_examples

    # split them into train and test
    train_dictionary, val_dictionary, test_dictionary = {}, {}, {}
    for class_name in all_examples.keys():
        class_examples = all_examples[class_name]
        # print(class_examples)
        ########################## this doesn't work
        # random.shuffle(class_examples)
        ########################### but this does
        random.Random(4).shuffle(class_examples)

        total = len(class_examples)
        train_count = int(total * 0.8); train_ = class_examples[:train_count]
        test = class_examples[train_count:]

        total = len(train_)
        train_count = int(total * 0.9); train = train_[:train_count]
        validation = train_[train_count:]

        for example in train:
            train_dictionary[len(train_dictionary)] = (example, class_name)
        for example in test:
            test_dictionary[len(test_dictionary)] = (example, class_name)
        for example in validation:
            val_dictionary[len(val_dictionary)] = (example, class_name)

    # # test dataset
    with open('train1.txt', 'wb') as train_check:
        for k in range(len(train_dictionary)):
            train_check.write('{}\n'.format(train_dictionary[k][0]))


    # create dataset class instances
    bands = [4, 3, 2, 5, 8] # these are [NIR, Vegetation Red Edge, Red, Green, Blue] bands
    train_data = dataset(data_dictionary=train_dictionary, bands=bands, mode='train')
    val_data = dataset(data_dictionary=val_dictionary, bands=bands, mode='eval')
    test_data = dataset(data_dictionary=test_dictionary, bands=bands, mode='test')
    print('train examples =', len(train_dictionary), 'val examples =', len(val_dictionary),
          'test examples =', len(test_dictionary))

    train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size,
                                  shuffle=True, num_workers=4)
    val_dataloader = DataLoader(dataset=val_data, batch_size=batch_size,
                                shuffle=True, num_workers=4)
    test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size,
                                 shuffle=True, num_workers=4)

    return train_dataloader, val_dataloader, test_dataloader, test_dictionary

and tested it like this to compare data generated across runs, and the difference is zero now


def check_data_sanity():
    train, val, _, test1 = get_dataloaders(base_folder='/home/annus/Desktop/'
                                                      'projects/forest_cover_change/'
                                                      'eurosat/images/tif/',
                                          batch_size=16)

    train, val, _, test2 = get_dataloaders(base_folder='/home/annus/Desktop/'
                                                      'projects/forest_cover_change/'
                                                      'eurosat/images/tif/',
                                          batch_size=16)


    train, val, _, test3 = get_dataloaders(base_folder='/home/annus/Desktop/'
                                                      'projects/forest_cover_change/'
                                                      'eurosat/images/tif/',
                                          batch_size=16)

    # shared_items = {k: test1[k] for k in test1 if k in test2 and test1[k] == test2[k]}
    # print(len(shared_items), len(test1), len(test2))
    def get_dict_diff(d1, d2):
        return len(set(d1.values()) - set(d2.values()))

    import pickle
    # with open('test1.pkl', 'wb') as ts1:
    #     pickle.dump(test1, ts1, protocol=pickle.HIGHEST_PROTOCOL)

    with open('test1.pkl', 'rb') as ts1:
        test1_old = pickle.load(ts1)

    print(get_dict_diff(test2, test1_old))
    # print(get_dict_diff(test1, test2))
    # print(get_dict_diff(test1, test2))

and tested my model again and I am still getting different accuracies. But I am convinced that it must be a problem with the way I load the data and still it is getting different split every time

I have tested the dataloader’s test set by getting test set multiple times in the same run as well as across different runs, and there is no difference now. Here is the updated test now

def check_data_sanity():
    train, val, _, test1 = get_dataloaders(base_folder='/home/annus/Desktop/'
                                                      'projects/forest_cover_change/'
                                                      'eurosat/images/tif/',
                                          batch_size=16)

    train, val, _, test2 = get_dataloaders(base_folder='/home/annus/Desktop/'
                                                      'projects/forest_cover_change/'
                                                      'eurosat/images/tif/',
                                          batch_size=16)


    train, val, _, test3 = get_dataloaders(base_folder='/home/annus/Desktop/'
                                                      'projects/forest_cover_change/'
                                                      'eurosat/images/tif/',
                                          batch_size=16)

    def get_dict_diff(d1, d2):
        return len(set(d1.values()) - set(d2.values()))

    # compare on the same run
    print(get_dict_diff(test1, test2))
    print(get_dict_diff(test2, test3))
    print(get_dict_diff(test3, test1))

    # compare across runs
    import pickle
    with open('test1.pkl', 'rb') as ts1:
        test1_old = pickle.load(ts1)
    print(get_dict_diff(test2, test1_old))

okay so finally after all of this, I decided to save the pickle files for my train and test split to make sure same data loads every time and tested again but I still get different accuracy every time. This is my new loader code

def get_dataloaders(base_folder, batch_size, one_hot=False):
    print('inside dataloading code...')

    class dataset(Dataset):
        def __init__(self, data_dictionary, bands, mode='train'):
            super(dataset, self).__init__()
            self.example_dictionary = data_dictionary
            # with open(mode+'.txt', 'wb') as this:
            #     this.write(json.dumps(self.example_dictionary))
            self.bands = bands # bands are a list bands to use as data, pass them as a list []
            self.mode = mode
            self.max = 0
            pass

        def __getitem__(self, k):
            example_path, label_name = self.example_dictionary[k]
            # print(example_path, label_name)
            # example is a tiff image, need to use gdal
            this_example = gdal.Open(example_path)
            this_label = all_labels[label_name]
            if one_hot:
                label_arr = np.zeros(10)
                label_arr[this_label] = 1
            # print(this_label, label_arr)
            example_array = this_example.GetRasterBand(self.bands[0]).ReadAsArray()
            for i in self.bands[1:]:
                example_array = np.dstack((example_array,
                                           this_example.GetRasterBand(i).ReadAsArray())).astype(np.int16)

            # transforms
            if self.mode == 'train':
                example_array = np.squeeze(seq.augment_images(
                    (np.expand_dims(example_array, axis=0))), axis=0)
                pass

            # range of vals = [0,1]
            example_array = np.clip((example_array.astype(np.float)/4096), a_min=0, a_max=1) # just to bring those values down
            # range of vals = [-1,1]
            example_array = 2*example_array-1

            # max value in test set is 28000
            # this_max = example_array.max()
            # if this_max > self.max:
            #     self.max = this_max
            # print(example_array.max(), example_array.min(), example_array.mean())

            example_array = toTensor(image=example_array)
            if one_hot:
                return {'input': example_array, 'label': torch.LongTensor(label_arr)}
            return {'input': example_array, 'label': this_label}


        def __len__(self):
            return len(self.example_dictionary)


    """
        Okay so here is how we do it. We save the train, test and validation dictionaries if they don't exist, 
        and once they do, we load the preexisting ones to help us!
    """
    # check if we already have the data saved with us...
    count_data = 0 # count tells us what to do
    if os.path.exists('train_loader.pkl'):
        count_data += 1
        with open('train_loader.pkl', 'rb') as train_l:
            train_dictionary = p.load(train_l)
            print('INFO: Loaded pre-saved train data...')
    if os.path.exists('val_loader.pkl'):
        count_data += 1
        with open('val_loader.pkl', 'rb') as val_l:
            val_dictionary = p.load(val_l)
            print('INFO: Loaded pre-saved eval data...')
    if os.path.exists('test_loader.pkl'):
        count_data += 1
        with open('test_loader.pkl', 'rb') as test_l:
            test_dictionary = p.load(test_l)
            print('INFO: Loaded pre-saved test data...')

    # create training set examples dictionary
    if count_data != 3:
        all_examples = {}
        for folder in sorted(os.listdir(base_folder)):
            # each folder name is a label itself
            # new folder, new dictionary!
            class_examples = []
            inner_path = os.path.join(base_folder, folder)
            #####################################3 this was a problem for a long time now.. because of not sorting it
            all_images_of_current_class = [x for x in os.listdir(inner_path) if x.endswith('.tif')]
            all_images_of_current_class.sort(key=lambda f: int(filter(str.isdigit, f)))
            # if folder == 'Forest':
            #     print(all_images_of_current_class)
            for image in all_images_of_current_class:
                # dirFiles.sort(key=lambda f: int(filter(str.isdigit, f)))
                # print(image)
                image_path = os.path.join(inner_path, image)
                # for each index as key, we want to have its path and label as its items
                class_examples.append(image_path)
            all_examples[folder] = class_examples

        # split them into train and test
        train_dictionary, val_dictionary, test_dictionary = {}, {}, {}
        for class_name in all_examples.keys():
            class_examples = all_examples[class_name]
            # print(class_examples)
            ########################## this doesn't work
            # random.shuffle(class_examples)
            ########################### but this does
            random.Random(4).shuffle(class_examples)

            total = len(class_examples)
            train_count = int(total * 0.8); train_ = class_examples[:train_count]
            test = class_examples[train_count:]

            total = len(train_)
            train_count = int(total * 0.9); train = train_[:train_count]
            validation = train_[train_count:]

            for example in train:
                train_dictionary[len(train_dictionary)] = (example, class_name)
            for example in test:
                test_dictionary[len(test_dictionary)] = (example, class_name)
            for example in validation:
                val_dictionary[len(val_dictionary)] = (example, class_name)

        # # test dataset
        with open('train1.txt', 'wb') as train_check:
            for k in range(len(train_dictionary)):
                train_check.write('{}\n'.format(train_dictionary[k][0]))


    print(map(len, [train_dictionary, val_dictionary, test_dictionary]))
    # create dataset class instances
    bands = [4, 3, 2, 5, 8] # these are [NIR, Vegetation Red Edge, Red, Green, Blue] bands
    train_data = dataset(data_dictionary=train_dictionary, bands=bands, mode='train')
    val_data = dataset(data_dictionary=val_dictionary, bands=bands, mode='eval')
    test_data = dataset(data_dictionary=test_dictionary, bands=bands, mode='test')
    print('train examples =', len(train_dictionary), 'val examples =', len(val_dictionary),
          'test examples =', len(test_dictionary))

    train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size,
                                  shuffle=True, num_workers=4)
    val_dataloader = DataLoader(dataset=val_data, batch_size=batch_size,
                                shuffle=True, num_workers=4)
    test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size,
                                 shuffle=True, num_workers=4)

    # save the created datasets
    if count_data != 3:
        with open('train_loader.pkl', 'wb') as train_l:
            p.dump(train_dictionary, train_l, protocol=p.HIGHEST_PROTOCOL)
        with open('test_loader.pkl', 'wb') as test_l:
            p.dump(test_dictionary, test_l, protocol=p.HIGHEST_PROTOCOL)
        with open('val_loader.pkl', 'wb') as val_l:
            p.dump(val_dictionary, val_l, protocol=p.HIGHEST_PROTOCOL)
        print('INFO: saved data pickle files for later use')
    return train_dataloader, val_dataloader, test_dataloader #, test_dictionary

Would it be possible to upload the test_loader so that I could run your code?
If it’s too big, maybe a small part would be sufficient (e.g. 1000 samples).
Currently I can’t locate the source of this issue.

okay. So here are my labels as indices

all_labels = {
            'AnnualCrop'           : 0,
            'Forest'               : 1,
            'HerbaceousVegetation' : 2,
            'Highway'              : 3,
            'Industrial'           : 4,
            'Pasture'              : 5,
            'PermanentCrop'        : 6,
            'Residential'          : 7,
            'River'                : 8,
            'SeaLake'              : 9
            }

and I have uploaded my test images and the corresponding pickle file that contains their labels too.
https://drive.google.com/file/d/1HJ8auOSDAVNoV4Jll2izz8bMIAvJ3v0o/view?usp=sharing

I’ve downloaded the data and did some checks.
The model output of small stacked batches compared to one large batch (257) is the same.
The losses for batch_size=1 and the unreduced losses for batch_size=257 are the same.
I can’t check the accuracy as I don’t have the state dict, but so far the solution is deterministic.

Also, I had to remove some code, which should be dead anyway due to mode='test', e.g. the transform.

Thank you so much for your time ptrblck! I still can’t find out what the error is. By the way, when you have some free time you can test my trained model on this thing as well.
https://drive.google.com/file/d/1x6ebxznusnUEKVQahgIDHUPzrolVofYd/view?usp=sharing

1 Like