Convert grayscale images to RGB

Hello, I am trying to classify ImageNet using VGG, and I am using a custom dataset as follows:

train_dataset = CustomDataset(
    csv_file='/home/tboonesifuentes/Databases/ImageNet/Train/train.csv',
    root_dir='/home/tboonesifuentes/Databases/ImageNet/Train/Crops',
    transform=transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize([224, 224]),
        transforms.ToTensor()]))

test_dataset = CustomDataset(
    csv_file='/home/tboonesifuentes/Databases/ImageNet/Test/test.csv',
    root_dir='/home/tboonesifuentes/Databases/ImageNet/Test/Crops',
    transform=transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize([224, 224]),
        #transforms.RandomCrop(24),
        transforms.ToTensor()]))


batch_size=130


class TransformedDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, transform_fn):
        self.dataset = dataset
        self.transform_fn = transform_fn

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        dp = self.dataset[index]
        return self.transform_fn(dp)
    

labels_mapping = {old: new for new, old in enumerate(range(8, 44))}  # 8 -> 0, 9 -> 1, ..., 43 -> 35

def map_targets_fn(dp, target_mapping):
    x, y = dp
    new_y  = target_mapping[y.item()]
    return x, new_y


train_dataset = TransformedDataset(train_dataset, partial(map_targets_fn, target_mapping=labels_mapping))
test_dataset = TransformedDataset(test_dataset, partial(map_targets_fn, target_mapping=labels_mapping))

# Sanity check: flag any single-channel images in the dataset
for idx, (data, target) in enumerate(train_dataset):
    if data.shape[0] == 1:
        print(data.shape)
        print('single-channel image')



train_loader = DataLoader(train_dataset, batch_size,num_workers=num_workers, 
                        shuffle=True, drop_last=True)    
  
test_loader = DataLoader(test_dataset, batch_size,num_workers=num_workers, 
                          shuffle=False, drop_last=False)    

I didn't know that ImageNet had grayscale images. I actually found some, read them in MATLAB, and yes, they are grayscale. That's the reason I'm getting the batch-size mismatch error at position 0. Now I know I have to convert these grayscale images if I want to train. My question is: where can I catch the grayscale images and convert them to RGB? In MATLAB it would be something like rgbImage = cat(3, A, A, A); where A is the grayscale image, but I don't know how to do it or where exactly in my code. Please, someone help!

Assuming the tensors are loaded as [channels, height, width], you could probably use this lambda transformation:

trans = transforms.Lambda(lambda x: x.repeat(3, 1, 1) if x.size(0)==1 else x)

x = torch.randn(3, 224, 224)
out = trans(x)
print(out.shape)
> torch.Size([3, 224, 224])

x = torch.randn(1, 224, 224)
out = trans(x)
print(out.shape)
> torch.Size([3, 224, 224])

If you are loading the images via PIL.Image.open inside your custom Dataset, you could also convert them directly to RGB via PIL.Image.open(...).convert('RGB').
However, since you are using ToPILImage as a transformation, I assume you are loading tensors directly.
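
In case the images are loaded from files after all, a minimal sketch of the PIL-based variant (ImageFileDataset, paths, and targets are hypothetical names, just for illustration):

from PIL import Image
from torch.utils.data import Dataset

class ImageFileDataset(Dataset):
    def __init__(self, paths, targets, transform=None):
        self.paths = paths
        self.targets = targets
        self.transform = transform

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, index):
        # convert('RGB') maps grayscale (and RGBA) images to 3 channels
        image = Image.open(self.paths[index]).convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, self.targets[index]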


Hello ptrblck, thanks for your quick response. Actually, I discovered I also have images with four channels, so I implemented this code in my custom dataset:

import os
import pandas as pd
import torch

from torch.utils.data import Dataset

from skimage import io

class CustomDataset(Dataset):
	
    def __init__(self,csv_file,root_dir,transform=None):
        self.annotations=pd.read_csv(csv_file)
        self.root_dir=root_dir
        self.transform=transform

    def __len__(self):
        return len(self.annotations)
    
    def __getitem__(self,index):
        img_path=os.path.join(self.root_dir,self.annotations.iloc[index,0])
        image=io.imread(img_path)
        
        if len(image.shape) == 2:
        #convert grayscale to RGB
        #image = Image.open(path).convert('RGB') 
            image=torch.from_numpy(image)
            image=torch.stack([image,image,image],0)
            image=torch.transpose(image,0,2)
            image=image.numpy()
            #print('this was 1d before')
            #print(image.shape)
        elif len(image.shape) == 3: 
        #image has 4 channels
            if image.shape[0]==4:
                image=torch.from_numpy(image)
                image = image[:,:,:3]
                image=torch.transpose(image,0,2)
                image=image.numpy()
            
        y_label=torch.tensor(int(self.annotations.iloc[index,2]))

        if self.transform:
            image=self.transform(image)
        return (image,y_label)

I have the MNIST dataset from the PyTorch API; it's grayscale, and I want to implement transfer learning using AlexNet. AlexNet accepts 227x227x3 RGB inputs. How do I convert MNIST to RGB?
My code is here:

import torch
import torchvision
from torchvision import transforms as transforms
from torchvision import models
dir(models)
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(227),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

#Downloading training data

train_data = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(train_data, batch_size=4, shuffle=True, num_workers=2)

#Downloading test data

test_data = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(test_data, batch_size=4, shuffle=False, num_workers=2)

classes = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9')
import matplotlib.pyplot as plt
import numpy as np

#Function to show some random images
def imshow(img):
    img = img / 2 + 0.5  # unnormalize
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))  # CxHxW -> HxWxC for matplotlib
    plt.show()

#Get some random training images

dataiter = iter(trainloader)

images, labels = next(dataiter)
#Show images

imshow(torchvision.utils.make_grid(images))

# Print labels
print(' '.join('%5s' % classes[labels[j]] for j in range(4)))

My previous post describes different approaches, including a minimal, executable code snippet showing how to repeat the single channel to create three input channels via transforms.Lambda. Did you try any of these approaches?

Yeah, I read it later on. I used transforms.Grayscale(3).
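
For reference, a quick check (with a dummy single-channel PIL image) that this repeats the channel three times:

from PIL import Image
from torchvision import transforms

img = Image.new('L', (28, 28))  # dummy grayscale image
trans = transforms.Compose([
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor()
])
out = trans(img)
print(out.shape)
> torch.Size([3, 28, 28])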

I have a few questions I couldn't find answers to, so I'm asking the experts here.
I have to train on MNIST data for AlexNet transfer learning.
My questions are:
1- How important is it to convert grayscale to RGB and to resize MNIST from 28x28 to 227x227?
2- Why is AlexNet's input size 227x227 and not anything else? What's the significance?
3- We use the mean and standard deviation of the pretrained model, in my case the ImageNet stats for AlexNet. But my data's mean and std are different. I have read a lot about this, and what I understood is that it's OK to use your own stats and even OK to use the pretrained data's. You can argue either way: "mine was different so I relied on it", or "the pretrained model used this so I am using it".

  1. If you want to use the pretrained first conv layer, you need to pass an input with 3 channels to the model. Alternatively, you can also replace the first conv layer and train it from scratch (see the sketch after this list). AlexNet should use spatial inputs of 224x224 as described in ImageNet Classification with Deep Convolutional Neural Networks. If you don't increase the spatial size of your inputs, the forward pass will most likely raise a runtime error, since your forward activations will become empty due to e.g. pooling layers.

  2. I would recommend reading the original paper and checking if an explanation is given. Generally, I would guess 224x224 was picked because it works and might be a sweet spot balancing training speed, memory usage, model performance, etc.

  3. Yes, the mean and stddev of your data might be different, and you should consider using the stats of your dataset instead (also sketched below). Often the ImageNet stats are reused if your new custom dataset shares a similar domain, i.e. "natural" images in this case.
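
To make points 1 and 3 concrete, a rough sketch; the Conv2d hyperparameters mirror torchvision's AlexNet first layer, and the stats loop assumes train_data yields single-channel tensors in [0, 1] (e.g. MNIST with just ToTensor()):

import torch
import torch.nn as nn
from torchvision import models

# 1. Replace the first conv layer so the model accepts 1-channel inputs;
#    the new layer is randomly initialized and trained from scratch
model = models.alexnet(pretrained=True)
model.features[0] = nn.Conv2d(1, 64, kernel_size=11, stride=4, padding=2)

# 3. Compute the mean/std of your own dataset instead of reusing the ImageNet stats
loader = torch.utils.data.DataLoader(train_data, batch_size=1000)
n, total, total_sq = 0, 0.0, 0.0
for x, _ in loader:
    n += x.numel()
    total += x.sum().item()
    total_sq += x.pow(2).sum().item()
mean = total / n
std = (total_sq / n - mean ** 2) ** 0.5
print(mean, std)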


Alright, thanks. But I feel like PyTorch is a mess. I don't understand things, but sadly TensorFlow doesn't have a pretrained AlexNet. Here I have a few questions. I have frozen the layers like this:

for param in alexnet.features.parameters():
    param.requires_grad = False

and then updated the last layer like this:

alexnet.classifier[6] = nn.Linear(4096, 10)

This is the snap of my summary after freezing the layers; after updating the last layer I got 10 output classes in place of 1000.

My question is: why are the layers in the sequential classifier shown as trainable (True) in the image when I froze all the layers? Moreover, after freezing the layers I directly changed the last layer's output classes; does that unfreeze it? Or do I need to manually set requires_grad to True for this Linear (6)?

Sorry, but I don't fully understand this claim. Could you describe what the "snap" is, and why you are not expecting 10 output features even though you explicitly replace the last linear layer with out_features=10?
The posted screenshot also doesn't represent your code, as I see:

model = models.alexnet()
print(model.classifier)
# Sequential(
#   (0): Dropout(p=0.5, inplace=False)
#   (1): Linear(in_features=9216, out_features=4096, bias=True)
#   (2): ReLU(inplace=True)
#   (3): Dropout(p=0.5, inplace=False)
#   (4): Linear(in_features=4096, out_features=4096, bias=True)
#   (5): ReLU(inplace=True)
#   (6): Linear(in_features=4096, out_features=1000, bias=True)
# )

model.classifier[6] = nn.Linear(4096, 10)
print(model.classifier)
# Sequential(
#   (0): Dropout(p=0.5, inplace=False)
#   (1): Linear(in_features=9216, out_features=4096, bias=True)
#   (2): ReLU(inplace=True)
#   (3): Dropout(p=0.5, inplace=False)
#   (4): Linear(in_features=4096, out_features=4096, bias=True)
#   (5): ReLU(inplace=True)
#   (6): Linear(in_features=4096, out_features=10, bias=True)
# )

I don’t know which tool you are using, but you could always check the requires_grad attribute to see if a parameter is frozen or not.

Yes, since you are creating a new trainable layer and are assigning it to an internal model attribute.
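
A quick way to verify what is actually frozen after such a swap (a minimal sketch of the requires_grad check mentioned above):

import torch.nn as nn
from torchvision import models

model = models.alexnet(pretrained=True)
for param in model.features.parameters():
    param.requires_grad = False
model.classifier[6] = nn.Linear(4096, 10)  # the new layer is trainable by default

for name, param in model.named_parameters():
    print(name, param.requires_grad)
# features.*   -> False (frozen above)
# classifier.* -> True (classifier[1] and classifier[4] were never frozen,
#                 and the replaced classifier[6] starts with requires_grad=True)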

These kinds of statements are not really helpful, as your questions could easily be solved by reading through the tutorials and docs, while more experienced users nevertheless use their time to help you.

Well, I meant to say that the screenshot I attached shows the classifier part as all True. I printed this summary after freezing the features. My question was: in the classifier you see Linear (6); I want to unfreeze only this one and freeze all the rest.
And my other question is: what do these Linear(x) entries in the classifier mean?

I'm using Google Colab and my code is:
import torch
import torchvision
from torchvision import transforms as transforms
from torchvision import models
dir(models)
transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=3),
    transforms.Resize(256),
    transforms.CenterCrop(227),
    transforms.ToTensor(),
    #transforms.Normalize(mean=[0.16592294], std=[0.31460345]),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
train_data = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(train_data, batch_size=4, shuffle=True)

test_data = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(test_data, batch_size=4, shuffle=False)

for images, labels in trainloader:
    print('Image batch dimensions:', images.shape)
    print('Image label dimensions:', labels.shape)
    print('Class labels of 10 examples:', labels[:10])
    break
#Class labels

classes = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9')
import matplotlib.pyplot as plt
import numpy as np

#Function to show some random images
#def imshow(img):
#img = img / 2 + 0.5 # unnormalize
#npimg = img.numpy()
#plt.imshow(np.transpose(npimg, (1, 2, 0)))
#plt.show()
mean_cal=[0.485, 0.456, 0.406]
std_cal=[0.229, 0.224, 0.225]

def imshow(inp, title=None):
    inp = inp.numpy().transpose((1, 2, 0))
    meann = np.array([mean_cal])
    std = np.array([std_cal])
    inp = std * inp + meann
    inp = np.clip(inp, 0, 1)
    plt.imshow(inp)
    if title is not None:
        plt.title(title)
    plt.pause(0.001)
#Get some random training images

dataiter = iter(trainloader)

images, labels = next(dataiter)
#Show images

imshow(torchvision.utils.make_grid(images))

# Print labels
print(' '.join('%5s' % classes[labels[j]] for j in range(4)))
alexnet = models.alexnet(pretrained=True) # This will download the weights for the network first time it is run!

alexnet
alexnet.eval()
import torchinfo
from torchinfo import summary
summary(model=alexnet,
        input_size=(4, 3, 227, 227),  # make sure this is "input_size", not "input_shape"
        # col_names=["input_size"],  # uncomment for smaller output
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"]
)
for name, child in alexnet.named_children():
    print(name)

for param in alexnet.features.parameters():
    param.requires_grad = False
summary(model=alexnet,
        input_size=(4, 3, 227, 227),  # make sure this is "input_size", not "input_shape"
        # col_names=["input_size"],  # uncomment for smaller output
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"]
)
import torch.nn as nn
alexnet.classifier[6] = nn.Linear(4096,10)
import torch.optim as optim
optimizer = optim.SGD(alexnet.classifier[6].parameters(), lr=0.001, momentum=0.9)
#Instantiating CUDA device

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

#Verifying CUDA (should print cuda:0, not cpu)

print(device)
alexnet.to(device)
loss_fn = nn.CrossEntropyLoss()
from google.colab import drive

drive.mount('/content/drive')
for epoch in range(2):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data[0].to(device), data[1].to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        output = alexnet(inputs)
        loss = loss_fn(output, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print('Finished Training of AlexNet')

print("Model's state_dict:")
for param_tensor in alexnet.state_dict():
    print(param_tensor, "\t", alexnet.state_dict()[param_tensor].size())

# Print optimizer's state_dict

print("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    print(var_name, "\t", optimizer.state_dict()[var_name])

#Specify a path
PATH = "/content/drive/MyDrive/Colab Notebooks/MNIST/state_dict_model.pt"

#Save
torch.save(alexnet.state_dict(), PATH)

And by the way, what do transforms.ToTensor() and np.clip(inp, 0, 1) do? I mean, when I don't clip, the plotting complains that values must be in 0-1 or 0-255. But as far as I know, ToTensor() does the same thing and puts images into the range 0-1?
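
For reference, ToTensor() converts a uint8 image in [0, 255] to a float tensor in [0, 1], while Normalize() then shifts and scales the values outside of [0, 1]; that's why the plot needs a clip back into a valid range. A quick sketch with a fake image:

import numpy as np
from torchvision import transforms

arr = np.random.randint(0, 256, (28, 28, 1), dtype=np.uint8)  # fake uint8 image

t = transforms.ToTensor()(arr)  # CxHxW float32, scaled into [0, 1]
print(t.dtype, t.min().item(), t.max().item())

norm = transforms.Normalize(mean=[0.485], std=[0.229])(t)  # values leave [0, 1]
print(norm.min().item(), norm.max().item())

clipped = np.clip(norm.numpy(), 0, 1)  # forced back into [0, 1] for plotting
print(clipped.min(), clipped.max())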