I am using a CNN-GRU-CTC model to perform OCR on images of codes printed on a black background. The code is based on this repository. During training, after a few batches, the model starts predicting only the blank character. Similar questions suggest this can happen when the blank character appears in the training labels, but that is not the case here, so I suspect an error in my model. Could someone help me find the cause?
The data consists of codes built from capital letters, digits, and the ':' character as a separator.
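For example, under the encoding implemented in the dataloader below (digits map to 0-9, 'A'-'Z' to 10-35 via ord(c) - 55, ':' to 36, and 37 is reserved for the CTC blank), a code is converted like this (the encode helper is only for illustration):

def encode(code):
    # digits -> 0-9, 'A'-'Z' -> 10-35, ':' -> 36; 37 is reserved for the CTC blank
    return [int(c) if c.isdigit() else 36 if c == ":" else ord(c) - 55 for c in code]

print(encode("AB:12"))  # [10, 11, 36, 1, 2]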
My dataloader:
import torch
from skimage import io
from torch.utils.data import Dataset


class CodesDataset(Dataset):
    def __init__(self, ann_path, code_path, transform=None):
        self.code_path = code_path
        self.ann_path = ann_path
        self.transform = transform

    def __len__(self):
        return 1000

    def __getitem__(self, idx):
        img_name = self.code_path + "/" + str(idx) + ".png"
        image = io.imread(img_name)
        annotation_name = self.ann_path + "/" + str(idx) + ".txt"
        with open(annotation_name, "r") as f:
            annotation = f.read()
        # Encode the code string: digits -> 0-9, 'A'-'Z' -> 10-35, ':' -> 36
        # (index 37 is reserved for the CTC blank)
        annotation_numbers = []
        for c in annotation:
            if c.isdigit():
                annotation_numbers.append(int(c))
            elif c != ":":
                annotation_numbers.append(ord(c) - 55)
            else:
                annotation_numbers.append(36)
        if self.transform:
            image = self.transform(image)
        return image, torch.IntTensor(annotation_numbers)
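A single sample returned by this dataset looks like this (hypothetical snippet; the transform is the one defined in the training script below):

dataset = CodesDataset("data/annotations", "data/codes", transform)
image, target = dataset[0]
print(image.shape)  # torch.Size([3, 100, 100]) in my case, after ToTensor + Resize
print(target)       # the encoded code, e.g. tensor([...], dtype=torch.int32)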
My model and training loop:
import sys
from itertools import groupby

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
from colorama import Fore
from torchvision import transforms
from tqdm import tqdm

from CodesDataset import CodesDataset
# ============================================= PREPARING DATASET ======================================================
epochs = 100
num_classes = 38
blank_label = 37
image_height = 28
gru_hidden_size = 128
gru_num_layers = 2
cnn_output_height = 4
cnn_output_width = 32
digits_per_sequence = 8
transform = transforms.Compose([transforms.ToTensor(), transforms.Resize((100, 100))])
seq_dataset = CodesDataset("data/annotations", "data/codes", transform)
train_set, val_set = torch.utils.data.random_split(
    seq_dataset, [int(len(seq_dataset) * 0.8), int(len(seq_dataset) * 0.2)])
train_loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_set, batch_size=1, shuffle=True)
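One batch from the train loader (again a throwaway check: the default collate stacks the targets into a single tensor, which only works because all codes in my data have the same length):

x, y = next(iter(train_loader))
print(x.shape)  # torch.Size([32, 3, 100, 100])
print(y.shape)  # torch.Size([32, L]), with L the fixed code length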
# ================================================= MODEL ==============================================================
class CRNN(nn.Module):
    def __init__(self):
        super(CRNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=(3, 3))
        self.norm1 = nn.InstanceNorm2d(32)
        self.conv2 = nn.Conv2d(32, 32, kernel_size=(3, 3), stride=2)
        self.norm2 = nn.InstanceNorm2d(32)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=(3, 3))
        self.norm3 = nn.InstanceNorm2d(64)
        self.conv4 = nn.Conv2d(64, 64, kernel_size=(3, 3), stride=2)
        self.norm4 = nn.InstanceNorm2d(64)
        self.gru_input_size = cnn_output_height * 64
        self.gru = nn.GRU(self.gru_input_size, gru_hidden_size, gru_num_layers,
                          batch_first=True, bidirectional=True)
        self.fc = nn.Linear(gru_hidden_size * 2, num_classes)

    def forward(self, x):
        batch_size = x.shape[0]
        out = self.conv1(x)
        out = self.norm1(out)
        out = F.leaky_relu(out)
        out = self.conv2(out)
        out = self.norm2(out)
        out = F.leaky_relu(out)
        out = self.conv3(out)
        out = self.norm3(out)
        out = F.leaky_relu(out)
        out = self.conv4(out)
        out = self.norm4(out)
        out = F.leaky_relu(out)
        # (N, C, H, W) -> (N, W, H, C), then fold H and C into the feature dimension
        out = out.permute(0, 3, 2, 1)
        out = out.reshape(batch_size, -1, self.gru_input_size)
        out, _ = self.gru(out)
        # Per-timestep class log-probabilities for CTC
        out = F.log_softmax(self.fc(out), dim=-1)
        return out
model = CRNN()
criterion = nn.CTCLoss(blank=blank_label, reduction='mean', zero_infinity=True)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
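To make the shape assumptions explicit, the sequence length T that the CNN actually produces can be compared against cnn_output_width with a throwaway forward pass (diagnostic snippet only, not part of the training script):

with torch.no_grad():
    dummy = torch.randn(2, 3, 100, 100)  # same size as the transformed images
    print(model(dummy).shape)            # (2, T, num_classes); input_lengths below assume T == cnn_output_width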
# ================================================ TRAINING MODEL ======================================================
for _ in range(epochs):
    # ============================================ TRAINING ============================================================
    train_correct = 0
    train_total = 0
    for x_train, y_train in tqdm(train_loader,
                                 position=0, leave=True,
                                 file=sys.stdout, bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.GREEN, Fore.RESET)):
        batch_size = x_train.shape[0]
        optimizer.zero_grad()
        y_pred = model(x_train)
        y_pred = y_pred.permute(1, 0, 2)  # (N, T, C) -> (T, N, C), as expected by CTCLoss
        input_lengths = torch.IntTensor(batch_size).fill_(cnn_output_width)
        target_lengths = torch.IntTensor([len(t) for t in y_train])
        loss = criterion(y_pred, y_train, input_lengths, target_lengths)
        loss.backward()
        optimizer.step()
        _, max_index = torch.max(y_pred, dim=2)  # max_index: (T, batch_size)
        for i in range(batch_size):
            # Greedy CTC decode: collapse consecutive repeats, then drop blanks
            raw_prediction = list(max_index[:, i].detach().cpu().numpy())
            prediction = torch.IntTensor([c for c, _ in groupby(raw_prediction) if c != blank_label])
            if len(prediction) == len(y_train[i]) and torch.all(prediction.eq(y_train[i])):
                train_correct += 1
            train_total += 1
    print('TRAINING. Correct: ', train_correct, '/', train_total, '=', train_correct / train_total)
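For reference, the greedy decoding above first collapses consecutive repeats and then removes blanks, so a blank between two identical labels keeps them as two separate characters (standalone illustration):

from itertools import groupby

raw = [10, 10, 37, 10, 36, 1, 1]  # 37 is the blank label
decoded = [c for c, _ in groupby(raw) if c != 37]
print(decoded)                    # [10, 10, 36, 1]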
Thanks in advance.