Training a U-Net with an RNN for OCR

Hi,
I tried to implement this paper for handwritten text recognition (OCR). I used the U-Net code from here. Following the paper, I connected the U-Net to a bidirectional RNN and implemented the model. However, the output characters appear to be random, and I cannot figure out where the problem comes from. Could you please tell me what I am doing wrong in the code?

The model code used for training is below:

class UNet(nn.Module):
    """U-Net feature extractor followed by a bidirectional RNN.

    The network runs a three-level U-Net over the input image, flattens the
    spatial dimensions of the resulting ``charlist + 1`` feature maps,
    projects them to ``max_text_len`` time steps with a learned linear layer,
    and feeds that sequence through a two-layer bidirectional RNN.  The
    output is a log-softmax over ``charlist + 1`` classes per time step
    (the extra class is presumably the CTC blank -- confirm against the
    loss used by the training loop).

    Notes:
        * The projection from flattened pixels to time steps
          (``self.seq_linear``) is a ``nn.LazyLinear``: its input width is
          inferred from the first batch passed to ``forward``.  All later
          batches must therefore have the same image height and width.
        * Because that layer is lazily initialised, run one dry-run forward
          pass (on the final device/dtype) before building the optimizer or
          loading a checkpoint, as recommended by the PyTorch lazy-module
          documentation.
        * The bottleneck convolutions are unpadded, so the image must be at
          least 40 pixels along each spatial axis.

    Args:
        in_channel: Number of channels of the input image.
        charlist: Number of characters in the alphabet (without the extra
            class).  When ``None``, ``len(loader.charList)`` is read from the
            module-level ``loader`` at construction time.
        num_hidden: Hidden size of the RNN.  When ``None``, the module-level
            ``numHidden`` is used.
        max_text_len: Number of output time steps.  When ``None``, the
            module-level ``maxTextLen`` is used.
    """

    def __init__(self, in_channel=1, charlist=None, num_hidden=None,
                 max_text_len=None):
        super(UNet, self).__init__()

        # Resolve defaults lazily so that merely defining/importing the class
        # does not require the globals to exist yet.
        if charlist is None:
            charlist = len(loader.charList)
        if num_hidden is None:
            num_hidden = numHidden
        if max_text_len is None:
            max_text_len = maxTextLen

        self.charlist = charlist
        self.num_hidden = num_hidden
        self.max_text_len = max_text_len
        num_classes = charlist + 1

        # Encode
        self.conv_encode1 = self.contracting_block(in_channels=in_channel, out_channels=64)
        self.conv_maxpool1 = torch.nn.MaxPool2d(kernel_size=2)
        self.conv_encode2 = self.contracting_block(64, 128)
        self.conv_maxpool2 = torch.nn.MaxPool2d(kernel_size=2)
        self.conv_encode3 = self.contracting_block(128, 256)
        self.conv_maxpool3 = torch.nn.MaxPool2d(kernel_size=2)

        # Bottleneck: the two convolutions are unpadded (each removes 2 pixels
        # per axis); the transposed convolution then doubles the resolution.
        self.bottleneck = torch.nn.Sequential(
            torch.nn.Conv2d(kernel_size=3, in_channels=256, out_channels=512),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(512),
            torch.nn.Conv2d(kernel_size=3, in_channels=512, out_channels=512),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(512),
            torch.nn.ConvTranspose2d(in_channels=512, out_channels=256, kernel_size=3,
                                     stride=2, padding=1, output_padding=1),
        )

        # Decode
        self.conv_decode2 = self.expansive_block(512, 256, 128)
        self.conv_decode1 = self.expansive_block(256, 128, 64)
        self.final_layer = self.final_block(128, 64, num_classes)

        # Sequence head
        self.flatten = nn.Flatten(2)
        # Persistent, trainable projection from flattened pixels to time
        # steps.  It must live on the module (not be created inside forward),
        # otherwise its weights are re-randomised on every call and never
        # reach the optimizer.
        self.seq_linear = nn.LazyLinear(max_text_len)
        self.bi_rnn = torch.nn.RNN(input_size=num_classes, hidden_size=num_hidden,
                                   num_layers=2, batch_first=False, bidirectional=True)
        self.linear = nn.Linear(num_hidden * 2, num_classes)

    def contracting_block(self, in_channels, out_channels, kernel_size=3):
        """Build an encoder stage: two (Conv2d -> ReLU -> BatchNorm2d) units.

        Both convolutions use ``padding=1``, so with the default 3x3 kernel
        the spatial size is preserved.
        """
        return torch.nn.Sequential(
            torch.nn.Conv2d(kernel_size=kernel_size, in_channels=in_channels,
                            out_channels=out_channels, padding=1),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(out_channels),
            torch.nn.Conv2d(kernel_size=kernel_size, in_channels=out_channels,
                            out_channels=out_channels, padding=1),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(out_channels),
        )

    def expansive_block(self, in_channels, mid_channel, out_channels, kernel_size=3):
        """Build a decoder stage: two conv units followed by a 2x upsampling.

        The trailing ``ConvTranspose2d`` (kernel 3, stride 2, padding 1,
        output_padding 1) doubles the spatial resolution.
        """
        return torch.nn.Sequential(
            torch.nn.Conv2d(kernel_size=kernel_size, in_channels=in_channels,
                            out_channels=mid_channel, padding=1),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(mid_channel),
            torch.nn.Conv2d(kernel_size=kernel_size, in_channels=mid_channel,
                            out_channels=mid_channel, padding=1),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(mid_channel),
            torch.nn.ConvTranspose2d(in_channels=mid_channel, out_channels=out_channels,
                                     kernel_size=3, stride=2, padding=1, output_padding=1),
        )

    def final_block(self, in_channels, mid_channel, out_channels, kernel_size=3):
        """Build the last decoder stage: three (Conv2d -> ReLU -> BatchNorm2d) units.

        No upsampling is performed; the last unit maps to ``out_channels``.
        """
        return torch.nn.Sequential(
            torch.nn.Conv2d(kernel_size=kernel_size, in_channels=in_channels,
                            out_channels=mid_channel, padding=1),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(mid_channel),
            torch.nn.Conv2d(kernel_size=kernel_size, in_channels=mid_channel,
                            out_channels=mid_channel, padding=1),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(mid_channel),
            torch.nn.Conv2d(kernel_size=kernel_size, in_channels=mid_channel,
                            out_channels=out_channels, padding=1),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(out_channels),
        )

    def crop_and_concat(self, upsampled, bypass, crop=False):
        """Concatenate a skip connection with an upsampled tensor along channels.

        Args:
            upsampled: Tensor coming from the expansive path.
            bypass: Tensor from the contracting path (skip connection).
            crop: When true, ``bypass`` is centre-cropped to the spatial size
                of ``upsampled`` before concatenation.

        Returns:
            ``torch.cat((upsampled, bypass), 1)``.
        """
        if crop:
            diff_h = bypass.size(2) - upsampled.size(2)
            diff_w = bypass.size(3) - upsampled.size(3)
            # Height and width are handled independently, and an odd
            # difference is split as floor/ceil so the result always matches
            # ``upsampled`` exactly.  Negative padding in F.pad crops.
            top = diff_h // 2
            left = diff_w // 2
            bypass = F.pad(bypass, (-left, -(diff_w - left), -top, -(diff_h - top)))
        return torch.cat((upsampled, bypass), 1)

    def forward(self, x, decode=False):
        """Run the network.

        Args:
            x: Image batch of shape ``(batch, in_channel, height, width)``.
            decode: Unused; kept so existing callers that pass it keep working.

        Returns:
            Log-probabilities of shape
            ``(max_text_len, batch, charlist + 1)``.
        """
        # Encode
        encode_block1 = self.conv_encode1(x)
        encode_block2 = self.conv_encode2(self.conv_maxpool1(encode_block1))
        encode_block3 = self.conv_encode3(self.conv_maxpool2(encode_block2))

        # Bottleneck
        bottleneck = self.bottleneck(self.conv_maxpool3(encode_block3))

        # Decode (skip connections are cropped because the bottleneck shrinks
        # the feature maps).
        decoded2 = self.conv_decode2(
            self.crop_and_concat(bottleneck, encode_block3, crop=True))
        decoded1 = self.conv_decode1(
            self.crop_and_concat(decoded2, encode_block2, crop=True))
        features = self.final_layer(
            self.crop_and_concat(decoded1, encode_block1, crop=True))

        # (batch, classes, H, W) -> (batch, classes, H*W) -> (batch, classes, T)
        time_steps = self.seq_linear(self.flatten(features))

        # nn.RNN with batch_first=False expects (T, batch, features).
        rnn_in = time_steps.permute(2, 0, 1)
        rnn_out, _ = self.bi_rnn(rnn_in)

        logits = self.linear(rnn_out)
        return F.log_softmax(logits, 2)