Model not learning (PyTorch)

Hi, I’m trying to train an optical flow CNN model, but it doesn’t learn.
Here is my model:
# The sub-networks are defined first; the Network_piv module that ties them
# together (and its forward pass) follows at the end.

class Features(torch.nn.Module):
  def __init__(self):
    super(Features, self).__init__()

    self.netOne = torch.nn.Sequential(
      torch.nn.Conv2d(in_channels=3, out_channels=32, kernel_size=7, stride=1, padding=3),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
    )

    self.netTwo = torch.nn.Sequential(
      torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=2, padding=1),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
      torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
      torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
    )

    self.netThr = torch.nn.Sequential(
      torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
      # note: a (2, 5) kernel with padding=2 grows the feature height by 3,
      # which is what the tenFlow[:, :, 0:109, :] crop in forward() compensates for
      torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(2, 5), stride=1, padding=2),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
    )

    self.netFou = torch.nn.Sequential(
      torch.nn.Conv2d(in_channels=64, out_channels=96, kernel_size=3, stride=2, padding=1),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
      torch.nn.Conv2d(in_channels=96, out_channels=96, kernel_size=3, stride=1, padding=1),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
    )

    self.netFiv = torch.nn.Sequential(
      torch.nn.Conv2d(in_channels=96, out_channels=128, kernel_size=3, stride=2, padding=1),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
    )

    self.netSix = torch.nn.Sequential(
      torch.nn.Conv2d(in_channels=128, out_channels=192, kernel_size=3, stride=2, padding=1),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
    )
  # end

  def forward(self, tenInput):
    tenOne = self.netOne(tenInput)
    tenTwo = self.netTwo(tenOne)
    tenThr = self.netThr(tenTwo)
    tenFou = self.netFou(tenThr)
    tenFiv = self.netFiv(tenFou)
    tenSix = self.netSix(tenFiv)

    return [ tenOne, tenTwo, tenThr, tenFou, tenFiv, tenSix ]
  # end
# end

class Matching(torch.nn.Module):
  def __init__(self, intLevel):
    super(Matching, self).__init__()

    self.fltBackwarp = [ 0.0, 0.0, 10.0, 5.0, 2.5, 1.25, 0.625 ][intLevel]

    if intLevel == 2:
      self.netFeat = torch.nn.Sequential(
        torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=1, stride=1, padding=0),
        torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
      )
    else:
      self.netFeat = torch.nn.Sequential()
    # end

    if intLevel == 6:
      self.netUpflow = None
    else:
      self.netUpflow = torch.nn.ConvTranspose2d(in_channels=2, out_channels=2, kernel_size=4, stride=2, padding=1, bias=False, groups=2)
    # end

    if intLevel >= 4:
      self.netUpcorr = None
    else:
      self.netUpcorr = torch.nn.ConvTranspose2d(in_channels=49, out_channels=49, kernel_size=4, stride=2, padding=1, bias=False, groups=49)
    # end

    self.netMain = torch.nn.Sequential(
      torch.nn.Conv2d(in_channels=49, out_channels=128, kernel_size=3, stride=1, padding=1),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
      torch.nn.Conv2d(in_channels=128, out_channels=64, kernel_size=3, stride=1, padding=1),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
      torch.nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3, stride=1, padding=1),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
      torch.nn.Conv2d(in_channels=32, out_channels=2, kernel_size=[ 0, 0, 7, 5, 5, 3, 3 ][intLevel], stride=1, padding=[ 0, 0, 3, 2, 2, 1, 1 ][intLevel])
    )
  # end

  def forward(self, tenFirst, tenSecond, tenFeaturesFirst, tenFeaturesSecond, tenFlow):
    tenFeaturesFirst = self.netFeat(tenFeaturesFirst)
    tenFeaturesSecond = self.netFeat(tenFeaturesSecond)

    if tenFlow is not None:
      tenFlow = self.netUpflow(tenFlow)
      tenFeaturesSecond = backwarp(tenInput=tenFeaturesSecond, tenFlow=tenFlow * self.fltBackwarp)
    # end

    if self.netUpcorr is None:
      tenCorrelation = torch.nn.functional.leaky_relu(input=correlation.FunctionCorrelation(tenFirst=tenFeaturesFirst, tenSecond=tenFeaturesSecond, intStride=1), negative_slope=0.1, inplace=False)
    else:
      tenCorrelation = self.netUpcorr(torch.nn.functional.leaky_relu(input=correlation.FunctionCorrelation(tenFirst=tenFeaturesFirst, tenSecond=tenFeaturesSecond, intStride=2), negative_slope=0.1, inplace=False))
    # end

    return (tenFlow if tenFlow is not None else 0.0) + self.netMain(tenCorrelation)
  # end
# end



class Subpixel(torch.nn.Module):
  def __init__(self, intLevel):
    super(Subpixel, self).__init__()

    self.fltBackward = [ 0.0, 0.0, 10.0, 5.0, 2.5, 1.25, 0.625 ][intLevel]

    if intLevel == 2:
      self.netFeat = torch.nn.Sequential(
        torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=1, stride=1, padding=0),
        torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
      )
    else:
      self.netFeat = torch.nn.Sequential()
    # end

    self.netMain = torch.nn.Sequential(
      torch.nn.Conv2d(in_channels=[ 0, 0, 130, 130, 194, 258, 386 ][intLevel], out_channels=128, kernel_size=3, stride=1, padding=1),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
      torch.nn.Conv2d(in_channels=128, out_channels=64, kernel_size=3, stride=1, padding=1),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
      torch.nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3, stride=1, padding=1),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
      torch.nn.Conv2d(in_channels=32, out_channels=2, kernel_size=[ 0, 0, 7, 5, 5, 3, 3 ][intLevel], stride=1, padding=[ 0, 0, 3, 2, 2, 1, 1 ][intLevel])
    )
  # end

  def forward(self, tenFirst, tenSecond, tenFeaturesFirst, tenFeaturesSecond, tenFlow):
    tenFeaturesFirst = self.netFeat(tenFeaturesFirst)
    tenFeaturesSecond = self.netFeat(tenFeaturesSecond)

    if tenFlow is not None:
      tenFeaturesSecond = backwarp(tenInput=tenFeaturesSecond, tenFlow=tenFlow * self.fltBackward)
    # end

    return (tenFlow if tenFlow is not None else 0.0) + self.netMain(torch.cat([ tenFeaturesFirst, tenFeaturesSecond, tenFlow ], 1))
  # end
# end

class Regularization(torch.nn.Module):
  def __init__(self, intLevel):
    super(Regularization, self).__init__()

    self.fltBackward = [ 0.0, 0.0, 10.0, 5.0, 2.5, 1.25, 0.625 ][intLevel]
    self.intUnfold = [ 0, 0, 7, 5, 5, 3, 3 ][intLevel]

    if intLevel >= 5:
      self.netFeat = torch.nn.Sequential()
    else:
      self.netFeat = torch.nn.Sequential(
        torch.nn.Conv2d(in_channels=[ 0, 0, 32, 64, 96, 128, 192 ][intLevel], out_channels=128, kernel_size=1, stride=1, padding=0),
        torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
      )
    # end

    self.netMain = torch.nn.Sequential(
      torch.nn.Conv2d(in_channels=[ 0, 0, 131, 131, 131, 131, 195 ][intLevel], out_channels=128, kernel_size=3, stride=1, padding=1),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
      torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
      torch.nn.Conv2d(in_channels=128, out_channels=64, kernel_size=3, stride=1, padding=1),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
      torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
      torch.nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3, stride=1, padding=1),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
      torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1),
      torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
    )

    if intLevel >= 5:
      self.netDist = torch.nn.Sequential(
        torch.nn.Conv2d(in_channels=32, out_channels=[ 0, 0, 49, 25, 25, 9, 9 ][intLevel], kernel_size=[ 0, 0, 7, 5, 5, 3, 3 ][intLevel], stride=1, padding=[ 0, 0, 3, 2, 2, 1, 1 ][intLevel])
      )
    else:
      self.netDist = torch.nn.Sequential(
        torch.nn.Conv2d(in_channels=32, out_channels=[ 0, 0, 49, 25, 25, 9, 9 ][intLevel], kernel_size=([ 0, 0, 7, 5, 5, 3, 3 ][intLevel], 1), stride=1, padding=([ 0, 0, 3, 2, 2, 1, 1 ][intLevel], 0)),
        torch.nn.Conv2d(in_channels=[ 0, 0, 49, 25, 25, 9, 9 ][intLevel], out_channels=[ 0, 0, 49, 25, 25, 9, 9 ][intLevel], kernel_size=(1, [ 0, 0, 7, 5, 5, 3, 3 ][intLevel]), stride=1, padding=(0, [ 0, 0, 3, 2, 2, 1, 1 ][intLevel]))
      )
    # end

    self.netScaleX = torch.nn.Conv2d(in_channels=[ 0, 0, 49, 25, 25, 9, 9 ][intLevel], out_channels=1, kernel_size=1, stride=1, padding=0)
    self.netScaleY = torch.nn.Conv2d(in_channels=[ 0, 0, 49, 25, 25, 9, 9 ][intLevel], out_channels=1, kernel_size=1, stride=1, padding=0)
  # end

  def forward(self, tenFirst, tenSecond, tenFeaturesFirst, tenFeaturesSecond, tenFlow):
    tenDifference = (tenFirst - backwarp(tenInput=tenSecond, tenFlow=tenFlow * self.fltBackward)).pow(2.0).sum(1, True).sqrt().detach()

    tenDist = self.netDist(self.netMain(torch.cat([ tenDifference, tenFlow - tenFlow.view(tenFlow.shape[0], 2, -1).mean(2, True).view(tenFlow.shape[0], 2, 1, 1), self.netFeat(tenFeaturesFirst) ], 1)))
    tenDist = tenDist.pow(2.0).neg()
    tenDist = (tenDist - tenDist.max(1, True)[0]).exp()

    tenDivisor = tenDist.sum(1, True).reciprocal()

    tenScaleX = self.netScaleX(tenDist * torch.nn.functional.unfold(input=tenFlow[:, 0:1, :, :], kernel_size=self.intUnfold, stride=1, padding=int((self.intUnfold - 1) / 2)).view_as(tenDist)) * tenDivisor
    tenScaleY = self.netScaleY(tenDist * torch.nn.functional.unfold(input=tenFlow[:, 1:2, :, :], kernel_size=self.intUnfold, stride=1, padding=int((self.intUnfold - 1) / 2)).view_as(tenDist)) * tenDivisor

    return torch.cat([ tenScaleX, tenScaleY ], 1)
  # end
# end

class Network_piv(torch.nn.Module):
  def __init__(self):
    super(Network_piv, self).__init__()

    self.netFeatures = Features()
    self.netMatching = torch.nn.ModuleList([ Matching(intLevel) for intLevel in [ 2, 3, 4, 5, 6 ] ])
    self.netSubpixel = torch.nn.ModuleList([ Subpixel(intLevel) for intLevel in [ 2, 3, 4, 5, 6 ] ])
    self.netRegularization = torch.nn.ModuleList([ Regularization(intLevel) for intLevel in [ 2, 3, 4, 5, 6 ] ])
  # end

  def forward(self, tenFirst, tenSecond):
    # subtract the per-channel means (in place) before building the feature pyramids
    tenFirst[:, 0, :, :] = tenFirst[:, 0, :, :] - 0.411618
    tenFirst[:, 1, :, :] = tenFirst[:, 1, :, :] - 0.434631
    tenFirst[:, 2, :, :] = tenFirst[:, 2, :, :] - 0.454253

    tenSecond[:, 0, :, :] = tenSecond[:, 0, :, :] - 0.410782
    tenSecond[:, 1, :, :] = tenSecond[:, 1, :, :] - 0.433645
    tenSecond[:, 2, :, :] = tenSecond[:, 2, :, :] - 0.452793

    tenFeaturesFirst = self.netFeatures(tenFirst)
    tenFeaturesSecond = self.netFeatures(tenSecond)

    # image pyramids matching the feature resolutions at levels 1..5
    tenFirst = [ tenFirst ]
    tenSecond = [ tenSecond ]

    for intLevel in [ 1, 2, 3, 4, 5 ]:
      tenFirst.append(torch.nn.functional.interpolate(input=tenFirst[-1], size=(tenFeaturesFirst[intLevel].shape[2], tenFeaturesFirst[intLevel].shape[3]), mode='bilinear', align_corners=False))
      tenSecond.append(torch.nn.functional.interpolate(input=tenSecond[-1], size=(tenFeaturesSecond[intLevel].shape[2], tenFeaturesSecond[intLevel].shape[3]), mode='bilinear', align_corners=False))
    # end

    tenFlow = None

    # coarse-to-fine: indices -1 .. -5 run from level 6 down to level 2
    for intLevel in [ -1, -2, -3, -4, -5 ]:
      if intLevel == -5:
        # crop the extra rows introduced by the (2, 5) kernel in Features.netThr
        # so the upsampled flow matches the level-2 feature height
        tenFlow = tenFlow[:, :, 0:109, :]
      # end

      tenFlow = self.netMatching[intLevel](tenFirst[intLevel], tenSecond[intLevel], tenFeaturesFirst[intLevel], tenFeaturesSecond[intLevel], tenFlow)
      tenFlow = self.netSubpixel[intLevel](tenFirst[intLevel], tenSecond[intLevel], tenFeaturesFirst[intLevel], tenFeaturesSecond[intLevel], tenFlow)
      tenFlow = self.netRegularization[intLevel](tenFirst[intLevel], tenSecond[intLevel], tenFeaturesFirst[intLevel], tenFeaturesSecond[intLevel], tenFlow)
    # end

    # one extra pass at full resolution using the level-0 images and features
    tenFlow = self.netMatching[0](tenFirst[0], tenSecond[0], tenFeaturesFirst[0], tenFeaturesSecond[0], tenFlow)
    tenFlow = self.netSubpixel[0](tenFirst[0], tenSecond[0], tenFeaturesFirst[0], tenFeaturesSecond[0], tenFlow)
    #tenFlow = self.netRegularization[0](tenFirst[0], tenSecond[0], tenFeaturesFirst[0], tenFeaturesSecond[0], tenFlow)

    return tenFlow * 20.0
  # end
# end
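For completeness: the model above calls backwarp() and correlation.FunctionCorrelation, which are not shown in the post. The correlation layer is a custom CUDA op from the reference pytorch-liteflownet repo, and backwarp is a grid_sample-based warp along the lines of the sketch below (my reconstruction, not the exact code from the post):

import torch

def backwarp(tenInput, tenFlow):
  # sampling grid in normalized [-1, 1] coordinates, one (x, y) pair per pixel
  tenHor = torch.linspace(-1.0, 1.0, tenFlow.shape[3], device=tenFlow.device).view(1, 1, 1, -1).expand(-1, -1, tenFlow.shape[2], -1)
  tenVer = torch.linspace(-1.0, 1.0, tenFlow.shape[2], device=tenFlow.device).view(1, 1, -1, 1).expand(-1, -1, -1, tenFlow.shape[3])
  tenGrid = torch.cat([ tenHor, tenVer ], 1)

  # rescale the flow from pixel units to the normalized grid coordinates
  tenFlow = torch.cat([
    tenFlow[:, 0:1, :, :] / ((tenInput.shape[3] - 1.0) / 2.0),
    tenFlow[:, 1:2, :, :] / ((tenInput.shape[2] - 1.0) / 2.0)
  ], 1)

  # sample the second image at the flow-displaced positions
  return torch.nn.functional.grid_sample(input=tenInput, grid=(tenGrid + tenFlow).permute(0, 2, 3, 1), mode='bilinear', padding_mode='zeros', align_corners=True)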

And here is my training loop:

import torch.optim as optim
from tqdm import tqdm
from PIL import Image
import PIL
import flowiz as fz
import torch.nn.functional as F
import matplotlib.pyplot as plt

optimizer = optim.Adam(model.parameters(), lr=0.0001)
loss_function = torch.nn.MSELoss().cuda()
#target = Variable(target.cuda(), requires_grad=False)

batch_size = 8
epochs = 10

for epo in range(epochs):
  model.train()

  for i in tqdm(range(0, len(X_train_1), batch_size)):
    batch_X_1 = torch.FloatTensor(X_train_1[i:i + batch_size]).view(-1, 3, 436, 1024)
    batch_X_2 = torch.FloatTensor(X_train_2[i:i + batch_size]).view(-1, 3, 436, 1024)
    batch_y = torch.FloatTensor(Y_train[i:i + batch_size]).cuda()

    model.zero_grad()
    output = model(batch_X_1.cuda(), batch_X_2.cuda())

    # computeImg renders a flow field as a color image (helper defined elsewhere);
    # show the prediction next to the ground truth
    output = computeImg(output[0, :, :].view((436, 1024, 2)).cpu().numpy())
    plt.imshow(output)
    plt.show()
    plt.imshow(Y_train[i])
    plt.show()

    '''
    output = Image.save('./output/output_%d.flo', i)
    files = glob.glob('output.flo')
    output = img = fz.convert_from_file(output)
    '''

    output = torch.FloatTensor(output).cuda()
    #batch_y = batch_y.requires_grad=False
    loss = loss_function(output, batch_y)
    loss.requires_grad = True
    loss.backward()
    optimizer.step()

  print(loss)

It seems you might be breaking the computation graph by recreating the output tensor in:

output = torch.FloatTensor(output).cuda()

Instead of wrapping it in a new FloatTensor, you should use the model output directly to calculate the loss.

Unrelated to your issue, but Variables are deprecated since PyTorch 0.4, so you can use tensors now.
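A minimal sketch of the fixed inner step (assuming Y_train stores the raw two-channel flow fields, so batch_y has shape (N, 2, 436, 1024), and computeImg is only needed for visualization):

output = model(batch_X_1.cuda(), batch_X_2.cuda())   # (N, 2, 436, 1024), still attached to the graph

loss = loss_function(output, batch_y)                # loss on the raw model output
model.zero_grad()
loss.backward()                                      # gradients now flow back into the model
optimizer.step()

# visualization only: detach first so it cannot interfere with autograd
flow_np = output[0].detach().permute(1, 2, 0).cpu().numpy()  # (436, 1024, 2); permute, not view, to reorder dims
plt.imshow(computeImg(flow_np))
plt.show()

With this, loss.requires_grad = True is no longer needed (and shouldn't be used, since it only hides a broken graph).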

Hi @ptrblck, thanks for your answer.
I also wanted to ask which loss function is better suited when I have a color image as output.
I'm actually building a neural network for optical flow estimation; the output is a .flo flow field from which I compute a colored image.

I assume the flow output would be encoded with some real values?
In the easiest use case, you would use nn.MSELoss, but I guess the latest state-of-the-art optical flow models might use a custom loss function.
Are you trying to create a new approach or recreate a paper implementation?
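For reference, a common choice for flow regression is the average endpoint error (EPE), the mean L2 distance between the predicted and ground-truth flow vectors; a minimal sketch for (N, 2, H, W) tensors:

import torch

def epe_loss(flow_pred, flow_gt):
  # per-pixel L2 distance between the two flow vectors, then averaged;
  # the small eps keeps the sqrt differentiable at exactly zero error
  return torch.sqrt(((flow_pred - flow_gt) ** 2).sum(dim=1) + 1e-8).mean()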

@ptrblck Hi, I’m recreating a paper implementation (LiteFlowNet),
so I’m using MSELoss, since the output is a tensor of shape (2, 436, 1024) with real values.

I can’t think of a better loss function for this case.

@ptrblck Hi, I just found out I was running these commands before the training:
torch.set_grad_enabled(False)
torch.backends.cudnn.enabled = True

Can that be the reason why my model is not learning?

If so, should I delete it or modify it?

Thanks in advance

torch.set_grad_enabled(False) will disable the gradient calculation, and you should run into errors during training.
Why are you using it, or did you add it by mistake?

@ptrblck I added it by mistake.

So you think that's why my model doesn't learn and stays at the same error during training?

Yes, this would disable the gradient calculation in the script.
However, you should see a RuntimeError if you are trying to calculate the gradients:

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

If this error is not raised, you might have re-enabled the gradient calculation later.
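To illustrate the interaction: with gradients globally disabled, the loss has no grad_fn, and setting loss.requires_grad = True afterwards turns the loss into a leaf tensor, so backward() no longer raises but also never reaches the model's parameters, which produces exactly the silent "stays at the same error" behavior. A small sketch:

import torch

model = torch.nn.Linear(4, 1)
x, y = torch.randn(8, 4), torch.randn(8, 1)

torch.set_grad_enabled(False)                         # what the script was doing globally
loss = torch.nn.functional.mse_loss(model(x), y)
print(loss.grad_fn)                                   # None -> loss.backward() would raise the RuntimeError above
torch.set_grad_enabled(True)                          # restore the default for training

loss = torch.nn.functional.mse_loss(model(x), y)
loss.backward()                                       # works: gradients reach model.weight

with torch.no_grad():                                 # the right way to skip gradients for evaluation only
  val_loss = torch.nn.functional.mse_loss(model(x), y)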

Yeah, I did set loss.requires_grad = True, but the model was still not learning. I will remove the
torch.set_grad_enabled(False) call and see if it starts learning.

Thanks @ptrblck for your help.