pin_memory=True blocked my loss updates on CIFAR-10

Hello everyone. I started training a DNN for CIFAR-10 classification, but I noticed that my loss and kernel values were not updating at all. Once I removed pin_memory, the loss started updating. Does anyone know the reason for this?
Thanks in advance.
My train_loader and test_loader code

#importing test and train data and applying transformations
import torch
import torch.nn as nn
from torchvision import datasets, transforms

#setting the seed
SEED=1
#checking whether CUDA is available or not
cuda=torch.cuda.is_available()
print(cuda)
if cuda:
  torch.cuda.manual_seed(SEED)
else:
  torch.manual_seed(SEED)

train_transform=transforms.Compose([
      transforms.ToTensor(),
      transforms.Normalize((0.5,0.5,0.5),(0.5,0.5,0.5))

])
test_transform=transforms.Compose([
      transforms.ToTensor(),
      transforms.Normalize((0.5,0.5,0.5),(0.5,0.5,0.5))
])
train_set=datasets.CIFAR10('./data',train=True,transform=train_transform,download=True)
test_set=datasets.CIFAR10('./data',train=False,transform=test_transform,download=True)
train_loader=torch.utils.data.DataLoader(train_set,batch_size=4,shuffle=True,num_workers=2)
test_loader=torch.utils.data.DataLoader(test_set,batch_size=4,shuffle=False,num_workers=2)
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
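
For reference, the loaders with pin_memory=True looked roughly like this before I removed it (a sketch; all the other arguments were the same):

#how the loaders were defined before removing pin_memory (sketch)
train_loader=torch.utils.data.DataLoader(train_set,batch_size=4,shuffle=True,num_workers=2,pin_memory=True)
test_loader=torch.utils.data.DataLoader(test_set,batch_size=4,shuffle=False,num_workers=2,pin_memory=True)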

My model code

class Net(nn.Module):
  def __init__(self):
    super(Net,self).__init__()
    self.convblock1=nn.Sequential(
         nn.Conv2d(in_channels=3,out_channels=32,kernel_size=(3,3),padding=2,dilation=2,bias=False),
         nn.BatchNorm2d(32),
         nn.ReLU(),
         nn.Conv2d(in_channels=32,out_channels=32,bias=False,padding=2,kernel_size=(3,3),dilation=2),
         nn.BatchNorm2d(32),
         nn.ReLU(),
         nn.Conv2d(in_channels=32,out_channels=32,bias=False,padding=2,kernel_size=(3,3),dilation=2),
         nn.BatchNorm2d(32),
         nn.ReLU()  
    )
    self.maxpool1=nn.MaxPool2d((2,2),stride=2)
    self.convblock2=nn.Sequential(
         nn.Conv2d(in_channels=32,out_channels=64,kernel_size=(1,1),bias=False),
         nn.BatchNorm2d(64),
         nn.ReLU(),
         nn.Conv2d(in_channels=64,out_channels=64,bias=False,padding=2,kernel_size=(3,3),dilation=2),
         nn.BatchNorm2d(64),
         nn.ReLU(),
         nn.Conv2d(in_channels=64,out_channels=64,bias=False,padding=2,kernel_size=(3,3),dilation=2),
         nn.BatchNorm2d(64),
         nn.ReLU()  
    )
    self.maxpool2=nn.MaxPool2d((2,2),stride=2)
    self.convblock3=nn.Sequential(
         nn.Conv2d(in_channels=64,out_channels=128,kernel_size=(1,1),bias=False),
         nn.BatchNorm2d(128),
         nn.ReLU(),
         nn.Conv2d(in_channels=128,out_channels=128,bias=False,padding=1,kernel_size=(3,3)),
         nn.BatchNorm2d(128),
         nn.ReLU(),
         nn.Conv2d(in_channels=128,out_channels=128,bias=False,padding=1,kernel_size=(3,3)),
         nn.BatchNorm2d(128), 
    )
    self.gap=nn.AdaptiveAvgPool2d((1,1))
    self.Fc=nn.Conv2d(in_channels=128,out_channels=10,kernel_size=(1,1),bias=False)

  def forward(self,x):
      x=self.convblock1(x)
      x=self.maxpool1(x)
      x=self.convblock2(x)
      x=self.maxpool2(x)
      x=self.convblock3(x)
      x=self.gap(x)
      x=self.Fc(x)
      x=x.view(-1,10)
      return x
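
As a quick sanity check (a minimal sketch, assuming the imports from the first block), the forward pass can be verified to give one logit per class for each image:

#sanity check: a random CIFAR-10-sized batch should produce a (batch, 10) output
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
model=Net().to(device)
dummy=torch.randn(4,3,32,32,device=device)
print(model(dummy).shape)  #expected: torch.Size([4, 10])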

My training and testing code

#tqdm gives a nicer progress-bar display for the training loop
from tqdm import tqdm
#initialising lists to hold loss and accuracy values so we can plot graphs later
test_losses=[]
test_accuracy=[]
images_array=[]
predicted_value=[]
actual_value=[]
count_images=0
def train(model,device,criterian,train_loader,epoch,optimizer):
  #model.train() puts the model in training mode so that layers like batch normalisation and dropout behave accordingly
  model.train()
  #tqdm() wraps an iterator and displays its progress as a progress bar
  pbar=tqdm(train_loader)
  #counters for correctly classified and total processed samples, used to compute accuracy
  correct=0
  processed=0
  for batch_idx,(data,target) in enumerate(pbar):
    #sending the data and target tensors to the device (GPU)
    data,target=data.to(device),target.to(device)
    #zeroing the gradients so they do not accumulate across batches
    optimizer.zero_grad()
    #predicting the output
    y_pred=model(data)
    #calculating the loss
    loss=criterian(y_pred,target)
    #backpropagating the loss and taking an optimizer step
    loss.backward()
    optimizer.step()
    #y_pred contains one logit per class for each image; the raw logits are what the loss needs for backpropagation
    #the predicted class is the index of the highest logit, so we take argmax over the class dimension
    #correct counts how many predictions in the batch match the targets
    pred=y_pred.argmax(dim=1,keepdim=True)
    processed+=len(data)
    #.sum() counts the number of correct predictions in the batch and .item() converts the tensor to a Python scalar
    correct+=pred.eq(target.view_as(pred)).sum().item()
    #updating the tqdm progress bar with the current loss, batch index and running accuracy
    pbar.set_description(desc=f'Loss={loss.item()} Batch_id={batch_idx} Accuracy={100*correct/processed:0.2f}')

def test1(model,device,test_loader,criterian):
  loss=0
  model.eval()
  test_loss=0
  correct=0
  with torch.no_grad():
    for data,target in (test_loader):
      data,target=data.to(device),target.to(device)
      outputs = model(data)
      test_loss+=criterian(outputs,target).item()
      _, predicted = torch.max(outputs, 1)
      correct += (predicted == target).sum().item()
      
    test_loss /= len(test_loader.dataset)
    test_losses.append(test_loss)
    
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    
    test_accuracy.append(100. * correct / len(test_loader.dataset))
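
For completeness, this is roughly how I wire the two functions together per epoch (a sketch; the loss function, optimizer and hyperparameters shown here are illustrative, not my exact cell):

#assumed setup cell: loss, optimizer and epoch loop
import torch.optim as optim
device=torch.device("cuda" if cuda else "cpu")
model=Net().to(device)
criterian=nn.CrossEntropyLoss()
optimizer=optim.SGD(model.parameters(),lr=0.01,momentum=0.9)
EPOCHS=2
for epoch in range(EPOCHS):
  train(model,device,criterian,train_loader,epoch,optimizer)
  test1(model,device,test_loader,criterian)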
    

Could you run ulimit -l? It should show the per-process limit on locked memory.

Where should I run it, sir?

In your terminal, if you are using a Linux OS.

I am running the code on Google Colab.
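
If it helps, I believe shell commands can be run from a Colab notebook cell by prefixing them with an exclamation mark, so something like this should print the limit (a sketch, run in its own cell):

!ulimit -l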