RuntimeError does not make sense

Thank you very much for your reply. Below I include the code I use, so the workflow should be more or less clear. I will look into psutil and report the results later.

I define the DenseNet architecture as follows (with the imports used throughout the snippets below):

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

class Dense_Block(nn.Module):
    def __init__(self, in_channels):
        super(Dense_Block, self).__init__()

        self.relu = nn.ReLU(inplace = True)
        self.bn = nn.BatchNorm2d(num_features = in_channels)

        self.conv1 = nn.Conv2d(in_channels = in_channels, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)
        self.conv2 = nn.Conv2d(in_channels = 32, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)
        self.conv3 = nn.Conv2d(in_channels = 64, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)
        self.conv4 = nn.Conv2d(in_channels = 96, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)
        self.conv5 = nn.Conv2d(in_channels = 128, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)

    def forward(self, x):

        bn = self.bn(x)
        conv1 = self.relu(self.conv1(bn))

        conv2 = self.relu(self.conv2(conv1))
        c2_dense = self.relu(torch.cat([conv1, conv2], 1))

        conv3 = self.relu(self.conv3(c2_dense))
        c3_dense = self.relu(torch.cat([conv1, conv2, conv3], 1))

        conv4 = self.relu(self.conv4(c3_dense))
        c4_dense = self.relu(torch.cat([conv1, conv2, conv3, conv4], 1))

        conv5 = self.relu(self.conv5(c4_dense))
        c5_dense = self.relu(torch.cat([conv1, conv2, conv3, conv4, conv5], 1))

        return c5_dense

class Transition_Layer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Transition_Layer, self).__init__()

        self.relu = nn.ReLU(inplace = True)
        self.bn = nn.BatchNorm2d(num_features = out_channels)
        self.conv = nn.Conv2d(in_channels = in_channels, out_channels = out_channels, kernel_size = 1, bias = False)
        self.avg_pool = nn.AvgPool2d(kernel_size = 2, stride = 2, padding = 0)

    def forward(self, x):

        bn = self.bn(self.relu(self.conv(x)))
        out = self.avg_pool(bn)

        return out

class DenseNet(nn.Module):
    def __init__(self, nr_classes):
        super(DenseNet, self).__init__()

        self.lowconv = nn.Conv2d(in_channels = 1, out_channels = 64, kernel_size = 7, padding = 3, bias = False)
        self.relu = nn.ReLU()

        # Make Dense Blocks
        self.denseblock1 = self._make_dense_block(Dense_Block, 64)
        self.denseblock2 = self._make_dense_block(Dense_Block, 128)
        self.denseblock3 = self._make_dense_block(Dense_Block, 128)

        # Make transition Layers
        self.transitionLayer1 = self._make_transition_layer(Transition_Layer, in_channels = 160, out_channels = 128)
        self.transitionLayer2 = self._make_transition_layer(Transition_Layer, in_channels = 160, out_channels = 128)
        self.transitionLayer3 = self._make_transition_layer(Transition_Layer, in_channels = 160, out_channels = 64)

        # Classifier
        self.bn = nn.BatchNorm2d(num_features = 64)
        self.pre_classifier = nn.Linear(64*16*16, 512)  # the three avg-pools reduce 128x128 inputs to 16x16
        self.classifier = nn.Linear(512, nr_classes)

    def _make_dense_block(self, block, in_channels):
        layers = []
        layers.append(block(in_channels))
        return nn.Sequential(*layers)

    def _make_transition_layer(self, layer, in_channels, out_channels):
        modules = []
        modules.append(layer(in_channels, out_channels))
        return nn.Sequential(*modules)

    def forward(self, x):
        out = self.relu(self.lowconv(x))

        out = self.denseblock1(out)
        out = self.transitionLayer1(out)

        out = self.denseblock2(out)
        out = self.transitionLayer2(out)

        out = self.denseblock3(out)
        out = self.transitionLayer3(out)

        out = self.bn(out)
#         print(out.shape)
        out = out.reshape(-1, 64*16*16)

        out = self.pre_classifier(out)
        out = self.classifier(out)

        return out
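
As a quick sanity check on the shapes, a forward pass with a random batch of four single-channel 128x128 images (matching the data described below) should give one 128-vector per image:

import torch

model = DenseNet(nr_classes=128)
dummy = torch.randn(4, 1, 128, 128)   # random batch of four single-channel 128x128 images
out = model(dummy)
print(out.shape)                      # torch.Size([4, 128]), one 128-vector per image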

Then I define my Dataset class:

class MyDataset(Dataset):
    def __init__(self, images, n, labels=None, transforms=None):
        self.X = images
        self.y = labels
        self.n = n
        self.transforms = transforms
         
    def __len__(self):
        return (len(self.X))
    
    def __getitem__(self, i):
        data = self.X.iloc[i, :]
#         print(data.shape)
        data = np.asarray(data).astype(float).reshape(1, self.n, self.n)  # reshape the flat row into a 1 x n x n image
        
        if self.transforms:
            data = self.transforms(data).reshape(1, self.n, self.n)
            
        if self.y is not None:
            y = self.y.iloc[i,:]
#             y = np.asarray(y).astype(np.float).reshape(2*n+1,) # for 257-vector of labels
            y = np.asarray(y).astype(float).reshape(128,) # for 128-vector of labels
            return (data, y)
        else:
            return data

Then I create instances of the train, dev, and test datasets:

train_data = MyDataset(train_images, n, train_labels, None)
dev_data = MyDataset(dev_images, n, dev_labels, None)
test_data = MyDataset(test_images, n, test_labels, None)

The shapes of train_images, dev_images and test_images are (15000, 16384), (4000, 16384) and (1000, 16384), respectively. So in total there are 20000 samples, each a 128x128 (= 16384 pixel) image stored as a flattened row.

The shapes of train_labels, dev_labels and test_labels are (15000, 128), (4000, 128) and (1000, 128), respectively. So in total there are 20000 label vectors of length 128.
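
In case it helps, the layout of these frames can be mimicked with random pandas DataFrames (a sketch with scaled-down row counts and made-up mock_* names, just to show the shapes; the real data of course comes from elsewhere):

import numpy as np
import pandas as pd

n = 128                                        # image side length; n*n = 16384 flattened pixels per row

# scaled-down row counts for a quick check; the real splits have 15000 / 4000 / 1000 rows
mock_images = pd.DataFrame(np.random.rand(100, n*n).astype(np.float32))
mock_labels = pd.DataFrame(np.random.rand(100, n).astype(np.float32))

mock_data = MyDataset(mock_images, n, mock_labels, None)
x, y = mock_data[0]
print(x.shape, y.shape)                        # (1, 128, 128) and (128,)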

I also define a custom loss function:

class Loss():    
    def __init__(self,yHat,y):
        self.n_samples = yHat.size()[0]
        self.n_points = yHat.size()[1]
        self.preds = yHat
        self.labels = y
        self.size = yHat.size()[0]*yHat.size()[1]
        self.diff = yHat - y
        
    def Huber(self, delta=1.):
        # quadratic for |diff| < delta, linear beyond; averaged over all elements
        return torch.sum(torch.where(torch.abs(self.diff) < delta,
                                     .5 * self.diff**2,
                                     delta * (torch.abs(self.diff) - .5 * delta))) / self.size
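
For delta = 1 this should agree with PyTorch's built-in smooth L1 loss (which equals the Huber loss with delta = 1), so a quick check like the following can be used to validate it:

import torch
import torch.nn as nn

yHat = torch.randn(8, 128)
y = torch.randn(8, 128)

custom = Loss(yHat, y).Huber(delta=1.)
builtin = nn.SmoothL1Loss()(yHat, y)      # smooth L1 == Huber for delta = 1
print(custom.item(), builtin.item())      # the two numbers should match up to floating point precision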

Then I create an instance of the model:

densenet = DenseNet(nr_classes=128).float().to('cpu')
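
Just as a quick check that everything ended up as float32 on the CPU:

n_params = sum(p.numel() for p in densenet.parameters())
print(n_params)                             # total number of parameters
print(next(densenet.parameters()).dtype)    # torch.float32 after .float()
print(next(densenet.parameters()).device)   # cpu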

Then I initialize the parameters, create the train and dev DataLoaders, and train the model using the Adam optimizer and the Huber loss function:

nn.init.kaiming_uniform_(list(densenet.parameters())[0], nonlinearity='relu')

loader = DataLoader(train_data, batch_size=128, shuffle=False, num_workers=0)
loader_dev = DataLoader(dev_data, batch_size=10, shuffle=False, num_workers=0)

N_epochs = 10
for epoch in range(N_epochs):
    optimizer = optim.Adam(densenet.parameters(), lr=.001, betas=(0.9, 0.999), eps=1e-08)
    for batch in loader:
        images = batch[0].float().to('cpu')
        labels = batch[1].float().to('cpu')
        preds = densenet(images)
        loss = Loss(preds, labels).Huber()

        # evaluate on the full dev set after every training batch
        loss_dev = 0
        for batch_dev in loader_dev:
            images_dev = batch_dev[0].float().to('cpu')
            labels_dev = batch_dev[1].float().to('cpu')
            preds_dev = densenet(images_dev)
            loss_ = Loss(preds_dev, labels_dev).Huber()
            loss_dev += loss_

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
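
And this is roughly how I plan to use psutil to track the memory of the training process (a sketch; logging the resident set size per batch should show where the memory grows):

import os
import psutil

process = psutil.Process(os.getpid())

def log_memory(tag=""):
    rss_mb = process.memory_info().rss / 1024**2   # resident set size of this process, in MB
    print(f"{tag}: {rss_mb:.1f} MB")

# called e.g. inside the batch loop:
# log_memory(f"epoch {epoch}, after optimizer.step()")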