thank you for your reply

i comment the # x = x.view(2, 2) part because it gave the same error. although you probably right, it wasn’t the source of my problem. my error is with the gradient computation which probably has been modified by an **inplace** operation that I cant find. maybe it is in my loss function?

## net architecture and train function:

```
import torch
import torch.nn as nn
import torch.nn.functional as F
from POC_loss import loss1, component3, numerical_mutual_Iinformation
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.fc1 = nn.Linear(4, 6)
self.fc2 = nn.Linear(6, 8)
self.fc3 = nn.Linear(8, 16)
self.fc4 = nn.Linear(16, 32)
self.fc5 = nn.Linear(32, 32)
self.fc6 = nn.Linear(32, 16)
self.fc7 = nn.Linear(16, 8)
self.fc8 = nn.Linear(8, 6)
self.fc9 = nn.Linear(6, 4)
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = F.relu(self.fc4(x))
x = F.relu(self.fc5(x))
x = F.relu(self.fc6(x))
x = F.relu(self.fc7(x))
x = F.relu(self.fc8(x))
x = F.relu(self.fc9(x))
x = x.view(-1, 2, 2)
x = torch.nn.functional.softmax (x, dim = -1)
x = x.view(-1, 4)
# x = x.view(2, 2)
# x = x.reshape(2, 2)
# x = F.softmax(x, dim=1)
# x = x.view(4)
# x1 = F.softmax(x[0, 0:2], dim=0)
# x2 = F.softmax(x[0, 2:4], dim=0)
# x1 = x[0:2]
# x2 = x[2:4]
# x1 = F.softmax(x1, dim=0)
# x2 = F.softmax(x2, dim=0)
# x = torch.cat((x1, x2), 0)
# x = F.softmax(x, dim=1)
return x
def train(train_loader, optimizer, net, my_loss, epochs, D, Px=0.5, alpha=1, toleranceParcent=10, toleranceLen: int=10):
# initialise variables:
tolerance = toleranceParcent / 100
running_loss = 0.0
running_Ed = 0.0
running_RofD = 0.0
running_outputs = 0.0
lossList = []
EdList = []
RofDList = []
outputsKeepwrList = []
for epoch in range(epochs):
if next(net.parameters()).is_cuda: outputs = torch.tensor([0.5, 0.5, 0.5, 0.5]).cuda()
else: outputs = torch.tensor([0.5, 0.5, 0.5, 0.5])
loss = 0
Ed = 0
RofD = 0
outputsKeepwr = 0
for batch_ndx, batch_features in enumerate(train_loader):
# get the inputs; data is a list of [inputs, labels]
# inputs_x1 = data
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize:
# -----------------------------
# forward:
outputs = net(outputs)
# compute training reconstruction loss
train_loss = loss1(outputs, Px, alpha, D)
# compute accumulated gradients
# train_loss.backward(retain_graph=True)
train_loss.backward(retain_graph=True)
# perform parameter update based on current gradients
optimizer.step()
# add the mini-batch training loss to epoch loss
loss += train_loss.item()
add the mini-batch training loss to epoch loss:
Ed += component3(outputs, 0, alpha=1, D=0) + component3(outputs, 1, alpha=1, D=0)
RofD += numerical_mutual_Iinformation(outputs, Px)
outputsKeepwr += outputs
# average for current epoch:
loss = loss / len(train_loader)
Ed = Ed / len(train_loader)
RofD = RofD / len(train_loader)
outputsKeepwr = outputsKeepwr / len(train_loader)
# insert to list:
lossList.append(loss)
EdList.append(loss)
RofDList.append(loss)
outputsKeepwrList.append(loss)
# display progress (epoch training loss):
print("epoch : {}/{}, loss = {:.6f}, E[d] = {:.6f}, R(D) = {:.6f}, out = {:.6f}".format(epoch + 1, epochs, loss, Ed, RofD, outputsKeepwr))
# training efficiency control:
if len(lossList) >= toleranceLen:
stopRunFlag = True
for i in range(1, toleranceLen + 1):
if not ((lossList[-toleranceLen] <= lossList[-i] + tolerance * lossList[-toleranceLen]) and
((lossList[-i] - tolerance * lossList[-toleranceLen] <= lossList[-toleranceLen]))):
stopRunFlag = False
break
if ("stopRunFlag" in locals()) and stopRunFlag:
break
dataDict = {"lossList": lossList, "EdList": EdList, "RofDList": RofDList, "outputsKeepwrList": outputsKeepwrList}
return net, dataDict
```

## my loss function(loss1):

```
import torch
def numerical_mutual_Iinformation(output, Px):
comp1 = component1(output, 0)
comp2 = component2(output, Px, 0)
comp3 = component1(output, 1)
comp4 = component2(output, Px, 1)
NMI = comp1 - comp2 + comp3 - comp4
return NMI
def component1(P_yGIVENx, input_x1):
if (input_x1 == 0):
# P_yGIVENxt = P_yGIVENx[0, 0:2]
P_yGIVENxt = P_yGIVENx[0:2]
else:
# P_yGIVENxt = P_yGIVENx[0, 2:4]
P_yGIVENxt = P_yGIVENx[2:4]
return torch.sum(torch.log2(P_yGIVENxt) * P_yGIVENxt)
def component2(P_yGIVENx, Px, input_x1):
# return torch.sum(torch.log2(P_yGIVENx)*Px)
# if P_yGIVENx.is_cuda: maskXis0 = torch.tensor([True, True, False, False]).cuda()
# else: maskXis0 = torch.tensor([True, True, False, False])
# P_yGIVENxIs0 = torch.masked_select(P_yGIVENx * Px, maskXis0)
# if P_yGIVENx.is_cuda: maskXis1 = torch.tensor([False, False, True, True]).cuda()
# else: maskXis1 = torch.tensor([False, False, True, True])
# P_yGIVENxIs1 = torch.masked_select(P_yGIVENx * Px, maskXis1)
multVal = P_yGIVENx * Px
if P_yGIVENx.is_cuda: P_yGIVENxIs0 = multVal[0:2].cuda()
else: P_yGIVENxIs0 = multVal[0:2]
if P_yGIVENx.is_cuda: P_yGIVENxIs1 = multVal[2:4].cuda()
else: P_yGIVENxIs1 = multVal[2:4]
logSumOverX = torch.log2(P_yGIVENxIs0 + P_yGIVENxIs1)
if (input_x1 == 0):
# P_yGIVENxt = P_yGIVENx[0, 0:2]
P_yGIVENxt = P_yGIVENx[0:2]
else:
# P_yGIVENxt = P_yGIVENx[0, 2:4]
P_yGIVENxt = P_yGIVENx[2:4]
return torch.sum(logSumOverX * P_yGIVENxt)
def d(x1):
"returns vector of size 2 represent y=0 and y=1 given x1"
if (x1 == 0):
return [0, 1]
else:
return [1, 0]
def component3(P_yGIVENx, input_x1, alpha, D):
if (input_x1 == 0):
# P_yGIVENxt = P_yGIVENx[0, 0:2]
P_yGIVENxt = P_yGIVENx[-1, 0:2]
else:
# P_yGIVENxt = P_yGIVENx[0, 2:4]
P_yGIVENxt = P_yGIVENx[-1, 2:4]
if P_yGIVENxt.is_cuda: out = alpha * (torch.sum(P_yGIVENxt * torch.FloatTensor(d(input_x1)).cuda()) - D)
else: out = alpha * (torch.sum(P_yGIVENxt * torch.FloatTensor(d(input_x1))) - D)
if P_yGIVENxt.is_cuda: returnVal = max(torch.tensor(0).cuda(), out)
else: returnVal = max(torch.tensor(0), out)
return returnVal
# return out
# return max(0, out.data.numpy())
def loss1(output, Px, alpha, D):
comp1 = numerical_mutual_Iinformation(output, Px)
comp2 = component3(output, 0, alpha, D)
comp3 = component3(output, 1, alpha, D)
loss = comp1 + comp2 + comp3
return loss
```

Thanks