# Cant apply nn.Softmax() along each dimension or different parts of tensor

Hi,

I cant apply nn.Softmax() along each dimension separately.

I have a tensor in one dimension of size 4. I want to apply softmax on the first 2 values and the last 2 values separately.
in each way I tried to do it I get:
“RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [6, 4]], which is output 0 of TBackward, is at version 2; expected version 1 instead.”

class Net(nn.Module):
def init(self):
super(Net, self).init()
self.fc1 = nn.Linear(4, 6)
self.fc2 = nn.Linear(6, 8)
self.fc3 = nn.Linear(8, 16)
self.fc4 = nn.Linear(16, 32)
self.fc5 = nn.Linear(32, 32)
self.fc6 = nn.Linear(32, 16)
self.fc7 = nn.Linear(16, 8)
self.fc8 = nn.Linear(8, 6)
self.fc9 = nn.Linear(6, 4)

``````def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = F.relu(self.fc4(x))
x = F.relu(self.fc5(x))
x = F.relu(self.fc6(x))
x = F.relu(self.fc7(x))
x = F.relu(self.fc8(x))
x = F.relu(self.fc9(x))

# x = x.view(2, 2)
# x = x.reshape(2, 2)
# x = F.softmax(x, dim=1)
# x = x.view(4)

# x1 = F.softmax(x[0, 0:2], dim=0)
# x2 = F.softmax(x[0, 2:4], dim=0)
x1 = x[0:2]
x2 = x[2:4]
x1 = F.softmax(x1, dim=0)
x2 = F.softmax(x2, dim=0)
x = torch.cat((x1, x2), 0)

# x = F.softmax(x, dim=1)
return x
``````

Hello OG!

I think your root issue is that pytorch models operate on tensors that
carry a leading `nBatch` dimension that is not directly referenced in
the model architecture.

Such a model will operate on a tensor of shape `[nBatch, 4]`.
(If you want to pass in a single sample of shape `[4]`, you have to
`unsqueeze()` it to get a tensor with a leading singleton dimension,
that is, of shape `[1, 4]`.)

The output of this model will have shape `[nBatch, 4]` (rather than
shape `[4]`).

You probably commented out this line because it didn’t work. It doesn’t
work because your `view()` call doesn’t take into account `x`'s `nBatch`
dimension.

Regardless, your basic idea does work. The sample script, below,
shows, step by step, how you can use `view()` to group your four
values into two sets of two, apply `softmax()` to the two sets of two
separately, and then use `view()` again to reorganize your two sets
of two back into one set of four. But we have to account for the
leading `nBatch` dimension.

Here is a pytorch 0.3.0 version of the script:

``````import torch
torch.__version__

x = torch.autograd.Variable (torch.randn ((3, 4)))    # assume nBatch = 3
x
v = x.view (-1, 2, 2)   # use -1 to "infer" size of nBatch dimension
v
sm = torch.nn.functional.softmax (v, dim = -1)        # -1 to perform softmax over last dimension = 2
sm
y = sm.view (-1, 4)     # use -1 to "infer" size of nBatch dimension
y
sm.sum (dim = -1)       # check softmax sum (-1 to sum over last dimension = 2)
y.sum (dim = -1)        # check softmax sum (-1 to sum over last dimension = 1)
``````

Here is the output:

``````>>> import torch
>>> torch.__version__
'0.3.0b0+591e73e'
>>>
>>> x = torch.autograd.Variable (torch.randn ((3, 4)))    # assume nBatch = 3
>>> x
Variable containing:
-2.2293  2.3681  0.5411 -1.3360
0.7701 -1.1548  0.2012 -0.6301
-0.0882 -0.1200 -0.4608  0.9952
[torch.FloatTensor of size 3x4]

>>> v = x.view (-1, 2, 2)   # use -1 to "infer" size of nBatch dimension
>>> v
Variable containing:
(0 ,.,.) =
-2.2293  2.3681
0.5411 -1.3360

(1 ,.,.) =
0.7701 -1.1548
0.2012 -0.6301

(2 ,.,.) =
-0.0882 -0.1200
-0.4608  0.9952
[torch.FloatTensor of size 3x2x2]

>>> sm = torch.nn.functional.softmax (v, dim = -1)        # -1 to perform softmax over last dimension = 2
>>> sm
Variable containing:
(0 ,.,.) =
0.0100  0.9900
0.8673  0.1327

(1 ,.,.) =
0.8727  0.1273
0.6966  0.3034

(2 ,.,.) =
0.5079  0.4921
0.1891  0.8109
[torch.FloatTensor of size 3x2x2]

>>> y = sm.view (-1, 4)     # use -1 to "infer" size of nBatch dimension
>>> y
Variable containing:
0.0100  0.9900  0.8673  0.1327
0.8727  0.1273  0.6966  0.3034
0.5079  0.4921  0.1891  0.8109
[torch.FloatTensor of size 3x4]

>>> sm.sum (dim = -1)       # check softmax sum (-1 to sum over last dimension = 2)
Variable containing:
1  1
1  1
1  1
[torch.FloatTensor of size 3x2]

>>> y.sum (dim = -1)        # check softmax sum (-1 to sum over last dimension = 1)
Variable containing:
2
2
2
[torch.FloatTensor of size 3]
``````

Best.

K. Frank

i comment the # x = x.view(2, 2) part because it gave the same error. although you probably right, it wasn’t the source of my problem. my error is with the gradient computation which probably has been modified by an inplace operation that I cant find. maybe it is in my loss function?

## net architecture and train function:

``````import torch
import torch.nn as nn
import torch.nn.functional as F
from POC_loss import loss1, component3, numerical_mutual_Iinformation

class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.fc1 = nn.Linear(4, 6)
self.fc2 = nn.Linear(6, 8)
self.fc3 = nn.Linear(8, 16)
self.fc4 = nn.Linear(16, 32)
self.fc5 = nn.Linear(32, 32)
self.fc6 = nn.Linear(32, 16)
self.fc7 = nn.Linear(16, 8)
self.fc8 = nn.Linear(8, 6)
self.fc9 = nn.Linear(6, 4)

def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = F.relu(self.fc4(x))
x = F.relu(self.fc5(x))
x = F.relu(self.fc6(x))
x = F.relu(self.fc7(x))
x = F.relu(self.fc8(x))
x = F.relu(self.fc9(x))

x = x.view(-1, 2, 2)
x = torch.nn.functional.softmax (x, dim = -1)
x = x.view(-1, 4)

# x = x.view(2, 2)
# x = x.reshape(2, 2)
# x = F.softmax(x, dim=1)
# x = x.view(4)

# x1 = F.softmax(x[0, 0:2], dim=0)
# x2 = F.softmax(x[0, 2:4], dim=0)

# x1 = x[0:2]
# x2 = x[2:4]
# x1 = F.softmax(x1, dim=0)
# x2 = F.softmax(x2, dim=0)
# x = torch.cat((x1, x2), 0)

# x = F.softmax(x, dim=1)
return x

def train(train_loader, optimizer, net, my_loss, epochs, D, Px=0.5, alpha=1, toleranceParcent=10, toleranceLen: int=10):

# initialise variables:
tolerance = toleranceParcent / 100

running_loss = 0.0
running_Ed = 0.0
running_RofD = 0.0
running_outputs = 0.0

lossList = []
EdList = []
RofDList = []
outputsKeepwrList = []

for epoch in range(epochs):

if next(net.parameters()).is_cuda: outputs = torch.tensor([0.5, 0.5, 0.5, 0.5]).cuda()
else: outputs = torch.tensor([0.5, 0.5, 0.5, 0.5])

loss = 0
Ed = 0
RofD = 0
outputsKeepwr = 0

# get the inputs; data is a list of [inputs, labels]
# inputs_x1 = data

# forward + backward + optimize:
# -----------------------------
#   forward:
outputs = net(outputs)
#   compute training reconstruction loss
train_loss = loss1(outputs, Px, alpha, D)
# train_loss.backward(retain_graph=True)
train_loss.backward(retain_graph=True)
#    perform parameter update based on current gradients
optimizer.step()
# add the mini-batch training loss to epoch loss
loss += train_loss.item()
add the mini-batch training loss to epoch loss:
Ed += component3(outputs, 0, alpha=1, D=0) + component3(outputs, 1, alpha=1, D=0)
RofD += numerical_mutual_Iinformation(outputs, Px)
outputsKeepwr += outputs

# average for current epoch:
# insert to list:
lossList.append(loss)
EdList.append(loss)
RofDList.append(loss)
outputsKeepwrList.append(loss)

# display progress (epoch training loss):
print("epoch : {}/{}, loss = {:.6f}, E[d] = {:.6f}, R(D) = {:.6f}, out = {:.6f}".format(epoch + 1, epochs, loss, Ed, RofD, outputsKeepwr))

# training efficiency control:
if len(lossList) >= toleranceLen:
stopRunFlag = True
for i in range(1, toleranceLen + 1):
if not ((lossList[-toleranceLen] <= lossList[-i] + tolerance * lossList[-toleranceLen]) and
((lossList[-i] - tolerance * lossList[-toleranceLen] <= lossList[-toleranceLen]))):
stopRunFlag = False
break
if ("stopRunFlag" in locals()) and stopRunFlag:
break

dataDict = {"lossList": lossList, "EdList": EdList, "RofDList": RofDList, "outputsKeepwrList": outputsKeepwrList}
``````

## my loss function(loss1):

``````import torch

def numerical_mutual_Iinformation(output, Px):
comp1 = component1(output, 0)
comp2 = component2(output, Px, 0)
comp3 = component1(output, 1)
comp4 = component2(output, Px, 1)
NMI =  comp1 - comp2 + comp3 - comp4
return NMI

def component1(P_yGIVENx, input_x1):
if (input_x1 == 0):
# P_yGIVENxt = P_yGIVENx[0, 0:2]
P_yGIVENxt = P_yGIVENx[0:2]
else:
# P_yGIVENxt = P_yGIVENx[0, 2:4]
P_yGIVENxt = P_yGIVENx[2:4]

def component2(P_yGIVENx, Px, input_x1):
# if P_yGIVENx.is_cuda: maskXis0 = torch.tensor([True, True, False, False]).cuda()
# else: maskXis0 = torch.tensor([True, True, False, False])
# if P_yGIVENx.is_cuda: maskXis1 = torch.tensor([False, False, True, True]).cuda()
# else: maskXis1 = torch.tensor([False, False, True, True])

multVal = P_yGIVENx * Px
if P_yGIVENx.is_cuda: P_yGIVENxIs0 = multVal[0:2].cuda()
else: P_yGIVENxIs0 = multVal[0:2]
if P_yGIVENx.is_cuda: P_yGIVENxIs1 = multVal[2:4].cuda()
else: P_yGIVENxIs1 = multVal[2:4]

logSumOverX = torch.log2(P_yGIVENxIs0 + P_yGIVENxIs1)

if (input_x1 == 0):
# P_yGIVENxt = P_yGIVENx[0, 0:2]
P_yGIVENxt = P_yGIVENx[0:2]
else:
# P_yGIVENxt = P_yGIVENx[0, 2:4]
P_yGIVENxt = P_yGIVENx[2:4]

def d(x1):
"returns vector of size 2 represent y=0 and y=1 given x1"
if (x1 == 0):
return [0, 1]
else:
return [1, 0]

def component3(P_yGIVENx, input_x1, alpha, D):
if (input_x1 == 0):
# P_yGIVENxt = P_yGIVENx[0, 0:2]
P_yGIVENxt = P_yGIVENx[-1, 0:2]
else:
# P_yGIVENxt = P_yGIVENx[0, 2:4]
P_yGIVENxt = P_yGIVENx[-1, 2:4]
if P_yGIVENxt.is_cuda: out = alpha * (torch.sum(P_yGIVENxt * torch.FloatTensor(d(input_x1)).cuda()) - D)
else: out = alpha * (torch.sum(P_yGIVENxt * torch.FloatTensor(d(input_x1))) - D)

if P_yGIVENxt.is_cuda: returnVal = max(torch.tensor(0).cuda(), out)
else: returnVal = max(torch.tensor(0), out)

return returnVal
# return out
# return max(0, out.data.numpy())

def loss1(output, Px, alpha, D):
comp1 = numerical_mutual_Iinformation(output, Px)
comp2 = component3(output, 0, alpha, D)
comp3 = component3(output, 1, alpha, D)
loss = comp1 + comp2 + comp3
return loss
``````

Thanks

Hello OG!

Please see if you can reproduce your error in a simple script of ten
or twenty lines. The idea would be that you generate some random
example data, call your loss function, and call backward, at which
point the error occurs.

Make sure that the script is complete, self-contained, and runnable.
Leave out any unnecessary complications such as training loops, data