# When can backward() be safely omitted?

Hi, I am trying to implement a customized loss function as follows — can it work without a customized backward routine? More generally, how do I know whether an operation is supported by autograd automatically, so that I don't need to specify a backward pass?

``````class Loss(torch.autograd.Function):
'''
Implement the loss function from output from RNN.
Ref paper: https://arxiv.org/abs/1308.0850
'''
def __init__(self):
'''
x is sequence of coordinates with dim (batch, seq_length, 3).
Parameters are sequence of output from rnn with dim (batch, seq_length, 128).
'''
self.e = []      	  # predicted end of stroke probability scalar
self.m1 = []        # vector of means for x1 with len 20
self.m2 = []        # vector of means for x2 with len 20
self.pi = []        # vector of mixture density network coefficients with len 20
self.rho = []       # vector of correlation with len 20
self.s1 = []        # vector of standard deviation for x1 with len 20
self.s2 = []        # vector of standard deviation for x2 with len 20
self.x1 = []        # x1 coordinate at t+1
self.x2 = []        # x2 coordinates at t + 1
self.et = []        # end of probability indicator from ground truth
self.parameters = []
self.batch = 0      #batch size
self.seq_length = 0 # reduce by 1 because loss is caculated at t+1 timestamp

def forward(self, x, para):
'''
Implement eq 26 of ref paper for single time stamp.
'''
self.save_for_backward(para)
total_loss = 0
for i in range(self.seq_length):
# prepare parameters
self.__get_para(i, x, para)
normalpdf = self.__para2normal(self.x1, self.x2, self.m1, self.m2, self.s1, self.s2, self.rho) #dim (n_batch, 20)
single_loss = self.__singleLoss(normalpdf)
total_loss += single_loss

def __get_para(self, i, x, para):
'''
Slice and process parameters to the right form.
Implementing eq 18-23 of ref paper.
'''
self.batch = torch.size(x)
self.e = torch.sigmoid(-para[:,i,0])  # eq 18
self.parameters = para
self.seq_length = torch.size(x) -1 # reduce by 1 because loss is caculated at t+1 timestamp

# slice remaining parameters and training inputs
self.pi, self.m1, self.m2, self.s1, self.s2, self.rho = torch.split(self.parameters[:,i,1:], 6, dim = 1)
self.x1 = x[:,i+1,0]
self.x2 = x[:,i+1,1]
self.et = x[:,i+1,2]

## process parameters
# pi
max_pi = torch.max(self.pi, dim = 1)
max_pi = max_pi.expand_as(self.pi)
self.pi.sub_(max_pi)
red_sum = torch.sum(self.pi, dim = 1).expand_as(self.pi)
self.pi.div_(red_sum)

# sd
self.s1.exp_()
self.s2.exp_()

# rho
self.rho.tanh_()

# reshape ground truth x1, x2 to match m1, m2 because broadcasting is currently not supported by pytorch
self.x1.expand_as(self.m1)
self.x2.expand_as(self.m2)

def __para2normal(self, x1, x2, m1, m2, s1, s2, rho):
'''
Implement eq 24, 25 of ref paper.
'''
norm1 = x1.sub(m1)
norm2 = x2.sub(m2)
s1s2 = torch.mul(s1, s2)
z = torch.pow(torch.div(norm1, s1), 2) + torch.pow(torch.div(norm2, s2), 2) - \
2*torch.div(torch.mul(pho, torch.mul(norm1, norm2)), s1s2)
negRho = 1 - torch.pow(rho, 2)
expPart = torch.exp(torch.div(-z, torch.mul(negRho, 2)))
coef = 2*np.pi*torch.mul(s1s2, torch.sqrt(negRho))
result = torch.div(expPart, coef)
return result

def __singleLoss(self, normalpdf):
'''
Calculate loss for single time stamp.
Input: normalpdf (n_batch, 20).
'''
epsilon = 1e-20  # floor of loss from mixture density component since initial loss could be zero
mix_den_loss = torch.mul(self.pi, normalpdf)
red_sum_loss = torch.sum(mix_den_loss)  # sum for all batch
end_loss = torch.mul(self.e, self.et) + torch.mul(1-self.e, 1 - self.et)
total_loss = -torch.log(tf.max(red_sum_loss, epsilon)) - torch.log(end_loss)
If you write the loss function as a subclass of `nn.Module`, then you don't need to specify the backward. If you write it as a subclass of `autograd.Function`, you have to specify the backward.