I am implementing a Region Ensemble Network in PyTorch.

It takes in a resized depth image and outputs the pixel coordinates of the 21 joints.

I cannot figure out why the training loss keeps increasing. Even when it does decrease after I change the hyperparameters, the outputs are all very similar. I am sure that the ground truth and the data have no problem; for example, when I draw the ground truth onto the 96x96 image, it looks like this:

I have tried decreasing/increasing the learning rate (ranging from 0.005 to 0.000005), normalizing and not normalizing the data, data augmentation, and overfitting on a small non-augmented subset of the dataset (the MSRA hand gesture dataset), but it is not working at all. You can see the full implementation on GitHub.

For reference, this is what the ground truth of 21 × 2 (x, y) joint coordinates looks like:

```
tensor([ 49.0274, 65.6594, 53.2944, 46.7441, 55.6024, 37.6823,
57.4913, 30.6067, 59.4096, 23.9927, 46.4607, 45.0358,
45.9472, 33.7956, 45.6864, 25.9997, 45.4752, 18.4846,
40.3416, 45.7006, 37.4622, 36.1925, 35.6829, 30.8469,
34.2946, 27.1403, 33.4312, 49.3788, 30.2387, 44.8089,
28.7085, 42.4615, 26.9225, 39.7115, 55.4961, 62.4478,
61.8277, 57.6757, 66.9980, 53.5504, 73.2470, 50.6870], dtype=torch.float64)
```

But I am getting the output below after training for 100 epochs, with a learning-rate decay multiplier of 0.1 every 10 epochs.

```
[[-0.08046436 -0.0457227 ]
[-0.08309992 -0.05746359]
[-0.08415958 -0.05046146]
[-0.0773853 -0.05147431]
[-0.06778512 -0.06056155]
[-0.0672973 -0.05785097]
[-0.07014311 -0.0637675 ]
[-0.08236345 -0.05145769]
[-0.06171172 -0.05182333]
[-0.07300673 -0.05585903]
[-0.07764702 -0.0533776 ]
[-0.07672743 -0.06045451]
[-0.07716335 -0.05677503]
[-0.08042111 -0.04958814]
[-0.07111529 -0.06132633]
[-0.07012014 -0.05466149]
[-0.07885809 -0.05328601]
[-0.07775773 -0.05257958]
[-0.08589675 -0.05545426]
[-0.0788433 -0.04867259]
[-0.06929056 -0.05824893]]
```

I am using a custom Smooth L1 loss as described in the paper.

```
class Modified_SmoothL1Loss(torch.nn.Module):
    """Smooth L1 loss with a 0.01 transition point.

    Quadratic (0.5 * z^2) for |z| < 0.01, linear (0.01 * (|z| - 0.005))
    otherwise; the two pieces meet with matching value and slope at
    |z| = 0.01. Element losses are summed and divided by the batch size
    (dimension 0), i.e. a per-sample sum averaged over the batch.
    """

    def __init__(self):
        super(Modified_SmoothL1Loss, self).__init__()

    def forward(self, x, y):
        """Return the scalar loss between predictions x and targets y.

        x, y: tensors of identical shape; dim 0 is the batch dimension.
        Vectorized replacement for the original per-element Python loop
        (which was quadratic in tensor size and only handled 2-D input).
        """
        z = x - y
        abs_z = torch.abs(z)
        elementwise = torch.where(
            abs_z < 0.01,
            0.5 * torch.pow(z, 2),          # MSE branch near zero
            0.01 * (abs_z - 0.005),         # L1 branch elsewhere
        )
        return elementwise.sum() / z.shape[0]

This is my network:

```
class RegionEnsemble(nn.Module):
    """Region-ensemble head: slices a 64-channel feature map into 9
    overlapping spatial regions (4 corners, 4 edge-centered, 1 center),
    runs each through its own FC branch, and concatenates the results.

    Output shape: (batch, 9 * 2048).
    """

    def __init__(self, feat_size=12):
        # The half/quarter grid splits below require feat_size % 4 == 0.
        assert (feat_size / 4).is_integer()
        super(RegionEnsemble, self).__init__()
        self.feat_size = feat_size
        self.grids = nn.ModuleList()
        for _ in range(9):
            self.grids.append(self.make_block(self.feat_size))

    def make_block(self, feat_size):
        """Build one FC branch for a (64, feat_size/2, feat_size/2) region.

        Fix: use the feat_size argument instead of silently reading
        self.feat_size (the original ignored its parameter).
        """
        size = int(feat_size / 2)
        return nn.Sequential(
            nn.Linear(64 * size * size, 2048), nn.ReLU(), nn.Dropout(),
            nn.Linear(2048, 2048), nn.ReLU(), nn.Dropout(),
        )

    def forward(self, x):
        # x assumed to be (batch, 64, feat_size, feat_size) — matches the
        # branch input sizes built in make_block.
        mid = int(self.feat_size / 2)
        q1 = int(mid / 2)
        q2 = int(q1 + mid)
        # 4 corner regions
        regions = [
            x[:, :, :mid, :mid], x[:, :, :mid, mid:],
            x[:, :, mid:, :mid], x[:, :, mid:, mid:],
        ]
        # 4 overlapping edge-centered regions
        regions += [
            x[:, :, q1:q2, :mid], x[:, :, q1:q2, mid:],
            x[:, :, :mid, q1:q2], x[:, :, mid:, q1:q2],
        ]
        # middle center region
        regions += [x[:, :, q1:q2, q1:q2]]
        ensemble = []
        for grid, region in zip(self.grids, regions):
            flat = region.contiguous().view(region.size(0), -1)
            ensemble.append(grid(flat))
        return torch.cat(ensemble, 1)
class Residual(nn.Module):
    """Identity-skip residual block: x + conv2(relu(conv1(x))).

    Both convolutions are 3x3, padding 1, so spatial size and channel
    count are preserved. Note there is no activation after the addition.
    """

    def __init__(self, planes):
        super(Residual, self).__init__()
        self.conv1 = nn.Conv2d(planes, planes, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1)

    def forward(self, x):
        # Residual branch F(x), then add the identity shortcut.
        branch = self.conv2(self.relu(self.conv1(x)))
        return x + branch
class REN(nn.Module):
    """Region Ensemble Network: conv/residual backbone producing a
    (batch, 64, feat, feat) map, a 9-region ensemble head, and a final
    FC layer mapping to args.num_joints outputs.

    args must provide integer input_size (input is a single-channel
    square image of that side) and num_joints (size of the output).
    """

    def __init__(self, args):
        super(REN, self).__init__()
        # Spatial size after the three k=2, s=2, p=0 max-pools.
        # For integer s, floor((s - 2) / 2 + 1) == s // 2, so exact
        # integer arithmetic replaces the original np.floor float chain
        # (which leaked a float feat size into RegionEnsemble).
        feat = args.input_size // 2 // 2 // 2
        self.conv0 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.relu0 = nn.ReLU()
        self.conv1 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, padding=1)
        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.relu1 = nn.ReLU()
        # 1x1 convolutions increase channel depth between stages.
        self.conv2_dim_inc = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=1, padding=0)
        self.relu2 = nn.ReLU()
        self.res1 = Residual(planes=32)
        self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.relu3 = nn.ReLU()
        self.conv3_dim_inc = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=1, padding=0)
        self.relu4 = nn.ReLU()
        self.res2 = Residual(planes=64)
        self.maxpool3 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.relu5 = nn.ReLU()
        self.dropout = nn.Dropout()
        self.region_ens = RegionEnsemble(feat_size=feat)
        self.fc1 = nn.Linear(9 * 2048, args.num_joints)

    def forward(self, x):
        out = self.conv0(x)
        out = self.relu0(out)
        out = self.conv1(out)
        out = self.maxpool1(out)
        out = self.relu1(out)
        out = self.conv2_dim_inc(out)
        out = self.relu2(out)
        out = self.res1(out)
        out = self.maxpool2(out)
        out = self.relu3(out)
        out = self.conv3_dim_inc(out)
        out = self.relu4(out)
        out = self.res2(out)
        out = self.maxpool3(out)
        out = self.relu5(out)
        out = self.dropout(out)
        # Slice into regions and run the ensemble head.
        out = self.region_ens(out)
        # Flatten (already (batch, 9*2048); view is a no-op safeguard).
        out = out.view(out.size(0), -1)
        out = self.fc1(out)
        return out
```

This is my training function:

```
def train(train_loader, model, criterion, optimizer, epoch, args):
    """Run one training epoch on the GPU and return [mean loss].

    train_loader yields (input, target) batches; args must provide
    save_dir, name, and print_interval. The per-iteration loss trace is
    written to <save_dir>/<name>/_iteration_train_loss.out.
    """
    model.train()  # enable dropout for training
    loss_train = []
    expr_dir = os.path.join(args.save_dir, args.name)
    for i, (inputs, target) in enumerate(train_loader):
        stime = time.time()
        target = target.float()
        target = target.cuda(non_blocking=False)
        # 'inputs' (not 'input') avoids shadowing the builtin.
        inputs = inputs.float()
        inputs = inputs.cuda()
        # forward / loss
        output = model(inputs)
        loss = criterion(output, target)
        # backward / SGD step
        optimizer.zero_grad()
        loss.backward()
        loss_train.append(loss.item())
        optimizer.step()
        if i % args.print_interval == 0:
            TT = time.time() - stime
            print('epoch: [{0}][{1}/{2}]\t'
                  'Loss {loss:.4f}\t'
                  'Time: {time:.2f}\t'.format(
                      epoch, i, len(train_loader), loss=loss.item(), time=TT))
    # Fix: write the loss trace once per epoch. The original called
    # np.savetxt inside the loop, rewriting the entire (growing) file on
    # every iteration — O(n^2) disk I/O over an epoch.
    np.savetxt(os.path.join(expr_dir, "_iteration_train_loss.out"),
               np.asarray(loss_train), fmt='%f')
    return [np.mean(loss_train)]

Any input on what could have gone wrong is greatly appreciated. I have been working on this for weeks but still cannot figure out what is wrong.