Loss in multitask multiclass classifier for sparsely-populated set

Hi,
I’m having trouble getting a classifier working. It’s a port of a TensorFlow application (unfortunately I can’t release the source for examination) which predicts a binary class label for each of multiple targets.
The problem is that the input data has a high proportion of missing values. These are characterised by their own label, which is supposed to be ignored in the loss function. I’ve tried nll_loss with its built-in ignore_index for this, and also BCE with the network outputs multiplied by zero for missing values - neither seems to work.
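For reference, the BCE attempt was roughly along these lines (a reconstructed sketch, since I can't post the exact code; out, mask, and target are stand-ins, and here the mask is applied via binary_cross_entropy's elementwise weight argument rather than literally zeroing the outputs):

#labels here are still in {-1,0,1}, with 0 = unknown; out is the sigmoid output, shape [record, task]
mask = (labels != 0).float()   #1 where the label is known, 0 where it is missing
target = (labels == 1).float() #1 for actives, 0 for inactives (and for masked-out unknowns)
loss = F.binary_cross_entropy(out, target, weight=mask) #weight rescales each element's loss, so unknowns contribute nothing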
I suspect I’m making a mistake somewhere, but I’m fairly new to PyTorch and can’t figure it out. The network doesn’t appear to be treating each task independently: average ROC AUC across all tasks from the original program is ~0.7-0.8, whereas mine doesn’t go above ~0.55. Below is the (relevant, fairly verbose) code:

import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from sklearn.metrics import roc_auc_score
import Network #local module containing FC_FF_RELU_NN (defined below)

train_features = torch.from_numpy(np.array(train_features))
train_labels = torch.from_numpy(np.array(train_labels))
test_features = torch.from_numpy(np.array(test_features))
test_labels = torch.from_numpy(np.array(test_labels))

test_labels += 1 #Targets in range {-1,0,1} where 0 is unknown datapoint
train_labels += 1 #Change targets range from {-1,0,1} to {0,1,2}

train = torch.utils.data.TensorDataset(train_features.float(), train_labels.long())
train_loader = torch.utils.data.DataLoader(train, batch_size=batch, shuffle=True)
train_loader_2 = torch.utils.data.DataLoader(train, batch_size=train_features.shape[0], shuffle=False)
test = torch.utils.data.TensorDataset(test_features.float(), test_labels.long())
test_loader = torch.utils.data.DataLoader(test, batch_size=batch, shuffle=False)
#Build test, train, and AUROC dataloaders
#for rocauc, convert each label index to a one-hot vector: shape [record, task, class]
rocauc_labels = torch.from_numpy(np.eye(3, dtype=np.int32)[test_labels.numpy()])
rocauc_set = torch.utils.data.TensorDataset(test_features.float(), rocauc_labels.long())
rocauc_loader = torch.utils.data.DataLoader(rocauc_set, batch_size=test_features.shape[0], shuffle=False)
#init network from library
net = Network.FC_FF_RELU_NN([train_features.shape[1],1024,1024,1024,train_labels.shape[1]])

criterion = F.nll_loss #negative log-likelihood loss - expects log-probabilities, so the network ends in log_softmax
optimizer = torch.optim.SGD(net.parameters(),lr=learning_rate, momentum=momentum, weight_decay=L2loss)#, nesterov=True)

listofloss = []
listofclasslls = []
listofclassrocaucs = []
listofaverageclassll = []
listofaveragerocauc = []
listofactives = []

for i, (features, labels) in enumerate(train_loader_2): #one full-dataset batch
    print(labels.shape)
    for k in range(labels.size(1)): #for each task, count the actives (label == 2)
        listofactives.append(int((labels[:, k] == 2).sum()))
print(listofactives)
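#listofactives feeds the commented-out class-weight experiment in the training loop below,
#which scales each task's class weights by min(listofactives) / listofactives[k]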

for epoch in range(num_epochs):
    epochloss = 0.
    for i, (features, labels) in enumerate(train_loader):
        features, labels = Variable(features), Variable(labels) #neither inputs nor integer targets should require grad
        optimizer.zero_grad() #zero gradients
        outputs = net(features, droprate) #calculate outputs from features for batch
        loss = 0. #zero loss
        for k in range(outputs.size(1)): #output dims are: [record, task, ln(classprob)]
            #weight = torch.Tensor([1.0,0,1.0]) #init class weighting - prefer to correctly classify actives
            #weight *= min(listofactives) / listofactives[k]
            loss += criterion(outputs[:,k,:], labels[:,k], ignore_index=1)#, weight=weight)
            #ignore unknown: for each target ignore class=1(unknown) and calculate loss over known for all mols
        loss /= outputs.size(1) #average loss over all tasks
        epochloss += loss.data[0] #increment epoch loss over all batches
        loss.backward()
        optimizer.step()
    epochloss /= (i + 1) #average epoch loss over all batches (enumerate is zero-based)
    if epoch % 2 == 0:
        listofloss.append([epoch, epochloss])
        print('Epoch = {}, average epoch loss = {}'.format(epoch, epochloss))
        for tsfeatures, tslabels in rocauc_loader:
            tsfeatures = Variable(tsfeatures)
            tsoutputs = net(tsfeatures, 0).data.numpy()
        classlls = [epoch] #add epoch no. to stats lists
        classrocaucs = [epoch] # ""      ""
        for t in range(tslabels.size(1)): #for each task (t, to avoid shadowing the batch index i)
            knownpreds = []
            knowntars = []
            for j in range(tslabels.size(0)): #for each record in task t
                if tslabels[j,t,1] != 1: #skip records whose one-hot 'unknown' entry is set
                    knownpreds.append(math.exp(tsoutputs[j,t,2])) #predicted probability of class 2
                    knowntars.append(tslabels[j,t,2]) #1 if the record's true class is 2
            classrocauc = roc_auc_score(knowntars, knownpreds)
            classrocaucs.append(classrocauc)
        listofclassrocaucs.append(classrocaucs)
        listofaveragerocauc.append([epoch,sum(classrocaucs[1:]) / (len(classrocaucs)-1)])
        print("Average AUROC: {}".format(sum(classrocaucs[1:]) / (len(classrocaucs)-1)))

NN code:

class FC_FF_RELU_NN(nn.Module):
    def __init__(self, ListOfLayers): #define the init params of the Network Architecture
        super(FC_FF_RELU_NN, self).__init__()
        self.nb_layers = len(ListOfLayers) #Accept ListOfLayers as an array of ints, specifying input, hidden, and output layer widths
        fc = [] #Init fully connected network
        for i in range(self.nb_layers-2):
            fc.append(nn.Linear(ListOfLayers[i],ListOfLayers[i+1])) #Append layers to list
        self.fc = nn.ModuleList(fc) #Convert to ModuleList
        self.active = nn.Linear(ListOfLayers[-2],ListOfLayers[-1])
        self.inactive = nn.Linear(ListOfLayers[-2],ListOfLayers[-1])
        self.unknown = nn.Linear(ListOfLayers[-2],ListOfLayers[-1])
        #active, inactive, and unknown may not actually correspond to their respective indices, but are placeholder names
    def forward(self, x, droprate): #Define forward function
        for i in range(self.nb_layers-2):
            x = F.relu(self.fc[i](x)) #ReLu
            x = F.dropout(x, p=droprate) #caller passes droprate=0 at eval time to disable dropout
        active = self.active(x)
        inactive = self.inactive(x)
        unknown = self.unknown(x)
        output = torch.stack([active,unknown,inactive],dim=-1) #class dim last; index 1 = unknown, matching ignore_index
        return F.log_softmax(output, dim=-1) #log-probabilities over the class dim, as nll_loss expects
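And a quick sanity check of the output layout, with made-up sizes (2048 input features, 12 tasks, batch of 4):

net = FC_FF_RELU_NN([2048, 1024, 1024, 1024, 12])
out = net(Variable(torch.randn(4, 2048)), 0.5)
print(out.size()) #torch.Size([4, 12, 3]) - [record, task, ln(classprob)], matching the loss loop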

Any help would be greatly appreciated; this has been driving me nuts for a couple of weeks now.