Help troubleshooting: learning on group-averaged data

I am working on a project where it is easy to obtain data averaged over a group of samples but difficult to obtain data on individual samples, and I would like to train neural networks that predict on individual samples. I would greatly appreciate help troubleshooting my PyTorch implementation.

Here is an outline for an equivalent problem:

  • 10,000 boxes each contain between 1 and 10 objects of several different types (A, B, C)
  • Each object has some value based on 100 features
  • Only the average value of the objects in an entire box can be obtained for learning, never the values of individual objects (see the sketch after this list)
  • The derivative of value w.r.t. features is also desired.
  • The goal is to train one value-predicting model per object type (e.g. network “A” is fit to predict the value of any A-type object)
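
To make the supervision signal concrete: the only quantity that can be compared against a measured label is the mean of the per-object predictions for a whole box. A minimal sketch of that relationship (the names here are illustrative, not taken from my code below):

    import torch

    # hypothetical per-type value networks, one per object type
    submodels = {"A": torch.nn.Linear(100, 1), "B": torch.nn.Linear(100, 1)}

    # one box: a few objects of each type, each described by 100 features
    box = {"A": torch.rand(3, 100), "B": torch.rand(2, 100)}

    # only this per-box mean can be trained against the measured box value
    per_object = torch.cat([submodels[t](feats) for t, feats in box.items()])
    predicted_box_value = per_object.mean()
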
  1. I am using one overarching network to backpropagate the box-level value through the object-level (sub)networks. The arrangement of (sub)networks changes with every sample, matching the contents of each box. Is this the correct approach?

  2. Each box’s data is stored in a dictionary with one key per object type; under each key is a list of “feature” tensors, one for each object of that type. I suspect that reading these lists box by box is inefficient. Is there a better approach to loading this data, given that the amount of information varies from sample to sample?
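
One alternative I have been wondering about (not implemented in the code below) is to flatten each minibatch so that every submodel sees all objects of its type in a single forward pass, and then average the per-object values back into their boxes with an index-based reduction. A rough sketch of what I mean, where box_means_by_type is a hypothetical helper and batch has the same dict-of-lists structure as my data:

    import torch

    def box_means_by_type(batch, submodels):
        """Run each submodel once per minibatch instead of once per object,
        then average the per-object values back into their boxes."""
        totals = torch.zeros(len(batch))
        counts = torch.zeros(len(batch))
        for type_, submodel in submodels.items():
            feats, box_ids = [], []
            for box_idx, box in enumerate(batch):
                for obj in box.get(type_, []):      # obj has shape (1, 100)
                    feats.append(obj)
                    box_ids.append(box_idx)
            if not feats:
                continue
            values = submodel(torch.cat(feats, dim=0)).squeeze(-1)
            idx = torch.tensor(box_ids)
            totals = totals.index_add(0, idx, values)
            counts = counts.index_add(0, idx, torch.ones_like(values))
        return totals / counts  # one predicted mean value per box

Would something along these lines be preferable to the nested loops in Model.forward below, or is there a more standard way to feed variable-sized samples (e.g. a DataLoader with a custom collate_fn)?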

I have attached my code below and would appreciate any tips on improving it, as well as alternative approaches to the problem.

import torch
import numpy as np


class Submodel(torch.nn.Module):
    def __init__(self, type_):
        """
        One submodel to handle each type of object.
        """
        super(Submodel, self).__init__()
        self.type_ = type_
        # ModuleList (rather than a plain Python list) registers the layers'
        # parameters with the module; it also avoids add_module() names
        # containing ".", which PyTorch rejects.
        self.layers = torch.nn.ModuleList([
            torch.nn.Linear(100, 15),
            torch.nn.Linear(15, 15),
            torch.nn.Softplus(),
            torch.nn.Dropout(p=0.20),
            torch.nn.Linear(15, 15),
            torch.nn.Softplus(),
            torch.nn.Dropout(p=0.20),
            torch.nn.Linear(15, 1),
        ])

    def forward_with_derivatives(self, data):
        """
        Used once the network is fully trained: returns the predicted value
        and its gradient with respect to the input features.
        """
        data.requires_grad_(True)  # otherwise data.grad is never populated
        output = self.forward(data)
        output.backward()
        return output, data.grad

    def forward(self, data):
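        """Plain forward pass: the predicted value for one object's features."""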
        h = data
        for layer in self.layers:
            h = layer(h)
        output = h
        return output


class Model(torch.nn.Module):
    """
    Only used for training, when only box-level data is available.
    """
    def __init__(self, types):
        super(Model, self).__init__()
        # ModuleDict keeps the per-type lookup while registering each
        # submodel's parameters with the parent model.
        self.submodels = torch.nn.ModuleDict()
        for type_ in types:
            self.submodels[type_] = Submodel(type_)

    def forward(self, batch_data):
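        """
        batch_data is a list of boxes; each box is a dict mapping an object
        type to a list of (1, 100) feature tensors.  Each object is scored by
        its type's submodel and the per-box mean is returned, so the output
        has one value per box in the batch.
        """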
        batch_output = []
        for data in batch_data:
            obj_values = []
            for type_, type_data in data.items():
                type_values = []
                for obj_features in type_data:
                    obj_value = self.submodels[type_](obj_features)
                    type_values.append(obj_value)
                obj_values.extend(type_values)
            box_value = torch.cat(obj_values, dim=0).mean(0)
            batch_output.append(box_value)
        return torch.cat(batch_output, dim=0)


class NetworkHandler:
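    """Owns the data, the train/test split, the composite model and the training loop."""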
    def __init__(self, types):
        self.types = types
        self.indices = np.arange(10000)  # number of samples
        self.training_indices = []
        self.training_set = []
        self.testing_set = []
        self.model = None

        self.loss = torch.nn.MSELoss()
        self.batch_loss = torch.nn.MSELoss(reduction='none')  # per-sample losses for test()
        self.dtype = torch.FloatTensor

    def placeholder_data(self):
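        """
        Build random stand-in data with the same structure as the real data:
        per box, a dict mapping object type to a list of (1, 100) feature
        tensors, plus one scalar target value per box, split 80-20.
        """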
        boxes = []
        for i in self.indices:
            box = {}
            for type_ in self.types:
                n_points = np.random.randint(1, 11)  # 1-10 objects of this type, matching the outline
                objs = []
                for j in np.arange(n_points):
                    feature = np.random.rand(1, 100)
                    # from_numpy gives float64; cast to the float32 dtype used for training
                    objs.append(torch.from_numpy(feature).type(self.dtype))
                box[type_] = objs
            boxes.append(box)
        self.data = boxes[:8000]
        self.data_test = boxes[-2000:]
        self.training_indices = self.indices[:8000]
        #  arbitrary 80-20 split

        box_values = np.random.rand(len(self.indices),)*1000
        self.target = box_values[:8000]
        self.target_test = box_values[-2000:]
        
        self.test_batches = np.array_split(np.arange(len(self.data_test)), 10)
        #  10 arbitrary minibatches for testing due to memory constraints

    def train(self):
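        """Run one epoch over shuffled minibatches and return the mean minibatch loss."""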
        np.random.shuffle(self.training_indices)
        self.model.train()
        epoch_loss = 0
        batch_steps = 0
        for minibatch in np.array_split(self.training_indices, 100):
            #  each minibatch is a list of sample indices
            #  arbitrary 100 minibatches of 100 samples each
            self.optimizer.zero_grad()
            output = self.model([self.data[i] for i in minibatch])
            batch_target = self.dtype([self.target[i] for i in minibatch])
            loss = self.loss(output, batch_target)
            batch_loss = loss.item()  # scalar minibatch loss for logging
            epoch_loss += batch_loss
            loss.backward()
            self.optimizer.step()
            batch_steps += 1
        epoch_loss /= batch_steps
        return epoch_loss

    def test(self):
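        """Evaluate on the held-out boxes in minibatches and return the mean per-sample loss."""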
        self.model.eval()
        batch_outputs = []
        sample_losses = []
        with torch.no_grad():  # no graph is needed at test time, which also saves memory
            for minibatch in self.test_batches:
                #  minibatches for testing due to memory constraints
                out = self.model([self.data_test[i] for i in minibatch])
                batch_target = self.dtype([self.target_test[i]
                                           for i in minibatch])
                batch_loss = self.batch_loss(out, batch_target)
                sample_losses.extend(batch_loss.numpy().tolist())
                batch_outputs.extend(out.numpy().tolist())
        return np.mean(sample_losses)

    def fit(self, epochs=1000):
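        """Build the model and optimizer, then alternate training and testing; returns the loss histories."""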
        self.model = Model(self.types)
        self.optimizer = torch.optim.Adam(self.model.parameters())
        train_losses = []
        test_losses = []
        try:
            for epoch in range(epochs):
                train_loss = self.train()
                train_losses.append(train_loss)
                test_loss = self.test()
                print('Train: {0:<10.1f}| Test: {1:<10.1f}'.format(train_loss, test_loss))
                test_losses.append(test_loss)
        except (KeyboardInterrupt, SystemExit):
            print('\n')
        return train_losses, test_losses

types = ['A', 'B', 'C']
network = NetworkHandler(types)
network.placeholder_data()
train_losses, test_losses = network.fit(epochs=1000)