Random forest through back propagation

I am coding random forest through back propagation for MNIST

I created 2 custom layers.

  1. For tree creation and variable selection (100 trees and 200 variables)
  2. For hierarchical node splitting and training contributions for each split to all 10 classes
# defining feature_selection 
class feature_selection_node(nn.Module):
    
    def __init__(self,number_of_trees ,batch_size):
        # define trainable params here
        super(feature_selection_node, self).__init__()
        self.num_of_trees = number_of_trees
        self.attention_mask = torch.nn.Parameter \
                    (data = torch.Tensor(number_of_trees, 28 *28),requires_grad=True )
        self.attention_mask.data.uniform_(-1.0, 1.0)
        self.batch = batch_size

    def forward(self, x):
        x = x.view(-1,28 *28)
        attention_tmp = torch.sigmoid(self.attention_mask)
        #scatter the mask here by only keeping the top 200 values and setting rest to 0
        topk, idx = torch.topk(attention_tmp, k=200, dim=1)
        attention = torch.zeros(self.num_of_trees, 28 *28)
        attention.scatter_(1, idx, topk )
        # multipy and send attention and product
        return_value = torch.zeros(self.batch,self.num_of_trees,28 *28)
        for mask_index in range(0,self.num_of_trees):
            return_value[:,mask_index,:] = (x * attention[mask_index])
        return return_value , attention
# defining feauture_selection 
class decision_node(nn.Module):
    
    def __init__(self, number_of_trees,max_num_of_leaf_nodes , classes ,batch):
        super(decision_node, self).__init__()
        self.leaf = max_num_of_leaf_nodes
        self.tree = number_of_trees
        self.classes = classes
        self.batch = batch

        self.hierarchy_decisions = torch.nn.Parameter(data = torch.Tensor(number_of_trees, \
                                    max_num_of_leaf_nodes, 28 *28+  max_num_of_leaf_nodes,1), \
                                       requires_grad=True )
        self.hierarchy_decisions.data.uniform_(-1.0, 1.0)


        self.hardtanh = nn.Hardtanh()
        self.softmax = nn.Softmax(dim =-1)
        self.contribution= torch.nn.Parameter(data = torch.Tensor(number_of_trees, \
                                        max_num_of_leaf_nodes , classes),requires_grad=True )
        self.contribution.data.uniform_(-1.0, 1.0)

                          
        #Define trainabale params here

    def forward(self, x):
        # use trainable params to define computations here
        return_value = torch.zeros(self.batch,self.tree,self.leaf)
        class_value = torch.zeros(self.batch,self.tree,self.leaf, self.classes)
        
        x = torch.cat((x,return_value), -1) # concatenation of the last axis
        for tree_index in range(0,self.tree):
            for decision_index in range(0, self.leaf):
                
                return_value[:,tree_index,decision_index:decision_index+1]= \
                self.hardtanh(torch.mm( \
                        x[:,tree_index,:] , \
                        self.hierarchy_decisions[tree_index , decision_index ]))
                
                class_value[:,tree_index,decision_index,:] = \
                (return_value[:,tree_index,decision_index:decision_index+1] * \
                        self.contribution[tree_index,  decision_index ])
                
        class_value =  self.softmax(class_value)
        class_value = class_value.sum(dim =0)
        return  return_value , class_value



mask = feature_selection_node(100,batch_size_train)
decision = decision_node(100,200,10,batch_size_train)
params = list(mask.parameters())+ list(decision.parameters())
optimizer = optim.SGD(params, lr=1e-3,momentum=.5)
# training
n_epochs = 3
log_interval = 10
train_losses = []
train_counter = []
test_losses = []
test_counter = [i*len(train_loader.dataset) for i in range(n_epochs + 1)]

def frequency(d):
    dic ={}
    for item in d:
        if item in dic.keys():
            dic[item] = dic[item] + 1
        else:
            dic[item] = 1
    dic = {"values" :dic.keys(),"count" :dic.values()}
    df= pd.DataFrame.from_dict(dic, orient='index').transpose().sort_values(["values"])
    df["cum"] = df["count"]/df["count"].sum()
    value = df["cum"].values
    value = torch.from_numpy(value).float()
    value = value.unsqueeze(0).unsqueeze(0).repeat(100,200,1)
    return value

def train(epoch):
    mask.train()
    decision.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        # feedforward on net
        masked_output , attention = mask(data)
        decision_output, weights= decision(masked_output)
        loss = nn.MSELoss()(weights, frequency(target.numpy()))
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, batch_idx * len(data), len(train_loader.dataset),
            100. * batch_idx / len(train_loader), loss.item()))
            train_losses.append(loss.item())
            train_counter.append(
            (batch_idx*batch_size_train) + ((epoch-1)*len(train_loader.dataset)))
for epoch in range(1, n_epochs+1):
    train(epoch)
    test()

Error:

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [64, 1]], which is output 0 of SliceBackward, is at version 20000; expected version 19999 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

Thanks :slight_smile:

Could you enable detect_anomaly and post the error message here, please? :slight_smile:

Thank you. Solved it. :slight_smile: