I am implementing a random forest for MNIST, trained end-to-end through backpropagation.
I created 2 custom layers:
- one for tree creation and variable selection (100 trees, 200 variables per tree)
- one for hierarchical node splitting, with a trainable contribution from each split to all 10 classes
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd

# (train_loader, batch_size_train and test() are defined in my MNIST data-loading / evaluation code, not shown here)

# defining feature_selection: one soft attention mask per tree, sparsified to its top 200 pixels
class feature_selection_node(nn.Module):
    def __init__(self, number_of_trees, batch_size):
        # define trainable params here: one attention mask of size 28*28 per tree
        super(feature_selection_node, self).__init__()
        self.num_of_trees = number_of_trees
        self.attention_mask = torch.nn.Parameter(
            data=torch.Tensor(number_of_trees, 28 * 28), requires_grad=True)
        self.attention_mask.data.uniform_(-1.0, 1.0)
        self.batch = batch_size

    def forward(self, x):
        x = x.view(-1, 28 * 28)
        attention_tmp = torch.sigmoid(self.attention_mask)
        # sparsify the mask: keep only the top 200 values per tree and set the rest to 0
        topk, idx = torch.topk(attention_tmp, k=200, dim=1)
        attention = torch.zeros(self.num_of_trees, 28 * 28)
        attention.scatter_(1, idx, topk)
        # multiply each image by every tree's mask and return both the product and the mask
        return_value = torch.zeros(self.batch, self.num_of_trees, 28 * 28)
        for mask_index in range(0, self.num_of_trees):
            return_value[:, mask_index, :] = (x * attention[mask_index])
        return return_value, attention
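To make the expected shapes explicit, this is a quick check I would run on the first layer (the variable names and the batch size of 4 are only for illustration):

# quick shape check on the feature-selection layer (illustrative only)
mask_check = feature_selection_node(number_of_trees=100, batch_size=4)
sample = torch.randn(4, 1, 28, 28)
masked, attn = mask_check(sample)
print(masked.shape)            # torch.Size([4, 100, 784])
print(attn.shape)              # torch.Size([100, 784])
print((attn != 0).sum(dim=1))  # 200 non-zero pixels kept per tree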
# defining decision_node: for every tree, each of the 200 splits is a trainable linear decision
# over the 784 masked pixels plus the outputs of the other splits of the same tree,
# and each split has a trainable contribution to all 10 classes
class decision_node(nn.Module):
    def __init__(self, number_of_trees, max_num_of_leaf_nodes, classes, batch):
        # define trainable params here
        super(decision_node, self).__init__()
        self.leaf = max_num_of_leaf_nodes
        self.tree = number_of_trees
        self.classes = classes
        self.batch = batch
        self.hierarchy_decisions = torch.nn.Parameter(
            data=torch.Tensor(number_of_trees, max_num_of_leaf_nodes,
                              28 * 28 + max_num_of_leaf_nodes, 1),
            requires_grad=True)
        self.hierarchy_decisions.data.uniform_(-1.0, 1.0)
        self.hardtanh = nn.Hardtanh()
        self.softmax = nn.Softmax(dim=-1)
        self.contribution = torch.nn.Parameter(
            data=torch.Tensor(number_of_trees, max_num_of_leaf_nodes, classes),
            requires_grad=True)
        self.contribution.data.uniform_(-1.0, 1.0)

    def forward(self, x):
        # use the trainable params to define the computations here
        return_value = torch.zeros(self.batch, self.tree, self.leaf)
        class_value = torch.zeros(self.batch, self.tree, self.leaf, self.classes)
        x = torch.cat((x, return_value), -1)  # concatenation on the last axis
        for tree_index in range(0, self.tree):
            for decision_index in range(0, self.leaf):
                return_value[:, tree_index, decision_index:decision_index + 1] = \
                    self.hardtanh(torch.mm(
                        x[:, tree_index, :],
                        self.hierarchy_decisions[tree_index, decision_index]))
                class_value[:, tree_index, decision_index, :] = \
                    (return_value[:, tree_index, decision_index:decision_index + 1] *
                     self.contribution[tree_index, decision_index])
        class_value = self.softmax(class_value)
        class_value = class_value.sum(dim=0)
        return return_value, class_value
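Likewise, a small sanity check of the decision layer (tiny sizes here, 2 trees and 3 splits, purely so the double loop stays fast):

# quick shape check on the decision layer (tiny sizes, illustrative only)
small_mask = feature_selection_node(number_of_trees=2, batch_size=4)
small_decision = decision_node(2, 3, 10, 4)
masked_small, _ = small_mask(torch.randn(4, 1, 28, 28))
splits, weights = small_decision(masked_small)
print(splits.shape)   # torch.Size([4, 2, 3]) : one hardtanh output per (sample, tree, split)
print(weights.shape)  # torch.Size([2, 3, 10]): per-(tree, split) class weights, summed over the batch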
mask = feature_selection_node(100, batch_size_train)
decision = decision_node(100, 200, 10, batch_size_train)
params = list(mask.parameters()) + list(decision.parameters())
optimizer = optim.SGD(params, lr=1e-3, momentum=0.5)
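For scale (my own rough count), the two layers together hold about 20 million trainable values:

# rough parameter count, just as a sanity check
print(sum(p.numel() for p in params))
# mask:     100 * 784               =     78,400
# decision: 100 * 200 * (784 + 200) = 19,680,000
#           + 100 * 200 * 10        =    200,000
# total                             = 19,958,400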
# training
n_epochs = 3
log_interval = 10
train_losses = []
train_counter = []
test_losses = []
test_counter = [i*len(train_loader.dataset) for i in range(n_epochs + 1)]
def frequency(d):
    # target for the loss: the relative frequency of each class in the batch,
    # repeated for every (tree, split) pair
    dic = {}
    for item in d:
        if item in dic.keys():
            dic[item] = dic[item] + 1
        else:
            dic[item] = 1
    dic = {"values": dic.keys(), "count": dic.values()}
    df = pd.DataFrame.from_dict(dic, orient='index').transpose().sort_values(["values"])
    df["cum"] = df["count"] / df["count"].sum()
    value = df["cum"].values
    value = torch.from_numpy(value).float()
    value = value.unsqueeze(0).unsqueeze(0).repeat(100, 200, 1)
    return value
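Tracing frequency by hand on a toy batch (my own worked example, not part of the actual run):

# frequency([1, 1, 2, 7]) -> counts {1: 2, 2: 1, 7: 1}
#   -> proportions [0.50, 0.25, 0.25]
#   -> tensor of shape (100, 200, 3): the batch's class proportions repeated for every (tree, split)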
def train(epoch):
    mask.train()
    decision.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        # feedforward on net
        masked_output, attention = mask(data)
        decision_output, weights = decision(masked_output)
        loss = nn.MSELoss()(weights, frequency(target.numpy()))
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            train_losses.append(loss.item())
            train_counter.append(
                (batch_idx * batch_size_train) + ((epoch - 1) * len(train_loader.dataset)))
for epoch in range(1, n_epochs + 1):
    train(epoch)
    test()
Running this fails with the following error:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [64, 1]], which is output 0 of SliceBackward, is at version 20000; expected version 19999 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
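If it helps, I read the hint as wrapping the run with anomaly detection so the failing operation is reported with a traceback, along these lines:

# enable anomaly detection before training to locate the offending in-place op
torch.autograd.set_detect_anomaly(True)
for epoch in range(1, n_epochs + 1):
    train(epoch)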
Thanks