Questions on implementing Large Margin Deep Networks for Classification

The paper Large Margin Deep Networks for Classification proposes learning a large margin directly in neural networks. I tried to understand it by implementing the most simplified case: a linear SVM as a special case.
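
If I read the paper correctly, for an input $x$ with true label $k$ it approximates the distance to the decision boundary against another class $i$ to first order, and penalizes samples whose margin falls below $\gamma$:

$$\ell_i(x) = \max\left\{0,\; \gamma + \frac{f_i(x) - f_k(x)}{\epsilon + \lVert \nabla_x f_i(x) - \nabla_x f_k(x) \rVert_q}\right\}$$

where $\epsilon$ is a small stabilizing constant and $\lVert\cdot\rVert_q$ is the dual of the norm used to measure the margin. In the code below I use $\gamma = 0.5$, $\epsilon = 10^{-6}$, and the squared $l_2$ norm of the input gradient.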

The iris dataset is used:

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, svm

iris = datasets.load_iris()
X = iris.data
y = iris.target

# keep only classes 1 and 2, and the first two features
X = X[y != 0, :2]
y = y[y != 0]

n_sample = len(X)

np.random.seed(0)
order = np.random.permutation(n_sample)
X = X[order]
y = y[order].astype(float)

X_train = X[:int(.9 * n_sample)]
y_train = y[:int(.9 * n_sample)]
X_test = X[int(.9 * n_sample):]
y_test = y[int(.9 * n_sample):]
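
A quick check of the split (after dropping class 0, the remaining labels are 1 and 2; the training loop later remaps them to {0, 1} via y_train - 1):

print(X_train.shape, X_test.shape)  # (90, 2) (10, 2)
print(np.unique(y))                 # [1. 2.]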

For comparison, here is the standard SVM baseline on the same data:


# fit the model
for fig_num, kernel in enumerate(('linear', 'rbf', 'poly')):
    clf = svm.SVC(kernel=kernel, gamma=10)
    clf.fit(X_train, y_train)

    plt.figure(fig_num)
    plt.clf()
    plt.scatter(X[:, 0], X[:, 1], c=y, zorder=10, cmap=plt.cm.Paired,
                edgecolor='k', s=20)

    # Circle out the test data
    plt.scatter(X_test[:, 0], X_test[:, 1], s=80, facecolors='none',
                zorder=10, edgecolor='k')

    plt.axis('tight')
    x_min = X[:, 0].min()
    x_max = X[:, 0].max()
    y_min = X[:, 1].min()
    y_max = X[:, 1].max()

    XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
    Z = clf.decision_function(np.c_[XX.ravel(), YY.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(XX.shape)
    plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
    plt.contour(XX, YY, Z, colors=['k', 'k', 'k'],
                linestyles=['--', '-', '--'], levels=[-.5, 0, .5])

    plt.title(kernel)
plt.show()
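
For reference, a quick sanity check of each kernel's test accuracy on this split (not part of the plot above):

for kernel in ('linear', 'rbf', 'poly'):
    clf = svm.SVC(kernel=kernel, gamma=10).fit(X_train, y_train)
    print(kernel, clf.score(X_test, y_test))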

The model definition:


import torch
import torch.nn as nn
import torch.optim as optim

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.o_liner = nn.Linear(in_features=2, out_features=2)
        parameters = [p for p in self.parameters() if p.requires_grad]
        self.optimizer = optim.SGD(parameters, lr=0.01)

    def forward(self, X):
        return self.o_liner(X)

    def update(self, X, y):
        y_neg = 1 - y                                # index of the other class
        X, y, y_neg = self.to_cuda([X, y, y_neg])
        X.requires_grad_(True)
        y = y.long()
        y_neg = y_neg.long()
        idx = torch.arange(y.size(0))

        # first forward pass: gradient of f_i(x) - f_k(x) w.r.t. the input
        probs = torch.sigmoid(self(X))
        fi_x = probs[idx, y_neg]                     # score of the wrong class
        fk_x = probs[idx, y]                         # score of the true class
        (fi_x - fk_x).sum().backward()
        ws = X.grad.detach()
        ws_p = torch.sum(ws * ws, dim=1)             # squared l2 norm of the input gradient

        # second forward pass: margin loss, treating the gradient norm as a constant
        probs = torch.sigmoid(self(X))
        fi_x = probs[idx, y_neg]
        fk_x = probs[idx, y]
        loss = 0.5 - (fk_x - fi_x) / (1e-6 + ws_p)
        # loss = 0.5 - (fk_x - fi_x)                 # traditional fixed-margin loss; works fine
        loss = torch.clamp(loss, min=0).sum()        # hinge: keep only violated margins

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.parameters(), 20.0)
        self.optimizer.step()
        return loss.item()

    def to_cuda(self, np_array_list):
        # despite the name, this only converts numpy arrays to (CPU) float tensors
        return [torch.from_numpy(e).float() for e in np_array_list]

    def predict(self, X):
        X = self.to_cuda([X])[0]
        with torch.no_grad():
            psbs = torch.sigmoid(self(X))
        _, pred_y = torch.topk(psbs, k=1, dim=1)     # predicted class index in {0, 1}
        return pred_y.cpu().numpy()

model = Model()
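
A quick smoke test of the untrained model, just to confirm shapes (predictions are meaningless at this point):

out = model(torch.randn(4, 2))
print(out.shape)                      # torch.Size([4, 2])
print(model.predict(X_test).ravel())  # class indices in {0, 1}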


Model training and the resulting decision boundary:

for i in range(15000):
    loss = model.update(X_train, y_train - 1)  # remap labels {1, 2} -> {0, 1}
    if i % 1000 == 0:
        print(loss)
plt.figure()
plt.clf()
plt.scatter(X[:, 0], X[:, 1], c=y, zorder=10, cmap=plt.cm.Paired,
            edgecolor='k', s=20)

# Circle out the test data
plt.scatter(X_test[:, 0], X_test[:, 1], s=80, facecolors='none',
            zorder=10, edgecolor='k')


plt.axis('tight')
x_min = X[:, 0].min()
x_max = X[:, 0].max()
y_min = X[:, 1].min()
y_max = X[:, 1].max()

XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
Z = model.predict(np.c_[XX.ravel(), YY.ravel()])
Z = Z.reshape(XX.shape)
plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
# Z holds class indices in {0, 1}, so the decision boundary sits at 0.5
plt.contour(XX, YY, Z, colors='k', linestyles=['-'], levels=[.5])
plt.show()
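
To quantify the result, test accuracy can be checked like this (predict returns indices in {0, 1}, so compare against y_test - 1):

pred = model.predict(X_test).ravel()
print('test accuracy:', (pred == (y_test - 1)).mean())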

Questions:

  • Is the model correctly implemented?
  • Is there a way to run the forward pass only once? (update currently does two forward passes; a sketch of what I mean is below)
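
For the second question, this is the kind of thing I have in mind (an untested sketch, using the tensors from inside update; torch.autograd.grad with retain_graph=True should let the final backward reuse the graph of a single forward pass):

probs = torch.sigmoid(model(X))              # one forward pass
idx = torch.arange(y.size(0))
diff = probs[idx, y] - probs[idx, y_neg]     # f_k(x) - f_i(x)
# input gradient of the score difference; keep the graph alive for the real backward
ws, = torch.autograd.grad(diff.sum(), X, retain_graph=True)
ws_p = torch.sum(ws * ws, dim=1)             # not part of the graph, as in the two-pass version
loss = torch.clamp(0.5 - diff / (1e-6 + ws_p), min=0).sum()
loss.backward()                              # second backward over the same graph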

Other comments are appreciated, thanks!

Result of the linear SVM:
image

Result of the max-margin implementation:
image
It seems to work fine after tuning the parameters.

Changing the loss function to the traditional fixed-margin loss (the commented-out line in update) also works fine:
image