Batch Learning with Large Datasets

I am trying to do batch learning on a dataset that is too large to fit into memory. I’ve asked this question before (Batch learning on large datasets), but I still can’t get it to work, even on a toy example.

In the following code, the train1 function passes the entire dataset through the ANN for traditional full-batch learning. The train2 function passes mini-batches through, accumulates the gradients, and updates the weights once at the end of each epoch. I’ve seeded the random number generators, but the weights of the two models do not come out the same. Any ideas on how I should modify train2 so that it does batch learning correctly? (A weighted variant I’ve been experimenting with is sketched after the code.)

import random

import numpy as np
import torch
import torch.nn
import torch.optim
import torch.utils.data

class ANN(torch.nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim):
        super().__init__()
        layers = []
        layers.append(torch.nn.Linear(input_dim, hidden_dim))
        layers.append(torch.nn.LeakyReLU(inplace=True))
        layers.append(torch.nn.Linear(hidden_dim, output_dim))
        self._model = torch.nn.Sequential(*layers)

    def forward(self, X):
        return self._model(X)

def train1(model, X_train, y_train, n_epochs=100):
    loss_function = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters())
    for epoch in range(n_epochs):
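        # One full-batch forward/backward pass and a single optimizer step per epoch.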
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = loss_function(outputs, y_train)
        loss.backward()
        optimizer.step()

def train2(model, X_train, y_train, n_epochs=100, batch_size=32):
    train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    loss_function = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters())
    for epoch in range(n_epochs):
        optimizer.zero_grad()
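        # loss.backward() adds into each parameter's .grad, so the gradients
        # accumulate across all mini-batches before the single step below.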
        for loc_X_train, loc_y_train in train_dataloader:
            outputs = model(loc_X_train)
            loss = loss_function(outputs, loc_y_train)
            loss.backward()
        optimizer.step()

n_sample = 100
rng = np.random.RandomState(0)  # avoid shadowing the stdlib random module
X = rng.uniform(low=-50, high=50, size=(n_sample, 2)).astype(np.float32)
y = np.sum(X, axis=1).reshape(-1, 1)
X, y = torch.from_numpy(X), torch.from_numpy(y)

input_dim = X.shape[1]
output_dim = y.shape[1]
hidden_dim = 4
n_epochs = 1
seed = 1

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
model1 = ANN(input_dim=input_dim, output_dim=output_dim, hidden_dim=hidden_dim)
train1(model1, X, y, n_epochs=n_epochs)

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
model2 = ANN(input_dim=input_dim, output_dim=output_dim, hidden_dim=hidden_dim)
train2(model2, X, y, n_epochs=n_epochs)

for (key1, value1), (key2, value2) in zip(model1.named_parameters(), model2.named_parameters()):
    v1, v2 = value1.detach().numpy(), value2.detach().numpy()
    equal = np.all(v1 == v2)
    isclose = np.all(np.isclose(v1, v2))
    if equal:
        print(key1, "matches")
    else:
        print(key1, "does not match")
        print("is close?", isclose)
        print(v1)
        print(v2)
    print()
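
For reference, the one candidate cause I’ve found so far is the loss reduction: MSELoss defaults to reduction='mean', so each mini-batch loss is averaged over that batch only, and accumulating those gradients gives the sum of the per-batch means rather than the mean over the whole dataset (the last batch is also smaller here, since 100 samples do not divide evenly into batches of 32). Below is a sketch of the weighted variant I’ve been experimenting with; train3 and batch_weight are my own names, and it assumes the accumulation order is irrelevant because all gradients are summed before the single optimizer.step() call:

def train3(model, X_train, y_train, n_epochs=100, batch_size=32):
    train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    loss_function = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters())
    n_total = len(train_dataset)
    for epoch in range(n_epochs):
        optimizer.zero_grad()
        for loc_X_train, loc_y_train in train_dataloader:
            outputs = model(loc_X_train)
            # Weight each mini-batch loss by its share of the dataset so the
            # accumulated gradient matches the gradient of the full-batch mean
            # loss; this also handles the smaller final batch (drop_last=False).
            batch_weight = loc_X_train.shape[0] / n_total
            loss = batch_weight * loss_function(outputs, loc_y_train)
            loss.backward()
        optimizer.step()

Even with this weighting I wouldn’t expect bit-for-bit equality with train1, since the mini-batch summation order differs from the single full-batch reduction, so np.isclose seems like the fairer comparison than ==.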