How to speed up training?

Hello, I’m new to PyTorch and was wondering how I could speed up the training of my model; right now it takes about 30 minutes per epoch. I’m building a book recommender system using an RBM. My dataset contains about 52 million ratings of 10k books from about 780k users, so it’s quite large.

I don’t have a dedicated GPU, so CUDA is not an option for me. Is there any way to speed up the process? According to the profiling results, sampling takes up the most time, which makes sense since it iterates over a 10,000 x 5 x 100 tensor, but I can’t really do anything about that, can I? Well, except for shrinking the dataset. The data loading appears to be quite slow as well.
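
In case it matters, this is roughly how I got the profiling numbers: torch.profiler on a single batch, using the UserRatings and RBM classes below (the CSV path is just a placeholder):

import torch.profiler
from torch.utils.data import DataLoader

dataset = UserRatings('ratings.csv')   # placeholder path
rbm = RBM(n_visible=10000)
loader = DataLoader(dataset, batch_size=1000, shuffle=True, num_workers=4)
mask, data = next(iter(loader))        # one batch, so the run stays short

with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CPU]) as prof:
    neg_v = data.float()
    neg_h, _ = rbm.sample_hidden(neg_v)
    neg_v, _ = rbm.sample_visible(neg_h)

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))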

Any tips on how I could make this run faster?

Here is the code for the model and dataset:

import time

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD
import pandas as pd
from scipy.sparse import coo_matrix


class UserRatings(Dataset):
    @classmethod
    def dataframe_to_sparse(cls, path):
        df = pd.read_csv(path)
        user_col, item_col, rating_col = df.columns
        users = df[user_col].to_numpy()
        items = df[item_col].to_numpy()
        ratings = df[rating_col].to_numpy()

        n_users = users.max() + 1
        n_items = items.max() + 1

        return coo_matrix((ratings, (users, items)), shape=(n_users, n_items))

    @classmethod
    def one_hot_encode(cls, x):
        # mask marks which books the user actually rated; ratings 1-5 are
        # one-hot encoded and the "no rating" channel (class 0) is dropped
        mask2d = x > 0
        return torch.stack([mask2d] * 5, dim=1), F.one_hot(x, num_classes=6)[:, 1:]

    def __init__(self, path):
        self.affinity_matrix = self.dataframe_to_sparse(path).tocsr()

    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.tolist()
        # densify one user's row; .long() because F.one_hot expects an integer tensor
        batch = torch.from_numpy(self.affinity_matrix[index].toarray().flatten()).long()
        return self.one_hot_encode(batch)

    def __len__(self):
        return self.affinity_matrix.shape[0]


class RBM(nn.Module):
    def __init__(self,
                 n_visible,
                 n_hidden=100,
                 max_rating=5,
                 steps=1,
                 learning_rate=0.00001,
                 momentum=0.9,
                 weight_decay=0.0001,
                 batch_size=1000,
                 n_epochs=50,
                 seed=None):
        super().__init__()
        self.n_visible = n_visible
        self.n_hidden = n_hidden
        self.max_rating = max_rating
        self.steps = steps
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.batch_size = batch_size
        self.n_epochs = n_epochs

        if isinstance(seed, int):
            torch.random.manual_seed(seed)

        self.W = nn.Parameter(
            0.01 * torch.randn((self.n_visible, self.max_rating, self.n_hidden), dtype=torch.float32))
        self.v_bias = nn.Parameter(torch.zeros((self.n_visible, self.max_rating), dtype=torch.float32))
        self.h_bias = nn.Parameter(torch.zeros((self.n_hidden,), dtype=torch.float32))

    def sample_hidden(self, visible):
        # p(h = 1 | v) for every hidden unit, plus a Bernoulli sample of it
        probabilities = torch.sigmoid(torch.tensordot(visible, self.W, dims=[[1, 2], [0, 1]]) + self.h_bias)
        return torch.bernoulli(probabilities), probabilities

    def sample_visible(self, hidden):
        # p(v | h): softmax over the rating dimension (last dim), then one categorical sample per book
        probabilities = torch.softmax(torch.tensordot(hidden, self.W, dims=[[1], [2]]) + self.v_bias, dim=-1)
        return torch.distributions.Categorical(probs=probabilities).sample(), probabilities

    def free_energy(self, x):
        # per-sample free energy: visible bias term plus softplus over the hidden pre-activations
        visible_term = -torch.tensordot(x, self.v_bias, dims=[[1, 2], [0, 1]])
        hidden_term = -torch.sum(F.softplus(torch.tensordot(x, self.W, dims=[[1, 2], [0, 1]]) + self.h_bias), dim=1)
        return visible_term + hidden_term

    def forward(self, v):
        return self.sample_hidden(v)

    def fit(self, dataset):
        self.train()
        train_loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, num_workers=4)
        train_optimizer = SGD(self.parameters(), lr=self.learning_rate, momentum=self.momentum,
                              weight_decay=self.weight_decay)

        for epoch in range(self.n_epochs):
            start = time.time()
            mse = 0.
            for mask, data in train_loader:
                # Contrastive divergence: start the Gibbs chain at the data
                neg_v = data.float()
                pos_h, pos_h_probs = self.sample_hidden(neg_v)
                neg_h = pos_h
                for t in range(self.steps):
                    neg_v, _ = self.sample_visible(neg_h)
                    neg_v = F.one_hot(neg_v, num_classes=5).float()
                    neg_v *= mask  # exclude books with missing ratings
                    neg_h, _ = self.sample_hidden(neg_v)

                cost = torch.mean(self.free_energy(data.float()) - self.free_energy(neg_v))

                train_optimizer.zero_grad()
                cost.backward()
                train_optimizer.step()

                batch_size = data.size()[0]
                # .item() so the computation graph is not kept alive across batches
                batch_mse = torch.div(torch.sum(torch.pow(data - neg_v, 2)), batch_size).item()
                mse += batch_mse

            mse /= len(train_loader)  # average over the number of batches
            end = time.time()

            print('Epoch: {}\tMSE: {}\tTime: {}'.format(epoch, mse, end - start))
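
And for completeness, this is roughly how I run it (the file name is a placeholder; the hyperparameters are just the defaults):

dataset = UserRatings('ratings.csv')   # first three columns: user, book, rating
rbm = RBM(n_visible=10000, n_epochs=50, batch_size=1000)
rbm.fit(dataset)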