I believe it is massive compared to all the other things I've seen online.

This is how it runs on the CPU (so without any `model.to(device)` calls or the like).

May I already say thank you for even responding.

```
# %%
import pickle

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

# This model was originally trained on a GPU-enabled system.
# Checking for GPU availability
if torch.cuda.is_available():
    print('Training on GPU:')
    print(torch.cuda.get_device_name(0))
else:
    print('Training on CPU!!!')
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# The pre-processed datasets are loaded and split into 75% train, 15% test and 10% validation sets
seqs = np.array(pickle.load(open('Data.seqs', 'rb')), dtype=object)
# Quick sanity check: dump the raw sequences to CSV for inspection
a = pd.DataFrame(seqs)
a.to_csv("dat.csv")
def load_data(seqFile, labelFile, test_frac=0.15, valid_frac=0.10):
    sequences = np.array(pickle.load(open(seqFile, 'rb')), dtype=object)
    labels = np.array(pickle.load(open(labelFile, 'rb')), dtype=object)
    datasize = len(labels)
    # Shuffle the patient indices, then carve off the test and validation splits
    idx = np.random.permutation(datasize)
    nTest = int(np.ceil(test_frac * datasize))
    nValid = int(np.ceil(valid_frac * datasize))
    test_idx = idx[:nTest]
    valid_idx = idx[nTest:nTest + nValid]
    train_idx = idx[nTest + nValid:]

    train_x = sequences[train_idx]
    train_y = labels[train_idx]
    test_x = sequences[test_idx]
    test_y = labels[test_idx]
    valid_x = sequences[valid_idx]
    valid_y = labels[valid_idx]

    train_x = [sorted(seq) for seq in train_x]
    train_y = [sorted(seq) for seq in train_y]
    valid_x = [sorted(seq) for seq in valid_x]
    valid_y = [sorted(seq) for seq in valid_y]
    test_x = [sorted(seq) for seq in test_x]
    test_y = [sorted(seq) for seq in test_y]

    train = (train_x, train_y)
    test = (test_x, test_y)
    valid = (valid_x, valid_y)
    return train, test, valid
# Padding sequences: to address variable-length sequences
def padding(seqs, labels, inputDimSize, numClass):
    # Number of prediction steps per patient (one less than the number of visits)
    lengths = np.array([len(seq) for seq in seqs]) - 1
    # The highest number of visits any single patient has
    maxlen = np.max(lengths)
    # The number of samples (patients) in the batch
    num_samples = len(seqs)
    # 3d tensors of zeros with (maxlen, num_samples, features) dimensions
    x = torch.zeros(maxlen, num_samples, inputDimSize)
    y = torch.zeros(maxlen, num_samples, numClass)
    # 2d zeros mask marking which time steps are real (not padding)
    mask = torch.zeros(maxlen, num_samples)
    # Loop over all patient visit lists; enumerate gives the patient index
    for idx, (seq, label) in enumerate(zip(seqs, labels)):
        # Multi-hot encode all but the last visit as the input
        for xvec, subseq in zip(x[:, idx, :], seq[:-1]):
            xvec[subseq] = 1.
        # Multi-hot encode all but the first visit as the label
        for yvec, subseq in zip(y[:, idx, :], label[1:]):
            if subseq[0] < 447:  # dataset-specific code threshold
                yvec[subseq] = 1.
        mask[:lengths[idx], idx] = 1.
    lengths = torch.LongTensor(lengths)
    return x, y, mask, lengths
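# A shape sketch (assumed batch): for 100 patients whose longest record has 11 visits,
# padding() returns x of shape (10, 100, inputDimSize), y of shape (10, 100, numClass),
# mask of shape (10, 100) and lengths of shape (100,).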
# Labels
# The labels are shifted over by one visit so that the algorithm predicts the next time step.
# The training data excludes the last visit within each patient's history,
# using this logic `for xvec, subseq in zip(x[:, idx, :], seq[:-1]):`,
# where we take all but the last visit within each patient's visit record, `seq[:-1]`.
# For the labels, this means the sequences start from the patient's second visit,
# or in Python's indexing style the first index, `for yvec, subseq in zip(y[:, idx, :], label[1:])`,
# where the label, `label[1:]`, is shifted by one.
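# For example, a (hypothetical) patient with visits [v1, v2, v3] yields inputs
# [v1, v2] and labels [v2, v3]: at each step the model sees the current visit
# and is asked to predict the codes of the next one.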
# Data Loaders and Sampler
# The `Dataset` class represents the data as x and y pairs.
class Dataset:
    def __init__(self, x, y):
        self.x, self.y = x, y
    def __len__(self): return len(self.x)
    def __getitem__(self, i): return self.x[i], self.y[i]
# The `Sampler` class randomly shuffles the order of the training set (the validation set is not randomized).
# Additionally, it keeps only the exact number of sequences needed to create full batches.
class Sampler:
    def __init__(self, ds, bs, shuffle=False):
        # Note: self.n = (len(ds) // bs) * bs drops the leftover samples so every batch is full
        self.n, self.bs, self.shuffle = (len(ds) // bs) * bs, bs, shuffle
    def __iter__(self):
        self.idxs = torch.randperm(self.n) if self.shuffle else torch.arange(self.n)
        for i in range(0, self.n, self.bs): yield self.idxs[i:i + self.bs]
# The `DataLoader` class combines the dataset and the sampler; it iterates over the dataset and yields batches.
def collate(batch_pairs):
    x, y = zip(*batch_pairs)
    return x, y

class DataLoader:
    def __init__(self, ds, sampler, collate_fn=collate):
        self.ds, self.sampler, self.collate_fn = ds, sampler, collate_fn
    def __len__(self): return len(self.ds)
    def __iter__(self):
        for s in self.sampler: yield self.collate_fn([self.ds[i] for i in s])
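# A small toy sketch (hypothetical data) of how the three pieces compose:
#   ds = Dataset([[0], [1], [2], [3]], [[0], [1], [2], [3]])
#   dl = DataLoader(ds, Sampler(ds, 2, shuffle=True))
#   for xb, yb in dl:
#       ...  # xb and yb are each a tuple of 2 variable-length sequences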
# The Custom_Embedding class projects the high-dimensional multi-hot encoded vectors to a
# lower-dimensional space before presenting the input data to the GRU. For this step the author used two approaches:
# 1. Random initialisation, then learn the appropriate $W_{emb}$ weights during back-prop:
#    $h_{i}^{(1)} = \tanh(x_{i}^{T} W_{emb} + b_{emb})$
# 2. A pre-trained embedding initialised with the Skip-gram algorithm, then refine the weights during back-prop:
#    $h_{i}^{(1)} = x_{i}^{T} W_{emb}$
#
# It should be the second variant, but is it correct this way?
# Embedding Layer
class Custom_Embedding(nn.Module):
    def __init__(self, inputDimSize, embSize):
        super(Custom_Embedding, self).__init__()
        self.inputDimSize = inputDimSize
        self.embSize = embSize
        self.W_emb = nn.Parameter(torch.randn(self.inputDimSize, self.embSize) * 0.01)
        self.b_emb = nn.Parameter(torch.zeros(self.embSize))
    def forward(self, x):
        # Variant 2: linear projection with the (pre-trained) embedding matrix
        return x @ self.W_emb
        # Variant 1: return torch.tanh(x @ self.W_emb + self.b_emb)
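# Shape sketch (assumed): x is (maxlen, batchSize, inputDimSize) and W_emb is
# (inputDimSize, embSize), so the projection yields (maxlen, batchSize, embSize).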
# Dropout Layer
def dropout_mask(x, sz, p):
    # Inverted dropout: zero entries with probability p and scale the kept ones by 1/(1-p)
    return x.new(*sz).bernoulli_(1 - p).div_(1 - p)
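# For example, with p=0.5 every mask entry is either 0 or 2, so the expected value
# of a masked activation equals the original activation. Below, the mask size has a
# 1 in the batch dimension, so one mask is shared across the whole batch.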
class EHR_GRU(Custom_Embedding):
    def __init__(self, inputDimSize, hiddenDimSize, embSize, numClass, numLayers):
        super().__init__(inputDimSize, embSize)
        self.numClass = numClass
        self.numLayers = numLayers
        self.hiddenDimSize = hiddenDimSize
        self.emb = Custom_Embedding(inputDimSize, embSize)
        # Input-to-hidden (W) and hidden-to-hidden (U) gate weights, plus biases
        self.W_r = nn.Parameter(torch.randn(embSize, hiddenDimSize) * 0.01)
        self.W_z = nn.Parameter(torch.randn(embSize, hiddenDimSize) * 0.01)
        self.W_h = nn.Parameter(torch.randn(embSize, hiddenDimSize) * 0.01)
        self.U_r = nn.Parameter(torch.randn(hiddenDimSize, hiddenDimSize) * 0.01)
        self.U_z = nn.Parameter(torch.randn(hiddenDimSize, hiddenDimSize) * 0.01)
        self.U_h = nn.Parameter(torch.randn(hiddenDimSize, hiddenDimSize) * 0.01)
        self.b_r = nn.Parameter(torch.randn(hiddenDimSize))
        self.b_z = nn.Parameter(torch.randn(hiddenDimSize))
        self.b_h = nn.Parameter(torch.randn(hiddenDimSize))
        # Output projection: applied to the hidden state, hence (hiddenDimSize, numClass)
        self.W_output = nn.Parameter(torch.randn(hiddenDimSize, numClass))
        self.b_output = nn.Parameter(torch.randn(numClass))
    def forward(self, emb, mask):
        # Step through time, carrying the hidden state forward; at padded time
        # steps the mask keeps the previous hidden state unchanged.
        h = self.init_hidden(emb.size(1))
        outputs = []
        for t in range(emb.size(0)):
            x_t = emb[t]
            z = torch.sigmoid(x_t @ self.W_z + h @ self.U_z + self.b_z)
            r = torch.sigmoid(x_t @ self.W_r + h @ self.U_r + self.b_r)
            h_tilde = torch.tanh(x_t @ self.W_h + (r * h) @ self.U_h + self.b_h)
            h_new = z * h + (1. - z) * h_tilde
            h = mask[t][:, None] * h_new + (1. - mask[t])[:, None] * h
            outputs.append(h)
        return torch.stack(outputs)  # (maxlen, batchSize, hiddenDimSize)
    def init_hidden(self, batchSize):
        # Create the initial hidden state on the same device as the parameters
        return torch.zeros(batchSize, self.hiddenDimSize, device=self.W_h.device)
# GRU Layer
# This class uses the `EHR_GRU` cell class and iterates over the desired number of layers.
class build_EHR_GRU(EHR_GRU):
    def __init__(self, GRUCell, *args):
        super().__init__(*args)
        self.cell = GRUCell(*args)
        self.emb = Custom_Embedding(self.inputDimSize, self.embSize)
    def forward(self, x, mask):
        # Project the multi-hot inputs into the embedding space
        inputVector = self.emb(x)
        for i in range(self.numLayers):
            memories = self.cell(inputVector, mask)
            if self.training:
                # Apply dropout between layers during training only
                drop_out = dropout_mask(inputVector, (inputVector.size(0), 1, inputVector.size(2)), 0.5)
                inputVector = memories * drop_out
            else:
                inputVector = memories
        y_linear = inputVector @ self.W_output + self.b_output
        # Softmax over the class dimension (the last one)
        output = torch.nn.functional.softmax(y_linear, dim=-1)
        output = output * mask[:, :, None]
        return output, inputVector
def recallTop(y_true, y_pred, rank=[10, 20, 30]):
    outer = []
    for x in range(len(y_pred)):
        pred_value = y_pred[x]
        true_value = y_true[x]
        # Round the probabilities to hard 0/1 predictions
        pred_value = torch.round(pred_value)
        TP = torch.sum(torch.logical_and(true_value == 1, pred_value))  # True positives (all predictions)
        inner = []
        for i in rank:
            TP_k = torch.sum(torch.logical_and(pred_value[:, :i] == 1, true_value[:, :i]))  # True positives @top 10, 20, 30
            inner.append(TP_k)
        avg = torch.stack(inner).float() / TP
        avg[torch.isnan(avg)] = 0
        outer.append(avg.tolist())
    return (np.array(outer)).mean(axis=0)
def accuracyTop(y_true, y_pred, rank=[10, 20, 30]):
    outer = []
    for x in range(len(y_pred)):
        true_value = y_true[x]
        # Round the probabilities to hard 0/1 predictions (as in recallTop)
        pred_value = torch.round(y_pred[x])
        total = torch.numel(pred_value)  # Total number of predictions
        inner = []
        for i in rank:
            correct_k = torch.sum(true_value[:, :i] == pred_value[:, :i])  # Correct predictions @top 10, 20, 30
            inner.append(correct_k)
        avg = torch.stack(inner).float() / total  # Fraction of correct predictions
        avg[torch.isnan(avg)] = 0
        outer.append(avg.tolist())
    return (np.array(outer)).mean(axis=0)
# Loss Function
# A combination of element-wise binary cross-entropy terms,
# with the prediction loss for each mini-batch normalized to the sequence length.
# Finally, L2-norm regularization is applied to the output weight matrix.
class cost_function:
    def __init__(self, yhat, y, lengths, W_out, L_2=0.001, logEps=1e-8):
        self.yhat = yhat
        self.y = y
        self.lengths = lengths
        self.logEps = logEps
        self.L_2 = L_2
        # Regularize the model's output weights (not a freshly created random matrix)
        self.W_out = W_out
    def cross_entropy(self):
        # Element-wise binary cross-entropy against the multi-hot labels
        return -(self.y * torch.log(self.yhat + self.logEps) + (1. - self.y) * torch.log(1. - self.yhat + self.logEps))
    def prediction_loss(self):
        # Sum over time and classes (hence two sums), then normalize per sequence length
        return (torch.sum(torch.sum(self.cross_entropy(), dim=0), dim=1)).float() / self.lengths.float()
    def cost(self):
        # Average the prediction loss over the batch and add L2 regularization
        return torch.mean(self.prediction_loss()) + self.L_2 * (self.W_out ** 2).sum()
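# Usage sketch (shapes as produced by padding()): yhat and y are
# (maxlen, batchSize, numClass), lengths holds each patient's number of
# prediction steps, and W_out is the model's output weight matrix:
#   loss = cost_function(output, y, lengths, model.W_output).cost()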
# Model Parameters
numClass = 3087
inputDimSize = 3087
embSize = 200
hiddenDimSize = 200
batchSize = 100
numLayers = 2

# Load Data
# The model adjusts the time steps for prediction internally, so the sequence file doubles as the label file
train, test, valid = load_data('Data.seqs', 'Data.seqs')
train_ds = Dataset(train[0], train[1])
train_samp = Sampler(train_ds, batchSize, shuffle=True)
train_dl = DataLoader(train_ds, sampler=train_samp, collate_fn=collate)
valid_ds = Dataset(valid[0], valid[1])
valid_samp = Sampler(valid_ds, batchSize, shuffle=False)
valid_dl = DataLoader(valid_ds, sampler=valid_samp, collate_fn=collate)

# Instantiate model
model = build_EHR_GRU(EHR_GRU, inputDimSize, hiddenDimSize, embSize, numClass, numLayers)
# Training and validation loop
optimizer = torch.optim.Adadelta(model.parameters(), lr=0.01, rho=0.95)
epochs = 20
counter = 0
for e in range(epochs):
    train_loss = []
    train_recall = []
    train_acc = []
    # Loop over all x and y batches in the training data
    for x, y in train_dl:
        # Pad x and y so every vector processed by the RNN has equal length
        x, y, mask, lengths = padding(x, y, inputDimSize, numClass)
        # Run the input data through the GRU RNN
        output, h = model(x, mask)
        train_recall.append(recallTop(y, output))
        train_acc.append(accuracyTop(y, output))
        # Loss: length-normalized cross-entropy plus L2 regularization (see cost_function)
        loss = cost_function(output, y, lengths, model.W_output).cost()
        # Compute the gradients for all model parameters
        loss.backward()
        train_loss.append(loss.item())
        # Clip the gradient norm (prevents gradients from exploding)
        nn.utils.clip_grad_norm_(model.parameters(), 5)
        # Take a step in the direction chosen by the optimizer
        optimizer.step()
        # Reset the gradients so they don't accumulate over time
        optimizer.zero_grad()
    # Validation, with gradient tracking turned off
    with torch.no_grad():
        # Turn off the parts of the model that behave differently during
        # training than during inference (here: the dropout between layers)
        model.eval()
        # Lists for the validation metrics
        val_loss = []
        val_recall = []
        val_acc = []
        # Loop over each batch in the validation set
        for x_valid, y_valid in valid_dl:
            # Pad the validation batches so the vectors have equal length
            x_val, y_val, mask, lengths = padding(x_valid, y_valid, inputDimSize, numClass)
            # Feed the validation data through the model
            outputs_val, hidden_val = model(x_val, mask)
            val_recall.append(recallTop(y_val, outputs_val))
            val_acc.append(accuracyTop(y_val, outputs_val))
            # Obtain the loss value from cost_function
            # (length-normalized binary cross-entropy + L2 on the output weights)
            loss = cost_function(outputs_val, y_val, lengths, model.W_output).cost()
            # Append the loss to the validation loss list; loss.item() extracts it as a float
            val_loss.append(loss.item())
        model.train()
    avg_train_recall = (np.array(train_recall)).mean(axis=0)
    avg_train_acc = (np.array(train_acc)).mean(axis=0)
    avg_val_recall = (np.array(val_recall)).mean(axis=0)
    avg_val_acc = (np.array(val_acc)).mean(axis=0)
    print("Epoch: {}/{}...".format(e + 1, epochs),
          "Step: {}...".format(counter),
          "Training Loss: {:.4f}...".format(np.mean(train_loss)),
          "Train Recall@10, Recall@20, Recall@30:", avg_train_recall,
          "Train Accuracy@10, Accuracy@20, Accuracy@30:", avg_train_acc,
          "Val Loss: {:.4f}".format(np.mean(val_loss)),
          "Val Recall@10, Recall@20, Recall@30:", avg_val_recall,
          "Val Accuracy@10, Accuracy@20, Accuracy@30:", avg_val_acc)
torch.save(model, 'DoctorAIModel')
```
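For reference, since I mentioned running without `model.to(device)`: a minimal sketch of the few lines the GPU variant would add (assuming the `device` object defined at the top of the script; everything else stays the same):

```
# Move the model's parameters to the GPU once, right after instantiation
model = model.to(device)

# Inside both the training and the validation loop, move each padded batch as well
x, y, mask, lengths = padding(x, y, inputDimSize, numClass)
x, y, mask, lengths = x.to(device), y.to(device), mask.to(device), lengths.to(device)
```

Since `init_hidden` creates its zeros on the parameters' device and `dropout_mask` inherits the device from its input, this should be the only change needed.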