I am trying to create a linear regression model that predicts injury time in a football match; however, my results are quite bad:
Error on validation data
MSE: 1.1520577669143677
MAE: 0.7984767556190491
MAPE: 35.94094467163086 %
Error on test data
MSE: 1.2277499437332153
MAE: 0.8027499914169312
MAPE: 41.30732345581055 %.
Can somebody help me with my code to improve my results?
import matplotlib.pyplot as plt
import pandas as pd
import torch
from torch.utils.data import DataLoader
from math import sqrt
# Fix the random seed so that runs are repeatable.
random_seed = 12345  # This seed is also used in the pandas sample() method below
torch.manual_seed(random_seed)

# Load the dataset; the first CSV column is used as the row index.
df = pd.read_csv('data/injuryTimeDataset.csv', index_col=0)
print(df)

# Split data into a train, validation and test set.
# Test set: a fixed positional slice of the rows.
test_set = df.iloc[8000:12000]
# Everything outside the test set (drop() already returns a new frame,
# so no explicit copy is needed).
train_val_set = df.drop(test_set.index)
# Randomly sample the validation data without replacement (10%).
val_set = train_val_set.sample(frac=0.1, replace=False, random_state=random_seed)
# The remaining data is used for training (90%).
train_set = train_val_set.drop(val_set.index)

# Check that the numbers add up (uncomment the print to inspect the sizes).
n_points = len(train_set) + len(val_set) + len(test_set)
# print(f'{len(df)} = {len(train_set)} + {len(val_set)} + {len(test_set)} = {n_points}')

# Plot the target value of each set against the row index.
plt.figure(figsize=(16, 9))
plt.scatter(train_set.index, train_set['declared_inj_time'], color='black', label='Train')
plt.scatter(val_set.index, val_set['declared_inj_time'], color='green', label='Val')
plt.scatter(test_set.index, test_set['declared_inj_time'], color='red', label='Test')
plt.legend()

# Inputs and outputs
INPUT_COLS = ['goals', 'corners', 'free_kicks', 'substitutions']
OUTPUT_COL = ['declared_inj_time']
#Linear regression model
class LinearRegression(torch.nn.Module):
    """Linear regression model consisting of a single ``torch.nn.Linear`` layer.

    Args:
        in_features: Number of input features. When omitted, falls back to
            ``len(INPUT_COLS)`` from the enclosing module.
        out_features: Number of predicted values. When omitted, falls back to
            ``len(OUTPUT_COL)`` from the enclosing module.
    """

    def __init__(self, in_features=None, out_features=None):
        super().__init__()
        # The module-level column lists are only consulted when no explicit
        # size is given, so LinearRegression() builds the same layer as before.
        if in_features is None:
            in_features = len(INPUT_COLS)
        if out_features is None:
            out_features = len(OUTPUT_COL)
        # Only records which device is available; neither the module nor its
        # inputs are moved here, so callers must do that themselves if wanted.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.linear = torch.nn.Linear(in_features=in_features, out_features=out_features)

    def forward(self, x):
        """Return the linear layer's prediction for the input batch ``x``."""
        y_pred = self.linear(x)
        return y_pred
#Training loop
def train(
    model: torch.nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    n_epochs: int,
    lr: float,
) -> torch.nn.Module:
    """Fit ``model`` with Adam on an MSE loss, printing the validation MSE per epoch.

    Args:
        model: Module to optimise; its parameters are updated in place.
        train_loader: Yields ``(inputs, labels)`` batches used for the updates.
        val_loader: Yields ``(inputs, labels)`` batches used only for reporting.
        n_epochs: Number of full passes over ``train_loader``.
        lr: Learning rate passed to ``torch.optim.Adam``.

    Returns:
        The same ``model`` object that was passed in.
    """
    # Loss and optimizer
    criterion = torch.nn.MSELoss(reduction='mean')
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Train weights
    for epoch in range(n_epochs):
        for inputs, labels in train_loader:
            # Zero the parameter gradients
            optimizer.zero_grad()
            # Forward propagation
            pred_y = model(inputs)
            # Compute loss
            loss = criterion(pred_y, labels)
            # Backward propagation to compute gradient
            loss.backward()
            # Update parameters
            optimizer.step()

        # Evaluate model on validation data. Gradients are not needed here, so
        # skip building the autograd graph, and run in eval mode while
        # restoring whatever mode the caller had selected afterwards.
        was_training = model.training
        model.eval()
        squared_error_sum = 0.0
        with torch.no_grad():
            for inputs, labels in val_loader:
                squared_error_sum += torch.sum(torch.pow(labels - model(inputs), 2)).item()
        model.train(was_training)

        # Sum of squared errors divided by the number of validation samples;
        # an empty validation set is reported as NaN instead of raising
        # ZeroDivisionError.
        n_val = len(val_loader.dataset)
        mse_val = squared_error_sum / n_val if n_val else float('nan')
        print(f'Epoch: {epoch + 1}: Val MSE: {mse_val}')
    return model
# Turn the train and validation frames into float32 tensors
# (features first, then targets)
x_train, y_train = (
    torch.from_numpy(train_set[cols].values).float()
    for cols in (INPUT_COLS, OUTPUT_COL)
)
x_val, y_val = (
    torch.from_numpy(val_set[cols].values).float()
    for cols in (INPUT_COLS, OUTPUT_COL)
)

# Wrap the tensors in datasets, then in loaders: shuffled mini-batches of 10
# for training, one single unshuffled batch for validation
train_dataset = torch.utils.data.TensorDataset(x_train, y_train)
val_dataset = torch.utils.data.TensorDataset(x_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=len(val_set), shuffle=False)

# Initialize the model
model = LinearRegression()

# Train the model with the chosen hyperparameters
n_epochs, lr = 100, 0.0001
model = train(model, train_loader, val_loader, n_epochs, lr)
# Evaluate model


def _regression_errors(pred, target):
    """Return ``(MSE, MAE, MAPE in percent)`` of ``pred`` against ``target``.

    All three values are returned as zero-dimensional tensors.
    """
    diff = pred - target
    mse = torch.mean(torch.pow(diff, 2))
    mae = torch.mean(torch.abs(diff))
    # NOTE: MAPE divides by the target, so any target equal to 0 makes the
    # result inf or nan.
    mape = 100 * torch.mean(torch.abs(torch.div(diff, target)))
    return mse, mae, mape


# Predict on validation data; no gradients are needed for evaluation
with torch.no_grad():
    pred_val = model(x_val)
# Compute MSE, MAE and MAPE on validation data
print('Error on validation data')
mse_val, mae_val, mape_val = _regression_errors(pred_val, y_val)
print(f'MSE: {mse_val.item()}')
print(f'MAE: {mae_val.item()}')
print(f'MAPE: {mape_val.item()} %')

# Evaluate model on test data
# Get input and output as torch tensors
x_test = torch.from_numpy(test_set[INPUT_COLS].values).to(torch.float)
y_test = torch.from_numpy(test_set[OUTPUT_COL].values).to(torch.float)
# Make prediction
with torch.no_grad():
    pred_test = model(x_test)
# Compute MSE, MAE and MAPE on test data
print('Error on test data')
mse_test, mae_test, mape_test = _regression_errors(pred_test, y_test)
print(f'MSE: {mse_test.item()}')
print(f'MAE: {mae_test.item()}')
print(f'MAPE: {mape_test.item()} %')