Hi there,
I’m very much at the beginning of my PyTorch journey, and thought I’d reach out for advice and suggested improvements.
I’m playing with a model that predicts football (soccer) matches. Raw data is in this CSV format:
season,date,home_team,away_team,home_goals,away_goals,result
2019,2019-08-09,Liverpool,Norwich,4,1,H
For the ‘result’ column:
H: Home team won
A: Away team won
D: The match was a draw
I can feed my model a home and an away team (each team name is mapped to a unique integer index), and have it predict the result of the match as an integer (H: 2 / A: 1 / D: 0).
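Concretely, the encoding ends up looking something like this (the team indices below are made up for illustration; the real ones come from sorting the unique team names):

teams = {"Arsenal": 0, "Liverpool": 25, "Norwich": 31}  # illustrative indices only
features = [teams["Liverpool"], teams["Norwich"]]       # [home_index, away_index]
target = 2                                              # H = 2, A = 1, D = 0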
But after training for a while it’s not that effective: I can see the loss going down to about 0.49, but I can’t seem to reduce it any further than that.
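As a rough sanity check (assuming I’m interpreting cross-entropy correctly), exp(-loss) is roughly the average probability the model assigns to the correct class:

import math
print(math.exp(-0.49))  # ≈ 0.61, average probability on the correct class
print(math.log(3))      # ≈ 1.10, the loss for uniform guessing over three classes

So it seems to do better than random guessing, but then plateaus.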
Is this just the nature of sports data, or am I introducing any bad practices in my code? Any tips and guidance on this kind of project would be greatly appreciated.
import matplotlib.pyplot as plt
import numpy as np
import pandas
import torch
from torch import nn
from sklearn.model_selection import train_test_split
def get_data():
    csv = pandas.read_csv('./data.csv')
    data = csv.drop(columns=['season', 'home_goals', 'away_goals'])
    return data
def get_teams():
    # Combine home and away team names, take the unique names and sort them
    teams_unique = pandas.concat(
        [data['home_team'], data['away_team']]).unique()
    teams_sorted = np.sort(teams_unique)
    # Map each team name to a unique integer index
    teams = dict(zip(teams_sorted, range(len(teams_sorted))))
    return teams
# Load the data and build the team index dictionary
data = get_data()
teams = get_teams()
def get_team(team_str="Arsenal"):
    # Look up a team's integer index, for use now and later when predicting
    return teams[team_str]
# Features / teams as ints
data_features = []
for r in data.itertuples():
    data_features.append([get_team(r.home_team), get_team(r.away_team)])

for r in data_features[:10]:
    print(list(teams.keys())[r[0]], "vs", list(teams.keys())[r[1]])
# Scores: encode the result as an integer class (H=2, A=1, D=0)
data_scores = []
for r in data[["result"]].itertuples():
    result = r.result
    if result == "H":
        res = 2
    elif result == "A":
        res = 1
    else:
        res = 0
    data_scores.append(res)
# Split the data into training and testing sets
RANDOM_SEED = 42
X = torch.tensor(data_features, dtype=torch.float32)
y = torch.tensor(data_scores, dtype=torch.int64)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED)

print(f"X_train: {X_train.shape}: {X_train.dtype} | y_train: {y_train.shape}: {y_train.dtype}")
print(f"X_test: {X_test.shape}: {X_test.dtype} | y_test: {y_test.shape}: {y_test.dtype}")
# Build the model
class ModelV1(nn.Module):
    def __init__(self, INPUT_FEATURES=2, OUTPUT_FEATURES=2, HIDDEN_UNITS=8):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_features=INPUT_FEATURES, out_features=HIDDEN_UNITS),
            nn.Sigmoid(),
            nn.Linear(in_features=HIDDEN_UNITS, out_features=HIDDEN_UNITS),
            nn.Sigmoid(),
            nn.Linear(in_features=HIDDEN_UNITS, out_features=OUTPUT_FEATURES)
        )

    def forward(self, x):
        return self.layers(x)
INPUT_FEATURES = X_train.shape[1]
HIDDEN_UNITS = len(teams) * 4
OUTPUT_FEATURES = 3  # three result classes: D, A, H
model = ModelV1(INPUT_FEATURES, OUTPUT_FEATURES, HIDDEN_UNITS)
# Loss
loss_fn = nn.CrossEntropyLoss()
# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Accuracy
def accuracy_fn(outputs, targets):
    correct = torch.sum(outputs == targets).item()
    acc = (correct / len(outputs)) * 100
    return acc
# Seed for reproducibility (torch.mps.manual_seed is the MPS equivalent of torch.manual_seed)
torch.manual_seed(RANDOM_SEED)
torch.mps.manual_seed(RANDOM_SEED)
# Set no of epochs
EPOCHS = 1000
print_steps = round(EPOCHS / 100)
losses = []
for epoch in range(EPOCHS):
    model.train()

    # Forward pass
    y_logits = model(X_train)
    outputs = torch.softmax(y_logits, dim=1).argmax(dim=1)

    # Calculate loss / accuracy
    loss = loss_fn(y_logits, y_train)
    acc = accuracy_fn(outputs, y_train)

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    model.eval()
    losses.append(loss.item())

    if epoch % print_steps == 0:
        print(f"Epoch: {epoch+1}/{EPOCHS} | Loss: {loss:.5f} | Acc: {acc:.2f}%")
with torch.inference_mode():
    # Forward pass on the test set
    test_logits = model(X_test)
    outputs = torch.softmax(test_logits, dim=1).argmax(dim=1)

    # Calculate test loss / acc
    test_loss = loss_fn(test_logits, y_test)
    test_acc = accuracy_fn(outputs, y_test)

print(f"Test loss: {test_loss:.5f} | Test accuracy: {test_acc:.2f}%")

# Compare results
print_steps = round(len(outputs) / 10)
correct = 0
for i, o in enumerate(outputs):
    is_correct = y_test[i].item() == o.item()
    icon = "✅" if is_correct else "❌"
    correct += 1 if is_correct else 0
    if i % print_steps == 0:
        print(f"{icon} Actual: {y_test[i].item()} | Predicted: {o.item()}")

print("-" * 30)
print(f"Correct: {correct} / {len(outputs)}")
print(f"Accuracy: {correct/len(outputs)*100:.2f}%")
# Plot the training loss curve
plt.plot(range(EPOCHS), losses, label="Training Loss")
plt.legend(prop={'size': 12})
plt.show()