Evaluation loss low but when using with live data high

Hello,
I want to train a CNN to play Subway Surfers by predicting the key to press.
The training works and the eval loss seems to be low but when Im using to actually play it doesnt realy work.

This is the training script:

import torch
from torchvision import transforms
from PIL import Image
from os import listdir
import os
import random
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
import matplotlib.pyplot as plt
import time
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from PIL import Image
import os


device = ("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on {device}")


kernel_size = 4
stride = 4

batchs = 100
batchsize = 64

model_path = "models/model_v11.pt"

epochs = int(input("Epochs: ")) 
learingRate = input("Learning rate: ")

if learingRate == "":
    learingRate = 0.001
else:
    learingRate = float(learingRate)

data_transforms = {
    'train': transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(30), 
        transforms.ToTensor(),.
        transforms.RandomAffine(30),
        transforms.Resize((32,32)),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'eval': transforms.Compose([
        transforms.Resize((32,32)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
}

train_data_list = []


def getData():
    for batch in range(batchs):
        target_list = []
        train_data = []
        for count in range(batchsize):
             
            key = random.choice(listdir("./data/train/"))

            list = []
            for e in range(5):
                if str(e) in key:
                    list.append(1)
                else:
                    list.append(0)
            img_array = random.choice(listdir(f"./data/train/{key}"))
            img = Image.open(f"./data/train/{key}/{img_array}")
            img = data_transforms["train"](img)
            train_data.append(torch.Tensor(img).to(device))
            target_list.append(list)


            print("Loading Image {}/{} \tBatch: {}/{} \tPercentage Done: {}%".format(count, batchsize, batch, batchs, round(100*len(train_data_list )/batchs, 2)), end="\r", flush=True)

        train_data_list.append((torch.stack(train_data), (torch.Tensor(target_list).to(device))))

    print(f"\nSuccesfully loaded data")

getData()   



class Netz(nn.Module):
    def __init__(self):
        super(Netz, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=5, padding=2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5, padding=2)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=5, padding=2)
        self.fc1 = nn.Linear(2048, 1024)  
        self.dropout1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(1024, 512) 
        self.dropout2 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(512, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = F.relu(self.conv3(x))
        x = F.max_pool2d(x, 2)

        x = x.view(-1, 2048)
        x = F.relu(self.fc1(x))
        x = self.dropout1(x) 
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x
    
if os.path.isfile(model_path):
    print("Loading weights from file...")
    model = Netz()
    model.load_state_dict(torch.load(model_path))
    model.to(device)
else:
    model = Netz().to(device)


criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=learingRate, momentum=0.9)


def load_random_images():
    eval_data = []
    eval_labels = []
    base_path = "./data/eval/"
    folders = listdir(base_path)
    selected_images = []


    while len(selected_images) < 25:
        folder = random.choice(folders)
        img_files = listdir(f"{base_path}/{folder}")
        if img_files:
            img_file = random.choice(img_files)
            selected_images.append((folder, f"{base_path}/{folder}/{img_file}"))

    for folder, img_path in selected_images:
        label = []
        for e in range(5):
            if str(e) in folder:
                label.append(1)
            else:
                label.append(0)
        img = Image.open(img_path)
        img = data_transforms["eval"](img)
        eval_data.append(img)
        eval_labels.append(label)

    return torch.stack(eval_data).to(device), torch.tensor(eval_labels, dtype=torch.float).to(device)

def evaluate():
    model.eval()
    eval_data, eval_labels = load_random_images()
    total_loss = 0.0
    with torch.no_grad():
        outputs = model(eval_data)
        loss = criterion(outputs, eval_labels)
        total_loss = loss.item()

    average_loss = total_loss / len(eval_labels)
    print(f"Evaluation Loss: {average_loss}")
    model.train() 
    return average_loss


loss_plot = []
eval_plot = []

def train():
    global train_data_list
    model.train()
    for epoch in range(epochs):

        total_loss = 0.0
        for data, target in train_data_list:
            optimizer.zero_grad() 

            out = model(data)

            loss = criterion(out, target) 
            total_loss += loss.item()

            loss.backward()
            optimizer.step()
            
        now = datetime.now()
        formated_time = now.strftime("%H:%M:%S")
        print(f"[{formated_time}] Epoch: " + str(epoch) + "/" + str(epochs), "\tLoss: {:.6f}".format(total_loss / len(train_data_list)))

        if loss.item() < 0.001:
            print("Updating training data")
            train_data_list = []
            getData()
        
        loss_plot.append(total_loss / len(train_data_list))
        eval_plot.append(evaluate())
        torch.save(model.state_dict(), f"models/model_v_11_e{str(epoch)}.pt"


    torch.save(model.state_dict(), model_path)



try:
    train()
    plt.plot(loss_plot, label="Training Loss")
    plt.plot(eval_plot, label="Evaluation Loss")
    plt.title('Training and Evaluation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()
except :
    torch.save(model.state_dict(), model_path)
    plt.plot(loss_plot, label="Training Loss")
    plt.plot(eval_plot, label="Evaluation Loss")
    plt.title('Training and Evaluation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

And this is the script to actually play the game:

import time
import pyautogui
from torchvision import transforms
from PIL import Image
import dxcam
import torch
import torch.nn.functional as F
import torch.nn as nn

device = ("cuda" if torch.cuda.is_available() else "cpu")
model_path = "models/model_v_11_e999" 
kernel_size = 4
stride = 4

print(f"Evaluating on {device}")

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((32,32)),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

])

class Netz(nn.Module):
    def __init__(self):
        super(Netz, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=5, padding=2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5, padding=2)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=5, padding=2)
        self.fc1 = nn.Linear(2048, 1024) 
        self.dropout1 = nn.Dropout(0.3) 
        self.fc2 = nn.Linear(1024, 512) 
        self.dropout2 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(512, 5) 


    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = F.relu(self.conv3(x))
        x = F.max_pool2d(x, 2)

        x = x.view(-1, 2048)
        x = F.relu(self.fc1(x))
        x = self.dropout1(x) 
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)

        return x
    
model = Netz()
model.load_state_dict(torch.load(model_path, map_location=device))

model.to(device)
model.eval()

camera = dxcam.create()

left, top = 475, 300 
right, bottom = left + 1150, top + 645
region = (left, top, right, bottom)

camera.start(target_fps=30, region=region) 

keys = ["none", "down", "up", "left", "right"]
kernel_size = 4
stride = 4

while True:
    image = camera.get_latest_frame()
    image = Image.fromarray(image)
    image = transform(image).to(device)

    out = model(torch.stack([image]))

    _, predicted = torch.max(out, 1)
    action = keys[predicted.item()]

    if action != "none":
        pyautogui.press(action)
        time.sleep(0.15)

    print(action)

Can anyone help me?
Figure_3