My model gives the exact same result for any input after training

EDIT: here’s the code and the data in Google Drive.
The code (hope it’s readable):
The data is 2000 28x28 px images of distorted 1s and 0s I made in Blender.

import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
import torchvision
import random


device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"device: {device}")

class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.L1 = nn.Linear(28*28, 1000)
        self.L2 = nn.Linear(1000, 1500)
        self.L3 = nn.Linear(1500, 1000)
        self.L4 = nn.Linear(1000, 500)
        self.L5 = nn.Linear(500, 250)
        self.L6 = nn.Linear(250, 100)
        self.L7 = nn.Linear(100, 50)
        self.L8 = nn.Linear(50, 1)

    def forward(self, x):
        x = torch.sigmoid(self.L1(x))
        x = torch.sigmoid(self.L2(x))
        x = torch.sigmoid(self.L3(x))
        x = torch.sigmoid(self.L4(x))
        x = torch.sigmoid(self.L5(x))
        x = torch.sigmoid(self.L6(x))
        x = torch.sigmoid(self.L7(x))
        x = self.L8(x)
        return x


class OnesAndZerosDataset(Dataset):
    def __init__(self, path):  # gets path to the data directory
        self.rootDir = path

    # gets an image index, returns the corresponding image as a flattened grayscale tensor (size(784))
    def __getitem__(self, idx):
        # zero-pad the index to 4 digits (5 -> "0005") to match the file names
        name = str(idx).zfill(4)

        img = Image.open(os.path.join(self.rootDir, f"{name}.png"))
        img = torchvision.transforms.functional.to_grayscale(img, 1)  # convert the image to grayscale
        imageTensor = torchvision.transforms.ToTensor()(img)  # convert the image to a tensor
        imageTensor = imageTensor.squeeze()  # size(1, 28, 28) --> size(28, 28)
        imageTensor = torch.flatten(imageTensor)  # size(28, 28) --> size(784)

        # get the image label from its index: images 0-1000 are 0s, the rest are 1s
        label = torch.tensor([0.0]) if int(idx) <= 1000 else torch.tensor([1.0])
        return imageTensor, label

    def __len__(self):
        path, dirs, files = next(os.walk(self.rootDir))
        return len(files)


if __name__ == "__main__":
    net = Network().to(device)
    optimizer = optim.SGD(net.parameters(), lr=0.01)
    criterion = nn.MSELoss()
    epochs = 5

    net.train()  # puts the network in training mode (matters for layers like dropout and batchnorm)
    data_set = OnesAndZerosDataset("./data")
    data_loader = DataLoader(data_set, batch_size=5, shuffle=True)

    for epoch in tqdm(range(epochs)):
        for images, targets in data_loader:
            # move the batch to the same device as the model
            images, targets = images.to(device), targets.to(device)
            optimizer.zero_grad()
            out = net(images)
            loss = criterion(out, targets)
            loss.backward()
            optimizer.step()

    # torch.save(net.state_dict(), 'modelDict.pth')

    #  predict on 10 random samples
    net.eval()  # switch to evaluation mode for inference
    with torch.no_grad():
        for i in range(10):
            tensor, target = data_set[random.randrange(0, len(data_set))]
            out = net(tensor.to(device))
            print(f"target: {target}\nout: {out}")

As I said, I train the model, give it some random samples, and it just gives the exact same prediction, no matter what the input is.

It seems you might be saturating the activations, as the outputs of the last layers are quite constant:

import torch
import torch.nn as nn

class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.L1 = nn.Linear(28*28, 1000)
        self.L2 = nn.Linear(1000, 1500)
        self.L3 = nn.Linear(1500, 1000)
        self.L4 = nn.Linear(1000, 500)
        self.L5 = nn.Linear(500, 250)
        self.L6 = nn.Linear(250, 100)
        self.L7 = nn.Linear(100, 50)
        self.L8 = nn.Linear(50, 1)

    def forward(self, x):
        x = torch.sigmoid(self.L1(x))
        print(x.min(), x.max(), x.mean())  # activation statistics after each layer
        x = torch.sigmoid(self.L2(x))
        print(x.min(), x.max(), x.mean())
        x = torch.sigmoid(self.L3(x))
        print(x.min(), x.max(), x.mean())
        x = torch.sigmoid(self.L4(x))
        print(x.min(), x.max(), x.mean())
        x = torch.sigmoid(self.L5(x))
        print(x.min(), x.max(), x.mean())
        x = torch.sigmoid(self.L6(x))
        print(x.min(), x.max(), x.mean())
        x = torch.sigmoid(self.L7(x))
        print(x.min(), x.max(), x.mean())
        x = self.L8(x)
        return x
    

model = Network()

for _ in range(10):
    x = torch.randn(1, 28*28)  # feed random inputs to inspect the activation stats
    out = model(x)
    print(out)

You could try another non-linearity, such as torch.relu, try other weight init functions, or change the overall model architecture (e.g. reducing the number of parameters or layers).
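
For example (just a sketch, and the layer sizes here are arbitrary), a smaller all-ReLU network with Kaiming initialization could look like this:

import torch
import torch.nn as nn

class SmallNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.L1 = nn.Linear(28 * 28, 128)
        self.L2 = nn.Linear(128, 64)
        self.L3 = nn.Linear(64, 1)
        # Kaiming init is a common choice for ReLU networks
        for layer in (self.L1, self.L2):
            nn.init.kaiming_normal_(layer.weight, nonlinearity="relu")
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        x = torch.relu(self.L1(x))
        x = torch.relu(self.L2(x))
        return self.L3(x)  # raw logit; apply a sigmoid outside the model if needed

Unlike stacked sigmoids, ReLU activations don’t squash their inputs into a narrow range at every layer, so the gradients are less likely to vanish.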

I changed my network to this, and it still gives the same output for every prediction (on random tensors):

class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.L1 = nn.Linear(28*28, 500)
        self.L2 = nn.Linear(500, 300)
        self.L3 = nn.Linear(300, 100)
        self.L4 = nn.Linear(100, 10)
        self.L5 = nn.Linear(10, 1)

    def forward(self, x):
        x = torch.relu(self.L1(x))
        x = torch.sigmoid(self.L2(x))
        x = torch.softmax(self.L3(x), dim=0)
        x = torch.relu(self.L4(x))
        x = self.L5(x)
        x = torch.softmax(x, dim=0)
        return x
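
One thing to note in this version: the final torch.softmax(x, dim=0) over a single output unit always returns 1.0, regardless of the input, which by itself would explain the constant prediction (and with a batched (N, 1) input, dim=0 normalizes across the batch rather than across classes). A quick check:

import torch

x = torch.tensor([3.7])          # a single raw logit
print(torch.softmax(x, dim=0))   # tensor([1.]) -- for any value of x

Dropping that last softmax (and, for binary classification, using a sigmoid with nn.BCELoss, or raw logits with nn.BCEWithLogitsLoss) would at least make the output depend on the input again.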