How to train parameters for concated neural network


I am trying the learn a backward to the digits dataset from sklearn and I am having troubles to tune my parameters using backward. For every iteration I get the same accuracy, so it seems like my parameters are not changing.

I am wondering if this might be coming from concatenating my neural networks.

So my training data:

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(,,

I am having a neural network which consists of two hidden layers with and one output layer.
I tried to define the linear layers by myself like so:

class MyLinearLayer(nn.Module):
    def __init__(self, input_size, layer_size):
        super(MyLinearLayer, self).__init__()
        self.weight = torch.nn.Parameter(torch.ones(layer_size, input_size, dtype=torch.float64, requires_grad=True))
        nn.init.kaiming_normal_(self.weight, mode='fan_in', nonlinearity='relu')
        self.bias = torch.nn.Parameter(torch.zeros(layer_size, dtype=torch.float64, requires_grad=True))
    def forward(self, input: Tensor) -> Tensor:
        return F.linear(input, self.weight, self.bias)

and then the final neural network looks like:

class MyFeedforwardNN(nn.Module):
    def __init__(self, input_size):
        super(MyFeedforwardNN, self).__init__()
        self.first_linear_layer = MyLinearLayer(input_size, 64)
        self.second_linear_layer = MyLinearLayer(input_size, 32)
        self.weight = torch.nn.Parameter(torch.ones(32, 32, dtype=torch.float64, requires_grad=True))
        self.bias = torch.nn.Parameter(torch.zeros(32, dtype=torch.float64, requires_grad=True))
    def forward(self, input: Tensor) -> Tensor:
        self.first_layer_out = F.relu(self.first_linear_layer(input))
        self.second_layer_out = F.relu(self.second_linear_layer(self.first_layer_out))
        return F.softmax(torch.sum(torch.matmul(self.second_layer_out, self.weight) + self.bias))

Now I tried using a dataset likewise:
class Dataset:
    def __init__(self, xs, ys):
        self.xs = xs
        self.ys = ys
    def __getitem__(self, i):
        return self.xs[i].astype(float), self.ys[i].astype(float)
    def __len__(self):
        return len(self.xs)

and the final training of the model is:

dataset = Dataset(X_train, y_train)
net = MyFeedforwardNN(64)
optimizer = torch.optim.SGD(net.parameters(), lr=0.001)
dataloader = DataLoader(dataset, shuffle=True)
criterion = nn.MSELoss()
size = len(dataloader.dataset)

for epoch in range(0, 99):
    correct = 0
    for batch, (x, y) in enumerate(dataloader):

        pred = net.forward(x)
        loss = criterion(pred, y)

        if(pred == y):
            correct += 1
        #correct += (pred == y).float().sum()
        # print(pred == y)
        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(x)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
    print("Accuracy = {}".format(correct / size))

It prints always Accuracy = 0.10096510764662213 and when I set shuffle to false it is the same loss every iteration. Since I am rather new this topic I am really unsure about how to set the best output layer and loss function here as well… so any feeedback on this would be appreciated as well.

So any ideas what is going wrong?

Thanks in advance!