Torch Parallel Layers Visualization in wandb

Hi everyone,
I created a dynamic actor-critic module deriving from nn.Module. The module is made up of 3 submodules: a shared network made up of a number of convolutional layers and 2 independent parts made up of fc layers that receive the flattened output of the previous module as input. The code that I used is the following:

class Conv3DModelFree(nn.Module):
    """Actor-critic network: a shared Conv3d feature extractor followed by
    separate fully connected actor and critic branches.

    This version keeps every layer in plain Python lists
    (``shared_layers``, ``actor_subnet``, ``critic_subnet``).

    Args:
        in_shape: indexable shape; ``in_shape[0]`` is used as the number of
            input channels and ``in_shape[1:]`` as the trailing input dims.
        num_actions: stored on the instance and forwarded to the base class.
        **kwargs: must contain ``"fc_layers"`` (sequence of layer widths),
            ``"conv_layers"`` (sequence of entries indexed as
            ``[0]`` out channels, ``[1]`` kernel size, ``[2]`` stride) and
            ``"num_frames"``.
    """

    def __init__(self, in_shape, num_actions, **kwargs):
        fc_layers = kwargs["fc_layers"]
        # NOTE(review): nn.Module.__init__ takes no arguments; this call looks
        # like it targets a project base class with a
        # (num_actions, features_out) constructor -- confirm the base class.
        super(Conv3DModelFree, self).__init__(
            num_actions, features_out=fc_layers[-1])
        # super().__init__()
        conv_layers = kwargs["conv_layers"]

        # Plain Python list: modules appended here are NOT registered as
        # submodules, so they are invisible to parameters(), state_dict(),
        # .to(device) and to tools that walk the module tree.
        self.shared_layers = []
        self.num_actions = num_actions
        self.in_shape = in_shape
        self.num_channels = in_shape[0]
        self.num_frames = kwargs["num_frames"]
        # Input width of the next layer to be created.
        next_inp = None
        # =============================================================================
        # FEATURE EXTRACTOR SUBMODULE
        # =============================================================================
        # One Conv3d + LeakyReLU pair per entry; the first conv consumes the
        # input channels, every later one consumes the previous out channels.
        for i, cnn in enumerate(conv_layers):
            if i == 0:
                self.shared_layers.append(nn.Conv3d(
                    self.num_channels, cnn[0], kernel_size=cnn[1], stride=cnn[2]))
                self.shared_layers.append(nn.LeakyReLU())
            else:
                self.shared_layers.append(nn.Conv3d(
                    next_inp, cnn[0], kernel_size=cnn[1], stride=cnn[2]))
                self.shared_layers.append(nn.LeakyReLU())
            next_inp = cnn[0]

        # Push a dummy batch of one sample through the conv stack to find out
        # how many features the flattened conv output has.
        for i, layer in enumerate(self.shared_layers):
            if i == 0:
                fake_inp = torch.zeros(
                    [1, self.num_channels, self.num_frames, *self.in_shape[1:]])
                fake_inp = self.shared_layers[i](fake_inp)
            else:
                fake_inp = self.shared_layers[i](fake_inp)
        next_inp = fake_inp.view(1, -1).size(1)

        # =============================================================================
        # ACTOR AND CRITIC SUBMODULES
        # =============================================================================
        # Same caveat as shared_layers: plain lists do not register modules.
        self.actor_subnet = []
        self.critic_subnet = []
        for i, fc in enumerate(fc_layers):
            if i == 0:
                # The first fully connected layer is still shared.
                self.shared_layers.append(nn.Linear(next_inp, fc))
                self.shared_layers.append(nn.LeakyReLU())
            else:
                # Separate submodules for the actor and the critic
                self.actor_subnet.append(nn.Linear(next_inp, fc))
                self.critic_subnet.append(nn.Linear(next_inp, fc))
                self.actor_subnet.append(nn.LeakyReLU())
                self.critic_subnet.append(nn.LeakyReLU())
            next_inp = fc
# NOTE(review): as pasted, this def is dedented to column 0 and lacks its
# trailing colon; it is meant to be the forward method of the class above.
# Returns the pair (action_logits, value).
def forward(self, input)
        # Convolutional part: everything except the last two shared layers
        # (the shared Linear and its activation).
        for i, layer in enumerate(self.shared_layers[:-2]):
            if i == 0:
                x = self.shared_layers[i](input)
            else:
                x = self.shared_layers[i](x)
        # last 2 shared layers requires a reshape of the input
        x = x.view(x.shape[0], -1)
        x = self.shared_layers[-2](x)
        x = self.shared_layers[-1](x)

        # Actor branch.
        action_logits = None
        for i, layer in enumerate(self.actor_subnet):
            if i == 0:
                action_logits = self.actor_subnet[i](x)
            else:
                action_logits = self.actor_subnet[i](action_logits)
        # NOTE(review): self.actor is not defined in this snippet; presumably
        # it is provided by the base class -- confirm. If actor_subnet is
        # empty, action_logits is still None at this point.
        action_logits = self.actor(action_logits)

        # Critic branch, structured like the actor branch.
        value = None
        for i, layer in enumerate(self.critic_subnet):
            if i == 0:
                value = self.critic_subnet[i](x)
            else:
                value = self.critic_subnet[i](value)
        # NOTE(review): self.critic is likewise not defined here -- confirm.
        value = self.critic(value)

        return action_logits, value

When I use wandb.watch on this class, the graphs do not appear, probably because wandb cannot identify the submodules of the network when they are stored in a plain Python list. I tested a different solution, wrapping the various pieces in nn.Sequential modules as follows:

class Conv3DModelFree(OnPolicy):
    """Actor-critic network built from three ``nn.Sequential`` containers.

    ``shared_network`` holds the Conv3d stack, a flatten step and the first
    fully connected layer; ``actor`` and ``critic`` hold the remaining fully
    connected layers of each branch plus their output layer
    (``num_actions`` units for the actor, one unit for the critic).

    Args:
        in_shape: indexable shape; ``in_shape[0]`` is the number of input
            channels, ``in_shape[1:]`` the trailing input dimensions.
        num_actions: width of the actor output layer.
        **kwargs: must contain ``"fc_layers"`` (layer widths),
            ``"conv_layers"`` (entries indexed as ``[0]`` out channels,
            ``[1]`` kernel size, ``[2]`` stride) and ``"num_frames"``.
    """

    def __init__(self, in_shape, num_actions, **kwargs):
        fc_layers = kwargs["fc_layers"]
        super(Conv3DModelFree, self).__init__(
            num_actions, features_out=fc_layers[-1])
        conv_layers = kwargs["conv_layers"]

        self.num_actions = num_actions
        self.in_shape = in_shape
        self.num_channels = in_shape[0]
        self.num_frames = kwargs["num_frames"]

        # ---- shared trunk: Conv3d/LeakyReLU pairs, then flatten ----
        trunk = OrderedDict()
        width = None
        for idx, spec in enumerate(conv_layers):
            # The first convolution reads the raw input channels; every later
            # one reads the previous convolution's output channels.
            in_channels = self.num_channels if idx == 0 else width
            trunk["conv_{}".format(idx)] = nn.Conv3d(
                in_channels, spec[0], kernel_size=spec[1], stride=spec[2])
            trunk["activ_{}".format(idx)] = nn.LeakyReLU()
            width = spec[0]

        # nn.Flatten defaults to flattening from dim=1 onwards.
        trunk["flatten"] = nn.Flatten()

        # Run a single all-zero sample through the trunk to learn how many
        # features come out of the flatten step.
        probe = torch.zeros(
            [1, self.num_channels, self.num_frames, *self.in_shape[1:]])
        for module in trunk.values():
            probe = module(probe)
        width = probe.view(1, -1).size(1)

        # ---- fully connected part: first layer shared, rest per branch ----
        actor_branch = OrderedDict()
        critic_branch = OrderedDict()
        for idx, units in enumerate(fc_layers):
            if idx == 0:
                trunk["fc_0"] = nn.Linear(width, units)
                trunk["fc_activ"] = nn.LeakyReLU()
            else:
                actor_branch["actor_fc_{}".format(idx)] = nn.Linear(width, units)
                critic_branch["critic_fc_{}".format(idx)] = nn.Linear(width, units)
                actor_branch["actor_activ_{}".format(idx)] = nn.LeakyReLU()
                critic_branch["critic_activ_{}".format(idx)] = nn.LeakyReLU()
            width = units

        # Output layers of the two branches.
        actor_branch["actor_out"] = nn.Linear(width, self.num_actions)
        critic_branch["critic_out"] = nn.Linear(width, 1)

        self.shared_network = nn.Sequential(trunk)
        self.actor = nn.Sequential(actor_branch)
        self.critic = nn.Sequential(critic_branch)

    def forward(self, input):
        """Return ``(action_logits, value)`` computed from the shared features."""
        features = self.shared_network(input)
        return self.actor(features), self.critic(features)

Using this different implementation, I can visualize the graphs but I get completely different results and extremely high loss values. Does anyone know a correct way to define this architecture so that it is viewable on wandb?

First code

The graph cannot be visualized because the layers are not in an nn container.
You should put them into a container, as shown below:

# nn.ModuleList takes ONE iterable of modules, so each list is passed as-is;
# unpacking it with * would hand every layer over as a separate positional
# argument and fail as soon as a list holds more than one layer.
self.nn_shared_layers = nn.ModuleList(self.shared_layers)
self.nn_actor_subnet = nn.ModuleList(self.actor_subnet)
self.nn_critic_subnet = nn.ModuleList(self.critic_subnet)

I recommend using nn.Sequential instead of nn.ModuleList.
Apart from the shared_layers, your forward function applies the layers strictly in sequence.
In short, you can rewrite your forward function like this:

# In __init__: wrap the list of actor layers in nn.Sequential (which takes the
# modules as separate positional arguments, hence the * unpacking).
self.nn_actor_subnet = nn.Sequential(*self.actor_subnet)
# In forward: one call applies every wrapped layer in order.
action_logits = self.nn_actor_subnet(x)

Second Code
I guess your code is correctly modified to visualize the graph, but the reason your results seem weird comes from not using activation functions.
I don’t know how small your learning rate is or which optimizer you use. But unless the learning rate is very large (such as 1), the problem is usually a mis-designed network.


class Conv3DModelFree(nn.Module):
    """Actor-critic network with a shared Conv3d trunk and two parallel heads.

    Every layer is held in an ``nn.Sequential`` (or is a direct attribute), so
    all parameters are registered with the module and are visible to
    ``parameters()``, ``state_dict()``, ``.to(device)`` and to tools that walk
    the module tree.

    Args:
        in_shape: indexable shape; ``in_shape[0]`` is the number of input
            channels, ``in_shape[1:]`` the trailing input dimensions.
        num_actions: number of actor outputs (action logits).
        **kwargs: must contain
            ``"conv_layers"``: sequence of entries indexed as ``[0]`` out
            channels, ``[1]`` kernel size, ``[2]`` stride;
            ``"fc_layers"``: sequence of fully connected widths, the first one
            shared, the remaining ones duplicated per branch;
            ``"num_frames"``: size of the frame dimension of the input.

    ``forward`` returns ``(action_logits, value)`` with ``num_actions`` and 1
    features per sample respectively.
    """

    def __init__(self, in_shape, num_actions, **kwargs):
        # nn.Module.__init__ accepts no arguments.
        super(Conv3DModelFree, self).__init__()
        fc_layers = kwargs["fc_layers"]
        conv_layers = kwargs["conv_layers"]

        self.shared_layers = []
        self.num_actions = num_actions
        self.in_shape = in_shape
        self.num_channels = in_shape[0]
        self.num_frames = kwargs["num_frames"]

        # =====================================================================
        # FEATURE EXTRACTOR SUBMODULE
        # =====================================================================
        # Each convolution consumes the previous layer's output channels,
        # starting from the raw input channels.
        next_inp = self.num_channels
        for cnn in conv_layers:
            self.shared_layers.append(nn.Conv3d(
                next_inp, cnn[0], kernel_size=cnn[1], stride=cnn[2]))
            self.shared_layers.append(nn.LeakyReLU())
            next_inp = cnn[0]

        # Flatten exactly once, after the LAST convolution, so that the
        # following Linear layers receive (batch, features).
        self.shared_layers.append(nn.Flatten(start_dim=1))

        # Push one all-zero sample through the trunk to discover the number of
        # flattened features; no gradients are needed for this probe.
        with torch.no_grad():
            fake_inp = torch.zeros(
                [1, self.num_channels, self.num_frames, *self.in_shape[1:]])
            for layer in self.shared_layers:
                fake_inp = layer(fake_inp)
        next_inp = fake_inp.size(1)

        # =====================================================================
        # ACTOR AND CRITIC SUBMODULES
        # =====================================================================
        self.actor_subnet = []
        self.critic_subnet = []
        for i, fc in enumerate(fc_layers):
            if i == 0:
                # The first fully connected layer is still shared.
                self.shared_layers.append(nn.Linear(next_inp, fc))
                self.shared_layers.append(nn.LeakyReLU())
            else:
                # Separate submodules for the actor and the critic
                self.actor_subnet.append(nn.Linear(next_inp, fc))
                self.critic_subnet.append(nn.Linear(next_inp, fc))
                self.actor_subnet.append(nn.LeakyReLU())
                self.critic_subnet.append(nn.LeakyReLU())
            next_inp = fc

        # Registering the layers through nn.Sequential is what makes their
        # parameters part of the module.
        self.nn_shared_layers = nn.Sequential(*self.shared_layers)
        self.nn_actor_subnet = nn.Sequential(*self.actor_subnet)
        self.nn_critic_subnet = nn.Sequential(*self.critic_subnet)

        # Output heads: without them both branches would end in a LeakyReLU
        # with fc_layers[-1] features instead of logits / a scalar value.
        self.actor = nn.Linear(next_inp, self.num_actions)
        self.critic = nn.Linear(next_inp, 1)

    def forward(self, input):
        """Return ``(action_logits, value)`` for a batch of inputs.

        The two branches both read the same shared features, so gradients
        from both outputs flow back into the shared trunk.
        """
        x = self.nn_shared_layers(input)
        action_logits = self.actor(self.nn_actor_subnet(x))
        value = self.critic(self.nn_critic_subnet(x))

        return action_logits, value

Thanks a lot for the answer.
I tried your solution but the network behaves exactly like in my second code.

Actually, the network in the second code contains the activation functions, I checked that the architecture was correct also through tensorboard.
The learning rate is 3e-4 but in all cases, the program is identical for both models I have defined, so there are no other elements outside the network that influence the training.
I don’t know, maybe using 2 Sequential blocks in parallel like in the second code can cause problems with backpropagation (I’m just assuming)

@ColdFrenzy

nn.Sequential should work as mentioned by @thecho7. Below is an example

ORIGINAL

import numpy as np
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

class random_model(nn.Module):
    """Toy regressor (100 -> 20 -> ... -> 20 -> 1) with list-held middle blocks.

    ``model2`` is kept as a plain Python list of ``num_layers`` blocks; in this
    variant it is not wrapped in ``nn.Sequential``, so only ``model1`` and
    ``model3`` are registered as submodules.
    """

    def __init__(self, num_layers):
        super().__init__()
        self.model1 = nn.Sequential(
            nn.Linear(100, 20), nn.BatchNorm1d(20), nn.ReLU())
        # Plain list on purpose for this variant (no nn.Sequential wrapper).
        self.model2 = [
            nn.Sequential(nn.Linear(20, 20), nn.BatchNorm1d(20), nn.ReLU())
            for _ in range(num_layers)
        ]
        self.model3 = nn.Linear(20, 1)

    def forward(self, x):
        """Apply the input block, each middle block in turn, then the output layer."""
        hidden = self.model1(x)
        for block in self.model2:
            hidden = block(hidden)
        return self.model3(hidden)
    
import wandb

# Hyperparameters used by the optimizer, the data and the training loop, and
# recorded with the wandb run.
config = {
  "learning_rate": 0.001,
  "epochs": 100,
  "batch_size": 100
}

model = random_model(10)
loss = torch.nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])

X = torch.rand(config["batch_size"], 100)
# The model outputs (batch, 1); targets must have the same shape, otherwise
# MSELoss broadcasts (batch, 1) against (batch,) into a (batch, batch) error
# matrix and optimizes the wrong quantity.
y = torch.rand(config["batch_size"], 1)

# Pass the hyperparameters to the run at creation time.
wandb.init(project="random-project-custom-class", entity="", config=config)

# 3. Log gradients and model parameters
wandb.watch(model, log="gradients", log_freq=100)
print("DEF")
for cur_epoch in range(config["epochs"]):
    optimizer.zero_grad()
    output = model(X)
    cur_loss = loss(output, y)
    cur_loss.backward()
    optimizer.step()
    # Report a plain Python float rather than a tensor still attached to the
    # autograd graph.
    loss_value = cur_loss.item()
    print("Epoch {0} Loss is {1}".format(cur_epoch, loss_value))
    wandb.log({"loss": loss_value})

MODIFIED

import numpy as np
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

class random_model(nn.Module):
    """Toy regressor (100 -> 20 -> ... -> 20 -> 1) built only from nn containers.

    The ``num_layers`` middle blocks are wrapped in a single ``nn.Sequential``
    (``model2``), so ``model1``, ``model2`` and ``model3`` are all registered
    submodules.
    """

    def __init__(self, num_layers):
        super().__init__()
        self.model1 = nn.Sequential(
            nn.Linear(100, 20), nn.BatchNorm1d(20), nn.ReLU())
        middle_blocks = [
            nn.Sequential(nn.Linear(20, 20), nn.BatchNorm1d(20), nn.ReLU())
            for _ in range(num_layers)
        ]
        # Wrapping the blocks in nn.Sequential registers them with the module.
        self.model2 = nn.Sequential(*middle_blocks)
        self.model3 = nn.Linear(20, 1)

    def forward(self, x):
        """Apply input block, middle stack and output layer in sequence."""
        return self.model3(self.model2(self.model1(x)))
    
import wandb

# Hyperparameters used by the optimizer, the data and the training loop, and
# recorded with the wandb run.
config = {
  "learning_rate": 0.001,
  "epochs": 100,
  "batch_size": 100
}

model = random_model(10)
loss = torch.nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])

X = torch.rand(config["batch_size"], 100)
# The model outputs (batch, 1); targets must have the same shape, otherwise
# MSELoss broadcasts (batch, 1) against (batch,) into a (batch, batch) error
# matrix and optimizes the wrong quantity.
y = torch.rand(config["batch_size"], 1)

# Pass the hyperparameters to the run at creation time.
wandb.init(project="random-project-custom-class", entity="", config=config)

# 3. Log gradients and model parameters
wandb.watch(model, log="gradients", log_freq=100)
print("DEF")
for cur_epoch in range(config["epochs"]):
    optimizer.zero_grad()
    output = model(X)
    cur_loss = loss(output, y)
    cur_loss.backward()
    optimizer.step()
    # Report a plain Python float rather than a tensor still attached to the
    # autograd graph.
    loss_value = cur_loss.item()
    print("Epoch {0} Loss is {1}".format(cur_epoch, loss_value))
    wandb.log({"loss": loss_value})

@anantguptadbl
Yes, I agree that the nn.Sequential model works like this and it is possible to view it on wandb, in fact in my second code snippet I am able to view the graphs of the gradient correctly.

But the difference from your code is that in my case the network is not completely sequential: at some point it splits into two parallel branches (the actor and critic subnets). In this case, if I pass the shared_layer output to the two subnets (which are two nn.Sequential modules), the loss values become completely meaningless. I think this is due to the use of the nn.Sequential modules, since the rest of the program is exactly the same.

Aah understood. Let me try that as well