RuntimeError: Shapes are not broadcastable for mul operation

I am trying to use the gpytorch library for a regression model where I give x(t-2) and x(t-1) as inputs, and the deep Gaussian process model returns y(t) at the current time step. Here is part of my code.

To reproduce

import torch
from torch import nn
from torch.nn import functional as F
import gpytorch
from gpytorch.models.deep_gps.dspp import DSPPLayer, DSPP
from gpytorch.means import ConstantMean, ZeroMean, LinearMean
from gpytorch.kernels import ScaleKernel, SpectralMixtureKernel
from gpytorch.variational import VariationalStrategy, CholeskyVariationalDistribution
from gpytorch.distributions import MultivariateNormal
from gpytorch.likelihoods import GaussianLikelihood
class DGPHiddenLayer(DSPPLayer):
    """A single layer of a deep sigma point process (DSPP).

    Each layer is a (possibly batched) variational GP with a spectral-mixture
    kernel wrapped in a scale kernel. When ``output_dims`` is not None the
    layer models ``output_dims`` independent GPs via a batch dimension.
    """

    def __init__(self, input_dims, output_dims, device, num_inducing=128,
                 inducing_points=None, mean_type='constant', num_mixtures=5,
                 num_quad_sites=8):
        """Build one DSPP layer.

        Args:
            input_dims: dimensionality of the layer input.
            output_dims: number of output GPs, or None for a single
                un-batched GP.
            device: torch device the inducing points are moved to.
            num_inducing: number of inducing points per output GP.
            inducing_points: optional pre-initialized inducing-point tensor of
                shape ``(num_inducing, input_dims)`` (or
                ``(output_dims, num_inducing, input_dims)`` when batched).
                Randomly initialized when None.
                NOTE(review): the original code accepted this argument but
                always overwrote it with random points; it is now honored.
            mean_type: 'constant', 'zero', or anything else for a linear mean.
            num_mixtures: number of spectral-mixture components.
            num_quad_sites: number of quadrature sites for the DSPP.
        """
        # The batch shape mirrors the number of outputs: one GP per output dim.
        if output_dims is None:
            batch_shape = torch.Size([])
            if inducing_points is None:
                inducing_points = torch.randn(num_inducing, input_dims)
        else:
            batch_shape = torch.Size([output_dims])
            if inducing_points is None:
                inducing_points = torch.randn(output_dims, num_inducing, input_dims)
        inducing_points = inducing_points.to(device=device)

        variational_distribution = CholeskyVariationalDistribution(
            num_inducing_points=num_inducing,
            batch_shape=batch_shape
        )
        variational_strategy = VariationalStrategy(
            self,
            inducing_points,
            variational_distribution,
            learn_inducing_locations=True
        )

        super().__init__(variational_strategy, input_dims, output_dims, num_quad_sites)

        # Mean module: constant / zero are batched; the fallback is a linear
        # mean over the raw inputs (un-batched, as in the original).
        if mean_type == 'constant':
            self.mean_module = ConstantMean(batch_shape=batch_shape)
        elif mean_type == 'zero':
            self.mean_module = ZeroMean(batch_shape=batch_shape)
        else:
            self.mean_module = LinearMean(input_dims)

        self.covar_module = ScaleKernel(
            SpectralMixtureKernel(num_mixtures=num_mixtures, batch_shape=batch_shape, ard_num_dims=input_dims),
            batch_shape=batch_shape, ard_num_dims=None
        )

    def forward(self, x):
        """Return the layer's latent GP distribution at inputs ``x``."""
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return MultivariateNormal(mean_x, covar_x)

class DeepGaussianProcesses(DSPP):
    """A deep sigma point process: a first DSPP layer followed by a variable
    number of hidden DSPP layers, with a Gaussian observation likelihood.
    """

    def __init__(self, input_size, output_size, device, num_inducing=50, Q=8,
                 max_cholesky_size=10000, **kwargs):
        """Build the deep GP.

        Args:
            input_size: dimensionality of the model input.
            output_size: dimensionality of the model output.
            device: torch device the whole model is moved to.
            num_inducing: inducing points per GP layer.
            Q: number of quadrature sites for the DSPP.
            max_cholesky_size: gpytorch max-Cholesky setting used in predict().
            **kwargs: must contain 'hidden_size' (see hidden_size_extract);
                may contain 'mean_type' ('constant', 'zero', or 'linear';
                defaults to 'constant').
        """
        # Default the mean type instead of only setting it when present in
        # kwargs — the original raised AttributeError when it was omitted.
        self.mean_type = kwargs.get('mean_type', 'constant')

        hidden_size = hidden_size_extract(kwargs, 'hidden_size')
        # Copy before appending: the original appended output_size to the
        # caller's list in place, mutating it (and, via DGP's mutable default,
        # growing the shared default list across instantiations).
        self.hidden_size = list(hidden_size)
        self.hidden_size.append(output_size)
        self.max_cholesky_size = max_cholesky_size
        self.device = device

        first_layer = DGPHiddenLayer(
            input_dims=input_size,
            output_dims=self.hidden_size[0],
            device=self.device,
            num_inducing=num_inducing,
            mean_type=self.mean_type,
            num_quad_sites=Q,
        )

        # Hidden layers use a constant mean unless the model-level mean type
        # is 'zero'; the two copy-pasted branches of the original collapse to
        # a single comprehension with a computed mean type.
        hidden_mean = 'constant' if self.mean_type in ('linear', 'constant') else 'zero'
        hidden_layers = nn.ModuleList([
            DGPHiddenLayer(
                input_dims=self.hidden_size[i],
                output_dims=self.hidden_size[i + 1],
                device=self.device,
                num_inducing=num_inducing,
                mean_type=hidden_mean,
                num_quad_sites=Q,
            )
            for i in range(len(self.hidden_size) - 1)
        ])

        super().__init__(Q)

        self.first_layer = first_layer
        self.hidden_layers = hidden_layers
        self.likelihood = GaussianLikelihood()
        self.to(device=self.device)

    def forward(self, inputs):
        """Propagate inputs through every GP layer; returns the final layer's
        latent distribution."""
        out = self.first_layer(inputs)
        for hidden in self.hidden_layers:
            out = hidden(out)
        return out

    def predict(self, x):
        """Return (mean, variance, sample) of the predictive distribution at x.

        Runs under no_grad and the configured max-Cholesky-size setting.
        """
        with torch.no_grad(), gpytorch.settings.max_cholesky_size(self.max_cholesky_size):
            preds = self.likelihood(self(x))
            # The original collected single-element lists and torch.cat'ed
            # them — return the tensors directly instead.
            return preds.mean, preds.variance, preds.rsample()
class DGP(DeepGaussianProcesses):
    """Deep GP whose input is ``lagging_size`` stacked latent states of
    dimension ``latent_size`` and whose output is an action vector."""

    def __init__(self, latent_size, action_size, lagging_size, device,
                 hidden_size=None, mean_type='constant'):
        """Args mirror DeepGaussianProcesses; hidden_size defaults to [50].

        The original used a mutable default ``hidden_size=[50]``: the base
        class appends ``output_size`` to the list, so the shared default grew
        by one entry on every instantiation. A None sentinel avoids that.
        """
        if hidden_size is None:
            hidden_size = [50]
        # Lagged latent states are concatenated along the feature axis.
        input_size = latent_size * lagging_size
        super().__init__(input_size=input_size, output_size=action_size,
                         device=device, hidden_size=hidden_size,
                         mean_type=mean_type)

Here is where I apply the DGP class to synthetic data

# Synthetic-data reproduction.
device = 'cuda'
latent_size = 20
action_size = 6
lagging_size = 2
deep_module = DGP(latent_size, action_size, lagging_size, device).to(device=device)
# Shape (batch, samples, latent_size * lagging_size). Create the input on the
# same device as the model — the original built it on the CPU while the model
# lived on 'cuda', which would itself fail with a device mismatch.
lagging_states = torch.rand((1, 8, 40), device=device)
a, a_v, a_s = deep_module.predict(lagging_states)

I get this error message, which I don't know how to fix.

a, a_v, a_s = deep_module.predict(lagging_states)
  File "/home/models.py", line 313, in predict
    preds = self.likelihood(self(x))
  File "/home/lib/python3.8/site-packages/gpytorch/likelihoods/likelihood.py", line 65, in __call__
    return self.marginal(input, *args, **kwargs)
  File "/home/lib/python3.8/site-packages/gpytorch/likelihoods/gaussian_likelihood.py", line 72, in marginal
    full_covar = covar + noise_covar
  File "/home/lib/python3.8/site-packages/gpytorch/lazy/lazy_tensor.py", line 2035, in __add__
    return AddedDiagLazyTensor(self, other)
  File "/home/lib/python3.8/site-packages/gpytorch/lazy/added_diag_lazy_tensor.py", line 32, in __init__
    broadcasting._mul_broadcast_shape(lazy_tensors[0].shape, lazy_tensors[1].shape)
  File "/home/lib/python3.8/site-packages/gpytorch/utils/broadcasting.py", line 20, in _mul_broadcast_shape
    raise RuntimeError("Shapes are not broadcastable for mul operation")
RuntimeError: Shapes are not broadcastable for mul operation

I looked for similar errors and applied the different solutions suggested for them; however, they didn't work in my case. I would appreciate it if anyone could offer a solution for this error. Many thanks.