RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

Here is the code. I feel like the error might be caused when I say layers+=[RowLSTM()], since this has not been pushed to GPU yet? Sorry for the length but the error could be anywhere so here it all is.

import torch.nn.init as init
import torch
__all__ = [
  'VGG', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn',
  'vgg19_bn', 'vgg19',
]


class VGG(nn.Module):
  '''
  VGG model 
  '''
  def __init__(self, features): # features represents the layers array
      super(VGG, self).__init__()
      self.features = features
      self.classifier = nn.Sequential(
          nn.Dropout(),
          nn.Linear(512,512),
          nn.ReLU(True),
          nn.Dropout(),
          nn.Linear(512, 512),
          nn.ReLU(True),
          nn.Linear(512, 10),
      )
       # Initialize weights
      for m in self.modules():
          if isinstance(m, nn.Conv2d):
              n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
              m.weight.data.normal_(0, math.sqrt(2. / n))
              m.bias.data.zero_()


  def forward(self, x): # x is the image, we run x through the layers
      print(x.size())
      x = self.features(x) # runs through all features, where each feature is a function
      x = x.view(x.size(0), -1) 
      # after running through features, does sequential steps to finally classify
      x = self.classifier(x)
      # print(x)
      return x


def make_layers(cfg, batch_norm=False):
 # print("Making layers!")
  layers = []
  in_channels = 3
  for v in cfg:
      if v == 'M':
          layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
      else:
          conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
          if batch_norm:
              layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
          else:
              layers += [conv2d, nn.ReLU(inplace=True)]
          in_channels = v
          layers+=[RLSTM()]

  return nn.Sequential(*layers)

class RLSTM(nn.Module):
  def __init__(self):
      super(RLSTM,self).__init__()



  def forward(self, image):
      print("going in rowlstm")
      global current
      global _layer
      global isgates
      size = image.size()
      b = size[0]
      indvs = list(image.split(1,0)) # split up the batch into individual images
      #print(indvs[0].size())
      tensor_array = []
      for i in range(b):
          current = 0
          _layer = []
          isgates = []
          tensor_array.append(self.RowLSTM(indvs[i]))

      seq=tuple(tensor_array)
      trans = torch.cat(seq,0)
      return trans.cuda() # trying to make floattensor error go away 
  def RowLSTM(self, image): 
      global current
      global _layer
      global isgates


      # input-to-state (K_is * x_i) : 3x1 convolution. generate 4h x n x n tensor. 4hxnxn tensor contains all i -> s info

  # the input to state convolution should only be computed one time 
      if current==0:
          n = image.size()[2]
          ch=image.size()[1]
          input_to_state = torch.nn.Conv2d(ch,4*ch,kernel_size=(1,3),padding=(0,1))
          isgates = self.splitIS(input_to_state(image)) # convolve, then split into gates (4 per row)
          cell=RowLSTMCell(0,torch.randn(ch,n,1),torch.randn(ch,n,1),torch.randn(ch,n,1),torch.randn(ch,n,1),torch.randn(ch,n,1),torch.randn(ch,n,1))
          # now have dummy, learnable variables for first row
          _layer.append(cell)

      else:   
          Cell_prev = _layer[current-1] # access previous row
          hidPrev = Cell_prev.getHiddenState() 
          ch = image.size()[1] 
      #   print("about to apply conv1d")
          state_to_state = torch.nn.Conv2d(ch,4*ch,kernel_size=(1,3),padding=(0,1)) # error is here: hidPrev is an array - not a valid number of input channel
      #   print("applied conv1d") 
          prevHid=Cell_prev.getHiddenState()
          ssgates = self.splitSS(state_to_state(prevHid.unsqueeze(0))) #need to unsqueeze (Ex: currently 16x5, need to make 1x16x5)
          gates = self.addGates(isgates,ssgates,current)
          # split gates
          ig, og, fg, gg = gates[0], gates[1], gates[2], gates[3] # into four, ADD SIGMOID!
          cell = RowLSTMCell(Cell_prev,ig,og,fg,gg,0,0)
          cell.compute()
          _layer.append(cell)
      # attempting to eliminate requirement of getting size

      #print(current)
      try:
          
          current+=1
          y=(isgates[0][0][1][current])
          return self.RowLSTM(image) 
      except Exception as error:
          concats=[]
          for cell in _layer:
              tensor=torch.unsqueeze(cell.h,0)
              
              concats.append(tensor)
          seq=tuple(concats)
          tensor=torch.cat(seq,3)
          return tensor

  def splitIS(tensor): #always going to be splitting into 4 pieces, so no need to add extra parameters
      inputStateGates={}
      size=tensor.size() # 1 x 4h x n x n
      out_ft=size[1] # get 4h for the nxnx4h tensor
      num=size[2] # get n for the nxn image
      hh=out_ft/4 # we want to split the tensor into 4, for the gates
      tensor = torch.squeeze(tensor) # 4h x n x n

      # First, split by row: Creates n tensors of 4h x n x 1
      rows = list(tensor.split(1,2))

      for i in range(num):
          # Each row is a tensor of 4h x n x 1, split it into 4 of h x n x 1
          row=rows[i]
          inputStateGates[i]=list(row.split(hh,0))
          
      return inputStateGates 


  def splitSS(tensor): # 1 x 4h x n x 1, create 4 of 1 x h x n x 1 
      size=tensor.size() 
      out_ft=size[1] # get 4h for the 1x4hxn tensor
      num=size[2] # get n for the 1xhxn row
      hh=out_ft/4 # we want to split the tensor into 4, for the gates
      tensor = tensor.squeeze(0) # 4h x n x 1
      splitted=list(tensor.split(hh,0))
      return splitted 


  def addGates(i2s,s2s,key):
      """ these dictionaries are of form {key : [[i], [o], [f], [g]]}
          we want to add pairwise elemeents """

      # i2s is of form key: [[i], [o], [f], [g]] where each gate is hxn
      # s2s is of form [[h,n],[h,n],[h,n], [h,n]]
      gateSum = []
      for i in range(4): # always of length 4, representing the gates
          gateSum.append(torch.sigmoid(i2s[key][i] + s2s[i]))

      return gateSum
cfg = {
  'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
  'B': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
  'D': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
  'E': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 
        512, 512, 512, 512, 'M'],
}