In my code, I encountered issues with tensor device management, specifically with tensors stored in custom containers (`TensorPack`

objects). Initially, tensors were defaulting to CPU, even when the model was moved to GPU. To address this, I manually added functions to ensure tensors were correctly assigned to the same device as the model parameters. However, during backpropagation, device-related issues persisted for tensors within `TensorPack`

objects, impacting gradient computation and model performance, it is my guess and not sure about that.

I would greatly appreciate any insights or assistance in resolving this issue.

```
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import torch
# torch.autograd.set_detect_anomaly(True)
import torch.nn as nn
from typing import List, Tuple
import numpy as np
import torch.jit as jit
from torch import Tensor
from torch.nn import Parameter
import torch.optim as optim
########################
# operation's core
########################
class TensorPack(object):
def __init__(self, tensors:List[torch.Tensor], shapes:List[List[int]]):
self.tensors = tensors
for tensor in self.tensors:tensor.retain_grad()
def to(self, device):
self.tensors = [tensor.to(device) for tensor in self.tensors]
return self
def TPMatmul(input:torch.Tensor, tensor_pack:TensorPack):
# keep it very simple : Input * Tensor1 * Tensor2 * ...
mode = len(tensor_pack.tensors)
data = input
for i in range(mode):
data = torch.matmul(data, tensor_pack.tensors[i])
return data
########################
# Layer's main operation
########################
class operation(jit.ScriptModule):
def __init__(self, in_features:int, out_features:int, mode:int):
super().__init__()
self.in_features = in_features
self.out_features = out_features
self.kernel_size = in_features * out_features
self.mode = mode
self.shapes = np.zeros((mode,2)).astype('int32')
self.div = int(self.out_features//self.mode)
self.shapes[0] = (in_features, self.div)
self.inds = np.zeros(self.mode+1).astype('int32')
self.inds[1] = np.prod(self.shapes[0])
for i in range(1, self.mode):
dim1 = (i) * self.div
dim2 = (i+1) * self.div
if i == self.mode - 1:dim2 = self.out_features
self.shapes[i]=(dim1, dim2)
self.inds[i+1] = self.inds[i] + dim1 * dim2
self.kernel_length = np.sum(np.prod(self.shapes, axis=1))
self.shapes = self.shapes.tolist()
self.kernels = Parameter(torch.rand((self.kernel_length,)))
self.construct_tensor_pack()
@torch.jit.export
def construct_tensor_pack(self):
tensors = torch.jit.annotate(List[Tensor], [])
for i in range(self.mode):
dim1 = self.shapes[i][0]
dim2 = self.shapes[i][1]
kernel = torch.reshape(self.kernels[self.inds[i]:self.inds[i+1]],(dim1, dim2))
kernel.retain_grad()
tensors.append(kernel)
# construct a simple container
self.tensor_pack = TensorPack(tensors, self.shapes)
@torch.jit.export
def forward(self, input: Tensor):
return TPMatmul(input, self.tensor_pack)
def to(self, device):
super().to(device)
self.tensor_pack = self.tensor_pack.to(device)
return self
##########
# Layer
##########
class Layer(jit.ScriptModule):
def __init__(self, in_features:int, out_features:int, mode:int):
super(Layer, self).__init__()
self.op = operation(in_features, out_features, mode)
@jit.script_method
def forward(self, inputs: Tensor) -> Tensor:
inputs_ts = inputs.unbind(1)
outputs = torch.jit.annotate(List[Tensor], [])
for i in range(len(inputs_ts)):
out = self.op(inputs_ts[i])
outputs += [out]
outputs = torch.stack(outputs, dim=1)
return outputs
def to(self, device):
super().to(device)
self.op.to(device)
return self
##########
# Model
##########
class Model(nn.Module):
def __init__(self, in_features:int = 16 , out_features:int = 48, mode:int = 3):
super(Model, self).__init__()
self.in_features = in_features
self.out_features = out_features
self.mode = mode
self.layer = Layer(self.in_features, self.out_features, self.mode)
self.act = nn.Sigmoid()
def forward(self, x):
y = self.layer(x)
z = self.act(y)
return z
def to(self, device):
super().to(device)
self.layer = self.layer.to(device)
return self
model = Model()
device = torch.device("cuda")
model.to(device)
print("")
print("==================================")
print(f"Model device set to: {device}")
print(f"kernels.device:{model.layer.op.kernels.device}")
for i in range(model.mode):
print(f"tensor_pack.tensors[{i}].device:{model.layer.op.tensor_pack.tensors[i].device}")
print("==================================")
batch_size = 8
seq_length = 28
in_features = 16
out_features = 48
model = Model()
device = torch.device("cuda")
model.to(device)
model.train()
optimizer = optim.Adam(model.parameters(), lr=0.005)
inputs = torch.rand((batch_size, seq_length, in_features)).to(device)
targets = torch.rand((batch_size, seq_length, out_features)).to(device)
model.train()
outputs = model(inputs)
criterion = nn.BCELoss()
optimizer.zero_grad()
loss = criterion(outputs, targets)
loss.backward(retain_graph=True)
optimizer.step()
```