Hi,
I find that my customized loss does not decrease during training, so I checked the gradients of all the MLPs and they are all zero the whole time. The network does not learn at all. I don't know which part is wrong. Thanks for any help.
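In case it matters, this is roughly how I check the gradients (a minimal sketch of the check; it runs right after loss.backward() in the training loop below):

# sketch: inspect the gradient of every registered parameter after loss.backward()
for name, param in model.named_parameters():
    if param.grad is None:
        print(f'{name}: grad is None')
    else:
        print(f'{name}: grad norm = {param.grad.norm().item():.3e}')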
The structure of my network:
import torch
from Compared_layer import Compared_layer

class ComparedGNN(torch.nn.Module):
    def __init__(self, edge_feature_size, num_antenna, num_BS, num_layers, power, noise):
        super(ComparedGNN, self).__init__()
        # if you have cuda
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)  # move the entire model to cuda
        # initialize the first connection layer
        self.layer1 = Compared_layer(edge_feature_size, num_antenna, num_BS, power, noise)
        # initialize the middle layers
        self.middle_layers = []
        for i in range(num_layers - 1):
            layer = Compared_layer(edge_feature_size, num_antenna, num_BS, power, noise)
            self.middle_layers.append(layer)

    def forward(self, F_ue0, E, P, Noise):
        F_ue = self.layer1(F_ue0, E, P, Noise)  # F_ue0 is random for the input of the first layer
        if self.device.type == "cuda":
            F_ue = F_ue.to(self.device)
        for layer in self.middle_layers:
            F_ue = layer(F_ue, E, P, Noise)
            if self.device.type == "cuda":
                F_ue = F_ue.to(self.device)
        return F_ue
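For reference, here is a quick sketch I can run to list which parameter tensors the model actually exposes to the optimizer (the constructor arguments are the same placeholders as in train() further down):

# sketch: list what model.parameters() / the optimizer will see
model = ComparedGNN(edge_feature_size=2 * N, num_antenna=N, num_BS=M,
                    num_layers=2, power=P[0], noise=Noise[0])  # same args as in train()
for name, p in model.named_parameters():
    print(name, tuple(p.shape), p.requires_grad)
print('number of parameter tensors:', sum(1 for _ in model.parameters()))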
The customized loss, getW, and train functions:
def Loss(W, H, Noise):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    K = Noise.size()
    K = K[0]
    N = H.size(1) // K
    H = H.to(torch.complex128)
    W = W.to(torch.complex128)
    R = torch.zeros(K)
    interference = torch.zeros(K)
    SINR = torch.zeros(K)
    if device.type == "cuda":
        Noise = Noise.to(device)
    for k in range(K):
        h_k = H[:, k * N:(k + 1) * N]  # of size [M, N]
        Signal = 0.0
        h_kT = torch.transpose(torch.conj(h_k), 0, 1)  # of size [N, M]
        h_kT = torch.flatten(h_kT).requires_grad_(True)  # of size [MN]
        hw_k = torch.matmul(h_kT, W[:, k])
        hw_k.retain_grad()
        signal = torch.sum(hw_k)  # h_kT * w_k
        Signal = abs(signal) ** 2
        for l in range(K):
            interference_k = 0.0  # h_kT * w_l
            if l != k:
                hw_l = torch.matmul(h_kT.view(-1), W[:, l])
                # hw_l.retain_grad()
                interference_k = torch.sum(hw_l)
                # interference_k = torch.matmul(h_kT.view(-1), W[:, l]).sum()
                interference[k] = interference[k] + abs(interference_k) ** 2
        SINR[k] = Signal / (interference[k] + Noise[k])
        R[k] = torch.log2(1 + SINR[k])
    Rsum = -torch.sum(R)
    return Rsum
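For context, what the two nested loops are meant to compute is the negated sum rate, with Noise[k] playing the role of the noise power \sigma_k^2:

SINR_k = |h_k^H w_k|^2 / ( \sum_{l \ne k} |h_k^H w_l|^2 + \sigma_k^2 )
Rsum = - \sum_k \log_2(1 + SINR_k)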
def getW(F_ue, P, K):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    M = P.size()
    P = P.to(device)
    M = M[0]
    N = F_ue.size(0) // (2 * M)
    F_ue_complex = F_ue[:M * N, :] + 1j * F_ue[M * N:, :]  # before normalization
    W = torch.zeros(M * N, K, device=device) + 1j * torch.zeros(M * N, K, device=device)
    W_new = torch.zeros(W.size(), device=device) + 1j * torch.zeros(W.size(), device=device)
    W = torch.autograd.Variable(W, requires_grad=True)
    for m in range(M):
        W_temp = F_ue_complex[m * N:(m + 1) * N, :].clone().to(device)  # of size N*K
        norm_sum = 0.0
        for k in range(K):
            norm_k = torch.norm(W_temp[:, k], p=2) ** 2
            norm_sum += norm_k
        W_new[m * N:(m + 1) * N, :] = P[m] * W_temp / norm_sum
    W = W_new
    W = W.to(device)
    return W
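A minimal standalone check for whether getW keeps the autograd graph from W back to F_ue (the sizes and power values below are just placeholders):

# sketch: does getW keep the graph from W back to F_ue?
M, N, K = 3, 2, 4                      # placeholder sizes
P = torch.ones(M)                      # placeholder per-BS power budgets
F_ue = torch.rand(2 * M * N, K, requires_grad=True)
W = getW(F_ue, P, K)
W.abs().sum().backward()
print('grad norm w.r.t. F_ue:', F_ue.grad.norm().item())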
from torch.utils.data import DataLoader, Subset

def train(P, Noise, dataset, num_epochs, lr):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')  # same device selection as in the model
    M = P.size()
    M = M[0]
    K = Noise.size()
    K = K[0]
    KN = dataset.size(dim=2)
    N = KN // K
    model = ComparedGNN(edge_feature_size=2 * N, num_antenna=N, num_BS=M, num_layers=2, power=P[0], noise=Noise[0])
    model.to(device)
    model.train()
    # optimizer = torch.optim.Adam(model.parameters(), lr)
    optimizer = torch.optim.RMSprop(model.parameters(), lr)
    train_size = int(1 * len(dataset))
    print(f'train size: {train_size}')
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
    batch_size = 64
    num_batches = (train_size + batch_size - 1) // (batch_size * num_epochs)
    print(f'num_batches: {num_batches}')
    # define lists to save loss and rate for each epoch
    losses = []
    rate = []
    for epoch in range(num_epochs):
        running_loss = 0.0
        for batch_idx in range(num_batches):  # number of minibatches = num_batches
            model.train()
            start_idx = (epoch * num_batches + batch_idx) * batch_size
            end_idx = start_idx + batch_size
            subset_indices = range(start_idx, end_idx)
            subset = Subset(train_dataset, subset_indices)
            subset_loader = DataLoader(subset, batch_size=batch_size, shuffle=True)
            # obtain data from each batch
            for data in subset_loader:
                channel_complex = data
                if device.type == "cuda":
                    channel_complex = channel_complex.to(device)
                optimizer.zero_grad()
                loss = 0.0
                batch_n = channel_complex.size(dim=0)
                for b in range(batch_n):
                    H_complex = channel_complex[b, :, :]
                    # transfer the complex into real to be the edge feature
                    edge_feature = torch.zeros(M, K, 2 * N)
                    for m in range(M):
                        for k in range(K):
                            h_complex = H_complex[m, k * N:(k + 1) * N]
                            h_real = h_complex.real
                            h_imag = h_complex.imag
                            edge_feature[m, k, :] = torch.cat((h_real, h_imag), dim=0)
                    F_ue0 = torch.rand(2 * M * N, K)
                    if device.type == "cuda":
                        F_ue0 = F_ue0.to(device)
                        edge_feature = edge_feature.to(device)
                    F_ue = model(F_ue0, edge_feature, P, Noise)
                    W = getW(F_ue, P, K)
                    loss = loss + Loss(W, H_complex, Noise)
                loss /= batch_n
                if device.type == "cuda":
                    loss = loss.cpu()
                running_loss += loss.item()
                for name, param in model.named_parameters():
                    if param.grad is None:
                        print(f'Parameter: {name}, Gradient: {param.grad}')
                loss.backward()
                optimizer.step()
        epoch_loss = running_loss / num_batches
        losses.append(epoch_loss)
        rate.append(-epoch_loss)
        print(f'Epoch: {epoch + 1:03d}, Training Loss: {epoch_loss:.4f}')
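And this is roughly how everything is invoked; all sizes and hyperparameters below are placeholders for my actual setup:

# sketch: how train() is called; all numbers are placeholders
M, K, N = 3, 4, 2                                              # num_BS, num_UE, num_antenna
P = torch.ones(M)                                              # per-BS power budgets
Noise = 0.1 * torch.ones(K)                                    # per-UE noise power
dataset = torch.randn(3200, M, K * N, dtype=torch.complex64)   # channel samples [num_samples, M, K*N]
train(P, Noise, dataset, num_epochs=50, lr=1e-3)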