Recently, in my deep learning model, I tried to use the torch.sub() function (or the '-' operator) to subtract two tensors, and the GPU memory consumption roughly doubled. If I wrap the subtraction in with torch.no_grad(), the memory consumption goes back to normal, so I guess the extra memory comes from the gradient calculation, but I don't know how to solve it.
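For clarity, this is roughly what I mean by the no_grad workaround (a minimal sketch; x_i, x_j, and x_ij refer to the tensors in the forward() method below):

```python
# Sketch of the workaround mentioned above: wrapping only the subtraction
# in torch.no_grad() brings memory back to normal, but it also cuts x_ij
# out of the autograd graph, so the layers producing node_feat no longer
# receive gradients through this branch.
with torch.no_grad():
    x_ij = torch.sub(x_i, x_j)
    x_ij = torch.abs(x_ij)
```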
import torch
import torch.nn as nn

class edge_generator(nn.Module):
    def __init__(self, feature_dim=256, num_classes=9):
        super(edge_generator, self).__init__()
        self.feature_dim = feature_dim
        # small MLP that scores the dissimilarity of a node pair
        self.sim_net = nn.Sequential(
            nn.Linear(self.feature_dim, 2 * self.feature_dim),
            nn.LayerNorm(2 * self.feature_dim),
            nn.ReLU(),
            nn.Linear(2 * self.feature_dim, 1)
        )
        self.edge_loss = BCEFocalLoss()  # custom focal BCE loss, defined elsewhere
        self.num_classes = num_classes

    def forward(self, node_feat, label):
        num_data = node_feat.size(0)
        feat_size = node_feat.size(1)
        # distance based: pairwise absolute differences between all node features
        x_i = node_feat.unsqueeze(1)
        x_j = x_i.transpose(0, 1)
        x_i = x_i.expand(num_data, num_data, feat_size)
        x_j = x_j.expand(num_data, num_data, feat_size)
        # x_ij = torch.abs(x_i - x_j)
        x_ij = torch.sub(x_i, x_j)
        x_ij = torch.abs(x_ij)
        # compute similarity/dissimilarity (num_data x num_data)
        edge_feat = torch.sigmoid(self.sim_net(x_ij)).squeeze()
        force_edge_feat = torch.eye(num_data).cuda()
        edge_feat = edge_feat + force_edge_feat
        edge_feat = edge_feat + 1e-6
        edge_feat = edge_feat / torch.sum(edge_feat, dim=1).unsqueeze(1)
        one_hot_labels = self.one_hot(label)
        edge_labels = torch.matmul(one_hot_labels, one_hot_labels.transpose(0, 1))
        edge_loss = self.edge_loss(edge_feat, edge_labels.long())
        return edge_feat, edge_loss

    def one_hot(self, x):
        return torch.eye(self.num_classes)[x.long(), :].cuda()
After the torch.sub() call runs and the x_ij result is obtained, the GPU memory usage increases.
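To make the jump concrete, here is a minimal sketch of how I measure the allocation around that step (num_data=128 and feat_size=256 are just example sizes, not my real data, and it assumes a CUDA device):

```python
import torch

# Standalone reproduction of the allocation jump around the subtraction.
num_data, feat_size = 128, 256  # example sizes only
node_feat = torch.randn(num_data, feat_size, device="cuda", requires_grad=True)

x_i = node_feat.unsqueeze(1)                     # (num_data, 1, feat_size) view
x_j = x_i.transpose(0, 1)                        # (1, num_data, feat_size) view
x_i = x_i.expand(num_data, num_data, feat_size)  # expand() returns views, no copy yet
x_j = x_j.expand(num_data, num_data, feat_size)

before = torch.cuda.memory_allocated()
x_ij = torch.sub(x_i, x_j)                       # materializes (num_data, num_data, feat_size)
x_ij = torch.abs(x_ij)
after = torch.cuda.memory_allocated()
print(f"allocated before: {before / 2**20:.2f} MiB, after: {after / 2**20:.2f} MiB")
```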