Hello everyone,
I suspect that my program has a memory leak. I have tried many approaches, but none of them has fixed it. I used memory_profiler to trace where the leak occurs; please see the output below. Thanks in advance for your kind help and effort.
Filename: implemented_model.py
Line # Mem usage Increment Occurences Line Contents
37 2630.652 MiB 2630.652 MiB 1 @profile
38 def __init__(self, in_dim, mem_dim, dropout1):
40 2630.652 MiB 0.000 MiB 1 super(ChildSumTreeLSTM, self).__init__()
41 2630.652 MiB 0.000 MiB 1 self.in_dim = in_dim
42 2630.652 MiB 0.000 MiB 1 self.mem_dim = mem_dim
43 2630.652 MiB 0.000 MiB 1 self.ioux = nn.Linear(self.in_dim, 3 * self.mem_dim)
44 2630.652 MiB 0.000 MiB 1 self.iouh = nn.Linear(self.mem_dim, 3 * self.mem_dim)
45 2630.652 MiB 0.000 MiB 1 self.fx = nn.Linear(self.in_dim, self.mem_dim)
46 2630.652 MiB 0.000 MiB 1 self.fh = nn.Linear(self.mem_dim, self.mem_dim)
47 2630.652 MiB 0.000 MiB 1 self.H = []
48 2630.652 MiB 0.000 MiB 1 self.drop = nn.Dropout(dropout1)
Filename: implemented_model.py
Line # Mem usage Increment Occurences Line Contents
50 2805.602 MiB 27153098.059 MiB 9920 @profile
51 def node_forward(self, inputs, child_c, child_h):
53 2805.602 MiB -1.633 MiB 9920 inputs = torch.unsqueeze(inputs, 0)
55 2805.602 MiB 0.398 MiB 9920 child_h_sum = torch.sum(child_h, dim=0)
56 2805.602 MiB 23.375 MiB 9920 iou = self.ioux(inputs) + self.iouh(child_h_sum)
57 2805.602 MiB 3.266 MiB 9920 i, o, u = torch.split(iou, iou.size(1) // 3, dim=1)
58 2805.602 MiB 29.422 MiB 9920 i, o, u = torch.sigmoid(i), torch.sigmoid(o), torch.tanh(u)
59 2805.602 MiB 29.305 MiB 9920 f = torch.sigmoid(self.fh(child_h) + self.fx(inputs).repeat(len(child_h), 1))
60 2805.602 MiB -0.344 MiB 9920 fc = torch.mul(f, child_c)
61 2805.602 MiB 30.652 MiB 9920 c = torch.mul(i, u) + torch.sum(fc, dim=0)
62 2805.602 MiB 10.227 MiB 9920 h = torch.mul(o, torch.tanh(c))
63 2805.602 MiB 3.008 MiB 9920 self.H.append(h)
64 2805.602 MiB -1.633 MiB 9920 return c, h
Filename: implemented_model.py
Line # Mem usage Increment Occurences Line Contents
65 2805.602 MiB 6972466.340 MiB 9920 @profile
66 def forward(self, data):
67 2805.602 MiB -1.336 MiB 9920 tree = data[0]
68 2805.602 MiB -1.336 MiB 9920 inputs = data[1]
71 2805.602 MiB -5.938 MiB 37134 _ = [self.forward([tree.children[idx], inputs]) for idx in range(tree.num_children)]
73 2805.602 MiB -1.633 MiB 9920 if tree.num_children == 0:
76 2805.602 MiB -0.594 MiB 5974 child_c = Var(inputs[tree.id].data.new(1, self.mem_dim).fill_(0.))
77 2805.602 MiB -0.594 MiB 5974 child_h = Var(inputs[tree.id].data.new(1, self.mem_dim).fill_(0.))
78 else:
79 2805.602 MiB -3.711 MiB 18694 child_c, child_h = zip(*map(lambda x: x.state, tree.children))
80 2805.602 MiB 1.535 MiB 3946 child_c, child_h = torch.cat(child_c, dim=0), torch.cat(child_h, dim=0)
82 2805.602 MiB 27153241.914 MiB 9920 tree.state = self.node_forward(inputs[tree.id], child_c, child_h)
83 2805.602 MiB -1.633 MiB 9920 return tree.state
Filename: implemented_model_feature.py
Line # Mem usage Increment Occurences Line Contents
10 2630.652 MiB 2630.652 MiB 1 @profile
11 def __init__(self, h_size, num_node_feature, num_classes, feature_representation_size, drop_out_rate):
13 2630.652 MiB 0.000 MiB 1 self.h_size = h_size
14 2630.652 MiB 0.000 MiB 1 self.num_node_feature = num_node_feature
15 2630.652 MiB 0.000 MiB 1 self.num_classes = num_classes
16 2630.652 MiB 0.000 MiB 1 self.feature_representation_size = feature_representation_size
17 2630.652 MiB 0.000 MiB 1 self.drop_out_rate = drop_out_rate
19 2630.652 MiB 2630.652 MiB 1 self.tree_lstm = ChildSumTreeLSTM(self.feature_representation_size, self.h_size, self.drop_out_rate)
21 2630.652 MiB 0.000 MiB 1 self.gru_combine = GRU(input_size=self.h_size, hidden_size=self.h_size, bidirectional=True, batch_first=True)
24 2630.652 MiB 0.000 MiB 1 self.conv = GCNConv(self.h_size, self.num_classes)
25 2630.652 MiB 0.000 MiB 1 self.dropout = Dropout(self.drop_out_rate)
26 2630.652 MiB 0.000 MiB 1 self.pool = nn.AdaptiveAvgPool2d((1, 2))
27 2630.652 MiB 0.000 MiB 1 self.connect = nn.Linear(self.h_size * self.num_node_feature * 2, self.h_size)
Filename: implemented_model_feature.py
Line # Mem usage Increment Occurences Line Contents
29 2805.602 MiB 8228.867 MiB 13 @profile
31 def forward(self, _Data):
33 2805.602 MiB 0.000 MiB 13 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
34 2805.602 MiB 0.000 MiB 13 _Data.to(device)
35 2805.602 MiB 0.000 MiB 13 batch, data, tree_data, edge_index = _Data.batch, _Data.my_data, _Data.tree_info, _Data.edge_index
38 2805.602 MiB 0.000 MiB 13 treeList = []
39 2805.602 MiB 0.000 MiB 66 for index, aData in enumerate(data):
40 2805.602 MiB 0.000 MiB 53 treeList = treeList + tree_data[index]
46 2805.602 MiB 0.000 MiB 13 feature_1 = treeList
47 2805.602 MiB 0.000 MiB 13 feature_vec1 = None
49 2805.602 MiB -0.297 MiB 2559 for i in range(len(feature_1)):
50 2805.602 MiB -0.297 MiB 2546 if i == 0:
51 2805.602 MiB 36286.398 MiB 13 _, feature_vec1 = self.tree_lstm(feature_1[i])
52 else:
53 2805.602 MiB 6936327.410 MiB 2533 _, feature_vec_temp = self.tree_lstm(feature_1[i])
54 2805.602 MiB 0.992 MiB 2533 feature_vec1 = torch.cat((feature_vec1, feature_vec_temp), 0)
55 2805.602 MiB 0.000 MiB 13 feature_vec1 = torch.reshape(feature_vec1, (-1, 1, self.h_size))
57 2805.602 MiB 0.000 MiB 13 feature_vec, _ = self.gru_combine(feature_vec1)
59 2805.602 MiB 0.000 MiB 13 feature_vec = self.dropout(feature_vec)
60 2805.602 MiB 0.000 MiB 13 feature_vec = torch.flatten(feature_vec, 1)
61 2805.602 MiB 0.000 MiB 13 feature_vec = self.connect(feature_vec)
62 2805.602 MiB 1.773 MiB 13 conv_output = self.conv(feature_vec, edge_index)
67 2805.602 MiB 0.000 MiB 13 pooled = global_mean_pool(conv_output, batch)
68 2805.602 MiB 0.000 MiB 13 output = nn.Softmax(dim=1)(pooled)
69 2805.602 MiB 0.000 MiB 13 return output
Filename: main.py
Line # Mem usage Increment Occurences Line Contents
40 463.707 MiB 463.707 MiB 1 @profile
41 def start_training(dataset, batchSize, epochSize, modelName):
42 463.707 MiB 0.000 MiB 1 random.shuffle(dataset)
44 463.707 MiB 0.000 MiB 1 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
48 2630.723 MiB -0.352 MiB 56 for i, data in enumerate(dataset):
52 2628.660 MiB 2045.035 MiB 55 data.y = torch.tensor([data.y], device=device).detach()
53 2630.723 MiB -1.781 MiB 165 for index, _my_datas in enumerate(data.my_data):
54 2630.723 MiB 121.445 MiB 110 data.my_data[index] = rnn_utils.pad_sequence(_my_datas, batch_first=True, padding_value=0)
63 2630.723 MiB 0.000 MiB 1 print("dataset length ", len(dataset))
64 2630.723 MiB 0.000 MiB 1 trainLen = int(len(dataset) * 0.8)
65 2630.723 MiB 0.000 MiB 1 train_dataset = dataset[:trainLen]
66 2630.723 MiB 0.000 MiB 1 test_dataset = dataset[trainLen:]
67 2630.652 MiB -0.070 MiB 1 random.shuffle(dataset)
70 2630.652 MiB 0.000 MiB 1 dataset0 = []
71 2630.652 MiB 0.000 MiB 1 dataset1 = []
72 2630.652 MiB 0.000 MiB 45 for index, data in enumerate(train_dataset):
73 2630.652 MiB 0.000 MiB 44 if data.y == 0:
74 2630.652 MiB 0.000 MiB 21 dataset0.append(data)
75 else:
76 2630.652 MiB 0.000 MiB 23 dataset1.append(data)
77 2630.652 MiB 0.000 MiB 1 print(len(dataset0))
78 2630.652 MiB 0.000 MiB 1 print(len(dataset1))
79 2630.652 MiB 0.000 MiB 1 trainLength = min(len(dataset0), len(dataset1))
80 2630.652 MiB 0.000 MiB 1 train_dataset = dataset0[:trainLength]
81 2630.652 MiB 0.000 MiB 1 train_dataset.extend(dataset1[:trainLength])
84 2630.652 MiB 0.000 MiB 1 train_dataLoader = DataLoader(ownDataset(train_dataset), batch_size=batchSize, shuffle=True)
85 2630.652 MiB 0.000 MiB 1 test_dataLoader = DataLoader(ownDataset(test_dataset), batch_size=1, shuffle=True)
113 2630.652 MiB 0.000 MiB 1 model = model_1(h_size=64, num_node_feature=1, num_classes=2, feature_representation_size=100,
114 2630.652 MiB 2630.652 MiB 1 drop_out_rate=0.5)
122 2634.883 MiB 4.230 MiB 1 model.to(device)
123 2805.961 MiB 2805.961 MiB 1 train(epochs=epochSize, trainLoader=train_dataLoader, testLoader=test_dataLoader, model=model, learning_rate=0.0001, modelName=modelName)
Filename: main.py
Line # Mem usage Increment Occurences Line Contents
155 2634.883 MiB 2634.883 MiB 1 @profile
156 def train(epochs, trainLoader, testLoader, model, learning_rate, modelName):
157 2634.883 MiB 0.000 MiB 1 print("#### Start training ####")
159 2634.883 MiB 0.000 MiB 1 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
161 2635.113 MiB 0.230 MiB 1 visLoss = vis.visLoss(name='train loss')
162 2635.113 MiB 0.000 MiB 1 visEvaluate = []
163 2635.113 MiB 0.000 MiB 1 visAcc = vis.visLoss(name='Accuracy')
164 2635.113 MiB 0.000 MiB 1 visPrec = vis.visLoss(name='Precision')
165 2635.113 MiB 0.000 MiB 1 visRecall = vis.visLoss(name='Recall')
166 2635.113 MiB 0.000 MiB 1 visF1 = vis.visLoss(name='F1 score')
167 2635.113 MiB 0.000 MiB 1 visEvaluate.append(visAcc)
168 2635.113 MiB 0.000 MiB 1 visEvaluate.append(visPrec)
169 2635.113 MiB 0.000 MiB 1 visEvaluate.append(visRecall)
170 2635.113 MiB 0.000 MiB 1 visEvaluate.append(visF1)
172 2635.113 MiB 0.000 MiB 1 criterion = nn.CrossEntropyLoss()
173 2635.113 MiB 0.000 MiB 1 optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) # create an optimizer
174 2635.113 MiB 0.000 MiB 1 model.train()
175 2635.113 MiB 0.000 MiB 1 try:
176 2635.113 MiB 0.000 MiB 1 lossConvergence = []
177 2635.113 MiB 0.000 MiB 1 print(" ### training model in main ### ")
178 2805.961 MiB 0.000 MiB 2 for e in range(epochs):
179 2635.113 MiB 0.000 MiB 1 model.train()
180 2635.113 MiB 0.000 MiB 1 lossData = 0
181 2635.113 MiB 0.000 MiB 1 lossEpoch = []
182 2805.602 MiB 0.000 MiB 3 for index, _data in enumerate(tqdm(trainLoader, leave=False)):
184 2802.648 MiB 5572.758 MiB 2 out = model(_data)
185 2802.648 MiB 2.754 MiB 2 loss = criterion(out, _data.y)
189 2802.648 MiB 0.000 MiB 2 optimizer.zero_grad() # if don’t call zero_grad, the grad of each batch will be accumulated
190 2805.602 MiB 17.578 MiB 2 loss.backward()
191 2805.602 MiB 0.664 MiB 2 optimizer.step()
192 2805.602 MiB 0.000 MiB 2 sleep(0.05)
193 2805.602 MiB 0.000 MiB 2 if index % 20 == 0:
194 2788.152 MiB 0.000 MiB 1 print('epoch: {}, batch: {}, loss: {}'.format(e + 1, index + 1, loss.data))
195 2805.602 MiB 0.000 MiB 2 lossData = loss.data
196 2805.602 MiB 0.000 MiB 2 lossEpoch.append(loss.item())
197 2805.602 MiB 0.000 MiB 1 print("loss change in epoch")
198 2805.602 MiB 0.000 MiB 1 print(lossEpoch)
199 2805.602 MiB 0.000 MiB 1 visLoss.pushValue(lossData.item(), e)
200 2805.602 MiB 0.000 MiB 1 lossConvergence.append(lossData.item())
201 2805.961 MiB 2805.961 MiB 1 results = evaluate_metrics(model=model, test_loader=testLoader)
202 2805.961 MiB 0.000 MiB 5 for i in range(len(results)):
203 2805.961 MiB 0.000 MiB 4 visEvaluate[i].pushValue(results[i], e)
204 2805.961 MiB 0.000 MiB 1 print("loss convergence: ")
205 2805.961 MiB 0.000 MiB 1 print(lossConvergence)
206 2805.961 MiB 0.000 MiB 1 modelPath = 'model/model' + modelName
207 2805.961 MiB 0.000 MiB 1 torch.save(model.state_dict(), modelPath) # todo, finish model saving part
208 except KeyboardInterrupt:
209 evaluate_metrics(model=model, test_loader=testLoader)