RuntimeError: DataLoader worker (pid 20655) is killed by signal: Killed

pyxiea · December 6, 2020, 2:58pm

I have searched many related posts and already know that this is a memory problem, but the weird thing is that when I use (almost) the same Dataset in the TensorFlow version, the error goes away.

By "TensorFlow version" I mean that I only use the DataLoader to load the data, while the code that defines and trains the model is written entirely in TensorFlow. Here is the definition of the dataset:

class UsptoDataset(Dataset):
    """Map-style dataset built from a "main" text file and a "tree" text file.

    Every four consecutive lines of the main file form one sample, and every
    block of lines in the tree file (blocks are separated by a ``<BR>`` line)
    belongs to the sample with the same index.

    The class relies on names defined elsewhere in the module: ``project``
    (prefix prepended to both file names), ``rules_len``, ``classnum``,
    ``line2rule_nl``, ``line2mask`` and ``read_tree_path``.
    """

    # Keys of every sample dict and every collated batch dict, in output order.
    _FIELDS = (
        'syn_tree_indices',
        'syn_rule_nl_left',
        'syn_rule_nl_right',
        'rea_tree_indices',
        'rea_rule_nl_left',
        'rea_rule_nl_right',
        'class_mask',
        'query_paths',
        'labels',
        'parent_matrix',
        'syn_parent_matrix',
        'path_lens',
    )

    # Fields that torch_collate_fn converts to float32; all others become long.
    _FLOAT_FIELDS = frozenset(('class_mask', 'parent_matrix', 'syn_parent_matrix'))

    def __init__(self, main_file, tree_file):
        """Read both files into memory and group the tree file into blocks.

        Args:
            main_file: Name of the main file, appended to ``project``.
            tree_file: Name of the tree file, appended to ``project``.

        Raises:
            AssertionError: If the number of complete four-line records in the
                main file differs from the number of tree blocks.
        """
        # The context manager closes both handles even if reading fails.
        with open(project + main_file, "r") as f, open(project + tree_file, "r") as f_tree:
            self.main_data = [raw.strip() for raw in f]
            paths = [raw.strip() for raw in f_tree]

        # Split the tree file on <BR>: every block (arbitrary number of lines)
        # separated by <BR> corresponds to one sample.
        self.tree_data = []
        block = []
        for p in paths:
            if p == '<BR>':
                # Hand the finished block over and start a fresh list; the
                # entries are immutable strings, so no deep copy is needed.
                self.tree_data.append(block)
                block = []
            else:
                block.append(p)
        if block:
            # Final block that is not terminated by a <BR> line.
            self.tree_data.append(block)

        # Every four lines in main_file correspond to a sample. Floor division
        # means up to three trailing lines after the last full record are
        # tolerated.
        assert len(self.main_data) // 4 == len(self.tree_data), (
            "main file has %d four-line records but tree file has %d blocks"
            % (len(self.main_data) // 4, len(self.tree_data))
        )

    def __len__(self):
        """Return the number of samples (one per tree block)."""
        return len(self.tree_data)

    def __getitem__(self, item):
        """Build the sample dict for index ``item``.

        Lines ``4*item`` and ``4*item + 2`` of the main file are parsed as
        whitespace-separated integers and zero-padded to ``rules_len``; lines
        ``4*item + 1`` and ``4*item + 3`` are passed to ``line2mask``.

        Returns:
            A dict keyed by the names in ``_FIELDS``.
        """
        first = item * 4

        # Record line 0: truncated and zero-padded to rules_len.
        syn_line = self.main_data[first]
        syn_ids = [int(tok) for tok in syn_line.split()][:rules_len]
        syn_tree_indices = np.array(syn_ids + [0] * (rules_len - len(syn_ids)))
        syn_rule_nl_left, syn_rule_nl_right, _ = line2rule_nl(syn_line)

        # Record line 1.
        syn_parent_matrix, _ = line2mask(self.main_data[first + 1], rules_len)

        # Record line 2: kept to rules_len - 1 ids so that the input sequence
        # (prefixed with classnum) and the labels both have length rules_len.
        rea_line = self.main_data[first + 2]
        rea_ids = [int(tok) for tok in rea_line.split()][:rules_len - 1]
        rea_tree_indices = np.array([classnum] + rea_ids + [0] * (rules_len - len(rea_ids) - 1))
        rea_rule_nl_left, rea_rule_nl_right, class_mask = line2rule_nl(rea_line)
        query_paths = read_tree_path(self.tree_data[item])
        labels = np.array(rea_ids + [0] * (rules_len - len(rea_ids)))

        # Record line 3.
        parent_matrix, path_lens = line2mask(self.main_data[first + 3], rules_len)

        return {'syn_tree_indices': syn_tree_indices,
                'syn_rule_nl_left': syn_rule_nl_left,
                'syn_rule_nl_right': syn_rule_nl_right,
                'rea_tree_indices': rea_tree_indices,
                'rea_rule_nl_left': rea_rule_nl_left,
                'rea_rule_nl_right': rea_rule_nl_right,
                'class_mask': class_mask,
                'query_paths': query_paths,
                'labels': labels,
                'parent_matrix': parent_matrix,
                'syn_parent_matrix': syn_parent_matrix,
                'path_lens': path_lens}

    @staticmethod
    def collate_fn(batch):
        """Stack each field of the sample dicts along a new leading axis.

        Args:
            batch: Sequence of sample dicts as returned by ``__getitem__``.

        Returns:
            A dict with the same keys whose values are ``numpy`` arrays.
        """
        return {
            name: np.stack([sample[name] for sample in batch], axis=0)
            for name in UsptoDataset._FIELDS
        }

    @staticmethod
    def torch_collate_fn(batch):
        """Stack each field like ``collate_fn`` and convert it to a tensor.

        ``class_mask``, ``parent_matrix`` and ``syn_parent_matrix`` become
        ``torch.float32`` tensors; every other field becomes ``torch.long``.

        Args:
            batch: Sequence of sample dicts as returned by ``__getitem__``.

        Returns:
            A dict with the same keys whose values are ``torch`` tensors.
        """
        return_dict = {}
        for name in UsptoDataset._FIELDS:
            stacked = np.stack([sample[name] for sample in batch], axis=0)
            if name in UsptoDataset._FLOAT_FIELDS:
                target_dtype = torch.float32
            else:
                target_dtype = torch.long
            return_dict[name] = torch.tensor(stacked, dtype=target_dtype)
        return return_dict

I define two collate functions and use them in different code: collate_fn in TensorFlow and torch_collate_fn in PyTorch. As I mentioned before, the former works fine but the latter throws the error shown in the title. The two functions are essentially the same; the only difference is that TensorFlow needs ndarrays while PyTorch needs tensors.

Here is how I use the dataset to define a dataloader:

# Shuffled training batches, loaded with 4 workers and collated into torch tensors.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    collate_fn=UsptoDataset.torch_collate_fn,
)

Any ideas why this happens, and how I can modify my code to make it work with PyTorch? Thanks in advance!

BTW, to work around the error `RuntimeError: received 0 items of ancdata`, I use the following code: `torch.multiprocessing.set_sharing_strategy('file_system')`.

pyxiea · December 7, 2020, 12:13pm

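I found the reason: I wrapped the DataLoader in `cycle`, which causes a memory leak (if this is `itertools.cycle`, it saves a copy of every batch it yields, so memory keeps growing until the worker is killed).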