I have searched many related posts and already know that this is a memory problem, but the weird thing is that when I use (almost) the same Dataset in the TensorFlow version, the error goes away.
By "TensorFlow version" I mean that I use only the PyTorch DataLoader to load the data, while the code that defines and trains the model is written entirely in TensorFlow. Here is the definition of the dataset:
class UsptoDataset(Dataset):
    """Map-style dataset built from a main file and a tree file.

    Every sample owns four consecutive lines of the main file and one
    ``<BR>``-separated block of lines of the tree file. Both files are read
    fully into memory (as stripped strings) when the dataset is constructed;
    the numeric arrays are built lazily in ``__getitem__``.

    The class relies on names defined elsewhere in the module: ``project``
    (prefix prepended to both file names), ``rules_len``, ``classnum``,
    ``line2rule_nl``, ``line2mask`` and ``read_tree_path``.
    """

    # Keys of every sample / batch dictionary, in the order they are emitted.
    _FIELDS = (
        'syn_tree_indices',
        'syn_rule_nl_left',
        'syn_rule_nl_right',
        'rea_tree_indices',
        'rea_rule_nl_left',
        'rea_rule_nl_right',
        'class_mask',
        'query_paths',
        'labels',
        'parent_matrix',
        'syn_parent_matrix',
        'path_lens',
    )
    # Fields that torch_collate_fn converts to float32; all others become int64.
    _FLOAT_FIELDS = frozenset(('class_mask', 'parent_matrix', 'syn_parent_matrix'))

    def __init__(self, main_file, tree_file):
        """Read and index both input files.

        Args:
            main_file: File name (appended to ``project``) of the main file,
                which holds four lines per sample.
            tree_file: File name (appended to ``project``) of the tree file,
                in which the lines of one sample are terminated by a
                ``<BR>`` line.

        Raises:
            AssertionError: If the number of tree blocks differs from the
                number of complete four-line groups in the main file.
        """
        # Context managers guarantee the handles are released even when
        # parsing or the consistency check below fails.
        with open(project + main_file, "r") as f:
            self.main_data = [line.strip() for line in f]
        with open(project + tree_file, "r") as f_tree:
            paths = [line.strip() for line in f_tree]

        # Split the tree file on <BR>: every block (an arbitrary number of
        # lines) corresponds to one sample. The lines are immutable strings,
        # so handing over the list and starting a fresh one replaces the
        # former deepcopy-and-clear without changing the result.
        self.tree_data = []
        block = []
        for path in paths:
            if path == '<BR>':
                self.tree_data.append(block)
                block = []
            else:
                block.append(path)
        if block:
            # Trailing block that is not terminated by <BR>.
            self.tree_data.append(block)

        # Every four lines in the main file correspond to one sample.
        assert len(self.main_data) // 4 == len(self.tree_data), (
            "main file holds %d four-line samples but tree file holds %d blocks"
            % (len(self.main_data) // 4, len(self.tree_data))
        )

    def __len__(self):
        """Return the number of samples (one per tree block)."""
        return len(self.tree_data)

    def __getitem__(self, item):
        """Build the dictionary of arrays for sample number ``item``.

        The four main-file lines of the sample are consumed in order:
        line 0 yields the ``syn_*`` index vector and rule fields, line 1 the
        ``syn_parent_matrix``, line 2 the ``rea_*`` fields, ``class_mask``
        and ``labels``, and line 3 ``parent_matrix`` and ``path_lens``.

        Args:
            item: Integer sample index.

        Returns:
            A dict with the keys listed in ``_FIELDS``.
        """
        base = item * 4

        # Line 0: integer ids truncated / zero-padded to rules_len.
        syn_line = self.main_data[base]
        syn_ids = [int(tok) for tok in syn_line.split()][:rules_len]
        syn_tree_indices = np.array(syn_ids + [0] * (rules_len - len(syn_ids)))
        syn_rule_nl_left, syn_rule_nl_right, _ = line2rule_nl(syn_line)

        # Line 1.
        syn_parent_matrix, _ = line2mask(self.main_data[base + 1], rules_len)

        # Line 2: the ids are cut to rules_len - 1 so that, with classnum
        # prepended, rea_tree_indices has length rules_len.
        rea_line = self.main_data[base + 2]
        rea_ids = [int(tok) for tok in rea_line.split()][:rules_len - 1]
        rea_tree_indices = np.array(
            [classnum] + rea_ids + [0] * (rules_len - len(rea_ids) - 1))
        rea_rule_nl_left, rea_rule_nl_right, class_mask = line2rule_nl(rea_line)

        query_paths = read_tree_path(self.tree_data[item])

        # Same ids as rea_tree_indices but without the classnum prefix,
        # zero-padded to rules_len.
        labels = np.array(rea_ids + [0] * (rules_len - len(rea_ids)))

        # Line 3.
        parent_matrix, path_lens = line2mask(self.main_data[base + 3], rules_len)

        return {'syn_tree_indices': syn_tree_indices,
                'syn_rule_nl_left': syn_rule_nl_left,
                'syn_rule_nl_right': syn_rule_nl_right,
                'rea_tree_indices': rea_tree_indices,
                'rea_rule_nl_left': rea_rule_nl_left,
                'rea_rule_nl_right': rea_rule_nl_right,
                'class_mask': class_mask,
                'query_paths': query_paths,
                'labels': labels,
                'parent_matrix': parent_matrix,
                'syn_parent_matrix': syn_parent_matrix,
                'path_lens': path_lens}

    @staticmethod
    def _stack_field(batch, key):
        """Stack ``sample[key]`` of every sample along a new leading axis."""
        return np.stack([sample[key] for sample in batch], axis=0)

    @staticmethod
    def collate_fn(batch):
        """Collate a list of sample dicts into a dict of NumPy arrays.

        Args:
            batch: List of dictionaries as returned by ``__getitem__``.

        Returns:
            A dict with the keys of ``_FIELDS``; each value is the
            ``np.stack`` (axis 0) of the corresponding per-sample values.
        """
        return {key: UsptoDataset._stack_field(batch, key)
                for key in UsptoDataset._FIELDS}

    @staticmethod
    def torch_collate_fn(batch):
        """Collate a list of sample dicts into a dict of torch tensors.

        Identical to ``collate_fn`` except that every stacked array is
        converted with ``torch.tensor``: the fields in ``_FLOAT_FIELDS``
        become ``torch.float32``, all other fields ``torch.long``.

        Args:
            batch: List of dictionaries as returned by ``__getitem__``.

        Returns:
            A dict with the keys of ``_FIELDS`` mapping to tensors.
        """
        float_fields = UsptoDataset._FLOAT_FIELDS
        # Stack and convert one field at a time so that only a single
        # intermediate NumPy batch array is alive at any moment.
        return {
            key: torch.tensor(
                UsptoDataset._stack_field(batch, key),
                dtype=torch.float32 if key in float_fields else torch.long,
            )
            for key in UsptoDataset._FIELDS
        }
I define two collate functions and use them in different code: collate_fn in the TensorFlow code and torch_collate_fn in the PyTorch code. As I mentioned before, the former works fine, but the latter throws the error shown in the title. One can see that the two functions are essentially the same; the only difference is that TensorFlow needs ndarray and PyTorch needs tensor.
Here is how I use the dataset to define a dataloader:
# Training loader: reshuffled batches, loaded by four worker processes and
# collated into torch tensors by UsptoDataset.torch_collate_fn.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    collate_fn=UsptoDataset.torch_collate_fn,
)
Any ideas why this happens, and how I can modify my code to make it work with PyTorch? Thanks in advance!
BTW, to work around the error "RuntimeError: received 0 items of ancdata", I use the following code: torch.multiprocessing.set_sharing_strategy('file_system')