I am using this custom torchtext data loader:
def load_dataset_file(filename):
    """Load a gzip-compressed pickle file and return the object stored in it.

    Arguments:
        filename: Location of the gzip-compressed pickle file; handed
            unchanged to ``gzip.open``.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.
    """
    # SECURITY: unpickling can execute arbitrary code, so only load files
    # that come from a trusted source.
    with gzip.open(filename, "rb") as f:
        return pickle.load(f)
class SignTranslationDataset(data.Dataset):
    """Defines a dataset for machine translation."""

    @staticmethod
    def sort_key(ex):
        """Return a sort key built from the lengths of ``ex.sgn`` and ``ex.txt``."""
        return data.interleave_keys(len(ex.sgn), len(ex.txt))

    def __init__(
        self,
        path: str,
        fields: Tuple[RawField, RawField, Field, Field, Field],
        **kwargs
    ):
        """Create a SignTranslationDataset given paths and fields.

        Arguments:
            path: Path of one annotation file, or a list/tuple of such
                paths. Each file is read with ``load_dataset_file`` and is
                iterated as a sequence of dicts with the keys ``name``,
                ``signer``, ``gloss``, ``text`` and ``sign``.
            fields: Either five field objects, which are paired (in order)
                with the names ``sequence``, ``signer``, ``sgn``, ``gls``
                and ``txt``, or an already prepared list of
                ``(name, field)`` pairs.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
                NOTE(review): presumably a ``filter_pred`` given here is
                applied by data.Dataset and can drop examples, so a
                dataset length of zero may come from that filter rather
                than from loading. Confirm against the torchtext version
                in use.

        Raises:
            AssertionError: If two entries share a ``name`` but differ in
                ``signer``, ``gloss`` or ``text``.
        """
        if not isinstance(fields[0], (tuple, list)):
            fields = [
                ("sequence", fields[0]),
                ("signer", fields[1]),
                ("sgn", fields[2]),
                ("gls", fields[3]),
                ("txt", fields[4]),
            ]

        # Accept a single path as well as a list/tuple of paths.
        if not isinstance(path, (list, tuple)):
            path = [path]

        # Entries are keyed by sequence name so that features for the same
        # sequence found in several entries/files are merged into one sample.
        samples = {}
        for annotation_file in path:
            tmp = load_dataset_file(Path(annotation_file).expanduser())
            for s in tmp:
                seq_id = s["name"]
                if seq_id in samples:
                    known = samples[seq_id]
                    # Only the features may differ between duplicates; the
                    # annotations have to agree.
                    for key in ("name", "signer", "gloss", "text"):
                        assert known[key] == s[key], (
                            f"Conflicting '{key}' for sequence {seq_id!r} "
                            f"in {annotation_file}"
                        )
                    # Join the feature tensors along dimension 1.
                    known["sign"] = torch.cat([known["sign"], s["sign"]], dim=1)
                else:
                    samples[seq_id] = {
                        "name": s["name"],
                        "signer": s["signer"],
                        "gloss": s["gloss"],
                        "text": s["text"],
                        "sign": s["sign"],
                    }

        examples = [
            data.Example.fromlist(
                [
                    sample["name"],
                    sample["signer"],
                    # This is for numerical stability
                    sample["sign"] + 1e-8,
                    sample["gloss"].strip(),
                    sample["text"].strip(),
                ],
                fields,
            )
            for sample in samples.values()
        ]
        super().__init__(examples, fields, **kwargs)
A sample of my dataset file is given below:
[{'name': '0', 'signer': 'Signer0', 'gloss': 'are you free today', 'text': 'are you free today', 'sign': tensor([[[[0.2514]],
[[0.2455]],
[[0.1973]],
[[0.2011]]],
[[[0.1832]],
[[0.2836]],
[[0.3214]],
[[0.2282]]],
[[[0.2169]],
[[0.2362]],
[[0.3123]],
[[0.3110]]],
...,
[[[0.2704]],
[[0.2173]],
[[0.2105]],
[[0.1930]]],
[[[0.1278]],
[[0.2460]],
[[0.2580]],
[[0.2280]]],
[[[0.0148]],
[[0.1276]],
[[0.2752]],
[[0.3475]]]])}]
However, when I check the length of my training data it is zero, whereas the lengths of my validation and test data are not. I also tried using the validation file as my training data, but its length then became zero as well. Any ideas, or things I should check?
Thank you in advance