Torchtext not processing my data

peony · December 7, 2023, 9:33am

I am using this custom torchtext data loader:

def load_dataset_file(filename):
    """Return the object pickled inside the gzip-compressed file *filename*."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with gzip.open(filename, "rb") as handle:
        contents = pickle.load(handle)
    return contents


class SignTranslationDataset(data.Dataset):
    """Dataset of sign-language translation examples read from annotation files."""

    @staticmethod
    def sort_key(ex):
        # Key built from the lengths of the example's sign sequence and text.
        sgn_len, txt_len = len(ex.sgn), len(ex.txt)
        return data.interleave_keys(sgn_len, txt_len)

    def __init__(
        self,
        path: str,
        fields: Tuple[RawField, RawField, Field, Field, Field],
        **kwargs
    ):
        """Create a SignTranslationDataset from annotation files and fields.

        Every annotation file is read with ``load_dataset_file`` and is
        expected to yield entries with the keys "name", "signer", "gloss",
        "text" and "sign". Entries sharing the same "name" (for example
        across several files) are merged: their "signer", "gloss" and "text"
        must be equal, and their "sign" tensors are concatenated along
        axis 1.

        Arguments:
            path: Path to one annotation file, or a list of such paths.
            fields: The fields for the sequence name, signer, sign features,
                gloss and text, in that order. Unless the first element is
                already a tuple or list, they are paired with the names
                "sequence", "signer", "sgn", "gls" and "txt".
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """
        field_names = ("sequence", "signer", "sgn", "gls", "txt")
        if not isinstance(fields[0], (tuple, list)):
            fields = [(label, fields[idx]) for idx, label in enumerate(field_names)]

        annotation_files = path if isinstance(path, list) else [path]

        meta_keys = ("name", "signer", "gloss", "text")
        merged = {}
        for annotation_file in annotation_files:
            entries = load_dataset_file(Path(annotation_file).expanduser())
            for entry in entries:
                seq_id = entry["name"]
                known = merged.get(seq_id)
                if known is None:
                    # First time this sequence is seen: keep the known keys.
                    merged[seq_id] = {
                        key: entry[key] for key in meta_keys + ("sign",)
                    }
                    continue
                # Same sequence seen again: the metadata has to agree before
                # the sign tensors are joined.
                for key in meta_keys:
                    assert known[key] == entry[key]
                known["sign"] = torch.cat([known["sign"], entry["sign"]], axis=1)

        examples = [
            data.Example.fromlist(
                [
                    record["name"],
                    record["signer"],
                    # Small offset added for numerical stability.
                    record["sign"] + 1e-8,
                    record["gloss"].strip(),
                    record["text"].strip(),
                ],
                fields,
            )
            for record in merged.values()
        ]
        super().__init__(examples, fields, **kwargs)

A sample of my dataset file is given below:

[{'name': '0', 'signer': 'Signer0', 'gloss': 'are you free today', 'text': 'are you free today', 'sign': tensor([[[[0.2514]],

         [[0.2455]],

         [[0.1973]],

         [[0.2011]]],


        [[[0.1832]],

         [[0.2836]],

         [[0.3214]],

         [[0.2282]]],


        [[[0.2169]],

         [[0.2362]],

         [[0.3123]],

         [[0.3110]]],


        ...,


        [[[0.2704]],

         [[0.2173]],

         [[0.2105]],

         [[0.1930]]],


        [[[0.1278]],

         [[0.2460]],

         [[0.2580]],

         [[0.2280]]],


        [[[0.0148]],

         [[0.1276]],

         [[0.2752]],

         [[0.3475]]]])}]

However, when I check the length of my training data it is zero, whereas my validation and test data are not empty. I also tried using the validation data as my training data, but then its length became zero as well. Any ideas, or things I should check?

Thank you in advance

peony · December 8, 2023, 4:29pm

I’ve got an update: when I don’t filter by max_sent_length, I get the correct len(data):

    # Build the training set with the max_sent_length filter deliberately
    # disabled (predicate left commented out below); without it, len(data)
    # comes out correct.
    train_data = SignTranslationDataset(
        path=train_paths,
        fields=(sequence_field, signer_field, sgn_field, gls_field, txt_field),
        #filter_pred=lambda x: len(vars(x)["sgn"]) <= max_sent_length
        #and len(vars(x)["txt"]) <= max_sent_length
    )

However, I get the following error:

  return torch.stack([torch.stack(ft, dim=0) for ft in features], dim=0)
RuntimeError: stack expects each tensor to be equal size, but got [1024, 7] at entry 0 and [1024, 3] at entry 1

Any ideas for me? Thank you in advance