Tokenize multiple files

Hi All,

I am trying to run the following loop to tokenize multiple files.

    for file in all_files:
        data = TextLoader(file=file, tokenizer=tokenizer)
        train_dataloader = DataLoader(data, batch_size=120, shuffle=False)  # Shuffle should be set to False

I have defined both TextLoader and DataLoader, but the loop doesn't seem to work.

I get the following error:

    ValueError                                Traceback (most recent call last)
    in <cell line: 27>()
         26
         27 for file in all_files:
    ---> 28     data = TextLoader(file=file, tokenizer=tokenizer)
         29     train_dataloader = DataLoader(data, batch_size=120, shuffle=False)  # Shuffle should be set to False
         30

    5 frames
    /usr/local/lib/python3.10/dist-packages/pandas/io/json/_json.py in _parse(self)
       1318         if orient == "columns":
       1319             self.obj = DataFrame(
    -> 1320                 loads(json, precise_float=self.precise_float), dtype=None
       1321             )
       1322         elif orient == "split":
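
For context, TextLoader reads each JSON file with pandas and tokenizes it. A simplified sketch of what it does (assuming a Hugging Face tokenizer; the "text" column name and the padding settings are assumptions, not my exact code):

    import pandas as pd
    from torch.utils.data import Dataset

    class TextLoader(Dataset):
        def __init__(self, file, tokenizer):
            # pd.read_json is the call that ends up in pandas/io/json/_json.py,
            # where the ValueError above is raised
            df = pd.read_json(file)
            self.encodings = tokenizer(
                df["text"].tolist(),  # "text" column name is an assumption
                truncation=True,
                padding=True,
                return_tensors="pt",
            )

        def __len__(self):
            return self.encodings["input_ids"].shape[0]

        def __getitem__(self, idx):
            return {key: val[idx] for key, val in self.encodings.items()}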

Regards,

James

I don’t see the actual error in the stacktrace, so did you forget to paste it?
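
From the frames that are shown it looks as if the JSON parsing in pandas fails for one of the files. You could wrap the loop to see which file raises and to capture the full error message (a quick sketch using your variable names):

    for file in all_files:
        try:
            data = TextLoader(file=file, tokenizer=tokenizer)
        except ValueError as err:
            # print the offending file, then re-raise so the complete
            # traceback (including the actual error message) is shown
            print(f"parsing failed for {file}: {err}")
            raise
        train_dataloader = DataLoader(data, batch_size=120, shuffle=False)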

Hi ptrblck

Thank you for the feedback.

Well, I get this:

    if orient == "columns":
        self.obj = DataFrame(
            loads(json, precise_float=self.precise_float), dtype=None
        )
    elif orient == "split":
        decoded = {
            str(k): v
            for k, v in loads(json, precise_float=self.precise_float).items()
        }
        self.check_keys_split(decoded)
        orig_names = [
            (tuple(col) if isinstance(col, list) else col)
            for col in decoded["columns"]
        ]
        decoded["columns"] = dedup_names(
            orig_names,
            is_potential_multi_index(orig_names, None),
        )
        self.obj = DataFrame(dtype=None, **decoded)
    elif orient == "index":
        self.obj = DataFrame.from_dict(
            loads(json, precise_float=self.precise_float),
            dtype=None,
            orient="index",
        )
    elif orient == "table":
        self.obj = parse_table_schema(json, precise_float=self.precise_float)
    else:
        self.obj = DataFrame(
            loads(json, precise_float=self.precise_float), dtype=None
        )

def _process_converter(self, f, filt=None) -> None:
    """
    Take a conversion function and possibly recreate the frame.
    """
    if filt is None:
        filt = lambda col, c: True

    obj = self.obj
    assert obj is not None  # for mypy

    needs_new_obj = False
    new_obj = {}
    for i, (col, c) in enumerate(obj.items()):
        if filt(col, c):
            new_data, result = f(col, c)
            if result:
                c = new_data
                needs_new_obj = True
        new_obj[i] = c

    if needs_new_obj:
        # possibly handle dup columns
        new_frame = DataFrame(new_obj, index=obj.index)
        new_frame.columns = obj.columns
        self.obj = new_frame

def _try_convert_types(self) -> None:
    if self.obj is None:
        return
    if self.convert_dates:
        self._try_convert_dates()

    self._process_converter(
        lambda col, c: self._try_convert_data(col, c, convert_dates=False)
    )

def _try_convert_dates(self) -> None:
    if self.obj is None:
        return

    # our columns to parse
    convert_dates_list_bool = self.convert_dates
    if isinstance(convert_dates_list_bool, bool):
        convert_dates_list_bool = []
    convert_dates = set(convert_dates_list_bool)

    def is_ok(col) -> bool:
        """
        Return if this col is ok to try for a date parse.
        """
        if not isinstance(col, str):
            return False

        col_lower = col.lower()
        if (
            col_lower.endswith("_at")
            or col_lower.endswith("_time")
            or col_lower == "modified"
            or col_lower == "date"
            or col_lower == "datetime"
            or col_lower.startswith("timestamp")
        ):
            return True
        return False

    self._process_converter(
        lambda col, c: self._try_convert_to_date(c),
        lambda col, c: (
            (self.keep_default_dates and is_ok(col)) or col in convert_dates
        ),
    )

Regards,

James.
