Hi All,
I am trying to run the following loop to tokenize multiple files.
# Build a dataset and a DataLoader for every input file.
# NOTE(review): `train_dataloader` is rebound on each iteration, so only the
# loader created for the last file is still referenced after the loop —
# confirm this is intended.
for file in all_files:
    data = TextLoader(file=file, tokenizer=tokenizer)
    train_dataloader = DataLoader(data, batch_size=120, shuffle=False)  # Shuffle should be set to False
I have defined TextLoader and DataLoader, but they don’t seem to be picked up.
I get the following error message:
ValueError Traceback (most recent call last)
in <cell line: 27>()
26
27 for file in all_files:
---> 28 data = TextLoader(file=file,tokenizer=tokenizer)
29 train_dataloader = DataLoader(data, batch_size=120, shuffle=False) # Shuffle should be set to False
30
5 frames
/usr/local/lib/python3.10/dist-packages/pandas/io/json/_json.py in _parse(self)
1318 if orient == "columns":
1319 self.obj = DataFrame(
-> 1320 loads(json, precise_float=self.precise_float), dtype=None
1321 )
1322 elif orient == "split":
Regards,
James
I don’t see the actual error in the stacktrace, so did you forget to paste it?
1 Like
Hi ptrblck
Thank you for the feedback.
Well, this is what I get:
if orient == "columns":
self.obj = DataFrame(
loads(json, precise_float=self.precise_float), dtype=None
)
elif orient == "split":
decoded = {
str(k): v
for k, v in loads(json, precise_float=self.precise_float).items()
}
self.check_keys_split(decoded)
orig_names = [
(tuple(col) if isinstance(col, list) else col)
for col in decoded["columns"]
]
decoded["columns"] = dedup_names(
orig_names,
is_potential_multi_index(orig_names, None),
)
self.obj = DataFrame(dtype=None, **decoded)
elif orient == "index":
self.obj = DataFrame.from_dict(
loads(json, precise_float=self.precise_float),
dtype=None,
orient="index",
)
elif orient == "table":
self.obj = parse_table_schema(json, precise_float=self.precise_float)
else:
self.obj = DataFrame(
loads(json, precise_float=self.precise_float), dtype=None
)
def _process_converter(self, f, filt=None) -> None:
    """
    Take a conversion function and possibly recreate the frame.

    Parameters
    ----------
    f : callable
        Called as ``f(col, c)`` for every selected column; must return a
        ``(new_data, result)`` pair. When ``result`` is truthy, ``new_data``
        replaces the column.
    filt : callable, optional
        Called as ``filt(col, c)``; only columns for which it returns a
        truthy value are passed to ``f``. Defaults to selecting every column.

    Notes
    -----
    ``self.obj`` is replaced only when at least one column was converted.
    """
    if filt is None:
        filt = lambda col, c: True

    obj = self.obj
    assert obj is not None  # for mypy

    needs_new_obj = False
    new_obj = {}
    for i, (col, c) in enumerate(obj.items()):
        if filt(col, c):
            new_data, result = f(col, c)
            if result:
                c = new_data
                needs_new_obj = True
        # Key by position (not label) so every column is kept, converted or
        # not, even when labels repeat.
        new_obj[i] = c

    if needs_new_obj:
        # possibly handle dup columns
        new_frame = DataFrame(new_obj, index=obj.index)
        # Restore the original labels after the positional rebuild.
        new_frame.columns = obj.columns
        self.obj = new_frame
def _try_convert_types(self) -> None:
    """
    Run the date conversion (when enabled) and then the per-column data
    conversion over ``self.obj``.

    Does nothing when ``self.obj`` is None.
    """
    if self.obj is None:
        return
    if self.convert_dates:
        self._try_convert_dates()

    # Dates were handled above (if requested), so the generic per-column
    # conversion is told not to attempt them again.
    self._process_converter(
        lambda col, c: self._try_convert_data(col, c, convert_dates=False)
    )
def _try_convert_dates(self) -> None:
    """
    Attempt a date conversion on the columns selected for date parsing.

    A column is selected when its label is listed in ``self.convert_dates``,
    or when ``self.keep_default_dates`` is truthy and the label looks
    date-like (see ``is_ok`` below). Does nothing when ``self.obj`` is None.
    """
    if self.obj is None:
        return

    # our columns to parse
    convert_dates_list_bool = self.convert_dates
    if isinstance(convert_dates_list_bool, bool):
        # A plain bool carries no explicit column names.
        convert_dates_list_bool = []
    convert_dates = set(convert_dates_list_bool)

    def is_ok(col) -> bool:
        """
        Return if this col is ok to try for a date parse.
        """
        if not isinstance(col, str):
            return False

        col_lower = col.lower()
        return (
            col_lower.endswith(("_at", "_time"))
            or col_lower in ("modified", "date", "datetime")
            or col_lower.startswith("timestamp")
        )

    self._process_converter(
        lambda col, c: self._try_convert_to_date(c),
        lambda col, c: (
            (self.keep_default_dates and is_ok(col)) or col in convert_dates
        ),
    )
Regards,
James.
1 Like