BERT ImbalancedDatasetSampler ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series

Jeeg_Zhang · July 29, 2022, 1:03pm

Using BERT for multi class classification.
Dataset is imbalanced.

The training dataset (dataset_train) is tokenized, so each element has three parts: id, mask and label.

With RandomSampler or SequentialSampler it works fine.

Replacing the sampler with ImbalancedDatasetSampler

batch_size=3
dataloader_train_o = DataLoader(
dataset_train,
sampler=ImbalancedDatasetSampler(dataset_train),
batch_size=batch_size,
# **kwargs
)

results in ValueError:

ValueError Traceback (most recent call last)
File D:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\pandas\core\frame.py:3892, in DataFrame._ensure_valid_index(self, value)
3891 try:
→ 3892 value = Series(value)
3893 except (ValueError, NotImplementedError, TypeError) as err:

File D:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\pandas\core\series.py:451, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
450 else:
→ 451 data = sanitize_array(data, index, dtype, copy)
453 manager = get_option("mode.data_manager")

File D:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\pandas\core\construction.py:601, in sanitize_array(data, index, dtype, copy, raise_cast_failure, allow_2d)
599 subarr = maybe_infer_to_datetimelike(subarr)
→ 601 subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d)
603 if isinstance(subarr, np.ndarray):
604 # at this point we should have dtype be None or subarr.dtype == dtype

File D:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\pandas\core\construction.py:652, in _sanitize_ndim(result, data, dtype, index, allow_2d)
651 return result
→ 652 raise ValueError("Data must be 1-dimensional")
653 if is_object_dtype(dtype) and isinstance(dtype, ExtensionDtype):
654 # i.e. PandasDtype(“O”)

ValueError: Data must be 1-dimensional

The above exception was the direct cause of the following exception:

ValueError Traceback (most recent call last)
Input In [49], in <cell line: 5>()
2 from torchsampler import ImbalancedDatasetSampler
4 batch_size=3
5 dataloader_train_o = DataLoader(
6 dataset_train,
----> 7 sampler=ImbalancedDatasetSampler(dataset_train),
8 batch_size=batch_size,
9 # **kwargs
10 )
12 dataloader_validation_o = DataLoader(
13 dataset_val,
14 sampler=SequentialSampler(dataset_val),
15 batch_size=batch_size,
16 # **kwargs
17 )

File D:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\torchsampler\imbalanced.py:37, in ImbalancedDatasetSampler.__init__(self, dataset, labels, indices, num_samples, callback_get_label)
35 # distribution of classes in the dataset
36 df = pd.DataFrame()
---> 37 df["label"] = self._get_labels(dataset) if labels is None else labels
38 df.index = self.indices
39 df = df.sort_index()

File D:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\pandas\core\frame.py:3655, in DataFrame.__setitem__(self, key, value)
3652 self._setitem_array([key], value)
3653 else:
3654 # set column
→ 3655 self._set_item(key, value)

File D:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\pandas\core\frame.py:3832, in DataFrame._set_item(self, key, value)
3822 def _set_item(self, key, value) -> None:
3823 """
3824 Add series to DataFrame in specified column.
3825
(…)
3830 ensure homogeneity.
3831 """
→ 3832 value = self._sanitize_column(value)
3834 if (
3835 key in self.columns
3836 and value.ndim == 1
3837 and not is_extension_array_dtype(value)
3838 ):
3839 # broadcast across multiple columns if necessary
3840 if not self.columns.is_unique or isinstance(self.columns, MultiIndex):

File D:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\pandas\core\frame.py:4528, in DataFrame._sanitize_column(self, value)
4515 def _sanitize_column(self, value) -> ArrayLike:
4516 """
4517 Ensures new columns (which go into the BlockManager as new blocks) are
4518 always copied and converted into an array.
(…)
4526 numpy.ndarray or ExtensionArray
4527 """
→ 4528 self._ensure_valid_index(value)
4530 # We should never get here with DataFrame value
4531 if isinstance(value, Series):

File D:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\pandas\core\frame.py:3894, in DataFrame._ensure_valid_index(self, value)
3892 value = Series(value)
3893 except (ValueError, NotImplementedError, TypeError) as err:
→ 3894 raise ValueError(
3895 "Cannot set a frame with no defined index "
3896 "and a value that cannot be converted to a Series"
3897 ) from err
3899 # GH31368 preserve name of index
3900 index_copy = value.index.copy()

ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series