class Car_insurance(Dataset):
    """Dataset over ``../../data/insurance_claims.csv`` with separate
    continuous and categorical feature blocks.

    Everything, including ``embedding_sizes``, is computed inside
    ``__init__``, so the sizes can be read right after construction and
    before any DataLoader is created::

        dataset = Car_insurance(embedding_layer=True)
        sizes = dataset.get_embedding_sizes()   # or dataset.embedding_sizes

    Attributes:
        normal: The ``normal`` flag passed to the constructor.
        label: Whatever ``get_labels(data, "car_insurance")`` returns.
        cat_cols: The categorical columns after encoding.
        cont_cols: Every column of the loaded data that is not listed as
            categorical.
            NOTE(review): whether the label column is still part of this
            frame depends on ``get_labels`` -- confirm.
        embedded_cols: ``{column name: number of categories}``; empty unless
            ``embedding_layer`` is true.
        embedding_sizes: ``[(n_categories, embedding_dim), ...]`` in column
            order; empty unless ``embedding_layer`` is true.
    """

    def __init__(self, encoding: str = "label_encode", embedding_layer: bool = False,
                 normal: bool = False):
        """Load the CSV, drop unused columns and encode the categorical ones.

        Args:
            encoding: ``'one_hot'``, ``'label_encode'`` or ``'gel_encode'``.
                Only consulted when ``embedding_layer`` is false.
            embedding_layer: When true, categorical values are replaced by
                integer category codes and ``embedding_sizes`` is filled in.
            normal: When true, the data is first passed through
                ``get_normal_data(data, "car_insurance")``.

        Raises:
            ValueError: If ``embedding_layer`` is false and ``encoding`` is
                not one of the supported names.
        """
        path = '../../data/insurance_claims.csv'
        categorical_cols = [
            'policy_state', 'umbrella_limit', 'insured_sex', 'insured_education_level',
            'insured_occupation', 'insured_hobbies', 'insured_relationship', 'incident_type',
            'collision_type', 'incident_severity', 'authorities_contacted', 'incident_state',
            'incident_city', 'property_damage', 'police_report_available', 'auto_make',
            'auto_model',
        ]
        cols_to_remove = ['policy_number', 'policy_bind_date', 'policy_csl',
                          'incident_location', 'incident_date', '_c39']

        self.normal = normal
        # Always defined, so callers can query them whichever mode was chosen.
        self.embedded_cols = {}
        self.embedding_sizes = []

        data = load_data(path)
        data = remove_cols(data, cols_to_remove)
        if normal:
            data = get_normal_data(data, "car_insurance")
        self.label = get_labels(data, "car_insurance")
        self.cat_cols = data[categorical_cols]
        self.cont_cols = data.drop(categorical_cols, axis=1)

        if embedding_layer:
            self.cat_cols = self.cat_cols.astype("category")
            self.embedded_cols = {
                name: len(col.cat.categories)
                for name, col in self.cat_cols.items()
                if col.dtype == "category"
            }
            # Embedding width: half the cardinality (rounded up), capped at 50.
            self.embedding_sizes = [
                (n_categories, min(50, (n_categories + 1) // 2))
                for n_categories in self.embedded_cols.values()
            ]
            for name in self.embedded_cols:
                column = self.cat_cols[name]
                codes = {cat: code for code, cat in enumerate(column.cat.categories)}
                # rename_categories applies the whole mapping in one step, so
                # categories that are already integers cannot clash with the
                # freshly assigned codes.
                self.cat_cols[name] = column.cat.rename_categories(codes)
        elif encoding == 'one_hot':
            self.cat_cols = one_hot_encoding(self.cat_cols, categorical_cols)
        elif encoding == 'label_encode':
            self.cat_cols = label_encoding(self.cat_cols, categorical_cols)
        elif encoding == 'gel_encode':
            self.cat_cols = gel_encoding(self.cat_cols, categorical_cols)
        else:
            # Leaving the raw values in place would only fail later, when
            # __getitem__ casts the categorical block to float32.
            raise ValueError(
                f"Unknown encoding {encoding!r}; expected 'one_hot', "
                "'label_encode' or 'gel_encode'"
            )

    def get_embedding_sizes(self):
        """Return a copy of the ``(n_categories, embedding_dim)`` pairs.

        The list is empty when the dataset was built without
        ``embedding_layer=True``.
        """
        return list(self.embedding_sizes)

    def _arrays(self):
        """Return ``(cont, cat, label)`` as numpy arrays, converting only once.

        The conversion is cached in ``_array_cache`` on first use. If
        ``cont_cols``, ``cat_cols`` or ``label`` is reassigned afterwards,
        reset ``_array_cache`` to ``None`` to force a rebuild.
        """
        cached = getattr(self, "_array_cache", None)
        if cached is None:
            cached = (
                self.cont_cols.values.astype(np.float32),
                self.cat_cols.values.astype(np.float32),
                # np.asarray makes label lookup positional, like the features.
                np.asarray(self.label).astype(np.int32),
            )
            self._array_cache = cached
        return cached

    def __len__(self):
        """Return the number of labels."""
        return len(self.label)

    def __getitem__(self, idx):
        """Return ``(continuous, categorical, label)`` for position ``idx``.

        Both feature blocks are float32 and the label is int32.
        """
        cont_cols, cat_cols, label = self._arrays()
        return (cont_cols[idx], cat_cols[idx], label[idx])
This is my dataset class. I want to add a custom method that returns `self.embedding_sizes`, because I need the embedding sizes before I start fetching items through the DataLoader. Please help me with this issue. I am not sure whether this is possible, or whether I should try some other way.