Autoencoder With Categorical Input

Hi,

I am trying to implement a swap autoencoder where one of the columns is categorical. In the data below, the column "Name" is categorical and the first three columns are numerical.

import numpy as np
import pandas as pd

X = np.array([[1,2,3,'A'], [4,5,6,'A'], [7,8,9,'A'], [10,11,12,'B'], [13,14,15,'B'], [16,17,18,'B'], [19,20,21,'B'], [22,23,24,'B']])
X = pd.DataFrame(data=X, columns=['ID1','ID2','ID3','Name'])
X
ID1 ID2 ID3 Name
0 1 2 3 A
1 4 5 6 A
2 7 8 9 A
3 10 11 12 B
4 13 14 15 B
5 16 17 18 B
6 19 20 21 B
7 22 23 24 B

Here is my autoencoder

import torch
import torch.nn as nn

class Autoencoder(nn.Module):
    def __init__(self, emb_szs, n_cont):
        super().__init__()
        # one embedding per categorical column: (number of categories, embedding size)
        self.embeds = nn.ModuleList([nn.Embedding(no_emb, sz_emb) for no_emb, sz_emb in emb_szs])
        n_emb = sum(sz_emb for no_emb, sz_emb in emb_szs)
        n_in = n_emb + n_cont
        self.encoder = nn.Sequential(
            nn.Linear(n_in, 50),
            nn.BatchNorm1d(50),
            nn.ReLU(),

            nn.Linear(50, 10),
            nn.BatchNorm1d(10),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(10, 50),
            nn.BatchNorm1d(50),
            nn.ReLU())

        self.decoder_recon = nn.Linear(50, n_in)
        self.decoder_classifier = nn.Linear(50, n_in)

    def forward(self, x_cat, x_cont):
        # embed each categorical column and concatenate with the continuous features
        x = [e(x_cat[:, i]) for i, e in enumerate(self.embeds)]
        x = torch.cat(x, 1)
        x = torch.cat([x, x_cont], 1)  # input layer

        x = self.encoder(x)
        x = self.decoder(x)
        reconstruction = self.decoder_recon(x)
        logits = self.decoder_classifier(x)

        return reconstruction, logits

    def get_encoder_state(self, x):
        encoded = self.encoder(x)
        return encoded

For the noise, I randomly swap some numerical column values of a sample with the values of the same columns taken from another, randomly chosen row.
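Just to illustrate what I mean by swapping, here is a toy sketch with made-up values (not my real data):

import numpy as np

# with probability `noise`, replace a feature value with the value of the
# same feature taken from a randomly chosen other row
data = np.array([[1., 2., 3.],
                 [4., 5., 6.],
                 [7., 8., 9.]])
noise = 0.2

sample = data[0].copy()
random_row = np.random.randint(0, data.shape[0])      # donor row
swap_mask = np.random.rand(data.shape[1]) < noise     # which features to swap
sample[swap_mask] = data[random_row, swap_mask]
print(sample)  # e.g. [1., 5., 3.] if only the second feature got swapped

The dataset class below does this per sample: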

class swapDataset:
    def __init__(self, cont_x, noise, cat_x):
        # cont_x: the full DataFrame, cat_x: list of categorical column names
        self.cont = cont_x.drop(cat_x, axis=1).copy().values.astype(np.float32)  # numerical columns
        self.cat = cont_x.loc[:, cat_x].copy().values.astype(np.int64)           # categorical columns
        self.noise = noise

    def __len__(self):
        return self.cont.shape[0]

    def __getitem__(self, idx):
        sample = self.cont[idx, :].copy()
        sample = self.swap_sample(sample)

        dct = {
            'x': torch.tensor(sample, dtype=torch.float),             # swapped (noised) numerical features
            'y': torch.tensor(self.cont[idx, :], dtype=torch.float),  # original numerical features
            'z': torch.tensor(self.cat[idx, :], dtype=torch.int64)    # categorical features
        }
        return dct

    def swap_sample(self, sample):
        num_samples = self.cont.shape[0]
        num_features = self.cont.shape[1]
        if len(sample.shape) == 2:
            # a batch of samples: swap per row
            batch_size = sample.shape[0]
            random_row = np.random.randint(0, num_samples, size=batch_size)
            for i in range(batch_size):
                random_col = np.random.rand(num_features) < self.noise
                sample[i, random_col] = self.cont[random_row[i], random_col]
        else:
            # a single sample
            random_row = np.random.randint(0, num_samples, size=1)
            random_col = np.random.rand(num_features) < self.noise
            sample[random_col] = self.cont[random_row, random_col]

        return sample
from sklearn.preprocessing import LabelEncoder

cat_cols = ['Name']
for col in cat_cols:
    X[col] = LabelEncoder().fit_transform(X[col])

train_dataset = swapDataset(X, 0.2, cat_cols)  # swaps data with the swapDataset class above
data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=False)

cat_dims = [int(X[col].nunique()) for col in cat_cols]
print(cat_dims)

emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]
print(emb_dims)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
autoencoder= Autoencoder(emb_dims, 3).to(device)

The prints give:

[2]
[(2, 1)]
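For reference, a quick shape check on one batch already makes the mismatch visible (the shape comments follow from the sizes above, with batch_size=2):

batch = next(iter(data_loader))
x_cont, y, x_cat = batch['x'].to(device), batch['y'].to(device), batch['z'].to(device)
print(x_cont.shape, x_cat.shape)   # torch.Size([2, 3]) torch.Size([2, 1])

autoencoder.eval()                 # eval mode just for this check
with torch.no_grad():
    recon, logits = autoencoder(x_cat, x_cont)
print(recon.shape, y.shape)        # torch.Size([2, 4]) vs torch.Size([2, 3])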

When I train, I get the following error:

num_epochs = 200
outputs = []
running_loss = 0.0

criterion = nn.MSELoss()
# optimizer and scheduler are defined earlier (omitted here)

for epoch in range(num_epochs):
    for data in data_loader:
        inputs, targets, cat = data['x'].to(device), data['y'].to(device), data['z'].to(device)
        recon, logits = autoencoder(cat, inputs)
        print(recon)
        print(targets)

        loss_recon = criterion(recon, targets)        # reconstruction loss -> this line fails
        loss_classifier = criterion(logits, inputs)

        #loss = loss_recon + loss_classifier
        loss = loss_classifier

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()  # update weights
        if scheduler.__class__ == torch.optim.lr_scheduler.ReduceLROnPlateau:
            scheduler.step(loss)
        else:
            scheduler.step()
        running_loss += loss.item()
    print(f'Epoch: {epoch+1}, Loss Classifier: {loss_classifier.item():.4f}, combined loss: {loss.item():.4f}')
tensor([[-0.0963,  0.2478, -0.0934,  0.1523],
        [ 0.4680,  0.1963,  0.4263, -0.1038]], grad_fn=<AddmmBackward>)
tensor([[1., 2., 3.],
        [4., 5., 6.]])

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
/tmp/ipykernel_36/2034992058.py in <module>
     12         print(targets)
     13 
---> 14         loss_recon   = criterion(recon,targets) # calculate loss
     15         loss_classifier     = criterion(logits,inputs)
     16 

/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1049         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051             return forward_call(*input, **kwargs)
   1052         # Do not call functions when jit is used
   1053         full_backward_hooks, non_full_backward_hooks = [], []

/opt/conda/lib/python3.7/site-packages/torch/nn/modules/loss.py in forward(self, input, target)
    526 
    527     def forward(self, input: Tensor, target: Tensor) -> Tensor:
--> 528         return F.mse_loss(input, target, reduction=self.reduction)
    529 
    530 

/opt/conda/lib/python3.7/site-packages/torch/nn/functional.py in mse_loss(input, target, size_average, reduce, reduction)
   3087         reduction = _Reduction.legacy_get_string(size_average, reduce)
   3088 
-> 3089     expanded_input, expanded_target = torch.broadcast_tensors(input, target)
   3090     return torch._C._nn.mse_loss(expanded_input, expanded_target, _Reduction.get_enum(reduction))
   3091 

/opt/conda/lib/python3.7/site-packages/torch/functional.py in broadcast_tensors(*tensors)
     71     if has_torch_function(tensors):
     72         return handle_torch_function(broadcast_tensors, tensors, *tensors)
---> 73     return _VF.broadcast_tensors(tensors)  # type: ignore[attr-defined]
     74 
     75 

RuntimeError: The size of tensor a (4) must match the size of tensor b (3) at non-singleton dimension 1

What I understand is that recon and targets do not have the same dimension. I am not sure whether this is the right way to embed the input of an autoencoder, and if it is, how should the output (including the categorical column) be reconstructed?
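For example, would splitting the decoder into two heads be the right direction: one head that reconstructs only the n_cont numeric columns (so its size matches the target), and one head that outputs per-category logits for 'Name', trained with cross-entropy instead of MSE? A rough, unverified sketch of what I mean (dummy tensors; n_cont and cat_dims follow my code above):

import torch
import torch.nn as nn

n_cont, cat_dims = 3, [2]        # 3 numeric columns, 'Name' has 2 categories
hidden = 50                      # size of the last decoder layer

decoder_recon = nn.Linear(hidden, n_cont)               # -> (batch, 3), matches the numeric targets
decoder_classifier = nn.Linear(hidden, sum(cat_dims))   # -> (batch, 2) logits for 'Name'

h = torch.randn(4, hidden)                              # stand-in for the decoder output
recon = decoder_recon(h)
logits = decoder_classifier(h)

targets_num = torch.randn(4, n_cont)                    # dummy numeric targets
targets_cat = torch.randint(0, cat_dims[0], (4,))       # dummy category indices

loss = nn.MSELoss()(recon, targets_num) + nn.CrossEntropyLoss()(logits, targets_cat)
print(recon.shape, logits.shape, loss.item())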