Hi,
I am trying to implement a swap autoencoder where one column is categorical. In the data below, the column "Name" is categorical and the first three columns are numerical:
```python
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder

X = np.array([[1, 2, 3, 'A'], [4, 5, 6, 'A'], [7, 8, 9, 'A'], [10, 11, 12, 'B'],
              [13, 14, 15, 'B'], [16, 17, 18, 'B'], [19, 20, 21, 'B'], [22, 23, 24, 'B']])
X = pd.DataFrame(data=X, columns=['ID1', 'ID2', 'ID3', 'Name'])
X
```
|   | ID1 | ID2 | ID3 | Name |
|---|-----|-----|-----|------|
| 0 | 1   | 2   | 3   | A    |
| 1 | 4   | 5   | 6   | A    |
| 2 | 7   | 8   | 9   | A    |
| 3 | 10  | 11  | 12  | B    |
| 4 | 13  | 14  | 15  | B    |
| 5 | 16  | 17  | 18  | B    |
| 6 | 19  | 20  | 21  | B    |
| 7 | 22  | 23  | 24  | B    |
Here is my autoencoder:
```python
class Autoencoder(nn.Module):
    def __init__(self, emb_szs, n_cont):
        super().__init__()
        # one embedding per categorical column
        self.embeds = nn.ModuleList([nn.Embedding(no_emb, sz_emb) for no_emb, sz_emb in emb_szs])
        n_emb = sum(sz_emb for no_emb, sz_emb in emb_szs)
        n_in = n_emb + n_cont  # embedding width + numerical columns
        self.encoder = nn.Sequential(
            nn.Linear(n_in, 50),
            nn.BatchNorm1d(50),
            nn.ReLU(),
            nn.Linear(50, 10),
            nn.BatchNorm1d(10),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(10, 50),
            nn.BatchNorm1d(50),
            nn.ReLU()
        )
        self.decoder_recon = nn.Linear(50, n_in)
        self.decoder_classifier = nn.Linear(50, n_in)

    def forward(self, x_cat, x_cont):
        x = [e(x_cat[:, i]) for i, e in enumerate(self.embeds)]
        x = torch.cat(x, 1)
        x = torch.cat([x, x_cont], 1)  # input layer: embeddings + numerical features
        x = self.encoder(x)
        x = self.decoder(x)
        reconstruction = self.decoder_recon(x)
        logits = self.decoder_classifier(x)
        return reconstruction, logits

    def get_encoder_state(self, x):
        encoded = self.encoder(x)
        return encoded
```
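To make the shapes concrete, here is a quick check I added just for illustration (batch of 2, random values; not part of the training code):

```python
# Quick shape check (illustration only):
model = Autoencoder([(2, 1)], 3)              # one categorical col: 2 classes, embedding size 1
x_cat = torch.zeros(2, 1, dtype=torch.int64)  # batch of 2, one categorical column
x_cont = torch.rand(2, 3)                     # batch of 2, three numerical columns
recon, logits = model(x_cat, x_cont)
print(recon.shape, logits.shape)              # torch.Size([2, 4]) torch.Size([2, 4])
```

Both heads emit `n_in = n_emb + n_cont = 1 + 3 = 4` values per row.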
To build the corrupted inputs, I swap values randomly: with probability `noise`, a numerical value in one row is replaced by the value of the same column from another, randomly chosen row:
```python
class swapDataset(torch.utils.data.Dataset):
    def __init__(self, df, noise, cat_cols):
        self.cont = df.drop(cat_cols, axis=1).copy().values.astype(np.float32)  # numerical columns
        self.cat = df.loc[:, cat_cols].copy().values.astype(np.int64)           # categorical columns
        self.noise = noise

    def __len__(self):
        return self.cont.shape[0]

    def __getitem__(self, idx):
        sample = self.cont[idx, :].copy()
        sample = self.swap_sample(sample)
        dct = {
            'x': torch.tensor(sample, dtype=torch.float),             # swapped numerical values
            'y': torch.tensor(self.cont[idx, :], dtype=torch.float),  # original numerical values
            'z': torch.tensor(self.cat[idx, :], dtype=torch.int64)    # categorical codes
        }
        return dct

    def swap_sample(self, sample):
        num_samples = self.cont.shape[0]
        num_features = self.cont.shape[1]
        if len(sample.shape) == 2:
            # batched input: pick one random source row per sample
            batch_size = sample.shape[0]
            random_row = np.random.randint(0, num_samples, size=batch_size)
            for i in range(batch_size):
                # each feature is swapped with probability self.noise
                random_col = np.random.rand(num_features) < self.noise
                sample[i, random_col] = self.cont[random_row[i], random_col]
        else:
            # single row
            random_row = np.random.randint(0, num_samples, size=1)
            random_col = np.random.rand(num_features) < self.noise
            sample[random_col] = self.cont[random_row, random_col]
        return sample
```
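In isolation, the swap step boils down to this (a tiny sketch with made-up numbers; the mask and source row are random, so results vary from run to run):

```python
# Minimal illustration of the same-column swap (illustrative values only):
data = np.arange(1, 10, dtype=np.float32).reshape(3, 3)  # 3 rows x 3 numerical features
sample = data[0].copy()                                  # row to corrupt
mask = np.random.rand(3) < 0.2                           # each feature swapped with prob 0.2
sample[mask] = data[np.random.randint(0, 3), mask]       # same columns, random other row
```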
```python
cat_cols = ['Name']
for col in cat_cols:
    X[col] = LabelEncoder().fit_transform(X[col])  # 'A' -> 0, 'B' -> 1

train_dataset = swapDataset(X, 0.2, cat_cols)  # swap each value with probability 0.2
data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=False)
```
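Peeking at one batch (a quick check, not part of the training code) already shows the shapes that matter later:

```python
# 'x' and 'y' carry the 3 numerical columns, 'z' the single categorical column
batch = next(iter(data_loader))
print(batch['x'].shape, batch['y'].shape, batch['z'].shape)
# torch.Size([2, 3]) torch.Size([2, 3]) torch.Size([2, 1])
```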
```python
cat_dims = [int(X[col].nunique()) for col in cat_cols]
print(cat_dims)   # [2]
emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]  # here: min(50, (2 + 1) // 2) = 1
print(emb_dims)   # [(2, 1)]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
autoencoder = Autoencoder(emb_dims, 3).to(device)  # 3 numerical columns
```
When I train, I get the following error:
```python
num_epochs = 200
outputs = []
running_loss = 0.0
criterion = nn.MSELoss()
# note: optimizer and scheduler are defined earlier in my notebook;
# Adam + ReduceLROnPlateau shown here as a stand-in so the snippet runs
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

for epoch in range(num_epochs):
    for data in data_loader:
        inputs, targets, cat = data['x'].to(device), data['y'].to(device), data['z'].to(device)
        recon, logits = autoencoder(cat, inputs)
        print(recon)
        print(targets)
        loss_recon = criterion(recon, targets)  # this is where the RuntimeError below is raised
        loss_classifier = criterion(logits, inputs)
        # loss = loss_recon + loss_classifier
        loss = loss_classifier
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()  # update weights
        if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            scheduler.step(loss)  # the plateau scheduler needs the metric
        else:
            scheduler.step()
        running_loss += loss.item()
    print(f'Epoch: {epoch+1}, Loss Classifier: {loss_classifier}, combined loss: {loss}')
```
```
tensor([[-0.0963, 0.2478, -0.0934, 0.1523],
[ 0.4680, 0.1963, 0.4263, -0.1038]], grad_fn=<AddmmBackward>)
tensor([[1., 2., 3.],
[4., 5., 6.]])
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_36/2034992058.py in <module>
12 print(targets)
13
---> 14 loss_recon = criterion(recon,targets) # calculate loss
15 loss_classifier = criterion(logits,inputs)
16
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/loss.py in forward(self, input, target)
526
527 def forward(self, input: Tensor, target: Tensor) -> Tensor:
--> 528 return F.mse_loss(input, target, reduction=self.reduction)
529
530
/opt/conda/lib/python3.7/site-packages/torch/nn/functional.py in mse_loss(input, target, size_average, reduce, reduction)
3087 reduction = _Reduction.legacy_get_string(size_average, reduce)
3088
-> 3089 expanded_input, expanded_target = torch.broadcast_tensors(input, target)
3090 return torch._C._nn.mse_loss(expanded_input, expanded_target, _Reduction.get_enum(reduction))
3091
/opt/conda/lib/python3.7/site-packages/torch/functional.py in broadcast_tensors(*tensors)
71 if has_torch_function(tensors):
72 return handle_torch_function(broadcast_tensors, tensors, *tensors)
---> 73 return _VF.broadcast_tensors(tensors) # type: ignore[attr-defined]
74
75
RuntimeError: The size of tensor a (4) must match the size of tensor b (3) at non-singleton dimension 1
```
What I understand is that `recon` and `targets` do not have the same dimension: the decoder heads output `n_in = n_emb + n_cont = 1 + 3 = 4` values per row, while the target `y` holds only the 3 numerical columns, hence (2, 4) vs (2, 3). I am not sure whether this is the right way to embed the input of an autoencoder, and if it is, how do we "embed" the output so the reconstruction matches the target?
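One direction I am considering (just a sketch of my own idea, so the class name and head sizes below are my assumptions, not a known recipe): shrink the reconstruction head to the continuous width and give the classifier head one logit per category, so each loss sees matching shapes:

```python
# Sketch only: reconstruct the numerical part, classify the categorical part
class AutoencoderV2(Autoencoder):
    def __init__(self, emb_szs, n_cont):
        super().__init__(emb_szs, n_cont)
        n_cats = emb_szs[0][0]                           # 2 classes for 'Name'
        self.decoder_recon = nn.Linear(50, n_cont)       # (batch, 3) matches targets
        self.decoder_classifier = nn.Linear(50, n_cats)  # (batch, 2) logits

# in the training loop, the losses would then be:
# loss_recon = nn.MSELoss()(recon, targets)                   # (2, 3) vs (2, 3)
# loss_classifier = nn.CrossEntropyLoss()(logits, cat[:, 0])  # (2, 2) logits vs (2,) class ids
```

Is that the right approach, or is there a standard way to handle embedded columns on the output side?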