Hi, I am new to PyTorch (and machine learning in general) and wanted to check if what I’m doing makes any sense at all. I am attempting to predict race (Asian, Black, Hispanic, White) from first name, last name, and the racial distribution of the person’s zip code. These should be three separate features, the thought being that, e.g. last name might have more predictive power than first name. For example, a row of my data would be
("John", "Li", [0.10, 0.40, 0.20, 0.30])
Currently, my model is
class FirstLastZctaLSTM(nn.Module):
    """Character-level LSTM classifier that also conditions on extra features.

    Each call to :meth:`forward` consumes one encoded character, advances the
    LSTM state, and produces log-probabilities from the new hidden state
    concatenated with the ``pct`` feature vector.

    Args:
        input_size: Length of one encoded character vector.
        hidden_size: Number of LSTM hidden units.
        output_size: Number of output classes.
        pct_size: Width of the ``pct`` feature vector that is appended to the
            hidden state before the output layer. Defaults to 4, the value
            that used to be hard-coded.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        output_size: int,
        pct_size: int = 4,
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        # The attribute name (spelling included) is kept unchanged so that
        # existing state_dict keys still load.
        self.ltsm_cell = nn.LSTM(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size + pct_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(
        self,
        name: torch.Tensor,
        pct: torch.Tensor,
        hidden: tuple[torch.Tensor, torch.Tensor],
    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        """Advance the LSTM by one step and classify from the new state.

        Args:
            name: One encoded character; it is reshaped to ``(1, 1, -1)``
                before being fed to the LSTM.
            pct: Extra features, concatenated with the hidden state along
                dim 1 (so it must be 2-D with ``pct_size`` columns).
            hidden: ``(h, c)`` state from the previous step or from
                :meth:`init_hidden`.

        Returns:
            ``(output, hidden)`` where ``output`` holds log-probabilities
            over the classes and ``hidden`` is the updated LSTM state.
        """
        # Feed a single time step with batch size 1.
        _, hidden = self.ltsm_cell(name.view(1, 1, -1), hidden)
        # hidden[0] is the hidden state; drop its leading dimension so it can
        # be concatenated with the 2-D pct features.
        combined = torch.cat([hidden[0].squeeze(0), pct], dim=1)
        output: torch.Tensor = self.softmax(self.h2o(combined))
        return output, hidden

    def init_hidden(self) -> tuple[torch.Tensor, torch.Tensor]:
        """Return a zeroed ``(h, c)`` state for the start of a sequence.

        The state is allocated with the same device and dtype as the model's
        own weights, so it stays valid after ``model.to(...)`` without
        relying on a module-level device constant.
        """
        reference = self.h2o.weight
        return (
            reference.new_zeros(1, 1, self.hidden_size),
            reference.new_zeros(1, 1, self.hidden_size),
        )
I use a character-level encoding
import string
# Alphabet accepted in names: lowercase ASCII letters plus space, apostrophe
# and hyphen. The position of a character in this string is its one-hot index.
VALID_NAME_CHARS = string.ascii_lowercase + " '-"
VALID_NAME_CHARS_DICT = dict(zip(VALID_NAME_CHARS, range(len(VALID_NAME_CHARS))))
VALID_NAME_CHARS_LEN = len(VALID_NAME_CHARS)
def encode_name(
    name: str,
    valid_name_chars_dict: dict[str, int],
    valid_name_chars_len: int,
    device: torch.device,
) -> torch.Tensor:
    """One-hot encode ``name`` character by character.

    Args:
        name: Text to encode; every character must be a key of
            ``valid_name_chars_dict``.
        valid_name_chars_dict: Maps each allowed character to its column
            index in the one-hot vector.
        valid_name_chars_len: Width of each one-hot vector.
        device: Device on which the result is allocated.

    Returns:
        A tensor of shape ``(len(name), 1, valid_name_chars_len)`` that is
        zero everywhere except for a single 1 per character, at the column
        given by ``valid_name_chars_dict``.

    Raises:
        KeyError: If ``name`` contains a character that is missing from
            ``valid_name_chars_dict``.
    """
    one_hot = torch.zeros(len(name), 1, valid_name_chars_len, device=device)
    # Iterating the tensor yields a view of each (1, width) row, so the
    # in-place writes below land in ``one_hot`` itself.
    for row, char in zip(one_hot, name):
        row[0, valid_name_chars_dict[char]] = 1
    return one_hot
Then I combine the encoded tensors like so:
# Stack the per-character encodings of both names along the sequence axis
# (dim 0), first name followed directly by last name.
# NOTE(review): encode_name is called with one argument here; the remaining
# arguments are presumably elided for brevity -- confirm against the real call.
encoded_parts = (encode_name(first_name), encode_name(last_name))
name = torch.cat(encoded_parts, dim=0)
As an example, for data “michael kitts”, the final name tensor is of size [12, 29] and the percent data looks like
tensor([[6.0413e-03, 8.2485e-04, 3.7458e-02, 9.0547e-01]])
I pass it into the model
for name, pct, race in dataloader:
    # Drop only the leading batch dimension added by the dataloader. A bare
    # squeeze() removes every size-1 dimension, so a one-character sequence
    # would lose its sequence axis and the loop below would then walk over
    # the feature axis instead of over characters.
    name = name.squeeze(0)
    model.zero_grad(set_to_none=True)
    # Start every name from a fresh LSTM state.
    hidden = model.init_hidden()
    # Feed one encoded character per step, carrying the state forward; after
    # the loop, output is the prediction made after the last character.
    for char in name:
        output, hidden = model(char, pct, hidden)
It runs with no errors, but I am curious whether this is doing what I want it to be doing. Since I concatenate first and last name, does it mean that I am basically just passing in the full name, just without the space?
Thanks a lot.