Hi, I am relatively new to building deep learrning models and I seem to be completely confused and stuck with errors related to shape and size.
Here’s the LSTM model and relevant code:
class LSTMTagger(nn.Module):
def __init__(self):
super(LSTMTagger, self).__init__()
# self.lstm1 = nn.LSTM(input_size = 1, hidden_size = 100)
# self.lstm2 = nn.LSTM(100, 50)
self.embedding =
nn.Embedding(wv.vectors.shape[0],512)#embedding_matrix.shape[1])
self.lstm1 = nn.LSTM(input_size = 512, hidden_size = 64, dropout =
0.1,batch_first=True,bidirectional = True)
self.dropout = nn.Dropout(p = 0.25)
self.linear1 = nn.Linear(in_features = 128, out_features = 64)
self.dropout = nn.Dropout(p = 0.25)
self.linear2 = nn.Linear(in_features = 64, out_features = 1)
self.sigmoid = nn.Sigmoid()
def forward(self, X):
X_embed = self.embedding(X)
outr1, _ = self.lstm1(X_embed)
xr = self.dropout(outr1)
xr= self.linear1(xr)
xr = self.dropout(xr)
xr= self.linear2(xr)
outr4 = self.sigmoid(xr)
outr4 = outr4.view(1,-1)
return outr4
model = LSTMTagger()
torch.multiprocessing.set_sharing_strategy('file_system')
if torch.cuda.device_count() > 1:
print("Using ", torch.cuda.device_count(), " GPUs")
# dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
# model =model.load_state_dict(torch.load('best_model_state.bin'))
model = nn.DataParallel(model, device_ids=[0]) #py r
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
def train_epoch(
model,
data_loader,
loss_fn,
optimizer,
device,
scheduler,
n_examples
):
model = model.train()
losses = []
correct_predictions = 0
for d in data_loader:
print(f"Input ids: {np.shape(d['input_ids'])}\n len: {len(d['input_ids'][0])}")
input_ids = d["input_ids"].to(device)
targets = d["targets"].to(device)
outputs = model(input_ids)
_, preds = torch.max(outputs, dim=1)
print(f"outputs is {np.shape(outputs)}")
print(f"targets is {targets}")
# continue
loss = criterion(outputs.squeeze(), targets)
# loss.backward()
# nn.utils.clip_grad_norm_(model.parameters(), clip)
# optimizer.step()
# loss = loss_fn(outputs, targets)
correct_predictions += torch.sum(preds == targets)
losses.append(loss.item())
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
scheduler.step()
optimizer.zero_grad()
return correct_predictions.double() / n_examples, np.mean(losses)
EPOCHS = 6
optimizer = optim.Adam(model.parameters(), lr=2e-5)
total_steps = len(data_train) * EPOCHS
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
loss_fn = nn.CrossEntropyLoss().to(device)
history = defaultdict(list)
best_accuracy = 0
criterion = nn.BCELoss()
# In[86]:
print('starting training')
# exit()
for epoch in range(EPOCHS):
# y_ip= input('Please enter y_ip y_ip value')
# print()
# if y_ip=='b':
# break
print(f'Epoch {epoch + 1}/{EPOCHS}')
print('-' * 10)
train_acc, train_loss = train_epoch(
model,
data_train,
loss_fn,
optimizer,
device,
scheduler,
len(df_train)
)
In this instance the sample input is a tensor of size: torch.Size([1, 512])
, that looks like this:
tensor([[44561, 972, 7891, 94, 2191, 131, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0]], device='cuda:0')
and output label (targets from train_epoch function) in case is just a simple 1 or 0 label in tensor form such as:
tensor([1], device='cuda:0')
I have been facing issues consistently with this approach. Initially the output was 1x512x1. So, I added
outr4 = outr4.view(1,-1)
after the sigmoid layer. Then, the output shape was reduced to 1x512 and I used squeeze function but, still, I face errors such as this one:
ValueError: Using a target size (torch.Size([1])) that is different to the input size (torch.Size([512])) is deprecated. Please ensure they have the same size.
I have spent a lot of time trying to figure out what was going on but to no avail. Isn’t the output supposed to be either 1 or 0, instead of being a 1x512 shaped tensor?
I am relatively new to building models, so please excuse my lack of knowledge.