Hi, I’m implementing ELMo model (paper + GRU architecture) using pytorch on sentiment analysis task (2 classes).
My problem is that after training the model for 3 epochs (which takes almost 7 hours), the parameters are almost constant — I mean the parameters do get updated, but the grad value for every parameter is almost zero and the parameters update very slowly.
After training the model on almost 100 samples (just as a test, since every epoch takes a long time), I printed the model output on the trained samples (64 sentences), and you can see that all of the outputs are almost 0.61 or 0.62 (the model's output before applying the sigmoid is almost zero):
[0.6190, 0.6177, 0.6218, 0.6209, 0.6216, 0.6177, 0.6218, 0.6248, 0.6187,
0.6209, 0.6208, 0.6197, 0.6208, 0.6201, 0.6164, 0.6204, 0.6187, 0.6186,
0.6172, 0.6227, 0.6180, 0.6176, 0.6177, 0.6189, 0.6167, 0.6162, 0.6204,
0.6212, 0.6212, 0.6170, 0.6175, 0.6188, 0.6200, 0.6207, 0.6211, 0.6186,
0.6171, 0.6190, 0.6171, 0.6215, 0.6204, 0.6166, 0.6169, 0.6189, 0.6192,
0.6171, 0.6198, 0.6210, 0.6217, 0.6182, 0.6205, 0.6167, 0.6185, 0.6185,
0.6247, 0.6201, 0.6183, 0.6172, 0.6248, 0.6156, 0.6187, 0.6221, 0.6184,
0.6200]
mean grad value for first layer (character based embedding) in 7 iterations (with batch size 4):
-3.2057e-08
-1.0591e-07
8.0309e-10
-3.1149e-08
1.7176e-08
1.0479e-08
-5.9668e-08
loss values:
0.6922
0.6888
0.6932
0.6933
0.705
0.6812
0.7068
first layer parameters (before training):
Parameter containing:
tensor([[-0.8127, 0.0848, -1.8994, ..., -0.4188, 0.0737, 1.7480],
[-0.9858, 1.2334, -1.5336, ..., -0.1520, -0.8097, 1.5319],
[-0.3637, 0.2356, -0.6203, ..., -0.2677, 0.3540, -0.8167],
...,
[ 0.5995, 0.0444, 0.5533, ..., -0.6380, -0.2782, 0.4377],
[-1.1214, 0.1163, 0.6494, ..., 0.9082, 0.0925, -2.0435],
[ 1.1774, 2.0876, 1.2902, ..., 0.1933, 0.6906, -0.9966]],
device='cuda:0', requires_grad=True)
first layer parameters (after training on 1000 iterations):
Parameter containing:
tensor([[ 0.4986, -0.1885, -2.1546, ..., 1.6023, 1.0103, -0.0118],
[-0.2110, -0.0524, -0.5779, ..., -1.7709, -0.6997, 1.7685],
[-0.8088, -0.0187, 0.4958, ..., 0.2945, -0.8318, 0.5191],
...,
[ 0.0324, 0.6847, 0.7107, ..., -0.5620, 1.1643, -0.1883],
[ 0.3290, -1.5829, -1.2789, ..., -0.6205, -1.9693, -0.8639],
[ 1.1525, 1.1839, 1.4262, ..., 0.1396, -0.0622, -1.1427]],
device='cuda:0', requires_grad=True)
conv1d_embed module (Embedding + Convolution 1D):
class Conv1d_Embed(nn.Module):
    """Character-aware embedding block.

    Pipeline: char embeddings -> several Conv1d filter banks of different
    widths -> max-over-time pooling per bank -> channel concat -> LayerNorm
    -> ReLU.

    Input  X: (batch, words, chars) integer character indices.
    Output:   (batch, words, total_filters) float features.
    # assumes every sentence has exactly 100 words (LayerNorm shape) -- TODO confirm
    """

    def __init__(self, embed_dim, filters_list):
        super(Conv1d_Embed, self).__init__()
        self.filters_list = filters_list
        # Total output channels across all filter banks (width of the output).
        self.total_filters = int(np.sum(np.array(self.filters_list)[:, 0]))
        self.embed = nn.Embedding(num_embeddings=chars_count, embedding_dim=embed_dim, device=device)
        self.conv_list = nn.ModuleList()
        # BUG FIX: the LayerNorm was created without device=..., so its
        # parameters stayed on the CPU while every other layer lives on `device`.
        self.conv_norm_layer = nn.LayerNorm([100, self.total_filters], device=device)
        for out_channels, kernel_size in self.filters_list:
            self.conv_list.append(
                nn.Conv1d(in_channels=embed_dim, out_channels=out_channels,
                          kernel_size=kernel_size, stride=1, padding=0,
                          dilation=1, device=device))

    def forward(self, X):
        # (batch, words, chars, embed) -> (batch, words, embed, chars) so the
        # character axis becomes the Conv1d length dimension.
        X = self.embed(X).permute(0, 1, 3, 2)
        batch_size, words = X.shape[0], X.shape[1]
        # BUG FIX / PERF: the original preallocated X_conv with torch.empty(...)
        # on the default (CPU) device and filled it one sentence at a time in a
        # Python loop, calling torch.cuda.empty_cache() inside the inner loop.
        # That forced CPU<->GPU traffic per sentence and slowed every batch
        # down dramatically. Folding (batch, words) into a single conv batch
        # and concatenating the pooled bank outputs is mathematically identical
        # and keeps everything on the module's device.
        X = X.reshape(batch_size * words, X.shape[2], X.shape[3])
        pooled = [torch.max(conv(X), dim=2).values for conv in self.conv_list]
        X_conv = torch.cat(pooled, dim=1).reshape(batch_size, words, self.total_filters)
        X_conv = self.conv_norm_layer(X_conv)
        return torch.relu(X_conv)
highway network module:
class Highway_Network(nn.Module):
    """Single highway layer (Srivastava et al., 2015):

        y = H_act(H(x)) * T(x) + x * (1 - T(x))

    where T(x) = sigmoid(Linear(x)) is the transform gate.

    H_act: 'relu' or 'tanh'; any other value falls back to sigmoid
    (kept as-is for backward compatibility -- unknown names are not rejected).
    in_dim: feature width; input and output widths are identical.
    """

    def __init__(self, H_act: str, in_dim: int):
        super(Highway_Network, self).__init__()
        if H_act == 'relu':
            self.H_act = nn.ReLU()
        elif H_act == 'tanh':
            self.H_act = nn.Tanh()
        else:
            self.H_act = nn.Sigmoid()
        self.in_dim = in_dim
        self.H = nn.Linear(in_features=in_dim, out_features=in_dim, bias=False, device=device)
        self.T = nn.Linear(in_features=in_dim, out_features=in_dim, bias=True, device=device)

    def forward(self, X):
        T = torch.sigmoid(self.T(X))
        H = self.H_act(self.H(X))
        # PERF BUG FIX: the original called torch.cuda.empty_cache() on every
        # forward pass; that forces a costly synchronization each step with no
        # memory benefit. Removed.
        return (H * T) + (X * (1 - T))
ELMo module:
class ELMo(nn.Module):
    """ELMo-style sentence encoder with a binary sentiment head.

    Pipeline: char-CNN embedding -> 2 highway layers -> projection + LayerNorm
    -> two stacked GRU "layers", run separately over the forward and the
    time-reversed sequence (one module per direction) -> the final forward and
    backward features are concatenated, flattened, and fed to a single-logit
    output layer followed by a sigmoid.

    Parameters:
        in_dim_for_highway: total char-CNN filter count (highway layer width).
        embed_dim:          character embedding size.
        filters_list:       [[out_channels, kernel_size], ...] for the char CNN.
        proj_size:          width of every projection between RNN layers.
        rnn_hidden_size:    GRU hidden size.
        seq_len:            words per sentence (default 100, the original value).

    Returns from forward: (batch, 1) sigmoid probabilities.

    NOTE(review): torch.relu is applied to every GRU output and projection,
    stacking many nonlinearities between the input and the loss; this could
    contribute to the near-zero gradients observed in training -- worth
    confirming experimentally.
    """

    def __init__(self, in_dim_for_highway, embed_dim, filters_list, proj_size,
                 rnn_hidden_size, seq_len=100):
        super(ELMo, self).__init__()
        self.conv1d_embed = Conv1d_Embed(embed_dim, filters_list)
        self.highway_layer1 = Highway_Network(H_act='tanh', in_dim=in_dim_for_highway)
        self.highway_layer2 = Highway_Network(H_act='tanh', in_dim=in_dim_for_highway)
        self.proj_after_highway = nn.Linear(in_features=in_dim_for_highway, out_features=proj_size, bias=True, device=device)
        # GENERALIZED: the sequence length (100) and the output-layer width
        # (102400 == seq_len * 2 * proj_size for seq_len=100, proj_size=512)
        # were hard-coded; both are now derived from seq_len, and the default
        # reproduces the original values exactly.
        self.norm_after_highway = nn.LayerNorm([seq_len, proj_size], device=device)
        self.rnn_layer1_forward = nn.GRU(input_size=proj_size, hidden_size=rnn_hidden_size, num_layers=1, bias=True,
                                         batch_first=True, dropout=0, bidirectional=False, device=device)
        self.rnn_layer1_backward = nn.GRU(input_size=proj_size, hidden_size=rnn_hidden_size, num_layers=1, bias=True,
                                          batch_first=True, dropout=0, bidirectional=False, device=device)
        self.rnn_layer2_forward = nn.GRU(input_size=proj_size, hidden_size=rnn_hidden_size, num_layers=1, bias=True,
                                         batch_first=True, dropout=0, bidirectional=False, device=device)
        self.rnn_layer2_backward = nn.GRU(input_size=proj_size, hidden_size=rnn_hidden_size, num_layers=1, bias=True,
                                          batch_first=True, dropout=0, bidirectional=False, device=device)
        self.proj_after_rnn1_forward = nn.Linear(in_features=rnn_hidden_size, out_features=proj_size, bias=True, device=device)
        self.proj_after_rnn1_backward = nn.Linear(in_features=rnn_hidden_size, out_features=proj_size, bias=True, device=device)
        self.proj_after_rnn2_forward = nn.Linear(in_features=rnn_hidden_size, out_features=proj_size, bias=True, device=device)
        self.proj_after_rnn2_backward = nn.Linear(in_features=rnn_hidden_size, out_features=proj_size, bias=True, device=device)
        self.output_layer = nn.Linear(in_features=seq_len * 2 * proj_size, out_features=1, bias=True, device=device)

    def forward(self, X):
        # Char-CNN features; .to(device) kept for compatibility with a
        # Conv1d_Embed implementation that returns CPU tensors.
        output = self.conv1d_embed(X).to(device)
        output = self.highway_layer1(output)
        output = self.highway_layer2(output)
        output = self.proj_after_highway(output)
        output = self.norm_after_highway(output)
        output = torch.relu(output)
        # Layer 1, forward direction.
        forward_output = self.rnn_layer1_forward(output)[0]
        forward_output = torch.relu(forward_output)
        forward_output = self.proj_after_rnn1_forward(forward_output)
        forward_output = torch.relu(forward_output)
        # Layer 1, backward direction: run the GRU over the time-reversed sequence.
        backward_output = self.rnn_layer1_backward(torch.flip(output, dims=[1]))[0]
        backward_output = torch.relu(backward_output)
        backward_output = self.proj_after_rnn1_backward(backward_output)
        backward_output = torch.relu(backward_output)
        # Layer 2, forward direction.
        forward_output = self.rnn_layer2_forward(forward_output)[0]
        forward_output = torch.relu(forward_output)
        forward_output = self.proj_after_rnn2_forward(forward_output)
        forward_output = torch.relu(forward_output)
        # Layer 2, backward direction (input is still time-reversed).
        backward_output = self.rnn_layer2_backward(backward_output)[0]
        backward_output = torch.relu(backward_output)
        backward_output = self.proj_after_rnn2_backward(backward_output)
        backward_output = torch.relu(backward_output)
        # Undo the time reversal so forward/backward features align per word.
        backward_output = torch.flip(backward_output, dims=[1])
        # (batch, seq, 2*proj) -> (batch, seq*2*proj) -> single sigmoid output.
        output = torch.concat((forward_output, backward_output), dim=2)
        output = output.reshape((output.shape[0], output.shape[1] * output.shape[2]))
        output = self.output_layer(output)
        output = torch.sigmoid(output)
        return output
some other details:
# Character embedding dimension fed into the char-CNN.
embed_dim = 50
# Checkpoint paths (Google Drive, Colab-style mount).
model_location = 'drive/MyDrive/elmo_dataset_words_lower_100/elmo_model.mdl'
optimizer_location = 'drive/MyDrive/elmo_dataset_words_lower_100/elmo_optimizer.optm'
# [out_channels, kernel_size] per Conv1d filter bank; channel counts sum to 2048.
filters_list = [[32, 1], [32, 2], [64, 3], [128, 4], [256, 5], [512, 6], [1024, 7]]
# Highway width = total conv channels (sum of the first column above).
in_dim_for_highway = np.sum(np.array(filters_list)[:, 0])
# Projection width between RNN layers and GRU hidden size.
proj_size = 512
rnn_hidden_size = 4096
Feedforward + Backward module:
# --- Training loop: iterate dataset slices, train, checkpoint after each. ---
model = ELMo(in_dim_for_highway, embed_dim, filters_list, proj_size, rnn_hidden_size)
optimizer = optim.Adam(params=model.parameters(), lr=1e-5)
# model.load_state_dict(torch.load(model_location))
# optimizer.load_state_dict(torch.load(optimizer_location))
print(summary(model))
batch_size = 4
epochs = 5 # Started by 5
# NOTE(review): sigmoid in ELMo.forward + BCELoss could be replaced by raw
# logits + BCEWithLogitsLoss for better numerical stability -- requires a
# matching model change, so only flagged here.
bce = nn.BCELoss()
new_slices = slices = pd.read_csv('drive/MyDrive/elmo_dataset_words_lower_100/slice_list.csv').drop(columns=['Unnamed: 0']) # slice 10 is for test
for slice_idx in range(len(slices)):
    slice_path = slices.iloc[slice_idx, :].values[0]
    print(f'Training ELMo on {slice_path}...')
    dataset = np.load(slice_path)
    labels = torch.Tensor(dataset['labels'].astype(np.float32)).to('cpu')
    dataset = torch.Tensor(dataset['data']).type(torch.int32).to('cpu')
    # BUG FIX / PERF: the original remapped labels with a per-element Python
    # loop whose `elif labels[idx] == 1: labels[idx] = 1` branch was a no-op.
    # A boolean mask performs the same remap (-1 -> 0) in one vectorized step.
    labels[labels == -1] = 0
    dataset_size = dataset.shape[0]
    dataset_loss = list()
    # Shuffle data and labels with the same permutation.
    idx = torch.randperm(dataset.shape[0])
    dataset = dataset[idx]
    labels = labels[idx]
    for batch in range(batch_size, dataset.shape[0] + batch_size, batch_size):
        optimizer.zero_grad()
        X = dataset[batch - batch_size:batch].to(device)
        y = labels[batch - batch_size:batch].to(device)
        # BUG FIX: squeeze(dim=1) instead of a bare squeeze(); with a final
        # partial batch of size 1, squeeze() collapses (1, 1) to a 0-d tensor
        # and BCELoss then sees mismatched shapes against y of shape (1,).
        output = model(X).squeeze(dim=1)
        loss = bce(output, y)
        loss.backward()
        optimizer.step()
        # Diagnostic: mean gradient of the first parameter (char embedding).
        print(torch.mean(list(model.parameters())[0].grad))
        loss_value = loss.item()
        dataset_loss.append(loss_value)
        print(f'Batch: {batch} - Loss: {loss_value} - Dataset size: {dataset_size}')
        print('---------------------')
    # Checkpoint model + optimizer after every slice.
    torch.save(model.state_dict(), model_location)
    torch.save(optimizer.state_dict(), optimizer_location)
    print(f'Dataset slice: {slice_path} - Loss: {np.mean(dataset_loss)}')
    print(f'Trained model saved in {model_location}')
    print(f'Optimizer saved in {optimizer_location}')
    print('---------------------')
    # Drop the finished slice from the to-do list and persist it, so a
    # restarted run resumes with the remaining slices.
    new_slices = new_slices.drop(index=slice_idx)
    new_slices.to_csv('drive/MyDrive/elmo_dataset_words_lower_100/slice_list.csv')
    del X, y, dataset, labels, output
    collect()
I examined every hyper-parameter you think (batch size, learning rate, activation functions, projection size and etc) and checked labels.
What is the problem? I think there is a mistake in how I am using PyTorch modules, e.g. autograd…