I tried to implement a character-level RNN and used cross-entropy loss as my criterion.
Here’s my RNN module:
import random
import string
import unidecode
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.encoder = nn.Embedding(input_size, hidden_size)   # char index -> embedding
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.decoder = nn.Linear(hidden_size, output_size)     # hidden state -> char logits

    def forward(self, x, hidden_state):
        x = self.encoder(x)
        out, hidden_state = self.gru(x.view(1, -1), hidden_state)
        print(out.shape)
        out = self.decoder(out)
        print(out.shape)
        return out, hidden_state

    def init_hidden(self):
        return Variable(torch.zeros(self.n_layers, self.hidden_size)).to(device='cuda')
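In case it helps, this is roughly how I build the model and what the two print() calls in forward show for a single character (char2tensor, n_characters, and hidden_size are defined further down; the exact input character here is just an example):

model = RNN(n_characters, hidden_size, n_characters).to(device='cuda')
hidden = model.init_hidden()                 # shape (1, 128)
inp = char2tensor('a').to(device='cuda')     # shape (1,)
out, hidden = model(inp, hidden)             # forward prints torch.Size([1, 128]) and then torch.Size([1, 100])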
Here are my training function, evaluation function, and the other helpers:
def evaluate(model, starter_char, hidden_state, predict_len=100, temperature=0.8, device='cuda'):
    model.eval()
    input_tensor = char2tensor(starter_char).to(device=device)
    predict_chars = starter_char
    for i in range(predict_len):
        output, hidden_state = model(input_tensor, hidden_state)
        # sample an index from the softmax probs
        probs = torch.exp(output.data.view(-1).div(temperature)) / torch.sum(output.data.view(-1).div(temperature))
        idx = torch.multinomial(probs, 1)[0]
        # append the sampled char to the output and feed it back as the next input
        prediction = all_characters[idx]
        predict_chars += prediction
        input_tensor = char2tensor(prediction)
    return predict_chars
def train(model, optimizer, chunk_len, loss_his, device='cuda',
          epoch=1000, print_every=100, plot_every=100):
    hidden_state = model.init_hidden()
    model.to(device=device)
    criterion = F.cross_entropy
    loss = torch.tensor([0.0], requires_grad=True)
    for i in range(epoch):
        model.train()
        inp, target = training_set(chunk_len)
        inp = inp.to(device=device)
        target = target.to(device=device)
        for c in range(chunk_len):
            output, hidden_state = model(inp[c], hidden_state)
            loss += criterion(output, target[c])
        loss /= chunk_len
        # clear out the gradients in case they accumulate
        optimizer.zero_grad()
        # backward pass for the loss
        loss.backward()
        # take the gradient step
        optimizer.step()
        with torch.no_grad():
            if i % print_every == 0:
                starter_idx = np.random.choice(n_characters)
                starter_char = all_characters[starter_idx]
                print(f"Current Loss:{loss.item()}")
                print(f"Input letter:{starter_char}")
                print("--------------------------------------------")
                text = evaluate(model, starter_char, hidden_state)
                print(text)
                print("--------------------------------------------")
            if i % plot_every == 0:
                loss_his.append(loss.item() / plot_every)
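And this is roughly how I kick off training (the optimizer, learning rate, and chunk_len here are just example values, using the constants and helpers defined below; they don't matter for the error):

model = RNN(n_characters, hidden_size, n_characters, n_layers=1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
loss_his = []
train(model, optimizer, chunk_len=200, loss_his=loss_his, epoch=1000)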
all_characters = string.printable
n_characters = len(all_characters)
hidden_size = 128
file = unidecode.unidecode(open("data/data.txt").read())
file_len = len(file)
def random_chunk(chunk_len):
    start_index = random.randint(0, file_len - chunk_len)
    end_index = start_index + chunk_len + 1
    return file[start_index:end_index]

def char2tensor(inputs):
    tensor = torch.zeros(len(inputs)).long()
    for c in range(len(inputs)):
        tensor[c] = all_characters.index(inputs[c])
    return Variable(tensor)
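training_set isn't shown above; it's essentially the usual next-character pairing built on top of random_chunk and char2tensor, roughly this sketch:

def training_set(chunk_len):
    # sketch: the target is the input chunk shifted one character ahead
    chunk = random_chunk(chunk_len)    # chunk_len + 1 characters
    inp = char2tensor(chunk[:-1])      # 1-D LongTensor of length chunk_len
    target = char2tensor(chunk[1:])    # 1-D LongTensor of length chunk_len
    return inp, target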
And when I try to run this, the following error occurs:
ValueError: Expected input batch_size (1) to match target batch_size (0).
When I tried to debug it, it seems that the output in the train function has size torch.Size([1, 100]), while target[c] is just a scalar. And whenever I try to fix it with view() or squeeze(), I get a message that it's a leaf Variable and cannot be used in an in-place operation.
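I think the mismatch boils down to something like this (shapes only, not my real data; the index 42 is just a placeholder):

output = torch.randn(1, n_characters)   # same shape the decoder returns for one character: (1, 100)
target_c = torch.tensor(42)             # 0-dimensional, like target[c]
print(output.shape, target_c.shape)     # torch.Size([1, 100]) torch.Size([])
loss = F.cross_entropy(output, target_c)  # a batch of 1 paired with a target that has no batch dimension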
Is there anything I can do to fix it?