I am trying to generate text using a sequence to sequence model in Pytorch from the Apocalypse Now movie script. My code is almost verbatim to this one.
The following is the model, a character level LSTM.
class CharLSTM(nn.Module):
def __init__(self, chars, hidden_size, n_layers=2, drop_out=0.5, lr= 0.001):
super(CharLSTM, self).__init__()
# Set all the hyperparameters of your network
self.hidden_size = hidden_size
self.n_layers = n_layers
self.drop_out = drop_out
self.lr = lr
# set vocabulary and get indices for these
self.chars = chars
self.int2char = dict(enumerate(self.chars))
self.char2int = {w : i for i, w in self.int2char.items()}
# define the lstm network, this outputs the next char and cell state and hidden state
self.lstm = nn.LSTM(input_size=len(self.chars), hidden_size=hidden_size, num_layers=n_layers, dropout=drop_out,
batch_first=True)
# add dropout
self.drop_out = nn.Dropout(drop_out)
self.fc = nn.Linear(hidden_size, len(chars))
self.init_weights()
def forward(self, x, h_0):
"""compute current and hidden units, stack outputs and pass it to linear layer"""
x, (h, c) = self.lstm(x, h_0)
x = self.drop_out(x)
x = x.view(x.size()[0]*x.size()[1], self.hidden_size)
x = self.fc(x)
return x, (h, c)
def predict(self, char, h=None, cuda=False, top_k=None):
"""given a character, predict the next character in the sequence."""
if cuda():
self.cuda()
else:
self.cpu()
# Initialize hidden state
if h is None:
h = self.init_hidden(1)
# get the integer of character.
ch = np.array([[self.char2int[char]]])
# one_hot_encode
one_hot = one_hot_encode(ch, len(self.chars))
# convet to tensor
one_hot_torch = torch.from_numpy(one_hot)
if cuda():
one_hot_torch = one_hot_torch.cuda()
# create a tuple of the hidden state
# this is what LSTM expects
h = tuple([each.data for each in h])
out, h = self.forward(ch, h)
# Prob distribution over all the characters
probs = F.softmax(out, dim=1).data
# convert back prob to cpu if model was
# set to gpu
if cuda():
probs = probs.cpu()
# if top number of preds to get
# wasn't pass, take the distribution over whole character length
if top_k is None:
top_ch = np.arange(len(self.chars))
else:
probs, top_ch = probs.topk(top_k)
top_ch = top_ch.numpy().squeeze()
# reduce dims of size 1
probs = probs.numpy().squeeze()
# sample from top_char with each probs of each character being
# the char with higher prob will be chosen since dividing the
# highest value with probs.sum() is what is the best.
char = np.random.choice(top_ch, p=probs/probs.sum())
return self.char2int[char], h
def init_weights(self):
initrange = 0.1
self.fc.bias.data.fill_(0)
self.fc.weight.data.uniform_(-1, 1)
def init_hidden(self, n_seqs):
weight = next(self.parameters()).data
return (weight.new(self.n_layers, n_seqs, self.hidden_size).zero_(),
weight.new(self.n_layers, n_seqs, self.hidden_size).zero_())
instantiating an object of this class
model = CharLSTM(chars, hidden_size=512, n_layers=2)
model
CharLSTM(
(lstm): LSTM(58, 512, num_layers=2, batch_first=True, dropout=0.5)
(drop_out): Dropout(p=0.5)
(fc): Linear(in_features=512, out_features=58, bias=True)
)
The following is my train function where I send my inputs, targets for train and validation datasets, as well as my model.
def train(model, data, epochs=10, n_seqs=10, n_steps=40, lr=0.001, clip=5, val_frac= 0.1, cuda=False,
print_every=10):
""" model: the model to b trained
data: the data on which we train
epochs: number of epochs to train for
n_seqs" number of sequences in our batch
n_steps: time step for each sequence
lr: learning rate
clip: value used to clip the network gradient to prevent exploding gradeint.
val_frac: the fraction of data used for validation
print_every: the number of seconds for which we print out model statistics
"""
# change model to train mode
**model.train()**
# define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
# trin and validation split
val_idx = int(len(data)*(1-val_frac))
data, val_data = data[:val_idx], data[val_idx:]
**if cuda:**
** model.cuda()**
counter = 0
n_chars = len(model.chars)
# loop over epochs
for epoch in range(epochs):
# initialize hidden layer of the model
h = model.init_hidden(n_seqs)
# loop over batches
for x, y in get_batches(data, n_seqs, n_steps):
counter += 1
# one hot encode
x = one_hot_encode(x, n_chars)
# convert to tensors
inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
# move inputs and targets to cuda
**inputs, targets = inputs.cuda(), targets.cuda()**
# New hidden state being created to prevented backpropogating through the
# entire history
h = tuple([each.data for each in h])
# zero out gradient to prevent accumulation
model.zero_grad()
# get output and hidden
out, h = model.forward(inputs, h)
loss = criterion(out, targets.view(n_seqs*n_steps).type(torch.cuda.LongTensor))
# backpropogate loss
loss.backward()
# use gradient clipping to prevent exploding gradient
nn.utils.clip_grad_norm_(model.parameters(), clip)
# take a step in the los surface
optimizer.step()
if counter % print_every ==0:
# initilize hidden state for validation
val_hidden = model.init_hidden(n_seqs)
val_losses = []
for x, y in get_batches(val_data, n_seqs, n_steps):
x = one_hot_encode(x, n_chars)
x, y = torch.from_numpy(x), torch.from_numpy(y)
val_hidden = tuple([each.data for each in val_hidden])
inputs, targets = x, y
if cuda:
**inputs, targets = inputs.cuda(), targets.cuda()**
out, val_hidden = model.forward(inputs, val_hidden)
val_loss = criterion(out, targets.view(n_seqs*n_steps).type(torch.LongTensor))
val_losses.append(val_loss.item())
print('Epoch:'.format(epoch+1),
'Steps:'.format(counter),
'train loss {.:4f}'.format(loss.item),
'val loss {.:4f}'.format(np.mean(val_losses)))
running this with batch_size, i.e., n_seqs= 128
the sequence length, which here is n_steps, in the sense of time steps = 100
# define batch_size and sequence length
n_seqs, n_steps = 128, 100
train(model, encoded, epochs=25, n_seqs=n_seqs, n_steps=n_steps, lr=0.001, cuda=True, print_every=10)
This throws the followin error
RuntimeError Traceback (most recent call last)
in ()
1 n_seqs, n_steps = 128, 100
----> 2 train(model, encoded, epochs=25, n_seqs=n_seqs, n_steps=n_steps, lr=0.001, cuda=True, print_every=10)
4 frames
/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
1869 .format(input.size(0), target.size(0)))
1870 if dim == 2:
-> 1871 ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
1872 elif dim == 4:
1873 ret = torch._C._nn.nll_loss2d(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
RuntimeError: Expected object of backend CUDA but got backend CPU for argument #2 ‘target’
A quick check of whether the model parameters were registered by quora returns false.
next(model.parameters()).is_cuda
False
Working in the colab environment.
torch.cuda.get_device_name()
'Tesla T4'
Which is perplexing since I moved my parameters to cuda. I don’t get where I am going wrong. Please help, thank you so much for your time.