Error when trying to use the gpu-trained lstm model on cpu

I have trained a lstm word-level language generation model on gpu; here I want to use the model for generating some sentence on cpu. I have used the suggestion provided in https://discuss.pytorch.org/t/on-a-cpu-device-how-to-load-checkpoint-saved-on-gpu-device/349, but it reports strange error:IndexError: list index out of range. (Everything is fine on GPU)

generating_samples = True
start_word = "I"
class LSTM_Word_Model(nn.Module):
    def __init__(self, size, embedding, vocab_size):
        super(LSTM_Word_Model, self).__init__()
        self.size = size
        self.embedding = embedding
        # If you want to retrieve the pretrained embedding using frozen parameters, try 
        # self.embedding = nn.Embedding.from_pretrained(embedding.weight)
        # self.cos=nn.CosineSimilarity(dim=1)

        self.lstm_0 = nn.LSTM(self.size, self.size)
        self.lstm_1 = nn.LSTM(self.size, self.size)
        self.lstm_2 = nn.LSTM(self.size, self.size)

        self.out = nn.Linear(self.size, vocab_size)

    def forward(self, chars, hidden=None):
        if hidden is None:
            hidden = [(torch.randn(1, 1, self.size).cuda(), torch.randn(1, 1, self.size).cuda()),
                      (torch.randn(1, 1, self.size).cuda(), torch.randn(1, 1, self.size).cuda()),
                      (torch.randn(1, 1, self.size).cuda(), torch.randn(1, 1, self.size).cuda())]
        chars = chars.view((-1, 1))
        chars = self.embedding(chars).view((-1, self.size)).float()
        out_0, new_hidden_0 = self.lstm_0(chars.view(chars.shape[0], 1, -1), hidden[0])
        out_1, new_hidden_1 = self.lstm_1(out_0.view(out_0.shape[0], 1, -1), hidden[1])
        out_2, new_hidden_2 = self.lstm_2(out_1.view(out_1.shape[0], 1, -1), hidden[2])

        probs = self.out(out_1.view(out_2.shape[0], out_2.shape[2]))
        probs = F.log_softmax(probs, 1)
        return probs, [new_hidden_0, new_hidden_1, new_hidden_2]

def generate_samples(model, word, index=True):
    with torch.no_grad():
        ''' prob, H = model.forward(word_to_ix[word])'''
        print(word)
        prob, H = model.forward(torch.tensor([[word]]))

    if index:
        print(len(word_to_ix))
        print(prob.shape)
        prob = torch.exp(prob)
        prob = prob.to(torch.device("cpu")).numpy()
        
        print(np.amin(prob))
        next_word_index = np.random.choice(len(word_to_ix.keys()), 1, p=prob[0, :])
        return next_word_index
    else:
        ix_to_word = list(word_to_ix.keys())
        next_word = np.random.choice(ix_to_word, 1, prob)
        return next_word


lstm_words_model = torch.load(save_dir + '/lstm_model', map_location=lambda storage, loc: storage)

if generating_samples:
    print("start generating examples")
    lstm_words_model.eval()
    word = word_to_ix[start_word]
    data = list()
    data.append(word)
    for _ in range(len_data):
        word = generate_samples(lstm_words_model, word, index=True)
        data.append(word)
    data = np.asarray(data)
    pickle.dump(data, open(data_dir + "/data", "wb"))

Traceback (most recent call last):
  File "/Users/TONY/Documents/language_generation/words_generation_model.py", line 369, in <module>
    word = generate_neg_samples(lstm_words_model, word, index=True)
  File "/Users/TONY/Documents/language_generation/words_generation_model.py", line 345, in generate_neg_samples
    prob, H = model.forward(torch.tensor([[word]]))
  File "/Users/TONY/anaconda/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 110, in forward
    inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
  File "/Users/TONY/anaconda/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 121, in scatter
    return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
  File "/Users/TONY/anaconda/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 36, in scatter_kwargs
    inputs = scatter(inputs, target_gpus, dim) if inputs else []
  File "/Users/TONY/anaconda/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 29, in scatter
    return scatter_map(inputs)
  File "/Users/TONY/anaconda/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 16, in scatter_map
    return list(zip(*map(scatter_map, obj)))
  File "/Users/TONY/anaconda/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 14, in scatter_map
    return Scatter.apply(target_gpus, None, dim, obj)
  File "/Users/TONY/anaconda/lib/python3.6/site-packages/torch/nn/parallel/_functions.py", line 73, in forward
    streams = [_get_stream(device) for device in ctx.target_gpus]
  File "/Users/TONY/anaconda/lib/python3.6/site-packages/torch/nn/parallel/_functions.py", line 73, in <listcomp>
    streams = [_get_stream(device) for device in ctx.target_gpus]
  File "/Users/TONY/anaconda/lib/python3.6/site-packages/torch/nn/parallel/_functions.py", line 100, in _get_stream
    if _streams[device] is None:
IndexError: list index out of range