Normally a Python traceback points at the line that actually raised the error or exception. But in my code the line it points at, a print statement, appears to execute fine, and only afterwards is the exception thrown:

169      print("total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset): ", total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset))

This seems weird. In the partial output below, that exact print succeeds twice (once after each epoch's validation pass), yet the traceback blames the same line 169 when evaluate(test_dl) is called at line 181. My current guess is that the exception is raised while print's arguments are being evaluated, i.e. inside len(dataloader.dataset), so for the failing call the print never actually runs; the earlier successful prints came from valid_dl, whose dataset is a plain Python list, while test_dl wraps the raw IMDB datapipe. Is that right, and if so, why does the datapipe have no valid length?
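Here is a toy illustration of the pattern I mean, in plain Python with no torch involved (show() is just a made-up helper for this example): the same source line can print fine on one call and raise on another, because the exception comes from evaluating print's arguments, not from print itself.

    def show(x):
        # len(x) is evaluated before print runs, so if len() raises,
        # the print on this line never executes for that call
        print("len:", len(x))

    show([1, 2, 3])              # prints: len: 3
    show(i for i in range(3))    # TypeError: object of type 'generator' has no len()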
Here is the partial output, followed by the full code I used from my study book:
Partial output:
tensor([[[ 0.7039, -0.8321, -0.4651],
         [-0.3203,  2.2408,  0.5566],
         [-0.4643,  0.3046,  0.7046],
         [-0.7106, -0.2959,  0.8356]],

        [[-0.4643,  0.3046,  0.7046],
         [ 0.0946, -0.3531,  0.9124],
         [-0.3203,  2.2408,  0.5566],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)
total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset): 0.5034 0.6918227579116821
Epoch 0 accuracy: 0.6008 val_accuracy: 0.5034
total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset): 0.6958 0.5821398535728455
Epoch 1 accuracy: 0.6647 val_accuracy: 0.6958
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/datapipes/datapipe.py", line 335, in __len__
    return len(self._datapipe)
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/datapipes/iter/grouping.py", line 47, in __len__
    return len(self.source_datapipe) // self.num_of_instances +\
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/datapipes/iter/combinatorics.py", line 140, in __len__
    return len(self.datapipe)
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/datapipes/iter/callable.py", line 122, in __len__
    "{} instance doesn't have valid length".format(type(self).__name__)
TypeError: MapperIterDataPipe instance doesn't have valid length

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "p520-rnnmodel-build.py", line 181, in <module>
    acc_test, _ = evaluate(test_dl)
  File "p520-rnnmodel-build.py", line 169, in evaluate
    print("total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset): ", total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset))
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/datapipes/datapipe.py", line 338, in __len__
    "{} instance doesn't have valid length".format(type(self).__name__)
TypeError: _IterDataPipeSerializationWrapper instance doesn't have valid length
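To narrow it down, here is a minimal check I would run outside the training loop. This is my own sketch, assuming the datapipe-backed IMDB that my torchtext version returns; if my guess is right, len() fails on the raw datapipe but works once it is materialized into a list:

    from torchtext.datasets import IMDB

    test_dataset = IMDB(split='test')
    try:
        # if my guess is right, this raises the same
        # "doesn't have valid length" TypeError
        print(len(test_dataset))
    except TypeError as e:
        print('len() failed:', e)

    # fine once materialized (the IMDB test split has 25000 reviews)
    print(len(list(test_dataset)))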
Full code:
[root@localhost ch15]# cat -n p520-rnnmodel-build.py
  1  import torch
  2  import torch.nn as nn
  3  import code
  4
  5  from torchtext.datasets import IMDB
  6  train_dataset = IMDB(split='train')
  7  test_dataset = IMDB(split='test')
  8
  9  # 1. create dataset
 10  from torch.utils.data.dataset import random_split
 11
 12  torch.manual_seed(1)
 13  train_dataset, valid_dataset = random_split(list(train_dataset), [20000, 5000])
 14
 15  # 2. find unique tokens
 16
 17  import re
 18  from collections import Counter, OrderedDict
 19
 20  def tokenizer(text):
 21      text = re.sub('<[^>]*>', '', text)
 22      emoticons = re.findall(
 23          r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
 24      text = re.sub(r'[\W]+', ' ', text.lower()) + \
 25          ' '.join(emoticons).replace('-', '')
 26
 27      tokenized = text.split()
 28      return tokenized
 29
 30
 31  token_counts = Counter()
 32  for label, line in train_dataset:
 33      tokens = tokenizer(line)
 34      token_counts.update(tokens)
 35  print('Vocab-size:', len(token_counts))
 36
 37  # 3. encode each unique token as an integer
 38
 39  from torchtext.vocab import vocab
 40  sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
 41  ordered_dict = OrderedDict(sorted_by_freq_tuples)
 42  vocab = vocab(ordered_dict)
 43  vocab.insert_token('<pad>', 0)
 44  vocab.insert_token('<unk>', 1)
 45  vocab.set_default_index(1)
 46
 47  print([vocab[token] for token in ['this', 'is', 'an', 'example']])
 48
 49  # 3a. define the functions for transformation
 50
 51  text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
 52  label_pipeline = lambda x: 1. if x == 'pos' else 0.
 53
 54  # 3b. wrap the encode and transformation functions
 55
 56  def collate_batch(batch):
 57      label_list, text_list, lengths = [], [], []
 58      for _label, _text in batch:
 59          label_list.append(label_pipeline(_label))
 60          processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
 61          text_list.append(processed_text)
 62          lengths.append(processed_text.size(0))
 63
 64      label_list = torch.tensor(label_list)
 65      lengths = torch.tensor(lengths)
 66      padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first=True)
 67
 68      return padded_text_list, label_list, lengths
 69
 70  # Take a small batch
 71
 72  from torch.utils.data import DataLoader
 73  dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)
 74
 75  text_batch, label_batch, length_batch = next(iter(dataloader))
 76  print(text_batch)
 77  print(label_batch)
 78  print(length_batch)
 79
 80  batch_size = 32
 81  train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
 82  valid_dl = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
 83  test_dl = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
 84
 85  embedding = nn.Embedding(num_embeddings=10, embedding_dim=3, padding_idx=0)
 86
 87  # a batch of 2 samples of 4 indices each
 88
 89  text_encoded_input = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 0]])
 90  print(embedding(text_encoded_input))
 91
 92  '''
 93  class RNN(nn.Module):
 94      def __init__(self, input_size, hidden_size):
 95          super().__init__()
 96          self.rnn = nn.RNN(input_size, hidden_size, num_layers=2, batch_first=True)
 97          # self.rnn = nn.GRU(input_size, hidden_size, num_layers=2, batch_first=True)
 98          # self.rnn = nn.LSTM(input_size, hidden_size, num_layers=2, batch_first=True)
 99          self.fc = nn.Linear(hidden_size, 1)
100
101      def forward(self, x):
102          _, hidden = self.rnn(x)
103
104          # we use the final hidden state from the last hidden layer
105          # as the input to the fully connected layer
106
107          out = hidden[-1, :, :]
108          out = self.fc(out)
109          return out
110
111  model = RNN(64, 32)
112  print(model)
113  model(torch.randn(5, 3, 64))
114
115  '''
116  class RNN(nn.Module):
117      def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
118          super().__init__()
119          self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
120          self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)
121          self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
122          self.relu = nn.ReLU()
123          self.fc2 = nn.Linear(fc_hidden_size, 1)
124          self.sigmoid = nn.Sigmoid()
125
126      def forward(self, text, lengths):
127          out = self.embedding(text)
128          out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
129          out, (hidden, cell) = self.rnn(out)
130          out = hidden[-1, :, :]
131          out = self.fc1(out)
132          out = self.relu(out)
133          out = self.fc2(out)
134          out = self.sigmoid(out)
135          return out
136
137  vocab_size = len(vocab)
138  embed_dim = 20
139  rnn_hidden_size = 64
140  fc_hidden_size = 64
141  torch.manual_seed(1)
142  model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
143
144  def train(dataloader):
145      model.train()
146      total_acc, total_loss = 0, 0
147      for text_batch, label_batch, lengths in dataloader:
148          optimizer.zero_grad()
149          pred = model(text_batch, lengths)[:, 0]
150          loss = loss_fn(pred, label_batch)
151          loss.backward()
152          optimizer.step()
153          total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
154          total_loss += loss.item() * label_batch.size(0)
155
156      return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)
157
158  def evaluate(dataloader):
159      model.eval()
160      total_acc, total_loss = 0, 0
161      with torch.no_grad():
162          for text_batch, label_batch, lengths in dataloader:
163              pred = model(text_batch, lengths)[:, 0]
164              loss = loss_fn(pred, label_batch)
165              total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
166              total_loss += loss.item() * label_batch.size(0)
167
168      #code.interact(local=locals())
169      print("total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset): ", total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset))
170      return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)
171
172  loss_fn = nn.BCELoss()
173  optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
174  num_epochs = 2
175  torch.manual_seed(1)
176  for epoch in range(num_epochs):
177      acc_train, loss_train = train(train_dl)
178      acc_valid, loss_valid = evaluate(valid_dl)
179      print(f'Epoch {epoch} accuracy: {acc_train:.4f}'
180            f' val_accuracy: {acc_valid:.4f}')
181  acc_test, _ = evaluate(test_dl)
182  print(f'test accuracy: {acc_test:.4f}')
183
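For what it's worth, the only workaround I have come up with so far is my own sketch, not something from the book: make evaluate() count the samples itself instead of calling len(dataloader.dataset), which the datapipe apparently cannot answer. This is a drop-in replacement for evaluate() in the script above (it relies on the script's model and loss_fn):

    def evaluate(dataloader):
        model.eval()
        total_acc, total_loss, n_samples = 0.0, 0.0, 0
        with torch.no_grad():
            for text_batch, label_batch, lengths in dataloader:
                pred = model(text_batch, lengths)[:, 0]
                loss = loss_fn(pred, label_batch)
                total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
                total_loss += loss.item() * label_batch.size(0)
                n_samples += label_batch.size(0)  # count here instead of len(dataset)
        return total_acc / n_samples, total_loss / n_samples

But I would still like to understand why the original version fails the way it does.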