Hi @sh0416, here is some sample code I came across while browsing the AllenNLP guide and source code. I hope this points you in the right direction. Sorry, I just pasted their example code.
from typing import Dict, Iterable, List

from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, WhitespaceTokenizer


class ClassificationTsvReader(DatasetReader):
    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_tokens: int = None):
        super().__init__(lazy)
        self.tokenizer = tokenizer or WhitespaceTokenizer()
        self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self.max_tokens = max_tokens

    def _read(self, file_path: str) -> Iterable[Instance]:
        # Each line of the TSV is "text<TAB>sentiment"; yield one Instance per line.
        with open(file_path, 'r') as lines:
            for line in lines:
                text, sentiment = line.strip().split('\t')
                tokens = self.tokenizer.tokenize(text)
                if self.max_tokens:
                    tokens = tokens[:self.max_tokens]
                text_field = TextField(tokens, self.token_indexers)
                label_field = LabelField(sentiment)
                fields = {'text': text_field, 'label': label_field}
                yield Instance(fields)


dataset_reader = ClassificationTsvReader(max_tokens=64)
# Materialize the result in case read() returns a lazy iterable before slicing.
instances = list(dataset_reader.read("quick_start/data/movie_review/train.tsv"))
for instance in instances[:10]:
    print(instance)
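If it helps, the next step the guide takes before building a model is constructing a Vocabulary from the instances you just read; here is a minimal sketch, assuming the instances list from above:

from allennlp.data import Vocabulary

# Count tokens and labels across the instances so the model can look up
# vocabulary sizes for its "tokens" and "labels" namespaces.
vocab = Vocabulary.from_instances(instances)
print(vocab.get_vocab_size("labels"))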
Or this model example:
from typing import Dict

import torch

from allennlp.data import Vocabulary
from allennlp.models import Model
from allennlp.modules import TextFieldEmbedder, Seq2VecEncoder
from allennlp.nn import util


class SimpleClassifier(Model):
    def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2VecEncoder):
        super().__init__(vocab)
        self.embedder = embedder
        self.encoder = encoder
        num_labels = vocab.get_vocab_size("labels")
        self.classifier = torch.nn.Linear(encoder.get_output_dim(), num_labels)

    def forward(self,
                text: Dict[str, torch.Tensor],
                label: torch.Tensor) -> Dict[str, torch.Tensor]:
        # Shape: (batch_size, num_tokens, embedding_dim)
        embedded_text = self.embedder(text)
        # Shape: (batch_size, num_tokens)
        mask = util.get_text_field_mask(text)
        # Shape: (batch_size, encoding_dim)
        encoded_text = self.encoder(embedded_text, mask)
        # Shape: (batch_size, num_labels)
        logits = self.classifier(encoded_text)
        # Shape: (batch_size, num_labels)
        probs = torch.nn.functional.softmax(logits, dim=-1)
        # Shape: (1,)
        loss = torch.nn.functional.cross_entropy(logits, label)
        return {'loss': loss, 'probs': probs}
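And this is roughly how the guide wires the two pieces together. A minimal sketch only: the embedding size and the bag-of-embeddings encoder are my assumptions here, so swap in whatever embedder/encoder fits your setup:

from allennlp.modules.token_embedders import Embedding
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder

# Embed each token id from the "tokens" namespace, then pool the token
# embeddings into one vector per sentence before the linear classifier.
embedding_dim = 10
embedder = BasicTextFieldEmbedder(
    {"tokens": Embedding(embedding_dim=embedding_dim,
                         num_embeddings=vocab.get_vocab_size("tokens"))})
encoder = BagOfEmbeddingsEncoder(embedding_dim=embedding_dim)
model = SimpleClassifier(vocab, embedder, encoder)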