Any help please
lass BertEmbedding(torch.nn.Module):
"""BERT model for classification.
This module is composed of the BERT model with a linear layer on top of
the pooled output.
"""
def __init__(self,path="asafaya/bert-base-arabic" ):
super(BertEmbedding, self).__init__()
self.model_config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
self.bert = BertModel.from_pretrained(path)
self.dropout = torch.nn.Dropout(self.model_config.hidden_dropout_prob)
self.embedding2 = torch.nn.Linear(self.model_config.hidden_size, self.model_config.vocab_size_or_config_json_file)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
_,pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
pooled_output = self.dropout(pooled_output)
logits = self.embedding2(pooled_output)
loss = None
if labels is not None:
loss_fct = torch.nn.CrossEntropyLoss()
#Cross Entropy input.shape= [batch_size,num_labels] ,labels.shape = [batch_size]
loss = loss_fct(logits,labels.view(-1))
return loss,logits
def freeze_bert_encoder(self):
for param in self.bert.parameters():
param.requires_grad = False
def unfreeze_bert_encoder(self):
for param in self.bert.parameters():
param.requires_grad = True
def save_pretrained(self,path):
self.bert.save_pretrained(path)
path_tokenizer = models_path+"tokenizer/"
if not os.path.exists(path_tokenizer):
os.makedirs(path_tokenizer)
tokenizer = BertTokenizer.from_pretrained('asafaya/bert-base-arabic', do_lower_case=True)
tokenizer.save_pretrained(path_tokenizer)
else:
tokenizer = BertTokenizer.from_pretrained(path_tokenizer, do_lower_case=True)
path_asafaya = models_path+"asafaya_arabic_model/"
if not os.path.exists(path_asafaya):
os.makedirs(path_asafaya)
model_embed = BertEmbedding('asafaya/bert-base-arabic')
model_embed.save_pretrained(path_asafaya)
else:
model_embed = BertEmbedding(path_asafaya)
# model_embed.to(device)
# load model
#embed_epoch = 1
from transformers import AutoTokenizer, AutoModel
model_embed = AutoModel.from_pretrained("/content/drive/MyDrive/Models/asafaya_arabic_model")
#corpus = pre_process_corpus(df_corpus.values)
dataset_cbow = CorpusDataset(df_corpus.values,tokenizer,context_size = 2,preprocess=None,transform = None)
print("Dataset training samples :",len(dataset_cbow))
print(dataset_cbow[0])
Tokenizer Loading
Length = 26000
Loading Tokenizer: 0 %
Context vector Loading
Length = 26000
Loading Context vector: 0 %
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Features Loading
Length = 366602
Loading Features: 0 %
/usr/local/lib/python3.6/dist-packages/transformers/tokenization_utils_base.py:2179: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).
FutureWarning,
Loading Features: 27 %
Loading Features: 54 %
Loading Features: 81 %
Dataset training samples : 366602
(tensor([ 2, 1841, 29633, 3, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0]), tensor([1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), tensor(16653))
import gc
del df_unlabeled
del df_corpus
gc.collect()
358
train_dataloader ,validation_dataloader = prepare_data(dataset_cbow,0.99,64) #dataset_cbow
training samples 362935
validation samples 3667
model_embed,training_loss_embed,validation_loss_embed = training_model(model_embed,"embed",train_dataloader,validation_dataloader,epochs=10)
Epoch: 1
64/362935: [>...............................] - ETA 0.0stensor([[ 2, 3180, 28766, ..., 0, 0, 0],
[ 2, 6, 38, ..., 0, 0, 0],
[ 2, 4850, 2606, ..., 0, 0, 0],
...,
[ 2, 8340, 1023, ..., 0, 0, 0],
[ 2, 12306, 31019, ..., 0, 0, 0],
[ 2, 6428, 2538, ..., 0, 0, 0]])
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-55-2b260c42c8ed> in <module>()
----> 1 model_embed,training_loss_embed,validation_loss_embed = training_model(model_embed,"embed",train_dataloader,validation_dataloader,epochs=10)
1 frames
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
TypeError: forward() got an unexpected keyword argument 'labels'