Unable to load model on PyTorch XLA device

I was using torch_xla and tried to debug my program manually like this:

import torch_xla.core.xla_model as xm
from torch.utils.data.distributed import DistributedSampler

def fit(model, train_dataset, val_dataset, epochs=1, batch_size=32, warmup_prop=0, lr=5e-5):
    xm.master_print('1')
    device = xm.xla_device()
    xm.master_print('2')
    model = model.to(device)  # <-- execution never gets past this line
    xm.master_print('loading train loader')
    train_sampler = DistributedSampler(
        train_dataset,
        num_replicas=xm.xrt_world_size(),  # total number of TPU cores
        rank=xm.get_ordinal(),             # index of this process's core
        shuffle=True
    )
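The rest of fit follows the usual torch_xla data-loading pattern; roughly, as a sketch (the optimizer and the loop body here are placeholders, not my exact code):

import torch_xla.distributed.parallel_loader as pl
from torch.utils.data import DataLoader

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=train_sampler,
    drop_last=True,
)

# ParallelLoader feeds batches to the TPU core owned by this process
para_loader = pl.ParallelLoader(train_loader, [device])
for batch in para_loader.per_device_loader(device):
    # ... forward pass, loss, backward ...
    xm.optimizer_step(optimizer)  # steps and syncs gradients across cores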



import torch_xla.distributed.xla_multiprocessing as xmp

def fit_multiprocessing(rank, flags):
    # model, train_dataset, etc. come from the parent process's globals (fork start method)
    fit(model, train_dataset, val_dataset, epochs=epochs, batch_size=batch_size, warmup_prop=warmup_prop, lr=lr)
    
FLAGS = {}
xmp.spawn(fit_multiprocessing, args=(FLAGS,), nprocs=8, start_method='fork')

After printing 1 and 2, the training process gets stuck forever, which means the program never finishes executing this line of code: model = model.to(device)
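To separate an XLA-runtime problem from a model problem, a minimal check (a sketch, independent of the model code below) is to move a small tensor instead of the whole model:

import torch
import torch_xla.core.xla_model as xm

device = xm.xla_device()
t = torch.randn(2, 2).to(device)  # if even this hangs, the XLA runtime/TPU is the problem, not the model
print(t)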

Here is the code used for creating the model for TPU training:

import torch
import torch.nn as nn
from transformers import XLMRobertaConfig, XLMRobertaModel, XLMRobertaTokenizer

MODEL_PATHS = {
    'xlmr': '../input/xlm-roberta-base/',
}

TRANSFORMERS = {
    'xlmr': (XLMRobertaModel, XLMRobertaTokenizer, XLMRobertaConfig)
}

xlmr_path = "/kaggle/input/xlm-roberta-base/"
xlmr = XLMRobertaModel.from_pretrained(xlmr_path)
#xlmr.eval()  # disable dropout (or leave in train mode to finetune)

class Transformer(nn.Module):
    def __init__(self, model, num_classes=1):
        """
        Constructor
        
        Arguments:
            model {string} -- Transformer to build the model on. Expects "xlmr".
            num_classes {int} -- Number of classes (default: {1})
        """
        super().__init__()
        self.name = model

        model_class, tokenizer_class, pretrained_weights = TRANSFORMERS[model]

        bert_config = XLMRobertaConfig.from_json_file(MODEL_PATHS[model] + 'config.json')
        bert_config.output_hidden_states = True

        # load the pretrained weights; XLMRobertaModel(bert_config) alone would
        # build the architecture with randomly initialized weights
        self.transformer = XLMRobertaModel.from_pretrained(MODEL_PATHS[model], config=bert_config)

        self.nb_features = self.transformer.pooler.dense.out_features

        self.pooler = nn.Sequential(
            nn.Linear(self.nb_features, self.nb_features), 
            nn.Tanh(),
        )

        self.logit = nn.Linear(self.nb_features, num_classes)

    def forward(self, tokens):
        """
        Usual torch forward function
        
        Arguments:
            tokens {torch tensor} -- Sentence tokens
        
        Returns:
            torch tensor -- Class logits
        """
        # with output_hidden_states=True, transformers (pre-4.x) returns a tuple:
        # (sequence_output, pooled_output, hidden_states)
        _, _, hidden_states = self.transformer(
            tokens, attention_mask=(tokens > 0).long()
        )

        hidden_states = hidden_states[-1][:, 0] # Use the representation of the first token of the last layer

        ft = self.pooler(hidden_states)

        return self.logit(ft)

model = Transformer('xlmr')
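Before involving the TPU at all, the class can be sanity-checked with a CPU-only forward pass on dummy token ids (a sketch; the tuple unpacking in forward assumes the older transformers behavior used above):

tokens = torch.randint(1, 100, (2, 16))  # dummy batch: 2 sequences of 16 token ids
with torch.no_grad():
    logits = model(tokens)
print(logits.shape)  # expected: torch.Size([2, 1])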

Sorry for not changing the variable names; the initial setup was for the BERT base uncased model, and I just changed the paths and weights to use XLM-RoBERTa instead. I think the Transformer class is where I am missing something. Please help me solve this issue, thanks.

I have the same problem!


Which version of XLA are you using?

Using 1.5 on Google Colab.

Try this:

!pip install -q --upgrade efficientnet-pytorch
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py > /dev/null
!python pytorch-xla-env-setup.py --version 20200420 --apt-packages libomp5 libopenblas-dev > /dev/null
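If upgrading XLA alone does not help: with start_method='fork', the pattern used in the official torch_xla Colab notebooks is to build the model once in the main process, wrap it in xmp.MpModelWrapper, and move the shared weights to each core inside the spawned function. A sketch, reusing the model / FLAGS names from the post above:

import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp

# created once in the main process; forked children share its host memory
WRAPPED_MODEL = xmp.MpModelWrapper(model)

def fit_multiprocessing(rank, flags):
    device = xm.xla_device()
    model = WRAPPED_MODEL.to(device)  # moves the shared weights onto this process's TPU core
    # ... rest of the training code ...

xmp.spawn(fit_multiprocessing, args=(FLAGS,), nprocs=8, start_method='fork')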