Hi,
I am following this popular post on understanding BERT.
I have posted my issue on GitHub here
I have access to a cluster with multiple nodes each having 4 GPUs but I am not sure how to use all for my task below.
I want to use, say, 10 nodes × 4 GPUs = 40 GPUs. I read up on DataParallel (DP) and DistributedDataParallel (DDP), but I think I need to manually split my long document into chunks of several sentences and then assign each chunk to a GPU.
Following is the code for the above task:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
import logging
#logging.basicConfig(level=logging.INFO)
import pickle
import nltk
from nltk import word_tokenize, sent_tokenize
import time
import os
import sys

start = time.time()

# Load the tokenizer and model ONCE, before the per-sentence loop.
# (The original reloaded BertModel.from_pretrained for every sentence,
# which dominates the total runtime.)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.device_count() > 1:
    # DataParallel splits the batch dimension across the GPUs visible
    # on THIS node only; multi-node scaling needs DistributedDataParallel.
    model = torch.nn.DataParallel(model)
model.to(device)
# "Evaluation" mode: feed-forward only (disables dropout).
model.eval()

# Read the document and split it into sentences; `with` closes the file
# (the original leaked the file handle).
with open('sample.txt', 'r') as fh:
    sentences = sent_tokenize(fh.read())

for i, sentence in enumerate(sentences, start=1):
    # BERT expects the [CLS]/[SEP] special tokens around each sequence;
    # the original left these commented out, which skews the embeddings.
    marked_text = "[CLS] " + sentence + " [SEP]"

    # Tokenize once with the BERT tokenizer (the original tokenized the
    # same text twice).
    tokenized_text = tokenizer.tokenize(marked_text)
    print(tokenized_text)

    # Map the token strings to their vocabulary indices and display them.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    for token, idx in zip(tokenized_text, indexed_tokens):
        print('{:<12} {:>6,}'.format(token, idx))

    # Single-sentence input: mark every token as belonging to sentence "1".
    segments_ids = [1] * len(tokenized_text)
    print(segments_ids)

    # Convert inputs to PyTorch tensors on the target device.
    tokens_tensor = torch.tensor([indexed_tokens]).to(device)
    segments_tensors = torch.tensor([segments_ids]).to(device)

    # Predict hidden states for each of the 12 encoder layers.
    with torch.no_grad():
        encoded_layers, _ = model(tokens_tensor, segments_tensors)

    print("Number of layers:", len(encoded_layers))
    layer_i = 0
    print("Number of batches:", len(encoded_layers[layer_i]))
    batch_i = 0
    print("Number of tokens:", len(encoded_layers[layer_i][batch_i]))
    token_i = 0
    print("Number of hidden units:", len(encoded_layers[layer_i][batch_i][token_i]))

    # `encoded_layers` is a Python list of per-layer tensors; stack into
    # [layers, batch, tokens, hidden], then drop the batch dimension.
    print(' Type of encoded_layers: ', type(encoded_layers))
    print('Tensor shape for each layer: ', encoded_layers[0].size())
    token_embeddings = torch.stack(encoded_layers, dim=0)
    print("token_embeddings_size: ", token_embeddings.size())
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    print("After removing batches dimension: ", token_embeddings.size())
    # Index layer 0 explicitly (the original printed the whole stacked
    # tensor under the label "First layer").
    print("First layer: ", token_embeddings[0])
    print("Second layer: ", token_embeddings[1])
    print("Third layer: ", token_embeddings[2])
    print("Twelfth layer: ", token_embeddings[11])

    # Persist the per-sentence hidden states, numbered from 1.
    torch.save(token_embeddings, "cuda_" + str(i) + '_hid_weights_tensor.pt')

print("Total execution time: ", time.time() - start, " sec")
Can you please give me some suggestions on how to use all GPUs for a single document?
I have a set of around 10K documents, so I think it would be more efficient to partition the documents into chunks and assign each chunk to a specific GPU.
Thanks!