I have been playing around with GPT-2 774M, and I was wondering whether it's possible to run inference on a single trained model in parallel, using something like the threading library, to save VRAM and/or compute and increase throughput. I'm fairly new to this level of detail in working with models, so I don't know if this is even possible. I have found plenty of material on training models in parallel, but not much about running trained models in parallel. The only two links I've found on the subject are this GitHub issue and a Stack Overflow answer, which seems to imply it was never intended to work that way. This is the code I am running:
# import argparse
import os
import random
import sys
import threading
import time
import numpy as np
import torch
from GPT2.config import GPT2Config
from GPT2.encoder import get_encoder
from GPT2.model import GPT2LMHeadModel
from GPT2.sample import sample_sequence
from GPT2.utils import load_weight

def text_generator(
    text="",
    quiet=False,
    nsamples=1,
    unconditional=None,
    batch_size=-1,
    length=-1,
    temperature=0.7,
    top_k=40,
    num_threads=1,
):
    # Load the checkpoint (map to CPU when CUDA is unavailable).
    if os.path.exists("bin/gpt2-pytorch_model.bin"):
        state_dict = torch.load(
            "bin/gpt2-pytorch_model.bin",
            map_location="cpu" if not torch.cuda.is_available() else None,
        )
    else:
        print("Please download gpt2-pytorch_model.bin and place it in the bin folder")
        sys.exit()

    if batch_size == -1:
        batch_size = 1
    assert nsamples % batch_size == 0

    seed = random.randint(0, 2147483647)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # print("CUDA AVAILABILITY: {}".format(torch.cuda.is_available()))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load Model
    enc = get_encoder()
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model = load_weight(model, state_dict)
    model.share_memory()
    model.to(device)
    model.eval()

    if length == -1:
        length = config.n_ctx // 2
    elif length > config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % config.n_ctx)

    context_tokens = enc.encode(text)

    # Spawn num_threads threads that all generate from the same loaded model.
    list_of_threads = []
    print("THREADCOUNT = {}".format(num_threads))
    for i in range(num_threads):
        kwargs = {
            "model": model,
            "length": length,
            "context": context_tokens if not unconditional else None,
            "start_token": enc.encoder["<|endoftext|>"] if unconditional else None,
            "batch_size": batch_size,
            "temperature": temperature,
            "top_k": top_k,
            "device": device,
            "nsamples": nsamples,
            "unconditional": unconditional,
            "context_tokens": context_tokens,
            "enc": enc,
            "quiet": quiet,
            "num_threads": num_threads,
        }
        thr = threading.Thread(target=runit, kwargs=kwargs)
        thr.daemon = True
        thr.name = "Text Gen Thread {}".format(i)
        thr.start()
        list_of_threads.append(thr)
        print("Started Thread {}!".format(thr.name))

    for t in list_of_threads:
        t.join()

def runit(
    model=None,
    length=None,
    context=None,
    start_token=None,
    batch_size=None,
    temperature=None,
    top_k=None,
    device=None,
    nsamples=None,
    unconditional=None,
    context_tokens=None,
    enc=None,
    quiet=None,
    num_threads=None,
):
    # Worker executed by each thread: runs the generation loop on the shared model.
    global WORD_COUNT_LIST
    generated = 0
    start_time = time.time()
    for _ in range(nsamples // batch_size):
        out = sample_sequence(
            model=model,
            length=length,
            context=context_tokens if not unconditional else None,
            start_token=enc.encoder["<|endoftext|>"] if unconditional else None,
            batch_size=batch_size,
            temperature=temperature,
            top_k=top_k,
            device=device,
        )
        out = out[:, len(context_tokens):].tolist()
        for i in range(batch_size):
            generated += 1
            text = enc.decode(out[i])
            if not quiet:
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            print(text)
            WORD_COUNT_LIST.append(len(text.split()))

    # Words per second that a thread gets through, i.e. the writing speed of
    # any particular thread.
    total_time_taken = time.time() - start_time
    # total_word_count = sum(i for i in WORD_COUNT_LIST)
    # average_word_count = sum(i for i in WORD_COUNT_LIST) / num_threads
    # word_per_second_batchwise = len(text.split()) / total_time_taken
    print("TOTAL TIME TAKEN FOR {} THREADS IN PARALLEL: {}".format(num_threads, total_time_taken))
    # print("TOTAL WORD COUNT: {}".format(total_word_count))
    # print("AVERAGE WORD COUNT: {}".format(average_word_count))
    # print("WORD PER SECOND BATCHWISE: {}".format(word_per_second_batchwise))

    # Clear the GPU cache when this worker finishes.
    torch.cuda.empty_cache()

if __name__ == "__main__":
    WORD_COUNT_LIST = []
    nsamples = 100
    text_to_process = """This is the text to process"""
    for i in range(4, 5):
        # print(i)
        text_generator(text=text_to_process, nsamples=nsamples, quiet=False, num_threads=i)
    print("######################### FINISHED #########################")
The gist of this code is that it loads the model once and then spawns four threads, all with the same parameters, to generate samples in parallel. It is a modified version of this PyTorch implementation of GPT-2. What I'm noticing is that it behaves much more like a single worker moving between multiple jobs than like multiple workers each working on their own job. The Stack Overflow answer seems to explain this behavior.
The more threads I add, the longer the total time to completion, which makes sense. What doesn't make sense to me is that when I run several instances of this program at once (4 separate programs for a total of 16 threads), the total time to completion is almost the same as running one program with 4 threads. Why is that, and is what I want even possible? Tests were done on a GTX 1080.
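One thing I haven't been able to rule out: is raising batch_size (rather than num_threads) the only real way to get more samples per second out of one loaded model? Something like the call below, reusing text_generator from above (nsamples has to stay divisible by batch_size because of the assert):

# Single thread, but 4 samples per forward pass instead of 4 separate threaded calls.
text_generator(text=text_to_process, nsamples=100, batch_size=4, quiet=True, num_threads=1)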
I tried the same threading approach on an RTX 6000 from Linode, and it simply refused to run two threads in parallel. I have no idea why. Maybe multiprocessing would work better?
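In case it helps to make that concrete, this is roughly the multiprocessing variant I was picturing. It's only a sketch (untested): it assumes the same GPT2 package and bin/gpt2-pytorch_model.bin checkpoint as the script above, the worker function is just something I made up, and it uses torch.multiprocessing with the "spawn" start method since the model sits on the GPU:

import torch
import torch.multiprocessing as mp

from GPT2.config import GPT2Config
from GPT2.encoder import get_encoder
from GPT2.model import GPT2LMHeadModel
from GPT2.sample import sample_sequence
from GPT2.utils import load_weight


def worker(rank, model, context_tokens, length, device):
    # Each process builds its own encoder and runs one generation pass
    # against the shared model weights.
    enc = get_encoder()
    out = sample_sequence(
        model=model,
        length=length,
        context=context_tokens,
        start_token=None,
        batch_size=1,
        temperature=0.7,
        top_k=40,
        device=device,
    )
    out = out[0, len(context_tokens):].tolist()
    print("=" * 20 + " process {} ".format(rank) + "=" * 20)
    print(enc.decode(out))


if __name__ == "__main__":
    # CUDA tensors can only be shared with child processes under "spawn".
    mp.set_start_method("spawn")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    enc = get_encoder()
    model = GPT2LMHeadModel(GPT2Config())
    model = load_weight(model, torch.load("bin/gpt2-pytorch_model.bin", map_location="cpu"))
    model.to(device)
    model.eval()
    model.share_memory()  # mainly matters for CPU runs; CUDA weights are shared via IPC when passed

    context_tokens = enc.encode("This is the text to process")

    processes = []
    for rank in range(4):
        p = mp.Process(target=worker, args=(rank, model, context_tokens, 512, device))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

Would spawning processes like this actually overlap the generations on a single GPU, or would they end up serializing the same way the threads seem to?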