I'm trying to generate multiple text outputs in parallel using a single loaded model. What am I doing wrong?

I have been playing around with GPT-2 774M and I was wondering if it's possible to run inference on a single trained model in parallel using something like the threading library, to save on VRAM and/or processing and increase throughput. I'm fairly new to this level of detail when working with models, so I don't know if this is even possible. I have found tons of material on training models in parallel, but not much about using those trained models in parallel. The only 2 links I've found on the subject are this GitHub issue and a Stack Overflow answer, which seems to imply that threading was never intended to be used this way. This is the code I am running:

import os
import random
import sys
import threading
import time

import numpy as np
import torch

from GPT2.config import GPT2Config
from GPT2.encoder import get_encoder
from GPT2.model import GPT2LMHeadModel
from GPT2.sample import sample_sequence
from GPT2.utils import load_weight


def text_generator(
    text="",
    quiet=False,
    nsamples=1,
    unconditional=None,
    batch_size=-1,
    length=-1,
    temperature=0.7,
    top_k=40,
    num_threads=1,
):
    if os.path.exists("bin/gpt2-pytorch_model.bin"):
        state_dict = torch.load(
            "bin/gpt2-pytorch_model.bin",
            map_location="cpu" if not torch.cuda.is_available() else None,
        )
    else:
        print("Please download gpt2-pytorch_model.bin and/or place in bin folder")
        sys.exit()

    if batch_size == -1:
        batch_size = 1
    assert nsamples % batch_size == 0

    seed = random.randint(0, 2147483647)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # print("CUDA AVAILABILITY: {}".format(torch.cuda.is_available()))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load Model
    enc = get_encoder()
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model = load_weight(model, state_dict)
    model.share_memory()  # intended to let all threads/processes reuse the same weights
    model.to(device)
    model.eval()

    if length == -1:
        length = config.n_ctx // 2
    elif length > config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % config.n_ctx)

    context_tokens = enc.encode(text)

    list_of_threads = []
    print("THREADCOUNT = {}".format(num_threads))

    for i in range(num_threads):
        kwargs = {
            "model": model,
            "length": length,
            "context": context_tokens if not unconditional else None,
            "start_token": enc.encoder["<|endoftext|>"] if unconditional else None,
            "batch_size": batch_size,
            "temperature": temperature,
            "top_k": top_k,
            "device": device,
            "nsamples": nsamples,
            "unconditional": unconditional,
            "context_tokens": context_tokens,
            "enc": enc,
            "quiet": quiet,
            "num_threads": num_threads,
        }
        thr = threading.Thread(target=runit, kwargs=kwargs)
        thr.daemon = True
        thr.name = "Text Gen Thread {}".format(i)
        thr.start()
        list_of_threads.append(thr)
        print("Started Thread {}!".format(thr.name))

    for t in list_of_threads:
        t.join()


def runit(
    model=None,
    length=None,
    context=None,
    start_token=None,
    batch_size=None,
    temperature=None,
    top_k=None,
    device=None,
    nsamples=None,
    unconditional=None,
    context_tokens=None,
    enc=None,
    quiet=None,
    num_threads=None,
):
    global WORD_COUNT_LIST
    generated = 0
    start_time = time.time()
    # generate nsamples total, batch_size samples per call to sample_sequence
    for _ in range(nsamples // batch_size):
        out = sample_sequence(
            model=model,
            length=length,
            context=context_tokens if not unconditional else None,
            start_token=enc.encoder["<|endoftext|>"] if unconditional else None,
            batch_size=batch_size,
            temperature=temperature,
            top_k=top_k,
            device=device,
        )
        out = out[:, len(context_tokens) :].tolist()
        for i in range(batch_size):
            generated += 1
            text = enc.decode(out[i])
            if not quiet:
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            print(text)
            WORD_COUNT_LIST.append(len(text.split()))

    """
    # words per seconds that it takes a thread to get through
    writing speed of any particular thread
    """

    total_time_taken = time.time() - start_time

    # total_word_count = sum(i for i in WORD_COUNT_LIST)
    # average_word_count = sum(i for i in WORD_COUNT_LIST) / num_threads
    # word_per_second_batchwise = len(text.split()) / total_time_taken

    print("TOTAL TIME TAKEN FOR {} THREADS IN PARALLEL: {}".format(num_threads, total_time_taken))
    # print("TOTAL WORD COUNT: {}".format(total_word_count))
    # print("AVERAGE WORD COUNT: {}".format(average_word_count))
    # print("WORD PER SECOND BATCHWISE: {}".format(word_per_second_batchwise))

    # clears GPU cache
    torch.cuda.empty_cache()


if __name__ == "__main__":
    WORD_COUNT_LIST = []
    nsamples = 100
    text_to_process = """This is the text to process"""

    for i in range(4, 5):  # only runs once, with num_threads=4
        # print(i)
        text_generator(text=text_to_process, nsamples=nsamples, quiet=False, num_threads=i)
    print("######################### FINISHED #########################")

The gist of what this code does: it loads the model once, then spawns 4 threads with identical parameters to generate samples in parallel. The code is a modified version of this PyTorch implementation of GPT-2. What I'm noticing is that it behaves a lot more like a single worker moving between multiple jobs than like multiple workers each working on their own job. The Stack Overflow answer seems to explain this behavior.
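
My rough mental model (which may be wrong) is that the GIL only lets one thread run Python bytecode at a time, so for CPU-bound work the threads really are one worker hopping between jobs. This is the kind of torch-free check I have in mind when I say that, with a hypothetical busy_work function:

import threading
import time

def busy_work(n=10_000_000):
    # pure-Python loop; the GIL prevents two threads from running this concurrently
    total = 0
    for i in range(n):
        total += i
    return total

start = time.time()
threads = [threading.Thread(target=busy_work) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print("4 threads: {:.2f}s".format(time.time() - start))

start = time.time()
for _ in range(4):
    busy_work()
print("sequential: {:.2f}s".format(time.time() - start))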

The more threads I add, the longer the total time to completion, which makes sense. What doesn't make sense to me is that when I run several copies of this program at the same time (4 separate programs for a total of 16 threads), the total time to completion is almost the same as running 1 program with 4 threads. Why is that, and is what I want even possible? Tests were done on a GTX 1080.
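
Part of what I'm trying to figure out is whether the "right" way to get multiple samples out in parallel is just to push the parallelism into the batch dimension instead of into threads, i.e. something like this with the same text_generator as above (nsamples has to stay divisible by batch_size because of the assert):

# one thread, but 4 samples per forward pass instead of 4 threads
text_generator(text=text_to_process, nsamples=100, batch_size=4, quiet=False, num_threads=1)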

I tried this same method on an RTX 6000 from Linode and it just straight up refused to run 2 threads in parallel. I have no idea why this is. Maybe multiprocessing would work better?
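
In case it clarifies what I mean, here is roughly what I was planning to try with torch.multiprocessing instead of threading. This is just a sketch reusing the same helpers from the repo above; I haven't verified that sharing the CUDA weights between processes actually behaves the way I expect:

import torch
import torch.multiprocessing as mp

from GPT2.config import GPT2Config
from GPT2.encoder import get_encoder
from GPT2.model import GPT2LMHeadModel
from GPT2.sample import sample_sequence
from GPT2.utils import load_weight


def worker(rank, model, context_tokens, length, batch_size, temperature, top_k, device, nsamples):
    # each process runs its own sampling loop against the shared weights
    enc = get_encoder()  # re-created per process so the encoder doesn't have to be pickled
    for _ in range(nsamples // batch_size):
        out = sample_sequence(
            model=model, length=length, context=context_tokens, start_token=None,
            batch_size=batch_size, temperature=temperature, top_k=top_k, device=device,
        )
        out = out[:, len(context_tokens):].tolist()
        for i in range(batch_size):
            print("[worker {}] {}".format(rank, enc.decode(out[i])))


if __name__ == "__main__":
    mp.set_start_method("spawn")  # spawn is required for sharing CUDA tensors across processes
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model = load_weight(model, torch.load("bin/gpt2-pytorch_model.bin", map_location="cpu"))
    model.to(device)
    model.eval()
    model.share_memory()  # no-op for CUDA tensors; GPU weights should be shared via CUDA IPC when passed to workers

    context_tokens = get_encoder().encode("This is the text to process")

    processes = []
    for rank in range(2):
        p = mp.Process(
            target=worker,
            args=(rank, model, context_tokens, config.n_ctx // 2, 1, 0.7, 40, device, 4),
        )
        p.start()
        processes.append(p)
    for p in processes:
        p.join()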

Here is the Stack Overflow post. Since I'm a new user it wouldn't let me post more than 2 links.