Training several networks in parallel

Hey. I am testing different methods of hyperparameter optimization. First I want to test grid search. This method requires training the same type of network on the same dataset with the same optimizer, but with different optimizer parameters. Is it possible to train many models at once? Currently I train a single network, then set different parameters and train again, but this takes a lot of time.
This is the code of my current training process, where "myparam" is an array of different parameter combinations. There are 81 combinations, so the main loop runs 81 times.

import torch
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as models

device = torch.device('cuda')

# The loaders do not depend on the hyperparameters, so they only need to be
# built once, outside the search loop.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=10,
    shuffle=True,
    num_workers=2
)

test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=10,
    shuffle=False,  # shuffling the test set has no effect on accuracy
    num_workers=2
)

for x in range(81):
    torch.cuda.empty_cache()
    # Fresh pretrained AlexNet with a new 9-class head for every attempt
    # (note: newer torchvision versions replace pretrained=True with weights=)
    model = models.alexnet(pretrained=True)
    model.classifier[6] = torch.nn.Linear(model.classifier[6].in_features, 9)
    model = model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=myparam[x, 0], momentum=myparam[x, 1])
    print("Attempt:", x)

    for epoch in range(NUM_EPOCHS):
        model.train()
        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()

        # Count misclassified samples. Comparing predicted and true class
        # indices directly is correct for any number of classes, unlike
        # summing abs(labels - argmax), which only works for binary labels.
        model.eval()
        test_error_count = 0
        with torch.no_grad():
            for images, labels in test_loader:
                images = images.to(device)
                labels = labels.to(device)
                outputs = model(images)
                test_error_count += int((outputs.argmax(1) != labels).sum())

        test_accuracy = 1.0 - test_error_count / len(test_dataset)
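
For reference, such a grid does not have to be hard-coded: itertools.product builds all combinations. A sketch assuming 9 candidate learning rates and 9 candidate momentum values (the numbers below are placeholders, not my actual grid):

import numpy as np
from itertools import product

learning_rates = np.logspace(-4, -1, 9)  # 9 placeholder values
momenta = np.linspace(0.1, 0.9, 9)       # 9 placeholder values

# 9 * 9 = 81 (lr, momentum) rows, indexable as myparam[x, 0] and myparam[x, 1]
myparam = np.array(list(product(learning_rates, momenta)))
assert myparam.shape == (81, 2)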

What I would attempt to do is use Python's multiprocessing module to train multiple models (in separate Python processes) at once. I have successfully done something similar that way (a parameter search for Bayesian models optimized using MCMC).

What I would worry about in your case are the computational resources. Do you have enough CPU, GPU, RAM, and VRAM to meaningfully train two or more neural networks at once? If not, you will just add multiprocessing overhead without any speedup.
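
As a quick sanity check, recent PyTorch versions can report free GPU memory, which helps decide how many workers you can afford (this uses the standard torch.cuda API):

import torch

free, total = torch.cuda.mem_get_info()  # bytes on the current CUDA device
print(f"Free VRAM: {free / 1e9:.2f} GB of {total / 1e9:.2f} GB")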

Anyway, here is what I did for my Bayesian models. Keep in mind that this code does not follow every best practice.

from time import time
from typing import Iterable, Union

import numpy as np
import pandas as pd
from sklearn.model_selection import ParameterGrid
from multiprocessing import Pool
from tqdm import tqdm

def fit_and_evaluate(args: tuple):
    '''
    Fits and evaluates a model from StanClasses.py.
    Parameters
    ----------
    args: tuple as (model class, init_kwargs, X_train, X_val) or
          (integer index, model class, init_kwargs, X_train, X_val).
          This function is used for parallel processing, hence the odd
          argument setup. The integer index is used to keep track of order
          when using parallel processing.
    Returns
    -------
    (model_object, fit_time, train_mae, val_mae) or
    (integer index, model_object, fit_time, train_mae, val_mae)
    '''
    with_index = False
    try: # Could have just checked len of args instead
        model, init_kwargs, X_train, X_val = args
    except ValueError:
        index, model, init_kwargs, X_train, X_val = args
        with_index = True

    model_object = model(**init_kwargs)
 
    t0 = time()
    model_object.fit(X_train)
    fit_time = time()-t0
    
    train_mae = model_object.mae(X_train)
    
    if X_val is not None:
        val_mae = model_object.mae(X_val)   
    else:
        val_mae = None
    
    if with_index:
        return index, model_object, fit_time, train_mae, val_mae
    else:
        return model_object, fit_time, train_mae, val_mae
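
For a single configuration the helper can also be called directly; a sketch with a hypothetical StanModel class and placeholder kwargs:

result = fit_and_evaluate((StanModel, {'n_iter': 1000}, X_train, X_val))
model_object, fit_time, train_mae, val_mae = result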

def fit_and_evaluate_models(models: Iterable, X_train: Union[np.ndarray, pd.DataFrame],
                            X_val: Union[np.ndarray, pd.DataFrame] = None,
                            candidate_kwargs: dict = {}, static_kwargs: dict = {},
                            verbose=True, ascii=False):  # Mutable default arguments are a bad idea in Python; safe here only because they are never mutated
    '''
    Given model classes and candidate kwargs, trains the models on every
    combination of candidate_kwargs.
    Parameters
    ----------
    models: Iterable of model classes
    X_train: Dataset used for training
    X_val: Dataset used for validation
    candidate_kwargs: Keyword arguments to train on. They will be turned into
                      a grid of all combinations, like GridSearchCV does.
    static_kwargs: Shared keyword arguments for all the models.
    verbose: Shows a tqdm bar if True. True by default.
    ascii: Whether the tqdm bar should print ASCII (for compatibility).
    Returns
    -------
    hist: dictionary containing the fitted model objects, the corresponding
          parameters, fit_time, train_mae, and val_mae.
    '''

    hist = {'model':[], 'params':[], 'fit_time':[], 'train_mae':[], 'val_mae':[]}
    
    map_args = []
    candidate_param_list = []
    param_gen = ParameterGrid({'model':models, **candidate_kwargs})
    n_params = len(param_gen)

    for i, paramdict in enumerate(param_gen):
        model = paramdict.pop('model')
        candidate_param_list.append(paramdict)  # record the candidate params only
        paramdict = paramdict.copy()            # copy so static_kwargs don't end up in the record
        paramdict.update(static_kwargs)
        map_args.append((i, model, paramdict, X_train, X_val))
    
    # Train candidates in separate processes; Pool(None) uses one worker per CPU core
    with Pool(None) as p:
        fit_iterator = tqdm(
            p.imap_unordered(fit_and_evaluate, map_args), total=n_params,
            desc='Fitting models', disable=not verbose, unit='model', position=0,
            ascii=ascii
        )
        # Note that imap_unordered is used, so the ordering will probably be 
        # scrambled. 
        results = list(fit_iterator)
        
    for result in results:
        index, model_object, fit_time, train_mae, val_mae = result
        hist['model'].append(model_object)
        hist['params'].append(candidate_param_list[index])
        hist['fit_time'].append(fit_time)
        hist['train_mae'].append(train_mae)
        hist['val_mae'].append(val_mae)
    
    return hist
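
Usage then looks something like this (a sketch: StanAR is a hypothetical class from StanClasses.py, and the kwargs are placeholders):

hist = fit_and_evaluate_models(
    models=[StanAR],
    X_train=X_train,
    X_val=X_val,
    candidate_kwargs={'n_lags': [1, 2, 3], 'horizon': [1, 7]},  # 6 combinations
    static_kwargs={'chains': 4},  # shared by every fit
)
best = min(range(len(hist['val_mae'])), key=lambda i: hist['val_mae'][i])
print(hist['params'][best], hist['val_mae'][best])

Adapting the same pattern to your PyTorch grid search would look roughly like the sketch below, not a drop-in solution. Two caveats: CUDA in child processes requires the 'spawn' start method, and every worker holds its own model, optimizer state, and activations in VRAM, so keep the pool small (2-3 processes) and scale up only if memory allows. train_one_config is a hypothetical wrapper around your existing training loop:

from itertools import product
import torch.multiprocessing as mp

def train_one_config(args):
    # Hypothetical wrapper: run your training/evaluation loop for one
    # (lr, momentum) pair and return the resulting test accuracy.
    lr, momentum = args
    test_accuracy = ...  # your loop from the question, using lr and momentum
    return lr, momentum, test_accuracy

if __name__ == '__main__':
    mp.set_start_method('spawn')  # required for CUDA in subprocesses
    grid = list(product(learning_rates, momenta))  # the 81 pairs
    # Small pool: each worker keeps a full model in VRAM at once.
    with mp.Pool(processes=3) as p:
        results = p.map(train_one_config, grid)
    best = max(results, key=lambda r: r[2])
    print('Best (lr, momentum, accuracy):', best)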