Error when trying to implement multi-GPU capability (SOLVED, I think)

Hi

I have a neural network to which I'm trying to add multi-GPU capability with
model = torch.nn.DataParallel(model, device_ids=[0]) (currently I'm trying to get it to work on a computer with only one GPU, hence device_ids=[0]).
But I get the error:
RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.
Stemming from:
loss.backward()

Worth noting: without the multi-GPU changes, the code runs fine with no errors.
(To get DataParallel working I also had to switch some lines of code from model.something to model.module.something, which could be the source of the problem.)
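
(For reference, my understanding of the usual single-node DataParallel pattern is the generic sketch below, with a stand-in model rather than my actual network; the wrapper itself is the thing you call, so it can scatter the batch across the listed devices:)

import torch
import torch.nn as nn
from torch.autograd import Variable  # needed on 0.3.x; newer versions use plain tensors

net = nn.Linear(300, 2).cuda()                 # stand-in for the real model
net = nn.DataParallel(net, device_ids=[0])     # wrap after moving to the GPU
sent = Variable(torch.randn(5, 300).cuda())    # fake batch of 5
label = Variable(torch.zeros(5).long().cuda())
pred = net(sent)                               # call the wrapper, not net.module
loss = nn.CrossEntropyLoss()(pred, label)
loss.backward()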

The full code, in all its chaos:

import os 
os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z\pytorch-0.3.1-py36_cuda80_cudnn6he774522_2\Lib\site-packages')
import torch
import torch.nn as nn
from torch import optim
import torch.autograd as autograd
import time, random
os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z\tqdm-master')
from tqdm import tqdm
os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z\pytorch-sentiment-classification-master') 
import bilstm
import lstm
from lstm import LSTMSentiment
from bilstm import BiLSTMSentiment
os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z\text-master')
from torchtext import data
os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z') 
import numpy as np
import argparse
import csv
import time
#import gensim
for x in range (1):
    torch.set_num_threads(8)
    torch.manual_seed(1)
    random.seed(1)
    #########################################################################
    cuda = True
    Difficulty = 'easy'
    #########################################################################
    #if Difficulty == 'easy': 
    def load_bin_vec(fname, vocab):
        """
        Loads 300x1 word vecs from Google (Mikolov) word2vec
        """
    #    count = 0
        word_vecs = {}
        with open(fname, "rb") as f:
            header = f.readline()
            vocab_size, layer1_size = map(int, header.split())
            binary_len = np.dtype('float32').itemsize * layer1_size
            for line in range(vocab_size):
                word = []
                while True:
                    ch = f.read(1).decode('latin-1')
                    if ch == ' ':
                        word = ''.join(word)
                        break
                    if ch != '\n':
                        word.append(ch)
                if word in vocab:
                   word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')
                else:
                    f.read(binary_len)
    #                count = count+1 
    #    print(count)
        return word_vecs
    #if Difficulty == 'hard': 
    #    def load_bin_vec(fname, vocab):
    #        count = 0
    #        success = 0
    #        """
    #        Loads 300x1 word vecs from Google (Mikolov) word2vec
    #        """
    #        word_vecs = {}
    #        w2v_model = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=True)
    #
    #        from gensim.models import word2vec
    #        w2v_model = word2vec.Word2Vec.load(fname)
    #        print(w2v_model.similarity('film', 'movie'))
    #        
    #        for word in vocab:
     #           print(word)
    #            here = 0 
    #            find = 0
    #            for letter in word:
    #                here = here + 1 
    #                if letter == ':':
    #                    find = here
    #            try:
    #                word_vecs[word] = w2v_model[word[0:find]]
    #                success = success + 1
    #            except KeyError:
    #                count = count + 1 
                    # some tokens from the dataset may not appear as an entry in the word embeds matrix, so i print and skip them
    #                print('Key error: {}'.format(word))
    #                continue
    #        print(count, 'problematic words', success, 'successful words')
    #        return word_vecs
    
    
    def get_accuracy(truth, pred):
        assert len(truth) == len(pred)
        right = 0
        for i in range(len(truth)):
            if truth[i] == pred[i]:
                right += 1.0
        return right / len(truth)
    
    
    def train_epoch_progress(model, train_iter, loss_function, optimizer, text_field, label_field, epoch):
        model.train()
        avg_loss = 0.0
        truth_res = []
        pred_res = []
        count = 0
        for batch in tqdm(train_iter, desc='Train epoch '+str(epoch+1)):
            sent, label = batch.text, batch.label
            label.data.sub_(1)
            truth_res += list(label.data)
            model.batch_size = len(label.data) 
            model.epoch = epoch+1
            model.hidden = model.module.init_hidden()
            model.count = count
            pred = model.module(sent)
            pred_label = pred.data.max(1)[1]
            pred_res += [x for x in pred_label]
            model.zero_grad()
            loss = loss_function(pred, label)
            avg_loss += loss.data[0]
            count += 1
            loss.backward()
            optimizer.step()
        avg_loss /= len(train_iter)
        acc = get_accuracy(truth_res, pred_res)
    #    print(model.hidden)
        return avg_loss, acc
    
    
    def train_epoch(model, train_iter, loss_function, optimizer):
        model.train()
        avg_loss = 0.0
        truth_res = []
        pred_res = []
        count = 0
        for batch in train_iter:
            sent, label = batch.text, batch.label
            label.data.sub_(1)
            truth_res += list(label.data)
            model.batch_size = len(label.data)
            model.hidden = model.module.init_hidden()
            pred = model.module(sent)
            pred_label = pred.data.max(1)[1].numpy()
            pred_res += [x for x in pred_label]
            model.zero_grad()
            loss = loss_function(pred, label)
            avg_loss += loss.data[0]
            count += 1
            loss.backward()
            optimizer.step()
        avg_loss /= len(train_iter)
        acc = get_accuracy(truth_res, pred_res)
        return avg_loss, acc
    
    
    def evaluate(model, data, loss_function, name):
        model.eval()
        avg_loss = 0.0
        truth_res = []
        pred_res = []
        for batch in data:
            sent, label = batch.text, batch.label
            label.data.sub_(1)
            truth_res += list(label.data)
            model.batch_size = len(label.data)
            model.hidden = model.module.init_hidden()
            pred = model.module(sent)
            pred_label = pred.data.max(1)[1]
            pred_res += [x for x in pred_label]
            loss = loss_function(pred, label)
            avg_loss += loss.data[0]
        avg_loss /= len(data)
        acc = get_accuracy(truth_res, pred_res)
        print(name + ': loss %.2f acc %.1f' % (avg_loss, acc*100))
        return acc
    
    def load_sst(text_field, label_field, batch_size, Difficulty, cuda):
    #    for x in range(B_sizes):
        if Difficulty == 'easy':
            os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z\pytorch-sentiment-classification-master copy/data')
            train, dev, test = data.TabularDataset.splits(path='./SST2/', train='train.tsv',
                                                          validation='dev.tsv', test='test.tsv', format='tsv',
                                                          fields=[('text', text_field), ('label', label_field)]) 
        if Difficulty == 'medium':
            os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z\pytorch-sentiment-classification-master')
            train, dev, test = data.TabularDataset.splits(path='./data/Dataset/', train='train.tsv',
                                                          validation='test.tsv', test='test.tsv', format='tsv',
                                                          fields=[('text', text_field), ('label', label_field)])
        if Difficulty == 'hard':      
            os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z')
            train, dev, test = data.TabularDataset.splits(path='./aclImdb/', train='usable_train.tsv',
                                                          validation='usable_dev.tsv', test='usable_test.tsv', format='tsv',
                                                          fields=[('text', text_field), ('label', label_field)])
        text_field.build_vocab(train, dev, test)
        label_field.build_vocab(train, dev, test)
    #   batch_size =  10*(.1*x*(torch.cos(torch.FloatTensor([x]))*torch.sin(torch.FloatTensor([2*x])))+.08*x).round() + 1
    
        ## for GPU run
        if cuda == True:
            train_iter, dev_iter, test_iter = data.BucketIterator.splits((train, dev, test),
                         batch_sizes=(batch_size, len(dev), len(test)), sort_key=lambda x: len(x.text), repeat=False, device=None)
        else:
            train_iter, dev_iter, test_iter = data.BucketIterator.splits((train, dev, test),
                         batch_sizes=(batch_size, len(dev), len(test)), sort_key=lambda x: len(x.text), repeat=False, device=-1)
        return train_iter, dev_iter, test_iter
    #def adjust_learning_rate(learning_rate, optimizer, epoch):
    #     lr = learning_rate * (0.9 ** (epoch //1))
    #     for param_group in optimizer.param_groups:
    #         param_group['lr'] = lr
    #     return optimizer
    
    
    #for clustering
    #if Difficulty == 'easy':
    #    EPOCHS= 30
    #    BATCH_SIZE = 30
    #    HIDDEN_DIM = 150
        
    if Difficulty == 'easy':
        EPOCHS= 1000
        BATCH_SIZE = 5
        HIDDEN_DIM = 100
        #HIDDEN_DIM = round((150*(4/6)))
        #HIDDEN_DIM = round((150*(4/5)))
        #HIDDEN_DIM = round((150*(4/8)))
        
    if Difficulty == 'medium':
        EPOCHS= 30
        BATCH_SIZE = 5
        HIDDEN_DIM = 50
        #HIDDEN_DIM = round((150*(4/6)))
        #HIDDEN_DIM = round((150*(4/5)))
        #HIDDEN_DIM = round((150*(4/8)))
        
    if Difficulty == 'hard':
        EPOCHS= 30
        BATCH_SIZE = 5
        HIDDEN_DIM = 50
        
    USE_GPU = torch.cuda.is_available()
    EMBEDDING_DIM = 300
    
    timestamp = str(int(time.time()))
    best_dev_acc = 0.0
    #B_sizes = np.ones(BATCH_SIZE) #BATCH SIZE MUST BE MINIMUM OF 50
    #for x in range(BATCH_SIZE):
    #    B_sizes[x]= 10*(.1*x*(torch.cos(torch.FloatTensor([x]))*torch.sin(torch.FloatTensor([2*x])))+.08*x).round() + 1
    #B_sizes = max(B_sizes)
    
    #def weights_init(LSTMSentiment):
    #    classname = LSTMSentiment.__class__.__name__
    #    if classname.find('LSTMSentiment') != -1:
    #        LSTMSentiment.weight.data.fill_(.5)
    #        LSTMSentiment.bias.data.fill_(0)
    
    #Using = BiLSTMSentiment
    Using = LSTMSentiment
    if cuda:
        torch.set_default_tensor_type('torch.cuda.FloatTensor')
    print('Using: ', Using)   
    text_field = data.Field(lower=True)
    label_field = data.Field(sequential=False)
    train_iter, dev_iter, test_iter = load_sst(text_field, label_field, BATCH_SIZE, Difficulty = Difficulty, cuda = cuda)
    count = 0
    model = Using(embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, vocab_size=len(text_field.vocab),
                           use_gpu=cuda, label_size=len(label_field.vocab)-1, batch_size=BATCH_SIZE)
    if cuda:
        model = model.cuda()
    
    print('Load word embeddings...')
    # # glove
    #text_field.vocab.load_vectors('glove.6B.100d')
    #text_field.vocab.load_vectors('hard_dataset_embeddings')
    
    # word2vector
    word_to_idx = text_field.vocab.stoi
    
    pretrained_embeddings = np.random.uniform(-0.25, 0.25, (len(text_field.vocab), EMBEDDING_DIM))
    pretrained_embeddings[0] = 0
    #if Difficulty == 'easy':
    os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z\pytorch-sentiment-classification-master')
    word2vec = load_bin_vec('./data/GoogleNews-vectors-negative300.bin', word_to_idx)
    #if Difficulty == 'hard':
    #    os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z')
    #    word2vec = load_bin_vec('hard_dataset_embeddings', word_to_idx)
        
    for word, vector in word2vec.items():
        pretrained_embeddings[word_to_idx[word]-1] = vector
    
    # text_field.vocab.load_vectors(wv_type='', wv_dim=300)
    
    model.embeddings.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
    # model.embeddings.weight.data = text_field.vocab.vectors
    # model.embeddings.embed.weight.requires_grad = False
    
    
    best_model = model
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    #optimizer = optim.Adam(model.parameters(), lr=1e-2,weight_decay=1e-5)
    loss_function = nn.NLLLoss()
    #loss_function = nn.CrossEntropyLoss()
    model=torch.nn.DataParallel(model, device_ids=[0])
    
    print('Training...')
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))
    time_start = time.time()
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    for epoch in range(EPOCHS):
    #    batch_modifer = epoch + 1
        avg_loss, acc = train_epoch_progress(model, train_iter, loss_function, optimizer, text_field, label_field, epoch)
        tqdm.write('Train: loss %.2f acc %.1f' % (avg_loss, acc*100))
        dev_acc = evaluate(model, dev_iter, loss_function, 'Dev')
        if dev_acc > best_dev_acc:
            if best_dev_acc > 0:
                os.system('rm '+ out_dir + '/best_model' + '.pth')
            best_dev_acc = dev_acc
            best_model = model
    #        torch.save(best_model.state_dict(), out_dir + '/best_model' + '.pth')
            # evaluate on test with the best dev performance model
            test_acc = evaluate(best_model, test_iter, loss_function, 'Test')
    dev_acc = evaluate(model, dev_iter, loss_function, 'Dev')
    test_acc = evaluate(best_model, test_iter, loss_function, 'Final Test')
    time_finish = time.time()
    total_time = time_finish - time_start
    print('total time', (time_finish-time_start)/3600, 'hours')
    total_time = (time_finish-time_start)/3600
    
    os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z\pytorch-sentiment-classification-master/output/')
    for x in range(1001):
        if os.path.isfile('out_'+str(x)+'.csv') == False:
            ID = x 
            break 
    eval(str(np.savetxt('out_'+str(ID)+'.csv', np.array([dev_acc,test_acc,total_time]), delimiter=',')))
    #eval("np.savetxt('out_pm_%i.csv', np.array([dev_acc,test_acc,total_time]), delimiter=',')" %id)

Any help would be greatly appreciated!

Thank you!

UPDATE:

The error goes away when I pass retain_graph=True to the two loss.backward() calls.
However, this causes the time per epoch to explode from 30 seconds to 20 minutes (understandably, considering what this option does). The whole point of going multi-GPU is faster run times, so this is a problem.
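
(Concretely, the workaround that makes the error go away, at the cost of that slowdown, is just:)

loss.backward(retain_graph=True)  # in both train_epoch_progress and train_epoch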

Does someone perhaps know why the model = torch.nn.DataParallel(model, device_ids=[0]) line is forcing me to set retain_graph=True?
I suspect it has something to do with the fact that I had to turn model.init_hidden and model(sent) into model.module.init_hidden and model.module(sent).
But without that addition, I was getting errors like:
AttributeError: 'DataParallel' object has no attribute 'init_hidden'
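
(As far as I understand, this is because DataParallel only exposes the standard nn.Module interface, so anything custom on the wrapped model, like init_hidden, has to be reached through .module. Roughly, with model being the LSTMSentiment instance from my code:)

model = torch.nn.DataParallel(model, device_ids=[0])
# model.init_hidden()                  # AttributeError: 'DataParallel' object has no attribute 'init_hidden'
hidden = model.module.init_hidden()    # goes to the underlying LSTMSentiment instance
model.module.hidden = hidden           # note: model.hidden = hidden would only set an attribute on the wrapper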

So I’m really not sure where to go from here…

SOLVED: (I think)

For anyone who is having a similar issue, my solution was to scrap retain_graph=True entirely and simply change almost every model reference to model.module.
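
(My guess at why this works: in the broken version I was setting model.hidden on the DataParallel wrapper, so the wrapped LSTM kept reusing the hidden state from the previous batch, whose graph had already been freed, hence the retain_graph error. The key change in the training and eval loops is just where the per-batch attributes get set, plus wrapping the model right after .cuda() and copying the pretrained embeddings into model.module.embeddings:)

# before (attributes set on the DataParallel wrapper):
model.batch_size = len(label.data)
model.hidden = model.module.init_hidden()
model.zero_grad()

# after (attributes set on the wrapped model itself):
model.module.batch_size = len(label.data)
model.module.hidden = model.module.init_hidden()
model.module.zero_grad()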

WORKING CODE:

import os 
os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z\pytorch-0.3.1-py36_cuda80_cudnn6he774522_2\Lib\site-packages')
import torch
import torch.nn as nn
from torch import optim
import torch.autograd as autograd
import time, random
os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z\tqdm-master')
from tqdm import tqdm
os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z\pytorch-sentiment-classification-master') 
import bilstm
import lstm
from lstm import LSTMSentiment
from bilstm import BiLSTMSentiment
os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z\text-master')
from torchtext import data
os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z') 
import numpy as np
import argparse
import csv
import time
#import gensim
for x in range (1):
    torch.set_num_threads(8)
    torch.manual_seed(1)
    random.seed(1)
    #########################################################################
    cuda = True
    Difficulty = 'easy'
    #########################################################################
    #if Difficulty == 'easy': 
    def load_bin_vec(fname, vocab):
        """
        Loads 300x1 word vecs from Google (Mikolov) word2vec
        """
    #    count = 0
        word_vecs = {}
        with open(fname, "rb") as f:
            header = f.readline()
            vocab_size, layer1_size = map(int, header.split())
            binary_len = np.dtype('float32').itemsize * layer1_size
            for line in range(vocab_size):
                word = []
                while True:
                    ch = f.read(1).decode('latin-1')
                    if ch == ' ':
                        word = ''.join(word)
                        break
                    if ch != '\n':
                        word.append(ch)
                if word in vocab:
                   word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')
                else:
                    f.read(binary_len)
    #                count = count+1 
    #    print(count)
        return word_vecs
    #if Difficulty == 'hard': 
    #    def load_bin_vec(fname, vocab):
    #        count = 0
    #        success = 0
    #        """
    #        Loads 300x1 word vecs from Google (Mikolov) word2vec
    #        """
    #        word_vecs = {}
    #        w2v_model = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=True)
    #
    #        from gensim.models import word2vec
    #        w2v_model = word2vec.Word2Vec.load(fname)
    #        print(w2v_model.similarity('film', 'movie'))
    #        
    #        for word in vocab:
     #           print(word)
    #            here = 0 
    #            find = 0
    #            for letter in word:
    #                here = here + 1 
    #                if letter == ':':
    #                    find = here
    #            try:
    #                word_vecs[word] = w2v_model[word[0:find]]
    #                success = success + 1
    #            except KeyError:
    #                count = count + 1 
                    # some tokens from the dataset may not appear as an entry in the word embeds matrix, so i print and skip them
    #                print('Key error: {}'.format(word))
    #                continue
    #        print(count, 'problematic words', success, 'successful words')
    #        return word_vecs
    
    
    def get_accuracy(truth, pred):
        assert len(truth) == len(pred)
        right = 0
        for i in range(len(truth)):
            if truth[i] == pred[i]:
                right += 1.0
        return right / len(truth)
    
    
    def train_epoch_progress(model, train_iter, loss_function, optimizer, text_field, label_field, epoch):
        model.train()
        avg_loss = 0.0
        truth_res = []
        pred_res = []
        count = 0
        for batch in tqdm(train_iter, desc='Train epoch '+str(epoch+1)):
            sent, label = batch.text, batch.label
            label.data.sub_(1)
            truth_res += list(label.data)
            model.module.batch_size = len(label.data) 
            model.module.epoch = epoch+1
            model.module.hidden = model.module.init_hidden()
            model.module.count = count
            pred = model.module(sent)
            pred_label = pred.data.max(1)[1]
            pred_res += [x for x in pred_label]
            model.module.zero_grad()
            loss = loss_function(pred, label)
            avg_loss += loss.data[0]
            count += 1
            loss.backward()
            optimizer.step()
        avg_loss /= len(train_iter)
        acc = get_accuracy(truth_res, pred_res)
    #    print(model.hidden)
        return avg_loss, acc
    
    
    def train_epoch(model, train_iter, loss_function, optimizer):
        model.train()
        avg_loss = 0.0
        truth_res = []
        pred_res = []
        count = 0
        for batch in train_iter:
            sent, label = batch.text, batch.label
            label.data.sub_(1)
            truth_res += list(label.data)
            model.module.batch_size = len(label.data)
            model.module.hidden = model.module.init_hidden()
            pred = model.module(sent)
            pred_label = pred.data.max(1)[1].numpy()
            pred_res += [x for x in pred_label]
            model.module.zero_grad()
            loss = loss_function(pred, label)
            avg_loss += loss.data[0]
            count += 1
            loss.backward()
            optimizer.step()
        avg_loss /= len(train_iter)
        acc = get_accuracy(truth_res, pred_res)
        return avg_loss, acc
    
    
    def evaluate(model, data, loss_function, name):
        model.eval()
        avg_loss = 0.0
        truth_res = []
        pred_res = []
        for batch in data:
            sent, label = batch.text, batch.label
            label.data.sub_(1)
            truth_res += list(label.data)
            model.module.batch_size = len(label.data)
            model.module.hidden = model.module.init_hidden()
            pred = model.module(sent)
            pred_label = pred.data.max(1)[1]
            pred_res += [x for x in pred_label]
            loss = loss_function(pred, label)
            avg_loss += loss.data[0]
        avg_loss /= len(data)
        acc = get_accuracy(truth_res, pred_res)
        print(name + ': loss %.2f acc %.1f' % (avg_loss, acc*100))
        return acc
    
    def load_sst(text_field, label_field, batch_size, Difficulty, cuda):
    #    for x in range(B_sizes):
        if Difficulty == 'easy':
            os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z\pytorch-sentiment-classification-master copy/data')
            train, dev, test = data.TabularDataset.splits(path='./SST2/', train='train.tsv',
                                                          validation='dev.tsv', test='test.tsv', format='tsv',
                                                          fields=[('text', text_field), ('label', label_field)]) 
        if Difficulty == 'medium':
            os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z\pytorch-sentiment-classification-master')
            train, dev, test = data.TabularDataset.splits(path='./data/Dataset/', train='train.tsv',
                                                          validation='test.tsv', test='test.tsv', format='tsv',
                                                          fields=[('text', text_field), ('label', label_field)])
        if Difficulty == 'hard':      
            os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z')
            train, dev, test = data.TabularDataset.splits(path='./aclImdb/', train='usable_train.tsv',
                                                          validation='usable_dev.tsv', test='usable_test.tsv', format='tsv',
                                                          fields=[('text', text_field), ('label', label_field)])
        text_field.build_vocab(train, dev, test)
        label_field.build_vocab(train, dev, test)
    #   batch_size =  10*(.1*x*(torch.cos(torch.FloatTensor([x]))*torch.sin(torch.FloatTensor([2*x])))+.08*x).round() + 1
    
        ## for GPU run
        if cuda == True:
            train_iter, dev_iter, test_iter = data.BucketIterator.splits((train, dev, test),
                         batch_sizes=(batch_size, len(dev), len(test)), sort_key=lambda x: len(x.text), repeat=False, device=None)
        else:
            train_iter, dev_iter, test_iter = data.BucketIterator.splits((train, dev, test),
                         batch_sizes=(batch_size, len(dev), len(test)), sort_key=lambda x: len(x.text), repeat=False, device=-1)
        return train_iter, dev_iter, test_iter
    #def adjust_learning_rate(learning_rate, optimizer, epoch):
    #     lr = learning_rate * (0.9 ** (epoch //1))
    #     for param_group in optimizer.param_groups:
    #         param_group['lr'] = lr
    #     return optimizer
    
    
    #for clustering
    #if Difficulty == 'easy':
    #    EPOCHS= 30
    #    BATCH_SIZE = 30
    #    HIDDEN_DIM = 150
        
    if Difficulty == 'easy':
        EPOCHS= 1000
        BATCH_SIZE = 5
        HIDDEN_DIM = 100
        #HIDDEN_DIM = round((150*(4/6)))
        #HIDDEN_DIM = round((150*(4/5)))
        #HIDDEN_DIM = round((150*(4/8)))
        
    if Difficulty == 'medium':
        EPOCHS= 30
        BATCH_SIZE = 5
        HIDDEN_DIM = 50
        #HIDDEN_DIM = round((150*(4/6)))
        #HIDDEN_DIM = round((150*(4/5)))
        #HIDDEN_DIM = round((150*(4/8)))
        
    if Difficulty == 'hard':
        EPOCHS= 30
        BATCH_SIZE = 5
        HIDDEN_DIM = 50
        
    USE_GPU = torch.cuda.is_available()
    EMBEDDING_DIM = 300
    
    timestamp = str(int(time.time()))
    best_dev_acc = 0.0
    #B_sizes = np.ones(BATCH_SIZE) #BATCH SIZE MUST BE MINIMUM OF 50
    #for x in range(BATCH_SIZE):
    #    B_sizes[x]= 10*(.1*x*(torch.cos(torch.FloatTensor([x]))*torch.sin(torch.FloatTensor([2*x])))+.08*x).round() + 1
    #B_sizes = max(B_sizes)
    
    #def weights_init(LSTMSentiment):
    #    classname = LSTMSentiment.__class__.__name__
    #    if classname.find('LSTMSentiment') != -1:
    #        LSTMSentiment.weight.data.fill_(.5)
    #        LSTMSentiment.bias.data.fill_(0)
    
    #Using = BiLSTMSentiment
    Using = LSTMSentiment
    if cuda:
        torch.set_default_tensor_type('torch.cuda.FloatTensor')
    print('Using: ', Using)   
    text_field = data.Field(lower=True)
    label_field = data.Field(sequential=False)
    train_iter, dev_iter, test_iter = load_sst(text_field, label_field, BATCH_SIZE, Difficulty = Difficulty, cuda = cuda)
    count = 0
    model = Using(embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, vocab_size=len(text_field.vocab),
                           use_gpu=cuda, label_size=len(label_field.vocab)-1, batch_size=BATCH_SIZE)
    if cuda:
        model = model.cuda()
        model=torch.nn.DataParallel(model, device_ids=[0])
    
    print('Load word embeddings...')
    # # glove
    #text_field.vocab.load_vectors('glove.6B.100d')
    #text_field.vocab.load_vectors('hard_dataset_embeddings')
    
    # word2vector
    word_to_idx = text_field.vocab.stoi
    
    pretrained_embeddings = np.random.uniform(-0.25, 0.25, (len(text_field.vocab), EMBEDDING_DIM))
    pretrained_embeddings[0] = 0
    #if Difficulty == 'easy':
    os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z\pytorch-sentiment-classification-master')
    word2vec = load_bin_vec('./data/GoogleNews-vectors-negative300.bin', word_to_idx)
    #if Difficulty == 'hard':
    #    os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z')
    #    word2vec = load_bin_vec('hard_dataset_embeddings', word_to_idx)
        
    for word, vector in word2vec.items():
        pretrained_embeddings[word_to_idx[word]-1] = vector
    
    # text_field.vocab.load_vectors(wv_type='', wv_dim=300)
    
    model.module.embeddings.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
    # model.embeddings.weight.data = text_field.vocab.vectors
    # model.embeddings.embed.weight.requires_grad = False
    
    
    best_model = model
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    #optimizer = optim.Adam(model.parameters(), lr=1e-2,weight_decay=1e-5)
    loss_function = nn.NLLLoss()
    #loss_function = nn.CrossEntropyLoss()

    
    print('Training...')
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))
    time_start = time.time()
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    for epoch in range(EPOCHS):
    #    batch_modifer = epoch + 1
        avg_loss, acc = train_epoch_progress(model, train_iter, loss_function, optimizer, text_field, label_field, epoch)
        tqdm.write('Train: loss %.2f acc %.1f' % (avg_loss, acc*100))
        dev_acc = evaluate(model, dev_iter, loss_function, 'Dev')
        if dev_acc > best_dev_acc:
            if best_dev_acc > 0:
                os.system('rm '+ out_dir + '/best_model' + '.pth')
            best_dev_acc = dev_acc
            best_model = model
    #        torch.save(best_model.state_dict(), out_dir + '/best_model' + '.pth')
            # evaluate on test with the best dev performance model
            test_acc = evaluate(best_model, test_iter, loss_function, 'Test')
    dev_acc = evaluate(model, dev_iter, loss_function, 'Dev')
    test_acc = evaluate(best_model, test_iter, loss_function, 'Final Test')
    time_finish = time.time()
    total_time = time_finish - time_start
    print('total time', (time_finish-time_start)/3600, 'hours')
    total_time = (time_finish-time_start)/3600
    
    os.chdir(r'C:\Users\john\Desktop\Deep_Learning_A_Z\pytorch-sentiment-classification-master/output/')
    for x in range(1001):
        if os.path.isfile('out_'+str(x)+'.csv') == False:
            ID = x 
            break 
    eval(str(np.savetxt('out_'+str(ID)+'.csv', np.array([dev_acc,test_acc,total_time]), delimiter=',')))
    #eval("np.savetxt('out_pm_%i.csv', np.array([dev_acc,test_acc,total_time]), delimiter=',')" %id)

Hi, that really helps. Thanks!!

I also want to ask: does using multiple GPUs actually speed it up? I have four GPUs, but with this code I can only use one GPU…