Segmentation fault after training one epoch of word embeddings

When I am training a CBOW word-embedding model on one GPU, a segmentation fault occurs just after the first training epoch finishes.

The training code is below:

loss_function=nn.NLLLoss()
optimizer=optim.RMSprop(cbow_embedder.parameters(), lr=0.05) #lstm optimizers, may need to change the lr
cbow_dataset=torch.utils.data.TensorDataset(contexts, labels)
cbow_dataloader=torch.utils.data.DataLoader(cbow_dataset, batch_size=1000, shuffle=False, pin_memory=False, num_workers=4)
for epoch in range(100):
    total_loss=0.0
    recent_loss=0.0
    num_batches=0.0
    ctime=time.time()
    for contexts, labels in cbow_dataloader:
        cbow_embedder.zero_grad()
        log_probs=cbow_embedder(contexts)
        loss=loss_function(log_probs.view((contexts.shape[0], -1)), labels.cuda())
        print(loss)
        loss.backward()
        optimizer.step()
        total_loss+=loss
        recent_loss+=loss
        num_batches+=1.0
        if num_batches%10==0:
            ntime=time.time()
            print(num_batches, list(contexts.size())[0], recent_loss/10)
            ctime=time.time()
            recent_loss=0.0
        if num_batches%500==0:
            torch.save(cbow_embedder, save_dir+'/cbow_embedding')
            print('saved cbow')
    print('epoch', epoch, 'avg_loss', total_loss/num_batches)

Can anyone tell me what is going wrong here? Thanks!

Is it running on CPU without any issues?

It looks like you don’t push contexts to the GPU. Or are they already loaded onto the GPU in your Dataset?

It doesn’t have any issues when running on the CPU. I didn’t push contexts to the GPU because of the memory limit; instead, I move the input context to the GPU during the forward pass of the word-embedding model. Do you think I need to put the whole dataset on the GPU for training?

Ah, so you move the context onto the GPU inside cbow_embedder?
Then that shouldn’t be the cause of the error. No, I think your approach is fine; pushing just the current batch onto the GPU instead of the whole dataset is perfectly reasonable.
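
For reference, a minimal sketch of that pattern (the EmbeddingBag stand-in model, the sizes, and the variable names here are placeholders; your CBOW_Embedding would take its place):

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Toy stand-in model: sums the context embeddings and treats the result as logits
model = nn.EmbeddingBag(200, 200, mode='sum').cuda()
loss_function = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(model.parameters(), lr=0.05)

# The dataset itself stays on the CPU
dataset = TensorDataset(torch.randint(0, 200, (5000, 10)), torch.randint(0, 200, (5000,)))
loader = DataLoader(dataset, batch_size=500)

for contexts, labels in loader:
    # Only the current batch is moved to the GPU
    contexts, labels = contexts.cuda(), labels.cuda()
    optimizer.zero_grad()
    logits = model(contexts)
    loss = loss_function(logits, labels)
    loss.backward()
    optimizer.step()

Keeping the TensorDataset on the CPU and moving only each batch with .cuda() is the usual way to train when the full dataset doesn’t fit in GPU memory.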

Do you get a full error message or just a SEGFAULT?

Just a segfault, immediately after the first epoch finishes.

Could you post a small runnable code snippet so that I could try it on my machine?
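
In the meantime, enabling Python’s faulthandler at the top of your script might at least show which Python frame the crash happens in (just a debugging aid, not a fix):

import faulthandler
faulthandler.enable()  # prints a Python traceback on SIGSEGV and other fatal signals

Running the script with python -X faulthandler has the same effect.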

# Imports the snippet relies on
import random
import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Assign the save_dir first
save_dir = "/home/"
load_cbow_embedding = False
class CBOW_Embedding(nn.Module):
    
    def __init__(self, vocab_size, context_size, embedding_size):
        super(CBOW_Embedding, self).__init__()
        self.vocab_size=vocab_size
        self.context_size=context_size
        self.embedding_size=embedding_size
        self.embedding=nn.Embedding(self.vocab_size, self.embedding_size)
        # Output projection weight and bias, created directly on the GPU
        self.params=nn.ParameterList([nn.Parameter(torch.randn(self.embedding_size, self.vocab_size).cuda()), nn.Parameter(torch.randn(self.vocab_size).cuda())])
#         self.A=
#         self.b=
    
    def forward(self, context):
        # context: LongTensor of word indices, shape (batch_size, context_size);
        # the batch is moved to the GPU here during the forward pass
        #print(context.shape[0])
        context=context.view((-1, self.context_size, 1)).cuda()
        #print(context)
        embedded_context=self.embedding(context).view((-1, self.context_size, self.embedding_size))
        context_sum=torch.sum(embedded_context, 1)
        output=torch.matmul(context_sum, self.params[0])
        output=output+self.params[1]
        probs=F.log_softmax(output, dim=1)
        return probs
    
    def embed(self, word):
        with torch.no_grad():
            return self.embedding(word).view((1, -1))
    
    def lookup_probs(self, embedded):
        with torch.no_grad():
            embedding_lens=torch.pow(torch.sum(torch.pow(self.embedding.weight, 2.0), dim=1), 0.5)
            embed_len=torch.pow(torch.sum(torch.pow(embedded, 2.0)), 0.5)
            similarities=self.embedding.weight*embedded/(embedding_lens*embed_len)
            probs=F.softmax(similarities)
            return probs
#Train CBOW embedding
if load_cbow_embedding:
    cbow_embedder=torch.load(save_dir+'/cbow_embedding') 
     #cbow_embedder=pickle.load(open(save_dir+'/cbow_embedding', "rb" ))
else:
    print('debugging')
    # Generating data for debugging
    dataset_size = 20000
    valid_words = list()
    for _ in range(20000):
        i = random.randint(0, 199)
        valid_words.append(i)
    valid_words=np.asarray(valid_words, dtype = 'long')
    context_size=10
    embedding_size = 100
    vocab_size = 200
    cbow_embedder=CBOW_Embedding(vocab_size, context_size, embedding_size)
    cbow_embedder=nn.DataParallel(cbow_embedder)
    cbow_embedder.cuda()#.to(device)
    contexts=np.zeros((len(valid_words)-context_size+1, context_size), dtype = 'long')
    labels=np.zeros(len(valid_words)-context_size+1, dtype = 'long')
    half_size=int(context_size/2)
    for ind in range(half_size, len(valid_words)-half_size-1):#
        contexts[ind-half_size, 0:half_size]=valid_words[ind-half_size:ind]
        contexts[ind-half_size, half_size:]=valid_words[ind+1:ind+1+half_size]
        labels[ind-half_size]=valid_words[ind]
    contexts=torch.from_numpy(contexts)
    labels=torch.from_numpy(labels)
    #contexts=contexts.to(device)
    #labels=labels.to(device)
     
    #train cbow
    loss_function=nn.NLLLoss()
    optimizer=optim.RMSprop(cbow_embedder.parameters(), lr=0.05) #lstm optimizers, may need to change the lr
    cbow_dataset=torch.utils.data.TensorDataset(contexts, labels)
    cbow_dataloader=torch.utils.data.DataLoader(cbow_dataset, batch_size=1000, shuffle=False, pin_memory=False, num_workers=4) # if you get out-of-memory errors, reduce the batch size
    for epoch in range(100):
        total_loss=0.0
        recent_loss=0.0
        num_batches=0.0
        ctime=time.time()
        for contexts, labels in cbow_dataloader:
            cbow_embedder.zero_grad()
            log_probs=cbow_embedder(contexts)
            loss=loss_function(log_probs.view((contexts.shape[0], -1)).cuda(), labels.cuda())
            print(loss)
            loss.backward()
            optimizer.step()
            total_loss+=loss
            recent_loss+=loss
            num_batches+=1.0
            if num_batches%10==0:
                ntime=time.time()
                print(num_batches, list(contexts.size())[1], recent_loss/10)
                ctime=time.time()
                recent_loss=0.0
            if num_batches%400==0:
                torch.save(cbow_embedder, save_dir+'/cbow_embedding')
                print('saved cbow')
        torch.save(cbow_embedder, save_dir+'/cbow_embedding')
        print('epoch', epoch, 'avg_loss',  total_loss/num_batches)

Here is a snippet. By the way, I am using just one GPU for training; will it matter if I assign num_workers = 4 for the data loader?
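
As a quick check on the num_workers question, the loader can be recreated with workers disabled so that data loading runs in the main process (a diagnostic sketch reusing cbow_dataset from the snippet above, not a fix); if the segfault disappears, the crash is likely tied to the worker processes:

# Diagnostic only: load data in the main process instead of worker processes
cbow_dataloader=torch.utils.data.DataLoader(
    cbow_dataset,
    batch_size=1000,
    shuffle=False,
    pin_memory=False,
    num_workers=0,  # no worker processes
)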