Hi everyone
Hi,
I am trying to do multiple GPU training using a CNN for text classification. The model works fine using single GPU. But when I am trying to use multiple GPUs. I see that all my GPUs have some memory filled but only my GPU 0 has volatile GPU usage. Also using multiple gpus, my training and validation scores arent going down at all. But it works well with single GPU.
Can someone point me in the right direction?
System Configurations:
Pytorch 0.4
Python 3
4 GPUs
each has 16 GB memory
My CNN model
class CNN(nn.Module):
def __init__(self, vocab, filters = [(3,100),(4,100),(5,100),(7,100)], dp = 0.3, bn=True, nl_func=F.relu):
super(CNN, self).__init__()
V,D = vocab.vectors.size()
self.embed = nn.Embedding(V,D)
self.embed.weight.data.copy_(vocab.vectors)
self.embed.weight.requires_grad = False
self.conv_layers = nn.ModuleList([nn.Conv1d(in_channels=D, out_channels=n, kernel_size=ksz) for ksz,n in filters])
self.amp = nn.AdaptiveMaxPool1d(1)
self.dp = nn.Dropout(dp)
self.nl_func = nl_func
num_features = 0
for _,n in filters:
num_features += n
self.bnz = nn.BatchNorm1d(num_features) if bn else lambda x: x
self.fc = nn.Linear(num_features,1)
def forward(self,text,label):
text_embed = self.embed(text).transpose(1,2).transpose(0,2)
conv = [self.amp(self.nl_func(Conv_layer(text_embed))) for Conv_layer in self.conv_layers]
concat_conv = torch.cat(conv,2)
concat_conv = concat_conv.view(concat_conv.size(0),-1)
bn_concat_conv = self.bnz(concat_conv)
dp_bn_concat_conv = self.dp(bn_concat_conv)
out = self.fc(dp_bn_concat_conv)
out = torch.sigmoid(out).transpose(0,1)
return out
function_dict = {'relu':F.relu}
def get_model(vocab,config,model_type='CNN'):
if model_type == 'CNN':
lr = config["lr"]
dp = config["dp"]
filter_szs = config["filter_szs"]
filter_mp = config["filter_mp"]
nl = function_dict.get(config["nl_func"],None)
filters = [(f,filter_mp) for f in filter_szs]
model_name = f"emb_name_{config['embed_name']}_lr_{lr}_dp_{dp}_filters_{filter_szs}_filter_mps_{filter_mp}.pth"
print(f"Configuration: Embedding_name-{config['embed_name']} lr-{lr}, dropout-{dp}, filter_szs-{filter_szs}, no of filters-{filter_mp}")
model = CNN(vocab, filters = filters, dp = dp, bn=True, nl_func=nl).cuda()
model = torch.nn.DataParallel(model)
return model, model_name
This is my training class
class TrainClassifier:
def __init__(self,trn_dl=None,val_dl=None):
self.train_loader = trn_dl
self.val_loader = val_dl
def _train_iteration(self,model):
# Setting model in train mode
model.train()
total,sum_loss = 0,0
loss_function = nn.BCELoss(reduction='none').cuda()
for x_text,label in self.train_loader:
bsz = label.size(1)
x_text = x_text.cuda()
y = label.cuda()
y_hat = model(x_text,y)
loss = loss_function(y_hat,y).squeeze(0).sum()
print("loss",loss)
loss = loss.sum()/bsz
self.optim.zero_grad()
loss.backward()
self.optim.step()
total += bsz
sum_loss += bsz*(loss.item())
return sum_loss/total
def test(self,model,test_dl=None):
model.eval()
test_dl = test_dl if test_dl else self.val_loader
total, sum_loss = 0,0
yT,ypT = torch.tensor([]),torch.tensor([])
loss_function = nn.BCELoss(reduction='none').cuda()
for x_text,label in test_dl:
bsz = label.size(1)
x_text = x_text.cuda()
y = label.cuda()
y_hat = model(x_text,y)
loss = loss_function(y_hat,y).squeeze(0).sum()
loss = loss.sum()/bsz
sum_loss += bsz*(loss.item())
total += bsz
yT = torch.cat((yT,y.squeeze(0).cpu()),0)
ypT = torch.cat((ypT,y_hat.squeeze(0).cpu()),0)
return sum_loss/total,t2n(yT),t2n(ypT)
def train(self,model,config,model_name='',verbose=True,final=True):
if self.train_loader is None or self.val_loader is None:
print("Check whether you have passed train and test loader")
return
self.optim = get_optimizer(model,config['lr'],wd=0)
# loading a model if present
model, self.optim, start_epoch = load_checkpoint(model, self.optim, model_name)
if start_epoch < config['epochs']-1:
for i in tqdm(range(start_epoch+1,config['epochs']),desc='Epochs:',dynamic_ncols=True):
train_loss = self._train_iteration(model)
vld_loss, truth, pred = self.test(model)
precision, recall, f1 = classification_report(truth, pred,threshold=0.5)
if verbose:
tqdm.write(f'Epoch:{i}')
tqdm.write("Training BCE : %f 'Validation BCE: %f"%(train_loss,vld_loss))
tqdm.write(f'test precision @ 0.5:{precision}, test recall @ 0.5:{recall}')
tqdm.write(f'------------------------------------------------------------------------------')
save_checkpoint(self.optim,model,config['epochs'],vld_loss,model_name)
plot_pr_curves(truth,pred)
if final:
save_final_model(model,model_name)
return model