Hello,
I’m trying to train my neural net with the following code:
import torch
import torch.nn as nn

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df, is_valid=False):
        self.l = len(df)
        df_numpy = df.to_numpy()
        # first two columns are word/context indices, third is the target
        self.x = torch.from_numpy(df_numpy[:, 0:2]).long().cuda()  # nn.Embedding needs int64 indices
        self.target = torch.from_numpy(df_numpy[:, 2:3]).float().cuda()

    def __getitem__(self, i):
        return [self.x[i], self.target[i]]

    def __len__(self):
        return self.l
class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.embeddings_bias = nn.Embedding(vocab_size, 1)
        self.context = nn.Embedding(vocab_size, embedding_dim)
        self.context_bias = nn.Embedding(vocab_size, 1)

    def forward(self, inputs):
        a = inputs[:, 0]  # word indices
        b = inputs[:, 1]  # context indices
        # dot product of word and context embeddings, plus per-pair biases
        num = (self.embeddings(a) * self.context(b)).sum(dim=1, keepdim=True)
        num += self.embeddings_bias(a) + self.context_bias(b)
        return torch.sigmoid(num)
train_ds = Dataset(train)  # train.shape == (52962501, 3)
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=1024, shuffle=True)
model = Word2Vec(vocab_size, 128)  # vocab_size == 20000
model.cuda()
Environment:
CUDA: 10.1
torch: 1.7.1
GPU: 32 GB memory; CPU: 24 cores
The training loop is standard (forward pass, then BCELoss().backward()); a simplified sketch is below. Unfortunately, almost all of the work is CPU-bound (one CPU core sits at 100%) while GPU utilization is only about 10%. Could you help me understand why the CPU is the bottleneck here, given that the full dataset already lives on the GPU?
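For completeness, the loop looks roughly like this (a minimal sketch; the optimizer choice, learning rate, and num_epochs are placeholders, not my exact settings):

criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)  # placeholder optimizer/lr

for epoch in range(num_epochs):  # num_epochs is a placeholder
    for x, target in train_dl:
        # x and target are already CUDA tensors (see Dataset above),
        # so no host-to-device copies happen inside the loop
        optimizer.zero_grad()
        pred = model(x)                 # forward pass
        loss = criterion(pred, target)  # BCE loss
        loss.backward()                 # backprop
        optimizer.step()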