How come a DataLoader with 10 workers is slower than a single process?

I have a text file of about 3M training samples, one sample per line. First I tried a single process that reads samples line by line in mini-batches and feeds them to my model. GPU usage was only around 20%, so I guessed the reading process was the bottleneck. I then implemented a parallel version using DataLoader, expecting a prominent speedup, but it turned out to be more than 2x slower than the single-process version.
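For reference, the single-process baseline looked roughly like this (a simplified sketch; `read_batches`, `batch_size`, and the omitted parsing step are placeholders, not my exact code):

```python
# simplified sketch of the single-process baseline:
# read the file line by line and group samples into mini-batches
def read_batches(path, batch_size):
    batch = []
    with open(path) as f:
        for line in f:
            batch.append(line.rstrip("\n"))  # real code parses the sample here
            if len(batch) == batch_size:
                yield batch
                batch = []
    if batch:  # final partial batch
        yield batch
```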
This is my code:

from torch.utils.data import IterableDataset, get_worker_info

# an iterable object over a training data file
class SampleFile(object):

    def __init__(self, filePath):
        self.filePath = filePath

    def __iter__(self):
        with open(self.filePath) as file:
            while True:
                line = file.readline().rstrip("\n")
                if not line:
                    break
                sample = self.parseSample(line)
                yield sample

# Dataset object reading from one or (with workers) multiple data files
class TrainingSet(IterableDataset):

    def __init__(self, dataFilePath, workerNum):
        self.dataFilePath = dataFilePath
        self.workerNum = workerNum

    def __iter__(self):
        workerInfo = get_worker_info()
        if workerInfo is None:
            # single process, just read the whole data file
            return iter(SampleFile(self.dataFilePath))
        else:
            # read a split data file generated by the Linux `split` command
            workerNumLen = len(str(self.workerNum - 1))
            suffix = (workerNumLen - len(str(workerInfo.id))) * '0' + str(workerInfo.id)
            partFilePath = self.dataFilePath + suffix
            return iter(SampleFile(partFilePath))

Could anyone give me a hint? Thanks!