I have enough RAM to load a large text file, but during the processing inside the Dataset logic my kernel dies.
I tried loading the text file in chunks, but I want sequences of length 256, and in some cases the train_loader comes back empty because the chunk tokenizes to fewer than 256 tokens.
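Roughly what that chunked attempt looked like (a simplified sketch, not my exact code: the chunk size is an illustrative value and it reuses the create_dataloader defined further down):
# Simplified sketch of the chunked attempt (illustrative chunk size).
# When a chunk tokenizes to fewer than max_length tokens, the range() in
# TamilDataset yields no windows, so the loaders for that chunk come back empty.
CHUNK_CHARS = 10_000_000  # illustrative value, not my real chunk size

with open('data/shards/ta_dedup.txt_shard_1.txt', 'r', encoding='utf-8') as f:
    while True:
        chunk = f.read(CHUNK_CHARS)
        if not chunk:
            break
        train_loader, val_loader = create_dataloader(
            chunk,
            batch_size=1,
            max_length=256,
            stride=1,
            shuffle=True,
            drop_last=False,
            num_workers=0
        )
        # ... train on this chunk's loaders ...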
This is my current (non-chunked) code -
import multiprocessing
from typing import List, Tuple

import sentencepiece as spm
import torch
from torch.utils.data import DataLoader, Dataset, random_split

with open('data/shards/ta_dedup.txt_shard_1.txt', 'r', encoding='utf-8') as f:
    rawtext = f.read()
class TamilDataset(Dataset):
    def __init__(self, text: str, tokenizer: spm.SentencePieceProcessor, max_length: int, stride: int, debug: bool = False):
        """
        PyTorch Dataset for tokenized Tamil text.

        Args:
            text (str): Raw text to tokenize.
            tokenizer (spm.SentencePieceProcessor): SentencePiece tokenizer.
            max_length (int): Maximum sequence length.
            stride (int): Stride for overlapping chunks.
            debug (bool, optional): Enable debug output. Defaults to False.
        """
        self.rawtext = text
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.stride = stride
        self.debug = debug
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text up front
        token_ids = self.tokenizer.encode(self.rawtext)

        # Slide a window over the token stream to build overlapping
        # (input, target) pairs of length max_length, target shifted by one
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self) -> int:
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]
def create_dataloader(
    rawtext: str,
    batch_size: int,
    max_length: int,
    stride: int,
    shuffle: bool,
    drop_last: bool,
    num_workers: int,
    train_split: float = 0.75
) -> Tuple[DataLoader, DataLoader]:
    """
    Create train and validation dataloaders with a specified split.

    Args:
        rawtext (str): Raw text.
        batch_size (int): Batch size for dataloaders.
        max_length (int): Maximum sequence length.
        stride (int): Stride for overlapping chunks.
        shuffle (bool): Whether to shuffle the training data.
        drop_last (bool): Whether to drop the last incomplete batch.
        num_workers (int): Number of worker processes for data loading.
        train_split (float, optional): Proportion of data to use for training. Defaults to 0.75.

    Returns:
        Tuple of train and validation DataLoaders.
    """
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.load('models/tok32000.model')

    dataset = TamilDataset(rawtext, tokenizer, max_length, stride, debug=True)

    train_size = int(len(dataset) * train_split)
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        drop_last=drop_last,
        num_workers=num_workers
    )
    return train_dataloader, val_dataloader
train_dataloader, val_dataloader = create_dataloader(
    rawtext,
    batch_size=1,
    max_length=256,
    stride=1,
    shuffle=True,
    drop_last=False,
    num_workers=0
)
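I suspect the problem is memory rather than the file read itself: with stride=1 the Dataset materialises roughly one (input, target) pair per token, and each pair is two int64 tensors of length 256. A rough back-of-envelope, assuming a token count I have not actually measured for my shard:
# Back-of-envelope memory for the pre-built window lists (token count is an assumption).
n_tokens = 50_000_000                      # assumed tokens in one shard, not measured
max_length, stride = 256, 1
n_windows = max(0, n_tokens - max_length) // stride
bytes_per_window = 2 * max_length * 8      # input + target tensors, int64
print(f"~{n_windows * bytes_per_window / 1e9:.0f} GB")   # roughly 205 GB, before per-tensor overhead
So if the shard is anywhere near that size, the lists built in __init__ alone would far exceed my RAM, which would explain the kernel dying before training even starts.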