Thank you for the kind words.

Here is an example of a non-trivial Dataset that I use to preprocess data for training World Models’ memory module:

```
class MemoryDataset(Dataset):
    """Dataset of sequential data to train memory.

    Args:
        dataset_path (string): Path to HDF5 dataset file.
        sequence_len (int): Desired output sequence len.
        terminal_prob (float): Probability of sampling sequence that finishes with
            terminal state.
        dataset_fraction (float): Fraction of dataset to use during training,
            value range: (0, 1] (dataset forepart is taken).
        is_deterministic (bool): If True, return mean latent states; otherwise
            return latent states sampled from the stored distribution.

    Note:
        Arrays should have the same size of the first dimension and their type
        should be the same as desired Tensor type.
    """

    def __init__(self, dataset_path, sequence_len, terminal_prob,
                 dataset_fraction, is_deterministic):
        # NOTE(review): `assert` is stripped under `python -O`; kept as-is so
        # callers still see AssertionError on bad arguments.
        assert 0 < terminal_prob <= 1.0, "0 < terminal_prob <= 1.0"
        assert 0 < dataset_fraction <= 1.0, "0 < dataset_fraction <= 1.0"

        # The HDF5 handle is opened lazily in `__getitem__` so that each
        # DataLoader worker process gets its own handle; sharing one handle
        # across workers corrupts reads, see:
        # https://stackoverflow.com/questions/46045512/h5py-hdf5-database-randomly-returning-nans-and-near-very-small-data-with-multi
        self.dataset = None
        self.dataset_path = dataset_path
        self.sequence_len = sequence_len
        self.terminal_prob = terminal_prob
        self.dataset_fraction = dataset_fraction
        self.is_deterministic = is_deterministic

        # Read static metadata once; the short-lived handle is closed right away.
        with h5py.File(self.dataset_path, "r") as dataset:
            self.latent_dim = dataset.attrs["LATENT_DIM"]
            self.action_dim = dataset.attrs["ACTION_DIM"]
            self.n_games = dataset.attrs["N_GAMES"]

    def __getitem__(self, idx):
        """Get sequence at random starting position of given sequence length
        from episode `idx`.

        Returns:
            ([states, actions], [next_states]) — each tensor has
            `self.sequence_len` rows; episodes shorter than
            `sequence_len + 1` steps yield sequences zero-padded at the end.
        """
        offset = 1  # "next states" are shifted forward by this many steps

        # Lazy open: one handle per DataLoader worker (see note in __init__).
        if self.dataset is None:
            self.dataset = h5py.File(self.dataset_path, "r")

        t_start, t_end = self.dataset['episodes'][idx:idx + 2]
        episode_length = t_end - t_start

        # An episode must span `sequence_len + offset` transitions to fill the
        # output; clamp for shorter episodes (output is zero-padded below).
        sequence_len = min(self.sequence_len, episode_length - offset)

        # Sample where to start the sequence in episode `idx`.
        # '- offset' because "next states" are offset by 'offset'.
        if np.random.rand() < self.terminal_prob:
            # Take the sequence that ends with the terminal state.
            start = t_start + episode_length - sequence_len - offset
        else:
            # NOTE: np.random.randint takes EXCLUSIVE upper bound of range to sample from
            start = t_start + np.random.randint(max(1, episode_length - sequence_len - offset))

        states_ = torch.from_numpy(self.dataset['states'][start:start + sequence_len + offset])
        actions_ = torch.from_numpy(self.dataset['actions'][start:start + sequence_len])

        # Fixed-size output buffers; trailing rows stay zero for short episodes.
        states = torch.zeros(self.sequence_len, self.latent_dim, dtype=states_.dtype)
        next_states = torch.zeros(self.sequence_len, self.latent_dim, dtype=states_.dtype)
        actions = torch.zeros(self.sequence_len, self.action_dim, dtype=actions_.dtype)

        # Sample latent states (this is done to prevent overfitting of memory
        # to a specific 'z').
        if self.is_deterministic:
            z_samples = states_[:, 0]  # use the mean directly
        else:
            mu = states_[:, 0]
            # exp(x / 2) turns x into a std; assumes channel 1 stores the
            # log-variance — TODO confirm against the dataset writer.
            sigma = torch.exp(states_[:, 1] / 2)
            z_samples = Normal(loc=mu, scale=sigma).sample()

        states[:sequence_len] = z_samples[:-offset]
        next_states[:sequence_len] = z_samples[offset:]
        actions[:sequence_len] = actions_

        return [states, actions], [next_states]

    def __len__(self):
        # Only the forepart (first `dataset_fraction`) of the episodes is used.
        return int(self.n_games * self.dataset_fraction)
```