The following error occurred during model training:

ValueError: operands could not be broadcast together with shapes (1855,) (1855,64)

Input: audio features, i.e. waveforms converted to log mel spectrograms with 64 mel bands, extracted with the librosa package. AudioAugmentation is not used.
Since the audio features have different lengths, I pad each batch to its longest length with clotho_collate_fn. The detailed code is as follows.

The feature-extraction settings:
settings_features = {
    'keep_raw_audio_data': False,
    'process': {'sr': 44100,
                'sr_resample': 16000,
                'nb_fft': 1024,
                'hop_size': 512,
                'nb_mels': 64,
                'window_function': 'hann',
                'center': True,
                'f_min': 0.0,
                'f_max': None,
                'htk': False,
                'power': 1.0,
                'norm': 1}}
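For reference, this is a minimal sketch of what I understand the feature extraction in tools.features_log_mel_bands to do with these settings (the file name and the exact log scaling are my assumptions). The point is the shapes: the raw waveform is 1-D, while the log mel features are 2-D with 64 bands:

import librosa
import numpy as np

# Minimal sketch, assuming feature_extraction wraps librosa roughly like this;
# 'example.wav' is a hypothetical file.
y, _ = librosa.load('example.wav', sr=44100)
mel = librosa.feature.melspectrogram(y=y, sr=44100, n_fft=1024, hop_length=512,
                                     n_mels=64, window='hann', center=True,
                                     power=1.0)
log_mel = np.log(mel + np.finfo(float).eps).T
print(y.shape)        # 1-D waveform, e.g. (N,)
print(log_mel.shape)  # 2-D features, e.g. (n_frames, 64)

The MixUp transform and the (unused) AudioAugmentation: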
import numpy as np
import random
from tools.features_log_mel_bands import feature_extraction
from pathlib import Path
import pysndfx
import gc
import copy
from tools.file_io import load_audio_file
import torch
__author__ = 'Nikita Kuzmin -- Lomonosov Moscow State University'

class MixUp:
    def __init__(self, p, settings_features, simple_concat_captions=True,
                 sample_audio=False):
        self.p = p
        self.sample_audio = sample_audio
        self.settings_features = settings_features
        self.simple_concat_captions = simple_concat_captions

    def from_mel(self, mel):
        return 700 * (10 ** (mel / 2595.0) - 1)

    def to_mel(self, hertz):
        return 2595.0 * np.log10(1 + hertz / 700.0)

    def mix_audio(self, first_audio, second_audio):
        a = np.random.uniform(0.4, 0.6)  # mixing coefficient, drawn uniformly from [0.4, 0.6]
        shorter, longer = first_audio, second_audio
        if shorter.shape[0] == longer.shape[0]:
            if self.sample_audio:
                return (longer + shorter) / 2.0
            else:
                # from_mel_to_audio comes from the project's tools; its import is not shown here
                longer = from_mel_to_audio(longer, **self.settings_features['process']) * a
                shorter = from_mel_to_audio(shorter,
                                            **self.settings_features['process'])
                return feature_extraction((longer + shorter) / 2,
                                          **self.settings_features['process'])
        if first_audio.shape[0] > second_audio.shape[0]:
            shorter, longer = longer, shorter
        if self.sample_audio:
            start = random.randint(0, longer.shape[0] - 1 - shorter.shape[0])
            end = start + shorter.shape[0]
            longer *= a
            # equivalent to shorter * (1 - a); this is the line that raises the ValueError below
            longer[start:end] += np.dot(shorter, (1 - a))
        else:
            longer = from_mel_to_audio(longer, **self.settings_features['process']) * a
            shorter = from_mel_to_audio(shorter,
                                        **self.settings_features['process'])
            start = random.randint(0, longer.shape[0] - 1 - shorter.shape[0])
            end = start + shorter.shape[0]
            longer[start:end] += np.dot(shorter, (1 - a))
            longer = feature_extraction(longer,
                                        **self.settings_features['process'])
        return longer

    def mix_labels(self, first_labels, second_labels):
        if self.simple_concat_captions:
            return np.hstack([first_labels[:-1], second_labels[1:]])
        else:
            # interleave the two captions token by token, keeping the first and last tokens as delimiters
            first_token = first_labels[0]
            last_token = first_labels[-1]
            first_labels = first_labels[1:-1]
            second_labels = second_labels[1:-1]
            res = np.empty((first_labels.size + second_labels.size,),
                           dtype=first_labels.dtype)
            min_size = min(first_labels.size, second_labels.size)
            res[0:2 * min_size:2] = first_labels[:min_size]
            res[1:2 * min_size:2] = second_labels[:min_size]
            if first_labels.size > second_labels.size:
                res[min_size * 2:] = first_labels[min_size:]
            elif second_labels.size > first_labels.size:
                res[min_size * 2:] = second_labels[min_size:]
            res = np.concatenate(([first_token], res))
            res = np.concatenate((res, [last_token]))
            return res

    def mix_audio_and_labels(self,
                             first_audio, second_audio,
                             first_labels, second_labels):
        mixed_audio = self.mix_audio(first_audio, second_audio)
        mixed_labels = self.mix_labels(first_labels, second_labels)
        return mixed_audio, mixed_labels

    def __call__(self, dataset, inputs):
        resulted_audio, resulted_labels, filename = inputs[0], inputs[1], inputs[2]
        if np.random.uniform() <= self.p:
            random_sample = dataset.random_sample(sample_audio=self.sample_audio)
            resulted_audio, resulted_labels = self.mix_audio_and_labels(
                resulted_audio, random_sample[0],
                resulted_labels, random_sample[1]
            )
        return resulted_audio, resulted_labels

class AudioAugmentation:
    # https://github.com/ex4sperans/freesound-classification
    def __init__(self, p):
        self.p = p
        self.effects_chain = (
            pysndfx.AudioEffectsChain()
            .reverb(
                reverberance=random.randrange(50),
                room_scale=random.randrange(50),
                stereo_depth=random.randrange(50)
            )
            .pitch(shift=random.randrange(-300, 300))
            .overdrive(gain=random.randrange(2, 10))
            .speed(random.uniform(0.9, 1.1))
        )

    def __call__(self, dataset, inputs):
        resulted_audio = inputs[0]
        captions = inputs[1]
        del inputs
        gc.collect()
        if np.random.uniform() < self.p:
            resulted_audio = torch.from_numpy(self.effects_chain(resulted_audio.numpy()))
        return resulted_audio, captions
# clotho_collate_fn
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from typing import MutableSequence, Union, Tuple, AnyStr
from numpy import ndarray
import torch
from torch import cat as pt_cat, zeros as pt_zeros, \
    ones as pt_ones, from_numpy, Tensor

from hparams import hparams as hp
from data_augmentation.SpecAugment import spec_augment

__author__ = 'Konstantinos Drossos -- Tampere University'
__docformat__ = 'reStructuredText'
__all__ = ['clotho_collate_fn']


def clotho_collate_fn(batch: MutableSequence[ndarray],
                      nb_t_steps: Union[AnyStr, Tuple[int, int]],
                      input_pad_at: str,
                      output_pad_at: str) \
        -> Tuple[Tensor, Tensor, list]:
    """Pads data.

    :param batch: Batch data.
    :type batch: list[numpy.ndarray]
    :param nb_t_steps: Number of time steps to\
                       pad/truncate to. Can use\
                       'max', 'min', or an exact number,\
                       e.g. (1024, 10).
    :type nb_t_steps: str|(int, int)
    :param input_pad_at: Pad input at the start or\
                         at the end?
    :type input_pad_at: str
    :param output_pad_at: Pad output at the start or\
                          at the end?
    :type output_pad_at: str
    :return: Padded inputs, padded outputs, and file names.
    :rtype: torch.Tensor, torch.Tensor, list
    """
    if type(nb_t_steps) == str:
        truncate_fn = max if nb_t_steps.lower() == 'max' else min
        in_t_steps = truncate_fn([i[0].shape[0] for i in batch])
        out_t_steps = truncate_fn([i[1].shape[0] for i in batch])
    else:
        in_t_steps, out_t_steps = nb_t_steps

    in_dim = batch[0][0].shape[-1]
    eos_token = batch[0][1][-1]
    PAD = 4367
    input_tensor, output_tensor = [], []

    for in_b, out_b in batch:
        if in_t_steps >= in_b.shape[0]:
            padding = pt_zeros(in_t_steps - in_b.shape[0], in_dim).float()
            data = [from_numpy(in_b).float()]
            if input_pad_at.lower() == 'start':
                data.insert(0, padding)
            else:
                data.append(padding)
            tmp_in: Tensor = pt_cat(data)
        else:
            tmp_in: Tensor = from_numpy(in_b[:in_t_steps, :]).float()
        input_tensor.append(tmp_in.unsqueeze_(0))

        if out_t_steps >= out_b.shape[0]:
            padding = pt_ones(out_t_steps - len(out_b)).mul(PAD).long()
            data = [from_numpy(out_b).long()]
            if output_pad_at.lower() == 'start':
                data.insert(0, padding)
            else:
                data.append(padding)
            tmp_out: Tensor = pt_cat(data)
        else:
            tmp_out: Tensor = from_numpy(out_b[:out_t_steps]).long()
        output_tensor.append(tmp_out.unsqueeze_(0))

    input_tensor = pt_cat(input_tensor)
    output_tensor = pt_cat(output_tensor)
    file_names = [i[2] for i in batch]

    return input_tensor, output_tensor, file_names


def clotho_collate_fn_eval(batch: MutableSequence[ndarray],
                           nb_t_steps: Union[AnyStr, Tuple[int, int]],
                           input_pad_at: str,
                           output_pad_at: str,
                           split: str,
                           augment: bool) \
        -> Tuple[Tensor, Tensor, list, Tensor, list]:
    """Pads data.

    :param batch: Batch data.
    :type batch: list[numpy.ndarray]
    :param nb_t_steps: Number of time steps to\
                       pad/truncate to. Can use\
                       'max', 'min', or an exact number,\
                       e.g. (1024, 10).
    :type nb_t_steps: str|(int, int)
    :param input_pad_at: Pad input at the start or\
                         at the end?
    :type input_pad_at: str
    :param output_pad_at: Pad output at the start or\
                          at the end?
    :type output_pad_at: str
    :return: Padded inputs, padded outputs, file names,\
             target lengths, and reference captions.
    :rtype: torch.Tensor, torch.Tensor, list, torch.Tensor, list
    """
    if type(nb_t_steps) == str:
        truncate_fn = max if nb_t_steps.lower() == 'max' else min
        in_t_steps = truncate_fn([i[0].shape[0] for i in batch])
        out_t_steps = truncate_fn([i[1].shape[0] for i in batch])
    else:
        in_t_steps, out_t_steps = nb_t_steps

    in_dim = batch[0][0].shape[-1]
    eos_token = batch[0][1][-1]
    batch = sorted(batch, key=lambda x: x[-1], reverse=True)  # sort by output length, longest first
    PAD = 4367
    input_tensor, output_tensor = [], []

    for in_b, out_b, ref, filename, out_len in batch:
        if in_t_steps >= in_b.shape[0]:
            padding = pt_zeros(in_t_steps - in_b.shape[0], in_dim).float()
            data = [from_numpy(in_b).float()]
            if input_pad_at.lower() == 'start':
                data.insert(0, padding)
            else:
                data.append(padding)
            tmp_in: Tensor = pt_cat(data)
        else:
            tmp_in: Tensor = from_numpy(in_b[:in_t_steps, :]).float()
        input_tensor.append(tmp_in.unsqueeze_(0))

        if out_t_steps >= out_b.shape[0]:
            padding = pt_ones(out_t_steps - len(out_b)).mul(PAD).long()
            data = [from_numpy(out_b).long()]
            if output_pad_at.lower() == 'start':
                data.insert(0, padding)
            else:
                data.append(padding)
            tmp_out: Tensor = pt_cat(data)
        else:
            tmp_out: Tensor = from_numpy(out_b[:out_t_steps]).long()
        output_tensor.append(tmp_out.unsqueeze_(0))

    input_tensor = pt_cat(input_tensor)
    if augment:
        input_tensor = spec_augment(input_tensor)
    output_tensor = pt_cat(output_tensor)

    all_ref = [i[2] for i in batch]
    file_names = [i[3] for i in batch]  # index 3 is the file name (index 2 is the reference list)
    *_, target_len = zip(*batch)
    target_len = torch.LongTensor(target_len)

    return input_tensor, output_tensor, file_names, target_len, all_ref
# EOF
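To make the padding concrete, this is my own toy illustration of what nb_t_steps='max' does to the input features (not the project code):

import torch

# Two feature matrices of different lengths, padded at the start to the
# longest length in the batch, as clotho_collate_fn does with 'max'.
feats = [torch.rand(1300, 64), torch.rand(1855, 64)]
in_t_steps = max(f.shape[0] for f in feats)
padded = torch.stack([torch.cat([torch.zeros(in_t_steps - f.shape[0], 64), f])
                      for f in feats])
print(padded.shape)  # torch.Size([2, 1855, 64])

Next, the dataset code: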
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from typing import Tuple, List, AnyStr, Union
from pathlib import Path
from numpy import ndarray, recarray
from torch.utils.data import Dataset
from numpy import load as np_load
import torchaudio
import torch
import numpy as np
import random  # needed by ClothoDatasetEval.random_sample (missing in my original file)
import os

__author__ = 'Konstantinos Drossos -- Tampere University'
__docformat__ = 'reStructuredText'
__all__ = ['ClothoDataset']


class ClothoDataset(Dataset):

    def __init__(self, data_dir: Path,
                 split: AnyStr,
                 input_field_name: AnyStr,
                 output_field_name: AnyStr,
                 load_into_memory: bool,
                 transforms=None) \
            -> None:
        """Initialization of a Clotho dataset object.

        :param data_dir: Directory with data.
        :type data_dir: pathlib.Path
        :param split: Split to use (i.e. 'development', 'evaluation')
        :type split: str
        :param input_field_name: Field name of the clotho data\
                                 to be used as input data to the\
                                 method.
        :type input_field_name: str
        :param output_field_name: Field name of the clotho data\
                                  to be used as output data to the\
                                  method.
        :type output_field_name: str
        :param load_into_memory: Load all data into memory?
        :type load_into_memory: bool
        """
        super(ClothoDataset, self).__init__()
        the_dir: Path = data_dir.joinpath(split)
        self.examples: List[Path] = sorted(the_dir.iterdir())
        self.input_name: str = input_field_name
        self.output_name: str = output_field_name
        self.load_into_memory: bool = load_into_memory
        self.transforms = transforms
        # settings_features is the module-level dict shown at the top of this post
        self.resampler = torchaudio.transforms.Resample(
            orig_freq=settings_features['process']['sr'],
            new_freq=settings_features['process']['sr_resample'])
        if load_into_memory:
            self.examples: List[recarray] = [np_load(str(f), allow_pickle=True)
                                             for f in self.examples]

    def __len__(self) \
            -> int:
        """Gets the amount of examples in the dataset.

        :return: Amount of examples in the dataset.
        :rtype: int
        """
        return len(self.examples)

    def __getitem__(self,
                    item: int) \
            -> Tuple[ndarray, ndarray]:
        """Gets an example from the dataset.

        :param item: Index of the item.
        :type item: int
        :return: Input and output values.
        :rtype: numpy.ndarray, numpy.ndarray
        """
        ex: Union[Path, recarray] = self.examples[item]
        if not self.load_into_memory:
            ex: recarray = np_load(str(ex), allow_pickle=True)
        in_e, ou_e = [ex[i].item() for i in [self.input_name, self.output_name]]
        return in_e, ou_e


class ClothoDatasetEval(Dataset):

    def __init__(self, data_dir: Path,
                 split: AnyStr,
                 input_field_name: AnyStr,
                 output_field_name: AnyStr,
                 load_into_memory: bool,
                 transforms=None) \
            -> None:
        """Initialization of a Clotho dataset object.

        :param data_dir: Directory with data.
        :type data_dir: pathlib.Path
        :param split: Split to use (i.e. 'development', 'evaluation')
        :type split: str
        :param input_field_name: Field name of the clotho data\
                                 to be used as input data to the\
                                 method.
        :type input_field_name: str
        :param output_field_name: Field name of the clotho data\
                                  to be used as output data to the\
                                  method.
        :type output_field_name: str
        :param load_into_memory: Load all data into memory?
        :type load_into_memory: bool
        """
        super(ClothoDatasetEval, self).__init__()
        the_dir: Path = data_dir.joinpath(split)
        self.split = split
        if split == 'evaluation':
            self.examples: List[Path] = sorted(the_dir.iterdir())[::5]  # changed: one example per audio file
        else:
            self.examples: List[Path] = sorted(the_dir.iterdir())  # changed
        self.input_name: str = input_field_name
        self.output_name: str = output_field_name
        self.load_into_memory: bool = load_into_memory
        self.data_dir = the_dir
        self.transforms = transforms
        # settings_features is the module-level dict shown at the top of this post
        self.resampler = torchaudio.transforms.Resample(
            orig_freq=settings_features['process']['sr'],
            new_freq=settings_features['process']['sr_resample'])
        if load_into_memory:
            self.examples: List[recarray] = [np_load(str(f), allow_pickle=True)
                                             for f in self.examples]

    def __len__(self) \
            -> int:
        """Gets the amount of examples in the dataset.

        :return: Amount of examples in the dataset.
        :rtype: int
        """
        return len(self.examples)

    def __getitem__(self,
                    item: int):
        """Gets an example from the dataset.

        :param item: Index of the item.
        :type item: int
        :return: Input and output values, references, file name, and output length.
        """
        ex: Union[Path, recarray] = self.examples[item]
        if not self.load_into_memory:
            ex: recarray = np_load(str(ex), allow_pickle=True)
        in_e, ou_e = [ex[i].item() for i in [self.input_name, self.output_name]]
        all_ref = get_all_ref(ex['file_name'].item(), self.data_dir)
        filename = str(ex['file_name'].item())
        out_len = len(ou_e)
        if self.transforms is not None:
            for transform in self.transforms:
                in_e, ou_e = transform(dataset=self, inputs=(in_e, ou_e, filename))
        return in_e, ou_e, all_ref, filename, out_len

    def random_sample(self, sample_audio=True):
        """Samples a random example: the raw (resampled) audio if sample_audio\
        is True, otherwise the stored mel spectrogram, plus the encoded output.

        :return: Input and output values.
        """
        item = random.randint(0, len(self.examples) - 1)
        ex = self.examples[item]
        if not self.load_into_memory:
            ex = np_load(str(ex), allow_pickle=True)
        if sample_audio:
            thedir = Path('./create_dataset/data/clotho_audio_files/').joinpath(self.split)
            filename = Path(thedir, ex.file_name[0])
            in_e = torchaudio.load(filepath=filename)[0][0]
            in_e = self.resampler.forward(in_e)
            ou_e = ex[self.output_name].item()
        else:
            in_e, ou_e = [ex[i].item()
                          for i in [self.input_name, self.output_name]]
        return in_e, ou_e


def get_all_ref(filename, data_dir):
    filename = str(filename)
    tgt = [np.load(d, allow_pickle=True)['words_ind'].item().tolist()
           for d in [os.path.join(data_dir, 'clotho_file_{filename}.wav_{i}.npy'.
                                  format(filename=filename[:-4],  # strip the '.wav' extension
                                         i=i)) for i in range(5)]  # the five captions wav_0 .. wav_4
           ]
    return tgt
# EOF
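One thing I noticed while preparing this post: with sample_audio=True, random_sample returns a resampled raw waveform (1-D), while __getitem__ passes the stored 'features' field (the 2-D log mel matrix) into the transform, so MixUp.mix_audio may be mixing arrays of different dimensionality. This reading is my assumption; the shapes below are from my data:

# The two code paths that feed MixUp:
in_e, ou_e, *_ = dataset[0]                            # in_e: log mel features, shape (1855, 64)
wav, ou_e2 = dataset.random_sample(sample_audio=True)  # wav: raw waveform, shape (n_samples,)

The data loader: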
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from typing import Callable, Union, Tuple, AnyStr, Optional, MutableMapping
from functools import partial
from pathlib import Path
from torch.utils.data.dataloader import DataLoader
# from .clotho_dataset import ClothoDataset, ClothoDatasetEval
# from .collate_fn import clotho_collate_fn, clotho_collate_fn_eval

__author__ = 'Konstantinos Drossos'
__docformat__ = 'reStructuredText'
__all__ = ['get_clotho_loader']


def get_clotho_loader(data_dir: Path,
                      split: str,
                      settings_features: MutableMapping[
                          str, Union[str, bool, MutableMapping[str, str]]],
                      input_field_name: str,
                      output_field_name: str,
                      load_into_memory: bool,
                      batch_size: int,
                      nb_t_steps_pad: Union[AnyStr, Tuple[int, int]],
                      shuffle: Optional[bool] = True,
                      drop_last: Optional[bool] = True,
                      input_pad_at: Optional[str] = 'start',
                      output_pad_at: Optional[str] = 'end',
                      num_workers: Optional[int] = 1,
                      return_reference: Optional[bool] = False,
                      ) \
        -> DataLoader:
    """Gets the clotho data loader.

    :param return_reference: Return reference captions as well?
    :param data_dir: Directory with data.
    :type data_dir: pathlib.Path
    :param split: Split to use (i.e. 'development', 'evaluation')
    :type split: str
    :param input_field_name: Field name of the clotho data\
                             to be used as input data to the\
                             method.
    :type input_field_name: str
    :param output_field_name: Field name of the clotho data\
                              to be used as output data to the\
                              method.
    :type output_field_name: str
    :param load_into_memory: Load all data into memory?
    :type load_into_memory: bool
    :param batch_size: Batch size to use.
    :type batch_size: int
    :param nb_t_steps_pad: Number of time steps to\
                           pad/truncate to. Can use\
                           'max', 'min', or an exact number,\
                           e.g. (1024, 10).
    :type nb_t_steps_pad: str|(int, int)
    :param shuffle: Shuffle examples? Defaults to True.
    :type shuffle: bool, optional
    :param drop_last: Drop the last examples if not making\
                      a batch of `batch_size`? Defaults to True.
    :type drop_last: bool, optional
    :param input_pad_at: Pad input at the start or\
                         at the end?
    :type input_pad_at: str
    :param output_pad_at: Pad output at the start or\
                          at the end?
    :type output_pad_at: str
    :param num_workers: Amount of workers, defaults to 1.
    :type num_workers: int, optional
    :return: Dataloader for Clotho data.
    :rtype: torch.utils.data.dataloader.DataLoader
    """
    transforms = []
    transforms.append(MixUp(p=0.5,
                            settings_features=settings_features,
                            simple_concat_captions=True,
                            sample_audio=True))
    if return_reference:
        dataset: ClothoDatasetEval = ClothoDatasetEval(
            data_dir=data_dir, split=split,
            input_field_name=input_field_name,
            output_field_name=output_field_name,
            load_into_memory=load_into_memory, transforms=transforms)
        collate_fn: Callable = partial(
            clotho_collate_fn_eval,
            nb_t_steps=nb_t_steps_pad,
            input_pad_at=input_pad_at,
            output_pad_at=output_pad_at, split=split,
            augment=False)  # augment has no default in clotho_collate_fn_eval; binding it here is my assumption
    else:
        dataset: ClothoDataset = ClothoDataset(
            data_dir=data_dir, split=split,
            input_field_name=input_field_name,
            output_field_name=output_field_name,
            load_into_memory=load_into_memory)
        collate_fn: Callable = partial(
            clotho_collate_fn,
            nb_t_steps=nb_t_steps_pad,
            input_pad_at=input_pad_at,
            output_pad_at=output_pad_at)

    return DataLoader(
        dataset=dataset, batch_size=batch_size,
        shuffle=shuffle, num_workers=num_workers,
        drop_last=drop_last, collate_fn=collate_fn)
# EOF
Defining the training data:
training_data = get_clotho_loader(data_dir=data_dir, split='development',
                                  settings_features=settings['feature_extraction_settings'],
                                  input_field_name='features',
                                  output_field_name='words_ind',
                                  load_into_memory=False,
                                  batch_size=hp.batch_size,
                                  nb_t_steps_pad='max',
                                  num_workers=4, return_reference=True)
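Even fetching a single batch reproduces the problem, so it happens inside the dataset/transform path rather than in the training loop (my own debugging sketch; the unpacking order follows clotho_collate_fn_eval above):

# The first fetch already raises the ValueError shown at the bottom of this post.
src, tgt, file_names, tgt_len, refs = next(iter(training_data))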
Training:
def train():
    model.train()
    total_loss_text = 0.
    start_time = time.time()
    batch = 0
    for src, tgt, file_names, tgt_len, ref in training_data:  # clotho_collate_fn_eval yields five items
        src = src.to(device)
        tgt = tgt.to(device)
        tgt_pad_mask = get_padding(tgt, tgt_len)
        tgt_in = tgt[:, :-1]
        tgt_pad_mask = tgt_pad_mask[:, :-1]
        tgt_y = tgt[:, 1:]
        optimizer.zero_grad()
        output = model(src, tgt_in, target_padding_mask=tgt_pad_mask)
        loss_text = criterion(output.contiguous().view(-1, hp.ntoken),
                              tgt_y.transpose(0, 1).contiguous().view(-1))
        loss = loss_text
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), hp.clip_grad)
        optimizer.step()
        total_loss_text += loss_text.item()
        writer.add_scalar('Loss/train-text', loss_text.item(),
                          (epoch - 1) * len(training_data) + batch)
        batch += 1
        if batch % hp.log_interval == 0 and batch > 0:
            mean_text_loss = total_loss_text / hp.log_interval
            elapsed = time.time() - start_time
            current_lr = [param_group['lr'] for param_group in optimizer.param_groups][0]
            logging.info('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2e} | ms/batch {:5.2f} | '
                         'loss-text {:5.4f}'.format(
                             epoch, batch, len(training_data), current_lr,
                             elapsed * 1000 / hp.log_interval, mean_text_loss))
            total_loss_text = 0
            start_time = time.time()


epoch = 1
if hp.mode == 'train':
    while epoch < hp.training_epochs + 1:
        epoch_start_time = time.time()
        train()
        torch.save(model.state_dict(), '{log_dir}/{num_epoch}.pt'.format(log_dir=log_dir, num_epoch=epoch))
        scheduler.step(epoch)
        eval_all(evaluation_beam, word_dict_pickle_path=word_dict_pickle_path)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9,
                       word_dict_pickle_path=word_dict_pickle_path, beam_size=2)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9,
                       word_dict_pickle_path=word_dict_pickle_path, beam_size=3)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9,
                       word_dict_pickle_path=word_dict_pickle_path, beam_size=4)
        epoch += 1
The error that occurs:
ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/hj20/anaconda3/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 202, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/hj20/anaconda3/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/hj20/anaconda3/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "", line 171, in __getitem__
    in_e, ou_e = transform(dataset=self, inputs=(in_e, ou_e, filename))
  File "", line 105, in __call__
    resulted_labels, random_sample[1]
  File "", line 94, in mix_audio_and_labels
    mixed_audio = self.mix_audio(first_audio, second_audio)
  File "", line 56, in mix_audio
    longer[start:end] += np.dot(shorter, (1 - a))  # shorter * (1 - a)
ValueError: operands could not be broadcast together with shapes (1855,) (1855,64)
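For what it's worth, the failing line reduces to adding a (1855, 64) array into a (1855,) slice, which reproduces the exact message (a standalone sketch with a made-up waveform length):

import numpy as np

a = 0.5
longer = np.zeros(80000)         # 1-D raw waveform (the sample_audio path)
shorter = np.zeros((1855, 64))   # 2-D log mel spectrogram
longer[0:1855] += np.dot(shorter, (1 - a))
# ValueError: operands could not be broadcast together with shapes (1855,) (1855,64)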
Please help! ㅠㅠ