ValueError: operands could not be broadcast together with shapes (1855,) (1855,64)

The following error occurred during model training.

ValueError: operands could not be broadcast together with shapes (1855,) (1855,64)

Input: audio features (waveform), converted to log mel spectrograms with 64 mel bands using the librosa package.

AudioAugmentation is not used.

The audio features have different lengths, so I pad them to the longest length in the batch with clotho_collate_fn. The detailed code is as follows.

settings_features = {
    'keep_raw_audio_data': False,
    'process': {'sr': 44100,
                'sr_resample': 16000,
                'nb_fft': 1024,
                'hop_size': 512,
                'nb_mels': 64,
                'window_function': 'hann',
                'center': True,
                'f_min': 0.0,
                'f_max': None,
                'htk': False,
                'power': 1.0,
                'norm': 1}}
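
For reference, here is a minimal sketch of what these settings typically correspond to in librosa (this is my assumption about what tools.features_log_mel_bands.feature_extraction does, not the actual code); the point is that each clip ends up as a 2-D array of shape (time_frames, 64):

import librosa
import numpy as np

def log_mel_features(path, s):
    # s is settings_features['process']; output shape is (time_frames, nb_mels)
    y, _ = librosa.load(path, sr=s['sr'])                                  # waveform at 44.1 kHz
    y = librosa.resample(y, orig_sr=s['sr'], target_sr=s['sr_resample'])   # down to 16 kHz
    mel = librosa.feature.melspectrogram(
        y=y, sr=s['sr_resample'], n_fft=s['nb_fft'], hop_length=s['hop_size'],
        n_mels=s['nb_mels'], window=s['window_function'], center=s['center'],
        fmin=s['f_min'], fmax=s['f_max'], htk=s['htk'],
        power=s['power'], norm=s['norm'])
    return np.log(mel + np.finfo(float).eps).T

# feats = log_mel_features('some_clip.wav', settings_features['process'])
# feats.shape -> e.g. (1855, 64): 1855 time frames, 64 mel bands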

import numpy as np
import random
from tools.features_log_mel_bands import feature_extraction

from pathlib import Path
import pysndfx
import gc

import copy

from tools.file_io import load_audio_file
import torch


__author__ = 'Nikita Kuzmin -- Lomonosov Moscow State University'

class MixUp:

    def __init__(self, p, settings_features, simple_concat_captions=True,
                 sample_audio=False):

        self.p = p
        self.sample_audio = sample_audio
        self.settings_features = settings_features
        self.simple_concat_captions = simple_concat_captions

    def from_mel(self, mel):
        return 700 * (10 ** (mel / 2595.0) - 1)

    def to_mel(self, hertz):
        return 2595.0 * np.log10(1 + hertz / 700.0)

    def mix_audio(self, first_audio, second_audio):

        a = np.random.uniform(0.4, 0.6)  # drawn from a beta distribution

        shorter, longer = first_audio, second_audio

        if shorter.shape[0] == longer.shape[0]:
            if self.sample_audio:
                return (longer + shorter) / 2.0
            else:
                longer = from_mel_to_audio(longer, **self.settings_features['process']) * a
                shorter = from_mel_to_audio(shorter,
                                            **self.settings_features['process'])
                return feature_extraction((longer + shorter) / 2, **self.settings_features['process'])

        if first_audio.shape[0] > second_audio.shape[0]:
            shorter, longer = longer, shorter


        if self.sample_audio:
            start = random.randint(0, longer.shape[0] - 1 - shorter.shape[0])
            end = start + shorter.shape[0]
            longer *= a
            longer[start:end] += np.dot(shorter,(1 - a)) #shorter * (1 - a)
        else:
            longer = from_mel_to_audio(longer, **self.settings_features['process']) * a
            shorter = from_mel_to_audio(shorter,
                                        **self.settings_features['process'])
            start = random.randint(0, longer.shape[0] - 1 - shorter.shape[0])
            end = start + shorter.shape[0]
            longer[start:end] += np.dot(shorter,(1 - a))
            longer = feature_extraction(longer,
                                        **self.settings_features['process'])

        return longer

    def mix_labels(self, first_labels, second_labels):
        if self.simple_concat_captions:
            return np.hstack([first_labels[:-1], second_labels[1:]])
        else:

            first_token = first_labels[0]
            last_token = first_labels[-1]
            first_labels = first_labels[1:-1]
            second_labels = second_labels[1:-1]
            res = np.empty((first_labels.size + second_labels.size,),
                           dtype=first_labels.dtype)
            min_size = min(first_labels.size, second_labels.size)
            res[0:2*min_size:2] = first_labels[:min_size]
            res[1:2*min_size:2] = second_labels[:min_size]
            if first_labels.size > second_labels.size:
                res[min_size * 2:] = first_labels[min_size:]
            elif second_labels.size > first_labels.size:
                res[min_size*2:] = second_labels[min_size:]
            res = np.concatenate(([first_token], res))
            res = np.concatenate((res, [last_token]))
            return res

    def mix_audio_and_labels(self,
                             first_audio, second_audio,
                             first_labels, second_labels):
        mixed_audio = self.mix_audio(first_audio, second_audio)
        mixed_labels = self.mix_labels(first_labels, second_labels)

        return mixed_audio, mixed_labels

    def __call__(self, dataset, inputs):
        resulted_audio, resulted_labels, filename = inputs[0], inputs[1], inputs[2]
        if np.random.uniform() <= self.p:
            random_sample = dataset.random_sample(sample_audio=self.sample_audio)
            resulted_audio, resulted_labels = self.mix_audio_and_labels(
                resulted_audio, random_sample[0],
                resulted_labels, random_sample[1]
            )
        return resulted_audio, resulted_labels


class AudioAugmentation:
    # https://github.com/ex4sperans/freesound-classification
    def __init__(self, p):

        self.p = p
        self.effects_chain = (
            pysndfx.AudioEffectsChain()
                .reverb(
                reverberance=random.randrange(50),
                room_scale=random.randrange(50),
                stereo_depth=random.randrange(50)
            )
                .pitch(shift=random.randrange(-300, 300))
                .overdrive(gain=random.randrange(2, 10))
                .speed(random.uniform(0.9, 1.1))
        )

    def __call__(self, dataset, inputs):

        resulted_audio = inputs[0]
        captions = inputs[1]
        del inputs
        gc.collect()
        if np.random.uniform() < self.p:
            resulted_audio = torch.from_numpy(self.effects_chain(resulted_audio.numpy()))
        return resulted_audio, captions

#clotho_collate_fn


#!/usr/bin/env python
# -*- coding: utf-8 -*-

from typing import MutableSequence, Union, Tuple, AnyStr
from numpy import ndarray
import torch
from torch import cat as pt_cat, zeros as pt_zeros, \
    ones as pt_ones, from_numpy, Tensor
from hparams import hparams as hp
from data_augmentation.SpecAugment import spec_augment

__author__ = 'Konstantinos Drossos -- Tampere University'
__docformat__ = 'reStructuredText'
__all__ = ['clotho_collate_fn']


def clotho_collate_fn(batch: MutableSequence[ndarray],
                      nb_t_steps: Union[AnyStr, Tuple[int, int]],
                      input_pad_at: str,
                      output_pad_at: str) \
        -> Tuple[Tensor, Tensor, list]:
    """Pads data.

    :param batch: Batch data.
    :type batch: list[numpy.ndarray]
    :param nb_t_steps: Number of time steps to\
                       pad/truncate to. Can use\
                       'max', 'min', or exact number\
                       e.g. (1024, 10).
    :type nb_t_steps: str|(int, int)
    :param input_pad_at: Pad input at the start or\
                         at the end?
    :type input_pad_at: str
    :param output_pad_at: Pad output at the start or\
                          at the end?
    :type output_pad_at: str
    :return: Padded data.
    :rtype: torch.Tensor, torch.Tensor
    """
    if type(nb_t_steps) == str:
        truncate_fn = max if nb_t_steps.lower() == 'max' else min
        in_t_steps = truncate_fn([i[0].shape[0] for i in batch])
        out_t_steps = truncate_fn([i[1].shape[0] for i in batch])
    else:
        in_t_steps, out_t_steps = nb_t_steps

    in_dim = batch[0][0].shape[-1]
    eos_token = batch[0][1][-1]
    PAD = 4367

    input_tensor, output_tensor = [], []

    for in_b, out_b in batch:
        if in_t_steps >= in_b.shape[0]:
            padding = pt_zeros(in_t_steps - in_b.shape[0], in_dim).float()
            data = [from_numpy(in_b).float()]
            if input_pad_at.lower() == 'start':
                data.insert(0, padding)
            else:
                data.append(padding)
            tmp_in: Tensor = pt_cat(data)
        else:
            tmp_in: Tensor = from_numpy(in_b[:in_t_steps, :]).float()
        input_tensor.append(tmp_in.unsqueeze_(0))

        if out_t_steps >= out_b.shape[0]:
            padding = pt_ones(out_t_steps - len(out_b)).mul(PAD).long()
            data = [from_numpy(out_b).long()]
            if output_pad_at.lower() == 'start':
                data.insert(0, padding)
            else:
                data.append(padding)

            tmp_out: Tensor = pt_cat(data)
        else:
            tmp_out: Tensor = from_numpy(out_b[:out_t_steps]).long()
        output_tensor.append(tmp_out.unsqueeze_(0))

    input_tensor = pt_cat(input_tensor)
    output_tensor = pt_cat(output_tensor)

    
    file_names = [i[2] for i in batch]

    return input_tensor, output_tensor, file_names


def clotho_collate_fn_eval(batch: MutableSequence[ndarray],
                           nb_t_steps: Union[AnyStr, Tuple[int, int]],
                           input_pad_at: str,
                           output_pad_at: str,
                           split: str,
                           augment:bool) \
        -> Tuple[Tensor, Tensor, Tensor, list]:
    """Pads data.

    :param batch: Batch data.
    :type batch: list[numpy.ndarray]
    :param nb_t_steps: Number of time steps to\
                       pad/truncate to. Can use\
                       'max', 'min', or exact number\
                       e.g. (1024, 10).
    :type nb_t_steps: str|(int, int)
    :param input_pad_at: Pad input at the start or\
                         at the end?
    :type input_pad_at: str
    :param output_pad_at: Pad output at the start or\
                          at the end?
    :type output_pad_at: str
    :return: Padded data.
    :rtype: torch.Tensor, torch.Tensor
    """
    if type(nb_t_steps) == str:
        truncate_fn = max if nb_t_steps.lower() == 'max' else min
        in_t_steps = truncate_fn([i[0].shape[0] for i in batch])
        out_t_steps = truncate_fn([i[1].shape[0] for i in batch])
    else:
        in_t_steps, out_t_steps = nb_t_steps

    in_dim = batch[0][0].shape[-1]
    eos_token = batch[0][1][-1]
    batch = sorted(batch, key=lambda x: x[-1],reverse=True)
    PAD = 4367
    input_tensor, output_tensor = [], []

    for in_b, out_b, ref, filename,out_len in batch:
        if in_t_steps >= in_b.shape[0]:
            padding = pt_zeros(in_t_steps - in_b.shape[0], in_dim).float()
            data = [from_numpy(in_b).float()]
            if input_pad_at.lower() == 'start':
                data.insert(0, padding)
            else:
                data.append(padding)
            tmp_in: Tensor = pt_cat(data)
        else:
            tmp_in: Tensor = from_numpy(in_b[:in_t_steps, :]).float()
        input_tensor.append(tmp_in.unsqueeze_(0))

        if out_t_steps >= out_b.shape[0]:
            padding = pt_ones(out_t_steps - len(out_b)).mul(PAD).long()
            data = [from_numpy(out_b).long()]
            if output_pad_at.lower() == 'start':
                data.insert(0, padding)
            else:
                data.append(padding)

            tmp_out: Tensor = pt_cat(data)
        else:
            tmp_out: Tensor = from_numpy(out_b[:out_t_steps]).long()
        output_tensor.append(tmp_out.unsqueeze_(0))

    input_tensor = pt_cat(input_tensor)

    if augment:
        input_tensor = spec_augment(input_tensor)

    output_tensor = pt_cat(output_tensor)
    all_ref = [i[2] for i in batch]
    file_names = [i[3] for i in batch]
    *_, target_len = zip(*batch)
    target_len = torch.LongTensor(target_len)

    return input_tensor, output_tensor, file_names, target_len, all_ref

# EOF

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from typing import Tuple, List, AnyStr, Union
from pathlib import Path

from numpy import ndarray, recarray
from torch.utils.data import Dataset
from numpy import load as np_load


import torchaudio
import torch
import numpy as np
import random
import os

__author__ = 'Konstantinos Drossos -- Tampere University'
__docformat__ = 'reStructuredText'
__all__ = ['ClothoDataset']


class ClothoDataset(Dataset):

    def __init__(self, data_dir: Path,
                 split: AnyStr,
                 input_field_name: AnyStr,
                 output_field_name: AnyStr,
                 load_into_memory: bool,
                transforms=None) \
            -> None:
        """Initialization of a Clotho dataset object.

        :param data_dir: Directory with data.
        :type data_dir: pathlib.Path
        :param split: Split to use (i.e. 'development', 'evaluation')
        :type split: str
        :param input_field_name: Field name of the clotho data\
                                 to be used as input data to the\
                                 method.
        :type input_field_name: str
        :param output_field_name: Field name of the clotho data\
                                 to be used as output data to the\
                                 method.
        :type output_field_name: str
        :param load_into_memory: Load all data into memory?
        :type load_into_memory: bool
        """
        super(ClothoDataset, self).__init__()
        the_dir: Path = data_dir.joinpath(split)

        self.examples: List[Path] = sorted(the_dir.iterdir())
        self.input_name: str = input_field_name
        self.output_name: str = output_field_name
        self.load_into_memory: bool = load_into_memory
        self.transforms = transforms
        self.resampler = torchaudio.transforms.Resample(orig_freq=settings_features['process']['sr'],
                                                        new_freq=settings_features['process']['sr_resample'])
            
            
            
        if load_into_memory:
            self.examples: List[recarray] = [np_load(str(f), allow_pickle=True)
                                             for f in self.examples]

    def __len__(self) \
            -> int:
        """Gets the amount of examples in the dataset.

        :return: Amount of examples in the dataset.
        :rtype: int
        """
        return len(self.examples)

    def __getitem__(self,
                    item: int) \
            -> Tuple[ndarray, ndarray]:
        """Gets an example from the dataset.

        :param item: Index of the item.
        :type item: int
        :return: Input and output values.
        :rtype: numpy.ndarray. numpy.ndarray
        """
        ex: Union[Path, recarray] = self.examples[item]
        if not self.load_into_memory:
            ex: recarray = np_load(str(ex), allow_pickle=True)

        in_e, ou_e = [ex[i].item() for i in [self.input_name, self.output_name]]

        return in_e, ou_e

class ClothoDatasetEval(Dataset):

    def __init__(self, data_dir: Path,
                 split: AnyStr,
                 input_field_name: AnyStr,
                 output_field_name: AnyStr,
                 load_into_memory: bool,
                 transforms=None) \
            -> None:
        """Initialization of a Clotho dataset object.

        :param data_dir: Directory with data.
        :type data_dir: pathlib.Path
        :param split: Split to use (i.e. 'development', 'evaluation')
        :type split: str
        :param input_field_name: Field name of the clotho data\
                                 to be used as input data to the\
                                 method.
        :type input_field_name: str
        :param output_field_name: Field name of the clotho data\
                                 to be used as output data to the\
                                 method.
        :type output_field_name: str
        :param load_into_memory: Load all data into memory?
        :type load_into_memory: bool
        """
        super(ClothoDatasetEval, self).__init__()
        the_dir: Path = data_dir.joinpath(split)
        self.split = split
        
        if split == 'evaluation':
            self.examples: List[Path] = sorted(the_dir.iterdir())[::5]  # changed
        else:
            self.examples: List[Path] = sorted(the_dir.iterdir())  # changed
        # self.examples: List[Path] = sorted(the_dir.iterdir())
        self.input_name: str = input_field_name
        self.output_name: str = output_field_name
        self.load_into_memory: bool = load_into_memory
        self.data_dir = the_dir
        self.transforms = transforms
        self.resampler = torchaudio.transforms.Resample(orig_freq=settings_features['process']['sr'],
                                                        new_freq=settings_features['process']['sr_resample'])  
        
        if load_into_memory:
            self.examples: List[recarray] = [np_load(str(f), allow_pickle=True)
                                             for f in self.examples]

    def __len__(self) \
            -> int:
        """Gets the amount of examples in the dataset.

        :return: Amount of examples in the dataset.
        :rtype: int
        """
        return len(self.examples)

    def __getitem__(self,
                    item: int):
        """Gets an example from the dataset.

        :param item: Index of the item.
        :type item: int
        :return: Input and output values.
        :rtype: numpy.ndarray. numpy.ndarray
        """
        ex: Union[Path, recarray] = self.examples[item]
        if not self.load_into_memory:
            ex: recarray = np_load(str(ex), allow_pickle=True)

        in_e, ou_e = [ex[i].item() for i in [self.input_name, self.output_name]]

        all_ref = get_all_ref(ex['file_name'].item(), self.data_dir)

        filename = str(ex['file_name'].item())
        out_len = len(ou_e)
        
        if self.transforms is not None:
            for transform in self.transforms:
                in_e, ou_e = transform(dataset=self, inputs=(in_e, ou_e, filename))
        

        return in_e, ou_e, all_ref, filename,out_len

     
    def random_sample(self, sample_audio=True):
        """
        Sampling audio or melspectrogram and encoded output
        :return:
        """

        item = random.randint(0, len(self.examples) - 1)
        ex = self.examples[item]
        if not self.load_into_memory:
            ex = np_load(str(ex), allow_pickle=True)
        #if sample_audio:
            thedir = Path('./create_dataset/data/clotho_audio_files/').joinpath(self.split)
            filename = Path(thedir, ex.file_name[0])
            in_e = torchaudio.load(filepath=filename)[0][0]
            in_e = self.resampler.forward(in_e)
            ou_e = ex[self.output_name].item()
        else:
            in_e, ou_e = [ex[i].item()
                          for i in [self.input_name, self.output_name]]

        return in_e, ou_e
            
            
def get_all_ref(filename, data_dir):
    filename = str(filename)
    # tgt = [np.load(d, allow_pickle=True).words_ind.tolist()
    tgt = [np.load(d, allow_pickle=True)['words_ind'].item().tolist()
           for d in [os.path.join(data_dir, 'clotho_file_{filename}.wav_{i}.npy'.
                                  format(filename=filename[:-4],  # strip '.wav'
                                         i=i)) for i in range(5)]  # wav_0-wav_4
           ]
    return tgt
# EOF


#!/usr/bin/env python
# -*- coding: utf-8 -*-

from typing import Callable, Union, Tuple, AnyStr, Optional
from functools import partial
from pathlib import Path

from torch.utils.data.dataloader import DataLoader

from typing import MutableSequence, MutableMapping, Union,\
    Tuple, List




#from .clotho_dataset import ClothoDataset, ClothoDatasetEval
#from .collate_fn import clotho_collate_fn, clotho_collate_fn_eval

__author__ = 'Konstantinos Drossos'
__docformat__ = 'reStructuredText'
__all__ = ['get_clotho_loader']


def get_clotho_loader(data_dir: Path,
                      split: str,
                      settings_features:MutableMapping[
                          str, Union[str, bool, MutableMapping[str, str]]],
                      input_field_name: str,
                      output_field_name: str,
                      load_into_memory: bool,
                      batch_size: int,
                      nb_t_steps_pad: Union[AnyStr, Tuple[int, int]],
                      shuffle: Optional[bool] = True,
                      drop_last: Optional[bool] = True,
                      input_pad_at: Optional[str] = 'start',
                      output_pad_at: Optional[str] = 'end',
                      num_workers: Optional[int] = 1,
                      return_reference: Optional[bool] = False,
                      ) \
        -> DataLoader:
    """Gets the clotho data loader.

    :param return_reference:
    :param data_dir: Directory with data.
    :type data_dir: pathlib.Path
    :param split: Split to use (i.e. 'development', 'evaluation')
    :type split: str
    :param input_field_name: Field name of the clotho data\
                             to be used as input data to the\
                             method.
    :type input_field_name: str
    :param output_field_name: Field name of the clotho data\
                             to be used as output data to the\
                             method.
    :type output_field_name: str
    :param load_into_memory: Load all data into memory?
    :type load_into_memory: bool
    :param batch_size: Batch size to use.
    :type batch_size: int
    :param nb_t_steps_pad: Number of time steps to\
                           pad/truncate to. Can use\
                           'max', 'min', or exact number\
                           e.g. (1024, 10).
    :type nb_t_steps_pad: str|(int, int)
    :param shuffle: Shuffle examples? Defaults to True.
    :type shuffle: bool, optional
    :param drop_last: Drop the last examples if not making\
                      a batch of `batch_size`? Defaults to True.
    :type drop_last: bool, optional
    :param input_pad_at: Pad input at the start or\
                         at the end?
    :type input_pad_at: str
    :param output_pad_at: Pad output at the start or\
                          at the end?
    :type output_pad_at: str
    :param num_workers: Amount of workers, defaults to 1.
    :type num_workers: int, optional
    :return: Dataloader for Clotho data.
    :rtype: torch.utils.data.dataloader.DataLoader
    """

    transforms = []
    transforms.append(MixUp(p=0.5,
                              settings_features=settings_features,
                              simple_concat_captions=True,
                              sample_audio=True))

    if return_reference:
        dataset: ClothoDatasetEval = ClothoDatasetEval(
            data_dir=data_dir, split=split,
            input_field_name=input_field_name,
            output_field_name=output_field_name,
            load_into_memory=load_into_memory,transforms=transforms)

        collate_fn: Callable = partial(
            clotho_collate_fn_eval,
            nb_t_steps=nb_t_steps_pad,
            input_pad_at=input_pad_at,
            output_pad_at=output_pad_at, split=split,
            augment=False)  # clotho_collate_fn_eval also takes 'augment'; bind it so the partial matches the signature
    else:
        dataset: ClothoDataset = ClothoDataset(
            data_dir=data_dir, split=split,
            input_field_name=input_field_name,
            output_field_name=output_field_name,
            load_into_memory=load_into_memory)

        collate_fn: Callable = partial(
            clotho_collate_fn,
            nb_t_steps=nb_t_steps_pad,
            input_pad_at=input_pad_at,
            output_pad_at=output_pad_at)

    return DataLoader(
        dataset=dataset, batch_size=batch_size,
        shuffle=shuffle, num_workers=num_workers,
        drop_last=drop_last, collate_fn=collate_fn)

# EOF

Defining the training data

training_data = get_clotho_loader(data_dir=data_dir, split='development',
                                  settings_features=settings['feature_extraction_settings'],
                                  input_field_name='features',
                                  output_field_name='words_ind',
                                  load_into_memory=False,
                                  batch_size=hp.batch_size,
                                  nb_t_steps_pad='max',
                                  num_workers=4, return_reference=True)
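
With return_reference=True the eval collate function returns five items per batch; a quick, purely illustrative way to check what the loader yields (once __getitem__ runs without errors) is:

# Peek at one batch to confirm shapes (illustrative only).
src, tgt, file_names, tgt_len, ref = next(iter(training_data))
print(src.shape)   # (batch_size, max_time_frames, 64) padded log-mel features
print(tgt.shape)   # (batch_size, max_caption_length) padded token ids
print(tgt_len)     # caption lengths, as a LongTensor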

Training

def train():
    model.train()
    total_loss_text = 0.
    start_time = time.time()
    batch = 0
    for src, tgt, file_names, tgt_len, ref in training_data:  # the collate function returns five items per batch
        src = src.to(device)
        tgt = tgt.to(device)
        tgt_pad_mask = get_padding(tgt, tgt_len)
        tgt_in = tgt[:, :-1]
        tgt_pad_mask = tgt_pad_mask[:, :-1]
        tgt_y = tgt[:, 1:]

        optimizer.zero_grad()
        output = model(src, tgt_in, target_padding_mask=tgt_pad_mask)

        loss_text = criterion(output.contiguous().view(-1, hp.ntoken), tgt_y.transpose(0, 1).contiguous().view(-1))
        loss = loss_text
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), hp.clip_grad)
        optimizer.step()
        total_loss_text += loss_text.item()

        writer.add_scalar('Loss/train-text', loss_text.item(), (epoch - 1) * len(training_data) + batch)

        batch += 1

        if batch % hp.log_interval == 0 and batch > 0:
            mean_text_loss = total_loss_text / hp.log_interval
            elapsed = time.time() - start_time
            current_lr = [param_group['lr'] for param_group in optimizer.param_groups][0]
            logging.info('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2e} | ms/batch {:5.2f} | '
                         'loss-text {:5.4f}'.format(
                epoch, batch, len(training_data), current_lr,
                elapsed * 1000 / hp.log_interval, mean_text_loss))
            total_loss_text = 0
            start_time = time.time()

epoch = 1
if hp.mode == 'train':
    while epoch < hp.training_epochs + 1:
        epoch_start_time = time.time()
        train()
        torch.save(model.state_dict(), '{log_dir}/{num_epoch}.pt'.format(log_dir=log_dir, num_epoch=epoch))
        scheduler.step(epoch)
        eval_all(evaluation_beam, word_dict_pickle_path=word_dict_pickle_path)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                           beam_size=2)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                           beam_size=3)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                           beam_size=4)
        epoch += 1

The error

ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/hj20/anaconda3/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 202, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/hj20/anaconda3/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/hj20/anaconda3/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "", line 171, in __getitem__
    in_e, ou_e = transform(dataset=self, inputs=(in_e, ou_e, filename))
  File "", line 105, in __call__
    resulted_labels, random_sample[1]
  File "", line 94, in mix_audio_and_labels
    mixed_audio = self.mix_audio(first_audio, second_audio)
  File "", line 56, in mix_audio
    longer[start:end] += np.dot(shorter,(1 - a)) #shorter * (1 - a)
ValueError: operands could not be broadcast together with shapes (1855,) (1855,64)
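
As far as I understand, the failing line reduces to this shape situation (a minimal standalone reproduction with placeholder arrays, not my real data):

import numpy as np

longer = np.zeros(400000)          # 1-D array, e.g. raw waveform samples
shorter = np.zeros((1855, 64))     # 2-D array, e.g. log mel spectrogram (time frames x mel bands)
a = 0.5

start, end = 0, shorter.shape[0]
longer[start:end] += np.dot(shorter, (1 - a))   # same operation as in mix_audio
# ValueError: operands could not be broadcast together with shapes (1855,) (1855,64)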

Please help. ㅠㅠ

I think the broadcast should be possible if you unsqueeze(1) the missing dim1 in the tensor that is raising the error (assuming the broadcast is really wanted).

Which part should I apply unsqueeze(1) to?

You should use it on the tensor having only one dimension (check the shape of longer and shorter) and also make sure these shapes "make sense", i.e. you really want to unsqueeze and broadcast the operation.
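
A minimal sketch of what that looks like (assuming the broadcast is really intended; if both operands are supposed to be the same kind of data, the better fix is to make sure both are waveforms or both are log-mel spectrograms before mixing):

import numpy as np

longer = np.zeros((1855,))        # 1-D
shorter = np.zeros((1855, 64))    # 2-D
a = 0.5

# Add the missing dim1 so broadcasting works: (1855, 1) op (1855, 64) -> (1855, 64).
# np.newaxis plays the role of tensor.unsqueeze(1) for torch tensors.
mixed = longer[:, np.newaxis] * a + shorter * (1 - a)
print(mixed.shape)  # (1855, 64)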