o
    i                  '   @   s  d dl mZ d dlZd dlZd dlZd dlmZ d dlmZ d dl	Z	d dl
m
Z
 d dlmZmZ d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZ d d	lmZ d dlZd dlZd dlZdfddZdfddZdgddZ dhddZ!diddZ"de#de#fddZ$dd Z%dd  Z&djd"d#Z'g d$d d g g d%g dd&d'd&d(d(d)d fd*ee( d+ee( d,e)d-ee) d.e)d/e*d0e*d1ee) d2ee* d3e(d4ee( d5e)d6e+d7e*d8e+d9e+d:e+d;ee)e)f d<e)f&d=d>Z,dkd?d@Z-dAdB Z.dldDdEZ/dFdG Z0ddHdIdIdIdIdIdJdKdLdMg g fdNee( dOe)dPe)d.e)dQe1dRee( dSee( fdTdUZ2dmdWdXZ3G dYdZ dZZ4d[d\ Z5d)d]e(d^e)d_e*d`e*fdadbZ6dndddeZ7dS )o    )
ThreadPoolN)partial)Path)tqdm)ListTuple)open_memmap
read_audio)reverberate }  c              	   C   sv   t | }g }td|jd |D ]%}||||  }|jd |kr0t |t ||jd  f}|| qt |S )a  
    Takes an input list of 1D arrays (of different lengths), concatenates them together,
    and then extracts clips of a uniform size by dividing the combined array.

    Args:
        audio_data (List[ndarray]): A list of 1D numpy arrays to combine and stack
        clip_size (int): The desired total length of the uniform clip size (in samples)

    Returns:
        ndarray: A N by `clip_size` array with the audio data, converted to 16-bit PCM
    r   )nphstackrangeshapezerosappendarray)
audio_data	clip_sizeZcombined_dataZnew_examplesichunk r   K/home/kim/smarthome/.venv/lib/python3.10/site-packages/openwakeword/data.pystack_clips(   s   

r   c           	   	   C   s   g }| D ]}z	| t| W q ty   Y qw tdd |D | }t||f}d}d}|D ]6}t||f}|jd |kr^|d| ||ddf< ||d }|d7 }|jd |ksA|jdkre|nd}q1|d 	tj
}|S )a  
    Takes the specified audio files and shapes them into an array of N by `clip_size`,
    where N is determined by the length of the audio files and `clip_size` at run time.

    Clips longer than `clip size` are truncated and extended into the N+1 row.
    Clips shorter than `clip_size` are combined with the previous or next clip
    (except for the last clip in `files`, which is ignored if it is too short.)

    Args:
        files (List[str]): A list of filepaths
        clip_size (int): The number of samples (of 16khz audio) for all of the rows in the array

    Returns:
        ndarray: A N by `clip_size` array with the audio data, converted to 16-bit PCM
    c                 S   s   g | ]}|j d  qS r   )r   .0r   r   r   r   
<listcomp>]       z$load_audio_clips.<locals>.<listcomp>Nr        )r   r
   
ValueErrorsumr   emptyr   r   sizeastypeint16)	filesr   r   r   NXZprevious_row_remainderZcntrowr   r   r   load_audio_clipsC   s*   r,   ffmpegc                 C   sD   |dkrd|  d| d}n|dkrd|  d| d}t | d S )NZsoxzsox "z" -G -r 16000 -c 1 -b 16 ""r-   zffmpeg -y -i "z" -ar 16000 ")ossystem)Z
input_fileoutput_filebackendcmdr   r   r   _convert_clipv   s   
r4   >  r    c                 C   s6   t |d}tt|d}||dd t| |D  dS )a9  
    Converts files in parallel with multithreading using Sox or ffmpeg.

    Intended to only convert input audio files into single-channel, 16 khz clips.

    Args:
        input_files (List[str]): A list of paths to input files
        output_files (List[str]): A list of paths to output files, corresponding 1:1 to the input files
        sr (int): The output sample rate of the converted clip
        ncpu (int): The number of CPUs to use for the conversion
        backend (str): The utilty to use for conversion, "sox" or "ffmpeg"

    Returns:
        None
    )Z	processes)r2   c                 S   s   g | ]\}}||fqS r   r   r   r   jr   r   r   r          z!convert_clips.<locals>.<listcomp>N)r   r   r4   starmapzip)Zinput_filesZoutput_filessrZncpur2   poolfr   r   r   convert_clips   s   
 r>   r%   c                    s  g }g }| D ]_}g }g }	|r*dd t ||D }	||	 |dd |	D  ntt|D ]}
|	|
j ||
j ||
 j	 q1|dkrU|t
|	| q|dkre|dd t|	D  q|g kr fddt||D }dd |D d	d |D fS |g fS )
ah  
    Gets the paths of wav files in flat target directories, automatically filtering
    out files below/above the specified length (in seconds). Assumes that all
    wav files are sampled at 16khz, are single channel, and have 16-bit PCM data.

    Uses `os.scandir` in Python for highly efficient file system exploration,
    and doesn't require loading the files into memory for length estimation.

    Args:
        target_dir (List[str]): The target directories containing the audio files
        min_length_secs (float): The minimum length in seconds (otherwise the clip is skipped)
        max_length_secs (float): The maximum length in seconds (otherwise the clip is skipped)
        duration_method (str): Whether to use the file size ('size'), or header information ('header')
                               to estimate the duration of the audio file. 'size' is generally
                               much faster, but assumes that all files in the target directory
                               are the same type, sample rate, and bitrate. If None, durations are not calculated.
        glob_filter (str): A pathlib glob filter string to select specific files within the target directory

    Returns:
        tuple: A list of strings corresponding to the paths of the wav files that met the length criteria,
               and a list of their durations (in seconds)
    c                 S      g | ]}t |qS r   )strr   r   r   r   r          z&filter_audio_paths.<locals>.<listcomp>c                 S   s   g | ]}t j|qS r   )r/   pathgetsizer   r   r   r   r      r8   r%   headerc                 S   r?   r   )get_clip_durationr   r   r   r   r      rA   c                    s(   g | ]\}}|kr| kr||fqS r   r   r6   max_length_secsmin_length_secsr   r   r      s   ( c                 S      g | ]}|d  qS r   r   r   r   r   r   r      rA   c                 S   rI   r    r   r   r   r   r   r      rA   )r   globextendr   r/   scandirr   rB   statst_sizeestimate_clip_durationr:   )Ztarget_dirsrH   rG   Zduration_methodZglob_filter
file_paths	durations
target_dirsizesZ	dir_pathsr   filteredr   rF   r   filter_audio_paths   s,   
rV   audio_filesrT   c                 C   sl   t | d }t| d }dtj| d  |jj|jj  }g }|D ]}|	|d | |jj  q$|S )ai  Estimates the duration of each audio file in a list.

    Assumes that all of the audio files have the same audio format,
    bit depth, and sample rate.

    Args:
        audio_file (str): A list of audio file paths
        sizes (int): The size of each audio file in bytes

    Returns:
        list: A list of durations (in seconds) for the audio files
    r      )

torchaudioinfomutagenFiler/   rB   rC   Zbitratelengthr   )rW   rT   detailsZ
correctionrR   r%   r   r   r   rP      s   $rP   c                 C   s   ddd}d}zt | }W n ty   | Y S w tj| }|jdkr2|jdkr0||d  }|S |jdkrB|jdkrB||d	  }|S )
a)  Estimates the duration of an MP3 file from metadata and file-size.
    Is only accurate for 16000 khz sample rate audio with a relatively
    constant bit-rate.

    Args:
        fpath (str): The input path to the MP3 file

    Returns:
        float: The duration of the MP3 file in seconds
    g>&5?g>&%?)16_khz_single_channel16_khz_stereor   r    r5   r_      r`   )rY   rZ   RuntimeErrorr/   rB   rC   Znum_channelssample_rate)ZfpathZconversion_factorsZduration_secondsmdnbytesr   r   r   estimate_mp3_duration   s$   



rf   c                 C   s0   zt | }W n
 ty   Y dS w |j|j S )zJGets the duration of an audio clip in seconds from file header informationr   )rY   rZ   rb   Z
num_framesrc   )clipmetadatar   r   r   rE     s   rE   ra   c                 C   s   | d | d S )a  
    Calculates the duration (in seconds) from a WAV file, assuming it contains 16 khz single-channel audio.
    The bit depth is user specified, and defaults to 2 for 16-bit PCM audio.

    Args:
        size (int): The file size in bytes
        nbytes (int): How many bytes for each data point in the audio (e.g., 16-bit is 2, 32-bit is 4, etc.)

    Returns:
        float: The duration of the audio file in seconds
    ,   r5   r   )r%   re   r   r   r   get_wav_duration_from_filesize  s   rj       randomTg        F)r   r   foreground_clipsbackground_clipscombined_sizelabels
batch_sizesnr_lowsnr_highstart_indexforeground_durationsforeground_truncate_strategyrirsrir_probabilityvolume_augmentationgenerated_noise_augmentationshufflereturn_sequence_labelsreturn_background_clipsreturn_background_clips_delayseedc           0   
   #   s   |rt j| t| |sdg| }n
t|dk r td|s)dgt|  }|rYt jt| }t | |  } t ||  }t ||  }|rYt ||  }t	dt| |D ]}d||||  }dd | |||  D }dd |D }|r fddt
|||||  D }t ||||  }dd t||D }d	d |D }g }t j|d |d
 d
 }t|D ]n\}}|jd || k r|t || |jd  t j}|d| ||< ||d| ||    q|jd || kr5t jdtd
|jd | | }||||  ||< |||| || |    qt j|||}g } g }!t
||||D ]Z\}"}#}$}%|#jd |kr\t|#jt|"|#|$|%}&|!t||%|%|"jd   t j |k rg d}'tjj|t j|'d}(t|(|(  }(t|&|(t j|d}&| |& qIt| })tt |!}*|
rt j |krt !t|
\}+|+jd d
kr|+td|+jd d
 ddf }+t"|)|+dd})|rt jdd|)jd },|,|)jd
dd  d |) })ntjt#|)d
dd\}-}.|)|-j$dd })|)% d t j&})tt '|)jd
ddkd }/|)|/ })||/ }|*|/ }*|sM|)|sG|n|*dfV  qat|% d t j&|/ }|)|sb|n|*|fV  qadS )a  
    Mixes foreground and background clips at a random SNR level in batches.

    References: https://pytorch.org/audio/main/tutorials/audio_data_augmentation_tutorial.html and
    https://speechbrain.readthedocs.io/en/latest/API/speechbrain.processing.speech_augmentation.html#speechbrain.processing.speech_augmentation.AddNoise

    Args:
        foreground_clips (List[str]): A list of paths to the foreground clips
        background_clips (List[str]): A list of paths to the background clips (randomly selected for each
                                      foreground clip)
        combined_size (int): The total length (in samples) of the combined clip. If needed, the background
                             clips are duplicated or truncated to reach this length.
        labels (List[int]): A list of integer labels corresponding 1:1 for the foreground clips. Will be updated
                            as needed with foreground clips to ensure that mixed clips retain the proper labels.
        batch_size (int): The batch size
        snr_low (float): The low SNR level of the mixing in db
        snr_high (float): The high snr level of the mixing in db
        start_index (List[int]): The starting position (in samples) for the foreground clip to start in
                                 the background clip. If the foreground clip is longer than `combined_size`
                                 when starting at this point, the foreground clip will be truncated
                                 according to the `foreground_truncate_strategy` argument.
        foreground_durations (List[float]): The desired duration of each foreground clip (in seconds)
        foreground_truncate_strategy (str): The method used to truncate the foreground clip, if needed based on the
                                            `start_index`, `foreground_durations`, and `combined_size` arguments.
                                            See the options in the `truncate_clip` method.
        rirs (List[str]): A list of paths to room impulse response functions (RIR) to convolve with the
                          clips to simulate different recording environments. Applies a single random selection from the
                          list RIR file to the entire batch. If empty (the default), nothing is done.
        rir_probability (float): The probability (between 0 and 1) that the batch will be convolved with a RIR file.
        volume_augmentation (bool): Whether to randomly apply volume augmentation to the clips in the batch.
                                    This simply scales the data of each clip such that the maximum value is is between
                                    0.02 and 1.0 (the floor shouldn't be zero as beyond a certain point the audio data
                                    is no longer valid).
        generated_noise_augmentation: The probability of further mixing the mixed clip with generated random noise.
                                      Will be either "white", "brown", "blue", "pink", or "violet" noise, mixed at a
                                      random SNR between `snr_low` and `snr_high`.
        return_background_clips (bool): Whether to return the segment of the background clip that was mixed with each
                                        foreground clip in the batch.
        return_background_clips_delay (Tuple(int)): The lower and upper bound of a random delay (in samples)
                                           to apply to the segment of each returned backgroud clip mixed
                                           with each foreground clip in the batch. This is primarily intended to
                                           simulate the drift between input and output channels
                                           in audio devices, which means that the mixed audio is never
                                           exactly aligned with the two source clips.
        shuffle (bool): Whether to shuffle the foreground clips before mixing (default: True)
        return_sequence_labels (bool): Whether to return sequence labels (i.e., frame-level labels) for each clip
                                       based on the start/end positions of the foreground clip.
        seed (int): A random seed

    Returns:
        generator: Returns a generator that yields batches of mixed foreground/background audio, labels, and the
                   background segments used for each audio clip (or None is the
                   `return_backgroun_clips` argument is False)
    r   zQError! At least one value of the `start_index` argument is <0. Check your inputs.r5   c                 S   r?   r   r	   r   r7   r   r   r   r     rA   z#mix_clips_batch.<locals>.<listcomp>c                 S   &   g | ]}t |jd kr|d n|qS r    r   lenr   r   r   r   r   r        & c                    s$   g | ]\}}t |t|  qS r   )truncate_clipint)r   r7   krv   r;   r   r   r     s    c                 S   r?   r   r	   r   r   r   r   r     rA   c                 S   r   r   r   r   r   r   r   r     r   r    )whiteZpinkblueZbrownviolet)colorNavgZrescale_ampg{Gz?      ?dim).NT)r   Zkeepdim)minr!   )(r   rl   r   r   r"   r   Zpermutationr   tolistr   r:   samplerandint	enumerater   repeatceilr&   Zint32r   clonemaxuniformmix_clipget_frame_labels	acoustics	generatornoisechoicetorch
from_numpyvstackrY   loadr   absclampnumpyr'   where)0rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   pr   Zstart_index_batchZforeground_clips_batchZlabels_batchZbackground_clips_batchZbackground_clips_batch_delayeddelayZndxZbackground_clipZrepeatedrZsnrs_dbZmixed_clipsZsequence_labelsfgbgsnrstartZ
mixed_clipZnoise_colorZ
noise_clipZmixed_clips_batchZsequence_labels_batchrir_waveformZvolume_levelsZabs_max_Zerror_indexr   r   r   mix_clips_batch&  s   L
 ""

"
 
r   c                 C   sx   t t | d d t}t d| d}t t|| }t t|| }d|||d < d||d |d < |S )Nip0  i   r    ra   )r   r   r   r&   r   ZarangeZargminr   )ro   r   endbufferZsequence_labelZframe_positionsZstart_frameZ	end_framer   r   r   r     s   r   c                 C   sj   | j dd|j dd}}d|d  }|| | }|||| jd   ||   |||| jd  < |d S )Nra   )r   
      r   )Znormr   )r   r   r   r   Zfg_rmsZbg_rmsscaler   r   r   r     s
   0r   truncate_startc                 C   s   | j d |krU|dkr| | j d | d } |dkr | d| } |dkr=tt| j d | d }| ||  d| } |dkrUtjd| j d | }| |||  } | S )a,  
    Truncates and audio clip with the specified method

    Args:
        x (nd.array): An array of audio data
        max_size (int): The maximum size (in samples)
        method (str): Can be one of four options:
            - "truncate_start": Truncate the start of the clip
            - "truncate_end": Truncate the end of the clip
            - "truncate_both": Truncate both the start and end of the clip
            - "random": Randomly select a segment of the right size from the clip

    Returns:
        nd.array: The truncated audio data
    r   r   NZtruncate_endZtruncate_bothra   rl   )r   r   r   r   rl   r   )xmax_sizemethodnZrnr   r   r   r     s   r   c                 C   s   t |trt|d \}}nt |trtt|\}}|jd dkr6|td|jd d ddf }t	t
| |dd}| S )a  
    Applies reverberation to the input audio clips

    Args:
        x (nd.array): A numpy array of shape (batch, audio_samples) containing the audio clips
        rir_files (Union[str, list]): Either a path to an RIR (room impulse response) file or a list
                                      of RIR files. If a list, one file will be randomly chosen
                                      to apply to `x`

    Returns:
        nd.array: The reverberated audio clips
    r   r    Nr   r   )
isinstancer@   rY   r   listrl   r   r   r   r   r   r   r   )r   Z	rir_filesr   r;   Zreverbedr   r   r   apply_reverb  s   

"r      g      ?g      ?r         ?)SevenBandParametricEQTanhDistortion
PitchShiftBandStopFilterAddColoredNoiseAddBackgroundNoiseGainRIR
clip_pathstotal_lengthr;   augmentation_probabilitiesbackground_clip_paths	RIR_pathsc                 c   s:   t t jdd|d dt jdd|d dg}|g krTttjd	d
|d dddtj|d ddtjdddd|d ddtj|d |ddddtj	d|d dg}n,ttjd	d
|d dddtj|d ddtjdddd|d ddtj	d|d dg}t
dt| |D ]}	| |	|	|  }
g }|
D ]4}t|\}}|d }|jd |kr|d| }||krtdt|||}|t|||d qttj rd nd!}|t|jd"d#||djd"d$}|d% tj kr|g krtt|\}}t| |d&d'}|   d( !tj"V  qd)S )*a	  
    Applies audio augmentations to the specified audio clips, returning a generator that applies
    the augmentations in batches to support very large quantities of input audio files.

    The augmentations (and probabilities) are chosen from experience based on training openWakeWord models, as well
    as for the efficiency of the augmentation. The individual probabilities of each augmentation may be adjusted
    with the "augmentation_probabilities" argument.

    Args:
        clip_paths (List[str]) = The input audio files (as paths) to augment. Note that these should be shorter
                                 than the "total_length" argument, else they will be truncated.
        total_length (int): The total length of audio files (in samples) after augmentation. All input clips
                            will be left-padded with silence to reach this size, with between 0 and 200 ms
                            of other audio after the end of the original input clip.
        sr (int): The sample size of the input audio files
        batch_size (int): The number of audio files to augment at once.
        augmentation_probabilities (dict): The individual probabilities of each augmentation. If all probabilities
                                           are zero, the input audio files will simply be padded with silence. THe
                                           default values are:

                                            {
                                                "SevenBandParametricEQ": 0.25,
                                                "TanhDistortion": 0.25,
                                                "PitchShift": 0.25,
                                                "BandStopFilter": 0.25,
                                                "AddColoredNoise": 0.25,
                                                "AddBackgroundNoise": 0.75,
                                                "Gain": 1.0,
                                                "RIR": 0.5
                                            }

        background_clip_paths (List[str]) = The paths to background audio files to mix with the input files
        RIR_paths (List[str]) = The paths to room impulse response functions (RIRs) to convolve with the input files,
                                producing a version of the input clip with different acoustic characteristics.

    Returns:
        ndarray: A batch of augmented audio clips of size (batch_size, total_length)
    i   r   )Zmin_gain_dbZmax_gain_dbr   g-C6?g?r   )Zmin_distortionZmax_distortionr      r   r5   Z	per_batch)Zmin_transpose_semitonesZmax_transpose_semitonesr   rc   moder   )r   r   r      ra   r   )min_snr_in_dbmax_snr_in_dbZmin_f_decayZmax_f_decayr   r   r   i   )r   Zbackground_pathsr   r   r   r   r   )Zmax_gain_in_dbr   z2Error! Clip does not have the correct sample rate!)Zsamplesrc   zcuda:0cpur    r   )Zaxisr   r   r   r!   N)#audiomentationsZComposer   r   torch_audiomentationsr   r   r   r   r   r   r   rY   r   r   r"   create_fixed_size_clipr   r   r   devicecudaZis_availabler   Z	unsqueezetoZsqueezer   rl   r   r   r   r   r&   r'   )r   r   r;   rq   r   r   r   Zaugment1Zaugment2r   batchZaugmented_clipsrg   Z	clip_dataZclip_srr   Zaugmented_batchr   r   r   r   augment_clips.  s   ;


(r   皙?c                 C   s   t |}tt jd|| }|du r!td|tt| |  }t| |krCt j dkr8| d|  }|S | | d  }|S | |||t|  < |S )a  
    Create a fixed-length clip of the specified size by padding an input clip with zeros
    Optionally specify the start/end position of the input clip, or let it be chosen randomly.

    Args:
        x (ndarray): The input audio to pad to a fixed size
        n_samples (int): The total number of samples for the fixed length clip
        sr (int): The sample rate of the audio
        start (int): The start position of the clip in the fixed length output, in samples (default: None)
        end_jitter (float): The time (in seconds) from the end of the fixed length output
                            that the input clip should end, if `start` is None.

    Returns:
        ndarray: A new array of audio data of the specified length
    r   Nr   )r   r   r   rl   r   r   r   r   )r   Z	n_samplesr;   r   Z
end_jitterZdatr   r   r   r     s   
r   c                   @   sN   e Zd ZdZi di i i fdedededededefd	d
Zdd Zdd ZdS )mmap_batch_generatoraR  
    A generator class designed to dynamically build batches from mmaped numpy arrays.

    The generator will return tuples of (data, labels) with a batch size determined
    by the `n_per_class` initialization argument. When a mmaped numpy array has been
    fully interated over, it will restart at the zeroth index automatically.
    r   
data_fileslabel_filesrq   n_per_classdata_transform_funcslabel_transform_funcsc                    s  |_ |_|_|_|_dd | D _dd | D _dd | D _	fddj D _
fddj D _d jsi _j D ]O\}}tjdj| d j| d	 f}	j|d
rj|d
|	jd d  j| d tdd j D  }
tdtt||
   j|< qTt fddj D }tdd j D | }|_td| d
S d
S )a  
        Initialize the generator object

        Args:
            data_files (dict): A dictionary of labels (as keys) and on-disk numpy array paths (as values).
                               Keys should be integer strings representing class labels.
            label_files (dict): A dictionary where the keys are the class labels and the values are the per-example
                                labels. The values must be the same shape as the correponding numpy data arrays
                                from the `data_files` argument.
            batch_size (int): The number of samples per batch
            n_per_class (dict): A dictionary with integer string labels (as keys) and number of example per batch
                               (as values). If None (the default), batch sizes for each class will be
                               automatically calculated based on the the input dataframe shapes and transformation
                               functions.

            data_transform_funcs (dict): A dictionary of transformation functions to apply to each batch of per class
                                    data loaded from the mmaped array. For example, with an array of shape
                                    (batch, timesteps, features), if the goal is to half the timesteps per example,
                                    (effectively doubling the size of the batch) this function could be passed:

                                    lambda x: np.vstack(
                                        (x[:, 0:timesteps//2, :], x[:, timesteps//2:, :]
                                    ))

                                    The user should incorporate the effect of any transform on the values of the
                                    `n_per_class` argument accordingly, in order to end of with the desired
                                    total batch size for each iteration of the generator.
            label_transform_funcs (dict): A dictionary of transformation functions to apply to each batch of labels.
                                          For example, strings can be mapped to integers or one-hot encoded,
                                          groups of classes can be merged together into one, etc.
        c                 S   s    i | ]\}}|t j|d dqS )r   Z	mmap_moder   r   r   labelflr   r   r   
<dictcomp>  s     z1mmap_batch_generator.__init__.<locals>.<dictcomp>c                 S   s   i | ]
\}}|t |qS r   r   r   r   r   r   r         c                 S   s   i | ]}|d qS r   r   r   r   r   r   r   r         c                       i | ]	}| j | jqS r   datar   r   selfr   r   r         c                    r   r   r   r   r   r   r   r     r   r    r   ra   Nr   c                 S   rI   r   r   r   r   r   r   r   )  rA   z1mmap_batch_generator.__init__.<locals>.<listcomp>c                    s   g | ]}|  qS r   r   )r   val)scale_factorr   r   r   -  rA   c                 S   rI   r   r   r   r   r   r   r   .  rA   zBatches/steps per epoch:)r   r   r   r   r   itemsr   rp   keysdata_counterZoriginal_shapesshapesr   rl   getr   r#   valuesr   r   Zbatch_per_epochprint)r   r   r   rq   r   r   r   Zlblr   Z
dummy_dataratioZbatches_per_epochr   )r   r   r   __init__  s0   (	&&"zmmap_batch_generator.__init__c                 C   s   | S )Nr   r   r   r   r   __iter__2  s   zmmap_batch_generator.__iter__c                 C   s&  	 g g }}| j  D ]}\}}| j| | j| d kr d| j|< | j| | j| | j| |  }| j|  |jd 7  < | jrM| j|rM| j| |}| j|d rf| j	| | j| | j| |  }n|g|jd  }| j
r~| j
|r~| j
| |}|| || qt|t|fS )NTr   )r   r   r   r   r   r   r   r   r   rp   r   r   rL   r   r   r   )r   r*   yr   r   r   Zy_batchr   r   r   __next__5  s"   

"$
zmmap_batch_generator.__next__N)	__name__
__module____qualname____doc__dictr   r  r  r  r   r   r   r   r     s,    	
Mr   c                 C   s:  t j| dd}d}t ||ddddf dkr-|d8 }t ||ddddf dks|jd | d }| dd }t|d	t j||jd |jd
 fd}ttd|jd d|jd d ddD ]-}|d |kr{||| 	 |||< |
  qb|||d  	 |||d < |
  qbt|  t||  dS )a-  
    Trims blank rows from the end of a mmaped numpy array by creates new mmap array without the blank rows.
    Note that a copy is created and disk usage will briefly double as the function runs.

    Args:
        mmap_path (str): The path to mmap array file to trim

    Returns:
        None
    r   r   r   Nr   r    z.npyz2.npyzw+ra   )r   Zdtyper   i   zTrimming empty rows)totalZdesc)r   r   allr   stripr   Zfloat32r   r   copyflushr/   removerename)Z	mmap_pathZ
mmap_file1r   ZN_newZoutput_file2Z
mmap_file2r   r   r   	trim_mmapX  s$     
*
 

r  
input_textr)   include_partial_phraseinclude_input_wordsc                    s  g dg }dd    D }g |v rtjtjtjtdd}tjtjtjtjtdsHttjtjtjtd tj|st	
d ddl}d	}|j|d
d}	t|d}
|	jddD ]	}|ru|
| qlW d   n1 sw   Y  ddlm} ||}t|   D ]S\g kr| qg krt	
d d |ddt	
d d  |tddtdd qtd trt	
d d |d  qfdd|D }g }t|   D ]a\g }  g }tdkr|d n|ttdtd dd |D ]'}t|}d d |D }fd!dt||D }|fd"d|D  q,|g kr^|| qg }t|D ]h}g }t|   D ]\}}t j!! d#| kr|| qp|t j!"| qp|durt   d#krt j!! |krt j!#d#t   d# }|dt j!j"||d$d% qe|d| qe fd&d|D }|S )'ah  
    Generate adversarial words and phrases based on phoneme overlap.
    Currently only works for english texts.
    Note that homophones are excluded, as this wouldn't actually be an adversarial example for the input text.

    Args:
        input_text (str): The target text for adversarial phrases
        N (int): The total number of adversarial texts to return. Uses sampling,
                 so not all possible combinations will be included and some duplicates
                 may be present.
        include_partial_phrase (float): The probability of returning a number of words less than the input
                                        text (but always between 1 and the number of input words)
        include_input_words (float): The probability of including individual input words in the adversarial
                                     texts when the input text consists of multiple words. For example,
                                     if the `input_text` was "ok google", then setting this value > 0.0
                                     will allow for adversarial texts like "ok noodle", versus the word "ok"
                                     never being present in the adversarial texts.

    Returns:
        list: A list of strings corresponding to words and phrases that are phonetically similar (but not identical)
              to the input text.
    )ZAAZAEZAHZAOZAWZAXZAXRZAYZEHZERZEYZIHZIXZIYZOWZOYZUHZUWZUXc                 S   s   g | ]}t |qS r   pronouncingZphones_for_wordr   r   r   r   r     r   z.generate_adversarial_texts.<locals>.<listcomp>	resourceszen_us_cmudict_forward.ptz;Downloading phonemizer model from DeepPhonemizer library...r   Nzchttps://public-asai-dl-models.s3.eu-central-1.amazonaws.com/DeepPhonemizer/en_us_cmudict_forward.ptT)streamwbi   )
chunk_size)
Phonemizerz
The word 'zj' was not found in the pronunciation dictionary! Using the DeepPhonemizer library to predict the phonemes.en_us)langzPhones for 'z': z[\]|\[] z\]\[ z0There are multiple pronunciations for the word 'z'.c                    s.   g | ]}t d  dd t dd|qS )|c                 S   s   t | dd S )Nr   z[0|1|2])r@   group)r   r   r   r   <lambda>  r   z7generate_adversarial_texts.<locals>.<listcomp>.<lambda>z\d+r   )resubjoinr   )vowel_phonesr   r   r     s   . ra   z(.){1,3})max_replacereplace_charc                 S   s   g | ]	}t |d  qS r   r  r   r   r   r   r     r   c                    s   g | ]
\}}| kr|qS r   r   r6   )phonesr   r   r     r   c                    s   g | ]
}   |kr|qS r   )lowerr   )wordr   r   r     r   r    F)r%   replacec                    s   g | ]}| kr|qS r   r   r   )r  r   r   r     r8   )$splitr/   rB   r'  dirnameabspath__file__existsmkdirloggingwarningrequestsr   openiter_contentwriteZdp.phonemizerr  Zfrom_checkpointr:   rL   r   r%  r&  r   r   r   phoneme_replacementr   r  searchr   r   rl   r   r   )r  r)   r  r  Zword_phonesZinput_text_phonesZphonemizer_mdl_pathr7  file_urlr   r=   r   r  Z
phonemizerZadversarial_phrasesZ
query_expsZadversarial_wordsquerymatchesZmatches_phonesZallowed_matchesZadversarial_textsr   Ztxtsr7   r   Zn_wordsr   )r  r+  r(  r-  r   generate_adversarial_texts  s~    &$


 "


,"r@  
"(.){1,3}"c           
      C   sl   g }t | }td|d D ]&}ttt||}|D ]}| }|D ]}	|||	< q#|d| qq|S )Nr    r!  )r   r   	itertoolscombinationsr   r  r   r'  )
Zinput_charsr)  r*  resultscharsr   combindicesZ
chars_copyr   r   r   r   r;    s   
r;  )r   )r-   )r5   r    r-   )r%   N)ra   rJ   )r   )r5   Nr   )rA  )8Zmultiprocessing.poolr   r/   r%  r5  	functoolsr   pathlibr   rl   r   typingr   r   r   r   rB  r  r   r   r   Znumpy.lib.formatr   Zspeechbrain.dataio.dataior
   Z(speechbrain.processing.signal_processingr   rY   r[   r   r   r,   r4   r>   rV   r   rP   rf   rE   rj   r@   r   floatboolr   r   r   r   r   r  r   r   r   r  r@  r;  r   r   r   r   <module>   s   


3
	
4"

	


 
<

 

 
!{(i