o
    i,                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
 d dlZd dlZd dlmZ d dlZd dlmZ d dlmZmZmZmZ d dlZG dd	 d	Z	
		d!dee dee dededef
ddZd"ddZd#ddZg e jee j!" ddfdee defddZ#dd  Z$dS )$    N)deque)
ThreadPool)ProcessQueue)tqdm)open_memmap)UnionListCallableDequec                   @   s   e Zd ZdZ						d8deded	ed
ededefddZdd Zdd fdee	j
ef defddZdd Zd9de	j
dedefddZd:ded	efd d!Zd;d#d$Zd;d%d&Zd;d'd(Zd)d* Zd+d, Zd-d. Zd<d1ed2efd3d4Zd5d6 Zd7S )=AudioFeaturesz
    A class for creating audio features from audio data, including melspectograms and Google's
    `speech_embedding` features.
     >     onnxcpumelspec_model_pathembedding_model_pathsrncpuinference_frameworkdevicec                    s  |dkrzddl }W n ty   tdw |dkr)tjttj	
 ddd}|dkr<tjttj	
 ddd	}d
|v sDd
|v rHtd| }||_||_|j|||dkr]dgndgd_j d _fdd_|j|||dkr}dgndgd_fdd_n|dkr;zddlm}	 W n ty   tdw |dkrtjttj	
 ddd}|dkrtjttj	
 ddd}d|v sd|v rtd|	j||d_jjdddgdd j  j d d j d d d_fdd }
|
_|	j||d_j  j d d  j d d d_ fd!d"}|_t|d# d$_t !d%_"d&_#d_$t %d_&'t j()d'd(d)*t j+_,d*_-dS )+a  
        Initialize the AudioFeatures object.

        Args:
            melspec_model_path (str): The path to the model for computing melspectograms from audio data
            embedding_model_path (str): The path to the model for Google's `speech_embedding` model
            sr (int): The sample rate of the audio (default: 16000 khz)
            ncpu (int): The number of CPUs to use when computing melspectrograms and audio features (default: 1)
            inference_framework (str): The inference framework to use when for model prediction. Options are
                                       "tflite" or "onnx". The default is "tflite" as this results in better
                                       efficiency on common platforms (x86, ARM64), but in some deployment
                                       scenarios ONNX models may be preferable.
            device (str): The device to use when running the models, either "cpu" or "gpu" (default is "cpu".)
                          Note that depending on the inference framework selected and system configuration,
                          this setting may not have an effect. For example, to use a GPU with the ONNX
                          framework the appropriate onnxruntime package must be installed.
        r   r   NzdTried to import onnxruntime, but it was not found. Please install it using `pip install onnxruntime`r   	resourcesmodelszmelspectrogram.onnxzembedding_model.onnx.tflitezJThe onnx inference framework is selected, but tflite models were provided!ZgpuZCUDAExecutionProviderZCPUExecutionProvider)Zsess_options	providersc                    s    j d d| iS )Ninput)melspec_modelrunxself L/home/kim/smarthome/.venv/lib/python3.10/site-packages/openwakeword/utils.py<lambda>W   s    z(AudioFeatures.__init__.<locals>.<lambda>c                    s    j d d| id  S )NZinput_1r   )embedding_modelr   squeezer   r!   r#   r$   r%   ]   s    tflitezmTried to import the TFLite runtime, but it was not found.Please install it using `pip install tflite-runtime`zmelspectrogram.tflitezembedding_model.tflite.onnxzJThe tflite inference framework is selected, but onnx models were provided!)Z
model_pathZnum_threadsr      Tstrictindexc                    s   | j d dkr!jjdd| j d gdd j  | j d _njdkr9jjdddgdd j  d_j |  j  jS )Nr   r*   r   Tr+   )shaper   resize_tensor_inputallocate_tensors"_tflite_current_melspec_input_size
set_tensorinvoke
get_tensorr   )melspec_input_indexmelspec_output_indexr"   r#   r$   tflite_melspec_predictz   s   



z6AudioFeatures.__init__.<locals>.tflite_melspec_predictc                    s   | j d dkr#jjd| j d dddgdd j  | j d _njdkr>jjdg ddd j  | j d _j |  j  j S )Nr   r   L       Tr+   )r   r8   r9   r   )	r.   r&   r/   r0   $_tflite_current_embedding_batch_sizer2   r3   r4   r'   r   )embedding_input_indexembedding_output_indexr"   r#   r$   tflite_embedding_predict   s    



z8AudioFeatures.__init__.<locals>.tflite_embedding_predict
   )maxlenr8   r9   i       x   ).ZonnxruntimeImportError
ValueErrorospathjoinpathlibPath__file__parentresolveZSessionOptionsZinter_op_num_threadsZintra_op_num_threadsZInferenceSessionr   Zget_providersonnx_execution_providermelspec_model_predictr&   embedding_model_predictZtflite_runtime.interpreterinterpreterZInterpreterr/   r0   Zget_input_detailsZget_output_detailsr1   r:   r   raw_data_buffernponesmelspectrogram_buffermelspectrogram_max_lenaccumulated_samplesemptyraw_data_remainder_get_embeddingsrandomrandintastypeint16feature_bufferfeature_buffer_max_len)r"   r   r   r   r   r   r   ZortZsessionOptionsr(   r7   r=   r#   )r;   r<   r5   r6   r"   r$   __init__&   s~   


 
zAudioFeatures.__init__c                 C   sL   | j   td| _d| _td| _| tj	
dddtj| _dS )zReset the internal buffersr@   r   rA   rB   rC   N)rS   clearrT   rU   rV   rX   rY   rZ   r[   r\   r]   r^   r_   r`   r!   r#   r#   r$   reset   s
   
$zAudioFeatures.resetc                 C   s   | d d S )Nr>      r#   r   r#   r#   r$   r%      s    zAudioFeatures.<lambda>r    melspec_transformc                 C   s   t |trt|tjn|}|jtjkrtd|j dt|j	dk r*|d n|}|jtj
kr8|tj
n|}| |}t|d }||}|S )a  
        Function to compute the mel-spectrogram of the provided audio samples.

        Args:
            x (Union[np.ndarray, List]): The input audio data to compute the melspectrogram from
            melspec_transform (Callable): A function to transform the computed melspectrogram. Defaults to a transform
                                          that makes the ONNX melspectrogram model closer to the native Tensorflow
                                          implementation from Google (https://tfhub.dev/google/speech_embedding/1).

        Return:
            np.ndarray: The computed melspectrogram of the input audio data
        zIInput data must be 16-bit integers (i.e., 16-bit PCM audio).You provided z data.re   Nr   )
isinstancelistrT   arrayr^   r_   dtyperF   lenr.   float32rP   r'   )r"   r    rf   Zoutputsspecr#   r#   r$   _get_melspectrogram   s    

z!AudioFeatures._get_melspectrogramc                 C   s$   |j d dkr|d }| |}|S )z
        Computes the Google `speech_embedding` features from a melspectrogram input

        Args:
            melspec (np.ndarray): The input melspectrogram

        Returns:
            np.ndarray: The computed audio features/embeddings
        r   r   rg   )r.   rQ   )r"   melspec	embeddingr#   r#   r$   _get_embeddings_from_melspec   s   

z*AudioFeatures._get_embeddings_from_melspecr8      window_size	step_sizec                 K   s   | j |fi |}g }td|jd dD ]}||||  }|jd |kr*|| qtjt|ddtj}	| 	|	}
|
S )z@Function to compute the embeddings of the provide audio samples.r   rs   )Zaxis)
ro   ranger.   appendrT   Zexpand_dimsrj   r^   rm   rQ   )r"   r    rt   ru   kwargsrn   windowsiwindowbatchrq   r#   r#   r$   r[      s   

zAudioFeatures._get_embeddingsaudio_lengthc                 C   s0   t jddt|| d t j}| |jS )zjFunction that determines the size of the output embedding array for a given audio clip length (in seconds)rv   r   i  )rT   r\   uniformintr^   r_   r[   r.   )r"   r~   r   r    r#   r#   r$   get_embedding_shape   s   $z!AudioFeatures.get_embedding_shape   c                 C   s  d}d| j v rt|d}tt|jd d d }d}tj|jd ||ftjd	}tdt	||jd |D ]D}||||  }	d
| j v rK| 
|	}
n|ri|	jd |kr[|	jd | nd}t|j| j
|	|d}
|
 |||| ddddf< q6|r|  |S )a&  
        Compute the melspectrogram of the input audio samples in batches.

        Note that the optimal performance will depend in the interaction between the device,
        batch size, and ncpu (if a CPU device is used). The user is encouraged
        to experiment with different values of these parameters to identify
        which combination is best for their data, as often differences of 1-4x are seen.

        Args:
            x (ndarray): A numpy array of 16 khz input audio data in shape (N, samples).
                        Assumes that all of the audio data is the same length (same number of samples).
            batch_size (int): The batch size to use when computing the melspectrogram
            ncpu (int): The number of CPUs to use when computing the melspectrogram. This argument has
                        no effect if the underlying model is executing on a GPU.

        Returns:
            ndarray: A numpy array of shape (N, frames, melbins) containing the melspectrogram of
                    all N input audio examples
        NCPUZ	processesr         r9   r   rk   CUDA	chunksize)rO   r   r   rT   ceilr.   rY   rm   rw   maxro   rj   mapr'   close)r"   r    
batch_sizer   pooln_framesZmel_binsmelspecsr{   r}   resultr   r#   r#   r$   _get_melspectrogram_batch   s&   


 $z'AudioFeatures._get_melspectrogram_batchc                 C   s  |j d dk rtdd}d| jv rt|d}|j d d d d }d}tj|j d	 ||ftjd
}g }g }	t|D ]\}
}d}td	|j d	 dD ]}||||  }|j d	 |kr_|	| qI|		|
 t
||kst|
d |j d	 krt|tj}d| jv r| |}n|r|j d	 |kr|j d	 | nd}t|j| j||d}ttd	|j d	 ||	D ]\}}||||  ||ddddf< qg }g }	q:|r|  |S )a  
        Compute the embeddings of the input melspectrograms in batches.

        Note that the optimal performance will depend in the interaction between the device,
        batch size, and ncpu (if a CPU device is used). The user is encouraged
        to experiment with different values of these parameters to identify
        which combination is best for their data, as often differences of 1-4x are seen.

        Args:
            x (ndarray): A numpy array of melspectrograms of shape (N, frames, melbins).
                        Assumes that all of the melspectrograms have the same shape.
            batch_size (int): The batch size to use when computing the embeddings
            ncpu (int): The number of CPUs to use when computing the embeddings. This argument has
                        no effect if the underlying model is executing on a GPU.

        Returns:
            ndarray: A numpy array of shape (N, frames, embedding_dim) containing the embeddings of
                    all N input melspectrograms
        r   r8   zMEmbedding model requires the input melspectrograms to have at least 76 framesNr   r   rs   `   r   r   r   r   )r.   rF   rO   r   rT   rY   rm   	enumeraterw   rx   rl   rj   r^   rQ   r   rr   zipr   )r"   r    r   r   r   r   Zembedding_dim
embeddingsr}   Zndcsndxrp   rt   r{   r|   r   r   jZndx2r#   r#   r$   _get_embeddings_batch$  sF   




  $z#AudioFeatures._get_embeddings_batchc                 C   s<   | j |||d}| j|dddddddf ||d}|S )a  
        Compute the embeddings of the input audio clips in batches.

        Note that the optimal performance will depend in the interaction between the device,
        batch size, and ncpu (if a CPU device is used). The user is encouraged
        to experiment with different values of these parameters to identify
        which combination is best for their data, as often differences of 1-4x are seen.

        Args:
            x (ndarray): A numpy array of 16 khz input audio data in shape (N, samples).
                        Assumes that all of the audio data is the same length (same number of samples).
            batch_size (int): The batch size to use when computing the embeddings
            ncpu (int): The number of CPUs to use when computing the melspectrogram. This argument has
                        no effect if the underlying model is executing on a GPU.

        Returns:
            ndarray: A numpy array of shape (N, frames, embedding_dim) containing the embeddings of
                    all N input audio clips
        r   r   N)r   r   )r"   r    r   r   r   r   r#   r#   r$   embed_clipsf  s   (zAudioFeatures.embed_clipsc                 C   sx   t | jdk rtdt| j| t| j| d d f| _| jjd | j	kr:| j| j	 dddf | _dS dS )a  Note! There seem to be some slight numerical issues depending on the underlying audio data
        such that the streaming method is not exactly the same as when the melspectrogram of the entire
        clip is calculated. It's unclear if this difference is significant and will impact model performance.
        In particular padding with 0 or very small values seems to demonstrate the differences well.
        i  zHThe number of input frames must be at least 400 samples @ 16khz (25 ms)!i  Nr   )
rl   rS   rF   rT   vstackrV   ro   ri   r.   rW   )r"   Z	n_samplesr#   r#   r$   _streaming_melspectrogram  s   " z'AudioFeatures._streaming_melspectrogramc                 C   s*   | j t|tjr|  dS | dS )z9
        Adds raw audio data to the input buffer
        N)rS   extendrh   rT   ndarraytolistr"   r    r#   r#   r$   _buffer_raw_data  s   *zAudioFeatures._buffer_raw_datac                 C   s  d}| j jd dkrt| j |f}td| _ | j|jd  dkrh| j|jd  d }|dkrN|d|  }| | |  jt|7  _|| d  | _ n)|dkrg| | |  j|jd 7  _td| _ n|  j|jd 7  _| | | jdkr| jd dkr| | j t	| jd d ddD ];}d| }|dkr|nt| j
}| j
d| | tjd d d d d d f }|jd dkrt| j| |f| _q| j}d| _| jjd | jkr| j| j d d d f | _|dkr|S | jS )Nr   r*   r   rv   iir8   )rZ   r.   rT   ZconcatenaterY   rX   r   rl   r   ZarangerV   r^   rm   r   r`   rQ   ra   )r"   r    Zprocessed_samples	remainderZx_even_chunksr{   r   r#   r#   r$   _streaming_features  sD   


.z!AudioFeatures._streaming_features   rv   n_feature_frames	start_ndxc                 C   sv   |dkr&|| dkr|t | nt| j}| j||d d f d tjS | jt d| d d d f d tjS )Nrv   r   rg   )r   rl   r`   r^   rT   rm   )r"   r   r   Zend_ndxr#   r#   r$   get_features  s   "*zAudioFeatures.get_featuresc                 C   s
   |  |S rg   )r   r   r#   r#   r$   __call__  s   
zAudioFeatures.__call__N)r   r   r   r   r   r   )r8   rs   )r   )r   r   )r   rv   )__name__
__module____qualname____doc__strr   rb   rd   r   rT   r   r	   r
   ro   rr   r[   floatr   r   r   r   r   r   r   r   r   r#   r#   r#   r$   r   !   sF    
 $

1
B-r   predict_clipr   r(   
file_pathswakeword_modelsprediction_functionr   r   c                    s<  t dt | t | } fddtdt dt | D }td|d D ]}||d   d|   q+g }	g t |D ].}
dd  D }tjd||d|}| fd	d
}|	t||
fd qD|	D ]}|	  qug }|	D ]}
 rtd 
 s|  qdd |D S )a  
    Bulk predict on the provided input files in parallel using multiprocessing using the specified model.

    Args:
        input_paths (List[str]): The list of input file to predict
        wakeword_models (List[str])): The paths to the wakeword model files
        prediction_function (str): The name of the method used to predict on the input audio files
                                   (default is the `predict_clip` method)
        ncpu (int): How many processes to create (up to max of available CPUs)
        inference_framework (str): The inference framework to use when for model prediction. Options are
                                    "tflite" or "onnx". The default is "tflite" as this results in better
                                    efficiency on common platforms (x86, ARM64), but in some deployment
                                    scenarios ONNX models may be preferable.
        kwargs (dict): Any other keyword arguments to pass to the model initialization or
                       specified prediction function

    Returns:
        dict: A dictionary containing the predictions for each file, with the filepath as the key
    r   c                    s   g | ]
} ||  qS r#   r#   .0r{   )r   	n_batchesr#   r$   
<listcomp>      z bulk_predict.<locals>.<listcomp>r   rv   c                 S   s&   i | ]\}}|t jjjjv r||qS r#   )openwakewordModelrb   __code__co_varnamesr   keyvaluer#   r#   r$   
<dictcomp>  s    z bulk_predict.<locals>.<dictcomp>)r   r   c                    sZ   g }| D ]!}t d   fdd D }|| |fi |i q| d S )Nrv   c                    s"   i | ]\}}| j jv r||qS r#   )r   r   r   funcr#   r$   r     s    z+bulk_predict.<locals>.f.<locals>.<dictcomp>)getattritemsrx   put)ZclipsresultsZclipfiltered_kwargs)ry   mdlsr   qr   r$   f  s   zbulk_predict.<locals>.f)targetargsg{Gz?c                 S   s*   i | ]}t | d  t | d  qS )r   )ri   keysvaluesr   r#   r#   r$   r     s   * Nr#   )r   rl   rw   rx   r   r   r   r   r   startrY   timesleepr   get)r   r   r   r   r   ry   r   chunksr{   Zpschunkr   Zowwr   pr   r#   )r   ry   r   r   r   r   r$   bulk_predict  s8   *
	

r   r   c                 C   s`  ddl m} t|d}||d }||d |d f}	t|dtj|	d}
d}t| }|jd }||kr>t	d| d	| d
|j
||d}||
|||jd  ddddf< ||jd 7 }|
  t| || ddD ]>}||krs n7|j
|||d}||jd  |kr|d||  }||
|||jd  ddddf< ||jd 7 }|
  qk|| dS )a>  
    Computes audio features from a generator that produces Numpy arrays of shape (batch_size, samples)
    containing 16-bit PCM audio data.

    Args:
        generator (Generator): The generator that process the arrays of audio data
        n_total (int): The total number of rows (audio clips) that the generator will produce.
                       Ideally this is precise, but it can be approximate as well as the output
                       .npy file will be automatically trimmed to remove empty values.
        clip_duration (float): The duration (in samples) of the audio produced by the generator
        output_file (str): The output file (.npy) containing the audio features. Note that this file
                           will be written to using memmap arrays, so it can be substantially larger
                           than the available system memory.
        device (str): The device ("cpu" or "gpu") to use for computing features.
        ncpu (int): The number of cores to use when process the audio features (if computing on CPU)

    Returns:
        None
    r   )	trim_mmap)r   r   r   zw+)moderk   r.   zThe value of 'n_total' (z) is less than the batch size (z1). Please increase 'n_total' to be >= batch size.)r   NzComputing features)totaldescr   )Zopenwakeword.datar   r   r   r   rT   rm   nextr.   rF   r   flushr   )	generatorZn_totalZclip_durationoutput_filer   r   r   FZn_feature_colsZoutput_shapefpZrow_counterZ
audio_datar   featuresr#   r#   r$   compute_features_from_generator  s0   

$$
r   c           	   	   C   s   |  dd }tj| ddW}|durt|dd| d}nt|jdd	}t|dd| d}ttj	||d
}|j
ddD ]}|| |t| q@W d   n1 sYw   Y  W d   n1 shw   Y  |  dS )zcA simple function to download a file from a URL with a progress bar using only the requests library/rv   T)streamNZiB)r   unitZ
unit_scaler   zcontent-lengthr   wbi    )
chunk_size)splitrequestsr   r   r   headersopenrG   rH   rI   iter_contentwriteupdaterl   r   )	urltarget_directory	file_sizeZlocal_filenamerprogress_barZ
total_sizer   r   r#   r#   r$   download_file]  s   
r   r   r   model_namesr   c              	      s  t | ts	tdtj|st| tj	 D ]&}tjtj
||d dd s?t|d | t|d dd| qtj	 D ]}tjtj
||d dd s`t|d | qEdd tj	 D }d	d tj	 D }| g kr| D ]6  fd
dt||D }|g krtjtj
||d dd st|d | t|d dd| q{dS |D ] }tjtj
||dd st|| t|dd| qdS )a  
    Download the specified models from the release assets in the openWakeWord GitHub repository.
    Uses the official urls in the MODELS dictionary in openwakeword/__init__.py.

    Args:
        model_names (List[str]): The names of the models to download (e.g., hey_jarvis_v0.1). Both ONNX and
                                 tflite models will be downloaded. If not provided (the default),
                                 the latest versions of all models will be downloaded.
        target_directory (str): The directory to save the models to. Defaults to the install location
                                of openWakeWord (i.e., the `resources/models` directory).
    Returns:
        None
    z2The model_names argument must be a list of stringsdownload_urlr   rv   r   r)   c                 S   s   g | ]}|d  qS )r   r#   r   r#   r#   r$   r     s    z#download_models.<locals>.<listcomp>c                 S   s   g | ]}|d   dd qS )r   r   rv   )r   r   r#   r#   r$   r     s    c                    s   g | ]
\}} |v r|qS r#   r#   )r   r{   r   Z
model_namer#   r$   r     r   r   N)rh   ri   rF   rG   rH   existsmakedirsr   ZFEATURE_MODELSr   rI   r   r   replaceZ
VAD_MODELSZMODELSr   )r   r   Zfeature_modelZ	vad_modelZofficial_model_urlsZofficial_model_namesr   Zofficial_model_urlr#   r   r$   download_modelsq  s<   

$$$ 
r   c                    s    fdd}|S )Nc                    s    fdd}|S )Nc                     sX   i }|  D ]\}}|v rtd| d|  d ||||< q | i |S )NzDEPRECATION: keyword argument 'zB' is no longer valid and will be removed in future releases. Use 'z
' instead.)r   loggingwarningr   )r   ry   Z
new_kwargskv)r   	kwarg_mapr#   r$   wrapped  s   
z*re_arg.<locals>.decorator.<locals>.wrappedr#   )r   r  r  r   r$   	decorator  s   zre_arg.<locals>.decoratorr#   )r  r  r#   r  r$   re_arg  s   
r  )r   r   r(   )r   r   rg   )%rG   numpyrT   rJ   collectionsr   Zmultiprocessing.poolr   multiprocessingr   r   r   r   r   r   Znumpy.lib.formatr   typingr   r	   r
   r   r   r   r   r   r   r   r   rH   rI   rK   rL   rM   rN   r   r  r#   r#   r#   r$   <module>   sR      8

K
?
4