o
    ¢ÄiÑg  ã                   @   s–   d dl Zd dlZd dlmZmZ d dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZmZ d dl	mZ d dlZd dlmZmZmZmZ G dd„ dƒZdS )é    N)ÚAudioFeaturesÚre_arg)ÚdequeÚdefaultdict)Úpartial)ÚListÚUnionÚDefaultDictÚDictc                   @   sä   e Zd ZdZeddiƒg g ddi ddfdee dee d	ed
e	dede	defdd„ƒZ
dd„ Zdd„ Zi i ddfdejdedede	def
dd„Zd-deeejf defd d!„Z	"	#d.d$ede	d%efd&d'„Zd/dejd)efd*d+„Zd,S )0ÚModelz­
    The main model class for openWakeWord. Creates a model object with the shared audio pre-processer
    and for arbitrarily many custom wake word/wake phrase models.
    Zwakeword_model_pathsÚwakeword_modelsFr   gš™™™™™¹?ÚtfliteÚclass_mapping_dictsÚenable_speex_noise_suppressionÚvad_thresholdÚcustom_verifier_modelsÚcustom_verifier_thresholdÚinference_frameworkc                    s\  t  |¡}	g }
|g kr|	}tt j ¡ ƒ}
nDt|ƒdkrYt|ƒD ]9\}‰ tj 	ˆ ¡r9|
 
tj tj ˆ ¡¡d ¡ q‡ fdd„|	D ƒ}|g krMtd ˆ ¡ƒ‚|d ||< |
 
ˆ ¡ qi | _i | _i | _i | _i | _i | _|| _|dkr¸zddlm} dd	„ }W n9 ty·   t d
¡ |g krštdd„ |D ƒƒršd}n|g kr±tdd„ |D ƒƒr±d}dd„ |D ƒ}ntdƒ‚Y nw |dkrÒz
ddl}dd„ }W n tyÑ   tdƒ‚w t||
ƒD ]\}}|dkr+d|v rétdƒ‚| ¡ }d|_d|_ |j!||dgd| j|< | j|  "¡ d j#d | j|< | j|  $¡ d j#d | j|< t% &|| j| ¡}|| j|< |dkrd|v r9tdƒ‚|j'|dd| j|< | j|  (¡  | j|  )¡ d d d | j|< | j|  *¡ d d d | j|< | j|  )¡ d d }| j|  *¡ d d }t% &|| j| ||¡}|| j|< |rª|| +|¡  ,|d¡rª|| +|¡ | j|< n!t j- ,|d¡r»t j-| | j|< ndd„ t.d| j| ƒD ƒ| j|< t/|t0ƒrå| ,|d¡råt1 2t3|| dƒ¡| j|< t| j ¡ ƒt| ¡ ƒk r÷td ƒ‚q×t4t&t5d!d"ƒ| _6|rdd#l7m8} | 9d$d%¡| _:nd| _:|| _;|dkr"t  <¡ | _=t>d'd&|i|¤Ž| _?dS )(aQ  Initialize the openWakeWord model object.

        Args:
            wakeword_models (List[str]): A list of paths of ONNX/tflite models to load into the openWakeWord model object.
                                              If not provided, will load all of the pre-trained models. Alternatively,
                                              just the names of pre-trained models can be provided to select a subset of models.
            class_mapping_dicts (List[dict]): A list of dictionaries with integer to string class mappings for
                                              each model in the `wakeword_models` arguments
                                              (e.g., {"0": "class_1", "1": "class_2"})
            enable_speex_noise_suppression (bool): Whether to use the noise suppresion from the SpeexDSP
                                                   library to pre-process all incoming audio. May increase
                                                   model performance when reasonably stationary background noise
                                                   is present in the environment where openWakeWord will be used.
                                                   It is very lightweight, so enabling it doesn't significantly
                                                   impact efficiency.
            vad_threshold (float): Whether to use a voice activity detection model (VAD) from Silero
                                   (https://github.com/snakers4/silero-vad) to filter predictions.
                                   For every input audio frame, a VAD score is obtained and only those model predictions
                                   with VAD scores above the threshold will be returned. The default value (0),
                                   disables voice activity detection entirely.
            custom_verifier_models (dict): A dictionary of paths to custom verifier models, where
                                           the keys are the model names (corresponding to the openwakeword.MODELS
                                           attribute) and the values are the filepaths of the
                                           custom verifier models.
            custom_verifier_threshold (float): The score threshold to use a custom verifier model. If the score
                                               from a model for a given frame is greater than this value, the
                                               associated custom verifier model will also predict on that frame, and
                                               the verifier score will be returned.
            inference_framework (str): The inference framework to use when for model prediction. Options are
                                       "tflite" or "onnx". The default is "tflite" as this results in better
                                       efficiency on common platforms (x86, ARM64), but in some deployment
                                       scenarios ONNX models may be preferable.
            kwargs (dict): Any other keyword arguments to pass the the preprocessor instance
        é   r   c                    s.   g | ]}ˆ   d d¡| tjj¡d v r|‘qS )ú Ú_éÿÿÿÿ)ÚreplaceÚsplitÚosÚpathÚsep)Ú.0Új©Úi© úL/home/kim/smarthome/.venv/lib/python3.10/site-packages/openwakeword/model.pyÚ
<listcomp>_   s   . z"Model.__init__.<locals>.<listcomp>z3Could not find pretrained model for model name '{}'r   Nc                 S   s"   |   ||¡ |  ¡  |  |¡d S )N©N)Z
set_tensorZinvokeZ
get_tensor)Ztflite_interpreterZinput_indexZoutput_indexÚxr!   r!   r"   Útflite_predictt   s   z&Model.__init__.<locals>.tflite_predictzŠTried to import the tflite runtime, but it was not found. Trying to switching to onnxruntime instead, if appropriate models are available.c                 S   s   g | ]}d |v ‘qS )ú.onnxr!   ©r   r    r!   r!   r"   r#   |   ó    Zonnxc                 S   s    g | ]}t j | d d¡¡‘qS ©ú.tfliter'   )r   r   Úexistsr   r(   r!   r!   r"   r#   ~   s     c                 S   s   g | ]}|  d d¡‘qS r*   )r   r(   r!   r!   r"   r#   €   s    z‰Tried to import the tflite runtime for provided tflite models, but it was not found. Please install it using `pip install tflite-runtime`c                 S   s   |   d |  ¡ d j|i¡S )Nr   )ÚrunÚ
get_inputsÚname)Z
onnx_modelr%   r!   r!   r"   Úonnx_predict‰   s   z$Model.__init__.<locals>.onnx_predictzdTried to import onnxruntime, but it was not found. Please install it using `pip install onnxruntime`r+   zJThe onnx inference framework is selected, but tflite models were provided!ZCPUExecutionProvider)Zsess_optionsÚ	providersr'   zJThe tflite inference framework is selected, but onnx models were provided!)Z
model_pathZnum_threadsÚshapeÚindexc                 S   s   i | ]	}t |ƒt |ƒ“qS r!   )Ústrr(   r!   r!   r"   Ú
<dictcomp>¶   s    z"Model.__init__.<locals>.<dictcomp>FÚrbaI  Custom verifier models were provided, but some were not matched with a base model! Make sure that the keys provided in the `custom_verifier_models` dictionary argument exactly match that of the `.models` attribute of an instantiated openWakeWord Model object that has the same base models but doesn't have custom verifier models.é   ©Úmaxlen)ÚNoiseSuppressioné    é€>  r   r!   )@ÚopenwakewordZget_pretrained_model_pathsÚlistZMODELSÚkeysÚlenÚ	enumerater   r   r,   ÚappendÚsplitextÚbasenameÚ
ValueErrorÚformatÚmodelsÚmodel_inputsÚmodel_outputsÚmodel_prediction_functionÚclass_mappingr   r   Ztflite_runtime.interpreterÚinterpreterÚImportErrorÚloggingÚwarningÚallZonnxruntimeÚzipZSessionOptionsZinter_op_num_threadsZintra_op_num_threadsZInferenceSessionr.   r2   Zget_outputsÚ	functoolsr   ZInterpreterZallocate_tensorsZget_input_detailsZget_output_detailsr3   ÚgetZmodel_class_mappingsÚrangeÚ
isinstanceÚdictÚpickleÚloadÚopenr   r   Úprediction_bufferZspeexdsp_nsr:   ÚcreateÚspeex_nsr   ZVADÚvadr   Úpreprocessor)Úselfr   r   r   r   r   r   r   ÚkwargsZpretrained_model_pathsZwakeword_model_namesZndxZmatching_modelr   r&   Zortr0   Zmdl_pathZmdl_nameZsessionOptionsZpred_functionZtflite_input_indexZtflite_output_indexr:   r!   r   r"   Ú__init__%   s²   
/ 
€÷ÿ
ÿ


  
 ÿÿ	

zModel.__init__c                 C   sJ   d}| j  ¡ D ]}|| j |  ¡ v r|}q|| j  ¡ v r"||kr"|}q|S )z>Gets the parent model associated with a given prediction labelÚ )rK   r?   Úvalues)r_   ÚlabelÚparent_modelÚmdlr!   r!   r"   Úget_parent_model_from_label×   s   €z!Model.get_parent_model_from_labelc                 C   s    t ttddƒ| _| j ¡  dS )z•Reset the prediction and audio feature buffers. Useful for re-initializing the model, though may not be efficient
        when called too frequently.r7   r8   N)r   r   r   rZ   r^   Úreset)r_   r!   r!   r"   rh   â   s   zModel.resetç        r%   ÚpatienceÚ	thresholdÚdebounce_timeÚtimingc              	   C   s¼  t |tjƒstdt|ƒ› dƒ‚|ri }i |d< t ¡ }| jr(|  |  |¡¡}n|  |¡}|r9t ¡ | |d d< i }	| j	 
¡ D ]}
|rIt ¡ }|dkr‚g }t |d d dd¡D ]}| | j|
 | jj| j|
 | j|
  | dƒ¡ qZt |¡jd	d
d }nV|dkr•| j|
 | j | j|
 ¡ƒ}nC|dk rØ| j|
 dkrºt| j|
 ƒd	kr´| j|
 d ggg}n$d	ggg}n| j|
 dkrØtdd„ | j|
  
¡ D ƒƒ}d	g|d  gg}| j|
 dkrê|d	 d	 d	 |	|
< n| j|
  ¡ D ]\}}|d	 d	 t|ƒ |	|< qñ| ji kr<|	 
¡ D ]/}|	| | jkr:|  |¡}| j |d¡r:| j|  | j | j|
 ¡¡d	 d }||	|< q|	 
¡ D ]}t| j| ƒdk rPd|	|< q@|r_t ¡ | |d |
< q@|i ksj|d	krú|i krstdƒ‚|i kr|d	krtdƒ‚|	 
¡ D ]t}
|  |
¡}|	|
 dkrø|| 
¡ v r¼t | j|
 ¡||  d… }||| k ¡ || k rºd|	|
< q…|d	krø|| 
¡ v røtt ||d  ¡ƒ}t | j|
 ¡| d… }|	|
 || krø||| k ¡ d	krød|	|
< q…|	 
¡ D ]}
| j|
  |	|
 ¡ qþ| j d	krU|rt ¡ }|  !|¡ |r+t ¡ | |d d< t"| j!jƒdd… }t|ƒd	krAt |¡nd	}|	 
¡ D ]}
|| j k rSd|	|
< qG|r\|	|fS |	S )a  Predict with all of the wakeword models on the input audio frames

        Args:
            x (ndarray): The input audio data to predict on with the models. Ideally should be multiples of 80 ms
                                (1280 samples), with longer lengths reducing overall CPU usage
                                but decreasing detection latency. Input audio with durations greater than or less
                                than 80 ms is also supported, though this will add a detection delay of up to 80 ms
                                as the appropriate number of samples are accumulated.
            patience (dict): How many consecutive frames (of 1280 samples or 80 ms) above the threshold that must
                             be observed before the current frame will be returned as non-zero.
                             Must be provided as an a dictionary where the keys are the
                             model names and the values are the number of frames. Can reduce false-positive
                             detections at the cost of a lower true-positive rate.
                             By default, this behavior is disabled.
            threshold (dict): The threshold values to use when the `patience` or `debounce_time` behavior is enabled.
                              Must be provided as an a dictionary where the keys are the
                              model names and the values are the thresholds.
            debounce_time (float): The time (in seconds) to wait before returning another non-zero prediction
                                   after a non-zero prediction. Can preven multiple detections of the same wake-word.
            timing (bool): Whether to return timing information of the models. Can be useful to debug and
                           assess how efficiently models are running on the current hardware.

        Returns:
            dict: A dictionary of scores between 0 and 1 for each model, where 0 indicates no
                  wake-word/wake-phrase detected. If the `timing` argument is true, returns a
                  tuple of dicts containing model predictions and timing information, respectively.
        zSThe input audio data (x) must by a Numpy array, instead received an object of type Ú.rG   r^   é   r   r   )Z	start_ndxr   )Zaxisr$   c                 S   s   g | ]}t |ƒ‘qS r!   )Úintr(   r!   r!   r"   r#   6  r)   z!Model.predict.<locals>.<listcomp>Fé   ri   zjError! When using the `patience` argument, threshold values must be provided via the `threshold` argument!zLError! The `patience` and `debounce_time` arguments cannot be used together!Nr<   r]   iùÿÿÿéüÿÿÿ)#rU   ÚnpÚndarrayrE   ÚtypeÚtimer\   r^   Ú_suppress_noise_with_speexrG   r?   ZarangeÚextendrJ   Úget_featuresrH   ÚarrayÚmaxrI   r@   rZ   rK   Úitemsrp   r   r   rg   rS   Zpredict_probaÚsumÚceilrB   r   r]   r>   )r_   r%   rj   rk   rl   rm   Ztiming_dictZfeature_startZn_prepared_samplesÚpredictionsrf   Zmodel_startZgroup_predictionsr    Z
predictionZ	n_classesZ	int_labelÚclsre   Zverifier_predictionZscoresZn_framesZrecent_predictionsZ	vad_startZ
vad_framesZvad_max_scorer!   r!   r"   Úpredictè   sÌ   
þÿÿÿ

ÿþþ€€€

€
€
€zModel.predictr   ro   ÚclipÚpaddingc           
      K   sè   t |tƒr*tj|dd}tj| | ¡ ¡tjd}W d  ƒ n1 s$w   Y  nt |tj	ƒr2|}|rNt 
t d| ¡ tj¡|t d| ¡ tj¡f¡}g }|}td|jd | |ƒD ]}	| | j||	|	| … fi |¤Ž¡ q]|S )aC  Predict on an full audio clip, simulating streaming prediction.
        The input clip must bit a 16-bit, 16 khz, single-channel WAV file.

        Args:
            clip (Union[str, np.ndarray]): The path to a 16-bit PCM, 16 khz, single-channel WAV file,
                                           or an 1D array containing the same type of data
            padding (int): How many seconds of silence to pad the start/end of the clip with
                            to make sure that short clips can be processed correctly (default: 1)
            chunk_size (int): The size (in samples) of each chunk of audio to pass to the model
            kwargs: Any keyword arguments to pass to the class `predict` method

        Returns:
            list: A list containing the frame-level prediction dictionaries for the audio clip
        r6   ©Úmode©ZdtypeNr<   r   )rU   r4   ÚwaverY   rs   Ú
frombufferÚ
readframesÚ
getnframesÚint16rt   ZconcatenateZzerosZastyperT   r2   rB   r   )
r_   r‚   rƒ   Ú
chunk_sizer`   ÚfÚdatar   Ú	step_sizer    r!   r!   r"   Úpredict_clip„  s&   
þ€ýÿ	&zModel.predict_clipç      à?ÚfeaturesÚfileÚreturn_typec                 K   s>  t j|dd}tj| | ¡ ¡tjd}W d  ƒ n1 sw   Y  ttƒ}d}t	d|j
d | |ƒD ]U}	| j||	|	| … fi |¤Ž}
|
 ¡ D ]?}|
| |kr‰|  |¡}| j | j| ¡}|dkrk||  |¡ |dkr‰|td|	d	 ƒ|	d
 … }t|ƒdkr‰||  |¡ qJq5i }| ¡ D ]}t || ¡||< q‘|S )a-  
        Gets predictions for the input audio data, and returns the audio features (embeddings)
        or audio data for all of the frames with a score above the `threshold` argument.
        Can be a useful way to collect false-positive predictions.

        Args:
            file (str): The path to a 16-bit 16khz WAV audio file to process
            threshold (float): The minimum score required for a frame of audio features
                               to be returned.
            return_type (str): The type of data to return when a positive prediction is
                               detected. Can be either 'features' or 'audio' to return
                               audio embeddings or raw audio data, respectively.
            kwargs: Any keyword arguments to pass to the class `predict` method

        Returns:
            dict: A dictionary with filenames as keys and  N x M arrays as values,
                  where N is the number of examples and M is the number
                  of audio features, depending on the model input shape.
        r6   r„   r†   Nro   r   r’   Zaudioi€»  r<   i ú  )r‡   rY   rs   rˆ   r‰   rŠ   r‹   r   r>   rT   r2   r   r?   rg   r^   ry   rH   rB   r{   r@   Zvstack)r_   r“   rk   r”   r`   r   rŽ   Zpositive_datar   r    r   Zlblrf   r’   ÚcontextZpositive_data_combinedr!   r!   r"   Ú_get_positive_prediction_frames¬  s.   þ
€÷z%Model._get_positive_prediction_framesr;   Ú
frame_sizec                 C   s^   g }t d|jd |ƒD ]}|||| … }| | j | ¡ ¡¡ qd |¡}t |tj	¡}|S )aÄ  
        Runs the input audio through the SpeexDSP noise suppression algorithm.
        Note that this function updates the state of the existing Speex noise
        suppression object, and isn't intended to be called externally.

        Args:
            x (ndarray): The 16-bit, 16khz audio to process. Must always be an
                         integer multiple of `frame_size`.
            frame_size (int): The frame size to use for the Speex Noise suppressor.
                              Must match the frame size specified during the
                              initialization of the noise suppressor.

        Returns:
            ndarray: The input audio with noise suppression applied
        r   ó    )
rT   r2   rB   r\   ÚprocessÚtobytesÚjoinrs   rˆ   r‹   )r_   r%   r—   Úcleanedr    ÚchunkZcleaned_bytestringZcleaned_arrayr!   r!   r"   rw   á  s   
z Model._suppress_noise_with_speexN)r   ro   )r‘   r’   )r;   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r4   rV   ÚboolÚfloatra   rg   rh   rs   rt   r   r   rp   r   r–   rw   r!   r!   r!   r"   r       sb    
øþýüûúùø 2ÿÿÿ
ÿ +üþý
ü5r   )Únumpyrs   r=   Zopenwakeword.utilsr   r   r‡   r   rN   rR   rW   Úcollectionsr   r   r   rv   Útypingr   r   r	   r
   r   r!   r!   r!   r"   Ú<module>   s   