o
    i                     @   s6   d dl Zd dlZd dlZd dlmZ G dd dZdS )    N)dequec                   @   sd   e Zd ZdZejejeje	ddddfde
defdd	Zdd
dZdddZdddZdS )VADz
    A model class for a voice activity detection (VAD) based on Silero's model:

    https://github.com/snakers4/silero-vad
    	resourcesmodelszsilero_vad.onnx   
model_path	n_threadsc                 C   sT   t  }||_||_t j||dgd| _tdd| _t	d
tj| _|   dS )zInitialize the VAD model object.

            Args:
                model_path (str): The path to the Silero VAD ONNX model.
                n_threads (int): The number of threads to use for the VAD model.
        ZCPUExecutionProvider)Zsess_options	providers}   )maxleni>  N)ortZSessionOptionsZinter_op_num_threadsZintra_op_num_threadsZInferenceSessionmodelr   prediction_buffernparrayastypeZint64sample_ratereset_states)selfr   r   ZsessionOptions r   J/home/kim/smarthome/.venv/lib/python3.10/site-packages/openwakeword/vad.py__init__<   s   zVAD.__init__c                 C   s@   t d|dfd| _t d|dfd| _d| _d| _d S )N   @   float32r   )r   Zzerosr   _h_cZ_last_srZ_last_batch_size)r   Z
batch_sizer   r   r   r   \   s   
zVAD.reset_states  c           	         s    fddt djd  D }g }|D ]%}|d | j| j| jd}| jd|}|\}| _| _||d d  qt	|S )aI  
        Get the VAD predictions for the input audio frame.

        Args:
            x (np.ndarray): The input audio, must be 16 khz and 16-bit PCM format.
                            If longer than the input frame, will be split into
                            chunks of length `frame_size` and the predictions for
                            each chunk returned. Must be a length that is integer
                            multiples of the `frame_size` argument.
            frame_size (int): The frame size in samples. The reccomended
                              default is 480 samples (30 ms @ 16khz),
                              but smaller and larger values
                              can be used (though performance may decrease).

        Returns
            float: The average predicted score for the audio frame
        c                    s(   g | ]}||   d   tjqS )i  )r   r   r   ).0i
frame_sizexr   r   
<listcomp>t   s     zVAD.predict.<locals>.<listcomp>r   N)inputhcsrN)
rangeshaper   r   r   r   runappendr   mean)	r   r"   r!   chunksZframe_predictionschunkZ
ort_inputsZort_outsoutr   r    r   predictb   s   
zVAD.predict  c                 C   s   | j | || d S r$   )r   r,   r1   )r   r"   r!   r   r   r   __call__   s   zVAD.__call__N)r   )r   )r2   )__name__
__module____qualname____doc__ospathjoindirnameabspath__file__strintr   r   r1   r3   r   r   r   r   r   6   s"    

 
r   )Zonnxruntimer   numpyr   r8   collectionsr   r   r   r   r   r   <module>   s
   /