o
    i                     @   s   d dl Z d dlZd dlZd dlmZ d dlZd dlZd dlm	Z	m
Z
mZ d dlmZ d dlmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ eeZg dZ G dd dZ!dS )    N)Path)add_cache_indirection_to_mhaadd_output_qk_to_mhafix_past_sequence_length)optimize_model)AutoTokenizerWhisperConfigWhisperForConditionalGenerationWhisperProcessor)WhisperDecoder)WhisperEncoder)WhisperEncoderDecoderInit)WhisperJumpTimes)InferenceSession)zwhisper-tinyzwhisper-tiny.enzwhisper-basezwhisper-base.enzwhisper-smallzwhisper-small.enzwhisper-mediumzwhisper-medium.enzwhisper-largezwhisper-large-v2zwhisper-large-v3zwhisper-large-v3-turboc                   @   s  e Zd Ze		dAdededededef
dd	Zeded
ededededededededdfddZe			dBdedededej	dej
dedededeeejjf fddZedeeB eB ded
ededed ed!ed"ed#efd$d%Ze			&				dCded'ed(ed)ed*ed+eded,ed
ed-edededefd.d/Ze	0	dDd1ed2ejjdej	d3ed4ef
d5d6Zed3ed4efd7d8Zed9ejd:ejd;efd<d=Ze	0	dDdeded>edej	d3ed4efd?d@ZdS )EWhisperHelper F
output_dirmodel_name_or_pathsuffix
new_folderreturnc                 C   s^   |}t j|rt|jd }n|dd }||7 }|r$t j| |n| }t j||d S )a  Build onnx path

        Args:
            output_dir (str): output directory
            model_name_or_path (str): pretrained model name, or path to the model checkpoint
            suffix (str, optional): suffix like "_encoder" or "_decoder_fp16" will be appended to file name. Defaults to None.
            new_folder (bool, optional): create a new directory for the model. Defaults to False.
        Returns:
            str: path of onnx model
        /z.onnx)ospathisdirr   partssplitjoin)r   r   r   r   Z
model_name	directory r    p/home/kim/smarthome/.venv/lib/python3.10/site-packages/onnxruntime/transformers/models/whisper/whisper_helper.pyget_onnx_path*   s   zWhisperHelper.get_onnx_pathprovider!separate_encoder_and_decoder_inituse_decoder_masked_mha	output_qkencoder_pathdecoder_path	cache_dirNc	                 C   sX  t j| |d}	|	| tj| |d}
|
| tj| |d}|| |r(d S ddddddiddd	d
d
dg dddidddddd
|	jddigii}tj|dd}tt	j
|dd}|| W d    n1 spw   Y  d|v r{g n| i ig}|	j|	jd|dt	j
||	j|	j |	jddddddd d!d"d#|	j|	j|	jd$	d|dt	j
||	j|	j |	jd%d%id&d'd(d)|	j|	j|	jd$	|	j|	jd*|	jd+d,d-d.d/|	jd0d0d1d1|d/d/d1d/d2d3}|r|d4 d5 d6 d7d8d9 |r|d4 d5 d: d;d<i tt	j
|d=d}tj||dd W d    d S 1 s%w   Y  d S )>Nr)   Zfeature_extractionsequenceZ	operationZaudio_decoderZAudioDecoder)nametypeZSTFTZSTFTNormi     (               ,?    +0?    @0B?    @*P?g    @Y?    -b?    `h?    (&p?    xnt?    6y?     ~?g    L#?    F?g    H?g    K?g    ?g    %?g    bT?g    L?g    ?g    ?g    D?g    ?g    +?g    |?g    ?g    G?g    A]?g    ?g    ?g    ?g    ^?g    а?g   4ӱ?g    ܲ?g   ?g    D?g   9"?g    F?g   !r?g    ?g   3ܺ?g   ?g   _?g   ?g   \?g   ?g   @X?g    8
?g   @þ?g   @*v?g   a0?g    ]?g   @?g   qo?g   @r4?g   ?g   !?g    ?g    a?g   @3?g   ?g   ?g   ?g   b?g   k?g    4%?g   y?g   @?g    x?g   ?g   @`?g    ?g   J?g   ?g   R8?g   U?g    )?g    x?g   ?g   8?g   @?g   @Z?g   
?g   ?g   ?g   `Ă?g     ?g   ?g   `}?g   }?g   `E?g   }?g   ?g   C}?g    ?g   ~?g   ?g   S?      ?g   `V@?g   ?g   ?g   0?g   `^A?g   w?g   x?g   ]?g   !A?g   ?g   8?g   ?g    >?g   }?g   `1?g   `?g    8?g   `v?g   c?g   ?g    .?g   xk?g   @է?g   ?g   x?g   Z?g   @?g    ?g   	?g   C?g    |?g   C?g   @f?g   %?g   '\?g   @?g   @?g   @K?g   83?g   g?g   `R?g   w?g    ?g   2?g   #d?g    ?g    ?g   ?g   u"?g   @OP?g    r}?g    ۩?g   ?g   t ?g   *?g   T?g   |?g   y?g   @?   ?g   @$?g   ;?   w_?    ^?    j?g   ?g   @?g   @Z?g   #?g    A?g   @R^?g    ,z?g   ?g   ?g    6?g   `]?g   @?    ?g   *#?g    7?g   J?g    ]]?   n?g    Q?   ӎ?g    [?   ?g   r?    ?   @?    #?g   ?g   C?   `?   _?   ?    t?   ?   ~?   rL   rK   rJ   rI   rH   rG   g   C?g   ?rF   rE   rD   g   r?rC   g   Z?rB   g    Q?rA   g   \]?g   J?g    7?g   @*#?r@   g    ?g   @]?g   5?g   `?g   ?g   +z?g    R^?g    A?g   @#?g    Z?g    ?g   @?r?   r>   r=   g   ;?g    $?r<   g    ?g   y?g   |?g   T?g   `*?g   `t ?g   @?g   ک?g   q}?g    OP?g   @u"?g   ?g   ?g   ?g   #d?g   @2?g   @ ?g   w?g    R?g   @g?g   83?g    K?g    ?g   ྒ?g   @'\?g   `%?g   e?g    C?g   |?g   @C?g   `	?g   ?g   ?g   Z?g   `x?g   @?g   ԧ?g    xk?g   .?g   ?g   @c?g    v?g   8?g   ?g   0?g   `}?g   >?g    ?g   @8?g    ?g   @!A?g    ]?g   @x?g   @w?g   ]A?g    0?g   @?g    ?g   @V@?r;   g   `S?g   ?g   ~?g   ?g   @C}?g   ?g   `}?g    E?g   }?g    }?g   `?g    ?g    Ă?g   @?g   `?g   @
?g   Y?g    ?g   8?g    ?g   w?g   )?g   @U?g   R8?g   @?g   @J?g   ?g   `?g   @?g   x?g   ?g    y?g   3%?g   k?g   @a?g   ?g   ?g   ?g   @3?g    a?g    ?g    ?g   @?g    q4?g   po?g    ?g    \?g   @`0?g   @)v?g   @¾?g    7
?g    X?g   @?g   Z?g   ?g   _?g   ?g   1ܺ?g    ?g   r?g    F?g   7"?g    B?g    ?g    ܲ?g   2ӱ?g    а?g    Z?g    ?g    ?g    ?g    >]?g    C?g    ?g    |?g    (?g    ?g    D?g    ?g    ?g    F?g    ^T?g    %?g    ?g    K?g    D?r:   g    H#?r9   r8   r7   r6   r5   r4   g    @Y?r3   r2   r1   r0   )n_fftZframe_length
hop_length_comment)r,   r-   attrsZlog_mel_spectrogramZLogMelSpectrum   )
chunk_sizerO   rN   Zn_mel   )indentzaudio_processor_config.jsonwcpuzonnxruntime-genai)Zlog_idprovider_options	input_idszpast_key_self_%dzpast_value_self_%dzpast_key_cross_%dzpast_value_cross_%d)rY   Zpast_key_namesZpast_value_namesZcross_past_key_namesZcross_past_value_nameslogitszpresent_key_self_%dzpresent_value_self_%d)rZ   Zpresent_key_namesZpresent_value_names)	Zsession_optionsfilenameZ	head_sizehidden_sizeinputsoutputsnum_attention_headsZnum_hidden_layersZnum_key_value_headsZaudio_featuresencoder_hidden_stateszpresent_key_cross_%dzpresent_value_cross_%d)r`   Zcross_present_key_namesZcross_present_value_nameswhisper)bos_token_idZcontext_lengthdecoderencodereos_token_idpad_token_idr-   
vocab_sizer/   FT      ?r   rM   )Zdiversity_penaltyZ	do_sampleearly_stoppinglength_penalty
max_length
min_lengthZno_repeat_ngram_size	num_beamsnum_return_sequencesZpast_present_share_bufferrepetition_penaltytemperatureZtop_kZtop_p)modelsearchrq   rc   r]   past_sequence_lengthcache_indirection)rs   rt   r^   Zoutput_cross_qk_nameszoutput_cross_qk_%dzgenai_config.json)r   from_pretrainedZsave_pretrainedr   r
   Znum_mel_binsjsondumpsopenr   r   r   writerb   rk   basenameZd_modelZdecoder_attention_headsZdecoder_layersZencoder_attention_headsZencoder_layersre   rf   rg   updatedump)r   r#   r$   r%   r&   r'   r(   r   r)   config	tokenizer	processorZaudio_processor_cfgZaudio_processor_jsonfrX   Zgenai_configr    r    r!   save_processingF   s   


   !      .



3K$zWhisperHelper.save_processingT
model_impldevicedtypemerge_encoder_and_decoder_initno_beam_search_opc                 C   s  |dkrt j| |dd}n!ddl}	tj| s#| dd dd }
n| }
|	j|
||d	d
}| j	|d |dkr@|j	|d t
j| |d}t|||| }d|i}|rgt|||| }|d|i nt||| }|||d |rt||| }|d|i |S )a|  Load model given a pretrained name or path, then build models for ONNX conversion.

        Args:
            model_name_or_path (str): pretrained model name or path
            model_impl (str): library to load model from
            cache_dir (str): cache directory
            device (torch.device): device to run the model
            dtype (torch.dtype): dtype to run the model
            merge_encoder_and_decoder_init (bool, optional): Whether merge encoder and decoder initialization into one ONNX model. Defaults to True.
            no_beam_search_op (bool, optional): Whether to use beam search op or not. Defaults to False.
            output_qk (bool, optional): Whether to output QKs to calculate batched jump times for word-level timestamps. Defaults to False.
        Returns:
            Dict[str, torch.nn.Module]: mapping from name to modules for ONNX conversion.
        Zhfeagerr)   Zattn_implementationr   Nr   r      T)Zdownload_rootZ	in_memoryr   r   r*   rc   rd   )rd   Zdecoder_initZ
jump_times)r	   ru   ra   r   r   existsr   
load_modelevaltor   r   r   r{   r   r   )r   r   r)   r   r   r   r   r&   rq   ra   Zname_or_pathr}   rc   
componentsZencoder_decoder_initrd   Zbatched_jump_timesr    r    r!   r   m  s0   zWhisperHelper.load_modelrq   onnx_model_pathverboseuse_external_data_formatuse_fp16_inputsuse_int32_inputsuse_encoder_hidden_statesuse_kv_cache_inputsc	           	   
   C   s   t | tr| ||||| dS t | tr!| |||||| dS t | tr4| |||||||| dS t | trE| |||||| dS tdt|  )a/  Export model component to ONNX

        Args:
            model (class): PyTorch class to export
            onnx_model_path (str): path to save ONNX model
            provider (str): provider to use for verifying parity on ONNX model
            verbose (bool): print verbose information.
            use_external_data_format (bool): use external data format or not.
            use_fp16_inputs (bool): use float16 inputs for the audio_features, encoder_hidden_states, logits, and KV caches.
            use_int32_inputs (bool): use int32 inputs for the decoder_input_ids.
            use_encoder_hidden_states (bool): use encoder_hidden_states as model input for decoder-init/decoder-without-past models.
            use_kv_cache_inputs (bool): use KV caches as model inputs for decoder-with-past models.
        z%Unknown instance for model detected: N)
isinstancer   export_onnxr   r   r   
ValueErrorr-   )	rq   r   r#   r   r   r   r   r   r   r    r    r!   r     sL   




	zWhisperHelper.export_onnxrW   optimized_model_path
is_float16r_   r\   num_decoder_layersuse_gpu
is_decoderc              
   C   s   ddl m} |d}d|_|dk|_t| d||d||dd}|	r?|
r?|r/t|\}}t||}|r?t|tt	dd| dd	}|j
||dd
 dS )zHOptimize ONNX model with an option to convert it to use mixed precision.r   )FusionOptionsZbartTZrocmF)Z
model_typeZ	num_headsr\   	opt_leveloptimization_optionsr   Zonly_onnxruntime   )Zskip_node_idxs)Zall_tensors_to_one_fileN)Zfusion_optionsr   Zuse_multi_head_attentionZ!disable_multi_head_attention_biasr   r   r   r   listrangeZsave_model_to_file)r   r   r   r_   r\   r   r   r   r#   r   r   r%   r&   r   r   mZpast_seq_len_namer    r    r!   optimize_onnx  s*   

zWhisperHelper.optimize_onnxrM   r   pt_model
batch_sizeprompt_modec              
      s@  zddl m} W n, ty4 } z tjd| dd d}td| d t| W Y d }~nd }~ww ddl m} |d	d
dd}g }	|dkrW |d d d gddj}
n/ |d d d gddj |d d d gddjg}	t	|	|ks{J t
|	d |	d f}
d\}}}}d\}}|
|||||||ddd	}|rddg} fdd|D }g }g }t|D ]7}t
|| j|d|d< |	| ||d< |jd i |   }|| | j|ddd  q|
|d< |d= ng }|jd i |   } j|ddd g}t|}|d= |d= ||||fS )!Nr   )load_datasetz.An error occurred while importing `datasets`: T)exc_infozpip install datasetszCCould not import `datasets`. Attempting to install `datasets` via `z`.z)hf-internal-testing/librispeech_asr_dummycleanZ
validation)r   rM   Zaudioarraypt)Zreturn_tensors   )rR   r   rM   rM   )rh   rh   )	input_featuresrk   rl   rm   rn   rj   ro   ri   	use_cachezJohn has doubtszMaria has grave doubtsc                    s   g | ]}  |qS r    )Zget_prompt_ids.0pr   r    r!   
<listcomp>L  s    zBWhisperHelper.pt_transcription_for_verify_onnx.<locals>.<listcomp>r   
prompt_idsr   Zskip_special_tokensri   r   r    )Zdatasetsr   	Exceptionloggererrorwarningr   systemr   lentorchcatr   r   Z
from_numpygeneratedetachrW   numpyappendbatch_decoder   )r   r   r   r   r   r   eZinstall_cmdZdsZinput_features_r   rk   rl   rm   rn   rj   ro   r]   Zpromptsr   pt_transcription
pt_outputsi	pt_outputr    r   r!    pt_transcription_for_verify_onnx  sf   	
z.WhisperHelper.pt_transcription_for_verify_onnxc           
      C   sF   | dkr|rd}d}d}d}||||h}|S d}d}d}	|||	h}|S )	NrM   z{ John has doubts whether Sir Frederick Layton's work is really Greek after all and can discover in it but little of Rocky Izy John has doubts whether Sir Frederick Latins work is really Greek after all and can discover in it but little of Rocky Iz Maria has grave doubts whether Sir Frederick Layton's work is really Greek after all and can discover in it but little of Rockyz Maria has grave doubts whether Sir Frederick Latins work is really Greek after all and can discover in it but little of Rocky IzX Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.zY Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.zZ "Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.r    )
r   r   Z'expected_transcription_no_comma_prompt1Z)expected_transcription_misspelled_prompt1Z'expected_transcription_no_comma_prompt2Z)expected_transcription_misspelled_prompt2expected_transcription_optionsZexpected_transcription_no_commaZ!expected_transcription_with_commaZ+expected_transcription_with_quote_and_commar    r    r!   select_transcription_optionsc  s,   z*WhisperHelper.select_transcription_optionsr   ort_outputsr   c                 C   s   | | || }}|j |j }}||krKt|dkr!|d }|j }t|dkr.|d }|j }|d |d krKt|d |d }|d| }|d| }|j |j ksSJ ||fS )z8Get PyTorch and ONNX Runtime output token ids at index irM   r   N)shaper   min)r   r   r   r   
ort_outputZpt_shapeZ	ort_shapeZmin_lenr    r    r!   get_outputs  s   zWhisperHelper.get_outputsort_sessionc           &      C   s  t j| |dd|}tj| |d}tj| |d}tj|||||d\}	}
}}|jg}|jddd}dd	 |D }|| }d
d	 |	 D }dd	 |	 D }t
jt
jt
jt
jt
jt
jd}d|v }t||ddD ]\}}|dkr~|	|    |	|< qi|dkrt
j|j|| d|	|< qi|dkrt
j||jf|| d|	|< qi|dkr|s|r|gn|g}t
j||| d|	|< qig }t|D ]}|||   qtdd |D }g }|D ]}g ||jg|t|  }|||  qt
j||| d|	|< qi|dkrt
jdg|| d|	|< qi|dkr%t
jddgg|| d|	|< qi|dkr<t
t
j|g|| d|d|	|< qi|dkrNt
jdg|| d|	|< qit
j|	| g|| d|	|< qi|d|	d dddddf }|j |dd}t!||}d} t|D ]#}t"|||\}!}"| t
#|!|"9 } | |
| |v o|| |v 9 } qd}#| st|D ] }t"|||\}!}"|!|" }$t|$$ |$ t%d }%t|#|%}#q|#dkrt&'d!|
  t&'d"|  dS )#zRCompare the result from PyTorch and ONNX Runtime to verify the ONNX model is good.r   r   r*   )r   r   englishZ
transcribe)languagetaskc                 S   s   g | ]}|d  qS )rM   r    )r   tokenr    r    r!   r     s    z-WhisperHelper.verify_onnx.<locals>.<listcomp>c                 S      g | ]}|j qS r    )r,   r   entryr    r    r!   r         c                 S   r   r    )r-   r   r    r    r!   r     r   )ztensor(float)ztensor(float16)ztensor(int64)ztensor(int32)ztensor(int8)ztensor(uint8)Zextra_decoding_idsF)strictr   Z
vocab_maskr   Zprefix_vocab_maskZdecoder_input_idsc                 s   s    | ]}t |V  qd S )N)r   r   r    r    r!   	<genexpr>  s    z,WhisperHelper.verify_onnx.<locals>.<genexpr>Zlogits_processorrM   Zcross_qk_layer_headr   rp   rh   NTr   )keyzPyTorch outputs: zONNX Runtime outputs: )(r	   ru   r   r
   r   r   r   Zdecoder_start_token_idZget_decoder_prompt_idsZ
get_inputsnpZfloat32Zfloat16Zint64Zint32Zint8Zuint8zipr   rW   r   Zonesrg   r   r   r   tolistmaxrf   r   repeatrunr   r   r   Zallcloser   absr   r   )&r   r)   r   r   r   r   r   r   r}   r]   r   r   Zdecoder_prompt_idsZstart_idr   Zforced_decoder_idsZ	ort_namesZ
ort_dtypesZ	ort_to_npZuse_extra_decoding_idsr,   r   Zraw_input_idsZort_promptsr   max_lenZpadded_promptsr   Zpadded_promptr   Zort_transcriptionr   Zparityr   r   Zmax_diffdiffZ
max_diff_ir    r    r!   verify_onnx  s   
	


$
"

zWhisperHelper.verify_onnx)r   F)TFF)FFrW   FFFF)rM   F)__name__
__module____qualname__staticmethodstrboolr"   r   r   r   r   dictnnModuler   r   r   r   r   intr   r
   r   r   r   Zndarrayr   r   r   r    r    r    r!   r   )   sF   	
    *	=
	@	
0E r   )"rv   loggingr   pathlibr   r   r   r   Zconvert_generationr   r   r   Z	optimizerr   Ztransformersr   r   r	   r
   Zwhisper_decoderr   Zwhisper_encoderr   Zwhisper_encoder_decoder_initr   Zwhisper_jump_timesr   Zonnxruntimer   	getLoggerr   r   ZPRETRAINED_WHISPER_MODELSr   r    r    r    r!   <module>   s"   
