o
    i--                     @  sF  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlm	Z
 d dlZd dlmZ d dlmZmZ d dlmZmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
l m!Z! d dl"Z#e$dZ%d)ddZ&d)ddZ'dd Z(		d*d+dd Z)d,d#d$Z*g fd,d%d&Z+e,d'krd(Z-ej.-e- e/e- e+  dS dS )-    )annotationsN)setup_logger)get_rankget_size)add_io_bindings_as_ortvaluesconvert_inputs_for_ort%get_merged_sample_with_past_kv_inputsget_sample_inputsget_sample_with_past_kv_inputsverify_ort_inputs)setup_torch_model)make_dynamic_cache)
AutoConfig)__version__)DynamicCache argsargparse.Namespaceconfigr   c                 C  s"   | j rdnd\}}|j}|||fS )N)      )r   r   )use_past_kvZmax_position_embeddings)r   r   past_sequence_lengthZcurr_sequence_lengthmax_sequence_length r   l/home/kim/smarthome/.venv/lib/python3.10/site-packages/onnxruntime/transformers/models/llama/llama_parity.pyget_sequence_lengths%   s   
r   c                 C  s   t  }d}t| |\}}}| jr#t|| j||||| j| jd|d
}|S | jr5t|| j||| jd|d}|S t	|| j||dd}|S )N   T)Zseq_lenpast_seq_lenmax_seq_lenuse_fp16use_buffer_sharereturn_dict
world_size)r    r"   r#   )r"   )
r   r   mergedr   devicer    r!   r   r
   r	   )r   r   r#   Z
batch_sizer   Zsequence_lengthr   inputsr   r   r   
get_inputs+   s<   r'   c                 C  s   t | tttfr
| S t | trtdd | D S t | tr$dd | D S t | tr0dd | D S t | tr>dd |  D S t | t	j
rH|  S t| d	rQ|  S t | tretttt| j| jd
dS tdt|  )Nc                 s  s    | ]}t |V  qd S )Ntorch_deepcopy.0vr   r   r   	<genexpr>R   s    z!torch_deepcopy.<locals>.<genexpr>c                 S  s   g | ]}t |qS r   r(   r*   r   r   r   
<listcomp>T       z"torch_deepcopy.<locals>.<listcomp>c                 S  s   h | ]}t |qS r   r(   r*   r   r   r   	<setcomp>V   r/   z!torch_deepcopy.<locals>.<setcomp>c                 S  s   i | ]	\}}|t |qS r   r(   )r+   kr,   r   r   r   
<dictcomp>X   s    z"torch_deepcopy.<locals>.<dictcomp>cloneF)strictz(torch_deepcopy not implemented for type )
isinstanceintfloatstrtuplelistsetdictitemsnpZndarraycopyhasattrr3   r   r   r)   zipZ	key_cacheZvalue_cacheNotImplementedErrortype)valuer   r   r   r)   N   s"   





r)   locationr8   use_auth_tokenboolkv_cache_ortvaluesr<   pytorch_modelNone | torch.nn.ModuleNone | AutoConfigc                 C  sd  |}|d u rt | ||| jrtjntj| jd\}}t| |}d|v r4tt	tdkr4t
|d |d< t|}| jdkrBtj  t }	|di |j   }
| jdkr^tj  t }td||	  d | jrz|d urz~tj  t| |\}}}t|| j||d}| j  d}|d	kr|d
| jif}tj| jt  |gd}t!||}| jdkrt"||| jt#| j| j|d\}}|$  t }	|%| |&  t }|' d }~nt }	|(d |}t }|d }td||	  d d| jv sd| jv rdnd}t)j*|
|||d}t+d|  |s0t+dt),|
|   |S )NZtorch_dtyper%   Zpast_key_valuesz4.45cpuzPyTorch took z s)r!   r   r   ZExecutionProviderZCUDAExecutionProvider	device_id)Zsess_options	providers)Z
ort_inputsr%   rN   r!   rH   r   zONNX Runtime took int4int8g      4@g      ?)ZrtolZatolz,Are PyTorch and ONNX Runtime results close? z
Max diff: r   )-r   r    torchfloat16float32r%   r'   pvVersiontransformers_versionr   r)   execution_providercudaZsynchronizetimeZlogitsdetachrM   numpyloggerinfo	small_gpuZempty_cacher   r   r!   upperrankortZInferenceSessionZonnx_model_pathZSessionOptionsr   r   r6   Zsynchronize_inputsZrun_with_iobindingZsynchronize_outputsZcopy_outputs_to_cpurunr>   Zallclosewarningmax)r   rE   rF   rH   rI   r   Zpy_modelr&   Zinputs_after_deepcopy
start_timeZ
pt_outputsZend_timer   _r   epZ	ort_modelZ
io_bindingZort_outputsZtolZparityr   r   r   verify_parityd   s   	









	
 ri   argv	list[str]c                 C  sj  t  }|jddddd |jdddtjdd	d
 |jdddtjddd
 |jddddg ddd |jddddd |jdd |jddddd |jdd |jdd dd!d |jdd" |jd#dd$d |jdd% |jd&d'dg d(d)d* |jd+dtd,d-d. |jd/dd0d | g kr| n|| }|j	d1v s|j	d2kr|j
dkrd3|_	|S d4|_	|S )5Nz-mz--model_nameFzModel name in Hugging Face)requiredhelpz-tz--torch_model_directory.zMPath to folder containing PyTorch model and associated files if saved on disk)rl   defaultrm   z-oz--onnx_model_pathTzSPath to ONNX model (with external data files saved in the same folder as the model)z-epz--execution_providerrM   )rM   rY   Zrocmz(Execution provider to verify parity with)rl   ro   choicesrm   z-vz	--verbose
store_truezPrint verbose logs)actionrm   )verbosez-pz--use_past_kvzfUse past key and past value as inputs to the model. Necessary for decoder_with_past_model.onnx models.)r   z-gz--use_buffer_sharezWUse if model has GroupQueryAttention and you want to enable past-present buffer sharing)r!   z--mergedz2Use merged model (i.e. decoder_merged_model.onnx).)r$   z-fpz--precision)rP   rQ   fp16fp32zPrecision of model)rl   rp   rm   z--cache_dirz./model_cachezQmodel cache dir to override default HF cache dir to avoid overflood the /home dir)rl   rC   ro   rm   z--small_gpuzhLoad the llama in GPU every time for parity_check if it's running in a machine which GPU memory < 36GB. >   rQ   ru   rP   ru   rt   )argparseArgumentParseradd_argumentospathjoinset_defaultsr8   
parse_args	precisionrX   )rj   parserr   r   r   r   get_args   s   

		r   c                 C  s  t | }t|j td|  t }t|d|jdk ||_t|d|j	dkr*dnd|  t|dt
|j |jtjdk}|rH|jn|j}i }|jsYt|||| d S d  }}|jsst||||jrjt
jnt
j|jd	\}}d
|_t||||||d}d|_t||||||d d S )NzArguments: r    rt   device_namerM   zcuda:r%   rn   rL   F)rI   r   T)r   r   rs   r]   r^   r   setattrr~   ra   rX   rR   r%   r   Ztorch_model_directoryry   rz   r{   Z
model_namer$   ri   r_   r   r    rS   rT   r   )rj   r   ra   rF   rE   rH   r   llamar   r   r   main,  s8   
 
	r   __main__r   )r   r   r   r   )NN)r   r   rE   r8   rF   rG   rH   r<   rI   rJ   r   rK   )rj   rk   )0
__future__r   rv   loggingry   rZ   r\   r>   Zpackaging.versionversionrU   rR   Zbenchmark_helperr   Zdist_settingsr   r   Zllama_inputsr   r   r   r	   r
   r   Zllama_torchr   Z(models.torch_export_patches.cache_helperr   Ztransformersr   r   rW   Ztransformers.cache_utilsr   Zonnxruntimerb   	getLoggerr]   r   r'   r)   ri   r   r   __name__seedrandomZmanual_seedr   r   r   r   <module>   s@    


#
dd'

