o
    iG                     @   sZ  d Z ddlZddlZddlZddlZddlmZ ddlZddlZddlm	Z	m
Z
mZmZmZmZmZmZmZmZmZ ddlmZ ddlmZmZ ddlmZmZmZmZ ddlmZ dd	l m!Z! e"d
Z#ej$ddZ$dej%vrwe&e$ej%d< ddl'Z'ddl(m)Z)m*Z*m+Z+ dd Z,dd Z-de.de.fddZ/dd Z0dd Z1dd Z2e3dkre2  dS dS )a   Benchmarking the inference of pretrained transformer models.
PyTorch/TorchScript benchmark is based on https://github.com/huggingface/transformers/blob/master/examples/benchmarks.py.
One difference is that random input_ids is generated in this benchmark.

For onnxruntime, this script will convert a pretrained model to ONNX, and optimize it when -o parameter is used.

Example commands:
    Export all models to ONNX, optimize and validate them:
        python benchmark.py -b 0 -o -v -i 1 2 3
    Run OnnxRuntime on GPU for all models:
        python benchmark.py -g
    Run OnnxRuntime on GPU for all models with fp32 optimization:
        python benchmark.py -g -o
    Run OnnxRuntime on GPU with fp16 optimization:
        python benchmark.py -g -o -p "fp16"
    Run TorchScript on GPU for all models:
        python benchmark.py -e torchscript -g
    Run TorchScript on GPU for all models with fp16:
        python benchmark.py -e torchscript -g -p "fp16"
    Run ONNXRuntime and TorchScript on CPU for all models with quantization:
        python benchmark.py -e torchscript onnxruntime -p "int8" -o
    Run OnnxRuntime with the ROCM provider and graph optimization script:
        python benchmark.py -g -m bert-base-cased --provider rocm --optimizer_info by_script --disable_embed_layer_norm
    Run OnnxRuntime with bfloat16 fastmath mode kernels on aarch64 platforms with bfloat16 support:
        python benchmark.py --enable_arm64_bfloat16_fastmath_mlas_gemm

It is recommended to use run_benchmark.sh to launch benchmark.
    N)datetime)ConfigModifierOptimizerInfo	Precisioncreate_onnxruntime_sessionget_latency_resultinference_ortinference_ort_with_io_bindingoutput_detailsoutput_fusion_statisticsoutput_summarysetup_logger)FusionOptions)MODEL_CLASSESMODELS)create_onnxruntime_inputexport_onnx_model_from_ptexport_onnx_model_from_tfload_pretrained_model)version)QuantizeHelper F)ZlogicalZOMP_NUM_THREADS)
AutoConfigAutoTokenizerLxmertConfigc           4      C   s  dd l }g }| r'd| vr'd| vr'd| vr'd| vr'td |S d}|dkr?tj}d}d	| vr?td
 |S |tjkrMtd| d |D ]}t| d }|
D ]}|t|krf n|d | }t| d |_	t
|}d|v rt , t|t| d t| d t| d |||||| |||||||\}} }!}"W d    n1 sw   Y  d|v rt|t| d t| d t| d |||||| |||||||\}} }!}"| sqZt|| |d|||d}#|#d u rqZdd |# D }$g }%| rdnd}&tj||d}'tt|t|t|!|'jg}(tt||'jg})|D ]}*|*dkr,q#|D ]}+|"d ur<|+|"kr<q.d|v rDtjntj},t|!|*|+||'|,}-d|j||&||| ||||*|+| tt d}.|'j	dv rt d| d|*d|'j!|'j!g  nt d| d|*|+g  |rt"|#|-|.|	|*|}/nG|##|$|-}0|(g}1t$t|0D ]}2|2dkrt| d dkr|1%|) q|1%|( qd|v rtj&ntj'}3t(|#|-|.|	|$|0|%|1|*|&|3|}/t |/ |%|/ q.q#qZqO|S )Nr   ZCUDAExecutionProviderZMIGraphXExecutionProviderZROCMExecutionProviderZDmlExecutionProviderzPlease install onnxruntime-gpu or onnxruntime-directml package instead of onnxruntime, and use a machine with GPU for testing gpu performance.Ztensorrt   ZTensorrtExecutionProviderzhPlease install onnxruntime-gpu-tensorrt package, and use a machine with GPU for testing gpu performance.zOptimizerInfo is set to zA, graph optimizations specified in FusionOptions are not applied.   pt      tfT)Zenable_all_optimizationnum_threadsverboseZ(enable_mlas_gemm_fastmath_arm64_bfloat16c                 S   s   g | ]}|j qS  )name).0Znode_argr#   r#   \/home/kim/smarthome/.venv/lib/python3.10/site-packages/onnxruntime/transformers/benchmark.py
<listcomp>   s    z#run_onnxruntime.<locals>.<listcomp>cudacpu	cache_dironnxruntimeZenginer   	providersdeviceZ	optimizer	precisionZ
io_binding
model_nameZinputsthreads
batch_sizesequence_lengthZcustom_layer_numr   vitZswinzRun onnxruntime on  with input shape Zgpt))r,   Zget_available_providersloggererrorr   ZNOOPTwarningr   len
model_typer   parsetorchZno_gradr   r   r   Zget_outputsr   from_pretrainednumpyprodmaxZhidden_sizeZint64int32r   __version__get_layer_numstrr   nowinfo
image_sizer   runrangeappendZlonglongZintcr	   )4use_gpuprovidermodel_namesmodel_classconfig_modifierr0   r!   batch_sizessequence_lengthsrepeat_timesinput_countsoptimizer_infovalidate_onnxr+   onnx_dirr"   	overwritedisable_ort_io_bindinguse_raw_attention_maskmodel_fusion_statisticsmodel_source(enable_arm64_bfloat16_fastmath_mlas_gemmargsr,   resultsZwarm_up_repeatr1   Zall_input_namesZ
num_inputsZinput_namesfusion_optionsZonnx_model_fileZis_valid_onnx_model
vocab_sizeZmax_sequence_lengthZort_sessionZort_output_namesZoutput_buffersr/   configZmax_last_state_sizeZmax_pooler_sizer3   r4   Zinput_value_typeZ
ort_inputsZresult_templateresultZort_outputsZoutput_buffer_max_sizesiZ	data_typer#   r#   r&   run_onnxruntimeY   sh  











	

	


Nrf   c                    s  g }| rt j std |S t d |D ]=}tj||	|d}|| t	||||d}|j
dv r:|d g}n
tj||d}|j}td|  td	|   |tjkr_|  t | red
nd}|| |tjkrwt|}|D ]}|dkrqy|D ]}|j
dv rtd| d|d|j|jg  t j|d|j|jf|tjkrt jnt j|dn&|d ur||krqtd| d||g  t jd|jd ||ft j|dz^|	rt j |n|
rt !|n|   t"j# fdd|dd}|	rdn|
rdndt j$d| rdndd|d|d||||% t&t'( d}|)t*|| t| |+| W q t,yS } zt-| t j.  W Y d }~qd }~ww qyq|S )NzYPlease install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.F)torchscriptr+   )rc   r+   custom_model_classr5   r   r*   zModel zNumber of parameters zcuda:0r)   zRun PyTorch on r7   r   )sizedtyper/   r   )lowhighri   rj   r/   c                      s    S Nr#   r#   	inference	input_idsr#   r&   <lambda>  s    zrun_pytorch.<locals>.<lambda>repeatnumberrg   torch2r>   NAr(   r   r-   )/r>   r(   Zis_availabler8   r9   Zset_grad_enabledr   r?   modifyr   r<   r   model_max_lengthdebugZnum_parametersr   FLOAT16Zhalfr/   toINT8r   Zquantize_torch_modelrH   rI   ZrandnZfloat16Zfloat32randintrb   longZjittracecompiletimeitrs   rD   rE   rF   r   rG   updater   rL   RuntimeError	exceptionZempty_cache)rM   rO   rP   rQ   r0   r!   rR   rS   rT   rg   ru   r+   r"   r`   r1   rc   model	tokenizermax_input_sizer/   r3   r4   runtimesrd   er#   rn   r&   run_pytorch:  s   









"


9r   do_eager_modeuse_xlac                    s*   ddl m dd l fdd}|S )Nr   )wrapsc                    sT     fdd} j d fdd}du r(du s&J d|S |S )	Nc                         | i |S rm   r#   r_   kwargsfuncr#   r&   run_in_eager_mode     zFrun_with_tf_optimizations.<locals>.run_func.<locals>.run_in_eager_mode)Zexperimental_compilec                     r   rm   r#   r   r   r#   r&   run_in_graph_mode  s   zFrun_with_tf_optimizations.<locals>.run_func.<locals>.run_in_graph_modeTFzcCannot run model in XLA, if `args.eager_mode` is set to `True`. Please set `args.eager_mode=False`.)function)r   r   r   r   r    r   r   r   r&   run_func  s   

z+run_with_tf_optimizations.<locals>.run_func)	functoolsr   
tensorflow)r   r   r   r#   r   r&   run_with_tf_optimizations  s   r   c                    s  g }dd l jj| | sjg d | r$j s$td |S | r`j	d}zj|d d jj
|d d jjdd W n ty_ } zt| W Y d }~nd }~ww |tjksj|tjkrntd|D ]}tj||	d |  t| |	|dd	tj||	d}|j}|D ]}|dkrq|D ]}|d ur||krqtd
| d||g  dd l}|  fddt|| D }j|||fjdzzt dddfdd}t dddfdd}t ddd fdd}| j!r	|nt" t#r|  t$j%fdd|dd}dj&d| r)dndd|d|d||||' t(t)* d }|+t,|| t| |-| W q tyv } zt| dd!l.m/} |0 }|1  W Y d }~qd }~ww qqp|S )"Nr   ZGPUzVPlease install Tensorflow-gpu, and use a machine with GPU for testing gpu performance.Tz/gpu:0)r/   z+Mixed precision is currently not supported.r*   )rc   r+   rh   Zis_tf_modelzRun Tensorflow on r7   c                    s   g | ]} d  jd qS )r   r   )r}   rb   )r%   re   )rc   rngr#   r&   r'     s    z"run_tensorflow.<locals>.<listcomp>)shaperj   F)r   r   c                      s    ddS )NF)trainingr#   r#   rp   r   r#   r&   encoder_forward  s   z'run_tensorflow.<locals>.encoder_forwardc                      s     ddS )NF)Zdecoder_input_idsr   r#   r#   r   r#   r&   encoder_decoder_forward  r   z/run_tensorflow.<locals>.encoder_decoder_forwardc                     s8   j dd jg} j dd jg}| |ddS )Nr   F)Zvisual_featsZ
visual_posr   )randomnormalZvisual_feat_dimZvisual_pos_dim)Zfeatspos)rc   rp   r   r    r#   r&   lxmert_forward  s   z&run_tensorflow.<locals>.lxmert_forwardc                      s     S rm   r#   r#   )ro   r#   r&   rq   '  s    z run_tensorflow.<locals>.<lambda>r   rr   r   rv   r(   r)   r   r-   )r(   )2r   rc   	threadingZ set_intra_op_parallelism_threadsZset_visible_devicestestZis_built_with_cudar8   r9   Zlist_physical_devicesZexperimentalZset_memory_growthZ
distributeZOneDeviceStrategyr   r   r   rz   r|   NotImplementedErrorr   r?   rw   r   r   rx   rH   r   RandomrK   ZconstantrC   r   Zis_encoder_decoder
isinstancer   r   rs   rD   rE   rF   r   rG   r   r   rL   Znumbar(   Zget_current_devicereset)rM   rO   rP   rQ   r0   r!   rR   rS   rT   r+   r"   r`   Zphysical_devicesr   r1   r   r   r3   r4   r   valuesr   r   r   r   rd   r(   r/   r#   )rc   ro   rp   r   r   r    r&   run_tensorflow  s   









Ir   c                  C   s  t  } | jddddtg dtt ddt  d | jd	dd
tdddgdd | jddtd ttddt d | jddddtdgg ddd | jdddtt	j
dddd | jddtt	j
dddd | jdd dd!d"d# | jd$dtd d%d | jd&d'ttjttd(d) | jd*dd!d+d# | jd,dd!d-d# | jd.d/ttjttd0d) | jd1d2dd!d3d# | jd4d5dd d6d7 | jd8d9dd d:d7 | jd;d<dd d=d7 | jd>d?ddd
gtg d@dAdB | jdCdDddEtdFdG | jdHdIdtd
gdJ | jdKdLdtg dMdJ | jdNdd!dOd# | jddP | jdQdRddtdSgdTdU | jdVdtd dWd | jdXdd!dYd# | jddZ t|  |  }|S )[Nz-mz--modelsF+)zbert-base-casedzroberta-baseZgpt2z Pre-trained models in the list: z, )requirednargstypedefaultchoiceshelpz--model_sourcer   r   r    zExport onnx from pt or tfz--model_classz!Model type selected in the list: )r   r   r   r   r   z-ez	--enginesr,   )r,   r>   ru   rg   r   zEngines to benchmarkz-cz--cache_dir.Zcache_modelsz%Directory to cache pre-trained models)r   r   r   r   z
--onnx_dirZonnx_modelszDirectory to store onnx modelsz-gz	--use_gpu
store_truezRun on gpu device)r   actionr   z
--providerzExecution provider to usez-pz--precisionzfPrecision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization)r   r   r   r   z	--verbosezPrint more informationz--overwritezOverwrite existing modelsz-oz--optimizer_infozjOptimizer info: Use optimizer.py to optimize onnx model as default. Can also choose from by_ort and no_optz-vz--validate_onnxzValidate ONNX modelz-fz--fusion_csvz:CSV file for saving summary results of graph optimization.)r   r   r   z-dz--detail_csvz#CSV file for saving detail results.z-rz--result_csvz$CSV file for saving summary results.z-iz--input_counts)r   r   r   zXNumber of ONNX model inputs. Please use 1 for fair comparison with Torch or TorchScript.)r   r   r   r   r   r   z-tz--test_timesd   z8Number of repeat times to get average inference latency.)r   r   r   r   z-bz--batch_sizes)r   r   r   z-sz--sequence_lengths)             @         z--disable_ort_io_bindingz=Disable running ONNX Runtime with binded inputs and outputs. )rZ   z-nz--num_threadsr   zThreads to use)r   r   r   r   r   z--force_num_layersz%Manually set the model's layer numberz*--enable_arm64_bfloat16_fastmath_mlas_gemmzHEnable bfloat16 mlas gemm kernels on aarch64. Supported only for CPU EP )r^   )argparseArgumentParseradd_argumentrF   listr   keysjoinr   ospathr   ZFLOAT32r   ZBYSCRIPTintset_defaultsr   add_arguments
parse_args)parserr_   r#   r#   r&   parse_argumentsF  sV  

					

r   c                  C   s  t  } t| j | jtjkr| jstd d S | jtj	kr-| jr-| j
dvr-td d S t| jdkrCt| jd  d dv rCdg| _td	d
 | jD | _td|   tj| jsvzt| j W n tyu   td| j Y nw d| jv }d| jv }d| jv }d| jv }d| jv }|rttjtdk rtdtj  d S t| j}g }| jD ]}t| t tj!"  |s|s|r+| j#dgkrt$d |r|t%| j| j| j&|| j|| j'| j| j(dd| j| j7 }|r|t%| j| j| j&|| j|| j'| j| j(dd| j| j7 }|r+|t%| j| j| j&|| j|| j'| j| j(dd| j| j7 }|rG|t)| j| j| j&|| j|| j'| j| j(| j| j7 }i }	|rz4| j* }
|t+| j| j
| j| j&|| j|| j'| j| j(| j#| j,| j-| j| j.| j| j/| j0|
|	| j1| j2| 7 }W q t3y   t4d Y qw qt56 7d}|	r| j8pd| d}t9|	| t|dkr| j'dgkrt$d d S | j:pd| d}t;|| | j<pd| d}t=|||  d S )Nzfp16 is for GPU only)ZmigraphxZrocmzint8 is for CPU onlyr   r   r   )r6   Zswimr   c                 S   s   h | ]
}|d kr
t n|qS )r   )	cpu_count)r%   xr#   r#   r&   	<setcomp>  s    zmain.<locals>.<setcomp>zArguments: z#Creation of the directory %s failedr>   ru   rg   r,   r   z2.0.0z2PyTorch version must be >=2.0.0 and you are using zB--input_counts is not implemented for torch or torchscript engine.TF	Exceptionz%Y%m%d-%H%M%SZbenchmark_fusion_z.csvzNo any result available.Zbenchmark_detail_Zbenchmark_summary_)>r   r   r"   r0   r   rz   rM   r8   r9   r|   rN   r;   modelsr   rS   sortedr!   rH   r   r   existsr+   mkdirOSErrorZenginesr   r=   r>   rD   r   Zforce_num_layersZset_num_threadsry   Z
__config__Zparallel_inforU   r:   r   rP   rR   Z
test_timesr   Zuse_mask_indexrf   rV   rW   rX   rY   rZ   r]   r^   r   r   r   rG   strftimeZ
fusion_csvr   Z
detail_csvr
   Z
result_csvr   )r_   Zenable_torchZenable_torch2Zenable_torchscriptZenable_onnxruntimeZenable_tensorflowrQ   r`   r!   r\   r[   Z
time_stampZcsv_filenamer#   r#   r&   main  s  


$












r   __main__)4__doc__r   loggingr   r   r   r@   ZpsutilZbenchmark_helperr   r   r   r   r   r   r	   r
   r   r   r   ra   r   Zhuggingface_modelsr   r   Zonnx_exporterr   r   r   r   	packagingr   Zquantize_helperr   	getLoggerr8   r   environrF   r>   Ztransformersr   r   r   rf   r   boolr   r   r   r   __name__r#   r#   r#   r&   <module>   sB   4

 bp  I #
