o
    i<                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlZd dlZd dlmZm	Z	m
Z
mZmZ d dlmZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ ed	Zdd
dZdd Zedkrpe Zeej  ee dS dS )    N)datetime)	Precisioncreate_onnxruntime_sessionget_ort_environment_variablesprepare_environmentsetup_logger)DEFAULT_TOLERANCEMODEL_CLASSESPRETRAINED_GPT2_MODELS
Gpt2Helper)version)QuantizeHelper)
AutoConfig)__version__ c                 C   s6  t  }|jdddtddt d |jddtd	tt d
dt  d |jddtt	j
dddd |jddtt	j
dddd |jdddtdd |jdddddd |jdddddd |jdd  |jd!td"dg d#d$d% |jd&ddd'd |jdd( |jd)d*ttjttd+d, |jd-ddd.d |jdd/ |jd0d1d2td3gd4d5 |jd6d2td3gd7d5 |jd8d9d2tg d:d;d5 |jd<d=dd d>d? |jd@dtdAdBd |jdCdddD |jddE |jdFdddD |jddG |jdHdddD |jddI |jdJdddD |jddK || }|S )LNz-mz--model_name_or_pathTz;Model path, or pretrained model name selected in the list: z, )requiredtypehelpz--model_classFZGPT2LMHeadModelz!Model type selected in the list: )r   r   defaultchoicesr   z--cache_dir.Zcache_modelsz%Directory to cache pre-trained models)r   r   r   r   z
--onnx_dirZonnx_modelszDirectory to store onnx modelsz--test_timesd   z8Number of repeat times to get average inference latency.)r   r   r   r   z-vz--validate_onnx
store_truezValidate ONNX model)r   actionr   z-oz--optimize_onnxz'Use optimizer.py to optimize onnx model)optimize_onnxz--stager   )r         a6  Stage in generation: 1 (initial decoder), 2 (decoder), 0 (both). 1 - decode the first token when past_sequence_length is zero; 2 - decode the remaining tokens when past_sequence_length is not zero; 0 - one onnx model for both stages 1 and 2. Note that we will optimize 1 and 2 differently for best performance.)r   r   r   r   r   z	--use_gpuzuse GPU for inference)use_gpuz-pz--precisionzfPrecision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization)r   r   r   r   z--torchscriptzuse Torchscript)torchscriptz-bz--batch_sizes+r   z
batch size)nargsr   r   r   z--sequence_lengthsz!sequence lengths (excluding past)z-sz--past_sequence_lengths)          @         zpast sequence lengthsz-rz--result_csvz$CSV file for saving summary results.)r   r   r   z--thread_numzThreads to usez--include_copy_output_latency)r   r   )include_copy_output_latencyz	--verbose)verbosez--output_torch_latency)output_torch_latencyz--disable_io_binding)disable_io_binding)argparseArgumentParseradd_argumentstrjoinr
   listr	   keysospathintset_defaultsr   FLOAT32
parse_args)argvparserargs r<   m/home/kim/smarthome/.venv/lib/python3.10/site-packages/onnxruntime/transformers/models/gpt2/benchmark_gpt2.pyparse_arguments!   s   
			
r>   c           !      C   s  t tt dk rtdtd|   | jtjkr&| j	r"| j
s&J d| jtjkr3| j
r3J d| jdkrB| jdgksBJ dt| jdkrOtjd	d
n| j ttj  | j}| j}t||| j
 t| j d }t}tj| j| j|d}|j| j||d}t | j
rdnd}|!| |j"dk}|j#|| j| jd	|d}	|	d }
t| j d }|j$|||
| j%|||d | j	s| jtj&kr|	| jtjkrt'| jnd }
|j	|	d |
| jtjk|j(j)|j(j*|d	| jd | jtjkrtd t+,|
|	d | t+-|}td |	d }
| jr|j|||||d}t.|
| j
d| j| j%d}|d u r0d S |/t0| j1t0| jt0| j2|| j}|3||| jtjk}| j4p[d5t67 8d}t9|ddde}g d }t:j;||d!}|<  | j1D ]I}| j2D ]A}| jD ]9}|dkr|dkr|dksJ t=d"||| |j>||||j)|j*|j"|j?|| jtjk||d#}|/||||| j}z| j@s| jAr|B||| jC\}}tD|D ],\}}tE|tFrt=d$| d%tG| d&|d jH  qt=d$| d'|jH  qnd }d }| jIr|J||| jC\}}n|jK||||| jCd| jLd(\}}| j@rb|}| jIsDg }|D ]}|M|N O  q7|jP||| jtQ| j tQ| j d)rbtd*tQ| j  d+ td,||||| jIrod-nd|rud.nd | j| j| jtR | j
| j| j	| j|||| jI|r|d/nd0|d/d } |S|  W q tTy   tjUd1d	d2 Y    W d    d S w q}qwW d    n	1 sw   Y  td3|  |S )4Nz3.1.0z/This tool requires transformers 3.1.0 or later.z
Arguments:z'fp16 requires --optimize_onnx --use_gpuzquantization only supports CPUr   r   z<past_sequence_lengths shall be 0 for stage==1 (init decoder)T)Zlogical)r   	cache_dir)configr?   zcuda:0cpu   )Zhas_pastZ
new_folderrawr   )has_position_idshas_attention_maskZfp32)Zauto_mixed_precisionstagezquantizing model...Zint8zfinished quantizing modelF)Zenable_all_optimizationZnum_threadsr)   zbenchmark_result_{}.csvz%Y%m%d-%H%M%Sar   )modenewline)Z
model_namemodel_classrF   Zenvironment_variablesZgpu	precisionZ	optimizerr   
batch_sizesequence_lengthpast_sequence_lengthr+   torch_latencyZonnxruntime_latency)
fieldnameszMRunning test for batch_size=%d sequence_length=%d past_sequence_length=%d ...)Zfloat16rD   rE   ztorch output z is tuple of size z, shape z shape )Zreturn_numpyr(   )rJ   ZrtolZatolz:Pytorch and ONNX Runtime outputs are all close (tolerance=z).zZbatch_size=%d, sequence_length=%d, past_sequence_length=%d, onnxruntime_latency=%.2f %s %sz(disable_io_binding)z, torch_latency={torch_latency}z.2fNone	Exception)exc_infozResults are saved to file )Vr   parsetransformers_versionRuntimeErrorloggerinforK   r   ZFLOAT16r   r   ZINT8rF   Zpast_sequence_lengthstorchZset_num_threadsZ
thread_numpsutil	cpu_countprintZ
__config__Zparallel_infor?   Zonnx_dirr   r	   rJ   r   r   Zfrom_pretrainedZmodel_name_or_pathr   devicetoZn_layerZget_onnx_pathsZexport_onnxr)   r7   r/   r@   Znum_attention_headsZhidden_sizer   Zquantize_onnx_modelZquantize_torch_modelr   Zget_output_shapesmaxZbatch_sizesZsequence_lengthsZget_output_buffersZ
result_csvformatr   nowstrftimeopencsv
DictWriterwriteheaderdebugZget_dummy_inputsZ
vocab_sizeZvalidate_onnxr*   Zpytorch_inferenceZ
test_times	enumerate
isinstancetuplelenshaper+   Zonnxruntime_inferenceZ$onnxruntime_inference_with_binded_ior(   appendrA   numpyZcompare_outputsr   r   writerowrR   error)!r;   r?   
output_dirrJ   Z
gpt2helperr@   modelr]   Zuse_external_data_formatZonnx_model_pathsZonnx_model_pathZuse_paddingsessionZmax_output_shapesZoutput_buffersZcsv_filenameZcsv_fileZcolumn_namesZ
csv_writerrL   rM   rN   Zdummy_inputsZoutput_shapesZoutputsrO   ivalueZort_outputsZort_latencyZcopy_outputsoutputrowr<   r<   r=   main   s  
"







"



   rx   __main__)N)!r,   rd   loggingr3   r   rZ   rY   Zbenchmark_helperr   r   r   r   r   Zgpt2_helperr   r	   r
   r   	packagingr   Zquantize_helperr   Ztransformersr   r   rU   	getLoggerrW   r>   rx   __name__r;   r)   r<   r<   r<   r=   <module>   s.   

  s
