o
    ih                     @   sv  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZ	d dl
Z
d dlZd dlZd dlmZmZ d dlmZmZ d dlmZmZmZmZmZmZ d dlmZ d dlmZmZmZ d dl m!Z! d dl"m#Z#m$Z$m%Z% d dl&Z'e(e)Z*d	d
 Z+de j,de-fddZ.de j,fddZ/dd Z0dd Z1dd Z2dd Z3dd Z4dd Z5d"ddZ6dd  Z7e)d!kre7  dS dS )#    N)measure_memorysetup_logger)get_rankget_size)add_io_bindings_as_ortvalues%get_merged_sample_with_past_kv_inputsget_msft_sample_inputsget_sample_inputsget_sample_with_past_kv_inputsverify_ort_inputs)ORTModelForCausalLM)ProfilerActivityprofilerecord_function)trange)
AutoConfigAutoModelForCausalLMAutoTokenizerc                 C   sP   | j dv rdS | j dkr"zt|jW S  ty!   t|jj Y S w t| S )N   hf-pt-eagerhf-pt-compiler   hf-ort)benchmark_typelenZinputs_names	ExceptiondecoderZinput_names
get_inputsargsmodel r    i/home/kim/smarthome/.venv/lib/python3.10/site-packages/onnxruntime/transformers/models/llama/benchmark.pyget_ort_model_inputs_len(   s   

r"   r   ort_model_inputs_lenc                 C   s  d\}}| j dkrdn| jj}| j dv r4t| j| j| j| jdd}t| j| j| j| j| jdd}||fS | j dv r|d	kr]t| j| j| j| jdd}t| j| j| j| j| jdd}||fS t	| j| j| j| jd
|| j| j
ddd
}t	| j| j| jd| j|| j| j
ddd
}||fS | j dkrt	| j| j| j| jd
|| j| j
dd| jd}t	| j| j| jd| j|| j| j
dd| jd}||fS | j dkr|dk}t| j| jd
| j|| j| j
|d}t| j| j| jd|| j| j
|d}||fS td)NNNort-msfti   r   T)return_dict)use_fp16r&   >   r      r   pt)seq_lenpast_seq_lenmax_seq_lenr'   use_buffer_shareenginer&      ort-convert-to-onnxort)r*   r+   r,   r'   r-   r.   r&   
world_size   )r+   r*   r,   r'   r-   split_kvz/Unable to auto-detect inputs for provided model)r   configZmax_position_embeddingsr	   target_device
batch_sizesequence_lengthr
   r'   r   r-   r2   r   r   )r   r#   init_inputsiter_inputsr,   r4   r    r    r!   r   5   s   
o
\F
*

r   c                 C   s  d\}}d\}}| j dv rB| jr| jn| j}t }tj|| jr#tjntj	| j
| j
d| jd| j}t }| j dkrAt|}n| j dv rYt }| j|_| jrXd|_d|_ntd| j  | j d	krt| jtu rr| jd
 n| j}t| jtu r| jd nd }d }d }	t| jD ]'}
d|
vsd|
v sd|
v rqd|
v s|
dkr|
}d|
v r|
}	d|
v r|
}|
}	qt }tj| j||	| j
| j
d|dkrdnd |||d
}t }| j dv rtd| j !| j"  t }tj#| j !| j"|| jgd}t }td||  d |S )Nr$   r   T)Ztorch_dtypeuse_auth_tokentrust_remote_codeZ	use_cache	cache_dirr   >   r0   r   r%   r/   Cannot recognize r   r   z.onnxz
.onnx_dataz
.onnx.dataZdecoder_modelz
model.onnxZdecoder_with_past_modelZdecoder_merged_model)	decoder_file_namedecoder_with_past_file_namer;   r<   Zuse_io_bindingZ
use_mergedproviderprovider_optionsZsession_options   r0   r%   zLoading model from )	providerszLoaded model in  s)$r   Zhf_pt_dir_path
model_nametimer   from_pretrainedr'   torchZfloat16Zfloat32authr=   tor6   compiler1   ZSessionOptionsr   Zenable_profilingverboseZlog_verbosity_levelZlog_severity_levelr   typeexecution_providertupleoslistdirhf_ort_dir_pathr   loggerinfoort_model_pathformatrankZInferenceSession)r   r   Zsess_options
start_timeend_timesourcerA   rB   r?   r@   filenamer    r    r!   	get_model   s   
	




r]   c                    sV   j dv r
t jnt jtjdd} jr||}t|  fdd} fdd}|D ]}|  || |  q-d} j dv rFt j	nt j	tjdd}	|	D ]}|  t

 }
|| |  t

 }|||
 7 }qQ j dvrvtd	 | j	 } j| } jdkrtd
 j  td j  td| d td| d d S )NrC   zWarm up)fileZdescc                     *    j dkr jdv r j S  fddS )NcpurC   c                     &    j dkrtj rtj S dd S )Nr`   c                  W      d S Nr    kwargsr    r    r!   <lambda>      =time_fn.<locals>.<lambda>.<locals>.<lambda>.<locals>.<lambda>devicerI   cudais_availableZsynchronizerd   r   r    r!   rf        
+time_fn.<locals>.<lambda>.<locals>.<lambda>)rj   r   
io_bindingZsynchronize_inputsrd   rm   r    r!   rf        
ztime_fn.<locals>.<lambda>c                     r_   )Nr`   rC   c                     ra   )Nr`   c                  W   rb   rc   r    rd   r    r    r!   rf   (  rg   rh   ri   rd   rm   r    r!   rf   %  rn   ro   )rj   r   rp   Zsynchronize_outputsrd   rm   r    r!   rf   "  rq   r   Z	Benchmark zBatch Size: zSequence Length: z	Latency: rE   zThroughput: z tps)r   rangeZwarmup_runsr   sysstdoutrM   rT   rU   Znum_runsrG   r7   rX   r8   )r   fninputsZwarmup_rangeoutputsZ
input_syncZoutput_sync_
total_timeZbench_rangerY   rZ   ZlatencyZ
throughputr    rm   r!   time_fn  sF   











r{   c           	      C   sH  d| j  d| j d| j  d| j d| j d|jdd d| dtj	 d}d }| jdv rt
tjtjgddd }td	 || W d    n1 sPw   Y  W d    n1 s_w   Y  |jd
dj| j| jd}tj| j| d}t|d}|| W d    |S 1 sw   Y  |S || | d}|S )NbZ_sry   -z%Y-%m-%d_%H:%M:%Sr   T)Z
activitiesZrecord_shapesZprofile_memoryZmodel_inferencer3   )Zgroup_by_stack_n)Zsort_byZ	row_limitz.logwz.json)r7   r8   r   lower	precisionrj   __name__replacedatetimenowr   r   ZCPUCUDAr   Zkey_averagestableZpt_filter_byZpt_num_rowsrQ   pathjoin
log_folderopenwrite)	r   rv   rw   Zinputs_typeprefixr\   ZprofZ	prof_datafr    r    r!   
profile_fnR  s.   T






r   c                    s   t  }t|}|jdd   | jdkr+td|jd dtjdd  d t	
  tj  t| jdk fd	d
d tj  d S )Ng?)intervalr   zCPU usage: F)Zlogical%r`   c                      s    S rc   r    r    rv   rw   r    r!   rf   |  s    zmeasure_fn.<locals>.<lambda>)Zis_gpufunc)rQ   getpidpsutilProcessZcpu_percentrX   rT   rU   	cpu_countgcZcollectrI   rk   Zempty_cacher   rj   rt   ru   flush)r   rv   rw   pidprocessr    r   r!   
measure_fno  s   

&
r   c                    s    fdd}|}| j dkr|| || | jrlt| ||d}| j dkrA jj }td| d|  t	|tj
| j| t| ||d}| j dkrj jj }td| d|  t	|tj
| j| d S td	 t| || t| || td
 t| || t| || d S )Nc                    s    di | }|S )Nr    r    rw   rx   r   r    r!   
get_logits  s   z$run_hf_inference.<locals>.get_logitsr   promptr   	Renaming  to token7
Evaluating `model(inputs)` step to get past_key_values5
Evaluating `model(inputs)` step with past_key_values)r   r   r   r   sessionend_profilingrT   warningrQ   renamer   r   r   Zdecoder_with_pastrU   r{   r   )r   r9   r:   r   r   generate_fnnew_lognameold_lognamer    r   r!   run_hf_inference  s.   




r   c                    sV   fdd}fdd}fdd} j dkr|n|}i } jry|||\}	}t ||	d}
 }td	| d
|
  t|tj	 j
|
 t |||\}}t ||d}
 }td	| d
|
  t|tj	 j
|
 d S td |||\}	}t ||	 t ||	 td |||\}}t || t || d S )Nc                    sP   t | }  jdkr$t|  jt j j|\}}t d| ||fS | |fS )Nr`   rp   )r   rj   r   intrX   r-   setattr)rw   kv_cache_ortvaluesrp   r   r    r!   prepare_ort_inputs  s   

z-run_ort_inference.<locals>.prepare_ort_inputsc                    s     |  d S rc   )Zrun_with_iobinding)rp   r   r    r!   with_io_binding  s   z*run_ort_inference.<locals>.with_io_bindingc                    s     d | }|S rc   )runr   r   r    r!   without_io_binding  s   z-run_ort_inference.<locals>.without_io_bindingr`   r   r   r   r   r   r   )rj   r   r   r   rT   r   rQ   r   r   r   r   r]   rU   r{   r   )r   r9   r:   r   r   r   r   r   r   Zort_init_inputsr   r   Zort_iter_inputsr    r   r!   run_ort_inference  s4   

r   c                 C   sH   | j dv rt| ||| d S | j dv rt| ||| d S td| j  )N>   r   r   r   rC   r>   )r   r   r   r   )r   r9   r:   r   r    r    r!   run_inference  s
   

r   c              	   C   s  t  }|jddtdg dd |jddtddd	 |jd
ddddd |jdddtdg ddd |jdtddd |jdtddd |jdtddd |jddd d! |jd"d#d$d! |jd%d&ttj rid'nd(g d)d* |jd+d,td-d. |jd/d0td1d. |jd2d3td4d. |jd5td6d. |jd7td8d. |jd9td:d. |jd;ddd< |jd=td>d?d |jd@tdAdBd |jdCddd< |jdDttj	
dEdFd |jdGtddHdIdJ | }tj|j t|j dK|jv rt|dL|j  dM |jdNkr|jdO| if|_n|jdPkr|jdO| if|_d'|_|jdQkr-|js-J dR|jdSv r;|js;J dT|jdU|_|jdU|_|jdVv s[|jdWkr]|jd(kr]dndX|_|jrxt|jd:krtt|jd:ksxJ dY|S )ZNz-btz--benchmark-typeT)r   r   r   r%   r0   )rN   requiredchoicesz-mz--model-namez<Hugging Face name of model (e.g. 'meta-llama/Llama-2-7b-hf'))rN   r   helpz-az--authF
store_truez5Use Hugging Face authentication token to access model)defaultactionr   z-pz--precisionfp32)int4int8fp16r   zePrecision for model. For ONNX models, the model's precision should be set before running this script.)r   rN   r   r   r   z--hf-pt-dir-pathrr   zNPath to directory containing all PyTorch files (e.g. tokenizer, PyTorch model))rN   r   r   z--hf-ort-dir-pathzhPath to directory containing all ONNX files (e.g. tokenizer, decoder_merged, decoder, decoder_with_past)z--ort-model-pathzPath to ONNX modelz-bz--batch-sizesz1 2)r   z-sz--sequence-lengthsz32 64 128 256 512z-dz--devicerk   r`   )r`   rk   Zrocm)rN   r   r   z-idz--device-idr   )rN   r   z-wz--warmup-runsr3   z-nz
--num-runs
   z--seed   z--max-length    z--num-return-sequencesr/   z	--profile)r   r   z--pt-filter-byZself_cpu_time_totalz"What to filter PyTorch profiler byz--pt-num-rowsi  z.Number of rows for PyTorch profiler to displayz	--verbosez--log-folder.zFolder to cache log filesz--cache-dirz./model_cachez-Cache dir where Hugging Face files are stored)rN   r   r   r   r1   rO   ZExecutionProviderZCUDAExecutionProviderZ	device_idZROCMExecutionProviderr   z,Please specify a path to `--hf-ort-dir-path`rC   z+Please specify a path to `--ort-model-path` >   r   r   r   r   zOPlease provide only one (batch_size, sequence_length) combination for profiling)argparseArgumentParseradd_argumentstrrI   rk   rl   r   rQ   r   r   
parse_argsnprandomseedZmanual_seedr   r   rj   upperrO   rS   rV   batch_sizessplitsequence_lengthsr   r   r   )rX   parserr   r    r    r!   get_args  s   
	*"r   c                  C   s  t  } t }t| }t|j t|j dtj	j
_| |_||_tj|j|j|j|jd}tj|j|j|j|jd}|jdkrEd|j n|j}|jdk}t|d| t|d| t|d| t|d	| t|}t||}|jd
v rtj|j|jdd}	ttdd |	j j!}
|ot"|
dko|jdk}t|d| nt|dd t#$|j%|j&D ]3\}}|jdkrtd| d| d t|dt'| t|dt'| t(||\}}t)|||| qd S )NT)r=   r;   r<   r`   zcuda:r   	tokenizerr5   r6   r'   rC   F)Zload_external_datac                 S   s
   | j dkS )NZGroupQueryAttention)Zop_type)noder    r    r!   rf     s   
 zmain.<locals>.<lambda>r   r-   z
Batch size = z and sequence length = z...r7   r8   )*r   r   r   r   rM   rT   rU   __dict__rI   backendsZcudnnZ	benchmarkrX   r2   r   rH   rF   r=   rJ   r   rj   r   r   r]   r"   r   onnxZ
load_modelrV   rW   listfiltergraphr   r   	itertoolsproductr   r   r   r   r   )rX   r2   r   r   r5   r6   r'   r   r#   Z
onnx_modelZ	gqa_nodesr-   r7   r8   r9   r:   r    r    r!   main  sH   





r   __main__)r   )8r   r   r   r   loggingrQ   rt   rG   numpyr   r   r   rI   Zbenchmark_helperr   r   Zdist_settingsr   r   Zllama_inputsr   r   r   r	   r
   r   Zoptimum.onnxruntimer   Ztorch.profilerr   r   r   Ztqdmr   Ztransformersr   r   r   Zonnxruntimer1   	getLoggerr   rT   r"   	Namespacer   r   r]   r{   r   r   r   r   r   r   r   r    r    r    r!   <module>   sJ    
 UF>;
	 	2
