o
    i%a                     @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZmZmZmZ d dlZeeZddd	Zd
d Z dd Z!dd Z"dd Z#dd Z$dd Z%edkre%  dS dS )    )annotationsN)setup_logger)add_io_bindings_as_tensorsget_initial_inputs_and_outputs)
AutoConfigAutoModelForCausalLMAutoTokenizerBitsAndBytesConfigargsargparse.Namespacec                 C  s  | j dv rd }| jdkr9| jdkr9tdddtjd}tj| jdkr$| jn| j	| j
| j| j| jdd|| jd	id
	}n[z&tj| jdkrD| jn| j	| j
| j| j| jd| jdkrVdndd| j}W n4 ty } z(td| tj| jdkrv| jn| j	| j
| j| j| jddd| j}W Y d }~nd }~ww |  | j dkrt|}|S t }| jdkrdd| jifnd}tj| j||gd}|S )N   
pt-compilept-eagerint4cudaTZnf4)Zload_in_4bitZbnb_4bit_use_double_quantZbnb_4bit_quant_typeZbnb_4bit_compute_dtype Zflash_attention_2Z80GB)	cache_dirtorch_dtypeuse_auth_tokentrust_remote_code	use_cacheattn_implementationZquantization_configZ
max_memoryZsdpa)r   r   r   r   r   r   z&Try to load a model using eager mode: eagerr   CUDAExecutionProvider	device_idZCPUExecutionProvider)sess_options	providers)benchmark_typeonnx_precisiondevicer	   torchfloat16r   from_pretrainedhf_dir_path
model_namer   r   authtrustr   totarget_device	ExceptionprintevalcompileortZSessionOptionsZInferenceSessiononnx_model_path)r
   modelZ
bnb_configer   ep r2   m/home/kim/smarthome/.venv/lib/python3.10/site-packages/onnxruntime/transformers/models/llama/benchmark_e2e.py	get_model8   st   
	



r4   c           
   	   C  s"  | j dkr t  |di |}W d    n1 sw   Y  d }| j dv r4| jdkr3tj| j nt|||| j| j	}|
  t }t|D ]8}| j dv ryt  |di |}| jdkritj| j W d    n1 ssw   Y  qJ|| |  qJt }|| | }	|	|fS )Nr   r   cpur2   )r   r    Zno_gradr   r   Zsynchronizer(   r   use_fp16use_buffer_shareZsynchronize_inputstimeperf_counterrangeZrun_with_iobindingZsynchronize_outputs)
r
   r/   runsinputsoutputsZ
io_bindingstart_endZavgr2   r2   r3   run_inferencex   s4   








rA   c           	   	   C  sF   t   t||||| j| j| j| j\}}t| || j||\}}||fS N)clear_cacher   r(   r6   r7   enginerA   Zwarmup_runs)	r
   r/   config	tokenizerprompt_lengthpromptr<   r=   r?   r2   r2   r3   prepare_model_for_inference   s   rI   c                   C  s   t   tj  d S rB   )gcZcollectr    r   Zempty_cacher2   r2   r2   r3   rC      s   rC   c                 C  sv   t j| ddddddddd	|d
  dd|d
  dd	| dd| dddgd}|j|dd td| d d S )Nz
Batch SizezPrompt LengthzPrompt Processing Latency (ms)z"Prompt Processing Throughput (tps)zSampling Latency (ms)zSampling Throughput (tps)z"First Token Generated Latency (ms)z&First Token Generated Throughput (tps)Average Latency of First    z Tokens Generated (ms)Average Throughput of First z Tokens Generated (tps)zWall-Clock Latency (s)zWall-Clock Throughput (tps))columnsF)indexzResults saved in !)pdZ	DataFrameZto_csvloggerinfo)resultsfilenameZ
gen_lengthZdfr2   r2   r3   save_results   s(   

rV   c               
   C  s  t  } | jddtdg dd | jddtdd	d
 | jdddddd | jdddddd | jddttjdddd | jdtddd | jddddd | jd d!dtjdd"d#d$d%d& | jd'ddd(d | jd)ddd*df | jd+d,d-d. | jd/d0d1d. | jd2d3dtd4g d5d6d7 | jd8d9td:d;d | jd<d=ttj	
 rd>nd?d?d>gd@ | jdAdBtdCdD | jdEdFtdGdD | jdHdItdJdD | jdKtdLdD |  }tj|j t|j dM|jv rt|dN|j  dO |jdPkr|jdQ|jif|_|jdMkr|jsJ dR|jdS|_|jdS|_t|dT|j |jdUv s8|jdVkr:|jd?kr:d4ndW|_|jd?krIdX|j n|j}|jdWkrUtjntj}|jdMkr`dMndY}t|dZ| t|d[| t|d\| t|d]|jdWk |jo|dMk|_|S )^Nz-btz--benchmark-typeT)r   r   r-   )typerequiredchoicesz-mz--model-nameFz<Hugging Face name of model (e.g. 'meta-llama/Llama-2-7b-hf'))rW   rX   helpz-az--auth
store_truez5Use Hugging Face authentication token to access model)defaultactionrZ   z-tz--trustzeWhether or not to allow for custom models defined on the Hugging Face Hub in their own modeling filesz-cz--cache-dir.Zmodel_cachezPath to directory containing all Hugging Face files (e.g. config, tokenizer, PyTorch model). Use when loading model as `AutoModel.from_pretrained(model_name, cache_dir=cache_dir)`.)rW   r\   rZ   z--hf-dir-pathr   zPath to directory containing all Hugging Face files (e.g. config, tokenizer, PyTorch model). Use when loading model as `AutoModel.from_pretrained(folder_path)`.z-oz--onnx-model-pathzPath to ONNX model)rX   rZ   z-fz--prompts-filemodelsllamazprompts.jsonzsJSON file containing entries in the format 'prompt length: prompt' where prompt length = tokenized length of prompt)rX   r\   rZ   z--use_buffer_sharez3Use when GroupQueryAttention (GQA) is in ONNX modelz--anomaly-filteringzUse this flag to filter anomaly accelerator times for tokens generated.               This may give more accurate latency and throughput metrics for tokens generated.               Wall-clock metrics are still reported with anomaly times though.z-bz--batch-sizesz1 2)r\   z-sz--prompt-lengthsz16 64 256 1024z-pz--precisionfp32)r   int8fp16ra   zePrecision for model. For ONNX models, the model's precision should be set before running this script.)rX   rW   r\   rY   rZ   z-gz--generation-length   z Number of new tokens to generatez-dz--devicer   r5   )rW   r\   rY   z-idz--device-idr   )rW   r\   z-wz--warmup-runs   z-nz
--num-runsd   z--seedrL   r-   execution_providerZExecutionProviderr   r   z,Please specify a path to `--onnx-model-path` r   >   rb   ra   r   rc   zcuda:ptr(   r   rD   r6   )argparseArgumentParseradd_argumentstrospathjoinintr    r   Zis_available
parse_argsnprandomseedZmanual_seedr   setattrr   upperrg   r   r.   batch_sizessplitprompt_lengths	precisionr!   Zfloat32r7   )parserr
   r(   r   rD   r2   r2   r3   get_args   s   
*r}   c            9        s  t  } td t| j d }t| j}tj|dd d}W d    n1 s(w   Y  t	j
| jdkr7| jn| j| j| j| jd}tj
| jdkrL| jn| j| j| j| jd}t| }g }t| j| jD ]u\}}t|t|}}td| d|  t  || j }	||vrttd	| d
| j d| j d| d| d| d| d| d| j d| d|| g| }
||g}ztd t| |||||
\}}t| || j||\}}|d }|||  }td| d td|||   d |||g td t  t| |||||
\}}|d   }|j!d }|j"}t#|dr,|j$n|j%|j& }t'j(|| j)t'j*d}g }g }t+, }||	krt| |d||\}}|-| t+, }|d  j!d dkr|d! .dd }|j/dd"0d|j12|d|j1}t'3|d  d|4 }n|d  d d dd d f }t'j5|dd"}||B |j6k}|7||j68|dg}t+, } |-| |  t'j9||gdd"}|d7 }||d< t'9|d! | :t'j;8|dgd|d!< d#|v rt'j<|d# dd"d$ 8|dd |d#< |d  j!d dkr|d  d d d dd d f = |d < |d  >  | j?d%kr,|d& |d&< nh| j@stA|jBD ]}!|d'|! d( |d)|! d(< |d'|! d* |d)|! d*< q5|d! j!d }"tA|jBD ]2}!t'j(|||"|| j)| jCd}#t'j(|||"|| j)| jCd}$|Dd'|! d(|#= d'|! d*|$= i qa||	ksIt+, }%|Ed$ | jFrd+ tG|tH|}&tItJ fd,d|}tH|}'td-|&|'  d.  d/d  d0 t.|tH| }(|(d })|d|(  }*td1|) d td2|* d |d$ }+|+d },|d|+  }-td3|, d td4|- d | jd5 }.t.|d |. tH|d |.  }/|/d }0|d|/  }1td6|. d7|0 d td8|. d7|1 d t.|tH| }2|2d }3|d|2  }4td6| j d7|3 d td8| j d7|4 d |%| }5||| j |5  }6td9|5 d: td;||| j |5   d td< ||)|*|,|-|0|1|3|4|5|6g
 |-| W qe tKy }7 ztd=| d| d>|7  W Y d }7~7qed }7~7ww d?| j? d@tLjLM dAdB}8tN||8| j d S )CNFc                 S  s   dd |   D S )Nc                 S  s   i | ]	\}}t ||qS r2   )rq   ).0kvr2   r2   r3   
<dictcomp>j  s    z*main.<locals>.<lambda>.<locals>.<dictcomp>)items)dr2   r2   r3   <lambda>j  s    zmain.<locals>.<lambda>)object_hookr   )r   r   r   zRunning batch size = z, prompt length = z2
                                A prompt of size z was not found in 'zv'. There are a couple of solutions to fix this.
                                1) You can change one of the keys in 'z' to be z).
                                    If za < actual prompt's length, the benchmark E2E tool will repeat the first word in the prompt until zB = actual prompt's length.
                                    If zm > actual prompt's length, the benchmark E2E tool will automatically trim the actual prompt's length so that zd = actual prompt's length.
                                2) You can add a new key-value entry in 'z' of the form 'z,': 'your prompt goes here'.
                zMeasuring prompt processing...i  z&Average Latency of Prompt Processing: z msz)Average Throughput of Prompt Processing: z tpszMeasuring token generation...Z	input_idshead_dim)r   Zdtype   ZlogitsZattention_mask)dimZposition_idsr   ri   Zpast_key_valueszpresent.z.keyzpast_key_values.z.value
   c                   s   |   k S rB   r2   )Zacc_timeZanomaly_threshold_factorZ
min_time_sr2   r3   r     s    zFiltered out z$ anomaly accelerator times that are zx greater than z ms...zAverage Latency of Sampling: z Average Throughput of Sampling: z"Latency of First Token Generated: z%Throughput of First Token Generated: rL   rK   z Tokens Generated: rM   zWall-Clock Latency: z szWall-Clock Throughput: zAdding results to CSVz$Could not benchmark at batch size = z - Z
benchmark_Z_e2e_z%Y-%m-%d_%H:%M:%Sz.csv)Or}   r   rR   rS   __dict__openZprompts_filejsonloadr   r"   r#   r$   r   r%   r&   r   r4   	itertoolsproductrx   rz   rq   rC   Zgeneration_lengthNotImplementedErrortextwrapdedentrI   rA   Znum_runsextendcloneshapeZnum_key_value_headshasattrr   Zhidden_sizeZnum_attention_headsr    Zzerosr(   boolr8   r9   appendsumZ	unsqueezerepeatZ
vocab_sizeviewZgatherZsqueezeZargmaxZeos_token_idZmasked_fillZreshapecatr'   Zint64max
contiguousZzero_rD   r7   r:   Znum_hidden_layersr   updatepopZanomaly_filteringminlenlistfilterr)   datetimenowrV   )9r
   Zsize_to_promptfrE   rF   r/   Zall_csv_metricsZ
batch_sizerG   
max_lengthrH   Zcsv_metricsr<   r=   Zaccelerator_prompt_latency_sZaccelerator_prompt_latency_msZaccelerator_prompt_thrptZall_token_idsZcurrent_lengthZ	num_headsZ	head_sizeZhas_eosZaccelerator_timesZsampling_timesZwall_clock_start_timeZaccelerator_time_latency_sZsampling_start_timeZprompt_end_indicesZidxsZnext_token_logitsZnext_tokensZtokens_to_addZsampling_end_timeiZnew_sequence_lengthZpresent_keyZpresent_valueZwall_clock_end_timeZ	orig_sizenew_sizeZavg_sampling_latency_sZavg_sampling_latency_msZavg_sampling_thrptZfirst_token_latency_sZfirst_token_latency_msZfirst_token_thrptZhalfwayZhalfway_token_latency_sZhalfway_token_latency_msZhalfway_token_thrptZall_token_latency_sZall_token_latency_msZall_token_thrptZwall_clock_latency_sZwall_clock_thrptr0   rU   r2   r   r3   mainb  s  








&& 
O
	
 
(r   __main__)r
   r   )&
__future__r   rj   r   rJ   r   r   loggingrn   r   r8   numpyrs   ZpandasrQ   r    Zbenchmark_helperr   Zllama_inputsr   r   Ztransformersr   r   r   r	   Zonnxruntimer-   	getLogger__name__rR   r4   rA   rI   rC   rV   r}   r   r2   r2   r2   r3   <module>   s<   

@	 % ~
