o
    i;                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 eeZdd Zdd Zdd	 Zd
d Zdd ZedkrJe  dS dS )    N)setup_logger)BenchmarkRecordc                  C   s  t  } | jddtdd | jddtdd | jdd	td
d | jddtdd | jddddd | jddddd | jdtddd | jdtddd | jdtddd | jdtddd | jd td!d"d# | jd$td!g d%d&d' | jd(td!g d)d*d' | jd+td,d-d | jd.ddd/d | jd0td1d2d | jd3td d4d |  }t|d5|jd6d7 	d8d9 d:|j
 d;|j }|js||_tj|jd!d< | jd=9  _|S )>Nz-b--batch-sizesz1 2)typedefaultz-s--sequence-lengthsz8 16 32 64 128 256 512z-w--warmup-runs   z-n
--num-runs  z--hf-pt-eagerF
store_truez,Benchmark in PyTorch without `torch.compile`)r   actionhelpz--hf-pt-compilez)Benchmark in PyTorch with `torch.compile`--hf-ort-dir-path zDPath to folder containing ONNX models for Optimum + ORT benchmarking)r   r   r   z--ort-msft-model-pathzAPath to ONNX model from https://github.com/microsoft/Llama-2-Onnxz --ort-convert-to-onnx-model-pathz'Path to ONNX model from convert_to_onnx--cache-dirz./model_cachez-Cache dir where Hugging Face files are stored--model-nameTzModel name in Hugging Face)r   requiredr   --precision)Zint4Zint8Zfp16Zfp32zPrecision to run model)r   r   choicesr   --device)cpucudaZrocmzDevice to benchmark modelsz--device-idr   zGPU device IDz	--verbosezPrint detailed logsz	--timeout
   z8Number of mins to attempt the benchmark before moving on--log-folderz'Path to folder to save logs and results
model_size/.-z./_)exist_ok<   )argparseArgumentParseradd_argumentstrint
parse_argssetattr
model_namesplitreplacer   	precision
log_folderosmakedirstimeout)parserargsZlog_folder_name r4   m/home/kim/smarthome/.venv/lib/python3.10/site-packages/onnxruntime/transformers/models/llama/benchmark_all.pyget_args   s    r6   c              
   C   s  g }d\}}}d\}}}	}
d}d}d}d}d}d}d	}t |}|D ]}|d
d}||v r9t|t|d  }q"||v rHt|t|d  }q"||v rOd}q"||v rVd}q"||v rlt|t||d }|d }q"||v r~t|t||d }	q"||v rd|v rt||dd |d d }
n |||t| d  dd}t||  d }t|d }
|||||||	|
g }|	| q"W d    |S 1 sw   Y  |S )N)NNN)NNNNzBatch Size: zSequence Length: zto get past_key_valueszwith past_key_valuesz	Latency: zThroughput: zpeak=
r   promptz	per-token r   ZCPU=   z MB'"Zmax_used_MB)
openr,   r'   lenfloatrfindfindjsonloadsappend)	device_idlog_filebase_resultsentries
batch_sizeZsequence_lengthstepZ	latency_sZ
latency_msZ
throughputZmemoryZbatch_patternZsequence_patternZprompt_step_patternZper_token_step_patternZlatency_patternZthroughput_patternZmemory_patternfZ
input_linelineZpeakusageentryr4   r4   r5   process_log_file   s`   


&"
	
))rP   c                 C   sV  dd l }|j| g dd}|d d|d< |d d|d< |d d|d< |d d|d< |d	 d
|d	< |d d
|d< |d d
|d< |d d
|d< dd l}|j}tdd |D }d}d}|r}|d dd }|d dd }g }	| D ]\}
}|d dv rt|d |d d|d ||}n)|d dv rt|d |d d|d t	j
t	j}nt|d |d |d |d dd}|d |j_|d |j_|d |j_|d |j_|d |jjd< |d |jjd< |d	 |jjd< |d |j_|d |jjd< |d |j_|	| qt||	 t|d d!|	 td"| d# d S )$Nr   )Warmup RunsMeasured Runs
Model NameEngine	PrecisionDevice
Batch SizeSequence LengthStepLatency (s)Latency (ms)Throughput (tps)Memory (GB))columnsrQ   r'   rR   rW   rX   rZ   r@   r[   r\   r]   c                 S   s(   g | ]}|j d v r|j  d|j qS ))onnxruntimezonnxruntime-gpu==)keyversion).0ir4   r4   r5   
<listcomp>   s   ( z save_results.<locals>.<listcomp>r   r`   r;   rT   )optimum-ortr_   rS   rU   r_   rV   )pytorch-eagerpytorch-compileZpytorchrY   Zmeasure_stepengineZlatency_s_meanZthroughput_tps.csvz.jsonzResults saved in !)ZpandasZ	DataFrameZastypepkg_resourcesworking_setsortedr+   Ziterrowsr   torch__name____version__configwarmup_runsZmeasured_runsrJ   Z
seq_lengthZ
customizedmetricsZlatency_ms_meanZmax_memory_usage_GBrE   Zsave_as_csvZsave_as_jsonr,   loggerinfo)resultsfilenamepdZdfrl   Zinstalled_packagesZinstalled_packages_listZort_pkg_nameZort_pkg_versionrecordsr    rowrecordr4   r4   r5   save_results   s`   "r}   c           	   	   C   s   | dt j  dd}tj| j|}t|d'}tj|||d}z|	| j
 W n tjy8   |  Y nw W d    n1 sCw   Y  td | j| j| j|| j| jg}t| j||}|S )Nr    %Y-%m-%d_%H:%M:%Sz.logw)stdoutstderrz Gathering data from log files...)datetimenowr/   pathjoinr.   r>   
subprocessPopenwaitr1   TimeoutExpiredkillru   rv   rs   num_runsr*   r-   devicerP   rF   )	r3   benchmark_cmdri   Zlog_filenamelog_pathrG   processrH   rw   r4   r4   r5   	benchmark$  s   
r   c                  C   s  t  } t| j t| j dtjj_	g }t
| jtjd< | jrWdddddd| jd	| jd
| jd| jd| jdt
| jdt
| jd| jd| jdg}td t	| |d}|| | jrdddddd| jd	| jd
| jd| jd| jdt
| jdt
| jd| jd| jdg}td t	| |d}|| | jrdddddd| jd| jd	| jd
| jd| jd| jdt
| jdt
| jd| jd| jdg}td t	| |d}|| | jrdddddd| jd| jd	| jd
| jd| jd| jdt
| jdt
| jd| jd| jg}td t	| |d}|| | jrHdddddd| jd| jd	| jd
| jd| jd| jdt
| jdt
| jd| jd| jg}td t	| |d }|| | j d!| j d!tj d"d#}t |tj!"| j| d S )$NTZCUDA_VISIBLE_DEVICESpythonz-mzmodels.llama.benchmarkz--benchmark-typezhf-pt-eagerr   r   r   r   r   r   r
   r   r   z--authz'Benchmark PyTorch without torch.compilerg   zhf-pt-compilez$Benchmark PyTorch with torch.compilerh   zhf-ortr   z Benchmark Optimum + ONNX Runtimerf   zort-msftz--ort-model-pathz)Benchmark Microsoft model in ONNX Runtimezort-convert-to-onnxz/Benchmark convert_to_onnx model in ONNX Runtimer_   r    r~   rj   )#r6   r   verboseru   rv   __dict__ro   backendsZcudnnr   r&   rF   r/   environZhf_pt_eagerr*   r-   Zbatch_sizesZsequence_lengthsr   rs   r   r.   	cache_dirextendZhf_pt_compileZhf_ort_dir_pathZort_msft_model_pathZort_convert_to_onnx_model_pathr   r   r   r}   r   r   )r3   Zall_resultsr   rw   Zcsv_filer4   r4   r5   main6  s:  











"r   __main__)r#   r   rC   loggingr/   r   ro   Zbenchmark_helperr   rt   r   	getLoggerrp   ru   r6   rP   r}   r   r   r4   r4   r4   r5   <module>   s(   
 9M 2
