o
    iFU                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlmZ ddlZddl	m
Z
 ddlmZ ddlmZmZmZ ddlmZmZmZ dd	lmZ G d
d dZd)dedefddZd)dedefddZG dd dZdefddZdefddZ	d)dejdejdB fddZdd Z dd  Z!e"d!kre! Z#e$d"e#  e#j%du re#j&d#krd$nd%e#_%e#j'rej() sJ e#j*d&krd'e v sJ d(e#_+n
e#j,rJ e#j+rJ e#j,se#j+ree# dS e e# dS dS )*z]
Benchmark performance of SAM2 encoder with ORT or PyTorch. See benchmark_sam2.sh for usage.
    N)Mapping)datetime)SAM2ImageDecoder)SAM2ImageEncoder)decoder_shape_dictencoder_shape_dictload_sam2_model)InferenceSessionSessionOptionsget_available_providers)CudaSessionc                +   @   s   e Zd Zddddddddddddejdddddd	dfd
edededejdededededededededededededededededef*dd Z	d!d" Z
d#eeee f fd$d%Zd#eeejf fd&d'Zd(S ))
TestConfigimage_encoderCPUExecutionProvidermax-autotune      FT     
model_type	onnx_pathsam2_dirdevice	component
batch_sizeheightwidth
num_labels
num_points	num_masksmulti_mask_outputuse_tf32enable_cuda_graphprefer_nhwcwarm_upenable_nvtx_profileenable_ort_profileenable_torch_profilerepeatsverbosec                 C   s   |dv sJ |	dkr|	dksJ |
dkr|
dksJ || _ || _|| _|| _|| _|| _|| _|	| _|
| _|| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _| jdkru| jdkrq| jdkswJ dd S d S )NZsam2_hiera_tinyZsam2_hiera_smallsam2_hiera_largeZsam2_hiera_base_plus   i   r   r   z7Only image size 1024x1024 is allowed for image encoder.)r   r   r   r   providertorch_compile_moder   r   r   r   r   r   r    r   r!   r"   dtyper#   r$   r%   r&   r'   r(   r)   )selfr   r   r   r   r   r-   r.   r   r   r   r   r   r   r    r!   r"   r/   r#   r$   r%   r&   r'   r(   r)    r1   m/home/kim/smarthome/.venv/lib/python3.10/site-packages/onnxruntime/transformers/models/sam2/benchmark_sam2.py__init__   s>   
zTestConfig.__init__c                 C   s
   t |  S N)varsr0   r1   r1   r2   __repr__V   s   
zTestConfig.__repr__returnc                 C   s6   | j dkrt| j| j| jS t| j| j| j| j| jS )Nr   )	r   r   r   r   r   r   r   r   r   r6   r1   r1   r2   
shape_dictY   s   
zTestConfig.shape_dictc                 C   s   | j }| jdkrdtj| jd| j| j|| jdiS tjdddd|| jdtjddd	d	|| jdtjdddd|| jdtj	d
d| j
| jdf|| jdtj	d
d| j
| jftj| jdtj| j
ddd|| jdtj| j
|| jdtj| j| jgtj| jddS )Nr   image   )r/   r   r          @      r   r      )image_features_0image_features_1image_embeddingspoint_coordspoint_labelsinput_maskshas_input_masksoriginal_image_size)r/   r   torchrandnr   r   r   r   Zrandrandintr   r   Zint32ZzerosZonesZtensor)r0   r/   r1   r1   r2   random_inputs_   s    
"zTestConfig.random_inputsN)__name__
__module____qualname__rI   float32strr   intboolr3   r7   r   listr9   ZTensorrL   r1   r1   r1   r2   r      s    	

;r   configr8   c                 C   s   | j rtdt|   | jdkr=t| jtrtj	 n| jj
}t|| j}t| j|d< | jr5d|d< | j|fdg}ndg}t| j||d}|S )Nzcreate session for CUDAExecutionProviderr!   r   r#   r   )	providers)r)   printr5   r-   
isinstancer   rQ   rI   cudacurrent_deviceindexr   Zget_cuda_provider_optionsr"   rR   r!   r#   r	   r   )rU   session_options	device_idZprovider_optionsrW   ort_sessionr1   r1   r2   create_ort_sessiont   s   
r`   c                 C   s,   t | |}t|| j| j}||   |S r4   )r`   r   r   r"   Zallocate_buffersr9   )rU   r]   r_   cuda_sessionr1   r1   r2   create_session   s   
rb   c                   @   s(   e Zd ZdZddefddZdd ZdS )	OrtTestSessionz;A wrapper of ORT session to test relevance and performance.NrU   c                 C   s   t ||| _| | _d S r4   )rb   r_   rL   	feed_dict)r0   rU   r]   r1   r1   r2   r3      s   zOrtTestSession.__init__c                 C   s   | j | jS r4   )r_   inferrd   r6   r1   r1   r2   re      s   zOrtTestSession.inferr4   )rM   rN   rO   __doc__r   r3   re   r1   r1   r1   r2   rc      s    rc   ra   c                 C   s"   t   }| |}t   }|| S r4   )timere   )ra   
input_dictstart_endr1   r1   r2   measure_latency   s   
rl   c                 C   sL  | j j}|dk}|r!tjdjdkr!| jr!dtjjj_	dtjj
_	|o(| jtjk}|  }t i tj|| j|dM t| j| j| j d}| jdkr`|rc| jdkrctj|jj| jdd	d
|j_|  d }t|j| j | jd}t|}|r| jdkrtd| j d t| jD ]	}	||\}
}}q|r| jrdd l }ddlm!} |"  td |#d ||dd W d    n1 sw   Y  |$  |r$| j%r$tj&j'tj&j(j)tj&j(j*gdd&}td tj&+d || W d    n1 sw   Y  W d    n	1 sw   Y  t|, j-ddd |.d | j/dkr9	 W d    W d    d S td| j/ d t00 }t| j/D ]}	||\}
}}|r\tj1  qKn|d |d |d |d |d  |d! |d" |d# f}t2|| j3d$}|r| jdkrtj|j| jdd	d
|_t| jD ]
}	|| \}}}q|r| jrdd l }ddlm!} |"  td% |#d ||d&di W d    n	1 sw   Y  |$  |r7| j%r7tj&j'tj&j(j)tj&j(j*gdd'}td' tj&+d( ||  W d    n	1 sw   Y  W d    n	1 s"w   Y  t|, j-ddd |.d) | j/dkrL	 W d    W d    d S td| j/ d t00 }t| j/D ]}	|| \}}}|rotj1  q^t00 }|| | j/ W  d    W  d    S 1 sw   Y  W d    d S 1 sw   Y  d S )*NrZ   r      T)device_typer/   enabled)r   r   noneF)modeZ	fullgraphZdynamicr:   )r   r/   zBRunning warm up. It will take a while since torch compile mode is .cudartz#Start nvtx profiling on encoder ...one_run)r%   )Z
activitiesZrecord_shapesz$Start torch profiling on encoder ...encoderZcuda_time_total
   )Zsort_byZ	row_limitztorch_image_encoder.jsonzStart z runs of performance tests...rA   rB   rC   rD   rE   rF   rG   rH   )multimask_outputz"Start nvtx profiling on decoder...r%   z$Start torch profiling on decoder ...decoderztorch_image_decoder.json)4r   typerI   rZ   Zget_device_propertiesmajorr!   backendsmatmulZ
allow_tf32Zcudnnr/   rP   rL   Zinference_modeZautocastr   r   r   r   r.   compiler   forwardr9   rJ   tor   rX   ranger$   r%   nvtxrt   cudaProfilerStartannotatecudaProfilerStopr'   ZprofilerZprofileZProfilerActivityZCPUCUDAZrecord_functionZkey_averagestableZexport_chrome_tracer(   rg   Zsynchronizer   r    )rU   rn   Zis_cudaZenabled_auto_castZ
ort_inputsZ
sam2_modelZimage_shapeZimgZsam2_encoderrj   Z_image_features_0Z_image_features_1Z_image_embeddingsr   rt   Zprofri   Ztorch_inputsZsam2_decoderZ_masksZ_iou_predictionsZ_low_res_masksrk   r1   r1   r2   	run_torch   s   
"



-


h
Vr   args
csv_writerc                 C   s  | j }| j}| j}|rtj }td|}d}nd}td}d}d}tjtjtj	d}t
d+i d| jd	| jd
| jd| jd|d| jd| jd| jd|ddd|d|| j d| jd| jd| jd| jd| jd| jd| jdd}	| jdkr(t }
| j|
_|	jrd|
_d|
_d|
_t|	|
}|	  }zt!|	jD ]}t"||}qW n t#y } zt$d|	d |  W Y d }~d S d }~ww |	jrdd l%}dd!lm&} |'  |(d" |)|}W d    n1 sw   Y  |*  |	jr|j+,  |dkrd S g }t!|D ]}t"||}|-| qt./|}~nGt0 3 zt1|	}W n$ t#yW } zt$d|	d |  W Y d }~W d    d S d }~ww W d    n	1 scw   Y  |dkrod S | jd# |rxdnd }i d| jd| jd| jd$|d|d|	jd|	j2d| jd| jd| jd%| j3d&|	j4d'|	j5d(|	j6d)| jd|	jd|| j| j||d*}|d ur|7| t$t8|	  t$|  d S ),NrZ   rV   r   cpuFr   fp32Zfp16Zbf16r   r   r   r   r-   r   r   r   r   r!   Tr"   r/   r#   r(   r$   r%   r&   r'   r.   r)   ort   zFailed to run config=z. Exception: rs   ru   :use_gpur    r   r   r   intra_op_num_threads)r%   r.   engineaverage_latencyr1   )9r   use_cuda_graphr(   rI   rZ   r[   r   rP   Zfloat16Zbfloat16r   r   r   r   r   r   r   r   r/   r#   r$   r%   r&   r'   r.   r   r
   r   Zenable_profilingZlog_severity_levelZlog_verbosity_levelrb   rL   r   rl   	ExceptionrX   r   rt   r   r   re   r   r_   Zend_profilingappend
statisticsmeanZno_gradr   r!   rx   r   r   r   writerowr5   )r   r   r   r"   r(   r^   r   r-   ZdtypesrU   Zsess_optionssessionrh   rj   er   rt   Zlatency_listZlatencyr   r   rowr1   r1   r2   run_test  s&  

	









	


r   c                 C   s   | j rdnd}d|| jt d}t|ddd}g d}tj||d	}|	  t
| | W d    d S 1 s;w   Y  d S )
NZgpur   zbenchmark_sam_{}_{}_{}.csvz%Y%m%d-%H%M%Sa )rq   newline)r   r   r/   r   r"   r#   r!   r   r   r   r    r   r   r   r   r$   r(   r%   r.   r   r   )
fieldnames)r   formatr   r   nowstrftimeopencsv
DictWriterwriteheaderr   )r   featuresZcsv_filenameZcsv_fileZcolumn_namesr   r1   r1   r2   run_perf_test  s   "r   c                  C   s  t jdd} | jddddgddd | jd	dg d
ddd | jddddd | jdd | jddddd | jdd | jddtg dddd | jddtddd | jddtdd d | jd!dtdd"d | jd#dtd$d%d | jd&dtd'd(d | jd)dtd*d*d+gd,d- | jd.dddd/d0 | jd1dddd2d0 | jd3dddd4d0 | jd5dddd6d0 | jd7dddd8d0 | jd9dtd:g d;d<d- | jd=dtd>d?d | jd@dtdAdBd | jdCdtd g dDdEd- |  }|S )FNz,Benchmark SMA2 for ONNX Runtime and PyTorch.)descriptionz--componentFr   image_decoderzDcomponent to benchmark. Choices are image_encoder and image_decoder.)requiredchoicesdefaulthelpz--dtyper   r   zData type for inference.z	--use_gpu
store_truezUse GPU for inference.)r   actionr   )r   z--use_cuda_graphzUse cuda graph in onnxruntime.)r   z--intra_op_num_threads)r   r   r@   r   rm      r   z&intra_op_num_threads for onnxruntime. )r   rz   r   r   r   z--batch_sizer   z
batch size)r   rz   r   r   z--heightr   zimage heightz--widthzimage widthz	--repeatsr   z8number of repeats for performance test. Default is 1000.z	--warm_upr   z)number of runs for warm up. Default is 5.z--enginer   rI   zengine for inference)r   rz   r   r   r   z--multimask_outputz:Export mask_decoder or image_decoder with multimask_output)r   r   r   r   z--prefer_nhwcz;Use prefer_nhwc=1 provider option for CUDAExecutionProviderz--enable_nvtx_profilezVEnable nvtx profiling. It will add an extra run for profiling before performance test.z--enable_ort_profilezEnable ORT profiling.z--enable_torch_profilezYEnable PyTorch profiling. It will add an extra run for profiling before performance test.z--model_typer+   r*   zsam2 model namez
--sam2_dirz./segment-anything-2z6The directory of segment-anything-2 git root directoryz--onnx_pathz6./sam2_onnx_models/sam2_hiera_large_image_encoder.onnxzpath of onnx modelz--torch_compile_mode)zreduce-overheadr   zmax-autotune-no-cudagraphsrp   z4torch compile mode. none will disable torch compile.)argparseArgumentParseradd_argumentset_defaultsrR   rQ   
parse_args)parserr   r1   r1   r2   _parse_arguments  s  				r   __main__z
arguments:r   r   rp   r   rV   Fr4   )-rf   r   r   r   rg   collections.abcr   r   rI   r   r   r   r   Z
sam2_utilsr   r   r   Zonnxruntimer	   r
   r   Z*onnxruntime.transformers.io_binding_helperr   r   r`   rb   rc   rl   r   	Namespacer   r   r   r   rM   r   rX   r.   r   r   rZ   Zis_availabler   r'   r%   r1   r1   r1   r2   <module>   s\   Z 
~% )



