o
    iC                     @   sp   d dl Z d dlZd dlZd dlmZ d dlZejej	ej
ejdZdd ZG dd dZ						dd
dZdS )    N)AutoTokenizer)ztorch.int32ztorch.int64ztorch.float32ztorch.float16c                 C   s8   ddl m} ||  | | |  |jj d S )Nr   )cudart)cudar   Z
cudaMemcpydata_ptrZelement_sizeZnelementZcudaMemcpyKindZcudaMemcpyDeviceToDevice)dstsrcr    r   p/home/kim/smarthome/.venv/lib/python3.10/site-packages/onnxruntime/transformers/models/phi2/inference_example.pycuda_memcpy   s   r
   c                   @   sb   e Zd Zdd Zdd Zdd Zdejded	efd
dZ		dddZ
dddZdd Zdd ZdS )ORTGeneratorc                 C   s:   || _ d| _d| _d| _d| _d| _d| _d| _i | _d S )N    P   i   r   F)	onnx_decoder_path	num_heads	head_size
num_layersmax_sequence_length	device_iduse_cuda_graphuse_traced_inputsstatic_inputs_map)selfZdecoder_pathr   r   r	   __init__"   s   
zORTGenerator.__init__c                 C   s  || j v rd S td}td| j}i }tj|dftj|d|d< tjdgtj|d|d< tj|dg tj|d|d< tjdgtj|d|d	< || j| j	| j
f}t| jD ] }tj||tjd
}|d| | d| |  i qVtj|ddftj|d|d< || j |< d S )Ncpur      )dtypedevice	input_idsr   step	seqlens_ktotal_sequence_lengthr   r   	past_key_past_value_   logits)r   torchr   r   zerosint32tensorint64r   r   r   ranger   float16update
contiguousclone)r   
batch_sizeZ
cpu_deviceZcuda_deviceZ	static_ioZcache_shapeicacher   r   r	   append_static_inputs-   s   

,z!ORTGenerator.append_static_inputsc              	   C   s2  | j rtjntj| _tj|d | jtjd}tj|d | jtjd}|j\}}| j	o5|| j
v o5| jo5| j | _| jsEtjdg| jtjdn| j
| d }| js\tj|dg | jtjdn| j
| d }t||ddtj | jstjdgtdtjdn| j
| d	 }||d< | | d
}	| jr| |	d< | j	r| |	d< | |	d	< |	d= | jr| jnd}
| jrd|| j|
| jfn|| j|
| jf}| jst| jD ]0}tj|| j| jd}| js|	d| | d| |  in|	d| | i qn,t| jD ]&}|	d| | j
| d|   d| | j
| d|   i qtj||d| j| jd}d| i}| js| jrWd|| j|| jfn|| j|| jf}t| jD ]0}tj|| j| jd}| js|d| | d| | in|d| | i qd|	|fS )Nr   r!   attention_maskr   r   r   r   r   r    )r   r4      r"   r#   past_r$   r%   present_key_present_value_present_)use_fp16r&   r,   float32torch_dtyper)   r   r(   shaper   r   use_buffer_share	packed_kvr   r*   r
   sumsubtor.   use_stepr   r   r   r+   r   r'   r-   r/   )r   encodings_dictr   r4   r0   sequence_lengthr   r   Ztotal_seq_lengthinputsZpast_seq_lengthZ
past_shaper1   pastr%   outputspresent_shapepresentr   r   r	   get_initial_inputs_and_outputsD   s   
	,z+ORTGenerator.get_initial_inputs_and_outputsmodelrF   rH   c           
   	   C   s  |  }d }| D ](\}}|j||jj|jjdkrdn|jjtt|j t	|j
| d |j}q
| D ]T}|j}	| jrgd|	v rg||	dd }|j|	|jj|jj| jrZtjntjt	|j
| d q7||	 }|j|	|j|jdkrwdn|j| jrtjntjt	|j
| d q7|S )Nr   r   )nameZdevice_typer   Zelement_typer=   Z
buffer_ptrrJ   rG   )
io_bindingitemsZ
bind_inputr   typeindexpt_to_npreprr   tupler=   r   Zget_outputsrM   r>   replaceZbind_outputr:   npr,   r;   )
r   rL   rF   rH   rN   r   kvoutputrM   r   r   r	   apply_io_binding   sD   		zORTGenerator.apply_io_bindingTFc           	      C   s   || _ t }d|_d|_|| _| j dkrd| j | jdfnd}tj| j||gd| _t	 | _
tj r<td| j ntd| _|| _|| _|| _|| _tjd	d
d| _d| j_d S )N   r   ZCUDAExecutionProvider)r   Zenable_cuda_graphZCPUExecutionProvider)sess_options	providersr   r   zmicrosoft/phi-2T)Ztrust_remote_codez[PAD])r   ortZSessionOptionsZlog_verbosity_levelZlog_severity_levelr   InferenceSessionr   sessZ
RunOptionsror&   r   Zis_availabler   r:   r>   r?   rC   r   Zfrom_pretrained	tokenizerZ	pad_token)	r   r   r:   r>   r?   rC   r   r\   epr   r   r	   create_session   s$   

$zORTGenerator.create_sessionc              
   C   s  |  |\}}|d  }|j\}}	|	}
tj|| jtjd}|r"g }d}|
|k r&| | j||}|r7t		 }|
  |rc| jrG| jdd | j|| j | jr`| jd| jr]t|nd d}n| j|| j |  |r|t		 }|||  |d d d dd d f }tj|dd	}||B | jjk}||| jj|d
g}tj||gdd	}t|rnq|
d
7 }
|tj|d< | jrt| j| d |d  | j| d |d< | jrtj|
d
 g| jtj d|d< | jrt| j| d |d  | j| d |d< | jrR|d }|| |d
 tj|d< |
|d d< | jrQt| j| d |d  | j| d |d< |d d | j| d d< | j| d |d< nt|d | |d
gd
tj|d< |d jd
 d
kr|d d d d d
d d f ! |d< | jr| j| d |d< |d "  | j#s"t$| j%D ]-}| j&s|d|  |d| < |d|  |d| < q|d|  |d| < q|d jd
 }| j&rd|| j'|| j(fn|| j'|| j(f}t$| j%D ]2}tj|| j| j)d}| j&s|*d| |! d| | ! in|*d| |! i q|
|k s)|rRt+d| d|	 d||	   t+dd|d   ddt,-|d
d    d d S | jj.|dd}|S )Nr   r!   TZgpu_graph_idz-1Fr%   )dimr   r   r   r    r   r4   r7   r"   r8   r#   r9   r6   r5   zBatch size: z, Sequence length: z, Token num: zPrompt letency: i  zms, Token latency: ms)Zskip_special_tokens)/rK   r/   r=   r&   r'   r   boolrZ   r`   timeZsynchronize_inputsr   ra   Zadd_run_config_entryZrun_with_iobindingr   strZsynchronize_outputsappendZargmaxrb   Zeos_token_idZmasked_fillZreshapecatallrB   r(   r
   r   rC   r)   r*   r.   Zzero_r>   r+   r   r?   r   r   r<   r-   printrV   meanZbatch_decode)r   rD   
max_lengthcuda_graph_annotation	benchmarkrF   rH   Zall_token_idsr0   rE   Zcurrent_lengthZhas_eosZlatencyZ
prompt_runrN   startendZnext_token_logitsZnext_tokensZtokens_to_addZprevious_seqlens_kr1   Znew_sequence_lengthrI   rJ   textsr   r   r	   generate_impl   s   


&h0zORTGenerator.generate_implc                 C   s   | j j|dd}| |||S )NT)padding)rb   Zbatch_encode_plusrv   )r   promptrp   rq   rD   r   r   r	   generatea  s   zORTGenerator.generatec                 C   sx   |\}}|| }i }t jdd||ft jd |d< t j||ft jd |d< | j|||dd | j|||dd d S )	Nr   iX  )r   r   r4   F)rr   T)r&   randintr(   tolistZonesrv   )r   prompt_shape	token_numrq   r0   rE   rp   rD   r   r   r	   generate_benchmarkf  s    zORTGenerator.generate_benchmarkN)TTFFF)F)__name__
__module____qualname__r   r3   rK   r^   r_   dictrZ   rd   rv   ry   r~   r   r   r   r	   r   !   s    ])

r   FTc                    s   t |   |||||  fdd}dg}	|s||	 |r=d}
dD ]} | dD ]}||f} j||
|d q-q$d S d S )Nc                    sZ   t | }r j|d  j| d|d}tt |D ]}td| |  td||  qd S )N)r0      )rp   rq   zPrompt: zTexts: )lenr3   ry   r+   rn   )rx   Zexample_batch_sizeru   r1   	generatorr   r   r	   
simple_run  s   zrun_phi2.<locals>.simple_runzV```python
    def print_prime(n):
    """
    Print all primes between 1 and n
    """r   )r   r5   r[      )   i   )rq   )r   rd   r3   r~   )Zonnx_model_pathr>   r   r?   r:   rC   r   Zrun_benchmarkr   rx   r}   r0   rE   r|   r   r   r	   run_phi2u  s"   

r   )FTFFF)ri   numpyrV   r&   Ztransformersr   Zonnxruntimer^   r(   r*   r;   r,   rR   r
   r   r   r   r   r   r	   <module>   s(     Z