o
    iR                     @   sl  d dl mZ d dlZd dlmZ d dlmZ d dlm	Z	m
Z
 d dlmZmZ d dlmZ d dlmZmZmZmZmZ d d	lmZ eeZG d
d dZG dd dZG dd dZG dd dZG dd dZG dd dZG dd dZ G dd dZ!G dd deZ"G dd deZ#G dd de"Z$G d d! d!e"Z%G d"d# d#e"Z&G d$d% d%e"Z'G d&d' d'eZ(dS )(    )	getLoggerN)DynamoOnnxHelper)Fusion)AttentionOpTypeFusionOptions) FusionBiasSkipLayerNormalizationFusionSkipLayerNormalization)NumpyHelper)
ModelProto	NodeProtoTensorProtohelpernumpy_helper)	OnnxModelc                   @      e Zd Zdd ZdS )ProcessGemmWFuncc                 C   s   t |dS )N   r   )np	transposeselfx r   a/home/kim/smarthome/.venv/lib/python3.10/site-packages/onnxruntime/transformers/onnx_model_phi.py__call__      zProcessGemmWFunc.__call__N__name__
__module____qualname__r   r   r   r   r   r          r   c                   @   r   )ProcessMatMulQFuncc                 C   s   t t |ddd dS )N   r   r   r   r   splitr   r   r   r   r         zProcessMatMulQFunc.__call__Nr   r   r   r   r   r"      r!   r"   c                   @   r   )ProcessMatMulKFuncc                 C      t t |ddd dS )Nr#   r   r   r   r$   r   r   r   r   r      r&   zProcessMatMulKFunc.__call__Nr   r   r   r   r   r'      r!   r'   c                   @   r   )ProcessMatMulVFuncc                 C   r(   )Nr#   r      r   r$   r   r   r   r   r   $   r&   zProcessMatMulVFunc.__call__Nr   r   r   r   r   r)   #   r!   r)   c                   @   r   )ProcessBiasQFuncc                 C      t |ddd }|S )Nr#   r   r   r%   r   r   r   r   r   )      zProcessBiasQFunc.__call__Nr   r   r   r   r   r+   (   r!   r+   c                   @   r   )ProcessBiasKFuncc                 C   r,   )Nr#   r-   r   r.   r   r   r   r   r   /   r/   zProcessBiasKFunc.__call__Nr   r   r   r   r   r0   .   r!   r0   c                   @   r   )ProcessBiasVFuncc                 C   r,   )Nr#   r-   r*   r.   r   r   r   r   r   5   r/   zProcessBiasVFunc.__call__Nr   r   r   r   r   r1   4   r!   r1   c                   @   r   )ProcessRotCacheFuncc                 C   s8   t |jdks	J |jd dkr|d d ddf S |S )Nr*   r       r      )lenshaper   r   r   r   r   ;   s   zProcessRotCacheFunc.__call__Nr   r   r   r   r   r2   :   r!   r2   c                       s  e Zd Zdedee f fddZdefddZdd	 Z	d
d Z
dd Zdd Zd7ddZdd Zdd Zdd Zdee dedee fddZd8dee d ee d!efd"d#Zd8dee d ee d!efd$d%Zd9dee d ee d!efd'd(Zd8dee d ee d!efd)d*Zd8dee d ee d!efd+d,Zd:dee d ee d!efd-d.Zd:dee d ee d!efd/d0Zd:dee d ee d!efd1d2Z		&	3	4d;dee d ee d!efd5d6Z  ZS )<Fissionmodelnodes_to_findc                    s   t  |d| d S )NZDONOTUSEsuper__init__)r   r8   r9   	__class__r   r   r<   E   s   zFission.__init__attn_op_typec                 C   s
   || _ d S N)r?   )r   r?   r   r   r   set_attention_op_typeL   s   
zFission.set_attention_op_typec                 C   s   |d t | S )N_)str)r   layer_idnamer   r   r   	get_unameO   s   zFission.get_unamec                 C   s>   |D ]}||ks| |s||r|  S qtd| d)NzEdge z
 not found)endswith
startswith
ValueError)r   edgesrE   edger   r   r   get_edge_by_nameR   s
   zFission.get_edge_by_namec                 C      |  |j|S r@   )rL   inputr   noderE   r   r   r   get_input_by_nameX      zFission.get_input_by_namec                 C   rM   r@   )rL   outputrO   r   r   r   get_output_by_name[   rR   zFission.get_output_by_nameNc                 C   sd   | j |}t|}||}tj|d u r|d n|tj|j|	 
 dd}| j || j |jS )NZ
_processedTZ	data_typedimsvalsraw)r8   get_initializerr	   to_arrayr   make_tensorr   FLOATr6   flattentobytesadd_initializerthis_graph_namerE   )r   Zinitializer_nameZfunctorZcustom_nameiZ
i_np_arrayZprocessed_i_np_arrayZ
new_tensorr   r   r   process_initializer^   s   

zFission.process_initializerc                 C   &   | j  j }||_tj|jj_	d S r@   )
r8   graph
value_infoaddrE   r   r\   typetensor_type	elem_typer   rE   new_value_infor   r   r   add_fp32_value_infol      zFission.add_fp32_value_infoc                 C   rc   r@   )
r8   rd   re   rf   rE   r   INT64rg   rh   ri   rj   r   r   r   add_int64_value_infoq   rm   zFission.add_int64_value_infoc                 C   s\   | j  jD ]}|j|kr| j  j|  nqtj|tj|d}| j  j	|g d S )Nri   r6   )
r8   rd   re   rE   remover   make_tensor_value_infor   r\   extend)r   rE   r6   re   rk   r   r   r   replace_fp32_value_infov   s   
zFission.replace_fp32_value_infosubgraph_nodesrD   layer_known_edges_namesc                 C   s   |D ]_}t |jD ]\}}|dkrq	||vr'| |||j|< | |j|  q	t |jD ]\}}|dkr6q-||vrK| |||j|< | |j|  q-| ||j|_| j| | j| j	|j< qd S )N )
	enumeraterN   rF   rl   rS   rE   nodes_to_addappendr`   node_name_to_graph_name)r   ru   rD   rv   new_nodera   rE   r   r   r   set_unique_name_and_add_nodes   s&   z%Fission.set_unique_name_and_add_nodesrw   inputsoutputsprefixc                 C   s>   t |dksJ t |dksJ tjd|||d dd}|gS )Nr#   r   LayerNormalizationZ_LayerNormalizationg   >)r~   r   rE   epsilonr5   r   	make_noder   r~   r   r   rP   r   r   r   	layernorm      zFission.layernormc                 C   sr   t |dksJ t |dksJ tjd|d |d g|d g|d d}tjd|d |d g||d	 d}||gS )
Nr#   r   ZMatMulr   Z
matmul_outr~   r   rE   Addr*   ZBiasr   )r   r~   r   r   matmulrf   r   r   r   gemm   s   zFission.gemmr3   c              	   C   sB   t |dksJ t |dksJ tjd|||d d||d}|gS )N   r   ZRotaryEmbeddingcom.microsoft)r~   r   rE   domainrotary_embedding_dim	num_headsr   )r   r~   r   r   Zrot_dimr   rP   r   r   r   rotary      	zFission.rotaryc                 C   s>   t |dksJ t |dksJ tjd|||d dd}|gS )Nr   FastGelur   )r~   r   rE   r   r   r   r   r   r   fastgelu   r   zFission.fastgeluc                 C   s<   t |dksJ t |dksJ tjd|||d d}|gS )Nr*   r   r   r   r   r   r   r   r   rf      s   zFission.addc              	   C   sB   t |dksJ t |dksJ tjd|||d d|dd}|gS )N   r#   MultiHeadAttentionr   r   )r~   r   rE   r   r   unidirectionalr   r   r~   r   r   r   rP   r   r   r   mha   r   zFission.mhac              	   C   sB   t |dksJ t |dksJ tjd|||d d||d}|gS )N   r#   GroupQueryAttentionr   )r~   r   rE   r   r   Zkv_num_headsr   r   r   r   r   gqa   r   zFission.gqac                 C   sF   t |dksJ t |dksJ tjd|||d d|dddd	}|gS )N   r*   	Attentionr   r   r3   )r~   r   rE   r   r   r   Z	do_rotaryr   r   r   r   r   r   	attention   s   zFission.attentionP      %?c                 C   sF   t |dksJ t |dksJ tjd|||d d||||d	}|gS )N   r   PagedAttentionzvllm.ort.ext)r~   r   rE   r   r   Znum_kv_heads	head_sizescaler   )r   r~   r   r   r   r   r   rP   r   r   r   
paged_attn  s   	zFission.paged_attnr@   )rw   )rw   r3   r3   )rw   r3   )rw   r3   r   r   )r   r   r    r   listrC   r<   r   rA   rF   rL   rQ   rT   rb   rl   ro   rt   r   intr}   r   r   r   r   rf   r   r   r   r   __classcell__r   r   r=   r   r7   D   sR    

        r7   c                       s\   e Zd Zdededef fddZdefddZd	d
 Zde	fddZ
de	fddZ  ZS )Phi2PreProcessorr8   r   hidden_sizec                    s(   t  | d| _|| _|| _d| _d S )Nr3   Zmodeling_phi_PhiModel_model_1)r;   r<   num_hidden_layersnum_attention_headsr   	func_namer   r8   r   r   r=   r   r   r<     s
   
zPhi2PreProcessor.__init__returnc                 C   s   i }d|d< d|d< d|d< d|d< t d	| jd	D ],}d
| |d| < d| |d| < d| |d| d< d| |d| d< qdd | jjjD }d|v rbd|v rbd|d< d|d< |S d|v rjd|v slJ d|d< d|d< |S )NZlogitsZ	lm_head_1	input_idsZl_input_ids_Z
past_key_0Z
key_statesZpast_value_0Zvalue_statesr   Z	past_key_Zkey_states_Zpast_value_Zvalue_states_Zpresent_key_Zmodel_layers__1Zpresent_value_Z_1_1c                 S   s   g | ]}|j qS r   rE   ).0or   r   r   
<listcomp>2  s    z7Phi2PreProcessor.get_phi2_edge_dict.<locals>.<listcomp>Zmodel_layers_0_1_1Zmodel_layers_0_1_2Zpresent_key_0Zpresent_value_0Zmodel_layers_0_1)ranger   r8   rd   rS   )r   Z	edge_dictra   r   r   r   r   get_phi2_edge_dict&  s&   z#Phi2PreProcessor.get_phi2_edge_dictc                 C   s<   d}| j jjD ]}|j|}|dkr|j|d  |_qd S )NZ)modeling_phi_PhiDecoderLayer_model_layersr-   )r8   rd   rP   op_typefind)r   Zphi2_transformer_layer_namerP   indexr   r   r   simplify_phi2_op_type<  s   z&Phi2PreProcessor.simplify_phi2_op_typer?   c              
   C   s  |t jk| _|t jk| _| jj}g }|jD ]}d|jv rkt	j
|j| js&tjntjddgd}t	j
dtjdgd}t	j
dtjddgd}t	j
dtjddgd}t	j
d	tjdgd}	| jsc||||gn||||	g | jrd
|jv rt	j
|jd
d|jjjdd| jd| j| j gd}
||
g q| jrd
|jv rt	j
|j|jjjg dd}
||
g d|jv rt	j
|j|jjjg dd}
||
g qd
|jv sd|jv rt	j
|j|jjjd| jd| j| j gd}
||
g q|d |j| g }t|jD ]_\}}|dkr||g q| jr?d|jv r=t	j
|jdd|jjjdd| jd| j| j gd}
||
g q| jrEqt	j
|j|jjjd| jd| j| j gd}
||
g q|d |j| d S )Nr   
batch_sizeseq_lenrp   stepr   position_idsattention_maskinput_metadatapast_keyZpastr*   Zpast_seq_len)
num_blocksr   Zhead_size_x
block_sizeZblock_x
past_value)r   r   r   r   rN   r   present_keyZpresentZtotal_seq_lenrS   )r   r   Zuse_attnr   Zuse_vllmr8   rd   rN   rE   r   rr   r   INT32rn   rs   replacerg   rh   ri   r   r   Z
ClearFieldrx   rS   )r   r?   rd   Z
new_inputsviZvi_iidZvi_stepZvi_pidZvi_maskZvi_metaZvi_cacheZnew_outputsra   r   r   r   process_graph_ioC  s   














z!Phi2PreProcessor.process_graph_ioc                 C   s~   d }| j jD ]}|j| jr|j} nq|d usJ | | | |   |   | 	  |t
jkr8|   | | d S r@   )r8   Z	functionsrE   rG   r   Zunroll_functionZupdate_edgesr   r   Zremove_dropout_layerr   r   Zremove_lm_head_layerr   )r   r?   Zfunction_namefuncr   r   r   preprocess_onnx  s   

z Phi2PreProcessor.preprocess_onnx)r   r   r    r
   r   r<   dictr   r   r   r   r   r   r   r   r=   r   r     s    }r   c                       *   e Zd Zdef fddZdd Z  ZS )FissionTransformerEmbeddingPhir8   c                       t  |dg d S )NZ6torch_nn_modules_sparse_Embedding_model_embed_tokens_1r:   r   r8   r=   r   r   r<        z'FissionTransformerEmbeddingPhi.__init__c           	      C   s   t d|j t|jdksJ t|jdksJ |jd }|jd }| |d}|||g}tjd||g|gddg}| 	|d| | j
| d	| _d S )
NOptimizing %s...r*   r   r   zembed_tokens.weightGatherZEmbedding_Gatherr   T)loggerinforE   r5   rN   rS   rQ   r   r   r}   nodes_to_removerz   prune_graph)	r   rP   input_name_to_nodesoutput_name_to_noderN   rS   Z	embeddingrv   ru   r   r   r   fuse  s"   


	
z#FissionTransformerEmbeddingPhi.fuser   r   r    r   r<   r   r   r   r   r=   r   r     
    r   c                       r   )FissionTransformerLayerNormPhir8   c                    r   )NZ@torch_nn_modules_normalization_LayerNorm_model_final_layernorm_1r:   r   r=   r   r   r<     r   z'FissionTransformerLayerNormPhi.__init__c           
      C   s   t d|j t|jdksJ t|jdksJ |jd }|jd }| |d}| |d}||||g}g }	|	| |||g|gd | 	|	d| | 
|g d	 | 
|g d	 | j| d
| _d S )Nr   r#   r   r   zfinal_layernorm.weightzfinal_layernorm.biasFinalc   r   r   r   T)r   r   rE   r5   rN   rS   rQ   rs   r   r}   rt   r   rz   r   )
r   rP   r   r   rN   rS   	ln_weightln_biasrv   ru   r   r   r   r     s   


z#FissionTransformerLayerNormPhi.fuser   r   r   r=   r   r     r   r   c                       r   )!FissionTransformerCausalLMHeadPhir8   c                    r   )NZ(torch_nn_modules_linear_Linear_lm_head_1r:   r   r=   r   r   r<     r   z*FissionTransformerCausalLMHeadPhi.__init__c           
      C   s   t d|j t|jdksJ t|jdksJ |jd }|jd }| | |dt }| |d}||||g}g }	|		| 
|||g|gd | |	d	| | |g d
 | |g d | j| d| _d S )Nr   r   r   r*   r   zlm_head.weightzlm_head.biasZLMHead_r   r   )r   r   i   T)r   r   rE   r5   rN   rS   rb   rQ   r   rs   r   r}   rt   r   rz   r   )
r   rP   r   r   rN   rS   Z	fc_weightZfc_biasrv   ru   r   r   r   r     s   


z&FissionTransformerCausalLMHeadPhi.fuser   r   r   r=   r   r     r   r   c                       sF   e Zd Zdedef fddZdd Zdd Zd	d
 Zdd Z	  Z
S )FissionTransformerBlockPhir8   r   c                    sT   || _ d}i | _g }t|D ]}d| d}|| || j|< qt || d S )Nr3   Z*modeling_phi_PhiDecoderLayer_model_layers_r   )r   func_to_layer_idr   rz   r;   r<   )r   r8   r   Zmax_num_layersr9   layerr   r=   r   r   r<   5  s   
z#FissionTransformerBlockPhi.__init__c                 C   s   | j |j S r@   )r   r   )r   rP   r   r   r   get_layer_idE  r   z'FissionTransformerBlockPhi.get_layer_idc                 C   s   t jddgdgdtjdt jdddgdgd	d
t jdddgdgdd
t jddgdgdtjdt jddgdgdd
t jdddgdgdddt jddgdgdtjdg}|S )NZCastr   Z
mask_int64ZCast_gqa_aux_0)r~   r   rE   toZ	ReduceSumoneZmask_row_sumsZReduceSum_gqa_auxr   SubZseqlens_k_int64ZSub_gqa_aux	seqlens_kZCast_gqa_aux_1ZShapeZ
mask_shapeZShape_gqa_aux_0r   Ztotal_seq_len_int64ZGather_gqa_aux_0r   )r~   r   rE   axistotal_sequence_lengthZCast_gqa_aux_2)r   r   r   rn   r   )r   gqa_aux_nodesr   r   r   get_gqa_aux_nodesH  sV   +z,FissionTransformerBlockPhi.get_gqa_aux_nodesc	                 C   sX  | j |}	| j |}
| j |}tt|	d}tt|
d}tt|d}tj|||fdd}| j |}| j |}| j |}t|}t|}t|}tj|||fdd}|jd }tj	|t
j||d g|  dd}| j || j tj	|t
j|d g|  dd}| j || j | |j | |j ||fS )Nr   r   )r   r   r#   TrU   )r8   rY   r   r   r	   rZ   stackr6   r   r[   r   r\   r]   r^   r_   r`   rl   rE   )r   Zq_wZk_wZv_wZq_bZk_bZv_bZweight_nameZ	bias_nameZq_weightZk_weightZv_weightZqwkwZvwZ
qkv_weightZq_biasZk_biasZv_biasZqbkbZvbZqkv_biasr   weightZbiasr   r   r   pack_qkv_gemmv  sD   






z(FissionTransformerBlockPhi.pack_qkv_gemmc           $      C   s  t d|j t d| j  | |}|jd }| |d}| |d}|jd }| |d}	| |d}
| |d	}| |d
}d\}}}}}}d\}}d\}}| jt	j
kr| | |dt }| | |dt }| | |dt }| |d}| |d}| |d}| | |dt }| | |dt }n.| | |d| |d| |d| |d| |d| |d| |d| |d\}}| | |dt }| |d}| | |dt }| | |dt }| |d}| |d}g }||||g |||	|
g |||g | jt	j
kr2|||||||||g n|||g |||||||g |g d g }|| |||gdg || d||gd gd! || d||gd"gd# || d"gd$g || d$||gd%gd& || d d%gd'gd( || |d'g|gd) | jt	j
kr|| d||gd*gd+ || d||gd,gd- || d||gd.gd/ | jt	jkrd0nd1}|| d*|||gd2gd+ || d,|||gd3gd- | jt	jkr|| d2d3d.d4d5d4||gd|	|
g n| jt	jkri|| d2d3d.||d6d7gd|	|
g |dkrh|  } | D ]}!| j|! | j| j|!j< qD| j !t"j#t$j%d8gd9d:d;d<| j n9| jt	jkr|| &d2d3d.||d=gdg n!d>| }"d?| }#||"|#g || 'd||d5|"gd|#g | (||| | )|g d@ | )|g d@ | j*| dA| _+d S )BNr   zAttentionOpType: r   r   r   r-   r   Zpresent_valuezinput_layernorm.weightzinput_layernorm.bias)NNNNNN)NNzself_attn.q_proj.weightzself_attn.k_proj.weightzself_attn.v_proj.weightzself_attn.q_proj.biaszself_attn.k_proj.biaszself_attn.v_proj.biaszrotary_emb.cos_cachedzrotary_emb.sin_cachedattn_qkv_weightattn_qkv_biaszself_attn.dense.weightzself_attn.dense.biaszmlp.fc1.weightzmlp.fc2.weightzmlp.fc1.biaszmlp.fc2.bias)r   r   r   r   r   r   Zln_outZattn_outZattn_add_outZOutProj_Zfc1_outZFC1_Zgelu_outZfc2_outZFC2_Zresidual_1_outZ
Residual_1Z
Residual_2queryZQ_keyZK_valueZV_r   r   Z	query_rotZkey_rotrw   r   r   r   r   Zint64)Zdtyper   r   r   Zpast_Zpresent_r   T),r   r   rE   r?   r   rN   rQ   rS   rT   r   r   rb   r   r2   r   rF   rs   r   r   r   rf   r   r   r   r   r   r   r   ry   rz   r`   r{   r8   r_   r   Z
from_arrayr   arrayr   r   r}   rt   r   r   )$r   rP   r   r   rD   Zi_hidden_statesZi_key_cacheZi_value_cacheZo_hidden_statesZo_key_cacheZo_value_cacher   r   Zattn_q_weightZattn_q_biasZattn_k_weightZattn_k_biasZattn_v_weightZattn_v_biasr   r   Z	cos_cacheZ	sin_cacheZattn_out_weightZattn_out_biasZmlp_fc1_weightZmlp_fc2_weightZmlp_fc1_biasZmlp_fc2_biasrv   ru   Zpos_ids_namer   r|   Z	past_nameZpresent_namer   r   r   r     s  










	



zFissionTransformerBlockPhi.fuse)r   r   r    r   r   r<   r   r   r   r   r   r   r   r=   r   r   4  s    .*r   c                       sX   e Zd Zdededef fddZddedB d	ef fd
dZdd Z	dddZ
  ZS )PhiOnnxModelr8   r   r   c                    sJ   t  | t| j||| _t| || _t| | _t	| | _
t| | _d S r@   )r;   r<   r   r8   phi2_preprocessorr   fission_transformer_blockr   fission_causal_lm_headr   fission_transformer_layernormr   fission_transformer_embeddingr   r=   r   r   r<   O  s   

zPhiOnnxModel.__init__NFoptionsadd_dynamic_axesc                    s   |d usJ |j }| j| | j| | j  | j  | j  | j  t	 
  t| | _t| | _| j  | j  d S r@   )Zattention_op_typer   rA   r   r   applyr  r   r  r;   r   r   Zfuse_slnr   Zfuse_bias_sln)r   r  r  r?   r=   r   r   optimizeW  s   







zPhiOnnxModel.optimizec                 C   s@   i }g d}|D ]}|  |}t|||< qtd|  |S )z8
        Returns node count of fused operators.
        )	r   r   r   r   GeluBiasGelur   r   SkipLayerNormalizationzOptimized operators: )Zget_nodes_by_op_typer5   r   r   )r   op_countopsopnodesr   r   r   get_fused_operator_statisticsl  s   
z*PhiOnnxModel.get_fused_operator_statisticsc                    s    du r|    dtf fdd}|d|d |d |d }|d	|d
 |d }|d|d }|dko@||ko@||k}|dkrJtd |dkrStd |dkr\td |S )zA
        Returns True when the model is fully optimized.
        Nop_namec                    s     | pdS )Nr   )get)r  fused_op_countr   r   r
    rR   z1PhiOnnxModel.is_fully_optimized.<locals>.op_countr   r   r   r   r  r  r   r   r	  r   zLayer Normalization not fusedzGelu (or FastGelu) not fusedz+Attention (or MultiHeadAttention) not fused)r  rC   r   debugwarning)r   r  r
  r   ZgeluZ
layer_normZ
is_perfectr   r  r   is_fully_optimized  s*   


zPhiOnnxModel.is_fully_optimized)NFr@   )r   r   r    r
   r   r<   r   boolr  r  r  r   r   r   r=   r   r   N  s
    r   ))loggingr   numpyr   Zdynamo_onnx_helperr   Zfusion_baser   Zfusion_optionsr   r   Zfusion_skiplayernormr   r   Zfusion_utilsr	   Zonnxr
   r   r   r   r   Z
onnx_modelr   r   r   r   r"   r'   r)   r+   r0   r1   r2   r7   r   r   r   r   r   r   r   r   r   r   <module>   s:   
 Z 4"!!  