# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.  All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from logging import getLogger

import numpy as np
from fusion_base import Fusion
from fusion_options import AttentionMaskFormat
from fusion_utils import FusionUtils, NumpyHelper
from onnx import NodeProto, TensorProto, helper, numpy_helper
from onnx_model import OnnxModel

logger = getLogger(__name__)


class AttentionMask:
    """
    Fuse Attention subgraph into one Attention node.
    """

    def __init__(self, model: OnnxModel):
        self.model = model
        # A lookup table with the mask input as key, and the mask index output as value.
        self.mask_indice = {}
        # A lookup table with the mask input as key, and the cast (to int32) output as value.
        self.mask_casted = {}
        self.utils = FusionUtils(model)
        self.mask_format = AttentionMaskFormat.MaskIndexEnd
        self.opset_version = model.get_opset_version()

    def set_mask_format(self, mask_format: AttentionMaskFormat):
        self.mask_format = mask_format

    def set_mask_indice(self, mask, mask_index):
        if mask in self.mask_indice:
            assert mask_index == self.mask_indice[mask]
        self.mask_indice[mask] = mask_index

    def get_first_mask(self):
        assert len(self.mask_indice) > 0
        return next(iter(self.mask_indice))

    def process_mask(self, mask_2d: str) -> str | None:
        if self.mask_format == AttentionMaskFormat.NoMask:
            return None

        if mask_2d in self.mask_indice:
            return self.mask_indice[mask_2d]

        # Add a cast to convert the mask to int32.
        if self.model.find_graph_input(mask_2d):
            casted, input_name = self.utils.cast_graph_input_to_int32(mask_2d)
        else:
            input_name, _cast_node = self.utils.cast_input_to_int32(mask_2d)
            casted = True

        if casted:
            self.mask_casted[mask_2d] = input_name

        # Attention supports int32 attention mask (2D) since 1.4.0.
        if self.mask_format == AttentionMaskFormat.AttentionMask:
            self.mask_indice[mask_2d] = input_name
            return input_name

        # Add a mask processing node to convert the attention mask (2D) to a mask index (1D).
        output_name = self.model.create_node_name("mask_index")
        if self.opset_version < 13:
            mask_index_node = helper.make_node(
                "ReduceSum",
                inputs=[input_name],
                outputs=[output_name],
                name=self.model.create_node_name("ReduceSum", "MaskReduceSum"),
            )
            mask_index_node.attribute.extend([helper.make_attribute("axes", [1]), helper.make_attribute("keepdims", 0)])
        else:
            # ReduceSum-13: axes is moved from an attribute to an input.
            axes_name = "ort_const_1_reduce_sum_axes"
            if self.model.get_initializer(axes_name) is None:
                self.model.add_initializer(
                    helper.make_tensor(name=axes_name, data_type=TensorProto.INT64, dims=[1], vals=[1], raw=False)
                )
            mask_index_node = helper.make_node(
                "ReduceSum",
                inputs=[input_name, axes_name],
                outputs=[output_name],
                name=self.model.create_node_name("ReduceSum", "MaskReduceSum"),
            )
            mask_index_node.attribute.extend([helper.make_attribute("keepdims", 0)])

        self.model.add_node(mask_index_node)

        self.mask_indice[mask_2d] = output_name
        return output_name


class FusionAttention(Fusion):
    """
    Fuse Attention subgraph into one Attention node.
    """

    def __init__(
        self,
        model: OnnxModel,
        hidden_size: int,
        num_heads: int,
        attention_mask: AttentionMask | None = None,
        use_multi_head_attention: bool = False,
        disable_multi_head_attention_bias: bool = False,
        search_op_types: list[str] = ["SkipLayerNormalization", "LayerNormalization"],
    ):
        attention_op_name = "MultiHeadAttention" if use_multi_head_attention else "Attention"
        super().__init__(model, attention_op_name, search_op_types)

        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.attention_mask = attention_mask if attention_mask else AttentionMask(model)
        self.use_multi_head_attention = use_multi_head_attention
        self.disable_multi_head_attention_bias = disable_multi_head_attention_bias
        self.mask_filter_value = None

        # Flags to show each warning only once.
        self.num_heads_warning = True
        self.hidden_size_warning = True

        self.shape_infer = None
        self.shape_infer_done = False

    def get_num_heads_and_hidden_size_from_concat(self, concat: NodeProto) -> tuple[int, int]:
        """
        Detect num_heads and hidden_size from Concat node in the following subgraph:

        SkipLayerNormalization or EmbedLayerNormalization
                        /        |
                     MatMul    Shape
                        |        |
                       Add     Gather(indices=0)
                        |        |
                        |      Unsqueeze
                        |        |
                        |     Concat (*, -1, 12, 64)
                        |     /
                       Reshape
                          |
                       Transpose
        """
        if len(concat.input) == 4:
            num_heads = self.model.get_constant_value(concat.input[2])
            head_size = self.model.get_constant_value(concat.input[3])
            if (
                isinstance(num_heads, np.ndarray)
                and num_heads.size == 1
                and isinstance(head_size, np.ndarray)
                and head_size.size == 1
            ):
                return num_heads[0], num_heads[0] * head_size[0]

        return self.num_heads, self.hidden_size
    def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> tuple[int, int]:
        """Detect num_heads and hidden_size from a reshape node.

        Args:
            reshape_q (NodeProto): reshape node for Q

        Returns:
            Tuple[int, int]: num_heads and hidden_size
        """
        # We assume that reshape fusion has been done, so the target shape is a tensor like [0, 0, num_heads, head_size].
        q_shape_value = self.model.get_constant_value(reshape_q.input[1])
        if q_shape_value is None:
            concat = self.model.get_parent(reshape_q, 1)
            if concat is not None and concat.op_type == "Concat":
                return self.get_num_heads_and_hidden_size_from_concat(concat)
            logger.debug("%s is not initializer.", reshape_q.input[1])
            return self.num_heads, self.hidden_size  # Fall back to user specified value

        if (
            not isinstance(q_shape_value, np.ndarray)
            or len(q_shape_value) != 4
            or q_shape_value[2] <= 0
            or q_shape_value[3] <= 0
        ):
            logger.debug("q_shape_value=%s. Expected value are like [0, 0, num_heads, head_size].", q_shape_value)
            return self.num_heads, self.hidden_size  # Fall back to user specified value

        num_heads = q_shape_value[2]
        head_size = q_shape_value[3]
        hidden_size = num_heads * head_size

        if self.num_heads > 0 and num_heads != self.num_heads:
            if self.num_heads_warning:
                logger.warning(
                    "--num_heads is %d. Detected value is %d. Using detected value.", self.num_heads, num_heads
                )
                self.num_heads_warning = False  # Do not show the warning more than once

        if self.hidden_size > 0 and hidden_size != self.hidden_size:
            if self.hidden_size_warning:
                logger.warning(
                    "--hidden_size is %d. Detected value is %d. Using detected value.", self.hidden_size, hidden_size
                )
                self.hidden_size_warning = False  # Do not show the warning more than once

        return num_heads, hidden_size
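
    # A hedged example of the expected Reshape target (values illustrative): a
    # BERT-base query projection reshapes (batch, seq, 768) into
    # (batch, seq, 12, 64), so its shape initializer is [0, 0, 12, 64] and the
    # method above returns (12, 768):
    #
    #     q_shape_value = np.array([0, 0, 12, 64], dtype=np.int64)
    #     num_heads, head_size = q_shape_value[2], q_shape_value[3]  # 12, 64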

zFusionAttention.get_add_qk_strc                    s    d t tfdd| j}t|dkrS t|dks J | jd}tjd fddt| j	D g|dd	}| j
| | j| j|< S )
NZ_maskc                    s   | j d  kS r   )output)node)mask_output_namer   r   <lambda>   s    z0FusionAttention.reshape_add_qk.<locals>.<lambda>r*   r   r_   c                    s   g | ]} qS r   r   ).0_)re   r   r   
<listcomp>   s    z2FusionAttention.reshape_add_qk.<locals>.<listcomp>r&   r'   r(   axis)listfilternodes_to_addr   r   r0   r	   r1   rangerC   appendthis_graph_namenode_name_to_graph_name)r   re   Zconcat_nodeconcat_node_nameZconcat_add_qk_fp32r   )re   rj   r   reshape_add_qk   s    zFusionAttention.reshape_add_qkpast_kpast_vc                 C   s   | j d}| j d}|d dd}|d dd}tjd|g|g|dgd}tjd|g|g|dgd}| j| | j| | j| j|< | j| j|< | j d}	|dd	ddd
d}
tjd||g|
g|	dd}| j| | j| j|	< |
S )zConcatenate past_k and past_v inputs to create past_kv input.

        Args:
            past_k (str): name of past K value
            past_v (str): name of past V value

        Returns:
            kv_output_name (str): name of past KV value
        """
        # Unsqueeze past_k and past_v from (B, N, P, H) to (1, B, N, P, H).
        unsqueeze_k_name = self.model.create_node_name("Unsqueeze")
        unsqueeze_v_name = self.model.create_node_name("Unsqueeze")
        k_5d_name = (past_k + "_5d").replace(".", "_")
        v_5d_name = (past_v + "_5d").replace(".", "_")

        k_5d = helper.make_node(
            "Unsqueeze",
            inputs=[past_k],
            outputs=[k_5d_name],
            name=unsqueeze_k_name,
            axes=[0],
        )
        v_5d = helper.make_node(
            "Unsqueeze",
            inputs=[past_v],
            outputs=[v_5d_name],
            name=unsqueeze_v_name,
            axes=[0],
        )

        # Add the unsqueeze nodes to the graph.
        self.nodes_to_add.append(k_5d)
        self.nodes_to_add.append(v_5d)
        self.node_name_to_graph_name[unsqueeze_k_name] = self.this_graph_name
        self.node_name_to_graph_name[unsqueeze_v_name] = self.this_graph_name

        # Concat past_k and past_v into one past_kv input of shape (2, B, N, P, H).
        concat_node_name = self.model.create_node_name("Concat")
        kv_output_name = past_v.replace(".value", ".kv_value").replace("_value", "_kv_value")
        concat_kv = helper.make_node(
            "Concat",
            inputs=[k_5d_name, v_5d_name],
            outputs=[kv_output_name],
            name=concat_node_name,
            axis=0,
        )

        # Add the concat node to the graph.
        self.nodes_to_add.append(concat_kv)
        self.node_name_to_graph_name[concat_node_name] = self.this_graph_name

        return kv_output_name
    def split_kv(self, present_k_name: str, present_v_name: str, kv_node: str):
        """Split kv_node containing present KV values into separate present K and present V values.

        Args:
            present_k_name (str): name of output to store present K value in
            present_v_name (str): name of output to store present V value in
            kv_node (str): name of present KV values
        """
        # Create initializers for indexing into kv_node: kv_node[0] is present K and kv_node[1] is present V.
        k_index, v_index = "index_0", "index_1"
        k_dim = self.model.get_initializer(k_index)
        v_dim = self.model.get_initializer(v_index)
        if k_dim is None:
            k_dim = numpy_helper.from_array(np.array(0, dtype="int64"), name=k_index)
            self.model.add_initializer(k_dim, self.this_graph_name)
        if v_dim is None:
            v_dim = numpy_helper.from_array(np.array(1, dtype="int64"), name=v_index)
            self.model.add_initializer(v_dim, self.this_graph_name)

        # Create the Gather nodes that index into kv_node.
        gather_k_name = self.model.create_node_name("Gather")
        gather_v_name = self.model.create_node_name("Gather")
        present_k = helper.make_node(
            "Gather",
            inputs=[kv_node, k_index],
            outputs=[present_k_name],
            name=gather_k_name,
            axis=0,
        )
        present_v = helper.make_node(
            "Gather",
            inputs=[kv_node, v_index],
            outputs=[present_v_name],
            name=gather_v_name,
            axis=0,
        )

        # Add the gather nodes to the graph.
        self.nodes_to_add.append(present_k)
        self.nodes_to_add.append(present_v)
        self.node_name_to_graph_name[gather_k_name] = self.this_graph_name
        self.node_name_to_graph_name[gather_v_name] = self.this_graph_name

    def create_combined_qkv_bias(
        self,
        q_add: NodeProto,
        k_add: NodeProto | None,
        v_add: NodeProto | None,
        name_prefix: str,
    ) -> str | None:
        q_bias = self.model.get_initializer(q_add.input[1]) or self.model.get_initializer(q_add.input[0])
        qb = NumpyHelper.to_array(q_bias)
        kb = np.zeros_like(qb)
        vb = np.zeros_like(qb)
        if k_add is not None:
            k_bias = self.model.get_initializer(k_add.input[1]) or self.model.get_initializer(k_add.input[0])
            kb = NumpyHelper.to_array(k_bias)
        if v_add is not None:
            v_bias = self.model.get_initializer(v_add.input[1]) or self.model.get_initializer(v_add.input[0])
            vb = NumpyHelper.to_array(v_bias)

        qkv_bias = np.stack((qb, kb, vb), axis=0)
        qkv_bias_dim = 3 * np.prod(qb.shape)

        bias_name = name_prefix + "_qkv_bias"
        self.add_initializer(
            name=bias_name,
            data_type=q_bias.data_type,
            dims=[int(qkv_bias_dim)],
            vals=qkv_bias,
        )
        return bias_name
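
    # A NumPy sketch of the combined bias layout built above (sizes are
    # illustrative): three biases of length 768 are stacked in Q, K, V order
    # into one initializer whose flattened length is 3 * 768:
    #
    #     qb, kb, vb = np.full(768, 1.0), np.full(768, 2.0), np.full(768, 3.0)
    #     qkv_bias = np.stack((qb, kb, vb), axis=0)  # shape (3, 768)
    #     int(3 * np.prod(qb.shape))                 # qkv_bias_dim == 2304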
    def create_packed_qkv_matmul_node(
        self,
        q_matmul: NodeProto,
        k_matmul: NodeProto,
        v_matmul: NodeProto,
        q_add: NodeProto,
        k_add: NodeProto | None,
        v_add: NodeProto | None,
    ) -> tuple[NodeProto, NodeProto, NodeProto]:
        """Create packed QKV MatMul node before MultiHeadAttention node.
           because past_key and past_value are separate inputs and not one concatenated input.

        Args:
            q_matmul (NodeProto): name of MatMul from Q path - (batch_size, sequence_length, hidden_size)
            k_matmul (NodeProto): name of MatMul from K path - (batch_size, sequence_length, hidden_size)
            v_matmul (NodeProto): name of MatMul from V path - (batch_size, sequence_length, hidden_size)
            q_add (NodeProto): name of Add from Q path
            k_add (NodeProto): name of Add from K path
            v_add (NodeProto): name of Add from V path

        Returns:
             q_output (NodeProto): Slice node for Q
             k_output (NodeProto): Slice node for K
             v_output (NodeProto): Slice node for V
        MatMulr   r*   r   rU   _qkv_weightr   Z_qkv_outr%   Z_q_start_indexFr+   Z_k_start_indexZ_v_start_indexrT   Z_end_of_qkv_indexZ_qkv_last_axisZ_q_outSliceZ_k_outZ_v_outN)r   r0   rV   r5   r   r   r   rY   r   Zreshaper6   r,   r	   r1   rv   rw   r   r7   r(   r3   rF   anyru   rs   )#r   r   r   r   r   r   r   Zmatmul_node_nameq_weightk_weightv_weightqwkwvwd
qkv_weightZqkv_weight_nameZqkv_matmul_outputZ
qkv_matmul	qkv_nodesZq_slice_nameZk_slice_nameZv_slice_nameZend_of_qkv_nameZqkv_last_axis_nameZq_slice_outputq_sliceZk_slice_outputk_sliceZv_slice_outputv_sliceZq_outputZk_outputZv_outputZinitializer_inputr   r   r   create_packed_qkv_matmul_node  s   ,



"






 

 

 

z-FusionAttention.create_packed_qkv_matmul_node rh   key_padding_maskunidirectionalr   r   
packed_qkvc                 C   s,  |dksJ |dkr|| dkrt d|| dS dd | j jD }| jd}g }|rM| ||||||\}}}||jd |jd |jd g n`t	|t
r~t	|t
r~| jrl||jd |jd |jd g nA||jd |jd |jd g n/t	|trt	|tr||v r||v r| jr||jd ||g n||jd ||g ndS | js| ||||}|| n|d |r|r||
|||g n|
s|r||
|g |	g}|r|r|||g tjd|||d	}d
|_|jtd| |r|jtdt| | d |S )a  Create a MultiHeadAttention node.

        Args:
            q_matmul (NodeProto): name of MatMul from Q path - (batch_size, sequence_length, hidden_size)
            k_matmul (NodeProto): name of MatMul from K path - (batch_size, sequence_length, hidden_size) or (batch_size, num_heads, past_sequence_length, head_size)
            v_matmul (NodeProto): name of MatMul from V path - (batch_size, sequence_length, hidden_size) or (batch_size, num_heads, past_sequence_length, head_size)
            q_add (NodeProto): name of Add from Q path
            k_add (NodeProto): name of Add from K path
            v_add (NodeProto): name of Add from V path
            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
            output (str): output name of MHA
            key_padding_mask (str): name of key padding mask
            add_qk (str): name of add after Q x K'
            unidirectional (bool): whether to apply causal attention mask automatically or not
            past_k (str): name of past K value - (batch_size, num_heads, past_sequence_length, head_size)
            past_v (str): name of past V value - (batch_size, num_heads, past_sequence_length, head_size)
            present_k (str): name of present K value - (batch_size, num_heads, sequence_length, head_size)
            present_v (str): name of present V value - (batch_size, num_heads, sequence_length, head_size)
            packed_qkv (bool): whether to combine MatMuls from Q, K, V paths
                               Note: This is for the scenario where an Attention node should be created but cannot be created
                               because past_key and past_value are separate inputs and not one concatenated input.

        Returns:
            Union[NodeProto, None]: the node created or None if failed.
        """
        assert num_heads > 0

        if hidden_size > 0 and (hidden_size % num_heads) != 0:
            logger.debug("input hidden size %d is not a multiple of num of heads %d", hidden_size, num_heads)
            return None

        graph_input_names = {node.name for node in self.model.graph().input}
        mha_node_name = self.model.create_node_name("Attention")

        # Add initial Q/K/V inputs for MHA.
        mha_inputs = []
        if packed_qkv:
            q_slice, k_slice, v_slice = self.create_packed_qkv_matmul_node(
                q_matmul,
                k_matmul,
                v_matmul,
                q_add,
                k_add,
                v_add,
            )
            mha_inputs.extend([q_slice.output[0], k_slice.output[0], v_slice.output[0]])
        elif isinstance(k_matmul, NodeProto) and isinstance(v_matmul, NodeProto):
            if self.disable_multi_head_attention_bias:
                mha_inputs.extend([q_add.output[0], k_matmul.output[0], v_add.output[0]])
            else:
                mha_inputs.extend([q_matmul.output[0], k_matmul.output[0], v_matmul.output[0]])
        elif (
            isinstance(k_matmul, str)
            and isinstance(v_matmul, str)
            and k_matmul in graph_input_names
            and v_matmul in graph_input_names
        ):
            if self.disable_multi_head_attention_bias:
                mha_inputs.extend([q_add.output[0], k_matmul, v_matmul])
            else:
                mha_inputs.extend([q_matmul.output[0], k_matmul, v_matmul])
        else:
            return None

        # Add the bias input for MHA (an empty string means no bias).
        if not self.disable_multi_head_attention_bias:
            bias_name = self.create_combined_qkv_bias(q_add, k_add, v_add, mha_node_name)
            mha_inputs.append(bias_name)
        else:
            mha_inputs.append("")

        # Add optional inputs for MHA.
        if past_k and past_v:
            mha_inputs.extend([key_padding_mask, add_qk, past_k, past_v])
        elif key_padding_mask or add_qk:
            mha_inputs.extend([key_padding_mask, add_qk])

        # Add outputs for MHA.
        mha_outputs = [output]
        if present_k and present_v:
            mha_outputs.extend([present_k, present_v])

        mha_node = helper.make_node(
            "MultiHeadAttention",
            inputs=mha_inputs,
            outputs=mha_outputs,
            name=mha_node_name,
        )
        mha_node.domain = "com.microsoft"
        mha_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
        if unidirectional:
            mha_node.attribute.extend([helper.make_attribute("unidirectional", int(unidirectional))])

        self.increase_counter("MultiHeadAttention")
        return mha_node

    def create_attention_node(
        self,
        mask_index: str | None,
        q_matmul: NodeProto,
        k_matmul: NodeProto,
        v_matmul: NodeProto,
        q_add: NodeProto,
        k_add: NodeProto,
        v_add: NodeProto,
        num_heads: int,
        hidden_size: int,
        first_input: str,
        output: str,
        add_qk_str: str = "",
        causal: bool = False,
        past_k: str = "",
        past_v: str = "",
        present_k: str = "",
        present_v: str = "",
        scale: float | None = None,
    ) -> NodeProto | None:
        """Create an Attention node.

        Args:
            mask_index (str | None): mask input
            q_matmul (NodeProto): MatMul node in fully connection for Q
            k_matmul (NodeProto): MatMul node in fully connection for K
            v_matmul (NodeProto): MatMul node in fully connection for V
            q_add (NodeProto): Add bias node in fully connection for Q
            k_add (NodeProto): Add bias node in fully connection for K
            v_add (NodeProto): Add bias node in fully connection for V
            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
            first_input (str): first input name
            output (str): output name
            add_qk_str (str): name of Add node after Q x K'
            causal: whether it is uni-directional mask.
            past_k (str): name of input for past K value
            past_v (str): name of input for past V value
            present_k (str): name of output to store present K value
            present_v (str): name of output to store present V value
            scale: scale before softmax

        Returns:
            Union[NodeProto, None]: the node created or None if failed.
        """
        assert num_heads > 0

        if hidden_size > 0 and (hidden_size % num_heads) != 0:
            logger.debug("input hidden size %d is not a multiple of num of heads %d", hidden_size, num_heads)
            return None

        has_bias = True
        if q_add is None and k_add is None and v_add is None:
            has_bias = False

        q_weight = self.model.get_initializer(q_matmul.input[1])
        k_weight = self.model.get_initializer(k_matmul.input[1])
        v_weight = self.model.get_initializer(v_matmul.input[1])

        q_bias, k_bias, v_bias = None, None, None
        if has_bias:
            q_bias = self.model.get_initializer(q_add.input[1]) or self.model.get_initializer(q_add.input[0])
            k_bias = self.model.get_initializer(k_add.input[1]) or self.model.get_initializer(k_add.input[0])
            v_bias = self.model.get_initializer(v_add.input[1]) or self.model.get_initializer(v_add.input[0])
            if not (q_bias and k_bias and v_bias):
                return None

        if q_weight is None:
            print(
                f"{q_matmul.input[1]} is not an initializer. "
                "Please set do_constant_folding=True in torch.onnx.export to unblock attention fusion"
            )
            return None
        if not (k_weight and v_weight):
            return None

        # Read the weights and combine them into one packed QKV weight.
        qw = NumpyHelper.to_array(q_weight)
        kw = NumpyHelper.to_array(k_weight)
        vw = NumpyHelper.to_array(v_weight)

        # Q and K are expected to have the same shape.
        assert qw.shape == kw.shape

        qw_in_size = qw.shape[0]
        kw_in_size = kw.shape[0]
        vw_in_size = vw.shape[0]

        assert qw_in_size == kw_in_size and kw_in_size == vw_in_size

        if hidden_size > 0 and hidden_size != qw_in_size:
            logger.warning(
                "Input hidden size (%d) is not same as weight matrix dimension of q,k,v (%d). "
                "Please provide a correct input hidden size or pass in 0",
                hidden_size,
                qw_in_size,
            )

        is_qkv_diff_dims = False
        if qw.shape != vw.shape:
            is_qkv_diff_dims = True

        # All the matrices can have the same shape, or Q and K have the same shape while V differs.
        # For 2D weights, the shapes are (in_size, out_size).
        # For 3D weights, the shapes are (in_size, a, b) where a*b = out_size.
        qw_out_size = np.prod(qw.shape[1:])
        kw_out_size = np.prod(kw.shape[1:])
        vw_out_size = np.prod(vw.shape[1:])

        if is_qkv_diff_dims:
            qkv_weight = np.concatenate((qw, kw, vw), axis=1)
            qkv_weight_dim = qw_out_size + kw_out_size + vw_out_size
        else:
            qkv_weight = np.stack((qw, kw, vw), axis=1)
            qkv_weight_dim = 3 * qw_out_size

        qkv_bias_dim = 0
        qkv_bias = None
        if has_bias:
            qb = NumpyHelper.to_array(q_bias)
            kb = NumpyHelper.to_array(k_bias)
            vb = NumpyHelper.to_array(v_bias)

            q_bias_shape = np.prod(qb.shape)
            k_bias_shape = np.prod(kb.shape)
            v_bias_shape = np.prod(vb.shape)

            assert q_bias_shape == k_bias_shape == qw_out_size
            assert v_bias_shape == vw_out_size

            if is_qkv_diff_dims:
                qkv_bias = np.concatenate((qb, kb, vb), axis=0)
                qkv_bias_dim = q_bias_shape + k_bias_shape + v_bias_shape
            else:
                qkv_bias = np.stack((qb, kb, vb), axis=0)
                qkv_bias_dim = 3 * q_bias_shape

        attention_node_name = self.model.create_node_name("Attention")

        if not self.use_multi_head_attention:
            self.add_initializer(
                name=attention_node_name + "_qkv_weight",
                data_type=q_weight.data_type,
                dims=[qw_in_size, int(qkv_weight_dim)],
                vals=qkv_weight,
            )

        if has_bias:
            self.add_initializer(
                name=attention_node_name + "_qkv_bias",
                data_type=q_bias.data_type,
                dims=[int(qkv_bias_dim)],
                vals=qkv_bias,
            )

        # For the MultiHeadAttention operator, use separated inputs for query, key and value, and no packed weights.
        if self.use_multi_head_attention:
            if add_qk_str:
                logger.debug("MultiHeadAttention does not support relative_position_bias: cannot fuse the attention.")
                return None

            attention_inputs = [
                q_matmul.output[0],
                k_matmul.output[0],
                v_matmul.output[0],
                attention_node_name + "_qkv_bias",
            ]
            if mask_index is not None:
                attention_inputs.append(mask_index)

            attention_node = helper.make_node(
                "MultiHeadAttention",
                inputs=attention_inputs,
                outputs=[output],
                name=attention_node_name,
            )
            self.increase_counter("MultiHeadAttention")
        else:
            attention_inputs = [
                first_input,
                attention_node_name + "_qkv_weight",
                attention_node_name + "_qkv_bias" if has_bias else "",
            ]
            if mask_index is not None:
                attention_inputs.append(mask_index)
            else:
                attention_inputs.append("")

            past_exists = past_k and past_v
            if past_exists:
                past_kv = self.concat_kv(past_k, past_v)
                attention_inputs.append(past_kv)

            if add_qk_str:
                # The past input (5th input) must be filled with an empty string
                # before add_qk (6th input) can be appended.
                if not past_exists:
                    attention_inputs.append("")
                attention_inputs.append(add_qk_str)

            attention_outputs = [output]
            if present_k and present_v:
                # Derive one name for the combined present KV output, then split it
                # back into the separate present K and present V graph outputs.
                present_kv = present_k.replace(".key", ".key_value").replace("_key", "_key_value")
                attention_outputs.append(present_kv)
                self.split_kv(present_k, present_v, present_kv)

            attention_node = helper.make_node(
                "Attention",
                inputs=attention_inputs,
                outputs=attention_outputs,
                name=attention_node_name,
            )
            self.increase_counter("Attention")

        attention_node.domain = "com.microsoft"
        attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])

        if causal:
            attention_node.attribute.extend([helper.make_attribute("unidirectional", 1)])

        if scale is not None:
            attention_node.attribute.extend([helper.make_attribute("scale", scale)])

        if is_qkv_diff_dims:
            attention_node.attribute.extend(
                [helper.make_attribute("qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size])]
            )

        if self.mask_filter_value is not None:
            attention_node.attribute.extend(
                [helper.make_attribute("mask_filter_value", float(self.mask_filter_value))]
            )

        return attention_node
    def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
        # Sometimes we cannot fuse SkipLayerNormalization since the Add before LayerNormalization has an output that
        # is used by nodes outside the pattern. Conceptually we treat the Add before LayerNormalization as the
        # SkipLayerNormalization node here, since they share the same pattern.
        start_node = normalize_node
        if normalize_node.op_type == "LayerNormalization":
            add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0)
            if add_before_layernorm is not None:
                start_node = add_before_layernorm
            else:
                return

        # SkipLayerNormalization has two inputs, and one of them is the root input for attention.
        qkv_nodes = self.model.match_parent_path(
            start_node,
            ["Add", "MatMul", "Reshape", "Transpose", "MatMul"],
            [None, None, 0, 0, 0],
        )
        einsum_node = None
        if qkv_nodes is not None:
            (_, _, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes
        else:
            # Match Albert-style attention that uses Einsum instead of Reshape + MatMul.
            qkv_nodes = self.model.match_parent_path(
                start_node,
                ["Add", "Einsum", "Transpose", "MatMul"],
                [1, None, 0, 0],
            )
            if qkv_nodes is not None:
                (_, einsum_node, transpose_qkv, matmul_qkv) = qkv_nodes
            else:
                return

        other_inputs = []
        for _i, node_input in enumerate(start_node.input):
            if node_input not in output_name_to_node:
                continue
            if node_input == qkv_nodes[0].output[0]:
                continue
            other_inputs.append(node_input)
        if len(other_inputs) != 1:
            return

        root_input = other_inputs[0]
        # Match flaubert:                    Mask
        #                                     |
        # Mul --> LayerNormalization --> Attention --> MatMul --> Add
        #  |                                                       |
        #  +-------------------------------------------------------+
        mul_before_layernorm = self.model.match_parent(start_node, "Mul", 0)
        if mul_before_layernorm is not None:
            mul_children = input_name_to_nodes[mul_before_layernorm.output[0]]
            if mul_children is not None and len(mul_children) == 2:
                layernorm_node = mul_children[1]
                if layernorm_node.op_type == "LayerNormalization":
                    root_input = layernorm_node.output[0]
                else:
                    return
            elif mul_children is not None and len(mul_children) == 5:
                root_input = mul_before_layernorm.output[0]
            else:
                return
        elif normalize_node.op_type == "LayerNormalization":
            children = input_name_to_nodes[root_input]
            for child in children:
                if child.op_type == "LayerNormalization":
                    root_input = child.output[0]

        children = input_name_to_nodes[root_input]
        children_types = [child.op_type for child in children]
        if children_types.count("MatMul") != 3:
            return

        v_nodes = self.model.match_parent_path(matmul_qkv, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None])
        if v_nodes is None:
            logger.debug("fuse_attention: failed to match v path")
            return
        (_, _, add_v, matmul_v) = v_nodes

        is_distill = False
        is_distill_add = False
        is_no_mask_attention = False
        is_sdpa = False
        qk_paths = {
            "path1": (["Softmax", "Add", "Div", "MatMul"], [0, 0, None, 0]),
            "path2": (["Softmax", "Add", "Mul", "MatMul"], [0, 0, None, 0]),
            "path3": (["Softmax", "Where", "MatMul", "Div"], [0, 0, 2, 0]),
            "path4": (["Softmax", "Add", "Where", "MatMul"], [0, 0, 0, 2]),
            "path5": (["Softmax", "Div", "MatMul"], [0, 0, 0]),
            "sdpa": (["Softmax", "Add", "MatMul", "Mul", "Sqrt"], [0, 0, None, 0, 1]),
        }

        qk_nodes = None
        for k, v in qk_paths.items():
            qk_nodes = self.model.match_parent_path(matmul_qkv, v[0], v[1])
            if qk_nodes is None:
                continue
            if k == "path3":
                is_distill = True
            elif k == "path4":
                is_distill_add = True
            elif k == "path5":
                is_no_mask_attention = True
            elif k == "sdpa":
                is_sdpa = True
            break

        if qk_nodes is None:
            logger.debug("fuse_attention: failed to match qk path")
            return

        add_qk = None
        matmul_qk = None
        where_qk = None
        if is_distill:
            (_, where_qk, matmul_qk, _) = qk_nodes
        elif is_distill_add:
            (_, add_qk, where_qk, matmul_qk) = qk_nodes
        elif is_no_mask_attention:
            (_, _, matmul_qk) = qk_nodes
        elif is_sdpa:
            (_, add_qk, matmul_qk, _, _) = qk_nodes
        else:
            (_, add_qk, _, matmul_qk) = qk_nodes

        after_q = matmul_qk
        if is_sdpa:
            # In the SDPA pattern, Q and K are each pre-scaled by Mul(1/sqrt(sqrt(head_size))).
            mul_q_nodes = self.model.match_parent_path(matmul_qk, ["Mul", "Sqrt"], [0, None])
            if mul_q_nodes is None:
                logger.debug("fuse_attention: failed to match mul sqrt q path")
                return
            (after_q, _) = mul_q_nodes

        q_nodes = self.model.match_parent_path(after_q, ["Transpose", "Reshape", "Add", "MatMul"], [0, 0, 0, None])
        if q_nodes is None:
            logger.debug("fuse_attention: failed to match q path")
            return
        reshape_q = q_nodes[-3]
        add_q = q_nodes[-2]
        matmul_q = q_nodes[-1]

        after_k = matmul_qk
        if is_sdpa:
            mul_k_nodes = self.model.match_parent_path(matmul_qk, ["Mul", "Sqrt"], [1, None])
            if mul_k_nodes is None:
                logger.debug("fuse_attention: failed to match mul sqrt k path")
                return
            (after_k, _) = mul_k_nodes

        k_nodes = self.model.match_parent_path(
            after_k, ["Transpose", "Reshape", "Add", "MatMul"], [0 if is_sdpa else 1, 0, 0, None]
        )
        if k_nodes is None:
            k_nodes = self.model.match_parent_path(
                after_k,
                ["Transpose", "Transpose", "Reshape", "Add", "MatMul"],
                [1, 0, 0, 0, None],
            )
            if k_nodes is None:
                logger.debug("fuse_attention: failed to match k path")
                return
        add_k = k_nodes[-2]
        matmul_k = k_nodes[-1]

        # Note that Cast might be removed by OnnxRuntime, so we match multiple patterns here.
        mask_nodes = None
        add_qk_str = None
        if is_distill:
            _, mask_nodes, _ = self.model.match_parent_paths(
                where_qk,
                [
                    (["Expand", "Reshape", "Equal"], [0, 0, 0]),
                    (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]),
                    (["Cast", "Expand", "Reshape", "Equal"], [0, 0, 0, 0]),
                ],
                output_name_to_node,
            )
        elif is_distill_add:
            _, mask_nodes, _ = self.model.match_parent_paths(
                where_qk,
                [
                    (["Cast", "Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0, 0]),
                    (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]),
                ],
                output_name_to_node,
            )
            if add_qk is not None:
                add_qk_str = self.get_add_qk_str(add_qk)
                if add_qk_str is None:
                    logger.debug("fuse_attention: failed to verify shape inference of %s", add_qk)
                    return
        elif is_no_mask_attention:
            pass
        else:
            _, mask_nodes, _ = self.model.match_parent_paths(
                add_qk,
                [
                    (["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0, 0]),
                    (["Mul", "Sub", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0]),
                    (["Mul", "Sub", "Cast", "Slice", "Unsqueeze", "Unsqueeze"], [None, 0, 0, 1, 0, 0]),
                    (["Mul", "Sub", "Cast", "Cast", "Slice", "Unsqueeze", "Unsqueeze"], [None, 0, 0, 1, 0, 0, 0]),
                ],
                output_name_to_node,
            )
        if not is_no_mask_attention and mask_nodes is None:
            logger.debug("fuse_attention: failed to match mask path")
            return

        if not is_no_mask_attention and len(mask_nodes) > 1 and mask_nodes[0].op_type == "Mul":
            _, mul_val = self.model.get_constant_input(mask_nodes[0])
            if mul_val is None or (isinstance(mul_val, np.ndarray) and mul_val.size != 1):
                return
            if mul_val != -10000:
                self.mask_filter_value = float(mul_val)

        if matmul_v.input[0] == root_input and matmul_q.input[0] == root_input and matmul_k.input[0] == root_input:
            mask_index = (
                self.attention_mask.process_mask(mask_nodes[-1].input[0]) if not is_no_mask_attention else None
            )

            attention_last_node = reshape_qkv if einsum_node is None else transpose_qkv

            q_num_heads, q_hidden_size = self.get_num_heads_and_hidden_size(reshape_q)
            if q_num_heads <= 0 or q_hidden_size <= 0:
                logger.debug(
                    "Failed to detect num_heads and hidden_size for Attention fusion. "
                    "Please specify those parameters in argument."
                )
                return

            # The number of heads is the same for all paths, hence we pass q_num_heads to create the attention node.
            new_node = self.create_attention_node(
                mask_index=mask_index,
                q_matmul=matmul_q,
                k_matmul=matmul_k,
                v_matmul=matmul_v,
                q_add=add_q,
                k_add=add_k,
                v_add=add_v,
                num_heads=q_num_heads,
                hidden_size=q_hidden_size,
                first_input=root_input,
                output=attention_last_node.output[0],
                add_qk_str=add_qk_str,
            )
            if new_node is None:
                return

            self.nodes_to_add.append(new_node)
            self.node_name_to_graph_name[new_node.name] = self.this_graph_name

            if einsum_node is not None:
                unique_index = einsum_node.input[0]
                new_edge = "edge_modified_" + unique_index
                shape_tensor = helper.make_tensor(
                    name="shape_modified_tensor_" + unique_index,
                    data_type=TensorProto.INT64,
                    dims=[4],
                    vals=np.int64([0, 0, q_num_heads, int(q_hidden_size / q_num_heads)]).tobytes(),
                    raw=True,
                )
                self.model.add_initializer(shape_tensor, self.this_graph_name)
                self.model.add_node(
                    helper.make_node(
                        "Reshape",
                        [attention_last_node.output[0], shape_tensor.name],
                        [new_edge],
                        "reshape_modified_" + unique_index,
                    ),
                    self.this_graph_name,
                )
                einsum_node.input[0] = new_edge

            self.nodes_to_remove.extend([attention_last_node, transpose_qkv, matmul_qkv])
            self.nodes_to_remove.extend(qk_nodes)

            # When using MultiHeadAttention, keep the MatMul nodes in the original graph.
            if self.use_multi_head_attention:
                self.nodes_to_remove.extend(q_nodes[:-1])
                self.nodes_to_remove.extend(k_nodes[:-1])
                self.nodes_to_remove.extend(v_nodes[:-1])
            else:
                self.nodes_to_remove.extend(q_nodes)
                self.nodes_to_remove.extend(k_nodes)
                self.nodes_to_remove.extend(v_nodes)

            # Use prune_graph to remove the mask nodes, since they are shared by all attention nodes.
            self.prune_graph = True
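

if __name__ == "__main__":
    # Hedged usage sketch: this fusion is normally driven through
    # onnxruntime.transformers.optimizer rather than by instantiating
    # FusionAttention directly. The file names below are placeholders;
    # num_heads and hidden_size match BERT base.
    from onnxruntime.transformers import optimizer

    optimized_model = optimizer.optimize_model(
        "bert-base.onnx",  # hypothetical input model path
        model_type="bert",
        num_heads=12,
        hidden_size=768,
    )
    print(optimized_model.get_fused_operator_statistics())
    optimized_model.save_model_to_file("bert-base-fused.onnx")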