o
    ¢ÄiŒ  ã                   @   sp   d dl mZ d dlmZ d dlmZ d dlmZmZm	Z	 d dl
mZ eeƒZG dd„ deƒZG dd	„ d	eƒZd
S )é    )Ú	getLogger)ÚFusion)ÚFusionUtils)Ú	NodeProtoÚTensorProtoÚhelper)Ú	OnnxModelc                       s  e Zd ZdZd2dedef‡ fdd„Zdedd	eeef B fd
d„Z	dede
eee f dedefdd„Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zdedeed	eB f fdd„Z					d3deded ed!ed"d	eB d#ed	B fd$d%„Zd&d'„ Zd(d)„ Z		d4d*d+„Zd,d-„ Zd.d/„ Zd0d1„ Z‡  ZS )5ÚFusionEmbedLayerNoMaskzŒ
    Fuse embedding layer into one node (EmbedLayerNormalization).
    It supports the following model types: BERT, DistilBert, ALBert.
    úno maskÚmodelÚdescriptionc                    s<   t ƒ  |dddg|¡ t|ƒ| _d | _d| _d | _d | _d S )NÚEmbedLayerNormalizationÚLayerNormalizationÚSkipLayerNormalizationF)ÚsuperÚ__init__r   ÚutilsÚshape_inferÚshape_infer_doneÚ	attentionÚ
embed_node)Úselfr   r   ©Ú	__class__© úd/home/kim/smarthome/.venv/lib/python3.10/site-packages/onnxruntime/transformers/fusion_embedlayer.pyr      s   ü

zFusionEmbedLayerNoMask.__init__ÚaddÚreturnNc                 C   sP   | j  |dgdg¡}|d u rd S | j  |dgdg¡}|d u r d S |d |d fS )NÚGatherr   é   )r   Úmatch_parent_path)r   r   Úgather_0_pathÚgather_1_pathr   r   r   Úmatch_two_gather%   s   z'FusionEmbedLayerNoMask.match_two_gatherÚ	layernormÚinput_name_to_nodesÚis_distil_bertc           
      C   sÐ  | j j|d|dd| _| jdurdS |jd |vrdS ||jd  }tdd„ |D ƒƒ}|g d	¢kr_|D ]+}|jd
kr^| j  |g d¢g d¢¡}|dur^|d jd |jd kr^|d | _ dS q3t	|ƒdkr¶|d jdkr¶|d jd |v r¶||d jd  }t	|ƒdkr¶|d jdkr¶|d jd |v r¶||d jd  }	|	D ]}|jdkr¬|| _ dS qŸtdd„ |	D ƒƒ}|rÓ|g d¢krÑ|g d¢krÑ|g d¢krÑt
 d¡ dS dS |g d¢kræ|g d	¢kræt
 d¡ dS dS )a§  Check that LayerNormalization has a child of Attention node or subgraph like Attention.

        Args:
            layernorm (NodeProto): LayerNormalization node
            input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
            is_distil_bert (bool): whether it is DistilBert or not

        Returns:
            bool: whether there is Attention node or subgraph like Attention
        Ú	AttentionF)Ú	recursiveNTr   c                 S   ó   g | ]}|j ‘qS r   ©Úop_type©Ú.0Úchildr   r   r   Ú
<listcomp>J   ó    zCFusionEmbedLayerNoMask.check_attention_subgraph.<locals>.<listcomp>)ÚMatMulr1   r1   r   r   )ÚAddr1   ÚMultiHeadAttentionr1   )NNr   r   éÿÿÿÿé   r   r1   r2   c                 S   r)   r   r*   r,   r   r   r   r/   g   r0   )r1   r1   r1   ÚShaper   )r2   r1   r1   r1   r6   r6   )r2   r1   r1   r1   r6   z<No Attention like subgraph in children of LayerNormalization)r2   r1   r1   r1   )r   Zfind_first_child_by_typer   ÚoutputÚsortedr+   r    ÚinputÚcross_attentionÚlenÚloggerÚdebug)
r   r$   r%   r&   ÚchildrenZchildren_typesÚnodeÚpath1ZgrandchildrenÚnodesr   r   r   Úcheck_attention_subgraph0   sZ   ÿ

ý 
€,
þ
ò
z/FusionEmbedLayerNoMask.check_attention_subgraphc                 C   s  | j  |ddgddg¡}|du r"| j  |g d¢g d¢¡}|du r"dS |d |d	 }}|jd |kr4dS | j  |g d
¢g d¢fg d¢g d¢fg|¡\}}}|du rSdS |d }	| j |	dd¡rg| j |	dd¡sidS |d }
| j |
dd¡swdS |d	 }|jd |kr„dS dS )az    Match position embedding path from input_ids to Gather for DistilBert.

        Pattern is like the following:
                 (input_ids)
                      |
                     Shape
                       |                          |    Gather (indices=1)
                       |       |
                       |      Cast (optional)
                       |       |
                       |      Range (start=0, end=*, delta=1)
                       |       |
                       |    Unsqueeze
                       |    /
                      Expand
                        |
                      Gather
        ÚExpandr6   r   N)rC   ZWhereZReshaper6   )r   r   r5   r   Fr   r4   )Ú	UnsqueezeÚRangeÚCastr   r6   )r   r   r   r   r   )rD   rE   r   r6   )r   r   r   r   r5   éþÿÿÿT)r   r    r9   Zmatch_parent_pathsr   Úcheck_node_input_value)r   Úposition_embedding_gatherÚ	input_idsÚoutput_name_to_noder@   ÚexpandÚshapeÚ_Zpath2Z
range_nodeZgather_nodeZ
shape_noder   r   r   Ú#match_position_embedding_distilbert„   sD   ýþ
úÿÿz:FusionEmbedLayerNoMask.match_position_embedding_distilbertc                 C   s   dS )aY  Match position embedding path from input_ids to Gather for Roberta.

        Roberta Embedding Layer Pattern (* is optional since it might be removed by ORT, ? is the padding word id):
          (input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Mul -- Cast(to=7) -- Add(B=1) -- Cast(to=7)* --> Gather
                                                |                              ^
                                                V                              |
                                                +------------------------------+

        Roberta new pattern from transformers v4.9:
           (input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Add(B=0) -- Mul -- Cast(to=7) -- Add(B=1) --> Gather
                                                |                                           ^
                                                V                                           |
                                                +-------------------------------------------+

        start_node = position_embedding_gather
        start_index = 1

        # match optional Cast node.
        parent = self.model.get_parent(start_node, start_index, output_name_to_node)
        if parent is None:
            return
        if parent.op_type == "Cast":
            if OnnxModel.get_node_attribute(parent, "to") != 7:
                return
            start_node = parent
            start_index = 0

        i, path, return_indices = self.model.match_parent_paths(
            start_node,
            [ (['Add', 'Cast', 'Mul', 'CumSum', 'Cast', 'Not', 'Equal'], [start_index, 0, 0, 0, 0, 0, 0]),
              (['Add', 'Cast', 'Mul', 'Add', 'CumSum', 'Cast', 'Not', 'Equal'], [start_index, 0, 0, 0, 0, 0, 0, 0])],
            output_name_to_node)

        if path is not None:
            # constant input of Add shall be 1.
            i, value = self.model.get_constant_input(path[0])
            if value != 1:
                return False

            _, self.padding_word_id = self.model.get_constant_input(path[-1])

            return input_ids == path[-1].input[0]
        Fr   ©r   rI   rJ   rK   r   r   r   Ú match_position_embedding_robertaÂ   s   -z7FusionEmbedLayerNoMask.match_position_embedding_robertac                 C   s˜  | j  |ddgddg|¡}|du rdS |\}}| j  |jd ¡}|durTt|jƒdkrT|jd dkrT| j |ddg¡rT| j |ddg¡rTt|jƒd	ksV| j |d	dg¡sVdS | j  ¡ }|d
k rjt	 
|ddg¡sidS n| j |ddg¡sudS | j  |d|¡}	|	du rƒdS |	jdkr›| j |	dd¡s’dS | j  |	d|¡}
n|	}
|
du s¦|
jdkr¨dS | j |
dd¡s²dS | j  |
d|¡}|du sÃ|jdkrÅdS ||jd kS )a	    Match position embedding path from input_ids to Gather for BERT.

        BERT Embedding Layer Pattern:
                                    (input_ids)
                                   /                                          /          Shape
                                /              |
                              /              Gather (indices=1)
                             /                  |
                            /                  Add (optional, B=0)
                           /                    |
                        Gather (segment_ids) Unsqueeze (axes=0)
                           \        |           |
                            \     Gather      Slice (data[1,512], starts=0, ends=*, axes=1, steps=1)
                              \    /            |
                                Add          Gather
                                   \       /
                                      Add
                                       |
                                LayerNormalization
        ÚSlicerD   r   r5   NFr   é   é   é   Zaxesr2   r   r6   )r   r    Úget_constant_valuer9   r;   rM   r   rH   Zget_opset_versionr   Zcheck_node_attributeZ
get_parentr+   )r   rI   rJ   rK   ÚpathÚsliceZ	unsqueezeZslice_weightZopset_versionr?   ZgatherrM   r   r   r   Úmatch_position_embedding_bertñ   sT   üÿþ 
ÿ
z4FusionEmbedLayerNoMask.match_position_embedding_bertc                 C   s(   |   |||¡r	dS |  |||¡rdS dS )NTF)rY   rO   rP   r   r   r   Úmatch_position_embedding9  s
   z/FusionEmbedLayerNoMask.match_position_embeddingc                 C   s®  |j d }|r|j d nd}|j d }| js!| jjdd| _d| _| jdurs| j |¡}| j |¡}|r6|s8J ‚t|ƒdkrLt|ƒdkrL|d |d ksYt d|› d|› ¡ dS |rs| j 	||¡sst d	|› d
| j |¡› ¡ dS | j 
|j d ¡}	|	du s‡t|	jƒdkrŽt d¡ dS | j 
|j d ¡}
|
du s¬t|
jƒdks¬|	jd |
jd kr³t d¡ dS |rÚ| j 
|j d ¡}|du sÓt|jƒdksÓ|	jd |jd krÚt d¡ dS |	jd |
jd krt d|j d › d|	jd › d|j d › d|
jd › ¡ |rU|	jd |jd kr-t d|j d › d|	jd › d|j d › d|jd › ¡ |
jd |jd krUt d|j d › d|
jd › d|j d › d|jd › ¡ dS )zXSanity check of embedding weights, and match hidden_size of weights and shape of inputs.r   NT)Úupdater5   z^Cannot fuse EmbedLayerNormalization: input_ids and position_ids not matched in 2nd dimension: z vs FzYCannot fuse EmbedLayerNormalization: input_ids and segment_ids does not have same shape: z != r   zICannot fuse EmbedLayerNormalization: word embedding table is not expectedzMCannot fuse EmbedLayerNormalization: position embedding table is not expectedzLCannot fuse EmbedLayerNormalization: segment embedding table is not expectedzword_embedding_table (z) size z <= position_embedding_table (z <= segment_embedding_table (zposition_embedding_table ()r9   r   r   Zinfer_runtime_shaper   Zget_edge_shaper;   r<   ÚinfoZcompare_shaperV   rM   Úwarning)r   Úword_embedding_gatherÚsegment_embedding_gatherrI   rJ   Úsegment_idsÚposition_idsZinput_ids_shapeZposition_ids_shapeZword_embedding_tableZposition_embedding_tableZsegment_embedding_tabler   r   r   Úcheck_embeddingG  sj   


ÿÿ


2ÿ2ÿ2ÿz&FusionEmbedLayerNoMask.check_embeddingÚ
input_namec                 C   sd   d}| j  |¡}|dur&|jjjtjkr | j |¡\}}||fS |}||fS | j |¡\}}||fS )a¨  Cast a graph input or node input to int32.

        Args:
            input_name (str): name of graph input or node input

        Returns:
            A tuple of casted input name and the cast node.
            int32_output (str): If input is int32, it is the input name, Otherwise it is output name of Cast node.
            input_cast_node (Union[None, NodeProto]): Cast node. It could be None if input is int32.
        N)	r   Úfind_graph_inputÚtypeZtensor_typeZ	elem_typer   ZINT32r   Zcast_input_to_int32)r   rc   Zinput_cast_nodeZgraph_inputZint32_outputr   r   r   Úcast_to_int32‘  s   üþz$FusionEmbedLayerNoMask.cast_to_int32FrJ   r^   rI   r_   ra   c	                 C   sª  g }	|   |¡\}}
| j d¡}|jdkr|jd }|jd }n
|jd }|jd }d}|durL|   |jd ¡\}}
|||jd |jd |jd ||g}n|d|jd |jd d||g}|durp| d¡ |   |¡\}}
| |¡ |d	 |d
 g}|r‰|dur€|n|d }| |¡ tjd|||d}d|_|j	D ]}|j
dkr¦|j	 |g¡ q˜t|j	ƒdkr¹|j	 t dd¡g¡ |	 |¡ |	D ]	}| j| j|j
< qÀ| j |	¡ || _|S )ag  Create an EmbedLayerNormalization node. Note that segment embedding is optional.

        Args:
            input_ids (str): input_ids for word embeddings
            layernorm (NodeProto): LayerNormalization or SkipLayerNormalization node.
            word_embedding_gather (NodeProto): the Gather node for word embedding
            position_embedding_gather (NodeProto): the Gather node for position embedding
            segment_embedding_gather (Union[None, NodeProto]): the Gather node for segment embedding, or None.

        Returns:
            NodeProto: the EmbedLayerNormalization node created.
        r   r   r   r5   rS   Nr   Ú Ú_outputZ_dummy_mask_indexZ_embedding_sum)ZoutputsÚnamezcom.microsoftÚepsilongê-™—q=)rf   r   Zcreate_node_namer+   r9   Úappendr   Z	make_nodeÚdomainÚ	attributeri   Úextendr;   Zmake_attributeZthis_graph_nameZnode_name_to_graph_nameÚnodes_to_addr   )r   rJ   r$   r^   rI   r_   ra   Úembedding_sum_outputÚembedding_sum_namero   rN   Z	node_nameÚgammaÚbetaZembed_node_inputsr`   Zembed_node_outputsri   r   Zattr?   r   r   r   Úcreate_fused_node¨  sl   



ùù



ü

€
z(FusionEmbedLayerNoMask.create_fused_nodec                 C   s$   | j  |jd |jd ¡ d| _d S )Nr   T)r   Úreplace_input_of_all_nodesr7   Zprune_graph)r   r$   r   r   r   r   Úfinish_fusion
  s   
z$FusionEmbedLayerNoMask.finish_fusionc                 C   s*   |j dkot|jƒdkot|jd ƒdkS )Nr   rS   r   )r+   r;   r7   )r   r?   r   r   r   Ú"is_skip_layer_norm_with_sum_output  s   *z9FusionEmbedLayerNoMask.is_skip_layer_norm_with_sum_outputc              
   C   sx  |   |¡}|d u rdS |\}}|jd }	|jd }
| j||dds#dS |  |d |¡s,dS |jdkrP|  |¡}d}|}|rA|jd nd }|d uoN| j |¡d u}n@|}|jdkrYdnd}t	|jƒ|krg|j| nd }|d uot| j |¡d u}|o‚||v o‚t	|| ƒdk}|d uo|jdkp|p|}| j
|	|||||
||r|nd d}|r´d	|j|< |s´| j ||jd
 ¡ |  ||¡ dS )NFr   ©r&   r   rS   r2   r   )rp   rq   Z_no_use__to_be_removed_r5   T)r#   r9   rB   rb   r+   rw   r7   r   Zfind_graph_outputr;   rt   ru   rv   )r   r$   Úadd_before_layernormr%   rK   Úoptional_segment_gatherÚ
two_gatherr^   rI   rJ   ra   Zneed_embedding_sum_outputZsum_output_indexZnode_with_sum_outputZ
sum_outputZis_sum_graph_outputZis_sum_used_by_multiple_nodesr   r   r   r   Ú	fuse_gpt2  sX   





ÿýÿÿ
ø
z FusionEmbedLayerNoMask.fuse_gpt2c           
      C   s‚   |   |¡}|du rdS |\}}|jd }| j||ddsdS |  |||¡s'dS |  |d|¡s0dS |  ||||d¡}	|  ||	¡ dS )aÄ  Fuse embedding layer for DistilBert
        Args:
            layernorm (NodeProto): node of LayerNormalization or SkipLayerNormalization
            add_before_layernorm (NodeProto): the Add node before LayerNormalization, or the SkipLayerNormalization itself
            input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
            output_name_to_node (Dict[str, List[NodeProto]]): map from output name to nodes
        NFr   Trx   )r#   r9   rB   rZ   rb   rt   rv   )
r   r$   ry   r%   rK   r{   r^   rI   rJ   r   r   r   r   Úfuse_distilbertc  s    


ÿz&FusionEmbedLayerNoMask.fuse_distilbertc                 C   sè   | j  |dgdg¡}|du rdS |  |d ¡}|du rdS |\}}|jd }	| j||dds0dS | j  |dgdg¡}
|
du r@dS |
d }|  ||	|¡sZ|  ||	|¡sTdS |}|}|}|  |||¡scdS |  |	||||¡}|  ||¡ dS )	a¾  Fuse embedding layer for Bert
        Args:
            layernorm (NodeProto): node of LayerNormalization or SkipLayerNormalization
            add_before_layernorm (NodeProto): the Add node before LayerNormalization, or the SkipLayerNormalization itself
            input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
            output_name_to_node (Dict[str, List[NodeProto]]): map from output name to nodes
        r2   r   NFr   rx   r   T)	r   r    r#   r9   rB   rZ   rb   rt   rv   )r   r$   ry   r%   rK   Zadd_2_gatherr{   r^   r_   rJ   Zposition_embedding_pathrI   Útempr   r   r   r   Ú	fuse_bertŒ  s>   	
ûz FusionEmbedLayerNoMask.fuse_bertc           	      C   s  | j  |dgdg¡}|jdkr|d u rd S |d }d }nP| j  |dgdg¡}| j  |dgdg¡}|d u rG|d urG|d u r>d S |d }|d }n%|d urh|d u rh| j  |dgdg¡}|d u r_d S |d }|d }n|}d }|  |||||¡rwd S |  ||||¡rd S |  ||||¡r‹d S d S )Nr2   r   r   r   r   )r   r    r+   r|   r}   r   )	r   r?   r%   rK   Zfirst_add_pathry   rz   r!   r"   r   r   r   Úfuse¾  s<   



ÿÿzFusionEmbedLayerNoMask.fuse)r
   )NFN)N)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   Ústrr   r   Útupler#   ÚdictÚlistÚboolrB   rO   rQ   rY   rZ   rb   rf   rt   rv   rw   r|   r}   r   r€   Ú__classcell__r   r   r   r   r	      sT    þýü
ûT>/HJ÷þýüûú
ùb
ÿQ)2r	   c                       s8   e Zd Zd	def‡ fdd„Zdd„ Z‡ fdd„Z‡  ZS )
ÚFusionEmbedLayerNormalizationFr   c                    s   t ƒ  |d¡ || _d S )Nz	with mask)r   r   Úuse_mask_index)r   r   rŒ   r   r   r   r   ä  s   
z&FusionEmbedLayerNormalization.__init__c                 C   sÆ   | j }t|jƒdkr|j |¡ t d|j¡ n"t|jƒdkr1|jd s1||jd< t d|j¡ n	t d|j¡ d S |D ]$}t d|j¡ |jdkrS|jd |jd< q<|jd	kr`|jd |jd
< q<d S )Né   zappend mask to %szreplace mask in %szskip mask in %szupdate mask_index in %sr'   r   rS   r3   rT   )	r   r;   r9   rk   r<   r=   ri   r+   r7   )r   Ú
mask_int32Úattention_nodesr   Zattention_noder   r   r   Úreplace_maskè  s"   


€ûz*FusionEmbedLayerNormalization.replace_maskc                    sf  d | _ d | _d | _tƒ  |||¡ | jd u rd S | js't d¡ |  d¡ d S | j d u r=| jd u r=t d¡ |  d¡ d S | j rG| j j	d }n| jj	d }|| }| j
 |¡rkdd„ |D ƒ}|  ||¡ |  d¡ d S ||vr|t d	|¡ |  d¡ d S || }|jd
v r±dd„ |D ƒ}|jdkr¤|j	d }t|ƒt|ƒkr¤| j |¡ |  ||¡ |  d¡ d S d S )NzG--use_mask_index is not set: EmbedLayerNormalization will not have maskz EmbedLayerNormalization(no mask)zLEmbedLayerNormalization will not have mask since attention node is not foundrS   rT   c                 S   ó   g | ]	}|j d v r|‘qS ©)r'   r3   r*   ©r-   r?   r   r   r   r/     ó    z6FusionEmbedLayerNormalization.fuse.<locals>.<listcomp>z"EmbedLayerNormalization(with mask)zHEmbedLayerNormalization will not have mask since %s is not a node output)Ú	ReduceSumrF   c                 S   r‘   r’   r*   r“   r   r   r   r/   $  r”   r•   r   )r   r:   r   r   r€   rŒ   r<   r=   Zincrease_counterr9   r   rd   r   r+   r;   Znodes_to_removerk   )r   r?   r%   rK   rŽ   Zchildren_nodesr   r   r   r   r€   ý  sJ   









ùz"FusionEmbedLayerNormalization.fuse)F)r   r‚   rƒ   r   r   r   r€   rŠ   r   r   r   r   r‹   ã  s    r‹   N)Úloggingr   Zfusion_baser   Zfusion_utilsr   Zonnxr   r   r   Z
onnx_modelr   r   r<   r	   r‹   r   r   r   r   Ú<module>   s        X