o
    i(                    @   sf  d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZ ddlZddlZddlZddlmZmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZmZmZmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z%m&Z&m'Z' ddl(m)Z* ddl+m,Z, ddl-m.Z/ ddl0m1Z1m2Z2 e3dZ4G dd deZ5dde6e7 dB dej8fddZ9dej8fddZ:dej8fddZ;dde7de<fdd Z=dde7de<de<fd!d"Z>d#e7d$e<d%e<de%fd&d'Z?d(ejd)efd*d+Z@d(ejd)efd,d-ZAd(ejd)efd.d/ZB	0	1		dd2ed3ed4e7d5eCd6eDdB d7eDdB fd8d9ZEd:ed;efd<d=ZF	1dd(ed5eCde6e fd>d?ZGd@dA ZHdBdC ZIdDdE ZJdFefdGdHZKdFedIe<dJe<de<fdKdLZLdFefdMdNZMdOedPe7fdQdRZNdg fdOedSeCdTe6eC fdUdVZOdOefdWdXZPdOedPe7fdYdZZQ		[	\ddOed]e7d^eCd_eCd`eCf
dadbZRdFefdcddZSdFefdedfZTdgefdhdiZUddje7de<fdkdlZV	ddje7dme7de<de<fdndoZWdpdq ZXe5jYfdej8dre5fdsdtZZdej8dOee!B duej[dvej[dweCdxeCdye6e6eC  deDe7ef fdzd{Z\d|d} Z]		~ddej8de6e7 dB de<fddZ^ddej8de6e7 dB fddZ_dde6e7 dB de6e7 dB fddZ)e`dkr1e)  dS dS )a  
This converts GPT2 or T5 model to onnx with beam search operator.

Example 1: convert gpt2 model with beam search:
    python convert_generation.py -m gpt2 --output gpt2_beam_search.onnx

Example 2: convert gpt2 model with beam search containing specific cuda optimizations:
    python convert_generation.py -m gpt2 --output gpt2_beam_search.onnx --use_gpu                       --past_present_share_buffer --use_decoder_masked_attention

Example 3: convert gpt2 model with beam search with mixed precision and enable SkipLayerNorm strict mode:
    python convert_generation.py -m gpt2 --output gpt2_beam_search.onnx --use_gpu -p fp16 --use_sln_strict_mode

Example 4: convert T5 model with beam search in two steps:
    python -m models.t5.convert_to_onnx -m t5-small
    python convert_generation.py -m t5-small --model_type t5                     --decoder_onnx ./onnx_models/t5-small_decoder.onnx                       --encoder_decoder_init_onnx ./onnx_models/t5-small_encoder.onnx          --output ./onnx_models/t5_small_beam_search.onnx

Example 5: convert T5 model with beam search. All in one step:
    python convert_generation.py -m t5-small --model_type t5 --output t5_small_beam_search.onnx

Example 6: convert T5 model with beam search containing specific cuda optimizations. All in one step:
    python convert_generation.py -m t5-small --model_type t5 --output t5_small_beam_search.onnx           --use_gpu --past_present_share_buffer --use_decoder_masked_attention

Example 7: convert MT5 model with external data file like mt5-base-beamsearch.onnx.data in below example.
    python convert_generation.py -m google/mt5-base --model_type mt5 --output mt5-base-beamsearch.onnx -e

Example 8: convert gpt2 model with greedy search:
    python convert_generation.py -m gpt2 --output gpt2_greedy_search.onnx --num_beams 1 --num_return_sequences 1

Example 9: convert gpt2 model with sampling:
    python convert_generation.py -m gpt2 --output gpt2_sampling.onnx --num_beams 1 --num_return_sequences 1 --top_p 0.6
    N)Enum)Path)Any)	Precisionsetup_logger)NumpyHelper)
GraphProto
ModelProtoTensorProto)	OnnxModel)
GPT2ConfigGPT2LMHeadModelGPT2Tokenizer	MT5ConfigMT5ForConditionalGenerationT5ConfigT5ForConditionalGenerationT5Tokenizer)GraphOptimizationLevelInferenceSessionSessionOptionsget_available_providers)main)PRETRAINED_GPT2_MODELS)export_onnx_models)PRETRAINED_MT5_MODELSPRETRAINED_T5_MODELS c                   @   s    e Zd ZdZdZdZdd ZdS )GenerationTypeZbeam_searchZgreedy_searchZsamplingc                 C   s   | j S N)value)self r"   e/home/kim/smarthome/.venv/lib/python3.10/site-packages/onnxruntime/transformers/convert_generation.py__str___   s   zGenerationType.__str__N)__name__
__module____qualname__
BEAMSEARCHGREEDYSEARCHSAMPLINGr$   r"   r"   r"   r#   r   Z   s
    r   argvreturnc                 C   sB  t  }|d}|jdddtddtt t  d |jdd	td
g dddg d d |jdd	tt	j
dddd |jdd	tddd |jdd	tddd |jdd	ddd |jd	d |d}|jddtdd |jd d!d	ttjjtjjtjjgd"d |jd#d$d	d%d&gd'd( |jd)d*d	dd+d |jd	d, |jd-d.d	dd/d |jd	d0 |jd1d2d	dd3d |jd	d4 |jd5d6d	dd7d |jd	d8 |jd9d:d	dd;d |jd	d< |jd=d	dd>d |jd	d? |d@}|jdAd	ddBd |jd	dC |jdDd	ddEd |jd	dF |jdGd	ddH |jd	dI |jdJtd	dKdLdM |jdNd	ddOd |jd	dP |jdQd	ddRd |jd	dS |jdTd	ddUd |jd	dV |jdWd	ddXd |jd	dY |jdZd	dd[d |jd	d\ |jd]d	dd^d |jd	d_ |jd`d	ddad |jd	db |dc}|jddtd	dedfdM |jdgtd	dhdidM |jdjtd	dkdldM |jdmtd	dedndM |jdotd	dedpdM |jdqtd	dedrdM |jdstd	dtdudM |jdvtd	dtdwdM |jdxtd	tdy dzdM |jd{td	ded|dM |jd}td	d~ddM |jdtd	dKddM |jdtd	dddM |jdtd	dddM |jdtd	dddM |d}|jdd	ddd |jd	d |jdd	ddd |jd	d |jdd	ddd |jd	d |jdd	ddd |jd	d |jdd	ddd |jd	d |jdd	tdedd |jdd	ddd |jd	d || }|S )zParse arguments

    Args:
        argv (Optional[List[str]], optional): _description_. Defaults to None.

    Returns:
        argparse.Namespace: Parsed arguments.
    zInput optionsz-m--model_name_or_pathTzEPytorch model checkpoint path, or pretrained model name in the list: , )requiredtypehelpz--model_typeFgpt2)r2   t5mt5z*Model type (default is gpt2) in the list: )r/   r0   defaultchoicesr1   --cache_dir.Zcache_modelsz%Directory to cache pre-trained models)r/   r0   r5   r1   z--decoder_onnxr   zLPath of onnx model for decoder. Specify it when you have exported the model.z--encoder_decoder_init_onnxzgPath of ONNX model for encoder and decoder initialization. Specify it when you have exported the model.z	--verbose
store_truezPrint more information)r/   actionr1   )verbosezOutput options--outputz,Output path for onnx model with beam search.z-p--precisionzTPrecision of model to run. fp32 for full precision, fp16 for half or mixed precisionz-b--op_block_list*autozDisable certain onnx operators when exporting model to onnx format. When using defaultvalue for gpt2 type of model fp16 precision, it will be set to ["Add", "LayerNormalization", "SkipLayerNormalization", "FastGelu"]. Other situation, it will be set to [])r/   nargsr5   r1   z-e--use_external_data_formatz!save external data for model > 2G)use_external_data_formatz-sz--run_shape_inferencezrun shape inference)run_shape_inferencez-dpvsz--disable_pad_vocab_sizezDo not pad logits MatMul weight to be a multiple of 8 along the dimension where dim value is the vocab size. The logits MatMul may hence be of poor performance for fp16 precision.)disable_pad_vocab_sizez-dsgdz,--disable_separate_gpt2_decoder_for_init_runzDo not create separate decoder subgraphs for initial and remaining runs. This does not allow for optimizations based on sequence lengths in each subgraph)*disable_separate_gpt2_decoder_for_init_runz-iz--disable_shared_initializerszdo not share initializers in encoder and decoder for T5 or in the init decoder and decoder for GPT2. It will increase memory usage of t5/mt5/gpt2 models.)disable_shared_initializersz--encoder_decoder_initzbAdd decoder initialization to encoder for T5 model. This is legacy format that will be deprecated.)encoder_decoder_initz6Beam search parameters that stored in the output modelz--output_sequences_scoreszoutput sequences scores)output_sequences_scoresz--output_token_scoreszoutput token scores)output_token_scoresz--early_stopping)r/   r:   )early_stoppingz--no_repeat_ngram_sizer   zNo repeat ngram size)r0   r/   r5   r1   z--vocab_maskz\Enable vocab_mask. This mask applies only to every generated token to filter some bad words.)
vocab_maskz--past_present_share_bufferzWUse shared buffer for past and present, currently work for gpt2 greedy/sampling search.)past_present_share_bufferz--use_decoder_masked_attentionzUses `DecoderMaskedSelfAttention` or `DecoderMaskedMultiHeadAttention` to optimize the decoding Attention computation. Must be used with `past_present_share_buffer`. Currently, only Attention head sizes of 32, 64 and 128 are supported.)use_decoder_masked_attentionz--prefix_vocab_maskzeEnable prefix_vocab_mask. This mask can be used to filter bad words in the first generated token only)prefix_vocab_maskz--custom_attention_maskz]Enable custom_attention_mask. This mask can be used to replace default encoder attention mask)custom_attention_maskz--presence_maskz!Presence mask for custom sampling)presence_maskz--seedzRandom seed for sampling op)seedzYBeam search parameters not stored in the output model, for testing parity and performancez--min_length   zMin sequence lengthz--max_length2   zMax sequence lengthz--num_beams   z	Beam sizez--num_return_sequencesz&Number of return sequence <= num_beamsz--length_penaltyz<Positive. >1 to penalize and <1 to encourage short sentence.z--repetition_penaltyz-Positive. >1 to penalize and <1 to encourage.z--temperature      ?z6The value used to module the next token probabilities.z--top_pzTop P for samplingz--filter_valueZInfzFilter value for Top P samplingz--min_tokens_to_keepzAMinimum number of tokens we keep per batch example in the output.z--presence_penalty        z%presence penalty for custom sampling.z--customz&If 1 customized top P logic is appliedz--vocab_sizezIVocab_size of the underlying model used to decide the shape of vocab maskz--eos_token_idzKcustom eos_token_id for generating model with existing onnx encoder/decoderz--pad_token_idzKcustom pad_token_id for generating model with existing onnx encoder/decoderz0Other options for testing parity and performancez--use_sln_strict_modez_Enable strict mode for SLN in CUDA provider. This ensures a better accuracy but will be slower.)use_sln_strict_mode	--use_gpuz)use GPU for inference. Required for fp16.)use_gpuz--disable_parityzdo not run parity test)disable_parityz--disable_perf_testzdo not run perf test)disable_perf_testz--torch_performanceztest PyTorch performance)torch_performancez--total_runsz4Number of times of inference for latency measurementz--save_test_dataz-save test data for onnxruntime_perf_test tool)save_test_data)argparseArgumentParseradd_argument_groupadd_argumentstrjoinr   r   r   ospathset_defaultsr   ZFLOAT32r    FLOAT16intfloat
parse_args)r+   parserZinput_groupZoutput_groupZmodel_groupZbeam_parameters_groupZ
test_groupargsr"   r"   r#   parse_argumentsc   s  	
		




ro   rn   c                 C   s   | j }d|d| jdd| jddddd	g}| jr|d
| jg | jr&|d | jr.|d t| j	r?|dg || j	 | jt
jjkrM| jsMJ d| jrXtd|  t|d dS )zqConvert GPT-2 model to onnx

    Args:
        args (argparse.Namespace): arguments parsed from command line
    r-   r<   z--optimize_onnxr=   z--test_runs1z--test_cases10z--overwriter7   rZ   rB   r>   zEfp16 or mixed precision model cannot run in CPU. Please add --use_gpuzarguments for convert_to_onnx:)r+   N)model_name_or_pathdecoder_onnx	precision	cache_dirextendr[   appendrC   lenop_block_listr   ri   r    r;   loggerinfoconvert_gpt2_to_onnx)rn   Z
model_name	argumentsr"   r"   r#   gpt2_to_onnx  s8   


r~   c                 C   s   t | j| jt| jj| j| j| jt	j
jk| jddddd| j| j| jt	j
jkd}td|d   td|d   |d | _|d | _dS )	znConvert T5 model to onnx

    Args:
        args (argparse.Namespace): arguments parsed from command line
    FT)rr   ru   
output_dirr[   rC   Zoptimize_onnxrt   r;   Zuse_decoder_start_token	overwriteZdisable_auto_mixed_precisionZuse_int32_inputs
model_typerH   Zforce_fp16_iozonnx model for encoder: r   zonnx model for decoder: rS   N)export_t5_onnx_modelsrr   ru   r   outputparentr[   rC   rt   r   ri   r    r   rH   rz   debugencoder_decoder_init_onnxrs   )rn   pathsr"   r"   r#   
t5_to_onnx$  s*   

r   T	onnx_pathrC   c                 C   sP   ddl m} tj| dd}|j|ddd}|r!tj|| |d d	S td d	S )
zShape inference on an onnx file, which will be overwritten.

    Args:
        onnx_path (str): Path of onnx model
        use_external_data_format(bool): output tensors to external data or not.
    r   )SymbolicShapeInferenceTZload_external_dataF)Z
auto_mergeZguess_output_ranksave_as_external_dataz4Failed to run symbolic shape inference on the model.N)	Z&onnxruntime.tools.symbolic_shape_inferr   onnx
load_modelZinfer_shapesr   saverz   warning)r   rC   r   modeloutr"   r"   r#   shape_inferenceB  s   r   c                 C   s  t j| dd}|jjd j}t|}| }||v sJ || }|jdkr'dS d}||j	d }|du rR|
|dd}	|	du rBdS ||	j	d }|du rPdS d}|jtjjkr[dS t|jd	krddS |jd }
|
d
 dkrqdS t|
d
 d
 }||
 }|jr|rtj|jd |ftjd}tjt||fdd}||jd< ntj||jd ftjd}tjt||fdd}||jd< | |_ndS tj|| |d dS )zPad the logits MatMul weight in the provided decoder model, which will be overwritten.

    Args:
        onnx_path (str): Path of onnx model
        use_external_data_format(bool): output tensors to external data or not.
    Tr   r   MatMulFrS   N	Transpose      dtypeaxisr   )r   r   graphr   namer   output_name_to_nodeop_typeget_initializerinputmatch_parent	data_typer
   ZDataTyperi   rx   dimsmathceilraw_datanpZzerosZfloat16concatenater   to_arraytobytesr   )r   rC   decoder_model_protologits_output_namedecoder_modelr   matmul_nodeZpad_along_axis_1Zlogits_weightZtranspose_before_matmulZactual_vocab_sizeZpadded_vocab_sizepaddingZpadding_dataZweight_with_paddingr"   r"   r#   pad_weights_of_logits_matmulT  sN   


r   
model_pathr[   rY   c                    sx   t  }tj|_|rddgndg}|r3dt vrtdtd |r3ddi}d|i  fdd|D }t| ||d	}|S )
a  Create OnnxRuntime session.

    Args:
        model_path (str): onnx model path
        use_gpu (bool): use GPU or not
        use_sln_strict_mode (bool): use strict mode for skip layer normalization or not

    Raises:
        RuntimeError: CUDAExecutionProvider is not available when --use_gpu is specified.

    Returns:
        onnxruntime.InferenceSession: The created session.
    ZCUDAExecutionProviderZCPUExecutionProviderz5CUDAExecutionProvider is not available for --use_gpu!zuse CUDAExecutionProviderZ"enable_skip_layer_norm_strict_modeTc                    s$   g | ]}| v r| | fn|qS r"   r"   ).0r   Zprovider_optionsr"   r#   
<listcomp>  s    z&create_ort_session.<locals>.<listcomp>)	providers)	r   r   ZORT_DISABLE_ALLZgraph_optimization_levelr   RuntimeErrorrz   r{   r   )r   r[   rY   Zsess_optionsZexecution_providersZcuda_provider_optionsort_sessionr"   r   r#   create_ort_session  s   


r   r   rt   c              	   C   s  |t jjk}t| j}|d }|dksJ g ddd t|D  }t| jt|kr:tdt| dt| j t|D ]E\}}| j| j|kr[td| d	| d| j| j t	j
}|dkrj|rgt	jnt	j}| j| jjj}	|	|krtd| d
| d|	 q>td dgdd t|D  }
t| jt|
krtdt|
 dt| j t|
D ]>\}}| j| j|krtd| d	| d| j| j |rt	jnt	j}| j| jjj}||krtd| d
| d| qtd dS )a  Verify GPT-2 subgraph

    Args:
        graph (onnx.GraphProto): onnx graph of GPT-2
        precision (Precision): Precision (FLOAT16 or FLOAT32) of the model.

    Raises:
        ValueError: Number of inputs not expected.
        ValueError: Input name is not expected.
        ValueError: Input data type is not expected.
        ValueError: Number of outputs not expected.
        ValueError: Output name is not expected.
        ValueError: Output data type is not expected.
       rS   )	input_idsposition_idsattention_maskc                 S      g | ]}d | qS )Zpast_r"   r   ir"   r"   r#   r         z(verify_gpt2_subgraph.<locals>.<listcomp> Number of inputs expected to be . Got Input  is expected to be $ is expected to have onnx data type z:Verifying GPT-2 graph inputs: name and data type are good.logitsc                 S   r   )Zpresent_r"   r   r"   r"   r#   r     r   !Number of outputs expected to be Output z;Verifying GPT-2 graph outputs: name and data type are good.N)r   ri   r    rx   r   range
ValueError	enumerater   r
   INT32FLOATr0   tensor_type	elem_typerz   r{   r   )r   rt   
is_float16input_countlayer_countexpected_inputsr   expected_inputexpected_type
input_typeexpected_outputsexpected_outputoutput_typer"   r"   r#   verify_gpt2_subgraph  s>   
"
"
r   c              	   C   s<  |t jjk}|rtjntj}t| j}|d d }|dksJ ddg}t|D ]}|d|  |d|  q't|D ]}|d|  |d	|  q>t| jt|krit	d
t| dt| j t
|D ]?\}}| j| j|krt	d| d| d| j| j |dk rtjn|}	| j| jjj}
|
|	krt	d| d|	 d|
 qmdg}t|D ]}|d|  |d|  qt| jt|krt	dt| dt| j t
|D ]8\}}| j| j|krt	d| d| d| j| j | j| jjj}||krt	d| d| d| qdS )  Verify T5 decoder subgraph

    Args:
        graph (onnx.GraphProto): onnx graph of T5 decoder
        precision (Precision): Precision (FLOAT16 or FLOAT32) of the model.

    Raises:
        ValueError: Number of inputs not expected.
        ValueError: Input name is not expected.
        ValueError: Input data type is not expected.
        ValueError: Number of outputs not expected.
        ValueError: Output name is not expected.
        ValueError: Output data type is not expected.
    r   rU   rS   r   encoder_attention_maskpast_key_self_past_value_self_Zpast_key_cross_Zpast_value_cross_r   r   r   r   r   r   present_key_self_present_value_self_r   r   N)r   ri   r    r
   r   rx   r   r   rw   r   r   r   r   r0   r   r   r   )r   rt   r   Z
float_typer   r   r   r   r   r   r   r   r   r   r"   r"   r#   verify_t5_decoder_subgraph  sH   
""
r   c              	   C   s  |t jjk}d| jd jv }g d}|r|dd }t| jt|kr2tdt| dt| j t|D ]9\}}| j| j|krStd| d	| d| j| j t	j
}| j| jjj}||krotd| d
| d| q6|rt| jd dks}J t| jd }	|	dksJ g }
t|	D ]}|
d|  |
d|  qnStd t| jd d dksJ t| jd d }	|	dksJ ddg}
t|	D ]}|
d|  |
d|  qt|	D ]}|
d|  |
d|  qt| jt|
krtdt|
 dt| j t|
D ]B\}}| j| j|kr2td| d	| d| j| j |r8t	jnt	j}| j| jjj}||krUtd| d
| d| qtd dS )r   crossr   )Zencoder_input_idsr   Zdecoder_input_idsNr   r   r   r   r   r   rS   Zpresent_key_cross_Zpresent_value_cross_zZThis format is deprecated. Please export T5 encoder in new format with only cross outputs.rU   r   Zencoder_hidden_statesr   r   r   r   zMT5 encoder graph verified: name and data type of inputs and outputs are good.)r   ri   r    r   r   rx   r   r   r   r
   r   r0   r   r   r   rw   rz   r   r   r{   )r   rt   r   Z
new_formatr   r   r   r   r   r   r   r   r   r"   r"   r#   'verify_t5_encoder_decoder_init_subgraphG  s\   "
"
r   shared_   graph1graph2shared_prefixmin_elementssignature_cache1signature_cache2c                 C   s  i }i }g }g }	g }
| j D ]N}|jrt|j|ksq|j D ]=}|jr)t|j|ks*qt||||rZ||j ||j< || |j|vrX||j }|||j< |	| |
|  nqqtd|
  | j	D ]}t
t|jD ]}|j| |
v rtd|j|  qpqg|j	D ]}t
t|jD ]}|j| |
v rtd|j|  qq|	D ]}|j | q|jD ]}|j|v r||j |_q|j	D ]4}t
t|jD ]*}|j| |v r||j|  }td|j d| d|j|  d|  ||j|< qq|D ]}| j | q| jD ]}|j|v r||j |_q| j	D ]7}t
t|jD ],}|j| |v rO||j|  }td|j d| d|j|  d|  ||j|< q$q|	D ]	}||j |_qU|	D ] }tj|j}tj|j|j|}| j| |j| qa|	S )	a  Remove initializers with same value from two graphs.

    Args:
        graph1 (GraphProto): the first graph to process
        graph2 (GraphProto): the second graph to process
        shared_prefix (str): add prefix to the shared initializers among two graphs
        min_elements (int, optional): minimal number of elements for initializers to be considered. Defaults to 1024.
        signature_cache1 (dict): Optional dictionary to store data signatures of tensors in graph1 in order to speed up comparison
        signature_cache2 (dict): Optional dictionary to store data signatures of tensors in graph2 in order to speed up comparison
    zshared initializers:zname is found in graph 1: zname is found in graph 2: zgraph 2 rename node z input z from z to zgraph 1 rename node )initializerr   sumr   Zhas_same_valuer   rw   rz   r   noder   rx   r   r   remove
value_infor   numpy_helperr   shapehelpermake_tensor_value_infor   )r   r   r   r   r   r   Zmapping_initializers_1Zmapping_initializers_2Zshared_initializers_1Zshared_initializers_2Zshared_initializers_namesZinitializer1Zinitializer2Zshared_namer   jr   r   new_namer   r"   r"   r#   remove_shared_initializers  s   












*


*
r   encoder_modelr   c                 C   s`   t | }t |}|d |d i i }}|| || t|jj|jjd||d}|S )NZe_Zd_Zs_)r   r   r   )r   Zadd_prefix_to_namesZremove_duplicated_initializerr   r   r   )r   r   encoderdecoderr   r   initializersr"   r"   r#   get_shared_initializers  s   




r   c                 C   s   g }| j D ]}|jrt|j|ksq|| q|D ]}| j | q|D ]}tj|j}tj	
|j|j|}| j| q%|S )a^  Remove initializers of a graph, when they have number of elements larger than a threshold.

    Args:
        graph (GraphProto): the graph.
        min_elements (int, optional): minimal number of elements for initializers to be considered. Defaults to 1024.

    Returns:
        List[TensorProto]: initializers that are removed from the graph.
    )r   r   r   rw   r   r   r   r   r   r   r   r   r   r   )r   r   Zmoved_initializerstensorr   r   r   r"   r"   r#   move_initializers  s   
r   c                 C   s   | j dkrtd| j d| j dkr| j}n^| j dkr | j}nU| j dkr)| j}nL| j dkr2| j}nC| j dkr;| j}n:| j d	krD| j}n1| j d
krM| j	}n(| j dkrV| j
}n| j dkr_| j}n| j dkrh| j}ntd| j d| j  d| j|fS )z
    Convert attribute to kwarg format for use with onnx.helper.make_node.
        :parameter attribute: attribute in AttributeProto format.
        :return: attribute in {key: value} format.
    r   z
attribute z does not have type specified.rS   r   r   rU            r   	   
   z has unsupported type r8   )r0   r   r   fr   stgZfloatsZintsstringsZtensorsZgraphs)	attributer    r"   r"   r#   _attribute_to_pair>  s0   











r  c                 C   sD   i }| j D ]}t|\}}|||i q| jr |d| ji |S )Ndomain)r  r  updater  )r   kwargsattrkeyr    r"   r"   r#   	kwargs_ofc  s   
r  c                 C   s   t dd | jjjjD S )Nc                 S   s   g | ]}|j r
|j n|jqS r"   )	dim_param	dim_value)r   dr"   r"   r#   r   n  s    zshape_of.<locals>.<listcomp>)tupler0   r   r   dim)vir"   r"   r#   shape_ofm  s   r  subgc              
   C   s  d}d}g }t | jD ],\}}||kr1t|}tjj|j|jjj	|d |d |d d|d gd}|
|g q|
tjjdtjjdgd	g | d
 | j
| g }t | jD ],\}}||krt|}tjj|j|jjj	|d |d |d d|d gd}|
|g qZ| d | j
| g }| jD ]P}	|	}
|	jdkrt|	}|ddi g }|
|	j t|dk r|
dg t|dk st|dk r|
dg tjjd||	jfd|	ji|}
|
|
g q| d | j
| | S )Nr   rS   r   r   max_seq_lenrU   r   r   past_sequence_lengthr   r   r   	AttentionrM   r  r   r  r   r   )r   r   r  r   r   r   r   r0   r   r   rv   r
   r   
ClearFieldr   r   r   r  r  rx   	make_node)r  Zinput_past_0Zoutput_past_0
new_inputsr   r  r   new_outputs	new_nodesr   new_noder  nisr"   r"   r#   1update_decoder_subgraph_past_present_share_bufferq  sZ    



 
r'  is_beam_searchswitch_attentionc                 C   s  |r@g }t | jD ]
\}}||g q	|tjjdtjjdgdg |tjjdtjjg ddg | d | j| |rg d}g }| j	D ]x}|j
dkrt|}	|	 D ]}
|
d	krd  d
S |
|vrx|
dkrutd|
 d |	|
= qZg }||j |rt|dk r|dg t|dk st|dk r|dg t|dk r|dg tjjd||jfd|ji|	}||g qK| d | j	| dS )aS  Update the Attention nodes to DecoderMaskedSelfAttention.

    Args:
        subg (GraphProto): GraphProto of the decoder subgraph
        is_beam_search (bool): Boolean specifying if the sampling algo is BeamSearch
        switch_attention (bool): Boolean specifying if `Attention` is to be switched with `DecoderMaskedSelfAttention`
    
beam_widthrS   r  cache_indirection
batch_sizer*  r  r   rM   	num_headsscaleZmask_filter_valuer  r  Zqkv_hidden_sizesFZunidirectionalzRemoving attribute: zB from Attention node while switching to DecoderMaskedSelfAttentionr  r   r   r  ZDecoderMaskedSelfAttentionr   r   T)r   r   rv   r   r   r   r
   r   r   r   r   r  copyrz   r   rx   r!  r   r   )r  r(  r)  r"  Z_ir  'decoder_masked_attention_supported_attrr$  r   r  kr&  r"   r"   r#   4update_decoder_subgraph_use_decoder_masked_attention  sl   
 
	



r4  c                 C   s  t  }g }dd t| jD }i }i }| jD ]'}|jD ]}|r0||vr)|g||< q|| | q|jD ]}|r<|||< q4q| jD ],}|jdkrn|jd rT|jd sUqA|jd |jd }	}
d}d|
v r| jD ]}|jdkr|jd |
kr|jd j} nqin| j	D ]}|j
|
kr|} nq|du rqAtj|}|jdkrn| d	v rn|jd |v rn||	 }|jd
kr|jd sqA|jd |v r|jd ds|jd dr| dkr||jd  || t||jd  dkr|| qA|jd |vrqA||jd  }|jdkr|jd sqA||jd  }|jdkr*|jd s+qA|jd |v rn|jd dsE|jd drn| dkrn||jd  ||||g t||jd  dkrn|| qAqA||fS )az  Correct graph which originally use dim of past_seq_len from input_ids's shape which is fixed to max_seq_len after
       shared past/present buffer

    Args:
        subg (GraphProto): GraphProto of the decoder subgraph
    return:
        tensor_names_to_rename : set of tensor names which is equal to past_sequence_length
        nodes_to_remove : list of node to remove
    c                 S   s   i | ]\}}|j |qS r"   r   )r   indexinpr"   r"   r#   
<dictcomp>      z+find_past_seq_len_usage.<locals>.<dictcomp>GatherrS   r   NZ	Constant_Constant>   rS   r   Shaper   r   r   Reshaper   )setr   r   r   rw   r   r   r  r  r   r   r   r   r   sizeitem
startswithaddrx   rv   )r  tensor_names_to_renamenodes_to_removegraph_input_namesZinput_name_to_nodesr   r   Z
input_nameZoutput_nameZshape_tensor_nameZshape_index_nameZini_gather_indicesZ
const_noder   Zgather_indices_arr
shape_nodeZreshape_nodeZtranspose_noder"   r"   r#   find_past_seq_len_usage  s   









rG  r   past_seq_len_namec                 C   s   d}t tdd | jjj}|D ]"}t|jdk r&|jd t|jdk s|j| |j| q| jjjtj	j
|tjg dd |   | S )Nr+  c                 S   
   | j dkS NMultiHeadAttentionr   r   r"   r"   r#   <lambda>d     
 z.add_cache_indirection_to_mha.<locals>.<lambda>r   r   r-  r*  Zmax_sequence_lengthr  )listfilterr   r   r   rx   r   rw   r   r   r   r
   r   topological_sort)r   rH  Zcache_indirection_name	mha_nodesr   r"   r"   r#   add_cache_indirection_to_mhaa  s   
rU  r   skip_node_idxsc              
   C   sR  d}g }t tdd | jjj}t|D ]\}}||v rqd}|jD ]}	|	jdkr.|	j} nq"|}
|
dkrJ| jjj	D ]}|j|j
d krI|j}
 nq:d}| jjj
D ]}|j|j
d krf|jjjjd	 j} nqQt|jdk r{|jd
 t|jdk sn| d|d	  }|j| |tjj||
d|d|gd q| jjj| |   | S )NZoutput_cross_qkc                 S   rI  rJ  rL  rM  r"   r"   r#   rN  z  rO  z&add_output_qk_to_mha.<locals>.<lambda>r   r/  r   target_sequence_lengthrS   r   r   _r-  sequence_lengthr  )rQ  rR  r   r   r   r   r  r   r   r   r   r   r0   r   r   r  r  rx   r   rw   r   r   r   rv   rS  )r   r   rV  Zoutput_qk_basenameZ
output_qksrT  idxr   r/  attZoutput_qk_dtyper   rW  Zoutput_qk_namer"   r"   r#   add_output_qk_to_mhav  sP   


r\  c                    s  d}d}d}t tdd | jjjd }| |g dg d}| |d	d
gddg}|d ur1|}n|d ur8|}ntd d S |d }|jdkr|d }	| |	ddgddg  d u rbtd d S | |	g dg d}
|
d u rxtd d S |
d } |
dd  krtd d S t t fdd| jjjd }| jjj	| | jjj	 d  | jjj	 d  ||	j
d< ||j
d< n| |g dg d}|d u rtd d S |d }| |g dg dd u rtd d S d }|dd  dd  kr
td d S t tfdd| jjjd }| jjj	| t tfd d| jjjd }| jjj	| | jjj	d  | jjj	d  | jjj	d!  | jjj	d"  ||j
d< ||j
d< | jjj
tjj|tjdgd# tjjd$|g|g| d$d%}tjj|tjg d#}tjjd&|g|g| d&tjd'}tjj|tjg d#}| jjj||g | jjj||g |   | |fS )(Nr  past_seq_len_int32past_seq_len_int64c                 S   rI  )NLayerNormalizationrL  nr"   r"   r#   rN    rO  z*fix_past_sequence_length.<locals>.<lambda>r   )Addr:  ZTileZExpand	UnsqueezeRange)r   rS   rS   r   r   r   rb  SlicerS   zBCannot identify base path for fixing past_sequence_length subgraphrX   rd  r:  r<  zDCannot identify gather path for fixing past_sequence_length subgraph)rb  r:  r<  rS   r   r   zACannot identify add path for fixing past_sequence_length subgraphz]Gather path and add path do not share the same nodes for calculating the past_sequence_lengthc                    s   | j d  d jd kS Nr   rS   r   r   r`  )gather_pathr"   r#   rN    r9  )rc  rb  r:  r<  r=  r   )r   r   r   r   r   r   zGCannot identify input_ids path for fixing past_sequence_length subgraph)rc  r:  r<  r=  r   )rS   r   r   r   r   zFCannot identify past_key path for fixing past_sequence_length subgraphr   ziThe input_ids path and past_key path do not share the same nodes for calculating the past_sequence_lengthc                    s   | j d  d jd kS rg  rh  r`  past_key_pathr"   r#   rN  '  r9  c                    s   | j d  d jd kS )Nr   rS   rh  r`  rj  r"   r#   rN  )  r9  r   rU   r  Squeezeinputsoutputsr   Castro  rp  r   to)rQ  rR  r   r   r   match_parent_pathrz   r{   r   r   r   rw   r   r   r   r
   r   r!  create_node_nameINT64rv   r   rS  )r   rH  r]  r^  r   Zbase_path_hfZbase_path_oai	base_pathZ	base_nodeZ
range_nodeZadd_pathadd_nodeZconstant_in_gatherZinput_ids_pathZunsqueeze_nodeZconstant_in_reshapesqueeze_nodeZsqueeze_output	cast_nodeZcast_outputr"   )ri  rk  r#   fix_past_sequence_length  s   "




 


 


r{  c                 C   s  d}d}| j jjtjj|tjdgdtjj|tjg ddg t	t
dd | j jj}t|D ]\}}d}|jD ]}|jd	krF|j} nq:d
|d  }	tjj|	tjd|ddgd}
|d dkrj| j jj|
 tjjd|jd |jd |jd ddt|jdkr|jd ndt|jdkr|jd nd||||jd g|jd t|jdkr|jd ndt|jdkr|jd nd|d dkr|	ndg|jddd||d dd}|d dkr|jd | j jj| | j jj|g q1|   | S )Nr*  r+  rS   r  rP  c                 S   rI  rJ  rL  rM  r"   r"   r#   rN  b  rO  z(replace_mha_with_dmmha.<locals>.<lambda>r   r/  output_cross_qk_r   r-  zencode_sequence_length / 2DecoderMaskedMultiHeadAttentionr   rU   r  r  r   rK  com.microsoft)ro  rp  r   r  r/  	output_qkrM   )r   r   r   rv   r   r   r   r
   r   rQ  rR  r   r   r  r   r   r   r   rw   r!  rx   replacer   rS  )r   rH  r*  r+  rT  rZ  r   r/  r[  Zqk_output_nameZ	qk_outputZ
dmmha_noder"   r"   r#   replace_mha_with_dmmhaS  sl   



r  rS   rX   	attn_maskkv_num_heads
world_sizewindow_sizec           1      C   sP  |  tjjdtjdgdgd tjjd|dg|d g| dd}tjjd|d dgdg| dd}tjjd	dgd
g| d	tjd}tjjd|g|d g| dd}tjjd|d dgdg| ddd}	tjjd	dgdg| d	tjd}
| j	j
j|||||	|
g ttdd | j	j
j}t|D ]\}}| |g dg d}| |ddgddg}d\}}}|d ur|\}}}n|d ur|\}}| |g dg d}| |ddgddg}d\}}}|d ur|\}}}n|d ur|\}}| |ddgddg}| |dgdg}d\}}|d ur|\}}n	|d ur"|d }d}|d ur>|d ur>|jD ]}|jdkr<|j}q1d}|jD ]}|jdkrN|j}qC|jd |jd kod|jd |jd k}|d uor|d uor|d u} |d u o|d u o|d u }!d \}"}#}$|r| s|!rt| |jd }%t| |jd }&t| |jd }'|%jd! }(tj|%|&|'fdd"|(d#|( })tjj|)d$| d%})|  |) tjjd|jd |)jg|)j d&g| dd}*| j	j
j|*g | j	j
j| | j	j
j| | j	j
j| |*jd }"| rt| |jd }+t| |jd },t| |jd }-|+jd! }(tj|+|,|-fdd"d#|( }.tjj|.d'| d%}.|  |. tjjd|*jd |.jg|.j d&gd(}/| j	j
j|/g | j	j
j| | j	j
j| | j	j
j| |/jd }"n|jd }"|jd }#|jd }$tjjd)|"|#|$|jd* |jd+ |jd |
jd |d ur|jd nd,|d ur|jd# nd,g	|j|j d-d)d.|| |dkr|| n|| |t!|d uo|d u|d/
}0| j	j
j| | j	j
j|0g |d ur| j	j
j| |d ur%| j	j
j| q| S )0NonerS   r   r   r   valsZ	ReduceSumZ	_row_sumsrn  SubZseqlens_k_int64rq  Z	seqlens_krr  r<  _shaper:  Ztotal_seq_len_int64r   )ro  rp  r   r   Ztotal_seq_lenc                 S   rI  rJ  rL  rM  r"   r"   r#   rN     rO  z&replace_mha_with_gqa.<locals>.<lambda>)RotaryEmbeddingrb  r   )r   r   r   r  r   )NNNrf  rb  r   NNinterleavedr/  )r   r   r   rX   r   r   ZQKV_Weight_r5  _outputZ	QKV_Bias_)ro  rp  ZGroupQueryAttentionr  r  r   rK  r~  )	ro  rp  r   r  r/  r  Zlocal_window_sizeZ	do_rotaryZrotary_interleaved)"add_initializerr   r   make_tensorr
   rv  r!  ru  r   r   r   r   rv   rQ  rR  r   rt  r  r   r   r   r   r   r   r   r   stackreshaper   Z
from_arrayr   r   r  rj   )1r   r  r  r  r  Zreduce_sum_nodeZsub_nodeZseqlen_k_cast_noderF  Zgather_nodeZtotal_seqlen_cast_noderT  rZ  r   Zq_path_1Zq_path_2Zq_rotaryZq_addq_matmulZk_path_1Zk_path_2Zk_rotaryZk_addk_matmulZv_path_1Zv_path_2Zv_addv_matmulr  r[  r/  Zroot_input_is_sameZall_paths_have_biasZall_paths_have_no_biasZq_input_to_attentionZk_input_to_attentionZv_input_to_attentionqwkwvwr  
qkv_weightZpacked_matmul_nodeZqbkbZvbZqkv_biasZpacked_add_nodeZgqa_noder"   r"   r#   replace_mha_with_gqa  sH  


$






*

 









r  c              	      s  d}dd j D }|dk r$|| ds$|d7 }|dk r$|| drd}tj| d }d| |   fddt|D }td	|  tj   }td
|  |d }|d }|d }	d}
jD ]_}|jdkr|j d |v rtd|j	 d|j  |
d7 }
||j d  }d| }dgdt|j  }|
| |j| |jtjddg tj|tj||d|	g}j|g qg|
|krtd| d|
 d S )NrS   c                 S      g | ]}|j qS r"   r5  r   gir"   r"   r#   r         zBupdate_decoder_subgraph_output_cross_attention.<locals>.<listcomp>r   pastr   c                    s"   i | ]}j |d     j|qS )r   )r   r   )r   layerinput_cross_past_0r  r"   r#   r8    s   " zBupdate_decoder_subgraph_output_cross_attention.<locals>.<dictcomp>z    -- past_key_cross_inputs = zpast_key_cross_0_shape is r   r}  z'    -- add cross QK output from: node: z with output: r|  r   r  z#Did not add cross QK for all layersz vs )r   rA  rx   r   r   printr  r   r   r   rw   rv   r  r   r   make_attributer   r
   r   r   )r  input_self_past_0rE  Zoutput_self_present_0
num_layersZpast_key_cross_inputsZinput_past_key_cross_0_shapeZbatch_size_dimZnum_heads_dimZcross_seq_len_dimZnum_layer_output_qkr   r  Zcross_attention_out_nameZappended_namesZcross_attentionr"   r  r#   .update_decoder_subgraph_output_cross_attention  sH   



r  c              	   C   s$  d}dd | j D }|dk r$|| ds$|d7 }|dk r$|| drd}tt| j | d }d| | }g }g }| jD ]}|jdkrK||g q>t||k rTd	S d }	| jD ]}|jd
krd|}	 nqYg d}
d}t| \}}t|dkr|D ]}td| d| d qy|D ]}td|j d|j	  qt
jjddgdgdd}t
jjddg|gdtjd}|||g | jD ]}t|jdkr|	d ur|jd |	j d krt
jjddgdgdtjd}|jd |j d< ||g |jdkrt|}| D ]
}||
vr||= q|j d |j d |j d g}|t|j dkr%|j d ndg |t|j dkr8|j d ndg |t|j dkrK|j d ndg |t|j d kr^|j d  ndg |dg |d!g |d"g |t|j dkr|j d ndg d|d#< t
jjd$||jfd%|j	i|}||vrt|j D ]\}}||v r||j |< q||g q| d& | j| d'd | j D }g }t| j D ]0\}}||kr||k rt|}t
jj|j	|jjj|d |d d(|d gd)}||g qd|vr|t
jjdt
jjdgd*g d!|vr1|t
jjd!t
jjdgd*g d"|vrG|t
jjd"t
jjg d+d*g | d, | j | g }t| jD ]+\}}||kr}t|}t
jj|j	|jjj|d |d d(|d gd)}||g qY| d- | j| d.S )/NrS   c                 S   r  r"   r5  r  r"   r"   r#   r     r  zSupdate_decoder_subgraph_share_buffer_and_use_decoder_masked_mha.<locals>.<listcomp>r   r  rU   r   rK  FZRelativePositionBiasr.  Z#past_sequence_length_squeezed_int64r   zFound tensor name `z` to be renamed to ``zFound node to remove: type = z	, name = rm  r  Zpast_sequence_length_squeezedZ!node_past_sequence_length_squeezer5  rq  Z&node_past_sequence_length_squeeze_cast)r   rs  Zpast_sequence_length_int64Zpast_sequence_length_castr   r  r  r  r*  r+  rM   r}  r   r   c                 S   r  r"   r5  )r   r7  r"   r"   r#   r   !  r  r  r  r  r,  r   r   T)r   rA  rj   rx   r   r   rv   rG  r  r   r   r   r!  r
   rv  r   r  r1  r   r   r  r   r0   r   r   r   )r  r  rE  Zoutput_self_past_0r  r  r$  Z	old_nodesr   Zrel_pos_bias_noder2  Ztarget_squeezed_past_seq_namerC  rD  Zname_to_renamenrry  rz  r  r3  r&  r6  r   Zorig_input_namesr"  r   r  r   r#  r"   r"   r#   ?update_decoder_subgraph_share_buffer_and_use_decoder_masked_mha  s
  




*
&&&&&





 

	

r  model_protoc                 C   s  t | }| }g }g }| D ]}|jdkrd|jd v r&d|jd v r&q||jd  }||jd  }||jd  }||jd }	||jd }
||jd }|	rY|
rY|s\ dS t|	}t|
}t|}tj	|||gdd}|j
d	d
d}tjj|d |	jdkrtjntj|jd |jd g|  d}| jj|g tjjd	|jd |d g|d g|d}|jd |jd< d|jd< d|jd< ||g ||||g q|| || |  |  dS )Nr}  Zpast_key_crossrS   Zpast_value_crossr   r   Fr   r   Z
MatMul_QKV)Zname_prefixZ_weightr  Z_outrn  r   T)r   r   nodesr   r   r   r   r   r   r   ru  r   r   r  r   r
   r   ri   r   flattentolistr   r   rv   r!  r   Z	add_nodesZremove_nodesZupdate_graphrS  )r  
onnx_modelr   Znodes_to_addrD  r   r  r  r  Zq_weightZk_weightZv_weightr  r  r  r  Zmatmul_node_nameweightr   r"   r"   r#   pack_qkv_for_decoder_masked_mhaP  sZ   








r  decoder_onnx_pathc                 C   s   t j| dd}tt|jjD ],}|jj| jdks#|jj| jdkr;|jj| jjj	j
d }|dr8|  d|_qtj|| |d dS )aQ  Update the input shapes for the inputs "input_ids" and "position_ids" and make the sequence length dim value 1 for each of them.
       The decoder model will be over-written.

    Args:
        decoder_onnx_path (str): Path of GPT-2 decoder onnx model
        use_external_data_format(bool): output tensors to external data or not.
    Tr   r   r   rS   r  r   )r   r   r   rx   r   r   r   r0   r   r   r  HasFieldClearr  r   r   )r  rC   r   r   Zshape_dim_protor"   r"   r#   *update_input_shapes_for_gpt2_decoder_model  s   	
r  init_decoder_onnx_pathc                 C   s  t j| dd}|jjd j}t|}| }||v sJ || }|jdkr'dS ||g dg d}|du rA||g d	g d
}|du r_||g dg d}|du r_||g dg d}|du redS |d }	|	jdk}
|
sd}||	g d|dddg}|du rd}||	g d|dddg}|du rd}||	g d|ddg}|du rd}||	g d|ddg}nBd}||	g d|ddg}|du rd}||	g d|ddg}|du rd}||	ddg|dg}|du rd}||	ddg|dg}|du rdS |dkrdnd}|
s|	|	d|}n|	|	d|}|du r!dS |d }|d }t j
jdtjdgdgd}t j
jdtjdgdgd}t j
jdtjdgdgd}t j
jdtjdgdgd}|| || || || d|jd  }t j
jd|jd ddddg|g|ddd }|
s|jd n|jd! }d|jd  }t j
jd|ddddg|g|dd"d }|| || |||jd | ||	|| |  tj|||d# dS )$a  Generates the initial decoder GPT2 subgraph and saves it for downstream use.
       The initial decoder model will be saved to init_decoder_onnx_path.

    Args:
        decoder_onnx_path (str): Path of GPT-2 decoder onnx model
        init_decoder_onnx_path (str): Path of GPT-2 init decoder onnx model
        use_external_data_format(bool): output tensors to external data or not.
    Tr   r   r   F)rq  r_  rb  rb  rq  r   rq  FastGelurq  r   rq  r_  rb  )r   r   r   rS   r   r   r   r   r   r   r   r   r   N)
rq  SkipLayerNormalizationrq  r   rq  r  rq  r   rq  r  )
r   r   rS   r   r   r   r   r   r   r   )r_  rb  rb  r   r  r   r_  rb  )r   r   rS   r   r   r   r   r   )r  r   r  r   r  )r   rS   r   r   r   rX   r  )rb  rq  r   r  rS   )rb  r   r  )rq  r   r  r  rb  rl  ZSliceLastTokenStartsr  ZSliceLastTokenEndsZSliceLastTokenAxesZSliceLastTokenStepsZedge_modified_re  ZGatherLastToken_0_rn  r   ZGatherLastToken_1_r   )r   r   r   r   r   r   r   r   rt  r   r   r  r
   r   r  r!  ru  rx  Zreplace_node_inputrS  r   )r  r  rC   Zinit_decoder_model_protor   gpt2_init_decoder_modelr   Zlogits_matmul_nodeZ"logits_matmul_to_residual_add_pathZresidual_add_nodeZis_skiplayernorm_pathZ&residual_add_to_attention_parent_indexZresidual_add_to_attention_pathZ residual_add_to_add_parent_indexZadd_before_residual_addZ	attentionZmatmul_after_attentionZslice_startsZ
slice_endsZ
slice_axesZslice_stepsZslice_0_output_nameZslice_node_0Zadd_before_residual_add_outputZslice_1_output_nameZslice_node_1r"   r"   r#   generate_gpt2_init_decoder  sX  













r  c           	      C   s   t d}t |j}t |j}t |j}| jjD ]%}|jjjj	D ]}|
dr;|j||||fv r;t|j}|  ||_qq| jjD ]%}|jjjj	D ]}|
dre|j||||fv ret|j}|  ||_qIqAdS )zoMake dim_proto numeric.

    Args:
        model: T5 encoder and decoder model.
        config: T5 config.
    rS   r  N)rd   r/  Zd_modelZd_kvr   r   r0   r   r   r  r  r  rj   r  r  r   )	r   configrY  r/  Zhidden_sizeZ	head_sizer   Z	dim_protor  r"   r"   r#   make_dim_proto_numeric_t5	  s>   




r  generation_typec           -      C   s  | j dk}|tjk}|tjk}|tjk}| j}td|  t| j	dkrM| j	d dkrM|rJ| j
tjjkrJg d| _	td| j	  td ng | _	|sQ|re|sWtd	| jr^td
| jretd|rp|rp| jsptd| jry|sytd| jr| jstd|r| jrtj| jrtd| j  nN| js| j d| j
 d}tt| jj| | _td| j d| j d t|  n"| jr| jrtd| j d| j  ntd| j d t|  d}| j s| j
tjjkr|r|s|s|rtd| j d t!| j| j"}|st#d d}	d}
| j$sg|rg|s,|s,|rgtd| j d d | j
 d}tt| jj| }
t%| j|
| j"}	|	sXt#d! |	rgt&| j| j"sgtd"|sq| j'sq|	rtd#| j d t(| j| j" |	rtd#|
 d t(|
| j" |rt)j*| j| j+d$}n| j d%krt,j*| j| j+d$}n	t-j*| j| j+d$}| j.rtd&|  |j/}|r|j/n|j0}|j1}| j1d'kr| j1}| j/d'kr| j/}| j0d'kr| j0}t2j3| jd(d)}| j  d*|j4_5d}| j dkr)t6|j4| j
 |	r(t2j3|
d(d)}| j  d+|j4_5t6|j4| j
 nt7|j4| j
 d}|r:g d,}n
|s@|rDg d-}| j8rN|9d. n|9d/ | j:r]|9d0 n|9d/ | j;rl|9d1 n|9d/ |r| j<r| j=r|9d2 n|9d/ | j>r|9d3 d4g}| jr|9d5 | jr| jsJ d6|9d7 d}|rt2j?j@d8||d9| j  d:}n#|rt2j?j@d;||d<| j  d:}n|rt2j?j@d=||d>| j  d:}d?|_Ad}|rt2j?Bd@|t2j?BdA|t2j?BdB| jCt2j?BdC| jDr	dndt2j?BdD| j dkrdndg}nw|rCt2j?Bd@|t2j?BdA|t2j?BdD| j dkr7dndt2j?BdB| jCg}nP|rt2j?Bd@|t2j?BdA|t2j?BdD| j dkr^dndt2j?BdB| jCt2j?BdE| jEt2j?BdF| jFt2j?BdG| jGt2j?BdH| jHt2j?BdI| j<t2j?BdJ| jIg
}|r|Jt2j?BdK|g |jKJ| g }| j dLv rf| j'rtdM| j d t(| j| j" t2j3| jd(d)}t|j4jLdNkrdOndP}| j  dQ| |j4_5tM|j4| j
 tN|| tN|| |r%| jstdRtdS tO|j4rtdT ntdU tP|r tdV ntdW | jQs@tR||}tt| dXdYdZ |D  d[ |jSdksJJ d\|jKJt2j?BdO|j4t2j?Bd]|j4t2j?Bd^|jSg n|	r| jQstR||}tt| dXd_dZ |D  d` |rtda tT|j4 | jrtU|j4|dstdb|jK9t2j?Bdc|j4 ntV|j4}tt| dd |rtde tT|j4 | jrtU|j4|d(stdf|jK9t2j?Bd]|j4 t2j?WdgtXjYdhdig}t2j?WdjtXjYdg}t2j?WdktXjYdg}t2j?WdltXjYdg}t2j?WdmtXjYdg}t2j?WdntXjZdg}t2j?WdotXjZdg}d} |r<|||||||g} n|sB|rH||||g} | j8r[t2j?Wd.tXjY|g}!| 9|! | j:rot2j?Wd0tXjYdh|g}"| 9|" | j;rt2j?Wd1tXjYdhdig}#| 9|# | j<r| j=rt2j?Wd2tXjYdh|g}$| 9|$ |r| j>rt2j?Wd3tXjYdg}%| 9|% d}&|rt2j?Wd4tXjYg dp}&n|s|rt2j?Wd4tXjYdhdjg}&|&g}'| jrt2j?Wd5tXjZdhdmg}(|'9|( | jr t2j?Wd7tXjZdqdhdl|g})|'9|) t2j?[|g|s| j  drn| j  ds| |'|}*t2j?j\|*dt|j]du}+| j"rHddvl^m_}, |,`t2ja|,`dwk r=t#dx tbjc|+| jd(d(dy nt2c|+| j tdz| j  dS ){zConvert model according to command line arguments.

    Args:
        args (argparse.Namespace): arguments parsed from command line
    r2   z**** past_present_share_buffer=rS   r   r@   )rb  r_  r  r  z**** Setting op_block_list to zI**** use --op_block_list if you want to override the block operator list.z<Currently only gpt2 with greedy search/sampling is supportedzLoutput_sequences_scores currently is not supported in greedy search/samplingzHoutput_token_scores currently is not supported in greedy search/samplingzi`use_decoder_masked_attention` MUST be turned on to use `past_present_share_buffer` in case of BeamSearchzS`past_present_share_buffer` MUST be turned on to use `use_decoder_masked_attention`z?`use_decoder_masked_attention` option is only supported on GPUsz)skip convert_to_onnx since path existed: Z_past_z.onnxzConvert GPT model z	 to onnx z ...z,skip convert_to_onnx since paths specified: z and zConvert model z to onnx ...Fz=Pad logits MatMul weights for optimal MatMul perf in fp16 on z. The file will be overwritten.z]Tried and failed to pad logits MatMul weights. Performance may be sub-optimal for this MatMulNz*Creating an initial run GPT2 decoder from z. Zgpt2_init_past_zuTried and failed to generate the init decoder GPT2 model. Performance may be sub-optimal for the initial decoding runzGCould not update the input shapes for the non-initial decoder subgraph.z Run symbolic shape inference on ru   r3   zConfig=rX   Tr   z decoderz init decoderr   
max_length
min_length	num_beamsnum_return_sequenceslength_penaltyrepetition_penaltyr   r  r  r  rL   r   rO   r   rQ   rR   	sequencessequences_scoresz8--output_token_scores requires --output_sequences_scoresscoresZ
BeamSearchZBeamSearch_rn  ZGreedySearchZGreedySearch_ZSamplingZ	Sampling_r~  eos_token_idpad_token_idno_repeat_ngram_sizerK   r   temperaturetop_pfilter_valuemin_tokens_to_keepcustompresence_penalty
vocab_sizer3   r4   zSymbolic shape inference on r   r   zencoder and decoder init zMpast_present_share_buffer is only supported with use_decoder_masked_attentionzl*****update t5 decoder subgraph to share past/present buffer and use decoder_masked_multihead_attention*****z4*****update t5 decoder subgraph successfully!!!*****zF*****DecoderMaskedMultiHeadAttention is not applied to T5 decoder*****z9*****pack qkv for decoder masked mha successfully!!!*****z3*****pack qkv for decoder masked mha failed!!!*****z shared initializers (c                 S   r  r"   r5  r   r"   r"   r#   r   ?  r  z,convert_generation_model.<locals>.<listcomp>z>) in encoder and decoder subgraphs are moved to the main graphz%decoder_start_token_id should be >= 0r   decoder_start_token_idc                 S   r  r"   r5  r   r"   r"   r#   r   [  r  zC) in decoder and init decoder subgraphs are moved to the main graphzY*****update init decoder subgraph to make past and present share buffer******************zLCould not update the init decoder subgraph to use DecoderMaskedSelfAttentionZinit_decoderz: initializers from the decoder are moved to the main graphzT*****update decoder subgraph to make past and present share buffer******************zGCould not update the decoder subgraph to use DecoderMaskedSelfAttentionr   r-  rY  r  r  r  r  r  r  )r-  r  r  zmax_length - sequence_lengthz beam searchz greedy searchzonnxruntime.transformers)Zproducer_nameZopset_imports)versionz1.12.0z0Require onnx >= 1.12 to save large (>2GB) model!)r   Zall_tensors_to_one_filezmodel save to )dr   r   r(   r)   r*   rM   rz   r{   rx   ry   rt   r   ri   r    NotImplementedErrorrI   rJ   rN   r   r[   rs   rf   rg   existsrr   r   r   r   as_posixr~   r   r   rE   r   rC   r   rF   r  r  rD   r   r   from_pretrainedru   r   r   r;   r  r  r  r   r   r   r   r   r   rL   rw   rO   rP   r  rQ   rR   r   r!  r  r  r  rK   r  r  r  r  r  rv   r  r   r   r  r  r  rG   r   r  r'  r4  r   r   r
   r   r   Z
make_graphZ
make_modelZopset_import	packagingr  parse__version__r   r   )-rn   r  Zis_gpt2Zis_beamsearchZis_greedysearchZis_samplingrM   Zonnx_filenameZlogits_matmul_weight_paddedZgpt2_init_decoder_generatedZgpt2_init_decoder_onnx_pathZgpt2_init_decoder_onnx_filenamer  r  r  r  r   r  ro  rp  r   Zattr_to_extendr   r   suffixr   r  r  r  r  r  r  Zgraph_inputsrL   rO   r   rQ   rR   r  Zgraph_outputsr  r  Z	new_graphZ	new_modelr  r"   r"   r#   convert_generation_model	  s  
	






	























	







	
r  r   r   r  r  bad_words_idsc                 C   s   | j rtj std| jtjjkr|	  t
| j rdnd}|| td ||}||}g }t| jD ]3}	t }
|j||| j| j| j| j| j||| j| j| j|r\|ndd| jpc| jd}	|t |
  q<|jd }dd	lm} |||S )
a  Test PyTorch performance of text generation.

    Args:
        args (argparse.Namespace): arguments parsed from command line
        model (Union[GPT2LMHeadModel, T5ForConditionalGeneration]): PyTorch model
        input_ids (torch.Tensor): input_ids
        attention_mask (torch.Tensor): Attention mask
        eos_token_id (int): EOS token ID
        pad_token_id (int): Padding token ID
        bad_words_ids (List[List[int]]): Words shall not be generated.

    Raises:
        RuntimeError: PyTorch with CUDA is not available for --use_gpu

    Returns:
        Dict[str, Any]: A dictionary with string with metric name, and value can be integer or string.
    z=Please install PyTorch with Cuda for testing gpu performance.zcuda:0cpuFNTr   r   r  r  r  rK   r  r  r  r  r  r  r  Zreturn_dict_in_generateZoutput_scoresr   get_latency_result)r[   torchcudaZis_availabler   rt   r   ri   r    Zhalfdevicers  Zset_grad_enabledr   
total_runstimegenerater  r  r  rK   r  r  r  r  rI   rJ   rw   r   benchmark_helperr  )rn   r   r   r   r  r  r  r  Ztorch_latencyrX  startr-  r  r"   r"   r#   test_torch_performance  sB   







r  c                 C   sp   t j| jt jd}t| jd D ]%}d}t| jd D ]}| | | |kr0|dkr0d|| |< q|d7 }qq|S )Nr   r   rS   )r   onesr   int32r   )r   r  r   r   Zabs_posr   r"   r"   r#   create_attention_mask9  s   
r  F	sentences	is_greedyc           +      C   s  | j dksJ tj| j| jd}d|_|j|_tj| j| j|j	d}|du r*g d}||ddd	}|d
 }|d }d}|j
|dd}	dd |	D }	| jrStd|	 ng }	|j}
|
j	}|
j	}|
j}g }d}| jstd td |j||| j| j| j| j| j||| j| j| j|	r|	ndd| jp| jd}td
| td td|j | jrtd|j | jrtd|j t |jD ]\}}|j!|dd}|"| t| d|  qtd td |r|# $ %t&j't&j(| jgt&j'dt&j(| jgt&j'dt&j(| jgt&j)dd}nB|# $ %t&j't&j(| jgt&j'dt&j(| jgt&j'dt&j(| jgt&j'dt&j(| jgt&j'dt&j(| jgt&j)dt&j(| jgt&j)dd}| jrgt&j*|t&j'd}| jrc|	D ]}d||< q[||d< | j+rrt,|||d< |j-d }| j.rt/d  t&j*||ft&j'd}||d!< | j0rt1| j2j34 }td"| dd#l5m6} t/d$| d% |g}t |D ]\}}t7j89|d&t:| }||| qtd'| | j;rdS td( t<| j2| j=| j>}td) |?d|}g }t@| jAD ]}tBB }|?d|}|"tBB |  qdd*lCmD}  |j-d }| ||}!td+ |d }"td|" | jr7td|d,  | jrBtd|d-  |rm|"j-\}}#g }$t@|D ]}|j!|"| dd}|$"| td.| d/|  qPn5|"j-\}}%}#g }$t@|D ](}t@|%D ] }&|j!|"| |& dd}|$"| td.| d0|& d|  qqy|r|jE|| jd1}'tFG|"}(td td2 t|' t| td td3 t|( t|$ td ||$k})td4|)rd5nd6 |)|!d7< | jHrtI| ||||||	}*td8|* td9|! |!S ):a9  Test GPT-2 model

    Args:
        args (argparse.Namespace): arguments parsed from command line
        sentences (Optional[List[str]], optional): input text. Defaults to None.

    Returns:
        Union[Dict[str, Any], None]: A dictionary with string with metric name, and value can be integer or string.
    r2   r  left)ru   r  N)zThe product is releasedzI enjoy walking in the parkzTest best way to investptTZreturn_tensorsr   r   r   walk in park)Zadd_prefix_spacec                 S      g | ]}|gqS r"   r"   r   Zword_idr"   r"   r#   r   m  r  z"test_gpt_model.<locals>.<listcomp>r  2--------------------------------------------------CTest PyTorch model and beam search with huggingface transformers...r  !huggingface transformers outputs:r  r  r  Zskip_special_tokens: 'Testing beam search with onnxruntime...r   r  r  r   rL   zYUse prefix vocab mask with all ones in ORT, but no corresponding setting for Torch model.rO   test_data_diroutput_test_datazSaving test_data to z/test_data_set_* ...test_data_set_
ORT inputszCreating ort session......zRun ort session......r  ORT outputs:rS   r   batch z sequence: 
 sequence rX   Torch Sequences:ORT Sequences:zTorch and ORT result issame	differentparityTorch LatencyORT)Jr   r   r  rr   ru   padding_sideZ	eos_tokenZ	pad_tokenr   r  encoderL   rz   r   r  r  r\   r  r  r  r  r  rK   r  r  r  r  rI   rJ   r  r  r  r   decoderw   r  numpyastyper   r  arrayfloat32r  rP   r  r   rO   r{   r_   r   r   r   r  bert_test_datar  rf   rg   re   rd   r]   r   r[   rY   runr   r  r  r  r  r  r  
LongTensorr^   r  )+rn   r  r  	tokenizerr   ro  r   r   	bad_wordsr  r  r  r  r  torch_decoded_sequencesbeam_outputsr   sequencedecoded_sequencerL   bad_word_idr-  rO   r  r  
all_inputsdirr   resultlatencyrX  r  r  r   r  r  ort_decoded_sequencesnum_sequencesr   torch_sequencesort_sequencesis_sametorch_latency_outputr"   r"   r#   test_gpt_modelE  sB  
















	
r+  c           )      C   s(  | j dv sJ | jrtd dS tj| j| jd}d|_| j dkr,t	j| j| jd}n	t
j| j| jd}|du r=ddg}||d	d
d}|d }|d }d}||dd }dd |D }| jrhtd| ng }|j}	|	j}
|	j}|	j}td|
 d| d|  g }| jstd td |j||| j| j| j| j| j|
|| j| j| j|r|ndd
| jp| jd}td| td td|j | jrtd|j | jrtd|j  t!|jD ]\}}|j"|d
d}|#| t| d|  qtd td t$j%|t$j&d }| jr|D ]}d!||< q|' ( )t$j&t$j*| jgt$j&d t$j*| jgt$j&d t$j*| jgt$j&d t$j*| jgt$j&d t$j*| jgt$j+d t$j*| jgt$j+d d"}| jr]||d#< | j,rht-|||d< | j.rt/| j0j12 }td$| d!d%l3m4} |g}t!|D ]\}}t5j67|d&t8| }||| qtd'| t9| j0| j:| j;}g }t<| j=D ]}t>> }|?d|}|#t>> |  q|j@d! }d!d(lAmB} |||}td) |d! } td|  | jrtd|d*  | jrtd|d+  | j@\}}!}"g }#t<|D ](}t<|!D ] }$|j"| | |$ d
d}|##| td,| d-|$ d|  qq	| jsz|jC|| jd}%tDE| }&td td. t|% t| td td/ t|& t|# td ||#k}'td0|'rsd1nd2 |'|d3< | jFrtG| ||||
||}(td4|( td5| |S )6a=  Test T5 or MT5 model

    Args:
        args (argparse.Namespace): arguments parsed from command line
        sentences (Optional[List[str]], optional): input text. Defaults to None.

    Returns:
        Union[Dict[str, Any], None]: A dictionary with string with metric name, and value can be integer or string.
    r  zLSkipping parity test as prefix vocab mask is not implemented by Hugging FaceNr  r  r3   z4translate English to French: The product is releasedzsummarize: research continues to show that pets bring real health benefits to their owners. Having a dog around can lead to lower levels of stress for both adults and kids.r  Tr  r   r   r  rX   c                 S   r  r"   r"   r  r"   r"   r#   r   I  r  z!test_t5_model.<locals>.<listcomp>r  zeos_token_id:z, pad_token_id:z, vocab_size:r  r  r  r  r  r  r  r  r  r   r   r   r  rL   r  r  r  r  r  r  rS   r   r  r  r	  r
  zTorch and ORT result is r  r  r  r  r  )Hr   rO   rz   r   r   r  rr   ru   r  r   r   r  rL   r  r  r  r  r\   r  r  r  r  r  rK   r  r  r  r  rI   rJ   r  r  r  r   r  rw   r   r  r  r  r  r  r  r  rP   r  r_   r   r   r   r  r  r  rf   rg   re   rd   r   r[   rY   r   r  r  r  r   r  r  r  r  r  r^   r  ))rn   r  r  r   ro  r   r   r  r  r  r  r  r  r  r  r   r  r  rL   r   r  r  r!  r"  r   r$  rX  r  r#  r-  r  r   r  r&  r  r%  r   r'  r(  r)  r*  r"   r"   r#   test_t5_model  s   













	
r,  c                 C   sr  t | }t|j |jdv rB|jr tj|js td|j |j	r2tj|j	s2td|j	 |jr8|j	r>|j	rB|jsBtd|j
dkoK|jdk}|jdkr}|r}|jdkrv|jdk rvt|tj td	 |jd
kss|jss|jrudS nt|tj nt| td |jdv rt||d}nt|||d}|r|jrtd|j d|j d |S td|j  |S )a/  Main entry function

    Args:
        argv (Optional[List[str]], optional): _description_. Defaults to None.
        sentences (Optional[List[str]], optional): input text. Defaults to None.

    Raises:
        ValueError: Path does not exist: --encoder_decoder_init_onnx
        ValueError: Path does not exist: --decoder_onnx
        ValueError: --decoder_onnx and --encoder_decoder_init_onnx are not used together for T5

    Returns:
        Union[Dict[str, Any], None]: A dictionary with string with metric name, and value can be integer or string.
    r  z1Path does not exist: --encoder_decoder_init_onnx z$Path does not exist: --decoder_onnx zB--decoder_onnx shall use together with --encoder_decoder_init_onnxrS   r2   rW   rV   zThe test for gpt2_sampling onnx model is limited to non-custom model with small top_p(e.g <=0.01) value. The result should be the same as gpt2 greedy search.g{Gz?Nzstart testing model...)r  )r  r  zOutput files: r.   z.datazOutput file: )ro   r   r;   r   r   rf   rg   r  r   rs   r  r  r  r  r   r*   rz   r{   r  rR   r)   r,  r+  rC   r   )r+   r  rn   r  r#  r"   r"   r#   r     sF   



r   __main__r   )T)r   r   NN)r   )r   rS   rX   )NFr  )a__doc__r`   loggingr   rf   r  enumr   pathlibr   typingr   r  r   r   r  r  r   r   Zfusion_utilsr   r   r	   r
   r  r   Ztransformersr   r   r   r   r   r   r   r   Zonnxruntimer   r   r   r   Z4onnxruntime.transformers.models.gpt2.convert_to_onnxr   r|   Z0onnxruntime.transformers.models.gpt2.gpt2_helperr   Z2onnxruntime.transformers.models.t5.convert_to_onnxr   r   Z,onnxruntime.transformers.models.t5.t5_helperr   r   	getLoggerrz   r   rQ  rd   	Namespacero   r~   r   boolr   r   r   r   r   r   rj   dictr   r   r   r  r  r  r'  r4  rG  rU  r\  r{  r  r  r  r  r  r  r  r  r(   r  ZTensorr  r  r+  r,  r%   r"   r"   r"   r#   <module>   s:  %(
	   -N!8L_
j
%
1
Vi 4 *J
 o)  ;#
  -'
   


B

 X $>
;
