o
    iǍ                     @   s   d Z ddlZddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlZddlmZ ddlmZ edZ	d*d	d
Zd+ddZd,ddZd,ddZd+ddZdd Zi dddi ddgdfddZdd Zd d! Z				"	#	$	$	%d-d&d'Zi dddd"d#d"d$d$d%ddgfd(d)ZdS ).zWeightOnly for onnxrt adaptor.    N)numpy_helper)np_dtype_to_tensor_dtype   )	ONNXModel)simple_progress_barZneural_compressorc	                 C   s.  || d }	t j|jd |	fdd}
| jd d|d| }| jd |g}g }i }d}|d	kr[|d
d
d
d
df |d
d
dd
df d	> B }|d
d
d
|	f |
d
d
d
d
f< n|dkrb|}
n	td| d t |
d||	f}
t |d|f}|jt jks|jt j	ksJ t
jj| jd d t|j|j| dd}||j || |d
urB|dkr|d}n`|d	krt j|jd d d ddd}t |jd | | d}|d
d
d }|dd
d }||d  d@ ||  B ||d < ||d  d@ ||  d	> B ||d < ntd| dt ||d df}t
jj| jd d d|j| dd}||j || |d |d< |d |d< ||d< ||d< |dkr_||d< t
jj|d|
j|
 dd}|| t
jj|f|| j| jr| jd t| ndt| dd|}||fS )aB  Build MatMulNBits node.

    Args:
        node: original matmul node
        weight_shape: original weight shape
        num_bits (int): num_bits
        group_size (int): how many elements share one scale/zp
        k_blocks (int): block number
        q_weight (array): quantized weight
        scale (array): scale
        zero_point (array): zero point
        accuracy_level (int): accuracy level. Support 0 (unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8).

    Returns:
        matmul_weight_only_node: MatMulNBits node
        new_inits: initializers of the new node
       r   uint8dtyper   _QGZMatMulNBits   N   z8MatMulNBits does not have kernel support for num_bits = .Z_scaleTnameZ	data_typedimsvalsraw         Z_zpKNbits
block_sizeaccuracy_levelzcom.microsoft)inputsZoutputsr   domain)npzerosshapeinputloggererrorreshaper
   float32Zfloat16onnxhelpermake_tensorr   tobytesappendr   astypefullarangeZravel
ValueErrorZ	make_nodeoutputstr)nodeweight_shapenum_bits
group_sizek_blocksq_weightscale
zero_pointr   Z	blob_sizepackedZq_weight_nameZinput_names	new_initskwargsop_typeZq_weight_pairsZscale_tensorZ	packed_zpidxZeven_idxZodd_idxZ	zp_tensorq_weight_tensorZmatmul_weight_only_node rA   p/home/kim/smarthome/.venv/lib/python3.10/site-packages/onnxruntime/quantization/neural_compressor/weight_only.pymake_matmul_weight_only_node,   s   0&


$*


$rC   r       asymint      ?c              	      s  t | d|f} |dks|dkrd| d  dn|dkr8|dkr)d|d  d nd |dkr6d|d   ndt j| ddd	| }t j| ddd	| }|dkrt t |t |}t |j}	|dk}
||
 d
 t j	   |	|
< |dkrt 
|	jnt j|jddd|d >  }nHt |j}	t  fdd|| ||k   D |	||k< |dkrt 
|	j| |	  nt dt  t 
|	j| |	  d}t j| |	jd}t j| |	|d t j|||d t j||d t j| |d ||	|fS )a	  Quantize tensor per group.

    Args:
        data : input weight
        num_bits (int, optional): num_bits. Defaults to 4.
        group_size (int, optional): how many elements share one scale/zp. Defaults to 4.
        scheme (str, optional): quantization scheme. Defaults to "asym".
        dtype (str, optional): data type. Defaults to "int".
        ratio (float, optional): percentile of clip. Defaults to 1.0.

    Returns:
        output: quantized weight
        scale: scale
        zero_point: zero point
    r   rE   uintr   r   r   symTaxisZkeepdimsg       @rF   r   r	   c                    s   g | ]
}t |   qS rA   )float.0imaxqminqrA   rB   
<listcomp>       z quant_tensor.<locals>.<listcomp>out)r    r&   minmaxmaximumabsonesr"   r-   float64r!   arrayflattentolistroundminimum
empty_liker
   divideaddclip)datar5   r6   schemer
   ratiorminrmaxZ	max_ranger9   maskr:   r8   rA   rP   rB   quant_tensor   s<    .&,
rl   c           &      C   s@  t | d|ft j} d| d }d}t j| d ddd}t || }t |t | }t j| ddd}t j	| ddd}	t j|ddd}
t j||  ddd}t j
|	j| jd}||	k}|| |	| ||   ||< d| }t t || |  ||}|| | |  }t j||d  ddd}d}d	}d}t|D ]}t j
|	j| jd}t |||  | | g| jd }||	k}||	| ||   ||< t t || |  ||}|| }t j|ddd}t j|| ddd}t j||  ddd}t |
| |d }|
| ||  | }|| ||  | }|| | |  }t j||d  ddd} t | }!t |}"t |!|"k d }#||#d
d
f ||#d
d
f< | |# ||#< ||# ||#< ||# ||#< qt | |  d|d}$|t j}t j| |jd}%t j| ||%d t j|%|$|%d t j|%|%d t j|%|||%d |%||$fS )a  Quantize tensor per group based on k quant.

    Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c

    Args:
        data : input weight
        num_bits (int, optional): num_bits. Defaults to 4.
        group_size (int, optional): how many elements share one scale/zp. Defaults to 32.

    Returns:
        output: quantized weight
        scale: scale
        zero_point: zero point
    r   r   r   r   TrJ   r	      皙?Nr   rU   )r    r&   r-   r'   sumsqrtrd   rZ   rW   rX   r[   r"   r
   re   r`   ranger]   subtractwherer\   rb   rc   )&rf   r5   r6   rQ   rR   sum_x2av_xweightsri   rj   sum_wsum_xiscalerk   r9   
quant_datadiffbest_madnsteprdeltarrminis_
iscale_newfactorquant_data_newmul_weights_quant_data_newsum_lsum_l2sum_xlD
this_scalethis_minmadmad_1
best_mad_1idx_to_replacer:   r8   rA   rA   rB   quant_tensor_k_quant_cpu   sd   (


r   c           (      C   s  zddl }ddl}|j r|| } | d|f|j} d| d }d}|j| d ddd}|	|| }|
||| }	|j| ddd}
|j| ddd}|j|	ddd}|j|	|  ddd}|j|j| jd}|
|k}|| || |
|   ||< d| }|||| |
  ||}|| |
 |  }|j|	|d  ddd}d	}d
}d}t|D ]}|j|j| jd}||||  | | g| jd }|
|k}||| |
|   ||< |||| |
  ||}|	| }|j|ddd}|j|| ddd}|j||  ddd}||| |d }|| ||  | } || ||  | }!| | |! |  }|j|	|d  ddd}"||"}#||}$||#|$k d }%||%ddf ||%ddf< |"|% ||%< | |% ||%< |!|% |
|%< q||
 |  d|d}&||j}|j| |jd}'|j| ||'d |j
|'|&|'d |j|'|'d |j|'|||'d |' | |& fW S td t| ||W S  ty   td t| || Y S w )a  Quantize tensor per group based on k quant.

    Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c

    Args:
        data : input weight
        num_bits (int, optional): num_bits. Defaults to 4.
        group_size (int, optional): how many elements share one scale/zp. Defaults to 4.

    Returns:
        output: quantized weight
        scale: scale
        zero_point: zero point
    r   Nr   r   r   TrJ   r	   rm   rn   r   rU   zqTry to use k-quant quantization on CUDA. However, CUDA is not available.Fall back to k-quant quantization on CPU.zNow we are using k-quant quantization on cpu, which is time consuming.Please consider install cupy to speed up on CUDA. See https://cupy.dev/Please also install torch to check CUDA availability.) ZcupytorchcudaZis_availableZasarrayr&   r-   r'   ro   rp   rd   rZ   rW   rX   r[   r"   r
   re   r`   rq   r]   rr   rs   r\   rb   rc   getr$   warningr   ImportErrorinfo)(rf   r5   r6   cpr   rQ   rR   rt   ru   rv   ri   rj   rw   rx   ry   rk   r9   rz   r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r:   r8   rA   rA   rB   quant_tensor_k_quant_cuda  s   
(

r   c           
      C   s2   | j }t| |||||\}}}	t|||	  |S )a  Quant dequant tensor per group.

    Args:
        data : input weight
        num_bits (int, optional): num_bits. Defaults to 4.
        group_size (int, optional): how many elements share one scale/zp. Defaults to 4.
        scheme (str, optional): quantization scheme. Defaults to "asym".
        dtype (str, optional): data type. Defaults to "int".
        ratio (float, optional): percentile of clip. Defaults to 1.0.

    Returns:
        output: quant-dequant weight
    )r"   rl   r    r&   )
rf   r5   r6   rg   r
   rh   	org_shapeweightr9   zprA   rA   rB   
qdq_tensory  s   r   c                 C   sH   |dkr| S | j }|| }||d  }|dkr"t| d|fdfd} | S )a  Pad tensor rowi so that it can be is divisible by group_size.

    Args:
        weight (array): weight
        group_size (int): how many elements share one scale/zp
        k_blocks (int): the number of block

    Returns:
        weight: paded weight
    r   r   )r   r   Zconstant)r"   r    pad)r   r6   r7   org_w_shapeZpadded_rowsZpad_lenrA   rA   rB   
pad_tensor  s   r   ZCPUExecutionProviderk_quantc	                 C   s  t | } | jdurtj| jnd}	g }
g }tdd |  D }d}|  D ]B}|jdv r8|d7 }t|| |jdv rj| 	|j
d durj||ji dkrj| 	|j
d }tj||	d	 }t|jd
krlq'|j}|j|v r||j d }||j d }||j d }|j}|dkr|n|d }|d d | d }| |j
d }t|||}|dkp|dk}|r|dkrt|j||\}}}nt|j|||d||j
d d\}}}t||||||d|||dks|dkr|nd|d	\}}| | || |
| nUt|j|||d||j
d d}t||d df}t|}|d|d ddf |}tj j!|j
d d|d| t"||j|# dd}| $| |j|j
d< |dkrj| %| q'| &|
 | '| | (  | S )a  Quant the model with round to nearst method.

    Args:
        model (ModelProto or ONNXModel): onnx model
        weight_config (dict): quantization config
                For example,
                weight_config = {
                    'fc2':
                        {
                            'bits': 4,
                            'group_size': 32,
                            'scheme': 'sym',
                            'algorithm': 'RTN'
                        }
                }
        num_bits (int, optional): num_bits. Default is 4.
        group_size (int, optional): how many elements share one scale/zp. Default is 32.
        scheme (str, optional): sym or asym. Defaults to "asym".
        ratios (dict, optional): percentile of clip. Defaults to {}.
        accuracy_level (int): accuracy level. Support 0 (unset),1(fp32), 2(fp16), 3(bf16), or 4(int8).
        providers (list): providers to use

    Returns:
        model: fake quantized ONNXModel
    N c                 S   s   g | ]	}|j d v r|qS )ZMatMul)r>   rM   rA   rA   rB   rS     s    z rtn_quantize.<locals>.<listcomp>r   r   r   fp32)base_dirr   r   r6   rg   r   r   r   r   rH   r   rE   	r3   r4   r5   r6   r7   r8   r9   r:   r   rF   r   r   Tr   ))r   
model_pathospathdirnamelennodesr>   r   get_initializerr#   r   r   r   to_arraycopyr"   r
   get_initializer_share_numr   r   Trl   rC   r-   add_initializersr,   r   r    r&   Z	transposer(   r)   r*   r   r+   add_initializerremove_initializerZ	add_nodesremove_nodestopological_sort)modelweight_configr5   r6   rg   ratiosr   	providers	algorithmr   Z	new_nodesr   Z	total_numZcurr_idr3   weight_tensorr   r
   r   r7   init_share_numsatisfy_MatMulNBits_conditionr8   r9   r   q_matmul_noder<   r@   rA   rA   rB   rtn_quantize  s   $






"





r   c              	   C   sX   | j }|dkrt| d|fn| } tjtt| tjt| ddd |dd}|S )zGet the scale of weight.r   r   TrJ   r   rK   )r"   r    r&   meanrZ   rX   )r   r6   r   r9   rA   rA   rB   get_weight_scale  s   2r   c              
      s  ddl m} ddlm  t }tjdk r%|dr%ddlm	} |
|  | jr6tj| j| jd d	d	d
d | jsDtj| j ||dn
tj| jd ||d}dd | D }~g }	t|D ]\}
}|dkru|
d |j |kru |	|fS t|dkst|d trt|d t|ksJ dt| dt|d  t|d tr|	t fdd|d  D  q_t|d tjr|	tdd t||d gd
dD  q_|	t fddt||d d
dD  q_|	|fS )as  Prepare inputs for weight only quantization.

    Args:
        model (ModelProto or ONNXModel): onnx model
        n_samples (int, optional): calibration sample number. -1 means all samples.
        dataloader (object): dataloader for calibration.
        providers (list): providers to use

    Returns:
        inputs: prepared inputs.
        so: session options
    r   )	find_specr   to_numpy)      onnxruntime_extensions)get_library_path_augment.onnxTFZsave_as_external_dataZall_tensors_to_one_fileZconvert_attributer   c                 S      g | ]}|j qS rA   r   rM   rA   rA   rB   rS   G      z"prepare_inputs.<locals>.<listcomp>r   zInput number mismatch, require z	 but get c                       g | ]
\}}| |fqS rA   rA   )rN   r   Zinp_datar   rA   rB   rS   T  rT   c                 S   s   g | ]\}}||fqS rA   rA   rN   r   inprA   rA   rB   rS   V  s    strictc                    r   rA   rA   r   r   rA   rB   rS   X  rT   )importlib.utilr   utilr   ortZSessionOptionssysversion_infor   r   Zregister_custom_ops_libraryis_large_modelr(   
save_modelr   r   InferenceSessionSerializeToStringZ
get_inputs	enumerateZ
batch_sizer   
isinstancedictr,   itemsr    Zndarrayzip)r   	n_samples
dataloaderr   r   sor   sessionZinputs_namesr   rO   rf   rA   r   rB   prepare_inputs#  sF   
&*,r      {Gz?FTc
           #         sZ  d| d d dd fdd}
| j }|
| \}}t|dk}d|||f< d| |d	d	f< |r[tt|d	d	d
 }| |d	d	f } ||d	d	f d	d	|f }t| }t| }|tt| }t|d }|||f  |7  < tjtj	|j
}|}td|d |D ]}t|| |d }|| }t| ||d	d	f }t|}t|}t|}|||||f }t|D ]}||d	d	f }|||f }|d
kr|| | dkr|
| || || | d	d	f \}}|tt|d	d	tjf | | d|   } | ||d	d	f< ||  d |d  ||d	d	f< ||  | }!||d	d	d	f  ttj||d	|f ddtj|!dd8  < |!||d	d	f< q||||d	d	f< |d |||d	d	f< | |d	d	d	f  t||d	||f |8  < q|rt|}"||"d	d	f }t|| j }~ |S )a  Quant the weight with GPTQ method.

    Args:
        W (array): weight.
        H (array): Hessian matrix.
        num_bits (int, optional): num_bits. Default is 4.
        group_size (int, optional): how many elements share one scale/zp. Default is 32.
        scheme (str, optional): sym or asym. Defaults to "asym".
        blocksize (int, optional): blocksize to quantize weight.
        percdamp (float, optional): percent of the average Hessian diagonal to use for dampening.
        actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value.
        mse (bool, optional): whether get scale and zero point with mse error.
        perchannel (bool, optional): whether quantize weight per-channel.

    Returns:
        Q: fake quantized weight
    r   r   d   g?g333333@c                    s>  | j }stj|  dd} t| j d }ttj| dd|}ttj| dd|}dkrItt	||}|dk }t
|rI||  ||< |dk|dk@ }d||< d||< ||  }dkrpt|j d  d }nt| | }rt| j d gtd }tt  D ]`}d|   }	|	| }
|	| }||
  }dkrt|
 | n|}tt| | | d}|| 8 }tt	|}t|d}||k }t
|r|| ||< || ||< || ||< qs|d }t||}t||}dgdgt|d   }t||}t||}||fS )Nr   r   r   rI   r   r   inf)r"   r    expand_dimsr^   r!   ra   rW   rY   rX   rZ   anyr[   r`   rL   rq   rF   re   powerro   repeatr   r&   )r   r   tmpZxminZxmaxr9   zerobestrO   pZxmin1Zxmax1Zscale1Zzero1qerrr"   gridrQ   Z	maxshrinkmseZnorm
perchannelrg   rA   rB   find_params~  sX   

zgptq.<locals>.find_paramsr   Nr   r   )r"   r    diagZargsortZ
zeros_liker   r/   ZlinalgZcholeskyinvr   rq   rW   r   deepcopyre   r`   Znewaxisr^   matmulr   r&   )#WHr5   r6   rg   	blocksizepercdampactorderr   r   r   r"   r9   r   ZdeadpermZLossesQZdampr   ZHinvi1i2countZW1ZQ1ZErr1ZLosses1ZHinv1rO   wdr   Zerr1ZinvpermrA   r   rB   gptq\  sd   0




(6 D6
r
  c           *         s  t | } | jdurtj| jnd}t| |||\}}~t| jj	j
}| dd |D  g }|  D ]%}|jdv rY||ji dkrY||ji dddkrY||jd	  q4tt|}| | | jrvtj| j| jd
 dddd | jstj| j ||dn
tj| jd
 ||d}t|D ]\}}tt||d  g }g }| j| D ]O}|jdv r||ji dkr||ji dddkr| |jd durt !| | "|jjd | }t|j#dkrq|| || "|j qt|d	krqdd |D }d	|D ]<}|$|g|d	   j#d	 t%& d j#d f fdd|D }7 t%'d     fdd|D }qt(|||ddD ]\}}}|j|v rr||j d }||j d }||j d }|dkry|n|j#d	 }|j)}t*||||||||	|
|d
}| |jd }| +|jd } |dk}!|!r|j#}"|"d	 | d | }#t,|||#}t-|j.|||d\}}$}%t/||"|||#|0d|$0||dkr|%nd|d	\}&}'| 1|' | 2| | 3|& n)tj4j5|jd d|d | t6||j#|0|7 dd!}(| 8|( |(j|jd< | dkr$| 9| qRq| | | jj	j
:| | ;  | jrOd	d"l<m=}) |)| jtj>| jd	  | S )#a  Quant the model with GPTQ method.

    Args:
        model (ModelProto or ONNXModel): onnx model
        dataloader (object): dataloader for calibration.
        weight_config (dict): quantization config
                For example,
                weight_config = {
                    'fc2':
                        {
                            'bits': 4,
                            'group_size': 32,
                            'scheme': 'sym',
                            'algorithm': 'GPTQ'
                        }
                }
        num_bits (int, optional): num_bits. Default is 4.
        group_size (int, optional): how many elements share one scale/zp. Default is 32.
        scheme (str, optional): sym or asym. Defaults to "asym".
        n_samples (int, optional): calibration sample number.
        percdamp (float, optional): percent of the average Hessian diagonal to use for dampening.
        blocksize (int, optional): blocksize to quantize weight.
        actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value.
        mse (bool, optional): whether get scale and zero point with mse error.
        perchannel (bool, optional): whether quantize weight per-channel.
        accuracy_level (int): accuracy level. Support 0 (unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8).
        providers (list): providers to use

    Returns:
        model: fake quantized ONNXModel
    Nr   c                 S   r   rA   r   rM   rA   rA   rB   rS     r   z!gptq_quantize.<locals>.<listcomp>r   r   r   ZGPTQr   r   TFr   r   r   r   c                 S   s&   g | ]}t |jd  |jd  fqS r   )r    r!   r"   rM   rA   rA   rB   rS   O  s   & r   c                    s   g | ]
}|     qS rA   rA   rM   )nsamplesr   rA   rB   rS   U  rT   c                    s   g | ]}|t  j  qS rA   )r    r   r   rM   )r   rA   rB   rS   X  s    r   r   r6   rg   )r5   r6   rg   r   r  r  r   r   r   rH   r   rE   r   r   r   r   )load_external_data_for_model)?r   r   r   r   r   r   r   r   r   graphr1   Zremove_tensors_from_outputsr   r>   r   r   r,   r#   listsetZadd_tensors_to_outputsr   r(   r   r   r   r   r   r   r   Zinput_name_to_nodesr   r   r   Zget_noder"   runr    r&   rp   r   r
   r
  r   r   rl   r   rC   r-   r   remove_nodeadd_noder)   r*   r   r+   r   r   Z	MergeFromr   Zonnx.external_data_helperr  split)*r   r   r   r5   r6   rg   r   r  r   r  r   r   r   r   r   r   r   Z
org_outputZoutput_namesr3   r   r?   Z
input_nameZ	node_listrv   r   ZHsrf   r   r
   r8   r   r   r   r   r7   r9   r   r   r<   r@   r  rA   )r   r  r   rB   gptq_quantize  s   /












?r  r  )r   rD   rE   rF   rG   )r   rD   )r   rD   rE   r   r   FFT)__doc__r   loggingr   r   numpyr    r(   r   Zonnx.helperr   Zonnxruntimer   Z
onnx_modelr   r   r   	getLoggerr$   rC   rl   r   r   r   r   r   r   r   r
  r  rA   rA   rA   rB   <module>   sl   


s
6
I
[
w<
 