o
    Wli                     @   s   d Z ddlZddlmZ ddlmZmZ ddlZddl	m
Z
 eejZdZdZdZed	Zh d
Zeeeg dZdddddddZedZG dd deZG dd dZdS )zPython implementation of libtashkeel.

See: https://github.com/mush42/libtashkeel

Ported with the help of ChatGPT 2025-05-01.
    N)Path)OptionalUnion)InferenceSessioni.  _#u   0123456789٠١٢٣٤٥٦٧٨٩>   u   ّu   ِu   ْu   َu   ٍu   ُu   ٌ)R  iQ  iN  iO  iP  iK  iL  iM  u   َّu   ًّu   ُّu   ٌّu   ِّu   ٍّ)u   َّu   ًّu   ُّu   ٌّu   ِّu   ٍّr   c                   @   s   e Zd ZdZdS )TashkeelErrorzError for tashkeel.N)__name__
__module____qualname____doc__ r   r   Q/home/kim/smarthome/.venv/lib/python3.10/site-packages/piper/tashkeel/__init__.pyr	      s    r	   c                   @   s|  e Zd ZdZefdeeef ddfddZd*dede	e
 defd	d
Zd*dedefddZdee dee dedeee ee
 f fddZdedee dee defddZdedee dee dee
 de
defddZdefddZ	d+dededeeee f fdd Zdedeeee f fd!d"Zdedee fd#d$Zdee dee fd%d&Zd'ee dee fd(d)ZdS ),TashkeelDiacritizerz0Add diacritics for Arabic text with libtashkeel.	model_dirreturnNc                    s   t |}t|d | _t|d ddd}t|| _W d   n1 s%w   Y  t|d ddd}t| dd	   D | _W d   n1 sMw   Y   fd
dt	fD | _
t|d ddd}t|| _W d   dS 1 sxw   Y  dS )zInitialize diacritizer.z
model.onnxzinput_id_map.jsonrzutf-8)encodingNztarget_id_map.jsonc                 S   s   i | ]\}}||qS r   r   ).0cir   r   r   
<dictcomp>0   s    
z0TashkeelDiacritizer.__init__.<locals>.<dictcomp>c                    s   h | ]} | qS r   r   r   r   Ztarget_id_mapr   r   	<setcomp>4   s    z/TashkeelDiacritizer.__init__.<locals>.<setcomp>zhint_id_map.json)r   r   sessionopenjsonloadinput_id_mapitemsid_target_mapPADtarget_id_meta_charshint_id_map)selfr   Zinput_id_map_fileZtarget_id_map_fileZhint_id_map_filer   r   r   __init__!   s0   




"zTashkeelDiacritizer.__init__texttaskeen_thresholdc                 C   s
   |  |S )!Add diacritics using libtashkeel.)
diacritize)r&   r(   r)   r   r   r   __call__;   s   
zTashkeelDiacritizer.__call__c                 C   s   |  }t|tkrtdt | |\}}| j|dd\}}| |}| |}t|}|dkr5|S | |||\}	}
| 	|	}|du rN| 
|||S | ||||
|S )r*   zText length cannot exceed T)normalize_diacriticsr   N)striplen
CHAR_LIMITr	   _to_valid_chars_extract_chars_and_diacritics_input_to_ids_hint_to_ids_infer_target_to_diacritics_annotate_text_with_diacritics&_annotate_text_with_diacritics_taskeen)r&   r(   r)   
input_textremoved_chars
diacritics	input_idsdiac_ids
seq_length
target_idslogitsr   r   r   r+   ?   s&   




zTashkeelDiacritizer.diacritizer<   r=   r>   c                 C   s   t j|t jdd|}t j|t jdd|}t j|gt jdd}|||d}| jd|}|d  t j	 }	|d  t j
	 }
|	|
fS )zInfer target ids and logits.)Zdtype   )Zchar_inputsZdiac_inputsZinput_lengthsNr   )nparrayZint64Zreshaper   runflattenZastypeZuint8tolistZfloat32)r&   r<   r=   r>   Zinput_ids_arrZdiac_ids_arrZinput_len_arrZinputsZoutputsr?   r@   r   r   r   r5   \   s   zTashkeelDiacritizer._inferr9   r;   r:   c                 C   sZ   g }t |}|D ]}| |rq||v r|| q|| |t|d qd|S N )iter_is_diacritic_charappendnextjoin)r&   r9   r;   r:   output	diac_iterr   r   r   r   r7   s   s   


z2TashkeelDiacritizer._annotate_text_with_diacriticsr@   	thresholdc                 C   sx   g }t ||}|D ]-}| |rq	||v r|| q	|| t|d\}	}
|
|kr1|t q	||	 q	d|S )N)rH   g        rH   )ziprJ   rK   rL   SUKOONrM   )r&   r9   r;   r:   r@   rP   rN   rO   r   ZdiacZlogitr   r   r   r8      s   



z:TashkeelDiacritizer._annotate_text_with_diacritics_taskeenc                 C   s   |t v S N)ARABIC_DIACRITICS)r&   r   r   r   r   rJ      s   z&TashkeelDiacritizer._is_diacritic_charTr-   c           	      C   s   | dt}g }g }d}t|dg D ]}| |r!||7 }q|| || d}q|r4|  |r;|d |rSt|D ]\}}|| jvrRt	
|d||< qAd||fS )NrH    r   )lstriprM   rT   listrJ   rK   pop	enumerater%   NORMALIZED_DIAC_MAPget)	r&   r(   r-   Zclean_charsr;   Zpending_diacr   r   dr   r   r   r2      s(   





z1TashkeelDiacritizer._extract_chars_and_diacriticsc                 C   s^   g }t  }|D ] }|| jv s|tv r|| q|tv r"|t q|| qd||fS rG   )setr    rT   rK   NUMERALSNUMERAL_SYMBOLaddrM   )r&   r(   validinvalidr   r   r   r   r1      s   z#TashkeelDiacritizer._to_valid_charsc                        fdd|D S )Nc                       g | ]} j | qS r   )r    r   r&   r   r   
<listcomp>       z5TashkeelDiacritizer._input_to_ids.<locals>.<listcomp>r   )r&   r(   r   re   r   r3         z!TashkeelDiacritizer._input_to_idsc                    rc   )Nc                    rd   r   )r%   )r   r\   re   r   r   rf      rg   z4TashkeelDiacritizer._hint_to_ids.<locals>.<listcomp>r   )r&   r;   r   re   r   r4      rh   z TashkeelDiacritizer._hint_to_idsr?   c                    rc   )Nc                    s    g | ]}| j vr j| qS r   )r$   r"   )r   r   re   r   r   rf      s
    
z=TashkeelDiacritizer._target_to_diacritics.<locals>.<listcomp>r   )r&   r?   r   re   r   r6      s   
z)TashkeelDiacritizer._target_to_diacriticsrS   )T)r
   r   r   r   TASHKEEL_DIRr   strr   r'   r   floatr,   r+   rW   inttupler5   r]   r7   r8   boolrJ   r2   r1   r3   r4   r6   r   r   r   r   r      sb    



r   )r   r   pathlibr   typingr   r   numpyrB   Zonnxruntimer   __file__parentri   r0   r#   r_   r]   r^   ZHARAKAT_CHARSmapchrrT   rZ   rR   	Exceptionr	   r   r   r   r   r   <module>   s"    
