o
    Wli #                  	   @   s  U d Z ddlZddlZddlZddlmZmZ ddlmZ ddl	m
Z
mZ ddlmZ ddlmZ ddlmZ d	d
lmZmZmZ d	dlmZ eeZg dZi eee eee eee ddgddgddgddgddgddgddgddgddgdd gd!d"gd#d$gd%d&gd'd(gi d)d*gd+d,gd-d.gd/d0gd1d2gd3d4gd5d6gd7d8gd9d:gd;d<gd=d>gd?d@gdAdBgdCdDgdEdFgdGdHgdIdJgi dKdLgdMdNgdOdPgdQdRgdSdTgdUdVgdWdXgdYdZgd[d\gd]d^gd_d`gdadbgdcddgdedfgdgdhgdidjgdkdlgi dmdngdodpgdqdrgdsdtgdudvgdwdxgdydzgd{d|gd}d~gddgddgddgddgddgddgddgddgi ddgddgddgddgddgddgddgddgddgddgddgddgddgddgddgddgddgZeeee  f e!d< h dZ"dZ#e$dZ%e$dZ&G dd dZ'	ddee de
eeee  f  dee  fddZ(dedefddZ)defddZ*deeef ddfddZ+dS )zPinyin-based phonemization for Chinese using g2pW.

Partially written by ChatGPT (December 2025).

This code is Apache 2.0 licensed.
    N)MappingSequence)Path)OptionalUnion)urlopen)G2PWConverter)
RbnfEngine   )BOSEOSPAD)DEFAULT_PHONEME_ID_MAP)zhchshbpmfdtnlgkhjqxrzcsyw   Ø   r      r      r      r      r      r   	   r   
   r      r      r      r      r      r      r      r      r      r      r       r!      r"      r#      r$      r%      a   o   e   Zai   ei   Zao    Zou!   an"   en#   ang$   Zeng%   Zong&   i'   ia(   Zie)   Ziao*   iu+   Zian,   in-   Ziang.   Zing/   Ziong0   u1   Zua2   Zuo3   Zuai4   Zui5   Zuan6   un7   Zuang8   Zueng9   v:   ve;   Zvan<   Zvn=   Zer>   Zue?   1@   2A   3B   4C   5D      。E   .   ？F   ?   ！G   !   —H      …   、   ，,   ：:   ；; PHONEME_TO_ID>   r   rs   ru   r}   r   r   r   r   r   r   r   ry   r   r{   r   r   r   r   r   rw   r   zthttps://huggingface.co/datasets/rhasspy/piper-checkpoints/resolve/main/zh/zh_CN/_resources/g2pw.tar.gz?download=trueu.   (?P<sign>[-−])?(?P<num>\d+)\s*(?:°\s*C|℃)u^   (?P<num>-?\d+(?:\.\d+)?|[零〇一二三四五六七八九十百千万亿两点]+)\s*(?:%|％)c                   @   sh   e Zd ZdZdeeef ddfddZdedeee  fdd	Z	dedefd
dZ
dedefddZdS )ChinesePhonemizerz"Phonemize Chinese text using g2pW.	model_dirreturnNc                 C   s,   t | tt|ddd| _td| _dS )zInitialize phonemizer.ZpinyinT)r   styleZenable_non_tradional_chineser   N)download_modelr   strg2pr	   Zfor_languagenumber_engine)selfr    r   Q/home/kim/smarthome/.venv/lib/python3.10/site-packages/piper/phonemize_chinese.py__init__   s
   
zChinesePhonemizer.__init__textc                 C   s   ddl m} tdd|}g }||gD ]W}| |}| |d }g }t||D ]<\}}|du r;|tv r:|| q)t	|}t
|\}	}
}|
sN|| q)|	sRd}	|	|
|fD ]}|s_J ||| qWq)|| q|S )z%Turn text into phonemes per sentence.r   )stream_to_sentencesu	   [“”"] Nr&   )Zsentence_streamr   resub_numbers_to_wordsr   zipr   append_normalize_g2pw_syllable_split_initial_final_tone)r   r   r   Zall_phonemesZsentenceZsyllsZsentence_phonemessylZsyl_charZini_pZfin_ptoneZsymr   r   r   	phonemize   s2   


zChinesePhonemizer.phonemizec                    s^   dt jdtf fdd}t||}dt jdtf fdd}t||}t d fdd	|S )
Nr   r   c                    s8   |  d}|  d} |}|rd| dS | dS )Nsignnumu   零下u   度)group
_zh_number)r   r   num_str	num_wordsr   r   r   replace_temp  s   



z9ChinesePhonemizer._numbers_to_words.<locals>.replace_tempc                    s0   |  d}td|r |}n|}d| S )Nr   -?\d+(?:\.\d+)?u	   百分之)r   r   	fullmatchr   )r   r   r   r   r   r   replace_percent  s
   

z<ChinesePhonemizer._numbers_to_words.<locals>.replace_percentr   c                    s     | dS )Nr   )r   r   )r   r   r   r   <lambda>   s    z5ChinesePhonemizer._numbers_to_words.<locals>.<lambda>)r   Matchr   TEMP_PATTERNr   PERCENT_PATTERN)r   r   r   r   r   r   r   r      s   

z#ChinesePhonemizer._numbers_to_wordsc                 C   s   | j |jS N)r   Zformat_numberr   )r   r   r   r   r   r   $  s   zChinesePhonemizer._zh_number)__name__
__module____qualname____doc__r   r   r   r   listr   r   r   r   r   r   r   r      s    &%r   phonemesid_mapr   c                 C   sp   |st }g }||t  | D ]}||vrtd| q|||  |tv r.||t  q||t  |S )zGet phoneme ids for phonemes.

    Padding is done after a group of phonemes representing one pinyin group
    instead of between each phoneme.
    zMissing phoneme from id map: %s)r   extendr   _LOGGERwarningGROUP_END_PHONEMESr   r   )r   r   ZidsZphonemer   r   r   phonemes_to_ids(  s   	r   r   c                 C   sF   t d| }|s
| S |d|d}}|dddd}|| S )u   
    - Keep only syllables like [a-züv:]+[1-5]
    - Convert g2pW's 'u:' and 'ü' -> 'v' (ü-family)
      nu:3   -> nv3        (n + v)
      lu:e4  -> lve4       (l + ve)
      ju:an3 -> jvan3      (j + van)
      ju:n3  -> jvn3       (j + vn)
    u   ^([a-züv:]+?)([1-5])$r
      zu:rk      ü)r   matchr   replace)r   r   baser   r   r   r   r   G  s   	r   c                 C   sl   t d| }|s
dS |d|d}}d}tD ]}||r$|} nq|r/|t|d n|}|||fS )zE
    'hang2' -> ('h', 'ang', '2')
    'ai3'   -> ('', 'ai', '3')
    u   ^([a-zvü]+?)([1-5])$)r   r   Nr
   r   r   N)r   r   r   PINYIN_INITIALS
startswithlen)r   r   r   r   inicandZfinr   r   r   r   \  s   

r   r   c              	   C   s   t | } | d }| rtd| dS tdt|  | jddd tt.}tj	|dd}|j
| d	 W d   n1 s?w   Y  W d   dS W d   dS 1 sWw   Y  dS )
z Ensure g2pW model is downloaded.z	g2pw.onnxzFound g2pW model at %sNz(Downloading g2pW model from '%s' to '%s'T)parentsexist_okzr|gz)fileobjmode)path)r   existsr   debuginfoG2PW_URLmkdirr   tarfileopen
extractall)r   Z
model_pathresponsetarr   r   r   r   q  s   
"r   r   ),r   loggingr   r   collections.abcr   r   pathlibr   typingr   r   urllib.requestr   Zg2pwr   Zunicode_rbnfr	   constr   r   r   Zphoneme_idsr   	getLoggerr   r   r   r   dictr   r   int__annotations__r   r   compiler   r   r   r   r   r   r   r   r   r   r   <module>   s   
	
"#$%&'()*+,-/012345678:;<=>?@ABDEFGIKOPQRSWXYZ[\`abcdefghjm_
