o
    WliMB                     @   sF  U d Z ddlZddlZddlZddlZddlZddlZddlZddlm	Z	 ddl
mZ ddlmZmZmZmZmZmZ ddlZddlZddlmZmZmZ ddlmZmZmZ dd	lmZ dd
l m!Z!m"Z" ddl#m$Z$ da%ee" e&d< e' Z(e Z)dZ*e+dZ,e-e.Z/e	G dd dZ0e	G dd dZ1e	G dd dZ2dS )z&Phonemization and synthesis for Piper.    N)	dataclass)Path)AnyIterableOptionalSequenceTupleUnion   )PhonemeTypePiperConfigSynthesisConfig)BOSEOSPADphonemes_to_ids)ESPEAK_DATA_DIREspeakPhonemizer)TashkeelDiacritizer_ESPEAK_PHONEMIZERg    @z(\[\[.*?\]\])c                   @   s*   e Zd ZU eed< ee ed< eed< dS )PhonemeAlignmentphonemephoneme_idsnum_samplesN)__name__
__module____qualname__str__annotations__r   int r!   r!   E/home/kim/smarthome/.venv/lib/python3.10/site-packages/piper/voice.pyr   !   s   
 r   c                   @   s   e Zd ZU dZeed< 	 eed< 	 eed< 	 ejed< 	 ee	 ed< 	 ee ed< 	 dZ
eej ed	< 	 dZeee  ed
< 	 dZeej ed< dZee ed< dZeee  ed< edejfddZedefddZdS )
AudioChunkzChunk of raw audio.sample_ratesample_widthsample_channelsaudio_float_arrayphonemesr   Nphoneme_id_samplesphoneme_alignments_audio_int16_array_audio_int16_bytes_phoneme_alignmentsreturnc                 C   s0   | j du rt| jt t ttj| _ | j S )zg
        Get audio as an int16 numpy array.

        :return: Audio data as int16 numpy array.
        N)r+   npclipr'   _MAX_WAV_VALUEastypeZint16selfr!   r!   r"   audio_int16_arrayM   s   
zAudioChunk.audio_int16_arrayc                 C   s
   | j  S )zl
        Get audio as 16-bit PCM bytes.

        :return: Audio data as signed 16-bit sample bytes.
        )r5   tobytesr3   r!   r!   r"   audio_int16_bytes[   s   
zAudioChunk.audio_int16_bytes)r   r   r   __doc__r    r   r/   ndarraylistr   r)   r   r*   r   r+   r,   bytesr-   propertyr5   r7   r!   r!   r!   r"   r#   (   s2   
 
r#   c                   @   s  e Zd ZU dZejed< 	 eed< 	 eZ	e
ed< 	 e
 Ze
ed< 	 dZeed< dZee ed	< d
Zee ed< eddedfdeee
f deeee
f  dedeee
f deeee
f  dd fddZdedeee  fddZdee dee fddZ		d$dedee dedee fddZ			d%dede j!dee dededeee"  fdd Z#		d$d!ee dee dedee$j%e&e$j%ee$j% f f fd"d#Z'dS )&
PiperVoicezA voice for Piper.sessionconfigespeak_data_dirdownload_dirTuse_tashkeelNtashkeel_diacritizierg?taskeen_thresholdF
model_pathconfig_pathuse_cudar.   c                 C   s   |du r|  d}t d| t|ddd}t|}W d   n1 s&w   Y  |r:ddd	ifg}t d
 ndg}|du rEt }tt	|t
jt| t
 |dt|t|dS )a  
        Load an ONNX model and config.

        :param model_path: Path to ONNX voice model.
        :param config_path: Path to JSON voice config (defaults to model_path + ".json").
        :param use_cuda: True if CUDA (GPU) should be used instead of CPU.
        :param espeak_data_dir: Path to espeak-ng data dir (defaults to internal data).
        :param download_dir: Path to download resources (defaults to current directory).
        :return: Voice object.
        Nz.jsonzGuessing voice config path: %srzutf-8)encodingZCUDAExecutionProviderZcudnn_conv_algo_searchZ	HEURISTICz
Using CUDAZCPUExecutionProvider)Zsess_options	providers)r?   r>   r@   rA   )_LOGGERdebugopenjsonloadr   cwdr=   r   	from_dictonnxruntimeInferenceSessionr   ZSessionOptions)rE   rF   rG   r@   rA   config_fileZconfig_dictrJ   r!   r!   r"   rO   z   s2   
zPiperVoice.loadtextc           
   	   C   s  | j jtjkrttd|gS | j jtjkr9ddlm	} t
| dd}|du r4|| jd }t| d| ||S | j jtjkrItd| j j g }t|}d}t|D ]\}}|d	rd
}|sh|g  |dkr|||d  dr||d d |d |dd   |t|d k r||d  dr|d d qV| j jdkr| jr| jdu rt | _| j|| jd}t1 tdu rt | j!at| j j|}	|r|	r|d |	d  |	dd }	||	 W d   n1 sw   Y  d}qV|r|d s|"  |S )z
        Text to phonemes grouped by sentence.

        :param text: Text to phonemize.
        :return: List of phonemes for each sentence.
        ZNFDr
   )ChinesePhonemizerZ_chinese_phonemizerNZg2pWzUnexpected phoneme type: Fz[[Tr       ar)rD   )#r?   phoneme_typer   ZTEXTr:   unicodedata	normalizePINYINphonemize_chineserV   getattrrA   setattr	phonemizeZESPEAK
ValueError_PHONEME_BLOCK_PATTERNsplit	enumerate
startswithappendendswithextendstriplenZespeak_voicerB   rC   r   rD   _ESPEAK_PHONEMIZER_LOCKr   r   r@   pop)
r4   rU   rV   Z
phonemizerr(   Z
text_partsZprev_raw_phonemesiZ	text_partZtext_part_phonemesr!   r!   r"   rc      s\   	



"

zPiperVoice.phonemizer(   c                 C   s6   | j jtjkrddlm} ||| j jS t|| j jS )zt
        Phonemes to ids.

        :param phonemes: List of phonemes.
        :return: List of phoneme ids.
        r
   r   )r?   r\   r   r_   r`   r   phoneme_id_map)r4   r(   Zchinese_phonemes_to_idsr!   r!   r"   r      s   zPiperVoice.phonemes_to_ids
syn_configinclude_alignmentsc                 c   s   |du rt }| |}td|| |D ]}|sq| |}d}| j|||d}t|tr3|\}	}n|}	|jrNt	
t	|	}
|
dk rJt	|	}	n|	|
 }	|jdkrX|	|j }	t	|	ddt	j}	d}|durt|t|kr| jjtg }d}g }d}ttg|tgD ]L}| jj|g }|tkrtt||}n|}|}|D ]}|t|krd	} n||| krd	} n|d
7 }q|r n|t||t||| d q|rd}td t| jjdd
|	||||dV  qdS )a  
        Synthesize one audio chunk per sentence from from text.

        :param text: Text to synthesize.
        :param syn_config: Synthesis configuration.
        :param include_alignments: If True and the model supports it, include phoneme/audio alignments.
        Nztext=%s, phonemes=%s)rs   g:0yE>g      ?g      r   FTr
   )r   r   r   zPhoneme alignment failedrY   )r$   r%   r&   r'   r(   r   r)   r*   ) _DEFAULT_SYNTHESIS_CONFIGrc   rK   rL   r   phoneme_ids_to_audio
isinstancetupleZnormalize_audior/   maxabsZ
zeros_likevolumer0   r2   float32rm   r?   rq   getr   	itertoolschainr   r   r:   ri   r   sumr#   r$   )r4   rU   rr   rs   Zsentence_phonemesr(   r   r)   Zaudio_resultaudioZmax_valr*   Zpad_idsZphoneme_id_idxZalignment_failedr   Zexpected_idsZids_to_checkZstart_phoneme_id_idxZ
phoneme_idr!   r!   r"   
synthesize  s   










zPiperVoice.synthesizewav_fileset_wav_formatc           	      C   s|   g }d}| j |||dD ]+}|r&|r$||j ||j ||j d}||j |r7|j	r7|
|j	 q|r<|S dS )a  
        Synthesize and write WAV audio from text.

        :param text: Text to synthesize.
        :param wav_file: WAV file writer.
        :param syn_config: Synthesis configuration.
        :param set_wav_format: True if the WAV format should be set automatically.
        :param include_alignments: If True and the model supports it, return phoneme/audio alignments.

        :return: Phoneme/audio alignments if include_alignments is True, otherwise None.
        T)rr   rs   FN)r   Zsetframerater$   Zsetsampwidthr%   Zsetnchannelsr&   Zwriteframesr7   r*   rk   )	r4   rU   r   rr   r   rs   Z
alignmentsZfirst_chunkZaudio_chunkr!   r!   r"   synthesize_wav|  s$   

zPiperVoice.synthesize_wavr   c                 C   sJ  |du rt }|j}|j}|j}|j}|du r| jj}|du r"| jj}|du r*| jj}ttj|tj	dd}tj|j
d gtj	d}	tj|||gtjd}
||	|
d}| jjdkr[d}| jjdkrg|du rgd}|durxtj|gtj	d}||d< | jd|}|d  }|s|S t|dkr|dfS |d  | jj tj	}||fS )a4  
        Synthesize raw audio from phoneme ids.

        :param phoneme_ids: List of phoneme ids.
        :param syn_config: Synthesis configuration.
        :param include_alignments: Return samples per phoneme id if True.
        :return: Audio float numpy array from voice model (unnormalized, in range [-1, 1]).

        If include_alignments is True and the voice model supports it, the return
        value will be a tuple instead with (audio, phoneme_id_samples) where
        phoneme_id_samples contains the number of audio samples per phoneme id.
        N)Zdtyper   r
   )inputZinput_lengthsscalessid)rt   
speaker_idlength_scalenoise_scalenoise_w_scaler?   r/   Zexpand_dimsarrayZint64shaper{   Znum_speakersr>   runZsqueezerm   Z
hop_lengthr2   )r4   r   rr   rs   r   r   r   r   Zphoneme_ids_arrayZphoneme_ids_lengthsr   argsr   resultr   r)   r!   r!   r"   ru     sT   zPiperVoice.phoneme_ids_to_audio)NF)NTF)(r   r   r   r8   rR   rS   r   r   r   r@   r   rP   rA   rB   boolrC   r   r   rD   floatstaticmethodr	   r   rO   r:   rc   r    r   r   r   r#   r   waveZ
Wave_writer   r   r/   r9   r   ru   r!   r!   r!   r"   r=   e   s   
 


2Q
s

.r=   )3r8   r}   rN   loggingre	threadingr]   r   dataclassesr   pathlibr   typingr   r   r   r   r   r	   numpyr/   rR   r?   r   r   r   constr   r   r   r   r   Zphonemize_espeakr   r   Ztashkeelr   r   r   Lockrn   rt   r1   compilere   	getLoggerr   rK   r   r#   r=   r!   r!   r!   r"   <module>   s<     

<