o
    0iN                  	   @  s  d Z ddlmZ ddlZddlZddlZddlmZ ddlm	Z	 ddl
ZddlZddlmZmZmZ eeZejdejd	Zi d
dddddddddddddddddddddddddddddddddddiZed Zed!Zed"d#fed$d%fed&d'fed(d)fed*d+fed,d-fed.dfgZed/Zed0Zed1ZdEdFd7d8ZdGd=d>Z G d?d@ d@Z!G dAdB dBZ"G dCdD dDZ#dS )HzCore TTS engine and text processing components.

This module contains the main Supertonic TTS engine, text processor,
and supporting utilities for audio synthesis.
    )annotationsN)Optional)	normalize   )AVAILABLE_LANGUAGES	MAX_SPEED	MIN_SPEEDuk   [😀-🙏🌀-🗿🚀-🛿🜀-🝿🞀-🟿🠀-🣿🤀-🧿🨀-🩯🩰-🫿☀-⛿✀-➿🇦-🇿]+)flagsu   –-u   ‑u   —   ¯ _u   “"u   ”u   ‘'u   ’   ´`[]|/#u   →u   ←zt[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]u   [♥☆♡©\\]z ,,z \..z !!z \??z ;;z ::z 'z(["\'\`])\1+z\s+u+   [.!?;:,'\"')\]}…。」』】〉》›»]$lengths
np.ndarraymax_lenOptional[int]returnc                 C  s@   |p|   }td|}|tj| ddk tj}|dd|S )z
    Convert lengths to binary mask.

    Args:
        lengths: (B,)
        max_len: int

    Returns:
        mask: (B, 1, max_len)
    r   r   )Zaxis)maxnpZarangeZexpand_dimsastypefloat32Zreshape)r   r   Zidsmask r(   I/home/kim/smarthome/.venv/lib/python3.10/site-packages/supertonic/core.pylength_to_maskU   s   r*   wav_lengthsbase_chunk_sizeintchunk_compress_factorc                 C  s$   || }| | d | }t |}|S )z)Generate mask for latent representations.r   r*   )r+   r,   r.   Zlatent_sizeZlatent_lengthslatent_maskr(   r(   r)   get_latent_maskf   s   r1   c                   @  s   e Zd ZdZd7ddZd8dd	Zd9ddZed9ddZd:ddZ	d:ddZ
d:ddZd:ddZd:ddZd:ddZd:ddZd:ddZd;d!d"Zd<d=d%d&Zd>d)d*Zd?d+d,Zd@d.d/ZdAd2d3Z	#d<dBd5d6Zd#S )CUnicodeProcessora  Processes text into unicode indices for the TTS model.

    This class handles text preprocessing, normalization, and conversion to
    numeric indices that the TTS model can understand.

    Args:
        unicode_indexer_path: Path to the unicode indexer JSON file
    unicode_indexer_pathstrc                 C  s   |  || _|  | _d S N)_load_indexerindexer_make_supported_characterssupported_chars)selfr3   r(   r(   r)   __init__z   s   zUnicodeProcessor.__init__r!   listc              
   C  s   zFt |d6}t|}t|tstdt|j t|dkr%tdt	
d| dt| d W d    W |S 1 s?w   Y  W |S  ty]   t	d|  td	| d
 tjyz } zt	d|  td| d|d }~ww )Nrz$Unicode indexer must be a list, got r   zUnicode indexer is emptyzLoaded unicode indexer from z (z	 entries)zUnicode indexer not found: z"Unicode indexer file not found at z1. Please ensure the model is properly downloaded.z Invalid unicode indexer format: z%Unicode indexer file is malformed at z. Please re-download the model.)openjsonload
isinstancer<   
ValueErrortype__name__lenloggerinfoFileNotFoundErrorerrorJSONDecodeError)r:   r3   fr7   er(   r(   r)   r6   ~   s@   



zUnicodeProcessor._load_indexerset[str]c                 C  s:   t  }t| jD ]\}}|dkrqt|}|| q|S )Nr"   )set	enumerater7   chradd)r:   	supportedZunicode_valueZchar_dict_idxcharr(   r(   r)   r8      s   z+UnicodeProcessor._make_supported_charactersc                 C  s   | j S r5   )r9   r:   r(   r(   r)   supported_character_set   s   z(UnicodeProcessor.supported_character_settextc                 C     t d|}|S )z"Remove emoji characters from text. )_EMOJI_PATTERNsubr:   rV   r(   r(   r)   _remove_emojis   s   zUnicodeProcessor._remove_emojisc                 C  s"   t  D ]
\}}|||}q|S )zBNormalize various punctuation marks and symbols to standard forms.)_SYMBOL_REPLACEMENTSitemsreplace)r:   rV   oldnewr(   r(   r)   _normalize_symbols   s   z#UnicodeProcessor._normalize_symbolsc                 C  s   t d|}td|}|S )z0Remove combining diacritics and special symbols.rX   )_DIACRITICS_PATTERNrZ   _SPECIAL_SYMBOLS_PATTERNr[   r(   r(   r)   $_remove_diacritics_and_special_chars   s   z5UnicodeProcessor._remove_diacritics_and_special_charsc                 C  s.   dddd}|  D ]
\}}|||}q
|S )z9Expand common abbreviations and expressions to full text.z at zfor example, z	that is, )@ze.g.,zi.e.,)r^   r_   )r:   rV   Zexpr_replacementskvr(   r(   r)   _expand_abbreviations   s   z&UnicodeProcessor._expand_abbreviationsc                 C  s   t D ]
\}}|||}q|S )z%Fix spacing around punctuation marks.)_PUNCTUATION_SPACING_PATTERNSrZ   )r:   rV   patternreplacementr(   r(   r)   _fix_punctuation_spacing   s   z)UnicodeProcessor._fix_punctuation_spacingc                 C  rW   )z!Remove duplicate quotation marks.z\1)_DUPLICATE_QUOTES_PATTERNrZ   r[   r(   r(   r)   _remove_duplicate_quotes   s   z)UnicodeProcessor._remove_duplicate_quotesc                 C  s   t d| }|S )zRemove extra whitespace.r   )_WHITESPACE_PATTERNrZ   stripr[   r(   r(   r)   _clean_whitespace   s   z"UnicodeProcessor._clean_whitespacec                 C  s   t |s	|d7 }|S )Nr   )_ENDING_PUNCTUATION_PATTERNsearchr[   r(   r(   r)   _add_period_if_needed   s   
z&UnicodeProcessor._add_period_if_neededlangc                 C  s:   |t vrtd| ddt  d| d| d| dS )zAdd language tokens to text for multilingual model.

        Args:
            text: Preprocessed text
            lang: Language code (en, ko, es, pt, fr)

        Returns:
            Text wrapped with language tokens: <lang>text</lang>
        zInvalid language: 'z'. Supported languages: z, <>z</)r   rB   join)r:   rV   rv   r(   r(   r)   _add_language_token   s   
z$UnicodeProcessor._add_language_tokenNOptional[str]c              
   C  s   zt d|}W n ty" } ztd| d W Y d}~nd}~ww | |}| |}| |}| |}| |}| 	|}| 
|}| |}|durU| ||}|S )a  Preprocess text by normalizing, cleaning, and standardizing format.

        This method applies a series of text transformations in sequence:
        1. Unicode normalization (NFKD)
        2. Emoji removal
        3. Symbol normalization
        4. Diacritics and special character removal
        5. Abbreviation expansion
        6. Punctuation spacing fixes
        7. Duplicate quote removal
        8. Whitespace cleaning
        9. Add period if needed
        10. Add language tokens (for multilingual models)

        Args:
            text: Raw input text
            lang: Language code for multilingual support (en, ko, es, pt, fr).
                If None, no language tokens are added (v1 compatibility).

        Returns:
            Preprocessed and normalized text
        ZNFKDzUnicode normalization failed: z#. Continuing without normalization.N)r   	ExceptionrF   warningr\   rb   re   ri   rm   ro   rr   ru   rz   )r:   rV   rv   rL   r(   r(   r)   _preprocess_text   s"   







z!UnicodeProcessor._preprocess_texttext_ids_lengthsr   c                 C  s   t |}|S r5   r/   )r:   r   	text_maskr(   r(   r)   _get_text_mask!  s   zUnicodeProcessor._get_text_maskc                 C  s   t jdd |D t jd}|S )Nc                 S     g | ]}t |qS r(   )ord).0rS   r(   r(   r)   
<listcomp>&      z<UnicodeProcessor._text_to_unicode_values.<locals>.<listcomp>Zdtype)r$   arrayZuint16)r:   rV   Zunicode_valuesr(   r(   r)   _text_to_unicode_values%  s   z(UnicodeProcessor._text_to_unicode_valuestuple[bool, list[str]]c                 C  s^   t |}t  }|D ]}t | |}|| j }t|dkr"|| q	t|dktt|fS )aX  Validate if text can be processed by the model.

        Args:
            text: Text to validate

        Returns:
            Tuple of (is_valid, unsupported_chars):
                - is_valid: True if text can be processed
                - unsupported_chars: List of unsupported characters (empty if valid)

        Example:
            ```python
            processor = UnicodeProcessor("unicode_indexer.json")
            is_valid, unsupported = processor.validate_text("Hello world")
            if not is_valid:
                print(f"Cannot process: {unsupported}")
            ```
        r   )rN   r~   rU   rE   updatesortedr<   )r:   rV   Zinput_charsZunsupported_charsZ
input_charZp_charsZus_charsr(   r(   r)   validate_text)  s   

zUnicodeProcessor.validate_text	text_list	list[str]c                 C  s   d |}| |S )zValidate a list of texts.rX   )ry   r   )r:   r   Ztext_catr(   r(   r)   validate_text_listE  s   

z#UnicodeProcessor.validate_text_listtuple[np.ndarray, np.ndarray]c           
        s    fdd|D }t jdd |D t jd}t jt|| ft jd}t|D ] \}}|}t jfdd|D t jd||dt|f< q)|}	||	fS )a  Process a list of texts into model inputs.

        Args:
            text_list: List of text strings to process
            lang: Language code for multilingual support (en, ko, es, pt, fr).
                If None, no language tokens are added (v1 compatibility).

        Returns:
            Tuple of (text_ids, text_mask):
                - text_ids: Array of shape (batch_size, max_length) with unicode indices
                - text_mask: Array of shape (batch_size, 1, max_length) with attention mask
        c                   s   g | ]} | qS r(   )r~   )r   trv   r:   r(   r)   r   Y  s    z-UnicodeProcessor.__call__.<locals>.<listcomp>c                 S  r   r(   )rE   )r   rV   r(   r(   r)   r   Z  r   r   c                   s   g | ]} j | qS r(   )r7   )r   valrT   r(   r)   r   _  s    N)	r$   r   int64ZzerosrE   r#   rO   r   r   )
r:   r   rv   Zpreprocessed_textsr   text_idsirV   Zunicode_valsr   r(   r   r)   __call__J  s   

zUnicodeProcessor.__call__)r3   r4   )r3   r4   r!   r<   )r!   rM   )rV   r4   r!   r4   )rV   r4   rv   r4   r!   r4   r5   )rV   r4   rv   r{   r!   r4   )r   r   r!   r   )rV   r4   r!   r   )rV   r4   r!   r   )r   r   r!   r   )r   r   rv   r{   r!   r   )rD   
__module____qualname____doc__r;   r6   r8   propertyrU   r\   rb   re   ri   rm   ro   rr   ru   rz   r~   r   r   r   r   r   r(   r(   r(   r)   r2   p   s.    
	

 









,


r2   c                   @  s   e Zd ZdZdddZdS )	Stylea  Voice style representation for TTS synthesis.

    This class encapsulates the style vectors used to control the voice
    characteristics during speech synthesis.

    Args:
        style_ttl_onnx (numpy.ndarray): Style vector for the text-to-latent model
        style_dp_onnx (numpy.ndarray): Style vector for the duration predictor

    Attributes:
        ttl (numpy.ndarray): Text-to-latent style vector
        dp (numpy.ndarray): Duration predictor style vector
    style_ttl_onnxr   style_dp_onnxc                 C  sP   t |tjstdt|j t |tjs tdt|j || _|| _d S )Nz#style_ttl must be numpy array, got z"style_dp must be numpy array, got )rA   r$   Zndarray	TypeErrorrC   rD   ttldp)r:   r   r   r(   r(   r)   r;   t  s   
zStyle.__init__N)r   r   r   r   )rD   r   r   r   r;   r(   r(   r(   r)   r   e  s    r   c                   @  s6   e Zd ZdZd"ddZd#ddZ			d$d%d d!ZdS )&
Supertonica  Core TTS engine for Supertonic speech synthesis.

    This class orchestrates the entire text-to-speech pipeline, from text
    encoding through duration prediction and waveform generation.

    Args:
        cfgs: Model configuration dictionary
        text_processor: Unicode text processor instance
        dp_ort: Duration predictor ONNX session
        text_enc_ort: Text encoder ONNX session
        vector_est_ort: Vector estimator ONNX session
        vocoder_ort: Vocoder ONNX session

    Attributes:
        sample_rate (int): Audio sample rate in Hz
        base_chunk_size (int): Base chunk size for latent representation
        chunk_compress_factor (int): Compression factor for chunks
        ldim (int): Latent dimension size
    cfgsdicttext_processorr2   dp_ortort.InferenceSessiontext_enc_ortvector_est_ortvocoder_ortc           
   
   C  s$  t |tstdt|j d|fd|fd|fd|ffD ]\}}t |tjs3t| dt|j q|| _|| _|| _	|| _
|| _|| _z|d d | _|d d	 | _|d
 d | _|d
 d | _W n ty }	 ztd|	  td|	 d|	d }	~	ww td| j d| j d d S )Nz-text_processor must be UnicodeProcessor, got r   r   r   r   z must be InferenceSession, got Zaesample_rater,   r   r.   
latent_dimzMissing required config key: z0Model configuration is incomplete. Missing key: z<. Please ensure you have downloaded the correct model files.z+Initialized Supertonic engine (sample_rate=zHz, latent_dim=))rA   r2   r   rC   rD   ortZInferenceSessionr   r   r   r   r   r   r   r,   r.   ldimKeyErrorrF   rI   rB   rG   )
r:   r   r   r   r   r   r   namesessionrL   r(   r(   r)   r;     sJ   



zSupertonic.__init__durationr   r!   r   c           
      C  s   t |}| | j }|| j tj}| j| j }|| d | tj}| j	| j }tj
|||tj}t|| j| j}	||	 }||	fS )Nr   )rE   r#   r   r%   r$   r   r,   r.   Zint32r   randomZrandnr&   r1   )
r:   r   bszZwav_len_maxr+   
chunk_sizeZ
latent_lenr   noisy_latentr0   r(   r(   r)   sample_noisy_latent  s   zSupertonic.sample_noisy_latent   ?Nr   r   styler   
total_stepr-   speedfloatrv   r{   c                 C  sF  t ||jjd krtdt | d|jjd  d|tk s#|tkr2tdt dt d|dd	t |}| ||\}}| jd
||j	|d^}	}
|	| }	| j
d
||j|d^}}
| |	\}}tj|g| tjd}t|D ]}tj|g| tjd}| jd
|||j||||d^}}
qt| jd
d|i^}}
||	fS )aa  Synthesize speech from text using the specified style.

        Args:
            text_list: List of text strings to synthesize
            style: Voice style object containing style vectors
            total_step: Number of diffusion steps (higher = better quality, slower)
            speed: Speech speed multiplier (0.7 = slower, 2.0 = faster)
            lang: Language code for multilingual support (en, ko, es, pt, fr).
                Required for supertonic-2 model. Supported languages:
                - "en": English
                - "ko": Korean
                - "es": Spanish
                - "pt": Portuguese
                - "fr": French

        Returns:
            Tuple of (waveform, duration):
                - waveform: Audio array of shape (batch_size, num_samples)
                - duration: Duration in seconds for each sample
        r   zNumber of texts (z&) must match number of style vectors (z%). Please provide one style per text.zSpeed must be between z and z, got z.6fz4. Use values closer to 1.05 for more natural speech.N)r   Zstyle_dpr   )r   	style_ttlr   r   )r   Ztext_embr   r   r0   current_stepr   Zlatent)rE   r   shaperB   r   r   r   r   runr   r   r   r$   r   r&   ranger   r   )r:   r   r   r   r   rv   r   r   r   Zdur_onnxr   Ztext_emb_onnxZxtr0   Ztotal_step_npstepr   Zwavr(   r(   r)   r     sJ   

zSupertonic.__call__)r   r   r   r2   r   r   r   r   r   r   r   r   )r   r   r!   r   )r   r   N)r   r   r   r   r   r-   r   r   rv   r{   r!   r   )rD   r   r   r   r;   r   r   r(   r(   r(   r)   r     s    

0r   r5   )r   r   r   r    r!   r   )r+   r   r,   r-   r.   r-   r!   r   )$r   
__future__r   r?   loggingretypingr   unicodedatar   numpyr$   Zonnxruntimer   configr   r   r   	getLoggerrD   rF   compileUNICODErY   r]   rc   rd   rj   rn   rp   rs   r*   r1   r2   r   r   r(   r(   r(   r)   <module>   s    
	







 v