o
    iv                     @  s  U d dl mZ d dlZd dlmZ d dlmZ ejdkr#d dlm	Z	 nzd dl
m	Z	 W n ey8   dd Z	Y nw d	d
lmZmZmZmZmZmZmZmZmZmZmZmZmZ d	dlmZmZmZmZmZm Z m!Z! eeB eB eB eB Z"de#d< e	G dd dZ$G dd dZ%e	G dd de%Z&e	G dd de%Z'e	G dd de%Z(e	G dd de%Z)e	G dd de%Z*e	G dd de%Z+e	G dd de%Z,e	G d d! d!e%Z-e	G d"d# d#e%Z.ed$d%d7d+d,Z/ed-d%	/d8d9d5d6Z0dS ):    )annotationsN)	lru_cache)	getLogger)      )finalc                 C  s   | S )N )clsr   r   O/home/kim/smarthome/.venv/lib/python3.10/site-packages/charset_normalizer/md.pyr      s   r      )COMMON_CJK_CHARACTERSCOMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD_ACCENTUATED_ARABIC_ARABIC_ISOLATED_FORM_CJK_HANGUL	_HIRAGANA	_KATAKANA_LATIN_THAI)_character_flagsis_emoticonis_punctuationis_separator	is_symbolremove_accentunicode_rangeint_GLYPH_MASKc                   @  s(   e Zd ZdZdZdddZdd	d
ZdS )CharInfou{  Pre-computed character properties shared across all detectors.

    Instantiated once and reused via :meth:`update` on every character
    in the hot loop so that redundant calls to str methods
    (``isalpha``, ``isupper``, …) and cached utility functions
    (``_character_flags``, ``is_punctuation``, …) are avoided when
    several plugins need the same information.
    	character	printablealphaupperlowerspacedigitis_asciicase_variableflagsaccentuatedlatinis_cjk	is_arabicis_glyphpunctsymreturnNonec                 C  sj   d| _ d| _d| _d| _d| _d| _d| _d| _d| _d| _	d| _
d| _d| _d| _d| _d| _d| _d S )N Fr   r#   selfr   r   r
   __init__N   s"   
zCharInfo.__init__r$   strc                 C  s  || _ t|}|dk rd| _d| _d| _d| _d| _d|  kr%dkrJn n#d| _d| _d| _	d| _
d| _d| _d| _t| _d| _d| _d| _dS d|  krTdkryn n#d| _d| _d| _	d| _
d| _d| _d| _t| _d| _d| _d| _dS d|  krd	krn n#d| _d| _d| _	d| _
d| _d| _d| _d
| _d| _d| _d| _dS |dksd|  krdkrn n%d| _d| _d| _	d| _
d| _|dk| _d| _d
| _d| _d| _d| _dS | | _d| _d| _d| _	d| _
d| _d| _d
| _d| _| jrt|nd| _| jrt|nd| _dS d| _| | _| | _| | _| | _	| | _
| | _| j	| jk| _| jrCt|}nd
}|| _t|t@ | _t|t@ | _t|t@ | _t|t@ | _t|t @ | _| jrst|nd| _| jr~t|nd| _dS )zBUpdate all properties for *character* (called once per character).   TFA   Z   a   z   0   9   r       	      N)!r$   ordr+   r.   r0   r1   r2   r&   r'   r(   r)   r*   r%   r,   r   r-   r/   r3   r4   isprintabler   r   isalphaisupperislowerisspaceisdigitr   boolr   r   r   r!   )r9   r$   or-   r   r   r
   updatea   s   



 









zCharInfo.updateNr5   r6   )r$   r;   r5   r6   )__name__
__module____qualname____doc__	__slots__r:   rO   r   r   r   r
   r"   /   s
    	
r"   c                   @  s6   e Zd ZdZdZdd	d
ZdddZedddZdS )MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    r   r$   r;   infor"   r5   r6   c                 C     t )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        NotImplementedErrorr9   r$   rW   r   r   r
   	feed_info   s   zMessDetectorPlugin.feed_infoc                 C  rX   )zB
        Permit to reset the plugin to the initial state.
        rY   r8   r   r   r
   reset   s   zMessDetectorPlugin.resetfloatc                 C  rX   )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        rY   r8   r   r   r
   ratio   s   zMessDetectorPlugin.ratioNr$   r;   rW   r"   r5   r6   rP   r5   r^   )	rQ   rR   rS   rT   rU   r\   r]   propertyr_   r   r   r   r
   rV      s    

rV   c                   @  <   e Zd ZdZdddZdd
dZdddZedddZdS ) TooManySymbolOrPunctuationPlugin_punctuation_count_symbol_count_character_count_last_printable_charZ_frenzy_symbol_in_wordr5   r6   c                 C  s"   d| _ d| _d| _d | _d| _d S Nr   Fre   r8   r   r   r
   r:      s
   
z)TooManySymbolOrPunctuationPlugin.__init__r$   r;   rW   r"   c                 C  sb   |  j d7  _ || jkr,|tvr,|jr|  jd7  _n|js,|jr,t|s,|  jd7  _|| _dS )1Optimized feed using pre-computed character info.r      N)	rh   ri   r   r3   rf   r*   r4   r   rg   r[   r   r   r
   r\      s   

z*TooManySymbolOrPunctuationPlugin.feed_infoc                 C  s   d| _ d| _d| _d S Nr   )rf   rh   rg   r8   r   r   r
   r]     s   
z&TooManySymbolOrPunctuationPlugin.resetr^   c                 C  s0   | j dkrdS | j| j | j  }|dkr|S dS )Nr           333333?)rh   rf   rg   )r9   Zratio_of_punctuationr   r   r
   r_     s   

z&TooManySymbolOrPunctuationPlugin.ratioNrP   r`   ra   	rQ   rR   rS   rU   r:   r\   r]   rb   r_   r   r   r   r
   rd      s    


rd   c                   @  rc   )TooManyAccentuatedPluginrh   _accentuated_countr5   r6   c                 C     d| _ d| _d S rm   rr   r8   r   r   r
   r:   "     
z!TooManyAccentuatedPlugin.__init__r$   r;   rW   r"   c                 C  s*   |  j d7  _ |jr|  jd7  _dS dS rk   r   N)rh   r.   rs   r[   r   r   r
   r\   &  s   z"TooManyAccentuatedPlugin.feed_infoc                 C  rt   rm   rr   r8   r   r   r
   r]   -  ru   zTooManyAccentuatedPlugin.resetr^   c                 C  s*   | j dk rdS | j| j  }|dkr|S dS )Nr   rn   gffffff?rr   )r9   Zratio_of_accentuationr   r   r
   r_   1  s   
zTooManyAccentuatedPlugin.ratioNrP   r`   ra   rp   r   r   r   r
   rq     s    


rq   c                   @  rc   )UnprintablePlugin_unprintable_countrh   r5   r6   c                 C  rt   rm   rx   r8   r   r   r
   r:   >  ru   zUnprintablePlugin.__init__r$   r;   rW   r"   c                 C  s<   |j s|js|dkr|dkr|  jd7  _|  jd7  _dS )rk   u   ﻿r   N)r)   r%   ry   rh   r[   r   r   r
   r\   B  s   zUnprintablePlugin.feed_infoc                 C  s
   d| _ d S rm   )ry   r8   r   r   r
   r]   M  s   
zUnprintablePlugin.resetr^   c                 C     | j dkrdS | jd | j  S )Nr   rn   r   )rh   ry   r8   r   r   r
   r_   P     
zUnprintablePlugin.ratioNrP   r`   ra   rp   r   r   r   r
   rw   :  s    


rw   c                   @  rc   )SuspiciousDuplicateAccentPlugin_successive_countrh   _last_latin_character_last_was_accentuatedr5   r6   c                 C     d| _ d| _d | _d| _d S rj   r~   r8   r   r   r
   r:   a  s   
z(SuspiciousDuplicateAccentPlugin.__init__r$   r;   rW   r"   c                 C  st   |  j d7  _ | jdur1|jr1| jr1|jr!| j r!|  jd7  _t|t| jkr1|  jd7  _|| _|j| _dS rv   )rh   r   r.   r   r'   rI   r   r   r[   r   r   r
   r\   h  s   
z)SuspiciousDuplicateAccentPlugin.feed_infoc                 C  r   rj   r~   r8   r   r   r
   r]   w     
z%SuspiciousDuplicateAccentPlugin.resetr^   c                 C  r{   )Nr   rn   rl   )rh   r   r8   r   r   r
   r_   }  r|   z%SuspiciousDuplicateAccentPlugin.ratioNrP   r`   ra   rp   r   r   r   r
   r}   X  s    


r}   c                   @  rc   )SuspiciousRange"_suspicious_successive_range_countrh   _last_printable_seen_last_printable_ranger5   r6   c                 C     d| _ d| _d | _d | _d S rm   r   r8   r   r   r
   r:     r   zSuspiciousRange.__init__r$   r;   rW   r"   c                 C  s   |  j d7  _ |js|js|tv rd| _d| _dS | jdu r(|| _t|| _dS | j}t|}t||r;|  jd7  _|| _|| _dS rv   )	rh   r)   r3   r   r   r   r    is_suspiciously_successive_ranger   )r9   r$   rW   unicode_range_aunicode_range_br   r   r
   r\     s   



zSuspiciousRange.feed_infoc                 C  r   rm   )rh   r   r   r   r8   r   r   r
   r]     r   zSuspiciousRange.resetr^   c                 C  s"   | j dkrdS | jd | j  }|S )NrE   rn   rl   )rh   r   )r9   Zratio_of_suspicious_range_usager   r   r
   r_     s   
zSuspiciousRange.ratioNrP   r`   ra   rp   r   r   r   r
   r     s    


r   c                   @  rc   )SuperWeirdWordPlugin_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchrh   _bad_character_count_buffer_length_buffer_last_char_buffer_last_char_accentuated_buffer_accent_count_buffer_glyph_count_buffer_upper_countr5   r6   c                 C  sR   d| _ d| _d| _d| _d| _d| _d| _d| _d | _d| _	d| _
d| _d| _d S rj   r   r8   r   r   r
   r:     s   
zSuperWeirdWordPlugin.__init__r$   r;   rW   r"   c                 C  s  |j r@|  jd7  _|| _|jr|  jd7  _|j| _|jr%|  jd7  _| js4|j	r.|jr4|j
s4d| _|j
r>|  jd7  _dS | jsEdS |jsO|jsOt|r|  jd7  _| j}|  j|7  _|dkr| j| dkrod| _n'| jr| j r| j|kr|  jd7  _d| _n| jdkrd| _|  jd7  _|dkr| jr| jdko| j| dk}|s|  jd7  _d| _| jr|  jd7  _|  j|7  _d	| _d	| _d| _d| _d	| _d| _d| _d| _dS |d
vr|js|jrd| _|  jd7  _|| _d	| _dS dS dS dS )rk   r   TN         ?   r   ro   F>   ~-=_<>|)r&   r   r   r'   r   r.   r   r   r   r/   r2   r   r)   r3   r   r   rh   r   rI   r   r   r   r*   r4   )r9   r$   rW   Zbuffer_lengthZprobable_camel_casedr   r   r
   r\     s   




zSuperWeirdWordPlugin.feed_infoc                 C  sR   d| _ d | _d| _d| _d| _d| _d| _d| _d| _d| _	d| _
d| _d| _d S rj   )r   r   r   r   r   r   r   rh   r   r   r   r   r   r8   r   r   r
   r]   ,  s   
zSuperWeirdWordPlugin.resetr^   c                 C  s$   | j dkr| jdkrdS | j| j S )N
   r   rn   )r   r   r   rh   r8   r   r   r
   r_   ;  s   zSuperWeirdWordPlugin.ratioNrP   r`   ra   rp   r   r   r   r
   r     s    


Kr   c                   @  s@   e Zd ZdZdZdddZdddZdddZedddZ	dS )CjkUncommonPluginz<
    Detect messy CJK text that probably means nothing.
    rh   _uncommon_countr5   r6   c                 C  rt   rm   r   r8   r   r   r
   r:   K  ru   zCjkUncommonPlugin.__init__r$   r;   rW   r"   c                 C  s,   |  j d7  _ |tvr|  jd7  _dS dS rv   )rh   r   r   r[   r   r   r
   r\   O  s   zCjkUncommonPlugin.feed_infoc                 C  rt   rm   r   r8   r   r   r
   r]   V  ru   zCjkUncommonPlugin.resetr^   c                 C  s.   | j dk rdS | j| j  }|dkr|d S dS )Nr   rn   r   r   r   )r9   Zuncommon_form_usager   r   r
   r_   Z  s   
zCjkUncommonPlugin.ratioNrP   r`   ra   )
rQ   rR   rS   rT   rU   r:   r\   r]   rb   r_   r   r   r   r
   r   C  s    


r   c                   @  rc   )ArchaicUpperLowerPlugin	_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalrh   _last_alpha_seen_last_alpha_seen_upper_last_alpha_seen_lower_current_ascii_onlyr5   r6   c                 C  s:   d| _ d| _d| _d| _d| _d | _d| _d| _d| _d S )NFr   Tr   r8   r   r   r
   r:   t  s   
z ArchaicUpperLowerPlugin.__init__r$   r;   rW   r"   c                 C  s
  |j o|j}| }|r;| jdkr;| jdkr#|js#| js#|  j| j7  _d| _d| _d| _d| _|  j	d7  _	d| _dS | jrD|j
sDd| _| jdurj|jrO| jsU|jrg| jrg| jrc|  jd7  _d| _nd| _nd| _|  j	d7  _	|  jd7  _|| _|j| _|j| _dS )rk   r   @   NFr   Trl   )r&   r,   r   r*   r   r   r   r   r   rh   r+   r'   r   r(   r   )r9   r$   rW   Zis_concernedZ	chunk_sepr   r   r
   r\     sH   

z!ArchaicUpperLowerPlugin.feed_infoc                 C  s:   d| _ d| _d| _d| _d | _d| _d| _d| _d| _d S )Nr   FT)	rh   r   r   r   r   r   r   r   r   r8   r   r   r
   r]     s   
zArchaicUpperLowerPlugin.resetr^   c                 C  s   | j dkrdS | j| j  S )Nr   rn   )rh   r   r8   r   r   r
   r_     s   
zArchaicUpperLowerPlugin.ratioNrP   r`   ra   rp   r   r   r   r
   r   f  s    


-r   c                   @  s<   e Zd ZdZdddZdddZdddZedddZdS )ArabicIsolatedFormPluginrh   _isolated_form_countr5   r6   c                 C  rt   rm   r   r8   r   r   r
   r:     ru   z!ArabicIsolatedFormPlugin.__init__c                 C  rt   rm   r   r8   r   r   r
   r]     ru   zArabicIsolatedFormPlugin.resetr$   r;   rW   r"   c                 C  s.   |  j d7  _ |jt@ r|  jd7  _dS dS rv   )rh   r-   r   r   r[   r   r   r
   r\     s   
z"ArabicIsolatedFormPlugin.feed_infor^   c                 C  s   | j dk rdS | j| j  }|S )Nr   rn   r   )r9   Zisolated_form_usager   r   r
   r_     s   
zArabicIsolatedFormPlugin.ratioNrP   r`   ra   )	rQ   rR   rS   rU   r:   r]   r\   rb   r_   r   r   r   r
   r     s    


r      )maxsizer   
str | Noner   r5   rM   c                 C  sv  | du s|du r
dS | |krdS d| v rd|v rdS d| v s"d|v r$dS d| v s,d|v r6d| v s4d|v r6dS |  d| d}}|D ]}|tv rJqC||v rQ dS qC| dv |dv }}|s_|rid	| v sgd	|v ridS |ro|rodS d
| v swd
|v rd	| v sd	|v rdS | dks|dkrdS d	| v sd	|v s| dv r|dv rd| v sd|v rdS d| v sd|v rdS | dks|dkrdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFZLatinZ	EmoticonsZ	Combining )HiraganaKatakanaZCJKZHangulzBasic Latin)r   r   PunctuationZForms)splitr   )r   r   Zkeywords_range_aZkeywords_range_belZrange_a_jp_charsZrange_b_jp_charsr   r   r
   r     sZ   r   i   皙?Fdecoded_sequencer;   maximum_thresholdr^   debugc              	   C  s  t | }|dk rd}n	|dk rd}nd}t }t }t }t }t }	t }
t }t }t	 }|j
}|j
}|j
}|j
}|	j
}|
j
}|j
}|j
}|j
}t }|j}td||D ]n}| |||  D ]B}|| ||| ||| ||| |jr||| ||| |jr||| |jr||| |jr||| |jr||| q`|j|j |j |j |	j |
j |j |j |j }||kr n/qV|d |d| |d| |d| |j|j |j |j |	j |
j |j |j |j }|rDtd}|td	| d
| d|  |dkr(|td| dd   |td| dd   |||||	|
|||f	D ]}|t|j d|j  q3t|dS )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    i  rC   r   r   r<   r   
Zcharset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=   zStarting with: NzEnding with: iz: r   )lenrd   rq   rw   r}   r   r   r   r   r   r\   r"   rO   ranger%   r&   r/   r0   r1   r_   r   logr   	__class__round)r   r   r   Zseq_lenstepZd_spZd_taZd_upZd_sdaZd_srZd_swZd_cuZd_auZd_aiZ	d_sp_feedZ	d_ta_feedZ	d_up_feedZ
d_sda_feedZ	d_sr_feedZ	d_sw_feedZ	d_cu_feedZ	d_au_feedZ	d_ai_feedrW   Zinfo_updateZblock_startr$   Zmean_mess_ratiologgerdtr   r   r
   
mess_ratio*  s   













r   )r   r   r   r   r5   rM   )r   F)r   r;   r   r^   r   rM   r5   r^   )1
__future__r   sys	functoolsr   loggingr   version_infotypingr   Ztyping_extensionsImportErrorZconstantr   r   r   r   r   r   r   r   r   r   r   r   r   utilsr   r   r   r   r   r   r   r!   __annotations__r"   rV   rd   rq   rw   r}   r   r   r   r   r   r   r   r   r   r   r
   <module>   sT    
<$ 1,7 "\I