o
    iH                  	   @  s  U d dl mZ d dlZd dlmZ d dlmZ ddlmZm	Z	m
Z
mZ ddlmZmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZmZmZ edZe  Z!e!"e#d g Z$de%d< g Z&de%d< eD ]!Z'zee'r{e$(e' ne&(e' W qn e)y   e&(e' Y qnw e$e& Z*de%d< 									d6d7d(d)Z+									d6d8d,d-Z,									d6d9d0d1Z-									d:d;d4d5Z.dS )<    )annotationsN)PathLike)BinaryIO   )coherence_ratioencoding_languagesmb_encoding_languagesmerge_coherence_ratios)IANA_SUPPORTEDIANA_SUPPORTED_SIMILARTOO_BIG_SEQUENCETOO_SMALL_SEQUENCETRACE)
mess_ratio)CharsetMatchCharsetMatches)any_specified_encodingcut_sequence_chunks	iana_nameidentify_sig_or_bomis_multi_byte_encodingshould_strip_sig_or_bomZcharset_normalizerz)%(asctime)s | %(levelname)s | %(message)sz	list[str]_mb_supported_sb_supportedIANA_SUPPORTED_MB_FIRST      皙?TF皙?	sequencesbytes | bytearraystepsint
chunk_size	thresholdfloatcp_isolationlist[str] | Nonecp_exclusionpreemptive_behaviourboolexplainlanguage_thresholdenable_fallbackreturnr   c
           @      C  s  t | ttfstdt| |rtj}
tt	 t
t t| }|dkrDtd |r8tt	 t
|
 tt| dddg dgS |durZttd	d
| dd |D }ng }|durrttdd
| dd |D }ng }||| krttd||| d}|}|dkr|| |k rt|| }t| tk }t| tk}|rttd| n|rttd| g }|rt| nd}|dur|| ttd| t }g }g }t }t }i }d}t }d}d}d}d}d}d}t }t }t| \} }!| dur||  ttdt|!|  |d d|vr#|d |t D ]'}"|r4|"|vr4q'|r>|"|v r>q'|"|v rEq'||" d}#| |"k}$|$oVt|"}%|"dv rh|$shttd|" q'|"dv ry|$syttd|" q'|"|v rttd|" q'|"|v rttd|" q'zt|"}&W n ttfy   ttd|" Y q'w |r|&stt |"}'ntt!|"}'|'"|sttd|"|'| q'|r|&s||krttd|"|| q'|r|&sttd |" q'z9|r|&du rt#|%du r| dtd! n	| t|!td! |"d" nt#|%du r&| n| t|!d |"d"}#W n+ t$t%fy] }( zt |(t%sMttd#|"t#|( ||" W Y d}(~(q'd}(~(ww t&|$sddnt|!|t|| })|&o||#duo|t|#|k }*|*rttd$|" |#dur{|&s{t'|#}+|(|+},|,dur{|,\}-}.}/|/rCt| |"|-|$|.|du s|"|ddfv r|#nd|d%}0||0 ||" ttd&|"t)|-d' d(d) |"|ddfv r	|-d*k r	|-dkrtd+|0j* |rtt	 t
|
 t|0g  S ||0 t|rA|du s||v rAd|v rAd|v rA|+ }1td+|1j* |r:tt	 t
|
 t|1g  S q'||" ttd,|" |	ry|"dd|d-d.fv ryt| |"||$g |#|d%}2|"|kro|2}n
|"dkrw|2}n|2}q'tt|)d/ }3t,|3d0}3d}4d}5g }6g }7zLt-| |"|)||$|%|!|&|#	D ]=}8|6|8 |7t.|8||d1u odt|  kod0kn   |7d2 |kr|4d7 }4|4|3ks|$r|%du r nqW n! t$y }( zttd3|"t#|( |3}4d1}5W Y d}(~(nd}(~(ww |5s:|r:|&s:z| td4d j/|"d5d6 W n# t$y9 }( zttd7|"t#|( ||" W Y d}(~(q'd}(~(ww |7rEt0|7t|7 nd}9|9|ksQ|4|3kr||" |"t1v rb|2t1|"  |#duru|&su|3t'|#|9g df ttd8|"|4t)|9d' d(d) |	r|"dd|d-d.fv r|5st| |"||$g |#|d%}2|"|kr|2}n
|"dkr|2}n|2}q'ttd9|"t)|9d' d(d) |&st |"}:nt!|"}:|:rttd:|"t#|: g };|"dkr|6D ]}8t4|8||:rd;|:nd}<|;|< qt5|;}=nt5|;}=|=rttd<|=|" t| |"|9|$|=|du s%|"|ddfv r'|#nd|d%}>||> |#durD|&sD|3t'|#|9|=d1f |rS|&sS|9d=k rS|d7 }|"|ddfv r|9d*k r|9dkrtd+|>j* |rytt	 t
|
 t|>g  S ||> t|r|du s||v rd|v rd|v r|+ }1td+|1j* |rtt	 t
|
 t|1g  S |s|&s|=rt,d>d? |=D dd@nd}?|?dAkrd|v rd|v rd1}|2|: ttdB|"|9|? |s-|&r-|*r-|#dur-t|#|dC k r-|"dDvr-d|v r-d|v r-d1}ttdE|"|9t|#|t|#| d'  |"| krNtdF|" |rEtt	 t
|
 t||" g  S q't|dkr|s`|s`|rfttdG |rvtdH|j* || n2|r~|du s|r|r|j6|j6ks|durtdI || n|rtdJ || |rtdK|+ j*t|d  ntdL |rtt	 t
|
 |S )Maf  
    Given a raw bytes sequence, return the best possibles charset usable to render str objects.
    If there is no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
    And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
    but never take it for granted. Can improve the performance.

    You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
    purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
    toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
    Custom logging format and handler can be set manually.
    z3Expected object of type bytes or bytearray, got: {}r   z<Encoding detection on empty bytes, assuming utf_8 intention.utf_8g        F Nz`cp_isolation is set. use this flag for debugging purpose. limited list of encoding allowed : %s.z, c                 S     g | ]}t |d qS Fr   .0cp r7   P/home/kim/smarthome/.venv/lib/python3.10/site-packages/charset_normalizer/api.py
<listcomp>s       zfrom_bytes.<locals>.<listcomp>zacp_exclusion is set. use this flag for debugging purpose. limited list of encoding excluded : %s.c                 S  r1   r2   r3   r4   r7   r7   r8   r9   ~   r:   z^override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.r   z>Trying to detect encoding from a tiny portion of ({}) byte(s).zIUsing lazy str decoding because the payload is quite large, ({}) byte(s).z@Detected declarative mark in sequence. Priority +1 given for %s.   zIDetected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.ascii>   utf_16utf_32z\Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.>   utf_7zREncoding %s won't be tested as-is because detection is unreliable without BOM/SIG.zY%s is deemed too similar to a code page that was already considered unsuited. Continuing!zESkipping %s: already fast-tracked from a similar successful encoding.z2Encoding %s does not provide an IncrementalDecoderzbSkipping %s: definitive match already found, this encoding targets different languages (%s vs %s).zXSkipping %s: already accumulated %d same-family results after definitive match (cap=%d).zCSkipping single-byte %s: multi-byte definitive match already found.g    A)encodingz9Code page %s does not fit given bytes sequence at ALL. %szpCode page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes.)Zpreemptive_declarationzM%s fast-tracked (identical decoded payload to a prior encoding, chaos=%f %%).d      )ndigitsr   z.Encoding detection: %s is most likely the one.zZ%s fast-skipped (identical decoded payload to a prior encoding that failed chaos probing).r=   r>         TzaLazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %sg     j@strict)errorsz^LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %szc%s was excluded because of initial chaos probing. Gave up %i time(s). Computed mean chaos is %f %%.z=%s passed initial chaos probing. Mean measured chaos is %f %%z&{} should target any language(s) of {},z We detected language {} using {}g{Gz?c                 s  s    | ]\}}|V  qd S )Nr7   )r5   _vr7   r7   r8   	<genexpr>  s    zfrom_bytes.<locals>.<genexpr>)defaultg      ?zyDefinitive match found: %s (chaos=%.3f, coherence=%.2f). Encodings targeting different language families will be skipped.g\(\?>	   Z	utf_8_sig	utf_32_ler?   r=   r>   	utf_16_ler/   	utf_32_be	utf_16_bezjMulti-byte definitive match: %s (chaos=%.3f, decoded=%d/%d=%.1f%%). Single-byte encodings will be skipped.zoEncoding detection: %s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.zONothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.z7Encoding detection: %s will be used as a fallback matchz:Encoding detection: utf_8 will be used as a fallback matchz:Encoding detection: ascii will be used as a fallback matchz]Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.z=Encoding detection: Unable to determine any suitable charset.)7
isinstance	bytearraybytes	TypeErrorformattypeloggerlevel
addHandlerexplain_handlersetLevelr   lendebugremoveHandlerr   r   logjoinr"   r   r   r   appendsetr   r   addr   r   ModuleNotFoundErrorImportErrorr   r   intersectionstrUnicodeDecodeErrorLookupErrorrangehashgetroundr@   bestmaxr   r   decodesumr   update
setdefaultr   r	   fingerprint)@r   r!   r#   r$   r&   r(   r)   r+   r,   r-   Zprevious_logger_levellengthZis_too_small_sequenceZis_too_large_sequenceZprioritized_encodingsZspecified_encodingZtestedZtested_but_hard_failureZtested_but_soft_failureZsoft_failure_skipZsuccess_fast_trackedZpayload_result_cacheZdefinitive_match_foundZdefinitive_target_languagesZ post_definitive_sb_success_countZPOST_DEFINITIVE_SB_CAPZmb_definitive_match_foundZfallback_asciiZfallback_u8Zfallback_specifiedresultsZearly_stop_resultsZsig_encodingZsig_payloadZencoding_ianaZdecoded_payloadZbom_or_sig_availableZstrip_sig_or_bomZis_multi_byte_decoderZenc_languageseZr_Zmulti_byte_bonusZpayload_hashcachedZcached_messZ	cached_cdZcached_passedZ
fast_matchZprobable_resultZfallback_entryZmax_chunk_gave_upZearly_stop_countZlazy_str_hard_failureZ	md_chunksZ	md_ratioschunkZmean_mess_ratioZtarget_languagesZ	cd_ratiosZchunk_languagesZcd_ratios_mergedZcurrent_matchZbest_coherencer7   r7   r8   
from_bytes9   s  





	











	




















	


&







	















	
















r{   fpr   c
           
      C  s   t |  |||||||||	
S )z
    Same thing than the function from_bytes but using a file pointer that is already ready.
    Will not close the file pointer.
    )r{   read)
r|   r!   r#   r$   r&   r(   r)   r+   r,   r-   r7   r7   r8   from_fpR  s   r~   pathstr | bytes | PathLikec
                 C  sH   t | d}
t|
|||||||||	
W  d   S 1 sw   Y  dS )z
    Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
    Can raise IOError.
    rbN)openr~   )r   r!   r#   r$   r&   r(   r)   r+   r,   r-   r|   r7   r7   r8   	from_pathp  s   $r   fp_or_path_or_payload!PathLike | str | BinaryIO | bytesc
                 C  s   t | ttfrt| |||||||||	d
}
|
 S t | ttfr0t| |||||||||	d
}
|
 S t| |||||||||	d
}
|
 S )a)  
    Detect if the given input (file, bytes, or path) points to a binary file. aka. not a string.
    Based on the same main heuristic algorithms and default kwargs at the sole exception that fallbacks match
    are disabled to be stricter around ASCII-compatible but unlikely to be a string.
    )	r!   r#   r$   r&   r(   r)   r+   r,   r-   )rR   rh   r   r   rT   rS   r{   r~   )r   r!   r#   r$   r&   r(   r)   r+   r,   r-   Zguessesr7   r7   r8   	is_binary  s\   -r   )	r   r   r   NNTFr   T)r   r    r!   r"   r#   r"   r$   r%   r&   r'   r(   r'   r)   r*   r+   r*   r,   r%   r-   r*   r.   r   )r|   r   r!   r"   r#   r"   r$   r%   r&   r'   r(   r'   r)   r*   r+   r*   r,   r%   r-   r*   r.   r   )r   r   r!   r"   r#   r"   r$   r%   r&   r'   r(   r'   r)   r*   r+   r*   r,   r%   r-   r*   r.   r   )	r   r   r   NNTFr   F)r   r   r!   r"   r#   r"   r$   r%   r&   r'   r(   r'   r)   r*   r+   r*   r,   r%   r-   r*   r.   r*   )/
__future__r   loggingosr   typingr   cdr   r   r   r	   Zconstantr
   r   r   r   r   mdr   modelsr   r   utilsr   r   r   r   r   r   	getLoggerrX   StreamHandlerr[   setFormatter	Formatterr   __annotations__r   Z_supported_encrb   rf   r   r{   r~   r   r   r7   r7   r7   r8   <module>   s     
		
      ! !