o
    i                     @   s<  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlZd dlZd dlmZmZ d dlmZ d d	lmZ d d
lmZmZmZmZmZmZmZm Z  d dl!m"Z"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0 dZ1dZ2e1e2 Z3dd Z4dd Z5dd Z6dd Z7dd Z8dd Z9ej:;deefdd  Z<d!d" Z=d#d$ Z>d%d& Z?d'd( Z@d)d* ZAd+d, ZBd-d. ZCd/d0 ZDd1d2 ZEd3d4 ZFd5d6 ZGd7d8 ZHd9d: ZId;d< ZJd=d> ZKd?d@ ZLdAdB ZMej:jNe.dCdDdEdF ZOdGdH ZPdIdJ ZQdKdL ZRdMdN ZSdOdP ZTej:;deefdQdR ZUdSdT ZVdUdV ZWdWdX ZXdYdZ ZYd[d\ ZZej:;deefd]d^ Z[d_d` Z\dadb Z]dcdd Z^dedf Z_dgdh Z`didj Zaej:;dkejbejcejdgdldm Zedndo Zfdpdq Zgdrds Zhdtdu Zidvdw Zjdxdy Zkdzd{ Zld|d} Zmd~d Zndd Zodd Zpej:;deeefdd Zqej:;dejrejsgdd Ztej:;dee/e0dd Zuej:;dejvejsdfejwejsdfejrejrdfejsejsdfgdd Zxej:;deddeddeddgdd Zydd Zzdd Z{e,ej:;de0dd Z|ej:;deeegdd Z}ej:;deeegej:;dde~dfdedfgdd Zej:;deeeegej:;ddd dd gej:;dddgdd Zej:;deeegdd Zej:;deeegej:;dddgddddddddf	ddd dddddddf	ddd dddddddf	dddd dddd dddf	ddddddd dddf	dgddȄ Zej:;deddd̜dddΜgfee1ffddЄ Zdd҄ ZddԄ Zej:;deeeegddք Zej:;de0dd؄ Zej:;dejrejsgddۄ Zdd݄ Zdd߄ ZdS )    N)defaultdict)Mapping)partial)StringIO)product)assert_array_almost_equalassert_array_equal)sparse)clone)ENGLISH_STOP_WORDSCountVectorizerHashingVectorizerTfidfTransformerTfidfVectorizerstrip_accents_asciistrip_accents_unicode
strip_tags)GridSearchCVcross_val_scoretrain_test_split)Pipeline)	LinearSVC)assert_allclose_dense_sparseassert_almost_equalskip_if_32bit)_IS_WASMCSC_CONTAINERSCSR_CONTAINERS)zthe pizza pizza beer copyrightzthe pizza burger beer copyrightz!the the pizza beer beer copyrightzthe burger beer beer copyrightzthe coke burger coke copyrightzthe coke burger burger)zthe salad celeri copyrightz)the salad salad sparkling water copyrightzthe the celeri celeri copyrightzthe tomato tomato salad waterz the tomato salad water copyrightc                 C   s   t |  S N)r   uppers r"   d/home/kim/smarthome/.venv/lib/python3.10/site-packages/sklearn/feature_extraction/tests/test_text.py	uppercase9      r$   c                 C   s   |  ddS )N   ée)replacer    r"   r"   r#   strip_eacute=   r%   r)   c                 C      |   S r   splitr    r"   r"   r#   split_tokenizeA      r-   c                 C   s   dgS )NZthe_ultimate_featurer"   r    r"   r"   r#   lazy_analyzeE   s   r/   c                  C   s   d} d}t | |ksJ d} d}t | |ksJ d} d}t | |ks$J d} d}t | |ks0J d	} d
}t | |ks<J d} d}t | |ksHJ d} d
}t | |ksTJ d S )N   àáâãäåçèéêëaaaaaaceeee   ìíîïñòóôõöùúûüýiiiinooooouuuuy   إu   ا   this is à testthis is a testu   öou   ̀́̂̃ u   ȫ)r   aexpectedr"   r"   r#   test_strip_accentsI   s*   r<   c                  C   sd   d} d}t | |ksJ d} d}t | |ksJ d} d}t | |ks$J d} d}t | |ks0J d S )	Nr0   r1   r2   r3   r4   r8   r5   r6   )r   r9   r"   r"   r#   test_to_asciim   s   r=   
Vectorizerc                 C   s   | dd  }d}g d}|||ksJ d}g d}|||ks#J | dd  }td	}g d
}|||ks:J | td  }d}g d}|||ksOJ | tdd  }d}g d}|||kseJ d S )Nasciistrip_accents:   J'ai mangé du kangourou  ce midi, c'était pas très bon.)
aimangedu	kangouroucemidietaitpastresbonz0This is a test, really.

 I met Harry yesterday.)thisistestZreallyZmetZharryZ	yesterdayfile)input'This is a test with a file-like object!)rM   rN   rO   withrP   likeobjectpreprocessoru;   J'ai mangé du kangourou  ce midi,  c'était pas très bon.)
ZAIZMANGEZDUZ	KANGOUROUZCEZMIDIZETAITZPASZTRESZBON)	tokenizerrA   )
zj'airD   rE   rF   rG   zmidi,zc'etaitrJ   rK   zbon.)build_analyzerr   r$   r-   )r>   watextr;   r"   r"   r#   test_word_analyzer_unigrams   s&   r\   c                  C   s2   t dddd } d}g d}| ||ksJ d S )Nwordunicode      analyzerrA   ngram_rangerB   )rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   zai mangezmange duzdu kangourouzkangourou cezce midiz
midi etaitz	etait paszpas tresztres bon)r   rY   )rZ   r[   r;   r"   r"   r#   'test_word_analyzer_unigrams_and_bigrams   s   re   c                  C   s   d} |  d}tddd }tt || W d    n1 s#w   Y  tdddd }tt || W d    d S 1 sFw   Y  d S )	NrB   zutf-8r_   r?   )rd   encodingchar      )rc   rd   rf   )encoder   rY   pytestraisesUnicodeDecodeError)r[   Z
text_bytesrZ   car"   r"   r#   test_unicode_decode_error   s   


"rp   c                  C   s   t dddd } d}g d}| |d d |ksJ g d}| |d	d  |ks+J d
}g d}| |d d |ks=J g d}| |d	d  |ksMJ t dddd } td}g d}| |d d |ksjJ d S )Nrg   r^   rh   rb   u9   J'ai mangé du kangourou  ce midi, c'était pas très bon)zj'az'aizai zi mz ma   )zs tresz tres ztres bzres bozes bon1This 
	is a test, really.

 I met Harry yesterday)thihisis zs iz is)z yesteyesteresterdsterdaterdayrP   rQ   rc   rd   rR   r   rY   r   cngar[   r;   r"   r"   r#   test_char_ngram_analyzer   s.   r   c                  C   s   t dddd } d}g d}| |d d |ksJ g d}| |d	d  |ks+J t d
ddd } td}g d}| |d d |ksHJ d S )NZchar_wbr^   rh   rb   rs   )z thrt   ru   rv   z thirq   )rw   rx   ry   rz   zerday rr   rP   r{   zA test with a file-like object!)z a z teZtesestzst z tesrj   r|   r}   r"   r"   r#   test_char_wb_ngram_analyzer  s$   r   c                  C   s   t dddd } d}g d}| |d d |ksJ g d}| |d	d  |ks+J t d
ddd }t|}||| |ksBJ d S )Nr]   r^   rh   rb   rs   )zthis is testzis test reallyztest really metri   )ztest really met harry yesterdayzthis is test really met harryz"is test really met harry yesterdayrP   r{   r|   )r~   r[   r;   Z	cnga_filerP   r"   r"   r#   test_word_ngram_analyzer  s"   r   c                  C   s   ddd} t |  }ttttttfD ]O}|| }t|d}|	t
 t|tr1|j| ks0J n	t |j|ks:J |t
}|jd t|ksJJ || }t|d}||}t||jd kscJ qd S )Nr   r`   pizzabeer
vocabulary)setkeysdictlistiterr   r   intr   fitJUNK_FOOD_DOCS
isinstancer   vocabulary_	transformshapeleninverse_transform)vocabtermstypvvectXinvr"   r"   r#   &test_countvectorizer_custom_vocabulary6  s    






r   c                  C   sd   ddg} t dt| dfdt fg}|t}t|jd jt| ks%J |jd t	| ks0J d S )Nr   r   countr   tfidfr`   )
r   r   r   fit_transformALL_FOOD_DOCSr   named_stepsr   r   r   )Zwhat_we_likepiper   r"   r"   r#   /test_countvectorizer_custom_vocabulary_pipelineK  s   
r   c                  C   sX   ddd} d}t jt|d t| d}|dg W d    d S 1 s%w   Y  d S )Nr   r   z$Vocabulary contains repeated indicesmatchr   Zpasta_sizilianarl   rm   
ValueErrorr   r   )r   msgr   r"   r"   r#   7test_countvectorizer_custom_vocabulary_repeated_indicesX  s   

"r   c                  C   sT   ddd} t jtdd t| d}|dg W d    d S 1 s#w   Y  d S )Nr`   ra   r   zdoesn't contain indexr   r   Zpasta_verdurar   r   r   r"   r"   r#   0test_countvectorizer_custom_vocabulary_gap_index`  s
   

"r   c                  C   s   t  } | jdd |  tksJ | jdd tt |   W d    n1 s+w   Y  | jdd tt |   W d    n1 sJw   Y  g d}| j|d |  t|kscJ d S )Nenglish
stop_wordsZ_bad_str_stop_Z_bad_unicode_stop_)Zsomeotherwords)r   
set_paramsget_stop_wordsr   rl   rm   r   r   )cvZstoplistr"   r"   r#   test_countvectorizer_stop_wordsg  s   

r   c                  C   s   t jtdd tg d} | dg W d    n1 sw   Y  t jtdd tddd}|g d W d    d S 1 sBw   Y  d S )	Nzempty vocabularyr   r   foo      ?r   )max_dfr   )zto be or not to bez
and me toozand so do your   )r   r   r"   r"   r#   %test_countvectorizer_empty_vocabularyv  s   
"r   c                  C   sF   t  } | td d }| tdd  }|jd |jd ks!J d S )Nrq   r`   )r   r   r   r   )r   ZX1X2r"   r"   r#   test_fit_countvectorizer_twice  s   r   c                  C   s>   g d} d}t |d}||  g d}| }t|| dS )zCheck `get_feature_names_out()` when a custom token pattern is passed.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    z&This is the 1st document in my corpus.z This document is the 2nd sample.zAnd this is the 3rd one.zIs this the 4th document?z'[0-9]{1,3}(?:st|nd|rd|th)\s\b(\w{2,})\btoken_pattern)documentonesampleN)r   r   get_feature_names_outr   )corpusr   
vectorizerr;   feature_names_outr"   r"   r#   )test_countvectorizer_custom_token_pattern  s   

r   c                  C   sX   g d} d}d}t |d}tjt|d ||  W d   dS 1 s%w   Y  dS )zCheck that we raise an error if token pattern capture several groups.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    r   z)([0-9]{1,3}(?:st|nd|rd|th))\s\b(\w{2,})\bz,More than 1 capturing group in token patternr   r   Nr   rl   rm   r   r   )r   r   err_msgr   r"   r"   r#   <test_countvectorizer_custom_token_pattern_with_several_group  s   
"r   c                  C   s   g d} d}t d| d}tjt|d ||  W d    n1 s#w   Y  t  tdt ||  W d    d S 1 sCw   Y  d S )N)ZSampleZUpperZCaseZ
VocabularyzyUpper case characters found in vocabulary while 'lowercase' is True. These entries will not be matched with any documentsT)	lowercaser   r   error)	r   rl   warnsUserWarningr   warningscatch_warningssimplefilterr   )r   messager   r"   r"   r#   'test_countvectorizer_uppercase_in_vocab  s   
"r   c                  C   sH   g dg dg dg} t ddd| }g d}||}t|| dS )	z0Check get_feature_names_out for TfidfTransformerr`   r`   r`   r`   r`   r   r`   r   r   Tl2
smooth_idfnorm)r:   cbN)r   r   r   r   )r   trZfeature_names_inr   r"   r"   r#   %test_tf_transformer_feature_names_out  s
   
r   c                  C   s   g dg dg dg} t ddd}||  }|dk s J t|d jd	d
g d g dg dg dg} t ddd}||  }|dk sMJ d S )Nr   r   r   Tr   r   r   ra   r`   Zaxisr   r   r   )r   r   toarrayallr   sumr   r   r   r"   r"   r#   test_tf_idf_smoothing  s   r   zcno floating point exceptions, see https://github.com/numpy/numpy/pull/21895#issuecomment-1311525881)reasonc                  C   s   g dg dg dg} t ddd}||  }|dk s J t|d jd	d
g d g dg dg dg} t ddd}d}tjt|d ||   W d    d S 1 sZw   Y  d S )Nr   r   r   Fr   r   r   ra   r`   r   r   zdivide by zeror   )	r   r   r   r   r   r   rl   r   RuntimeWarning)r   r   r   Zin_warning_messager"   r"   r#   test_tfidf_no_smoothing  s   "r   c                  C   s   dgdgdgg} t ddd d}||  }|d dksJ |d |d ks(J |d |d ks2J |d dk s:J |d dk sBJ d S )Nr`   ra   ri   TF)sublinear_tfuse_idfr   r   )r   r   r   r   r"   r"   r#   test_sublinear_tf  s   r   c                  C   s  t td d } td g}ttd }tdd}|| }t|dr&| }|d|jd f dks3J t|jd	}||fD ]s}||}t|drM| }|j}|d|d
 f dks\J |d|d f dkshJ |d|d f dkstJ d|vszJ d|vsJ |d|d f dksJ |d|d f dksJ |d|d f dksJ |d|d f dksJ q=t	dd}	|	
|| }
t|	jt|jksJ |
j|t|jfksJ |	| }|jt|t|jfksJ t	ddd}|
|| }t|drJ t	dd}tt || W d    n	1 s w   Y  ttj|dddg|  t td d } tdd}|j|_||  }|jrPJ t|
| || }t|| td d	}tt ||  W d    n	1 s|w   Y  |jddd | }d}t|}||}||ksJ |jdd d tt |  W d    n	1 sw   Y  d |_tt |  W d    d S 1 sw   Y  d S )!Nr`         ?r   tocsrr   r   ra   r   saladtomatowaterthe	copyrightcokeburgerr   l1r   F)r   r   idf_Tr   r   r   r?   )rA   r   rB   Z_gabbledegook_)rA   rW   Z_invalid_analyzer_type_)r   r   r   r   r   hasattrr   r   r   r   r   r   r   r   rl   rm   r   r   npr   r   r   fixed_vocabulary_r   build_preprocessorr   rY   )
train_data	test_dataZn_trainZv1Zcounts_trainZv2r   Zcounts_testr   t1r   Z
tfidf_testt2tfZt3tvZtfidf2Ztfidf_test2Zv3	processorr[   r;   resultr"   r"   r#   test_vectorizer  s~   













$r  c                  C   s  d\} }}}t | |||d}|t |jj| ksJ |jj|ks#J |jj|ks+J |jj|ks3J d|_d|_d|_d|_|jj| ksGJ |jj|ksOJ |jj|ksWJ |jj|ks_J |t |jj|jksmJ |jj|jksvJ |jj|jksJ |jj|jksJ d S )N)r   FFF)r   r   r   r   r   T)r   r   r   _tfidfr   r   r   r   )r   r   r   r   r  r"   r"   r#   test_tfidf_vectorizer_settersi  s,   

r	  c                  C   sv  t  } | t}|j}|jtt| jfksJ |j| jksJ t	|j
dks)J t	|j
dk s3J t|j
dks=J t|j
dk sGJ t|jd D ]}ttj|d j
dd qNt ddd} | t}|jtt| jfksuJ |j| jks}J |j}||ksJ |d| k sJ t	|j
dksJ t|j
dk sJ t|jd D ]}ttj|d j
dd qd S )	Nr   r   r`   ra   r   r_   r   )rd   r   )r   r   r   nnzr   r   
n_featuresdtyper   mindatamaxranger   Zlinalgr   )r   r   Z	token_nnziZ
ngrams_nnzr"   r"   r#   test_hashing_vectorizer  s.   

r  c                  C   s2  t dd} tt |   W d    n1 sw   Y  | jr#J | t}|j\}}t	| j
|ks6J |  }t|tjsBJ |jtksIJ t	||ksQJ tg d| t|D ]\}}|| j
|ksjJ q\g d}t |d} |  }tg d| | jsJ t|D ]\}}|| j
|ksJ qd S )Nr   r   	r   r   celerir   r   r   Z	sparklingr   r   r   )r   rl   rm   r   r   r   r   r   r   r   r   r   r   Zndarrayr  rU   r   	enumerateget)r   r   Z	n_samplesr  Zfeature_namesidxnamer   r"   r"   r#   test_feature_names  s:   






r  c                 C   s4   h d}| ddd}| t t|j|ksJ d S )N>   r   r   r   r   g333333?   )r   max_features)r   r   r   r   )r>   Zexpected_vocabularyr   r"   r"   r#   test_vectorizer_max_features  s   
r  c            	      C   s   t dd} t dd}t d d}| tjdd}|tjdd}|tjdd}|  }| }| }d| ks>J d| ksFJ d| ksNJ d|t| ksYJ d|t| ksdJ d|t| ksoJ d S )Nr`   r  ri   r   r      r   )r   r   r   r   r   r  r   Zargmax)	Zcv_1Zcv_3Zcv_NoneZcounts_1Zcounts_3Zcounts_NoneZ
features_1Z
features_3Zfeatures_Noner"   r"   r#   "test_count_vectorizer_max_features  s   


r  c                  C   s   g d} t ddd}||  d|j v sJ t|j dks#J d|_||  d|j vs4J t|j dks?J d	|_||  d|j vsPJ t|j dks[J d S )
NabcZdeaZeatrg   r   rc   r   r:   rj   r   r  r`   )r   r   r   r   r   r   r   r   r"   r"   r#   test_vectorizer_max_df     


r$  c                  C   s   g d} t ddd}||  d|j v sJ t|j dks#J d|_||  d|j vs4J t|j dks?J d	|_||  d|j vsPJ t|j dks[J d S )
Nr   rg   r`   )rc   min_dfr:   rj   ra   r   g?)r   r   r   r   r   r&  r#  r"   r"   r#   test_vectorizer_min_df)  r%  r'  c                  C   s   ddg} t ddd}||  }tg d|  tg dg dg| t ddd	d
}||  }tg dg dg| t ddd	tjd}|| }|jtjksTJ d S )Naaabcabbderg   r   r"  )r:   r   r   dr'   )ri   r`   r`   r   r   )r`   ra   r   r`   r`   T)rc   r   binary)r`   r`   r`   r   r   )r`   r`   r   r`   r`   )rc   r   r+  r  )r   r   r   r   r   r   float32r  )r   r   r   ZX_sparser"   r"   r#   test_count_binary_occurrences;  s   
r-  c                  C   s   ddg} t ddd d}|| }t|dd jdksJ t|dd	 jd	ks,J |jtjks4J t ddd
d d}|| }t|jdksKJ |jtjksSJ t ddd
d tjd}|| }|jtjksjJ d S )Nr(  r)  Frg   )alternate_signrc   r   r   r`   ri   ra   T)rc   r.  r+  r   )rc   r.  r+  r   r  )r   r   r   r  r  r  float64)r   r   r   r"   r"   r#   test_hashed_binary_occurrencesO  s"   


r0  c                 C   s  t }|  }||}||}t|tsJ | }t||D ]\}}tt	||}tt	|}t
|| qt|sBJ |jdksIJ | }	||	}
t||
D ]\}}t
t|t| qW| }||}t||D ]\}}t
t|t| qud S )NZcsr)r   r   r   r   r   rY   zipr   sortuniquer   r	   issparseformatr   Ztocsc)r>   r  r   Ztransformed_dataZinversed_dataZanalyzedocZinversed_termsr   Ztransformed_data2Zinversed_data2Zterms2Ztransformed_data3Zinversed_data3Zterms3r"   r"   r#   !test_vectorizer_inverse_transformi  s*   



r7  c                  C   s   t t } dgtt  dgtt  }t| |ddd\}}}}tdt fdt fg}dd	gd
d}t||ddd}|||	|}	t
|	| |jdksNJ |jjd }
|
jdks[J d S )Nr   r`   g?r   Z	test_sizerandom_stater   svcr`   r`   r_   ZhingeZsquared_hinge)vect__ngram_range	svc__lossri   )n_jobsr   r   )r   NOTJUNK_FOOD_DOCSr   r   r   r   r   r   r   predictr   best_score_best_estimator_r   rd   r  targetr   r   Ztarget_trainZtarget_testpipeline
parametersZgrid_searchpredZbest_vectorizerr"   r"   r#   -test_count_vectorizer_pipeline_grid_selection  s   
rI  c                  C   s   t t } dgtt  dgtt  }t| |ddd\}}}}tdt fdt fg}dd	gd
dd}t||dd}|||	|}	t
|	| |jdksNJ |jjd }
|
jdks[J |
jdksbJ |
jrgJ d S )Nr   r`   g?r   r8  r   r:  r;  r_   )r   r   r<  )r=  Z
vect__normr>  )r?  r   r   )r   r@  r   r   r   r   r   r   r   rA  r   rB  rC  r   rd   r   r   rD  r"   r"   r#   'test_vectorizer_pipeline_grid_selection  s$   
rJ  c                  C   s^   t t } dgtt  dgtt  }tdt fdt fg}t|| |dd}t|g d d S )Nr   r`   r   r:  ri   )r   r   )r   r@  r   r   r   r   r   r   )r  rE  rF  Z	cv_scoresr"   r"   r#   )test_vectorizer_pipeline_cross_validation  s
   rK  c                  C   sx   d} t  }|| g}|jdksJ td dd}|| g}|jdks%J |j|jks-J tt|j	t|j	 d S )Nu   Машинное обучение — обширный подраздел искусственного интеллекта, изучающий методы построения алгоритмов, способных обучаться.)r`      F)r   r.  )r`   i   )
r   r   r   r   r   r
  r   r   r2  r  )r   r   Z	X_countedZX_hashedr"   r"   r#   test_vectorizer_unicode  s   rM  c                  C   sF   ddg} t | d}|t}|t}t| |  |js!J d S )Nr   r  r   )r   r   r   r   r   r   r   )r   r   ZX_1ZX_2r"   r"   r#   +test_tfidf_vectorizer_with_fixed_vocabulary  s   


rN  c                  C   s   t  t ddt ddt ddt ttdttdttdtttd	tt ttdt tg} | D ]*}t	|}t
|}t||jksJJ | | ksTJ t|t|t q5d S )
Nr   r   T)r+  r_   rd   rV   )rc   r@   )r   r   r   r/   r   r   r)   r   pickledumpsloadstype	__class__
get_paramsr   r   )Z	instancesorigr!   copyr"   r"   r#   test_pickling_vectorizer  s.   


rX  factoryc                 C   sB   t  }| |}d}tt|}||}||}||ksJ dS )z_Tokenizers cannot be pickled
    https://github.com/scikit-learn/scikit-learn/issues/12833
    rB   N)r   rP  rR  rQ  )rY  vecfunctionr[   Zroundtripped_functionr;   r  r"   r"   r#   test_pickling_built_processors  s   r\  c                  C   s   t jd} t g d}tddD ],}t| j|ddd}t|d}t	t
|}|t |t t| |  qd S Nr   r  d   rq   F)sizer(   r   )r   randomRandomStatearrayr  r   choicer   rP  rR  rQ  r   r   r   r   )rngvocab_wordsxZ	vocab_setr   unpickled_cvr"   r"   r#   -test_countvectorizer_vocab_sets_when_pickling3  s   


rh  c                  C   s   t jd} t g d}tddD ];}t }| j|ddd}tddD ]}|||| < q$t|d}t	t
|}|t |t t| |  qd S r]  )r   r`  ra  rb  r  r   rc  r   rP  rR  rQ  r   r   r   r   )rd  re  rf  Z
vocab_dictr   yr   rg  r"   r"   r#   .test_countvectorizer_vocab_dicts_when_picklingO  s"   


rj  c                  C   s`   t  t} t | }t|}t|}t||j	ksJ t
||  ||   d S r   )r   r   r   r   r   rP  rQ  rR  rS  rT  r   r   )r   rV  r!   rW  r"   r"   r#   test_pickling_transformerl  s   

"rk  c                  C   sH   t  t} t | }t }|j|_t||  ||   d S r   )	r   r   r   r   r   r   r   r   r   )r   rV  rW  r"   r"   r#   test_transformer_idf_setteru  s
   "rl  c                  C   s   t dd} | t t | jdd}| j|_t|t | t  t | jdd}d}tj	t
|d | j|_W d    d S 1 sDw   Y  d S )NTr   r   r   Fz+`idf_` cannot be set when `user_idf=False`.r   )r   r   r   r   r   r   r   r   rl   rm   r   )rV  rW  r   r"   r"   r#   test_tfidf_vectorizer_setter}  s   


"rn  c                  C   sv   t dd} | t t | jdd}t| j}dg|d  }tt t	|d| W d    d S 1 s4w   Y  d S )NTr   rm  r   r`   r   )
r   r   r   r   r   r   rl   rm   r   setattr)r   rW  Zexpected_idf_lenZinvalid_idfr"   r"   r#   %test_tfidfvectorizer_invalid_idf_attr  s   


"rp  c                  C   sL   g d} t | d}tt |g  W d    d S 1 sw   Y  d S )N)r:   r   r   r:   r:   r   r   r   r"   r"   r#   test_non_unique_vocab  s
   
"rq  c                  C   sJ   d} t }dd }tj|| d |  W d    d S 1 sw   Y  d S )Nz?np.nan is an invalid document, expected byte or unicode string.c                  S   s   t  } | dtjdg d S )Nhello worldhello hello)r   r   r   nan)Zhvr"   r"   r#   func  s   z0test_hashingvectorizer_nan_in_docs.<locals>.funcr   )r   rl   rm   )r   	exceptionru  r"   r"   r#   "test_hashingvectorizer_nan_in_docs  s   "rw  c                  C   sd   t ddd d} | jsJ | ddg }t| g d | ddg }t| g d d S )NTF)r+  r   r   rr  rs  )r`   r`   r`   r   )r   r+  r   r   r   Zravelr   )r   r   r   r"   r"   r#   test_tfidfvectorizer_binary  s   
rx  c                  C   s(   t dd} | t t| j| jj d S )NTr   )r   r   r   r   r   r  )r   r"   r"   r#   test_tfidfvectorizer_export_idf  s   

ry  c                  C   s<   t dgd} t| }| t |t |j| jksJ d S )Nr   r   )r   r
   r   r   r   )Z
vect_vocabZvect_vocab_cloner"   r"   r#   test_vectorizer_vocab_clone  s
   

rz  c                 C   s   d}|  }t jt|d |d W d    n1 sw   Y  t jt|d |d W d    n1 s8w   Y  |ddg t jt|d |d W d    d S 1 s\w   Y  d S )NzBIterable over raw text documents expected, string object received.r   zhello world!	some textzsome other text)rl   rm   r   r   r   r   )r>   r   rZ  r"   r"   r#   &test_vectorizer_string_object_as_input  s   "r|  X_dtypec                 C   s2   t jdd| dd}t |}|j|jksJ d S N
    N  *   r  r9  )r	   randr   r   r  )r}  r   ZX_transr"   r"   r#   test_tfidf_transformer_type  s   r  zcsc_container, csr_containerc                 C   sZ   t jddtjdd}| |}||}t |}t |}t|| |j|jks+J d S r~  )r	   r  r   r/  r   r   r   r5  )Zcsc_containercsr_containerr   ZX_cscX_csrZX_trans_cscZX_trans_csrr"   r"   r#   test_tfidf_transformer_sparse  s   
r  z0vectorizer_dtype, output_dtype, warning_expectedTFc                 C   s   t g d}t| d}d}|r-tjt|d ||}W d    n1 s'w   Y  nt  t	dt ||}W d    n1 sGw   Y  |j
|ksSJ d S )N)numpyscipyZsklearnr  z'dtype' should be used.r   r   )r   rb  r   rl   r   r   r   r   r   r   r  )Zvectorizer_dtypeZoutput_dtypeZwarning_expectedr   r   Zwarning_msg_matchZX_idfr"   r"   r#   test_tfidf_vectorizer_type  s   


r  rZ  )ra   r`   rO  c                 C   s   | j }td| d}tjt|d | dg W d    n1 s$w   Y  tjt|d | dg W d    n1 sAw   Y  t| t	rktjt|d | 
dg W d    d S 1 sdw   Y  d S d S )NzInvalid value for ngram_range=z/ lower boundary larger than the upper boundary.r   zgood news everyone)rd   reescaperl   rm   r   r   r   r   r   r   )rZ  Zinvalid_ranger   r"   r"   r#   $test_vectorizers_invalid_ngram_range  s   

"r  c                 C   s&   |   }|  }|  }| |||S r   )r   build_tokenizerr   _check_stop_words_consistency)Z	estimatorr   tokenize
preprocessr"   r"   r#   r     s   r  c               	   C   s   d} d|  }t  t t fD ]1}|jg dd tjt|d |dg W d    n1 s0w   Y  |`t	|du s?J qt
  t
dt |dg W d    n1 s[w   Y  t	|d u shJ |jg d	d tjt|d |dg W d    d S 1 sw   Y  d S )
Nz\['and', 'll', 've'\]z}Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens %s not in stop_words.)you'veyouyou'llANDr   r   rr  Fr   )r  r  r  Zblahr  )r   r   r   r   rl   r   r   r   Z_stop_words_idr  r   r   r   )Zlstrr   rZ  r"   r"   r#   'test_vectorizer_stop_words_inconsistent'  s*   
"r  r  c                 C   s^   | dt jd}t j}|j||_|j||_dddd}t ||}||jjks-J dS )z
    Check that CountVectorizer._sort_features preserves the dtype of its sparse
    feature matrix.

    This test is skipped on 32bit platforms, see:
        https://github.com/scikit-learn/scikit-learn/pull/11295
    for more details.
    )rq   rq   r  r   r`   ra   )zscikit-learnrN   zgreat!N)r   int64indicesZastypeZindptrr   Z_sort_featuresr  )r  r   ZINDICES_DTYPEr   ZXsr"   r"   r#   7test_countvectorizer_sort_features_64bit_sparse_indicesB  s   r  	Estimatorc                 C   s   ddig}|  }t |du sJ | dd dgd}t |dks!J t |d u s)J || G d	d
 d
| }|dgd}t |dksDJ | dd dgd}t |du sUJ d S )Nr[   r{  Tc                 S      | d S Nr[   r"   rf  r"   r"   r#   <lambda>e      z?test_stop_word_validation_custom_preprocessor.<locals>.<lambda>and)rW   r   r   c                   @   s   e Zd Zdd ZdS )zFtest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimatorc                 S   s   dd S )Nc                 S   r  r  r"   r  r"   r"   r#   r  m  r  zktest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimator.build_preprocessor.<locals>.<lambda>r"   )selfr"   r"   r#   r   l  r.   zYtest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimator.build_preprocessorN)__name__
__module____qualname__r   r"   r"   r"   r#   CustomEstimatork  s    r  r   c                 S   s   t d| S )Nz\w{1,})r  compilefindallr6  r"   r"   r#   r  s  s    )rX   r   )r  r   )r  r  rZ  r  r"   r"   r#   -test_stop_word_validation_custom_preprocessor\  s   


r  zinput_type, err_type, err_msgfilenamer8   rP   z$'str' object has no attribute 'read'c                 C   sP   dg}t j||d | dd |d| W d    d S 1 s!w   Y  d S )N"this is text, not file or filenamer   c                 S   r*   r   r+   r  r"   r"   r#   r    r  z.test_callable_analyzer_error.<locals>.<lambda>rc   rQ   )rl   rm   r   )r  
input_typeZerr_typer   r  r"   r"   r#   test_callable_analyzer_errorx  s   "r  rc   c                 C   s
   t | dS )Nr)openr  r"   r"   r#   r    s   
 r  c                 C   r*   r   )readr  r"   r"   r#   r    r  r  c                 C   sL   dg}t ttf | ||d| W d    d S 1 sw   Y  d S )Nr  r  )rl   rm   FileNotFoundErrorAttributeErrorr   )r  rc   r  r  r"   r"   r#   &test_callable_analyzer_change_behavior  s   "r  c                 C   sd   dd }|  d}|d tjtdd ||dd|g W d    d S 1 s+w   Y  d S )	Nc                 S   s   t d)Ntesting)	Exceptionr  r"   r"   r#   rc     r.   z6test_callable_analyzer_reraise_error.<locals>.analyzerzfile.txtzsample content
r  r   rP   r  )joinwriterl   rm   r  r   )Ztmpdirr  rc   fr"   r"   r#   $test_callable_analyzer_reraise_error  s   

"r  zjstop_words, tokenizer, preprocessor, ngram_range, token_pattern,analyzer, unused_name, ovrd_name, ovrd_msgr  r  r;  rg   z'stop_words'
'analyzer'	!= 'word'c                 C   r*   r   r+   r    r"   r"   r#   r    r  z'tokenizer'c                 C   r*   r   r+   r    r"   r"   r#   r    r  \w+r]   'token_pattern'zis not Nonec                 C   r*   r   r   r    r"   r"   r#   r    r  c                 C   r*   r   r  r    r"   r"   r#   r    r  z'preprocessor'zis callabler_   c                 C   r*   r   r  r    r"   r"   r#   r    r  z'ngram_range')	NNNr;  r  rg   r  r  r  c
                 C   sl   t }
|  }|j||||||d d|||	f }tjt|d ||
 W d    d S 1 s/w   Y  d S )N)r   rX   rW   rd   r   rc   z-The parameter %s will not be used since %s %sr   )r   r   rl   r   r   r   )r>   r   rX   rW   rd   r   rc   Zunused_nameZ	ovrd_nameZovrd_msgr   r   r   r"   r"   r#   test_unused_parameters_warn  s$   Y"r  zVectorizer, Xr`   ra   )r   barri   )r   Zbazc                 C   s0   |  }t |dr
J || t |drJ d S )NZn_features_in_)r   r   )r>   r   r   r"   r"   r#   test_n_features_in  s   	
r  c                  C   s:   t dd} | ddgj}| ddgj}||ksJ d S )Nr`   r  ZhelloZworld)r   r   r   )rZ  Zvocab1Zvocab2r"   r"   r#   )test_tie_breaking_sample_order_invariance%  s   
r  c                  C   s.   t ddd} | dgj}|d dksJ d S )Ni@B )ra   ri   )r  rd   z22pcs efuturer   )r   r   r  )Zhashingr  r"   r"   r#   2test_nonnegative_hashing_vectorizer_result_indices.  s   r  c                 C   s   |  }t |dr
J dS )z0Check that vectorizers do not define set_output.Z
set_outputN)r   )r  r   r"   r"   r#   'test_vectorizers_do_not_have_set_output5  s   r  c                 C   s   t jddtjdd}| |}| }t |}|j|dd}t|| ||us*J |j|dd}||u s7J t	
t t|| W d   dS 1 sMw   Y  dS )	zJCheck the behaviour of TfidfTransformer.transform with the copy parameter.r  r  r  r  T)rW  FN)r	   r  r   r/  rW  r   r   r   r   rl   rm   AssertionError)r  r   r  ZX_csr_originalZtransformerZX_transformr"   r"   r#   test_tfidf_transformer_copy>  s   
"r  r  c                 C   s6   dd t dD }t| d|}|jj| ksJ dS )zCheck that `idf_` has the same dtype as the input data.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/30016
    c                 S   s   g | ]}t t qS r"   )struuiduuid4).0r  r"   r"   r#   
<listcomp>Z  s    z<test_tfidf_vectorizer_perserve_dtype_idf.<locals>.<listcomp>i r  N)r  r   r   r   r  )r  r   r   r"   r"   r#   (test_tfidf_vectorizer_perserve_dtype_idfS  s   r  c                  C   s   t  } |  }|jrJ dS )z7Test that HashingVectorizer has requires_fit=False tag.N)r   Z__sklearn_tags__Zrequires_fit)r   tagsr"   r"   r#   (test_hashing_vectorizer_requires_fit_tag_  s   r  c                  C   s.   t dd} ddg}| |}|jdksJ dS )z:Test that HashingVectorizer can transform without fitting.r  )r  zThis is testzAnother test)ra   r  N)r   r   r   )r   r   r  r"   r"   r#   -test_hashing_vectorizer_transform_without_fitf  s   

r  )rP  r  r  r   collectionsr   collections.abcr   	functoolsr   ior   	itertoolsr   r  r   rl   Znumpy.testingr   r   r  r	   Zsklearn.baser
   Zsklearn.feature_extraction.textr   r   r   r   r   r   r   r   Zsklearn.model_selectionr   r   r   Zsklearn.pipeliner   Zsklearn.svmr   Zsklearn.utils._testingr   r   r   Zsklearn.utils.fixesr   r   r   r   r@  r   r$   r)   r-   r/   r<   r=   markZparametrizer\   re   rp   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   Zxfailr   r   r  r	  r  r  r  r  r$  r'  r-  r0  r7  rI  rJ  rK  rM  rN  rX  rY   r   r  r\  rh  rj  rk  rl  rn  rp  rq  rw  rx  ry  rz  r|  r,  r/  r  r  Zint32r  r  r  r  r  r  r  r  r  r  paramr  r  r  r  r  r  r  r  r  r  r  r"   r"   r"   r#   <module>   s   (
	$
=

g&G
	
$'

	






	






J 
	


