o
    i>                     @   s   d dl m Z  d dlmZmZ d dlmZ d dlmZ d dlZ	d dl
mZ d dlmZ ddlmZmZmZ dd	lmZ dd
lmZ G dd deeZdS )    )array)IterableMapping)Number)
itemgetterN)metadata_routing   )BaseEstimatorTransformerMixin_fit_context)check_array)check_is_fittedc                       s   e Zd ZU dZdejiZdegdgdgdZe	e
d< ejddddd	d
ZdddddddZeddd!ddZdd Zeddd!ddZe	fddZdd Zd!ddZd"ddZ fdd Z  ZS )#DictVectorizera  Transforms lists of feature-value mappings to vectors.

    This transformer turns lists of mappings (dict-like objects) of feature
    names to feature values into Numpy arrays or scipy.sparse matrices for use
    with scikit-learn estimators.

    When feature values are strings, this transformer will do a binary one-hot
    (aka one-of-K) coding: one boolean-valued feature is constructed for each
    of the possible string values that the feature can take on. For instance,
    a feature "f" that can take on the values "ham" and "spam" will become two
    features in the output, one signifying "f=ham", the other "f=spam".

    If a feature value is a sequence or set of strings, this transformer
    will iterate over the values and will count the occurrences of each string
    value.

    However, note that this transformer will only do a binary one-hot encoding
    when feature values are of type string. If categorical features are
    represented as numeric values such as int or iterables of strings, the
    DictVectorizer can be followed by
    :class:`~sklearn.preprocessing.OneHotEncoder` to complete
    binary one-hot encoding.

    Features that do not occur in a sample (mapping) will have a zero value
    in the resulting array/matrix.

    For an efficiency comparison of the different feature extractors, see
    :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`.

    Read more in the :ref:`User Guide <dict_feature_extraction>`.

    Parameters
    ----------
    dtype : dtype, default=np.float64
        The type of feature values. Passed to Numpy array/scipy.sparse matrix
        constructors as the dtype argument.
    separator : str, default="="
        Separator string used when constructing new features for one-hot
        coding.
    sparse : bool, default=True
        Whether transform should produce scipy.sparse matrices.
    sort : bool, default=True
        Whether ``feature_names_`` and ``vocabulary_`` should be
        sorted when fitting.

    Attributes
    ----------
    vocabulary_ : dict
        A dictionary mapping feature names to feature indices.

    feature_names_ : list
        A list of length n_features containing the feature names (e.g., "f=ham"
        and "f=spam").

    See Also
    --------
    FeatureHasher : Performs vectorization using only a hash function.
    sklearn.preprocessing.OrdinalEncoder : Handles nominal/categorical
        features encoded as columns of arbitrary data types.

    Examples
    --------
    >>> from sklearn.feature_extraction import DictVectorizer
    >>> v = DictVectorizer(sparse=False)
    >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
    >>> X = v.fit_transform(D)
    >>> X
    array([[2., 0., 1.],
           [0., 1., 3.]])
    >>> v.inverse_transform(X) == [{'bar': 2.0, 'foo': 1.0},
    ...                            {'baz': 1.0, 'foo': 3.0}]
    True
    >>> v.transform({'foo': 4, 'unseen_feature': 3})
    array([[0., 0., 4.]])
    	dict_typeZno_validationbooleandtype	separatorsparsesort_parameter_constraints=Tc                C   s   || _ || _|| _|| _d S Nr   )selfr   r   r   r    r   e/home/kim/smarthome/.venv/lib/python3.10/site-packages/sklearn/feature_extraction/_dict_vectorizer.py__init__j   s   
zDictVectorizer.__init__FNfittingtransformingindicesvaluesc                C   s   |D ]B}	t |	trd|| j|	f }
d}	n
tdt|	 d|r/|
|vr/t|||
< ||
 |rD|
|v rD|||
  || |	 qdS )z)Add feature names for iterable of strings%s%s%s   zUnsupported type z; in iterable value. Only iterables of string are supported.N)
isinstancestrr   	TypeErrortypelenappendr   )r   fvfeature_namesvocabr   r   r    r!   Zvvfeature_namer   r   r   _add_iterable_elementp   s   

z$DictVectorizer._add_iterable_element)Zprefer_skip_nested_validationc           	   
   C   s   g }i }|D ]\}|  D ]U\}}t|trd|| j|f }n0t|ts'|du r*|}n$t|tr?tdt| d| d| dt|trNd}| 	|||| |dura||vrat
|||< || qq| jrs|  dd t|D }|| _|| _| S )	a)  Learn a list of feature name -> indices mappings.

        Parameters
        ----------
        X : Mapping or iterable over Mappings
            Dict(s) or Mapping(s) from feature names (arbitrary Python
            objects) to feature values (strings or convertible to dtype).

            .. versionchanged:: 0.24
               Accepts multiple string values for one categorical feature.

        y : (ignored)
            Ignored parameter.

        Returns
        -------
        self : object
            DictVectorizer class instance.
        r"   NzUnsupported value type  for : z$.
Mapping objects are not supported.c                 S   s   i | ]\}}||qS r   r   ).0ir*   r   r   r   
<dictcomp>   s    z&DictVectorizer.fit.<locals>.<dictcomp>)itemsr$   r%   r   r   r   r&   r'   r   r/   r(   r)   r   	enumeratefeature_names_vocabulary_)	r   Xyr,   r-   xr*   r+   r.   r   r   r   fit   s>   



zDictVectorizer.fitc                 C   sN  t djdksJ d| j}|rg }i }n| j}| j}d}t|tr%|gn|}t d}dg}g }	|D ]}
|
 D ]w\}}t|trLd|| j	|f }d}n;t|t
sU|d u rX|}n/t|tsrt|trrd }| j||||||||	d ntd	t| d
| d| dt| d	|d ur|r||vrt|||< || ||v r|||  |	| | q8|t| q2t|dkrtdtj|tjd}t|d t|f}tj|	||f||d}|r| jr|  tjt|tjd}t|D ]\}}|| ||< |||< q|d d |f }| jr|  n| }|r%|| _|| _|S )Nr3      zsizeof(int) != 4 on your platform; please report this at https://github.com/scikit-learn/scikit-learn/issues and include the output from platform.platform() in your bug reportTr   r"   r#   r   zUnsupported value Type r0   r1   z.
z objects are not supported.zSample sequence X is empty.r   )shaper   )r   itemsizer   r7   r8   r$   r   r5   r%   r   r   r   r/   r&   r'   r(   r)   
ValueErrornpZ
frombufferZintcspZ
csr_matrixr   emptyZint32r6   r   Zsort_indicesZtoarray)r   r9   r   r   r,   r-   r   r    Zindptrr!   r;   r*   r+   r.   r?   Zresult_matrix	map_indexnew_valr   r   r   
_transform   s   



zDictVectorizer._transformc                 C   s   | j |ddS )a  Learn a list of feature name -> indices mappings and transform X.

        Like fit(X) followed by transform(X), but does not require
        materializing X in memory.

        Parameters
        ----------
        X : Mapping or iterable over Mappings
            Dict(s) or Mapping(s) from feature names (arbitrary Python
            objects) to feature values (strings or convertible to dtype).

            .. versionchanged:: 0.24
               Accepts multiple string values for one categorical feature.

        y : (ignored)
            Ignored parameter.

        Returns
        -------
        Xa : {array, sparse matrix}
            Feature vectors; always 2-d.
        Tr   )rG   )r   r9   r:   r   r   r   fit_transform(  s   zDictVectorizer.fit_transformc           
         s   t | d t|ddgd}|jd }| j} fddt|D }t|r>t|  D ]\}}|||f || || < q+|S t	|D ]!\}}t	||ddf D ]\}}	|	dkrb|||f ||| < qPqB|S )	aW  Transform array or sparse matrix X back to feature mappings.

        X must have been produced by this DictVectorizer's transform or
        fit_transform method; it may only have passed through transformers
        that preserve the number of features and their order.

        In the case of one-hot/one-of-K coding, the constructed feature
        names and values are returned rather than the original ones.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Sample matrix.
        dict_type : type, default=dict
            Constructor for feature mappings. Must conform to the
            collections.Mapping API.

        Returns
        -------
        X_original : list of dict_type objects of shape (n_samples,)
            Feature mappings for the samples in X.
        r7   ZcsrZcsc)Zaccept_sparser   c                    s   g | ]}  qS r   r   )r2   _r   r   r   
<listcomp>`  s    z4DictVectorizer.inverse_transform.<locals>.<listcomp>N)
r   r   r?   r7   rangerC   issparsezipZnonzeror6   )
r   r9   r   Z	n_samplesnamesZdictsr3   jdr+   r   rK   r   inverse_transformB  s    


z DictVectorizer.inverse_transformc                 C   s   t | ddg | j|ddS )a  Transform feature->value dicts to array or sparse matrix.

        Named features not encountered during fit or fit_transform will be
        silently ignored.

        Parameters
        ----------
        X : Mapping or iterable over Mappings of shape (n_samples,)
            Dict(s) or Mapping(s) from feature names (arbitrary Python
            objects) to feature values (strings or convertible to dtype).

        Returns
        -------
        Xa : {array, sparse matrix}
            Feature vectors; always 2-d.
        r7   r8   FrH   )r   rG   )r   r9   r   r   r   	transformm  s   zDictVectorizer.transformc                 C   sD   t | d tdd | jD rdd | jD }n| j}tj|tdS )a^  Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Not used, present here for API consistency by convention.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        r7   c                 s   s    | ]	}t |t V  qd S r   )r$   r%   r2   namer   r   r   	<genexpr>  s    z7DictVectorizer.get_feature_names_out.<locals>.<genexpr>c                 S   s   g | ]}t |qS r   )r%   rU   r   r   r   rL     s    z8DictVectorizer.get_feature_names_out.<locals>.<listcomp>r>   )r   anyr7   rB   Zasarrayobject)r   Zinput_featuresr,   r   r   r   get_feature_names_out  s
   
z$DictVectorizer.get_feature_names_outc                 C   sj   t | d |st|d }| j}i }|D ]
}t|||| < q|| _dd t| tddD | _| S )a=  Restrict the features to those in support using feature selection.

        This function modifies the estimator in-place.

        Parameters
        ----------
        support : array-like
            Boolean mask or list of indices (as returned by the get_support
            member of feature selectors).
        indices : bool, default=False
            Whether support is a list of indices.

        Returns
        -------
        self : object
            DictVectorizer class instance.

        Examples
        --------
        >>> from sklearn.feature_extraction import DictVectorizer
        >>> from sklearn.feature_selection import SelectKBest, chi2
        >>> v = DictVectorizer()
        >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
        >>> X = v.fit_transform(D)
        >>> support = SelectKBest(chi2, k=2).fit(X, [0, 1])
        >>> v.get_feature_names_out()
        array(['bar', 'baz', 'foo'], ...)
        >>> v.restrict(support.get_support())
        DictVectorizer()
        >>> v.get_feature_names_out()
        array(['bar', 'foo'], ...)
        r7   r   c                 S   s   g | ]\}}|qS r   r   )r2   r*   r3   r   r   r   rL     s    z+DictVectorizer.restrict.<locals>.<listcomp>r#   )key)	r   rB   wherer7   r(   r8   sortedr5   r   )r   Zsupportr    rP   Z	new_vocabr3   r   r   r   restrict  s   
!zDictVectorizer.restrictc                    s   t   }d|j_d|j_|S )NTF)super__sklearn_tags__Z
input_tagsdictZtwo_d_array)r   tags	__class__r   r   r`     s   
zDictVectorizer.__sklearn_tags__r   )F)__name__
__module____qualname____doc__r   ZUNUSEDZ4_DictVectorizer__metadata_request__inverse_transformr%   r   ra   __annotations__rB   Zfloat64r   r/   r   r<   rG   rI   rS   rT   rZ   r^   r`   __classcell__r   r   rc   r   r      s0   
 
M5c+

2r   )r   collections.abcr   r   numbersr   operatorr   numpyrB   Zscipy.sparser   rC   Zsklearn.utilsr   baser	   r
   r   utilsr   Zutils.validationr   r   r   r   r   r   <module>   s   