o
    Yi7                     @   s   d Z ddlZddlmZmZ ddlmZmZ ddlmZ ddl	m
Z
 ddl	mZ dd	l	mZ dd
l	mZ edZg dZG dd dZdedejjjdefddZdedefddZG dd dZdS )z)[Experimental] Text Only Local Tokenizer.    N)AnyIterable)OptionalUnion)sentencepiece_model_pb2   )_common)_local_tokenizer_loader)_transformers)typeszgoogle_genai.local_tokenizer)_parse_hex_byte_token_str_to_bytesLocalTokenizer_TextsAccumulatorc                   @   s&  e Zd ZdZd)ddZdee fddZdeej	 ddfd	d
Z
dej	ddfddZdejddfddZdejdejfddZdeej ddfddZdeej ddfddZdejddfddZdejdejfddZd ejdejfd!d"Zd#eeef deeef fd$d%Zd&edefd'd(ZdS )*r   a  Accumulates countable texts from `Content` and `Tool` objects.

  This class is responsible for traversing complex `Content` and `Tool`
  objects and extracting all the text content that should be included when
  calculating token counts.

  A key feature of this class is its ability to detect unsupported fields in
  `Content` objects. If a user provides a `Content` object with fields that
  this local tokenizer doesn't recognize (e.g., new fields added in a future
  API update), this class will log a warning.

  The detection mechanism for `Content` objects works by recursively building
  a "counted" version of the input object. This "counted" object only
  contains the data that was successfully processed and added to the text
  list for tokenization. After traversing the input, the original `Content`
  object is compared to the "counted" object. If they don't match, it
  signifies the presence of unsupported fields, and a warning is logged.
  returnNc                 C   s
   g | _ d S N_textsself r   V/home/kim/smarthome/.venv/lib/python3.10/site-packages/google/genai/local_tokenizer.py__init__;   s   
z_TextsAccumulator.__init__c                 C   s   | j S r   r   r   r   r   r   	get_texts>   s   z_TextsAccumulator.get_textscontentsc                 C      |D ]}|  | qd S r   )add_content)r   r   contentr   r   r   add_contentsA      z_TextsAccumulator.add_contentsr   c                 C   s  t jg |jd}|jrg|jD ]X}|jd usJ t  }|jd us%|jd ur)td|jd ur2|j|_|j	d urA| 
|j	 |j	|_	|jd urP| |j |j|_|jd ur`|j|_| j|j |j| q|jdd|jddkrtd| d| d d S d S )N)partsrolez6LocalTokenizers do not support non-text content types.T)Zexclude_nonezHContent contains unsupported types for token counting. Supported fields z. Got .)r   Contentr!   r    ZPart	file_dataZinline_data
ValueErrorZvideo_metadatafunction_calladd_function_callfunction_responseadd_function_responsetextr   appendZ
model_dumploggerwarning)r   r   Zcounted_contentpartZcounted_partr   r   r   r   E   s@   




z_TextsAccumulator.add_contentr&   c                 C   sB   |j r
| j|j  tj|j d}|jr| |j}||_dS dS )zProcesses a function call and adds relevant text to the accumulator.

    Args:
        function_call: The function call to process.
    )nameN)r/   r   r+   r   FunctionCallargs_dict_traverse)r   r&   Zcounted_function_callZcounted_argsr   r   r   r'   d   s   
z#_TextsAccumulator.add_function_calltoolc                 C   sH   t jg d}|jr"|jD ]}| |}|jd u rg |_|j| q|S )N)function_declarations)r   Toolr4   _function_declaration_traverser+   )r   r3   Zcounted_toolfunction_declarationcounted_function_declarationr   r   r   add_toolq   s   

z_TextsAccumulator.add_tooltoolsc                 C   r   r   )r9   )r   r:   r3   r   r   r   	add_tools~   r   z_TextsAccumulator.add_toolsfunction_responsesc                 C   r   r   )r)   )r   r<   r(   r   r   r   add_function_responses   s   z(_TextsAccumulator.add_function_responsesr(   c                 C   sD   t  }|jr| j|j |j|_|jr | |j}||_d S d S r   )r   FunctionResponser/   r   r+   responser2   )r   r(   Zcounted_function_responsecounted_responser   r   r   r)      s   
z'_TextsAccumulator.add_function_responser7   c                 C   st   t  }|jr| j|j |j|_|jr | j|j |j|_|jr,| |j}||_|jr8| |j}||_|S r   )	r   FunctionDeclarationr/   r   r+   description
parameters
add_schemar?   )r   r7   r8   Zcounted_parametersr@   r   r   r   r6      s   z0_TextsAccumulator._function_declaration_traverseschemac           	      C   s*  t  }|jr|j|_|jr|j|_|jdur|j|_|jr)| j|j |j|_|jr7| j|j |j|_|j	rE| j
|j	 |j	|_	|jrS| j
|j |j|_|jrZ|j|_|jrf| |j}||_|jri }|j D ]\}}| j| | |}|||< qp||_|jr| |j}||_|S )zProcesses a schema and adds relevant text to the accumulator.

    Args:
        schema: The schema to process.

    Returns:
        The new schema object with only countable fields.
    N)r   Schematypetitledefaultformatr   r+   rB   enumextendrequiredZproperty_orderingitemsrD   
propertiesZexample_any_traverse)	r   rE   Zcounted_schemaZcounted_schema_itemsdkeyvalueZcounted_valueZcounted_schema_exampler   r   r   rD      sF   	


z_TextsAccumulator.add_schemarQ   c                 C   s<   i }| j t|  | D ]\}}| |||< q|S )zProcesses a dict and adds relevant text to the accumulator.

    Args:
        d: The dict to process.

    Returns:
        The new dict object with only countable fields.
    )r   rL   listkeysrN   rP   )r   rQ   Zcounted_dictrR   valr   r   r   r2      s
   	z _TextsAccumulator._dict_traverserS   c                    sN   t |tr j| |S t |tr |S t |tr% fdd|D S |S )zProcesses a value and adds relevant text to the accumulator.

    Args:
        value: The value to process.

    Returns:
        The new value with only countable fields.
    c                    s   g | ]}  |qS r   )rP   ).0itemr   r   r   
<listcomp>   s    z3_TextsAccumulator._any_traverse.<locals>.<listcomp>)
isinstancestrr   r+   dictr2   rT   )r   rS   r   r   r   rP      s   
	


z_TextsAccumulator._any_traverse)r   N)__name__
__module____qualname____doc__r   r   r[   r   r   r#   r   r   r0   r'   r5   r9   r;   r>   r=   r)   rA   r6   rF   rD   r\   r   r2   rP   r   r   r   r   r   '   s6    



"-r   tokenrG   r   c                 C   s4   |t jjjjkrt| jdddS | dddS )Nr   big)length	byteorderu   ▁ zutf-8)	r   
ModelProtoSentencePieceTypeZBYTEr   to_bytesreplaceencode)ra   rG   r   r   r   r      s   r   c                 C   s   t | dkrtd|  | dr| dstd|  zt| dd d}W n ty7   td	|  w |d
krCtd|  |S )zParses a hex byte string of the form '<0xXX>' and returns the integer value.

  Raises ValueError if the input is malformed or the byte value is invalid.
     zInvalid byte length: z<0x>zInvalid byte format:          zInvalid hex value:    zByte value out of range: )lenr%   
startswithendswithint)ra   rV   r   r   r   r      s   r   c                   @   s   e Zd ZdZdefddZeddddee	j
e	jf d	ee	j d
e	jfddZeddee	j
e	jf d
e	jfddZdS )r   a  [Experimental] Text Only Local Tokenizer.

  This class provides a local tokenizer for text only token counting.

  LIMITATIONS:
  - Only supports text based tokenization and no multimodal tokenization.
  - Forward compatibility depends on the open-source tokenizer models for future
  Gemini versions.
  - For token counting of tools and response schemas, the `LocalTokenizer` only
  supports `types.Tool` and `types.Schema` objects. Python functions or Pydantic
  models cannot be passed directly.
  
model_namec                 C   s,   t || _t | j| _t | j| _d S r   )loaderZget_tokenizer_nameZ_tokenizer_nameZload_model_proto_model_protoZget_sentencepiece
_tokenizer)r   rv   r   r   r   r   $  s   zLocalTokenizer.__init__zThe SDK's local tokenizer implementation is experimental and may change in the future. It only supports text based tokenization.N)configr   rz   r   c                C   s   t |}t }tj|pi }|| |jr||j |j	r,|j	j
r,||j	j
 |jr9|t |jg | jt| }tjtdd |D dS )a  Counts the number of tokens in a given text.

    Args:
      contents: The contents to tokenize.
      config: The configuration for counting tokens.

    Returns:
      A `CountTokensResult` containing the total number of tokens.

    Usage:

    .. code-block:: python

      from google import genai
      tokenizer = genai.LocalTokenizer(model_name='gemini-2.0-flash-001')
      result = tokenizer.count_tokens("What is your name?")
      print(result)
      # total_tokens=5
    c                 s   s    | ]}t |V  qd S r   )rr   )rW   tokensr   r   r   	<genexpr>R  s    z.LocalTokenizer.count_tokens.<locals>.<genexpr>)Ztotal_tokens)t
t_contentsr   r   ZCountTokensConfigZmodel_validater   r:   r;   Zgeneration_configZresponse_schemarD   Zsystem_instructionry   rk   rT   r   CountTokensResultsum)r   r   rz   processed_contentstext_accumulatorZtokens_listr   r   r   count_tokens)  s   

zLocalTokenizer.count_tokensc                    s   t |}t }|D ]}|| q
 j| }g }|D ]}|jr/|jD ]}||j	 q&qg }t
||D ]\}	}
|tjdd |	jD  fdd|	jD |
d q7tj|dS )a,  Computes the tokens ids and string pieces in the input.

    Args:
      contents: The contents to tokenize.

    Returns:
      A `ComputeTokensResult` containing the token information.

    Usage:

    .. code-block:: python

      from google import genai
      tokenizer = genai.LocalTokenizer(model_name='gemini-2.0-flash-001')
      result = tokenizer.compute_tokens("What is your name?")
      print(result)
      # tokens_info=[TokensInfo(token_ids=[279, 329, 1313, 2508, 13], tokens=[b' What', b' is', b' your', b' name', b'?'], role='user')]
    c                 S   s   g | ]}|j qS r   )idrW   piecer   r   r   rY     s    z1LocalTokenizer.compute_tokens.<locals>.<listcomp>c                    s$   g | ]}t |j jj|j jqS r   )r   r   rx   piecesr   rG   r   r   r   r   rY     s    )Z	token_idsr{   r!   )Ztokens_info)r}   r~   r   r   ry   ZEncodeAsImmutableProtor   r    r+   r!   zipr   Z
TokensInfor   ComputeTokensResult)r   r   r   r   r   Ztokens_protosZroles_Ztoken_infosZtokens_protor!   r   r   r   compute_tokensU  s2   


zLocalTokenizer.compute_tokens)r]   r^   r_   r`   r[   r   r   Zexperimental_warningr   r   ZContentListUnionZContentListUnionDictr   ZCountTokensConfigOrDictr   r   r   r   r   r   r   r   r     s,    (r   )r`   loggingtypingr   r   r   r   Zsentencepiecer    r   r	   rw   r
   r}   r   	getLoggerr,   __all__r   r[   rf   rg   rh   bytesr   ru   r   r   r   r   r   r   <module>   s,   
 Q
	