
    Z j,                        % S r SSKJrJr  SSKJr  \R                  " \5      rSr	Sr
SrSrSrS	rS
r\S\S\S\S\
S\S0r\\\4   \S'   \R+                  5        V Vs0 s H  u  pX_M	     snn r\\\4   \S'    " S S\5      rS/rgs  snn f )z Tokenization classes for CANINE.   )
AddedTokenPreTrainedTokenizer)loggingi       i   i  i  i  i  z[CLS]z[SEP]z[BOS]z[MASK]z[PAD]z
[RESERVED]SPECIAL_CODEPOINTSSPECIAL_CODEPOINTS_BY_NAMEc                      ^  \ rS rSrSr/ SQr\" \5      \" \5      \" \5      \" \5      \" \	5      \" \
5      SS4U 4S jjr\S\4S j5       rS	 rS
\S\\   4S jrS\S\4S jrS\S\4S jrS rSrU =r$ )CanineTokenizer7   a  
Construct a CANINE tokenizer (i.e. a character splitter). It turns text into a sequence of characters, and then
converts each character into its Unicode code point.

[`CanineTokenizer`] inherits from [`PreTrainedTokenizer`].

Refer to superclass [`PreTrainedTokenizer`] for usage examples and documentation concerning parameters.

Args:
    model_max_length (`int`, *optional*, defaults to 2048):
            The maximum sentence length the model accepts.
)	input_idsattention_masktoken_type_idsFi   c	                   > [        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn0 U l        [        R                  5        H  u  pXR                  U'   M     U R                  R                  5        VV
s0 s H  u  pX_M	     sn
nU l        [        U l        [        U R                  5      U l
        [        TU ]0  " SUUUUUUUUSSSS.U	D6  g s  sn
nf )NF)lstriprstripT	all_zeroscls_sep)	bos_token	eos_token	sep_token	cls_token	pad_token
mask_tokenadd_prefix_spacemodel_max_lengthtoken_type_ids_pattern%token_type_ids_include_special_tokensspecial_tokens_pattern )
isinstancestrr   _special_codepointsr   items_special_codepoint_stringsUNICODE_VOCAB_SIZE_unicode_vocab_sizelen_num_special_tokenssuper__init__)selfr   r   r   r   r   r   r   r   kwargs	codepointname	__class__s               /root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/canine/tokenization_canine.pyr*   CanineTokenizer.__init__G   s    JTT]_bIcIcJyuEir	IST]_bIcIcJyuEir	IST]_bIcIcJyuEir	IST]_bIcIcJyuEir	IST]_bIcIcJyuEir	 KUU_adJeJeZ
4Fku
 46 1779OI-6$$T*  :
 483K3K3Q3Q3S;
3SIO3S;
' $6 #&t'?'?#@  	
!--#.26#,	
 	
;
s   E6returnc                     U R                   $ N)r&   )r+   s    r0   
vocab_sizeCanineTokenizer.vocab_sizex   s    '''    c                     [        U R                  5       Vs0 s H  n[        U5      U_M     nnUR                  U R                  5        U$ s  snf r4   )ranger5   chrupdateadded_tokens_encoder)r+   ivocabs      r0   	get_vocabCanineTokenizer.get_vocab|   sE    $)$//$:;$:qQ$:;T../ <s   Atextc                     [        U5      $ )z5Tokenize a string (i.e. perform character splitting).)list)r+   rA   s     r0   	_tokenizeCanineTokenizer._tokenize   s    Dzr7   tokenc                 T     [        U5      $ ! [         a    [        SU S35      ef = f)zaConverts a token (i.e. a Unicode character) in an id (i.e. its integer Unicode code point value).zinvalid token: '')ord	TypeError
ValueError)r+   rF   s     r0   _convert_token_to_id$CanineTokenizer._convert_token_to_id   s5    	:u: 	:/wa899	:s   
 'indexc                 x     U[         ;   a	  [         U   $ [        U5      $ ! [         a    [        SU 35      ef = f)z
Converts a Unicode code point (integer) in a token (str). In case it's a special code point, convert to
human-readable format.
zinvalid id: )r   r:   rJ   rK   )r+   rN   s     r0   _convert_id_to_token$CanineTokenizer._convert_id_to_token   sF    
	5**)%00u: 	5|E7344	5s     
  9c                 $    SR                  U5      $ )N )join)r+   tokenss     r0   convert_tokens_to_string(CanineTokenizer.convert_tokens_to_string   s    wwvr7   )r(   r$   r"   r&   )__name__
__module____qualname____firstlineno____doc__model_input_namesr:   CLSSEPPADMASKr*   propertyintr5   r?   r!   rC   rD   rL   rP   rV   __static_attributes____classcell__)r/   s   @r0   r
   r
   7   s     J c(c(c(c(c(t9/
b (C ( (
c d3i :# :# :
5# 
5# 
5 r7   r
   N)r\   tokenization_pythonr   r   utilsr   
get_loggerrX   loggerr%   r`   r^   r_   BOSra   RESERVEDr   dictrc   r!   __annotations__r#   r   r
   __all__)r-   r.   s   00r0   <module>ro      s    ' B  
		H	%    (l& DcN   VhUmUmUo-pUo/)doUo-p DcN pb) bJ 
Q .qs   B