"""Tokenization class for SigLIP model."""

import os
import re
import string
import warnings
from shutil import copyfile
from typing import TYPE_CHECKING, Any

import sentencepiece as spm

from ...tokenization_utils_base import AddedToken
from ...tokenization_utils_sentencepiece import SentencePieceBackend
from ...utils import logging, requires_backends
from ...utils.import_utils import requires


if TYPE_CHECKING:
    from ...tokenization_utils_base import TextInput

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

SPIECE_UNDERLINE = "▁"


@requires(backends=("sentencepiece",))
class SiglipTokenizer(SentencePieceBackend):
    """
    Construct a Siglip tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"</s>"`):
            The token used for padding, for example when batching sequences of different lengths.
        additional_special_tokens (`list[str]`, *optional*):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assumes nbest_size is infinite and samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
        model_max_length (`int`, *optional*, defaults to 64):
            The maximum length (in number of tokens) for model inputs.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
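
    Example (a minimal sketch; `./spiece.model` below is a hypothetical path to a local
    SentencePiece vocabulary file):

    ```python
    >>> from transformers import SiglipTokenizer

    >>> tokenizer = SiglipTokenizer("./spiece.model")
    >>> ids = tokenizer("a photo of 2 cats")["input_ids"]
    ```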
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="</s>",
        additional_special_tokens=None,
        sp_model_kwargs: dict[str, Any] | None = None,
        model_max_length=64,
        do_lower_case=True,
        **kwargs,
    ) -> None:
        requires_backends(self, "protobuf")

        # Wrap plain-string special tokens so that surrounding whitespace is stripped
        # and no extra normalization is applied to them.
        eos_token = (
            AddedToken(eos_token, rstrip=True, lstrip=True, normalized=False, special=True)
            if isinstance(eos_token, str)
            else eos_token
        )
        unk_token = (
            AddedToken(unk_token, rstrip=True, lstrip=True, normalized=False, special=True)
            if isinstance(unk_token, str)
            else unk_token
        )
        pad_token = (
            AddedToken(pad_token, rstrip=True, lstrip=True, normalized=False, special=True)
            if isinstance(pad_token, str)
            else pad_token
        )

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        self.do_lower_case = do_lower_case

        super().__init__(
            vocab_file=vocab_file,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            model_max_length=model_max_length,
            do_lower_case=do_lower_case,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return self.sp_model.get_piece_size()
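
    # `vocab_size` only counts the pieces inside the SentencePiece model; tokens added at
    # runtime live in `added_tokens_encoder` and are merged back in by `get_vocab` below.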
    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def get_special_tokens_mask(
        self, token_ids_0: list[int], token_ids_1: list[int] | None = None, already_has_special_tokens: bool = False
    ) -> list[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
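
        Example (illustrative, derived from the logic below): for `token_ids_0 = [31, 7, 42]`
        and no `token_ids_1`, the method returns `[0, 0, 0, 1]`, i.e. zeros for the sequence
        tokens plus a final 1 for the `</s>` token that `build_inputs_with_special_tokens`
        appends.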
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # normal case: some special tokens
        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + [1]
        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]

    def _add_eos_if_not_present(self, token_ids: list[int]) -> list[int]:
        """Do not add eos again if user already added it."""
        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
            warnings.warn(
                f"This sequence already has {self.eos_token}. In future versions this behavior may lead to "
                "duplicated eos tokens being added."
            )
            return token_ids
        else:
            return token_ids + [self.eos_token_id]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: list[int], token_ids_1: list[int] | None = None
    ) -> list[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. SigLIP does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.
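
        Example (illustrative): for `token_ids_0 = [31, 7]` and `token_ids_1 = [18]`, the
        result is `[0, 0, 0, 0, 0]`: one zero per sequence token plus one per appended `</s>`.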
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    def build_inputs_with_special_tokens(
        self, token_ids_0: list[int], token_ids_1: list[int] | None = None
    ) -> list[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
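
        Example (illustrative): with `token_ids_0 = [31, 7]` and no second sequence, the
        method returns `[31, 7, self.eos_token_id]`; an eos already present at the end is
        not duplicated (see `_add_eos_if_not_present`).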
        """
        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
        if token_ids_1 is None:
            return token_ids_0
        else:
            token_ids_1 = self._add_eos_if_not_present(token_ids_1)
            return token_ids_0 + token_ids_1

    def __getstate__(self):
        # The SentencePiece processor is not picklable; drop it here and rebuild it in
        # `__setstate__` from `self.vocab_file`.
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    def remove_punctuation(self, text: str) -> str:
        return text.translate(str.maketrans("", "", string.punctuation))
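
    # e.g. remove_punctuation("hello, world!") returns "hello world": every character in
    # `string.punctuation` is stripped, nothing else is touched.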
    def canonicalize_text(self, text, *, keep_punctuation_exact_string=None):
        """Returns canonicalized `text` (punctuation removed).

        Args:
            text (`str`):
                String to be canonicalized.
            keep_punctuation_exact_string (`str`, *optional*):
                If provided, then this exact string is kept. For example providing '{}' will keep any occurrences of
                '{}' (but will still remove '{' and '}' that appear separately).
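
        Example (illustrative, assuming `do_lower_case=True`):
        `canonicalize_text("Hello, {} World!!", keep_punctuation_exact_string="{}")`
        returns `"hello {} world"`.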
        """
        if self.do_lower_case:
            text = text.lower()

        if keep_punctuation_exact_string:
            text = keep_punctuation_exact_string.join(
                self.remove_punctuation(part) for part in text.split(keep_punctuation_exact_string)
            )
        else:
            text = self.remove_punctuation(text)
        text = re.sub(r"\s+", " ", text)
        text = text.strip()

        return text

    def tokenize(self, text: "TextInput", add_special_tokens=False, **kwargs) -> list[str]:
        """
        Converts a string to a list of tokens.
        """
        tokens = super().tokenize(SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " "), **kwargs)

        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
            tokens = tokens[1:]
        return tokens

    @property
    def unk_token_length(self):
        return len(self.sp_model.encode(str(self.unk_token)))

    def _tokenize(self, text, **kwargs):
        """
        Returns a tokenized string.

        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
        SPIECE_UNDERLINE.

        For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type=str)` will give `['H', 'e', 'y']`
        instead of `['▁He', 'y']`.

        Thus we always encode `f"{unk_token}text"` and strip the `unk_token`. Here is an example with
        `unk_token = "<unk>"` and `unk_token_length = 4`: `self.sp_model.encode("<unk> Hey", out_type=str)[4:]`.
        """
        text = self.canonicalize_text(text, keep_punctuation_exact_string=None)
        tokens = self.sp_model.encode(text, out_type=str)

        # 1. Encode string + prefix ex: "<unk> Hey"
        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
        # 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
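        # Special tokens such as "</s>" are not part of the SentencePiece model, so they
        # cannot be decoded by it. Whenever one is encountered, decode the sub-tokens
        # accumulated so far, splice the special token in verbatim, and start a new run.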
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for token in tokens:
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string.strip()

    def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # Copy the existing .spm file if there is one; otherwise serialize the in-memory
        # SentencePiece model.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)


__all__ = ["SiglipTokenizer"]