
    Z jR                         S r SSKJrJrJrJrJrJr  SSKJ	r	  SSK
Jr  SSKJr  \R                  " \5      rSSS	S
.r " S S\5      rS/rg)zTokenization classes for CLIP.    )Regex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPE   )TokenizersBackend)loggingz
vocab.jsonz
merges.txtztokenizer.json)
vocab_filemerges_filetokenizer_filec                      ^  \ rS rSrSr\rSS/r\r	      SS\
\\
\4   -  S-  S\
\\
   -  S-  S\
S	\
S
\
S\
4U 4S jjjrS rSrU =r$ )CLIPTokenizer   a*  
Construct a CLIP tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
Byte-Pair-Encoding.

This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.

Args:
    vocab (`str`, `dict` or `list`, *optional*):
        Vocabulary dict to use for the tokenizer.
    merges (`str` or `list`, *optional*):
        Merges list to use for the BPE tokenizer.
    unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
        The beginning of sequence token.
    eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
        The end of sequence token.
    pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
        The token used for padding, for example when batching sequences of different lengths.
	input_idsattention_maskNvocabmerges	unk_token	bos_token	eos_token	pad_tokenc                   > Ub  UO"[        U5      S[        U5      S[        U5      S0nU=(       d    / U l        [        [        UU R                  S SSS[        U5      S95      U l        [
        R                  " [
        R                  " 5       [
        R                  " [        S5      S	5      [
        R                  " 5       /5      U R                  l        [        R                  " [        R                  " [        S
5      SSS9[        R                  " SS9/5      U R                  l        [         R                  " 5       U R                  l        [$        T	U ]L  " SUUUUS.UD6  [(        R*                  " [        U5      U R,                  4[        U5      U R.                  4SSS9U R                  l        U R3                  5         g )Nr          z</w>F)r   r   dropoutcontinuing_subword_prefixend_of_word_suffixfuse_unkr   z\s+ z[<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedT)behaviorinvert)add_prefix_space)r   r   r   r   )sepclsr'   trim_offsets )str_mergesr   r	   
_tokenizerr   SequenceNFCReplacer   	Lowercase
normalizerr   Split	ByteLevelpre_tokenizerr   decodersuper__init__r   RobertaProcessingeos_token_idbos_token_idpost_processor%_wrap_decode_method_backend_tokenizer)
selfr   r   r   r   r   r   kwargs_vocab	__class__s
            {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/clip/tokenization_clip.pyr9   CLIPTokenizer.__init__8   s       III 	 |#||*,#)i.

 &1%9%9__ 3 3E&M3 GI^I^I`a&
" )7(?(?$$z ' ((%@	)
% #+"4"4"6 	
		

 	
 *4)E)EY!2!23Y!2!23"	*
& 	224    c                    ^^ U R                   R                  mU R                   R                  R                  mUU4S jnXR                   l        g )Nc                  X   > T" U 0 UD6nUR                  TS5      R                  5       nU$ )Nr#   )replacestrip)argsr@   textr!   orig_decode_methods      rC   new_decode_methodNCLIPTokenizer._wrap_decode_method_backend_tokenizer.<locals>.new_decode_method   s1    %t6v6D<< 2C8>>@DKrE   )backend_tokenizerdecodemodelr!   )r?   rM   r!   rL   s     @@rC   r>   3CLIPTokenizer._wrap_decode_method_backend_tokenizer   sB    !33:: "3399LL	
 ):%rE   )r-   r.   )NN<|endoftext|>z<|startoftext|>rS   rS   )__name__
__module____qualname____firstlineno____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr	   rQ   r,   dictintlistr9   r>   __static_attributes____classcell__)rB   s   @rC   r   r      s    . *$&67E .2)-(*((E5T#s(^#d*E5 d3i$&E5 	E5
 E5 E5 E5 E5N: :rE   r   N)rX   
tokenizersr   r   r   r   r   r   tokenizers.modelsr	   tokenization_utils_tokenizersr   utilsr   
get_loggerrT   loggerrY   r   __all__r+   rE   rC   <module>rh      sS    % Z Z ! >  
		H	%#/`pq o:% o:d 
rE   