
    Z j                         S SK JrJrJrJr  S SKJr  SSKJr  SSK	J
r
  \
R                  " \5      rSS0r " S S	\5      rS	/rg
)    )	Tokenizerdecodersnormalizerspre_tokenizers)BPE   )TokenizersBackend)loggingtokenizer_fileztokenizer.jsonc                      ^  \ rS rSrSr\rSrSS/r\	r
       SS\\\\4   -  S-  S\\\   -  S-  S	\S
\S\S\S\4U 4S jjjrSrU =r$ )GemmaTokenizer   u  
Construct a fast Gemma tokenizer (backed by HuggingFace's tokenizers library).

This tokenizer uses a BPE model with byte fallback, no prefix space, and a normalizer that replaces
spaces with "▁".

Args:
    tokenizer_file (`str`, optional):
        A tokenizers JSON file containing the serialization of a tokenizer.
    unk_token (`str`, optional, defaults to "<unk>"):
        The unknown token.
    bos_token (`str`, optional, defaults to "<bos>"):
        The beginning of sequence token.
    eos_token (`str`, optional, defaults to "<eos>"):
        The end of sequence token.
    pad_token (`str`, optional, defaults to "<pad>"):
        The padding token.
    mask_token (`str`, optional, defaults to "<mask>"):
        The mask token.
    add_bos_token (`bool`, optional, defaults to True):
        Whether or not to add a `bos_token` at the start of sequences.
    add_eos_token (`bool`, optional, defaults to False):
        Whether or not to add an `eos_token` at the end of sequences.
    vocab (`str` or `dict[str, int]`, optional):
        Custom vocabulary dict. If not provided, a minimal vocabulary is created using the special tokens.
left	input_idsattention_maskNvocabmerges	unk_token	bos_token	eos_token	pad_token
mask_tokenc                   > Uc9  [        U5      S[        U5      S[        U5      S[        U5      S[        U5      S0nXl        U=(       d    / U l        [        [	        U R                  U R                  S[        U5      S SS95      U l        [        R                  " SS	S
S9U R
                  l        [        R                  " [        R                  " SS5      [        R                  " 5       [        R                  " 5       /5      U R
                  l        [        R                  " SS5      U R
                  l        ["        T	U ]H  " SUUUUUS.UD6  g )Nr         r      T)r   r   fuse_unkr   dropoutbyte_fallback merged_with_previousF)patternbehaviorinvertu   ▁)r   r   r   r   r    )str_vocab_mergesr   r   
_tokenizerr   Splitpre_tokenizerr   SequenceReplaceByteFallbackFusedecoderr   
normalizersuper__init__)
selfr   r   r   r   r   r   r   kwargs	__class__s
            }/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/gemma/tokenization_gemma.pyr3   GemmaTokenizer.__init__;   s(    =IIIIJE |#kk||i."	
 )7(<(<"8)
% #+"3"3eS)8+@+@+BHMMOT#
 &1%8%8e%D" 	
!	
 	
    )r(   r)   r'   )NNz<unk>z<bos>z<eos>z<pad>z<mask>)__name__
__module____qualname____firstlineno____doc__VOCAB_FILES_NAMESvocab_files_namespadding_sidemodel_input_namesr   modelr&   dictintlistr3   __static_attributes____classcell__)r6   s   @r7   r   r      s    6 *L$&67E .2)-    "/
T#s(^#d*/
 d3i$&/
 	/

 /
 /
 /
 /
 /
r9   r   N)
tokenizersr   r   r   r   tokenizers.modelsr   tokenization_utils_tokenizersr	   utilsr
   
get_loggerr:   loggerr?   r   __all__r%   r9   r7   <module>rP      sO    H G ! >  
		H	%%'78 P
& P
f 
r9   