
    Z jM                         S SK JrJrJrJrJr  S SKJr  SSKJ	r	  SSK
Jr  \R                  " \5      rSSS.r " S	 S
\	5      rS
/rg)    )	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPE   )TokenizersBackend)loggingz
vocab.jsonz
merges.txt)
vocab_filemerges_filec                      ^  \ rS rSrSr\rSS/r\r	         SS\
\\
\4   -  S-  S\
\\
   -  S-  S\
S	\
S
\
S\
S\
S\
S-  S\
S-  4U 4S jjjrSrU =r$ )HerbertTokenizer   a~  
Construct a BPE tokenizer for HerBERT (backed by HuggingFace's tokenizers library).

Peculiarities:

- uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of
  a punctuation character will be treated separately.

This tokenizer inherits from [`TokenizersBackend`] which contains most of the methods. Users should refer to the
superclass for more information regarding methods.

Args:
    vocab_file (`str`):
        Path to the vocabulary file.
    merges_file (`str`):
        Path to the merges file.
    cls_token (`str`, *optional*, defaults to `"<s>"`):
        The classifier token.
    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token.
    pad_token (`str`, *optional*, defaults to `"<pad>"`):
        The padding token.
    mask_token (`str`, *optional*, defaults to `"<mask>"`):
        The mask token.
    sep_token (`str`, *optional*, defaults to `"</s>"`):
        The separator token.
    vocab (`str`, `dict` or `list`, *optional*):
        Custom vocabulary dictionary.
    merges (`str` or `list[str]`, *optional*):
        Custom merges list.
	input_idsattention_maskNvocabmerges	cls_token	unk_token	pad_token
mask_token	sep_tokenr   r   c
           
      B  > Ub  UO[        U5      S0U l        U=(       d    / U l        [        [	        U R                  U R                  S [        U5      SS95      U l        [        R                  " SSSSS9U R
                  l        [        R                  " 5       U R
                  l        [        R                  " SS9U R
                  l        [        TU ]@  " SUUUUUS.U
D6  ["        R$                  " U R&                  S	4U R(                  S4S
9U R
                  l        g )Nr   z</w>)r   r   dropoutr   end_of_word_suffixFT)	lowercasestrip_accents
clean_texthandle_chinese_chars)suffix)r   r   r   r   r      )sepcls )str_vocab_mergesr   r   
_tokenizerr   BertNormalizer
normalizerr   BertPreTokenizerpre_tokenizerr   
BPEDecoderdecodersuper__init__r   BertProcessingr   r   post_processor)selfr   r   r   r   r   r   r   r   r   kwargs	__class__s              ځ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/herbert/tokenization_herbert.pyr1   HerbertTokenizer.__init__A   s     %0es9~q6I|#kk||i.#)
 &1%?%?5TX\&
" )7(G(G(I%"*"5"5V"D 	
!	
 	
 *4)B)B##*
&    )r(   r)   r'   )	NNz<s>z<unk>z<pad>z<mask>z</s>NN)__name__
__module____qualname____firstlineno____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr   modelr&   dictintlistr1   __static_attributes____classcell__)r6   s   @r7   r   r      s    @ *$&67E .2)-  "!%"&+
T#s(^#d*+
 d3i$&+
 	+

 +
 +
 +
 +
 $J+
 4Z+
 +
r9   r   N)
tokenizersr   r   r   r   r   tokenizers.modelsr   tokenization_utils_tokenizersr
   utilsr   
get_loggerr:   loggerr?   r   __all__r%   r9   r7   <module>rO      sN     T S ! >  
		H	%#/M P
( P
f 
r9   