
    Z j$                         S r SSKrSSKJrJrJrJrJr  SSKJ	r	  SSK
Jr  SSKJr  \R                  " \5      rSS	0rS
 r " S S\5      rS/rg)z"Tokenization classes for Splinter.    N)	Tokenizerdecodersnormalizerspre_tokenizers
processors)	WordPiece   )TokenizersBackend)logging
vocab_filez	vocab.txtc                     [         R                  " 5       n[        U SSS9 nUR                  5       nS S S 5        [	        W5       H  u  pEUR                  S5      nXAU'   M     U$ ! , (       d  f       N9= f)Nrzutf-8)encoding
)collectionsOrderedDictopen	readlines	enumeraterstrip)r   vocabreadertokensindextokens         ڃ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/splinter/tokenization_splinter.py
load_vocabr      sg    ##%E	j#	0F!!# 
1!&)T"e * L 
1	0s   A%%
A3c                      ^  \ rS rSrSr\rSS/r\r	          SS\
\\
\4   -  S-  S\S\
S	\
S
\
S\
S\
S\
S\S\S-  4U 4S jjjr\S 5       rS rSrU =r$ )SplinterTokenizer)   a~  
Construct a Splinter tokenizer (backed by HuggingFace's tokenizers library). Based on WordPiece.

This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.

Args:
    vocab_file (`str`, *optional*):
        Path to a vocabulary file.
    tokenizer_file (`str`, *optional*):
        Path to a tokenizers JSON file containing the serialization of a tokenizer.
    do_lower_case (`bool`, *optional*, defaults to `True`):
        Whether or not to lowercase the input when tokenizing.
    unk_token (`str`, *optional*, defaults to `"[UNK]"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    sep_token (`str`, *optional*, defaults to `"[SEP]"`):
        The separator token, which is used when building a sequence from multiple sequences.
    pad_token (`str`, *optional*, defaults to `"[PAD]"`):
        The token used for padding, for example when batching sequences of different lengths.
    cls_token (`str`, *optional*, defaults to `"[CLS]"`):
        The classifier token which is used when doing sequence classification.
    mask_token (`str`, *optional*, defaults to `"[MASK]"`):
        The token used for masking values.
    question_token (`str`, *optional*, defaults to `"[QUESTION]"`):
        The token used for constructing question representations.
    tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
        Whether or not to tokenize Chinese characters.
    strip_accents (`bool`, *optional*):
        Whether or not to strip all accents. If this option is not specified, then it will be determined by the
        value for `lowercase`.
    vocab (`str`, `dict` or `list`, *optional*):
        Custom vocabulary dictionary. If not provided, a minimal vocabulary is created.
	input_idsattention_maskNr   do_lower_case	unk_token	sep_token	pad_token	cls_token
mask_tokenquestion_tokentokenize_chinese_charsstrip_accentsc                   > Ub  UOE[        U5      S[        U5      S[        U5      S[        U5      S[        U5      S[        U5      SSS0U l        [        [        U R                  [        U5      S	95      U l        [
        R                  " S
U	U
US9U R                  l        [        R                  " 5       U R                  l
        [        R                  " SS9U R                  l        [        TU ]8  " SUUUUUUUU	U
S.	UD6  X l        Xl        Xl        Xl        U R$                  U R&                  ;  a  U R)                  U R$                  /S
S9  U R+                  5         g )Nr         r	         .   )r$   T)
clean_texthandle_chinese_charsr+   	lowercasez##)prefix)	r$   r%   r&   r'   r(   r)   r#   r*   r+   )special_tokens )str_vocabr   r   
_tokenizerr   BertNormalizer
normalizerr   BertPreTokenizerpre_tokenizerr   decodersuper__init__r#   r*   r+   r)   all_special_tokens
add_tokensupdate_post_processor)selfr   r#   r$   r%   r&   r'   r(   r)   r*   r+   kwargs	__class__s               r   rB   SplinterTokenizer.__init__Q   sR        IIIIJN#QQ 	 $IdkkS^$TU%0%?%?!7'#	&
" )7(G(G(I%"*"4"4D"A 	
!)'#9'	
 	
 +&<#*,d&=&==OOT001$OG""$    c                 8    U R                  U R                  5      $ )N)convert_tokens_to_idsr)   )rF   s    r   question_token_id#SplinterTokenizer.question_token_id   s    ))$*=*=>>rJ   c           
         U R                   nU R                  nU R                  nSnU R                  nU R                  nU R
                  nU R                  S5      nUb  Uc  g U R                  S:X  a  U SU SU SU SU S3
n	OU SU SU SU SU S3
n	[        R                  " U SU S3U	X4X&4X74XH4/S9U R                  l        g )	Nr1   rightz:0 $A:0  z:0 $B:1 z:1z:0)singlepairr7   )r'   r%   r)   cls_token_idsep_token_idrM   rL   padding_sider   TemplateProcessingr;   post_processor)
rF   clssepquestiondotrT   rU   rM   dot_token_idrS   s
             r   rE   'SplinterTokenizer.update_post_processor   s    nnnn&&(((( 2211#6;#+'U(8*AcU!C5RHDU(3%xz3%qRHD)3)F)FU(3%r*##-#		*
&rJ   )r;   r:   r#   r)   r+   r*   )
NTz[UNK]z[SEP]z[PAD]z[CLS]z[MASK]z
[QUESTION]TN)__name__
__module____qualname____firstlineno____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr   modelr9   dictintboolrB   propertyrM   rE   __static_attributes____classcell__)rH   s   @r   r   r   )   s    !F *$&67E .2"    "*'+%):%T#s(^#d*:% :% 	:%
 :% :% :% :% :% !%:% d{:% :%x ? ?
 
rJ   r   )rc   r   
tokenizersr   r   r   r   r   tokenizers.modelsr   tokenization_utils_tokenizersr
   utilsr   
get_loggerr_   loggerrd   r   r   __all__r8   rJ   r   <module>ru      sX    )  S S ' >  
		H	%!;/ C
) C
L 
rJ   