
    Z jA                         S r SSKJrJrJrJrJrJr  SSKJ	r	  SSK
Jr  SSKJr  \R                  " \5      rSS0r " S	 S
\5      rS
/rg)zTokenization classes for XGLM.    )Regex	Tokenizerdecodersnormalizerspre_tokenizers
processors)Unigram   )TokenizersBackend)loggingtokenizer_fileztokenizer.jsonc                      ^  \ rS rSrSr\rSS/r\r	        SS\
\\\
\4      -  S-  S\
S\
S	\
S
\
S\
S\
S\4U 4S jjjrSrU =r$ )XGLMTokenizer   a  
Construct a XGLM tokenizer (backed by HuggingFace's tokenizers library). Based on BPE.

This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.

Args:
    tokenizer_file (`str`, *optional*):
        Path to a tokenizers JSON file containing the serialization of a tokenizer.
    bos_token (`str`, *optional*, defaults to `"<s>"`):
        The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
    eos_token (`str`, *optional*, defaults to `"</s>"`):
        The end of sequence token.
    sep_token (`str`, *optional*, defaults to `"</s>"`):
        The separator token, which is used when building a sequence from multiple sequences.
    cls_token (`str`, *optional*, defaults to `"<s>"`):
        The classifier token which is used when doing sequence classification.
    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token.
    pad_token (`str`, *optional*, defaults to `"<pad>"`):
        The token used for padding.
    vocab (`str`, `dict` or `list`, *optional*):
        Custom vocabulary dictionary. If not provided, a minimal vocabulary is created.
    merges (`list[tuple[str, str]]`, *optional*):
        Custom merge rules for BPE. If not provided, merges are generated from the vocabulary.
    add_prefix_space (`bool`, *optional*, defaults to `True`):
        Whether to add a prefix space before encoding.
	input_idsattention_maskNvocab	bos_token	eos_token	sep_token	cls_token	unk_token	pad_tokenadd_prefix_spacec	                   > SU l         [        U R                   5       V
s/ s H	  n
SU
 S3PM     nn
U	R                  S/ 5      =(       d    / U	S'   U	S==   U Vs/ s H  oU	S   ;  d  M  UPM     sn-  ss'   Xl        Ub  Xl        O7[        U5      S4[        U5      S4[        U5      S4[        U5      S4/U l        [        [        U R                  SSS95      U l        [        R                  " [        R                  " [        S	5      S
5      [        R                  " 5       [        R                  " [        S5      S
5      /5      U R                  l        U(       a  SOSn[        R                   " SUS9U R                  l        [$        R                   " SUS9U R                  l        [(        TU ]T  " SUUUUUUUS.U	D6  [,        R.                  " U R0                   SU R0                   3U R0                   SU R0                   S
U R0                   SU R0                   3U R2                  U R4                  4U R0                  U R6                  4/S9U R                  l        g s  sn
f s  snf )N   z<madeupword>additional_special_tokensg        r
   F)r   unk_idbyte_fallbackz[\n\r\t] z {2,}alwaysneveru   ▁)replacementprepend_scheme)r   r   r   r   r   r   r   z $A z $B )singlepairspecial_tokens )num_madeup_wordsrangegetr   _vocabstrr   r	   
_tokenizerr   SequenceReplacer   NFKC
normalizerr   	Metaspacepre_tokenizerr   decodersuper__init__r   TemplateProcessingr   r   bos_token_ideos_token_idpost_processor)selfr   r   r   r   r   r   r   r   kwargsimadeup_wordswordr%   	__class__s                 {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/xglm/tokenization_xglm.pyr8   XGLMTokenizer.__init__>   sG    !"49$:O:O4PQ4Pq+aS*4PQ.4jj9TVX.Y._]_*+*+)0
)T@[9\-\D\0
 	
+ !1K Y%Y%Y%Y%	DK $G$++aW\$]^%0%9%9##E+$6<  "##E(OS9&
" &67(6(@(@Ucq(r%"*"4"4We"f 		
-		
 		
 *4)F)Fnn%T$..)9:NN#4'7q8HT^^L\]!2!23!2!23*
&S R0
s   I$I)
*I)
)r/   r-   r   r*   )N<s></s>rF   rE   z<unk>z<pad>T)__name__
__module____qualname____firstlineno____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr	   modelr.   listtuplefloatboolr8   __static_attributes____classcell__)rB   s   @rC   r   r      s    : *$&67E 7;  !%=
T%U
+,,t3=
 =
 	=

 =
 =
 =
 =
 =
 =
    r   N)rK   
tokenizersr   r   r   r   r   r   tokenizers.modelsr	   tokenization_utils_tokenizersr   utilsr   
get_loggerrG   loggerrL   r   __all__r)   rV   rC   <module>r^      sQ    % Z Z % >  
		H	%%'78 _
% _
D 
rV   