
    Z j,                       S r SSKrSSKrSSKrSSKJr  SSKJr  SSKJ	r	  SSK
Jr  SSKJr  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJr  SSKJrJr  SSKJrJ r J!r!J"r"  SSK#J$r$  SSK%J&r&  SSK'J(r(  SSK)J*r*  SSK+J,r,J-r-J.r.J/r/J0r0J1r1J2r2  SSK3J4r4J5r5J6r6  \6Rn                  " \85      r9Sr:Sr;Sr<Sr=Sr>\,S-  r,\\ \!\"S.r?\:\=S.r@\5" \,5       " S S\/5      5       rA\ArBg)z
Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
    N)defaultdict)Iterable)copyfile)Any)is_offline_mode)
AddedToken
processors)Encoding)	Tokenizer)Decoder)BPEUnigram)
BpeTrainerUnigramTrainerWordLevelTrainerWordPieceTrainercached_file   )SpmConverter)convert_gguf_tokenizer)load_gguf_checkpoint)INIT_TOKENIZER_DOCSTRINGBatchEncodingPreTokenizedInputPreTrainedTokenizerBase	TextInputTruncationStrategygenerate_merges)PaddingStrategyadd_end_docstringsloggingztokenizer.jsonzspecial_tokens_map.jsonztokenizer_config.jsonztokenizer.modelzadded_tokens.jsonu  
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
)r   r   	WordLevel	WordPiece)tokenizer_file
vocab_filec            )       ^  ^  \ rS rSrSr\rSrSr\	SNS j5       r
U 4S jr\S\4S j5       r\S\4S	 j5       rSOS
\S\S-  S\\   4S jjrS r\S 5       r\S 5       r\R,                  S 5       r\R,                  S 5       rS r\S\4S j5       rS\\\4   4S jr\S\\\4   4S j5       r\S\\\4   4S j5       r\S\\\4   4S j5       r\r \r!S\\\4   4S jr"S\4S jr#S\4S jr$\S\%4S j5       r&\S\'4S j5       r(       SPS\)S\S-  S \S-  S!\S"\S#\S$\S%\S\\\\*4   \+\)   4   4S& jjr,S'\S\4S( jr-S)\S\S-  4S* jr.SNS+\+\\-     S\4S, jjr/SNS-\S\4S. jjr0SNS/\\+\   -  S0\S\\+\   -  4S1 jjr1SQS2\S-\S-  S3\S\+\   4S4 jjr2S5\3S6\4S7\S8\S9\S-  S:\S-  4S; jr5SS\3Rl                  \4Rn                  SS<SSSSSSSSSSSS4S2\8\9-  \+\8   -  \+\9   -  S=\8\9-  \+\8   -  \+\9   -  S-  S3\S5\3S6\4S7\S-  S8\S>\S9\S-  S:\S-  S?\S-  S\S-  S \S-  S!\S"\S#\S$\S%\S@\S-  S\:4(SA jjr;SB\+\   S\4SC jr<  SRSD\\+\   -  S0\SE\S-  S\4SF jjr=  SSS
\\>R~                  -  SG\\SH4   SI\S-  S\S-  S\\SH4   4
SJ jjr@   STSK jrA\	       SUSL j5       rBSMrCU =rD$ )VTokenizersBackendS   a5  
Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

Handles all the shared methods for tokenization and special tokens, as well as methods for
downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.

This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
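    # Hedged usage sketch (not part of the original file; the checkpoint name is illustrative):
    #
    #     from transformers import AutoTokenizer
    #     tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # resolves to a fast tokenizer
    #     enc = tok("Hello world")
    #     enc["input_ids"]                              # list[int]
    #     tok.convert_ids_to_tokens(enc["input_ids"])   # back to token strings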
    @classmethod
    def convert_to_native_format(cls, trust_remote_code=False, **kwargs):
        """
        Build a `tokenizers.Tokenizer` backend from the available serialization files (tokenizer.json, sentencepiece
        models, tekken.json, vocab/merges).
        """
        # NOTE: body reconstructed from a corrupted compiled dump. The dispatch order below
        # (tokenizer.json, then tekken.json, then a sentencepiece `.model`, then raw vocab/merges)
        # is recoverable from the artifact; some branch details are condensed.
        local_kwargs = dict(kwargs)
        fast_tokenizer_file = local_kwargs.pop("tokenizer_file", None)

        if fast_tokenizer_file is not None and os.path.isfile(fast_tokenizer_file):
            if cls is TokenizersBackend or "__init__" not in cls.__dict__ or trust_remote_code:
                # The serialized tokenizer.json can be wrapped as-is.
                kwargs["tokenizer_object"] = TokenizerFast.from_file(fast_tokenizer_file)
                return kwargs

            with open(fast_tokenizer_file, encoding="utf-8") as tokenizer_handle:
                tokenizer_json = json.load(tokenizer_handle)

            model_type = tokenizer_json.get("model", {}).get("type")
            if model_type not in (None, "BPE"):
                # Strip the vocabulary so the subclass __init__ can rebuild the model itself.
                minimal_tokenizer_json = dict(tokenizer_json)
                minimal_model = dict(tokenizer_json["model"])
                minimal_model["vocab"] = {}
                if model_type == "BPE":
                    minimal_model["merges"] = []
                minimal_tokenizer_json["model"] = minimal_model
                minimal_tokenizer_json["added_tokens"] = []
                tok_from_file = TokenizerFast.from_str(json.dumps(minimal_tokenizer_json))
            else:
                tok_from_file = TokenizerFast.from_file(fast_tokenizer_file)

            kwargs["tokenizer_padding"] = tok_from_file.padding
            kwargs["tokenizer_truncation"] = tok_from_file.truncation
            if tok_from_file.truncation is not None:
                kwargs["_json_truncation"] = tok_from_file.truncation
            if tok_from_file.padding is not None:
                kwargs["_json_padding"] = tok_from_file.padding

            normalizer_config = tokenizer_json.get("normalizer")
            if normalizer_config:
                if normalizer_config.get("type") == "Sequence":
                    normalizers_list = normalizer_config["normalizers"]
                elif not isinstance(normalizer_config, list):
                    normalizers_list = [normalizer_config]
                else:
                    normalizers_list = normalizer_config
                for normalizer in normalizers_list:
                    # A sentencepiece "Precompiled" normalizer stores its charsmap base64-encoded.
                    if normalizer.get("type") == "Precompiled" and "precompiled_charsmap" in normalizer:
                        import base64

                        kwargs["_spm_precompiled_charsmap"] = base64.b64decode(normalizer["precompiled_charsmap"])
                        break

            # Coerce the JSON vocab into the shape the target model class expects.
            vocab = tokenizer_json.get("model", {}).get("vocab", {})
            if cls.model is None:
                if isinstance(vocab, list):
                    vocab = list(map(tuple, vocab))
            elif cls.model.__name__ == "Unigram" and isinstance(vocab, list):
                if vocab and isinstance(vocab[0], (list, tuple)):
                    vocab = [tuple(item) for item in vocab]
            elif cls.model.__name__ == "BPE" and isinstance(vocab, list):
                vocab = {token: i for i, token in enumerate(vocab)}
            elif cls.model.__name__ in ("WordLevel", "WordPiece") and isinstance(vocab, list):
                vocab = {(token[0] if isinstance(token, list) else token): i for i, token in enumerate(vocab)}
            kwargs["vocab"] = vocab

            if "merges" in tokenizer_json.get("model", {}):
                merges = tokenizer_json["model"]["merges"]
                kwargs["merges"] = [
                    tuple(merge.split(" ")) if isinstance(merge, str) else tuple(merge) for merge in merges
                ]
            return kwargs

        vocab_file = kwargs.get("vocab_file")
        vocab = kwargs.get("vocab")
        merges = kwargs.get("merges")

        if isinstance(vocab_file, str) and vocab_file.endswith("tekken.json") and os.path.isfile(vocab_file):
            # Mistral "tekken" format: delegate to the dedicated converter.
            from .convert_slow_tokenizer import MistralConverter

            kwargs["vocab"], kwargs["merges"] = MistralConverter(
                vocab_file=vocab_file
            ).extract_vocab_merges_from_model(vocab_file)
            return kwargs

        if isinstance(vocab_file, str) and os.path.isfile(vocab_file) and vocab_file.endswith(".model"):
            try:
                from .convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS

                converter_class = SLOW_TO_FAST_CONVERTERS.get(cls.__name__)
                # Hook names recovered from the constant pool (`convert_from_spm`,
                # `convert_from_spm_model`); the exact call signature is an assumption.
                if converter_class is not None and hasattr(converter_class, "convert_from_spm"):
                    try:
                        return converter_class.convert_from_spm(vocab_file, **kwargs)
                    except Exception as e:
                        logger.warning(
                            f"Could not reorder vocab using converter for {cls.__name__} due to {e}. "
                            "Falling back to raw SentencePiece extraction."
                        )
                from .convert_slow_tokenizer import SentencePieceExtractor

                kwargs["vocab"], kwargs["merges"] = SentencePieceExtractor(vocab_file).extract()
            except Exception as e:
                logger.warning(
                    f"Could not extract SentencePiece model from {vocab_file} using sentencepiece library due to "
                    f"{e}. Falling back to TikToken extractor."
                )
                from .convert_slow_tokenizer import TikTokenConverter

                kwargs["vocab"], kwargs["merges"] = TikTokenConverter(
                    vocab_file=vocab_file, extra_special_tokens=kwargs.get("extra_special_tokens")
                ).extract_vocab_merges_from_model(vocab_file)
            return kwargs

        if merges is None and isinstance(vocab, dict) and cls.model is not None and cls.model.__name__ == "BPE":
            # Merges are missing: regenerate them from the vocabulary, skipping special tokens.
            def _iter_special_tokens(values: Iterable) -> list[str]:
                collected = []
                for val in values:
                    if val is None:
                        continue
                    if isinstance(val, (list, tuple)):
                        collected.extend(_iter_special_tokens(val))
                    else:
                        collected.append(str(val))
                return collected

            special_tokens_keys = [
                "pad_token",
                "unk_token",
                "bos_token",
                "eos_token",
                "sep_token",
                "cls_token",
                "mask_token",
                "additional_special_tokens",
                "extra_special_tokens",
            ]
            skip_tokens = set()
            for key in special_tokens_keys:
                if key in kwargs:
                    skip_tokens.update(_iter_special_tokens([kwargs[key]]))
            kwargs["merges"] = generate_merges(vocab, skip_tokens=skip_tokens)
        return kwargs
h8Ah		hc           	      <  > UR                  SS 5      nUR                  SS 5      nUR                  SS 5        UR                  SS 5      nUR                  SS 5      nUR                  SS 5      nUR                  S0 5      nUR                  SS	5      n	UR                  S
5      n
UR                  S5      nUR                  S5      nS nUb  [        R                  " U5      nGOUb<  [        R
                  R                  U5      (       a  [        R                  " U5      nGOUb|  [        UR                  SS5      U40 UD6n[        U5      nUS   S   nUS   nUS   n[        UU5      u  nnUR                  U5        [        U5      S:  a  UR                  U5        GOU R                  c  Ub  UbT  [        U[         5      (       a  UO&[#        U5       VVVs0 s H  u  nu  nnUU_M     snnnn[        [%        UUSS S95      nO[        U[         5      (       a  [        [%        U/ SS S95      nOu[        U[&        5      (       aG  U(       a@  [        US   [(        [&        45      (       a"  [        [+        XR                  SS5      S95      nOU R                  c  [-        S5      eUc4  Uc1  U R                  c$  UR/                  SS5        UR/                  SS5        Ub  Xl        U R                  c  [-        S5      eUR                  SS 5      =(       d    U R                  R0                  =(       d    UnUbq  U R                  R2                  " S80 UD6  UR/                  SUS   5        UR/                  S US!   5        UR/                  S"US"   5        UR/                  S#US$   5        OU R                  R5                  5         UR                  S%S 5      =(       d    U R                  R6                  =(       d    UnUb  U R                  R8                  " S80 UD6  UR/                  S&US&   5        UR/                  S'US(   5        UR/                  S)US!   5        UR/                  SUS*   5        UR/                  S+US+   5        S,U;  a  S-US,'   S.U;   =(       d    S/U;   nUR                  S.S	5      U l        UR                  S/S	5      U l        UR                  S0S 5      =n(       a  UU R                  l        U=(       d    U R                  R>                  S L U l         [B        T&U ]  " S80 UD6  U
b  Xl#        Xl$        U RJ                  U R                  l&        U RN                   Vs1 s H  n[Q        [S        U5      5      iM     nn[U        URW                  5       S1 S29 VVs/ s H"  u  nn[Q        [S        U5      5      U;  d  M   UPM$     nnn['        U RX                  R[                  5       5      U Vs/ s H  n[]        U5      PM     sn-   n U R^                  Ra                  5        H2  n!U!c  M  []        U!5      U ;  d  M  U!U;  d  M!  URc                  U!5        M4     U Rd                   H-  n[]        U5      U ;  d  M  UU;  d  M  URc                  U5        M/     [        U5      S:  a  / n"U R^                  Ra                  5        V#s/ s H  n#U#(       d  M  []        U#5      PM     n$n#U Hp  n[        U[\        5      (       a  [g        USS39nO<[        U[f        5      (       a'  URh                  (       d  []        U5      U$;   a  SUl4        U"Rc                  U5        Mr     U"(       a  U Rk                  U"5         U R                  Rm                  5       n%U%S4:  a  [q        U R                  S5S 5      bl  UR                  SS 5        U Rr                  " U R                  U Rt                  R                  SS 5      4U Rt                  UR                  S6S 5      S7.UD6U l        U R@                  =(       d    U R                  R>                  S L U l         U R@                  (       a  U Rw                  5         g g s  snnnf s  snf s  snnf s  snf s  sn#f ! [n         a    Sn% GNf = f)9Nr8   r9   r?   r,   	gguf_filer%   rH   add_prefix_spaceFr&   r2   r3   name_or_path configr   	tokenizertokenizer_configr   T)r2   r3   fuse_unkdropoutr   )r2   r   a9  Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one.rJ   rK   rL   rM   z3The backend tokenizer is not correctly initialized.r7   
max_lengthtruncation_side	directionstridetruncation_strategystrategyr6   ra   pad_token_type_idpad_type_idpadding_sidelengthpad_to_multiple_ofbackend
tokenizersadd_bos_tokenadd_eos_tokenr5   c                     U S   $ Nr   rg   )xs    r^   <lambda>,TokenizersBackend.__init__.<locals>.<lambda>  s    STUVSWr`   r   )speciali pre_tokenizerfix_mistral_regex)init_kwargsr   rg   )<ri   rs   copydeepcopyrj   rk   rl   rn   ro   r   r   r   r   len
_tokenizerrU   rh   r|   r   rV   rW   r   
ValueErrorr   rw   enable_truncationno_truncationrv   enable_padding_add_bos_token_add_eos_tokenr5   _should_update_post_processorsuperr+   r&   r   split_special_tokensencode_special_tokensrH   hashreprsortedr   added_tokens_encoderkeysrZ   _special_tokens_maprQ   rY   _extra_special_tokensr   r   
add_tokensget_vocab_sizeNotImplementedErrorr}   _patch_mistral_regexr   update_post_processor)'selfargsr   r8   r9   r,   r   r   rH   r   r&   r2   r3   fast_tokenizer	gguf_path
gguf_paramarchitecturetokenizer_dictr   additional_kwargsr   w_
vocab_dict_truncation_paddingexplicit_bos_eos_in_kwargsr5   r   added_tokens_decoder_hashindextokens_to_addencoderspecial_token_valuetokenstall_named_tokens
vocab_size	__class__s'                                         r^   r+   TokenizersBackend.__init__H  s    "::&8$?

?D9 	

.5!::&8$?JJ{D1	$jj)94@%zz*@"E!::&8%@ZZ-


7#H%'!]]+;<N ,@S1T1T*445HIN"#FJJ~r$BIXQWXI-i8J%h/=L'4N)*<=0F|Uc0d-N-MM*+$%)/0__$):!&0&=&=UZcdiZjCkZjYQPVQRTUAqDZjCk
!.sF]ako/p!qE4((!.srTXbf/g!hE4((Uz%(UTXM7Z7Z!.wU::V^`aKb/c!d__$r  &+;+CH_k51k62%,O??"RSSjj!7>p$//B\B\p`p"OO--<<lK,EF/[1IJhH(=>3[5LMOO))+::148dDOO<S<SdWdOO**6X6k8K+@A18M3JKnh{.CDlHX,>?2H=Q4RS F" ,F9%4%>%[/U[B["$jj%@$jj%@#ZZ(8$??>?-;DOO*-G-q4??KiKimqKq*"6"!(O 0040I0I-DHD]D]$^D]5T$u+%6D]!$^ !'';'A'A'C X
 XuDK (AA  X 	 

 t005578Ta;bTa5CJTa;bb $(#;#;#B#B#D"*&'w6;NVc;c$$%89	 $E //E5z(U--G$$U+ 0 }!F040H0H0O0O0QW0Q1UVA0QW&eS))&ud;Ez22 ==SZ;K-K(,e$ ' '	779J
 74??OT#R#^JJ{D)"77  $$^T: !,,"(**-@$"G	
 DO ..X$//2P2PTX2X 	* --&&( .o Dl~ %_

 <c"  X  # 	J	s<   a/'a6#a;a;5b	bb8b bbrR   c                     g)NTrg   r   s    r^   is_fastTokenizersBackend.is_fast  s    r`   c                    SU R                   ;   ao  U R                   S   R                  S5      (       aL  [        U S5      (       a:  U R                  (       a)  [        R
                  R                  U R                  5      $ gg)z
`bool`: Whether or not the slow tokenizer can be saved. For a sentencepiece based slow tokenizer, this
can only be `True` if the original `"sentencepiece.model"` was not deleted.
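    # Hedged usage note: saving in the slow (sentencepiece) format only works while the original
    # `tokenizer.model` file still exists on disk, e.g.
    #
    #     if tok.can_save_slow_tokenizer:
    #         tok.save_vocabulary("./out")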
    def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        return (out_vocab_file,)

    def update_post_processor(self):
        """
        Updates the underlying post processor with the current `bos_token` and `eos_token`.
        """
        bos = self.bos_token
        bos_token_id = self.bos_token_id
        if bos is None and self.add_bos_token:
            self._add_bos_token = False

        eos = self.eos_token
        eos_token_id = self.eos_token_id
        if eos is None and self.add_eos_token:
            self._add_eos_token = False

        single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
        pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"

        special_tokens = []
        if self.add_bos_token:
            special_tokens.append((bos, bos_token_id))
        if self.add_eos_token:
            special_tokens.append((eos, eos_token_id))
        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=single, pair=pair, special_tokens=special_tokens
        )

    @property
    def add_eos_token(self):
        return getattr(self, "_add_eos_token", False)

    @property
    def add_bos_token(self):
        return getattr(self, "_add_bos_token", False)

    @add_eos_token.setter
    def add_eos_token(self, value):
        object.__setattr__(self, "_add_eos_token", value)
        self.update_post_processor()

    @add_bos_token.setter
    def add_bos_token(self, value):
        object.__setattr__(self, "_add_bos_token", value)
        self.update_post_processor()

    def _post_init(self):
        """
        Post-initialization hook that runs after the tokenizer is fully set up.
        This is called by from_pretrained() after loading the tokenizer, which allows
        us to add any special tokens that may have been passed as AddedToken objects.

        Child classes should call super()._post_init() if they override this method.
        """
        tokens_to_add = []
        for token_value in self._special_tokens_map.values():
            if token_value is None:
                continue
            if isinstance(token_value, AddedToken):
                tokens_to_add.append(token_value)
            elif isinstance(token_value, str):
                tokens_to_add.append(AddedToken(token_value, special=True, normalized=False))
        for token in self._extra_special_tokens:
            if isinstance(token, AddedToken):
                tokens_to_add.append(token)
            elif isinstance(token, str):
                tokens_to_add.append(AddedToken(token, special=True, normalized=False))
        if tokens_to_add:
            self._add_tokens(tokens_to_add, special_tokens=True)

        if getattr(self, "_should_update_post_processor", False) or self._tokenizer.post_processor is None:
            self.update_post_processor()
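    # Illustrative values (hypothetical tokens): with bos_token="<s>", eos_token="</s>" and both
    # add_bos_token/add_eos_token enabled, update_post_processor() installs a TemplateProcessing
    # equivalent to
    #     single = "<s>:0 $A:0 </s>:0"
    #     pair   = "<s>:0 $A:0 </s>:0 <s>:1 $B:1 </s>:1"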
    @property
    def vocab_size(self) -> int:
        """
        `int`: Size of the base vocabulary (without the added tokens).
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=False)

    def get_vocab(self) -> dict[str, int]:
        return self._tokenizer.get_vocab(with_added_tokens=True)

    @property
    def vocab(self) -> dict[str, int]:
        return self.get_vocab()

    @property
    def added_tokens_encoder(self) -> dict[str, int]:
        """
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        """
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

    @property
    def added_tokens_decoder(self) -> dict[int, AddedToken]:
        """
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `dict[str, int]`: The added tokens.
        """
        return self._tokenizer.get_added_tokens_decoder()

    # Aliases kept for backward compatibility (recovered from the compiled constants).
    _added_tokens_encoder = added_tokens_encoder
    _added_tokens_decoder = added_tokens_decoder

    def get_added_vocab(self) -> dict[str, int]:
        """
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            `dict[str, int]`: The added tokens.
        """
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

    def __bool__(self) -> bool:
        """
        Returns True, to avoid expensive `assert tokenizer` gotchas.
        """
        return True

    def __len__(self) -> int:
        """
        Size of the full vocabulary with the added tokens.
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=True)

    @property
    def backend_tokenizer(self) -> TokenizerFast:
        """
        `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
        """
        return self._tokenizer

    @property
    def decoder(self) -> DecoderFast:
        """
        `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
        """
        return self._tokenizer.decoder
    def _convert_encoding(
        self,
        encoding: EncodingFast,
        return_token_type_ids: bool | None = None,
        return_attention_mask: bool | None = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> tuple[dict[str, Any], list[EncodingFast]]:
        """
        Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
        of encodings, take care of building a batch from overflowing tokens.

        Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
        lists (overflows) of lists (tokens).

        Output shape: (overflows, sequence length)
        """
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        if return_overflowing_tokens and encoding.overflowing is not None:
            encodings = [encoding] + encoding.overflowing
        else:
            encodings = [encoding]

        encoding_dict = defaultdict(list)
        for e in encodings:
            encoding_dict["input_ids"].append(e.ids)
            if return_token_type_ids:
                encoding_dict["token_type_ids"].append(e.type_ids)
            if return_attention_mask:
                encoding_dict["attention_mask"].append(e.attention_mask)
            if return_special_tokens_mask:
                encoding_dict["special_tokens_mask"].append(e.special_tokens_mask)
            if return_offsets_mapping:
                encoding_dict["offset_mapping"].append(e.offsets)
            if return_length:
                encoding_dict["length"].append(len(e.ids))

        return encoding_dict, encodings

    def _convert_token_to_id_with_added_voc(self, token: str) -> int:
        index = self._tokenizer.token_to_id(token)
        if index is None:
            return self.unk_token_id
        return index

    def _convert_id_to_token(self, index: int) -> str | None:
        return self._tokenizer.id_to_token(int(index))

    def _add_tokens(self, new_tokens: list[str | AddedToken], special_tokens=False) -> int:
        if special_tokens:
            return self._tokenizer.add_special_tokens(new_tokens)
        return self._tokenizer.add_tokens(new_tokens)
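    # Hedged sketch of the overflow layout built by `_convert_encoding` (ids illustrative): with
    # truncation and a stride enabled, a single long input produces
    #     encoding_dict["input_ids"] == [[101, 2023, ...], [101, 2003, ...]]
    # i.e. shape (overflows, sequence length): the main window first, then one row per overflow.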
    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        """
        return self._tokenizer.num_special_tokens_to_add(pair)
    def convert_ids_to_tokens(self, ids: int | list[int], skip_special_tokens: bool = False) -> str | list[str]:
        """
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `list[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `list[str]`: The decoded token(s).
        """
        if isinstance(ids, int):
            return self._tokenizer.id_to_token(ids)
        tokens = []
        ids_to_skip = set(self.all_special_ids) if skip_special_tokens else set()
        for index in ids:
            index = int(index)
            if index in ids_to_skip:
                continue
            tokens.append(self._tokenizer.id_to_token(index))
        return tokens

    def tokenize(self, text: str, pair: str | None = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
        return self._encode_plus(text=text, text_pair=pair, add_special_tokens=add_special_tokens, **kwargs).tokens()
    def set_truncation_and_padding(
        self,
        padding_strategy: PaddingStrategy,
        truncation_strategy: TruncationStrategy,
        max_length: int,
        stride: int,
        pad_to_multiple_of: int | None,
        padding_side: str | None,
    ):
        """
        Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
        library) and restore the tokenizer settings afterwards.

        The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
        padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
        section.

        Args:
            padding_strategy ([`~utils.PaddingStrategy`]):
                The kind of padding that will be applied to the input
            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
                The kind of truncation that will be applied to the input
            max_length (`int`):
                The maximum size of a sequence.
            stride (`int`):
                The stride to use when handling overflow.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
            padding_side (`str`, *optional*):
                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
                Default value is picked from the class attribute of the same name.
        """
        _truncation = self._tokenizer.truncation
        _padding = self._tokenizer.padding

        if truncation_strategy == TruncationStrategy.DO_NOT_TRUNCATE:
            if _truncation is not None:
                self._tokenizer.no_truncation()
        else:
            target = {
                "max_length": max_length,
                "stride": stride,
                "strategy": truncation_strategy.value,
                "direction": self.truncation_side,
            }
            if _truncation is None:
                current = None
            else:
                current = {k: _truncation.get(k, None) for k in target}
            if current != target:
                self._tokenizer.enable_truncation(**target)

        if padding_strategy == PaddingStrategy.DO_NOT_PAD:
            if _padding is not None:
                self._tokenizer.no_padding()
        else:
            length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None
            target = {
                "length": length,
                "direction": padding_side if padding_side is not None else self.padding_side,
                "pad_id": self.pad_token_id,
                "pad_token": self.pad_token,
                "pad_type_id": self.pad_token_type_id,
                "pad_to_multiple_of": pad_to_multiple_of,
            }
            if _padding != target:
                self._tokenizer.enable_padding(**target)

    def _encode_plus(
        self,
        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput],
        text_pair: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] | None = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: int | None = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: int | None = None,
        padding_side: str | None = None,
        return_tensors: str | None = None,
        return_token_type_ids: bool | None = None,
        return_attention_mask: bool | None = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        split_special_tokens: bool | None = None,
        **kwargs,
    ) -> BatchEncoding:
        def _is_valid_text_input(t):
            if isinstance(t, str):
                return True
            if isinstance(t, (list, tuple)):
                if len(t) == 0:
                    return True
                if isinstance(t[0], str):
                    return True
                if isinstance(t[0], (list, tuple)):
                    return len(t[0]) == 0 or isinstance(t[0][0], str)
                return False
            return False

        if not _is_valid_text_input(text):
            raise ValueError(
                "text input must be of type `str` (single example), `list[str]` (batch or single pretokenized "
                "example) or `list[list[str]]` (batch of pretokenized examples) or "
                "`list[tuple[list[str], list[str]]]` (batch of pretokenized sequence pairs)."
            )
        if text_pair is not None and not _is_valid_text_input(text_pair):
            raise ValueError(
                "text input must be of type `str` (single example), `list[str]` (batch or single pretokenized "
                "example) or `list[list[str]]` (batch of pretokenized examples) or "
                "`list[tuple[list[str], list[str]]]` (batch of pretokenized sequence pairs)."
            )

        if is_split_into_words:
            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
        else:
            is_batched = isinstance(text, (list, tuple))

        if is_batched:
            if isinstance(text_pair, str):
                raise TypeError(
                    "when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as "
                    "`text`."
                )
            if text_pair is not None and len(text) != len(text_pair):
                raise ValueError(
                    f"batch length of `text`: {len(text)} does not match batch length of `text_pair`: "
                    f"{len(text_pair)}."
                )
            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
        else:
            batch_text_or_text_pairs = [(text, text_pair)] if text_pair else [text]

        if not isinstance(batch_text_or_text_pairs, (tuple, list)):
            raise TypeError(
                f"batch_text_or_text_pairs has to be a list or a tuple (got {type(batch_text_or_text_pairs)})"
            )

        self.set_truncation_and_padding(
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
        )

        if split_special_tokens is None:
            split_special_tokens = self.split_special_tokens
        if self._tokenizer.encode_special_tokens != split_special_tokens:
            self._tokenizer.encode_special_tokens = split_special_tokens

        encodings = self._tokenizer.encode_batch(
            batch_text_or_text_pairs,
            add_special_tokens=add_special_tokens,
            is_pretokenized=is_split_into_words,
        )

        tokens_and_encodings = [
            self._convert_encoding(
                encoding=encoding,
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
            )
            for encoding in encodings
        ]

        # Flatten the (overflows, sequence length) structure into one batch dict.
        sanitized_tokens = {}
        for key in tokens_and_encodings[0][0]:
            sanitized_tokens[key] = [e for item, _ in tokens_and_encodings for e in item[key]]
        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]

        if return_overflowing_tokens:
            overflow_to_sample_mapping = []
            for i, (toks, _) in enumerate(tokens_and_encodings):
                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping

        for input_ids in sanitized_tokens["input_ids"]:
            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)

        batched_output = BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)

        if not is_batched and return_tensors is None and not return_overflowing_tokens:
            batched_output = BatchEncoding(
                {
                    key: (value[0] if len(value) > 0 and isinstance(value[0], list) else value)
                    for key, value in batched_output.items()
                },
                batched_output.encodings,
            )
        return batched_output

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        if self.backend_tokenizer.decoder is not None:
            return self.backend_tokenizer.decoder.decode(tokens)
        return " ".join(tokens)

    def _decode(
        self,
        token_ids: int | list[int],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool | None = None,
        **kwargs,
    ) -> str:
        kwargs.pop("use_source_tokenizer", None)
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        # (One further input-normalisation branch in the original is not cleanly recoverable.)
        text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

        clean_up_tokenization_spaces = (
            clean_up_tokenization_spaces
            if clean_up_tokenization_spaces is not None
            else self.clean_up_tokenization_spaces
        )
        if clean_up_tokenization_spaces:
            if type(self.backend_tokenizer.model).__name__ == "BPE" and not getattr(
                self, "clean_up_tokenization_spaces_for_bpe_even_though_it_will_corrupt_output", False
            ):
                logger.warning_once(
                    f"Ignoring clean_up_tokenization_spaces=True for BPE tokenizer {self.name_or_path}. "
                    "The clean_up_tokenization post-processing step is designed for WordPiece tokenizers and is "
                    "destructive for BPE (it strips spaces before punctuation). Set "
                    "clean_up_tokenization_spaces=False to suppress this warning, or set "
                    "clean_up_tokenization_spaces_for_bpe_even_though_it_will_corrupt_output=True to force cleanup "
                    "anyway."
                )
                return text
            return self.clean_up_tokenization(text)
        return text

    def _save_pretrained(
        self,
        save_directory: str | os.PathLike,
        file_names: tuple[str, ...],
        legacy_format: bool | None = None,
        filename_prefix: str | None = None,
    ) -> tuple[str, ...]:
        save_directory = str(save_directory)
        tokenizer_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE
        )
        self.backend_tokenizer.save(tokenizer_file)
        file_names = file_names + (tokenizer_file,)
        return file_names
    def train_new_from_iterator(
        self, text_iterator, vocab_size, length=None, new_special_tokens=None, special_tokens_map=None, **kwargs
    ):
        """
        Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
        as the current one.

        Args:
            text_iterator (generator of `list[str]`):
                The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
                if you have everything in memory.
            vocab_size (`int`):
                The size of the vocabulary you want for your tokenizer.
            length (`int`, *optional*):
                The total number of sequences in the iterator. This is used to provide meaningful progress tracking
            new_special_tokens (list of `str` or `AddedToken`, *optional*):
                A list of new special tokens to add to the tokenizer you are training.
            special_tokens_map (`dict[str, str]`, *optional*):
                If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
                token name to new special token name in this argument.
            kwargs (`dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

        Returns:
            [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
            `text_iterator`.
        """
        tokenizer_json = json.loads(self._tokenizer.to_str())
        # Remove added tokens and the post processor for now (both use token IDs).
        added_tokens = tokenizer_json.pop("added_tokens")
        post_processor = tokenizer_json.pop("post_processor")

        unk_token = None
        # Empty the vocabulary so it can be re-learned.
        if tokenizer_json["model"]["type"] == "BPE":
            tokenizer_json["model"]["vocab"] = {}
            tokenizer_json["model"]["merges"] = []
        elif tokenizer_json["model"]["type"] == "Unigram":
            if tokenizer_json["model"]["unk_id"] is not None:
                unk_id = tokenizer_json["model"]["unk_id"]
                unk_token = tokenizer_json["model"]["vocab"][unk_id][0]
                if special_tokens_map is not None and unk_token in special_tokens_map:
                    unk_token = special_tokens_map[unk_token]
                tokenizer_json["model"]["unk_id"] = 0
                tokenizer_json["model"]["vocab"] = [[unk_token, 0.0]]
        elif tokenizer_json["model"]["type"] in ("WordLevel", "WordPiece"):
            tokenizer_json["model"]["vocab"] = {}
        else:
            raise ValueError(
                f"This method does not support this type of tokenizer (found {tokenizer_json['model']['type']}) "
                "only BPE, Unigram, WordLevel and WordPiece."
            )

        if (
            special_tokens_map is not None
            and "unk_token" in tokenizer_json["model"]
            and tokenizer_json["model"]["unk_token"] in special_tokens_map
        ):
            tokenizer_json["model"]["unk_token"] = special_tokens_map[tokenizer_json["model"]["unk_token"]]

        tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json))

        # Keep the special tokens of the current tokenizer if none are specified.
        special_tokens = []
        for added_token in added_tokens:
            special = added_token.pop("special", None)
            added_token.pop("id", None)
            if tokenizer_json["model"]["type"] != "Unigram" and not special:
                continue
            if special_tokens_map is not None and added_token["content"] in special_tokens_map:
                added_token["content"] = special_tokens_map[added_token["content"]]
            special_tokens.append(AddedToken(**added_token))
        if new_special_tokens is not None:
            special_tokens.extend(new_special_tokens)

        # The trainer needs to know the BPE end-of-word / continuing-subword affixes.
        if (
            tokenizer_json["model"]["type"] == "BPE"
            and "continuing_subword_prefix" not in kwargs
            and tokenizer_json["model"]["continuing_subword_prefix"] is not None
        ):
            kwargs["continuing_subword_prefix"] = tokenizer_json["model"]["continuing_subword_prefix"]
        if (
            tokenizer_json["model"]["type"] == "BPE"
            and "end_of_word_suffix" not in kwargs
            and tokenizer_json["model"]["end_of_word_suffix"] is not None
        ):
            kwargs["end_of_word_suffix"] = tokenizer_json["model"]["end_of_word_suffix"]
        if tokenizer_json["model"]["type"] == "Unigram" and unk_token is not None:
            kwargs["unk_token"] = unk_token
        if tokenizer_json["pre_tokenizer"] is not None:
            if tokenizer_json["pre_tokenizer"]["type"] == "ByteLevel" or (
                tokenizer_json["pre_tokenizer"]["type"] == "Sequence"
                and "pretokenizers" in tokenizer_json["pre_tokenizer"]
                and any(
                    pretokenizer["type"] == "ByteLevel"
                    for pretokenizer in tokenizer_json["pre_tokenizer"]["pretokenizers"]
                )
            ):
                kwargs["initial_alphabet"] = pre_tokenizers_fast.ByteLevel.alphabet()

        trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]["type"]]
        trainer = trainer_class(vocab_size=vocab_size, special_tokens=special_tokens, **kwargs)
        tokenizer.train_from_iterator(text_iterator, length=length, trainer=trainer)

        if post_processor is not None:
            trained_tokenizer_json = json.loads(tokenizer.to_str())
            # Adjust the token IDs stored inside the post processor to the new vocabulary.
            if "special_tokens" in post_processor:
                for key in post_processor["special_tokens"]:
                    tokens = post_processor["special_tokens"][key]["tokens"]
                    if special_tokens_map is not None:
                        tokens = [special_tokens_map.get(token, token) for token in tokens]
                    post_processor["special_tokens"][key]["tokens"] = tokens
                    for token in tokens:
                        token_id = tokenizer.token_to_id(token)
                        if token_id is None:
                            raise ValueError(
                                "Attempted to set a token in the post processor that does not exist in the mapping"
                            )
                    post_processor["special_tokens"][key]["ids"] = [tokenizer.token_to_id(token) for token in tokens]

            for special_token in ("cls", "sep"):
                if special_token in post_processor:
                    token, _ = post_processor[special_token]
                    if special_tokens_map is not None and token in special_tokens_map:
                        token = special_tokens_map[token]
                    token_id = tokenizer.token_to_id(token)
                    if token_id is None:
                        raise ValueError(
                            "Attempted to set a token in the post processor that does not exist in the mapping"
                        )
                    post_processor[special_token] = [token, token_id]

            trained_tokenizer_json["post_processor"] = post_processor
            tokenizer = TokenizerFast.from_str(json.dumps(trained_tokenizer_json))

        kwargs = self.init_kwargs.copy()
        # Carry the named special tokens over to the new tokenizer.
        special_tokens_list = self.SPECIAL_TOKENS_ATTRIBUTES.copy()
        special_tokens_list.remove("additional_special_tokens")
        for token in special_tokens_list:
            if getattr(self, token) is not None:
                special_token = getattr(self, token)
                if special_tokens_map is not None and special_token in special_tokens_map:
                    special_token = special_tokens_map[special_token]
                special_token_full = self._special_tokens_map.get(token, None)
                if isinstance(special_token_full, AddedToken):
                    kwargs[token] = AddedToken(
                        special_token,
                        single_word=special_token_full.single_word,
                        lstrip=special_token_full.lstrip,
                        rstrip=special_token_full.rstrip,
                        normalized=special_token_full.normalized,
                        special=True,
                    )
                else:
                    kwargs[token] = special_token

        additional_special_tokens = self.additional_special_tokens
        if new_special_tokens is not None:
            additional_special_tokens.extend(new_special_tokens)
        if len(additional_special_tokens) > 0:
            kwargs["additional_special_tokens"] = additional_special_tokens

        kwargs["tokenizer_object"] = tokenizer
        try:
            return self.__class__(**kwargs)
        except TypeError as e:
            # Retry path recovered from the artifact's exception handler: some subclasses take
            # `tokenizer_object` positionally, which makes the keyword redundant.
            if "multiple values for keyword argument 'tokenizer_object'" in str(e):
                kwargs.pop("tokenizer_object", None)
                new_tokenizer = self.__class__(**kwargs)
                new_tokenizer._tokenizer = tokenizer
                return new_tokenizer
            raise
    @classmethod
    def _patch_mistral_regex(
        cls,
        tokenizer,
        pretrained_model_name_or_path,
        cache_dir=None,
        force_download=False,
        local_files_only=False,
        token=None,
        is_local=False,
        fix_mistral_regex=None,
        init_kwargs=None,
        **kwargs,
    ):
        """
        Patches mistral related tokenizers with incorrect regex if detected
            1) Local file with an associated config saved next to it
                >> Model type one of the mistral models (on older versions)
            2) Remote models on the hub from official mistral models
                >> Tags including `base_model:.*mistralai`
        """
        # NOTE: body reconstructed from a corrupted compiled dump; the detection guards are
        # condensed and may differ in detail from the original.
        import re
        from functools import lru_cache

        from huggingface_hub import model_info
        from packaging import version

        @lru_cache(maxsize=256)
        def is_base_mistral(model_id: str) -> bool:
            try:
                info = model_info(model_id)
            except Exception:
                return False
            return info.tags is not None and bool(re.search("base_model:.*mistralai", " ".join(info.tags)))

        if not local_files_only and is_offline_mode():
            local_files_only = True

        mistral_config_detected = False
        if fix_mistral_regex is None or fix_mistral_regex:
            # 1) a config.json saved next to the tokenizer (or cached from the Hub) on an old version
            _config_file = cached_file(
                pretrained_model_name_or_path,
                "config.json",
                cache_dir=cache_dir,
                force_download=force_download,
                local_files_only=local_files_only,
                token=token,
                _raise_exceptions_for_missing_entries=False,
                _raise_exceptions_for_connection_errors=False,
                _commit_hash=kwargs.get("_commit_hash"),
            )
            if _config_file is not None:
                with open(_config_file, encoding="utf-8") as f_config:
                    config = json.load(f_config)
                transformers_model_type = config.get("model_type")
                transformers_version = config.get("transformers_version")
                if transformers_model_type in ("mistral", "mistral3", "voxtral", "ministral", "pixtral") and (
                    transformers_version is None
                    or version.parse(transformers_version) < version.parse("5.0.0")
                ):
                    mistral_config_detected = True
            # 2) official mistral models on the Hub
            elif not is_local and is_base_mistral(str(pretrained_model_name_or_path)):
                mistral_config_detected = True

        if not mistral_config_detected:
            return tokenizer

        if not fix_mistral_regex:
            if fix_mistral_regex is None and not getattr(tokenizer, "fix_mistral_regex", False):
                logger.warning_once(
                    f"The tokenizer you are loading from '{pretrained_model_name_or_path}' with an incorrect regex "
                    "pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84"
                    "#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the "
                    "`fix_mistral_regex=True` flag when loading this tokenizer to fix this issue."
                )
            return tokenizer

        import tokenizers

        # Corrected split pattern (recovered verbatim from the artifact's constant pool).
        split_pretokenizer = tokenizers.pre_tokenizers.Split(
            pattern=tokenizers.Regex(
                r"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+"
                r"|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*"
                r"|\p{N}"
                r"| ?[^\s\p{L}\p{N}]+[\r\n/]*"
                r"|\s*[\r\n]+"
                r"|\s+(?!\S)"
                r"|\s+"
            ),
            behavior="isolated",
        )
        current_pretokenizer = tokenizer.pre_tokenizer
        if isinstance(current_pretokenizer, tokenizers.pre_tokenizers.Split):
            # Replace the faulty split pattern outright.
            tokenizer.pre_tokenizer = split_pretokenizer
        else:
            # Keep the rest of the pipeline (e.g. a ByteLevel step) behind the corrected split.
            byte_level = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False)
            tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([split_pretokenizer, byte_level])
        return tokenizer


# Backwards-compatible module-level alias (the compiled constants bind the class under its
# historical name as well).
PreTrainedTokenizerFast = TokenizersBackend
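# Hedged usage sketch for `train_new_from_iterator` (corpus and size illustrative):
#
#     corpus = (batch for batch in [["low lower lowest"], ["new newer newest"]])
#     new_tok = tok.train_new_from_iterator(corpus, vocab_size=5000)
#     new_tok.save_pretrained("./retrained-tokenizer")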