
    Z jb1                    \   S r SSKrSSKJr  SSKJr  SSKJr  SSKJ	r	J
r
JrJrJrJrJr  SSKJrJrJr  SSKJr  S	S
KJrJrJrJr  S	SKJr  \R8                  " \5      r/ SQr\/ SQ-   r SS jr!S\"S\#4S jr$SS\\#   S-  4S jjr% " S S5      r& " S S\&5      r'S\#S\"4S jr( " S S5      r) " S S\)5      r* " S S\)5      r+ " S  S!\)5      r, " S" S#\)5      r- " S$ S%\)5      r. " S& S'\)5      r/ " S( S)\)5      r0 " S* S+\)5      r1 " S, S-\)5      r2 " S. S/\)5      r3 " S0 S1\)5      r4 " S2 S3\)5      r5 " S4 S5\55      r6 " S6 S7\55      r7 " S8 S9\55      r8 " S: S;\55      r9 " S< S=\55      r: " S> S?\55      r; " S@ SA\55      r< " SB SC\55      r= " SD SE\55      r> " SF SG\55      r? " SH SI\55      r@ " SJ SK\55      rA " SL SM\55      rB " SN SO\55      rC " SP SQ\55      rD " SR SS\55      rE " ST SU\)5      rF " SV SW\55      rG " SX SY\)5      rH " SZ S[\)5      rI " S\ S]\)5      rJ " S^ S_\55      rK " S` Sa\55      rL " Sb Sc\55      rM " Sd Se\)5      rN " Sf Sg\55      rO " Sh Si\55      rP " Sj Sk\55      rQSl rR " Sm Sn5      rS " So Sp5      rT0 Sq\6_Sr\2_Ss\7_St\*_Su\G_Sv\J_Sw\8_Sx\H_Sy\/_Sz\*_S{\4_S|\9_S}\*_S~\*_S\*_S\*_S\*_0 S\6_S\,_S\/_S\0_S\*_S\*_S\2_S\>_S\2_S\2_S\*_S\N_S\:_S\;_S\-_S\*_S\2_E0 S\<_S\._S\C_S\1_S\@_S\A_S\2_S\3_S\=_S\*_S\D_S\E_S\F_S\>_S\?_S\+_S\K_E\M\M\L\MS.ErUSS\4S jjrVg)z
Utilities to convert slow tokenizers in their fast tokenizers counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow to make our dependency on SentencePiece optional.
    N)
Collection)	lru_cache)version)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPEUnigram	WordPiece)tqdm   )is_protobuf_availableis_sentencepiece_availableloggingrequires_backends)PROTOBUF_IMPORT_ERROR)ar_ARcs_CZde_DEen_XXes_XXet_EEfi_FIfr_XXgu_INhi_INit_ITja_XXkk_KZko_KRlt_LTlv_LVmy_MMne_NPnl_XXro_ROru_RUsi_LKtr_TRvi_VNzh_CN)af_ZAaz_AZbn_INfa_IRhe_ILhr_HRid_IDka_GEkm_KHmk_MKml_INmn_MNmr_INpl_PLps_AFpt_XXsv_SEsw_KEta_INte_INth_THtl_XXuk_UAur_PKxh_ZAgl_ESsl_SIc                 8   [        5       (       a  SSKJn  U$ [        5       (       aV  SS Kn[
        R                  " UR                  R                  5      [
        R                  " S5      :  a  SSK	Jn  U$ SSK	J
n  U$ [        [        R                  " U 5      5      e)Nr   )sentencepiece_model_pb2z4.0.0)sentencepiece_model_pb2_new)r   sentencepiecerL   r   google.protobufr   parseprotobuf__version__transformers.utilsrM   ImportErrorr   format)error_messagerL   googles      t/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/convert_slow_tokenizer.pyimport_protobufrY   _   sr    !##9&&==445g8NNB '& b&&/66}EFF    add_prefix_spacereturnc                 H    U (       a  Sn[        USS5      (       d  SnU$ SnU$ )NalwayslegacyTfirstnever)getattr)r[   original_tokenizerprepend_schemes      rX   _get_prepend_schemere   p   s4    !)8T::$N  !rZ   skip_tokensc                   ^  Ub  [        U5      O	[        5       nUS LnU(       a  [        U5      OT n/ nUR                  5        H  u  pVXR;   a  M  / n[        S[	        U5      5       H:  nUS U XXS  pX;   d  X;   a  M  U	T ;   d  M  U
T ;   d  M'  UR                  XU45        M<     [        UU 4S jS9nUR                  U5        M     [        US US9nU Vs/ s H  oS   US   4PM     nnU$ s  snf )Nr   c                 $   > TU S      TU S      4$ Nr   r    )xvocabs    rX   <lambda>!generate_merges.<locals>.<lambda>   s    U1Q4[%!+,FrZ   keyc                 B    U S   [        U S   5      [        U S   5      4$ )N   r   r   )lenvals    rX   rm   rn      s    SVSQ[#c!f+,NrZ   rp   reverser   )setdictitemsrangers   appendsortedextend)rl   vocab_scoresrf   rw   mergesmergepiece_scorelocalindexpiece_lpiece_rru   s   `           rX   generate_mergesr   z   s   &1&=#k"35K$&G)04%eLF*0021c%j)E$Ve}eFmW%)?%Gu$4g<= * u"FGe 3 F NX_`F*01&31vs1v&F1M 2s   C/c                   N    \ rS rSrSrS\4S jrS\\\\	4   \
\   4   4S jrSrg)	SentencePieceExtractor   zd
Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
modelc                     [        U S5        [        U S5        [        5       nUR                  5       n[        US5       nUR	                  UR                  5       5        S S S 5        X0l        g ! , (       d  f       N= f)NrN   rQ   rb)r   rY   
ModelProtoopenParseFromStringreadproto)selfr   	model_pb2mfs        rX   __init__SentencePieceExtractor.__init__   sa    $0$
+ $%	  "%!affh' 
 s    A..
A<r\   c                    U R                   R                  R                    Uc0  SSKJnJn  U R                   R                  R                  S:X  a  UOUnU R                   R                   Vs/ s H  oUR                  UR                  4PM     nnUR                  S:w  a(  U R                   R                  R                  US'   XbS'   O:SSKJn  [        U5       VV	V
s0 s H
  u  nu  pX_M     nn	nn
U" U5      nXbS'   XS	'   [        U R                   R                  5       VVs/ s H2  u  pUR                  S
;   d  M  XR                  UR                  S:H  4PM4     nnn[        US S9 VVVs/ s H  u  pn[!        USUS9PM     snnnUS'   [#        U R                   R$                  SS5      US'   U$ s  snf s  sn
n	nf s  snnf s  snnnf )
By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
order the merges with respect to the piece scores instead.
Nr   )r   r   r   r   unk_idrl   )r   r         r   c                     U S   $ Nr   rj   rk   s    rX   rm   0SentencePieceExtractor.extract.<locals>.<lambda>   s    QqTrZ   ro   F
normalizedspecialadditional_special_tokensprecompiled_charsmap_spm_precompiled_charsmap)r   trainer_specr   tokenizers.modelsr   r   
model_typepiecespiecescore__name__tokenization_utils_baser   	enumeratetyper}   r   rb   normalizer_spec)r   r   kwargsr   r   r   rl   r   iwordr   r   idpspm_added_tokenstokenr   s                    rX   extractSentencePieceExtractor.extract   s   
 	

&&6$(JJ$;$;$F$F!$KQTJ9=9J9JK9J++u{{+9JK%'#zz66==F8#7O@5>u5EF5E!1MTTW5EEF$U+F#7O%8 ENdjjN_N_D`uD`52dedjdjntdt6R!&&A+6D`u '--=>&R/
&R"7 u@&R/
*+ /6djj6P6PRhjn.o*+/ L G v/
s   ,!F4F9#G =G .Gr   N)r   
__module____qualname____firstlineno____doc__strr   tuplery   intlistr   __static_attributes__rj   rZ   rX   r   r      s5    
c 
!uT#s(^T%[5P/Q !rZ   r   c                   @    \ rS rSrSS\\\\4   \\   4   4S jjr	Sr
g)GemmaSentencePieceExtractor   Nr\   c                     U R                   n[        UR                  5       5       Vs0 s H  o2R                  U5      U_M     nnSU;  a  UR	                  S5      US'   [        XA5      nXE4$ s  snf )r   	<0x09>)spr{   GetPieceSizeid_to_piecegetr   )r   r   r   r   rl   r   s         rX   r   #GemmaSentencePieceExtractor.extract   so    
 WW;@AR;ST;S%&-;ST u))H-E$K 5} Us   A-rj   N)r   r   r   r   r   ry   r   r   r   r   r   rj   rZ   rX   r   r      s)    E$sCx.$u+2M,N  rZ   r   r   c                 z    [        U 5      S:  =(       d'    U S   S:g  =(       d    U S   R                  5       (       + $ )Nrr   ,)rs   isdigit)r   s    rX   check_number_commar      s3    u:>HU2Y#-HU2Y5F5F5H1HHrZ   c                   (    \ rS rSrS rS\4S jrSrg)	Converter   c                     Xl         g r   rc   )r   rc   s     rX   r   Converter.__init__   s    "4rZ   r\   c                     [        5       er   )NotImplementedErrorr   s    rX   	convertedConverter.converted   s    !##rZ   r   N)r   r   r   r   r   r   r   r   rj   rZ   rX   r   r      s    5$9 $rZ   r   c                   "    \ rS rSrS\4S jrSrg)BertConverter   r\   c           	      b   U R                   R                  n[        [        U[	        U R                   R
                  5      S95      nSnSnSn[        U R                   S5      (       a`  U R                   R                  R                  nU R                   R                  R                  nU R                   R                  R                  n[        R                  " SUUUS9Ul        [        R                  " 5       Ul        [	        U R                   R"                  5      n[	        U R                   R$                  5      nU R                   R&                  nU R                   R(                  n	[*        R,                  " U SU S3U SU SU S	3Xh4Xy4/S
9Ul        [0        R                  " SS9Ul        U$ )N	unk_tokenFbasic_tokenizerT
clean_texthandle_chinese_charsstrip_accents	lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixrc   rl   r   r   r   r   hasattrr   tokenize_chinese_charsr   do_lower_caser
   BertNormalizer
normalizerr   BertPreTokenizerpre_tokenizer	cls_token	sep_tokencls_token_idsep_token_idr   TemplateProcessingpost_processorr	   decoder
r   rl   	tokenizerr   r   r   clssepr  r  s
             rX   r   BertConverter.converted      ''--iT=T=T=^=^9_`a	!&4**,=>>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5##$
	  %..d;	rZ   rj   Nr   r   r   r   r   r   r   rj   rZ   rX   r   r          #9 #rZ   r   c                   "    \ rS rSrS\4S jrSrg)SplinterConverteri  r\   c           
      v   U R                   R                  n[        [        U[	        U R                   R
                  5      S95      nSnSnSn[        U R                   S5      (       a`  U R                   R                  R                  nU R                   R                  R                  nU R                   R                  R                  n[        R                  " SUUUS9Ul        [        R                  " 5       Ul        [	        U R                   R"                  5      n[	        U R                   R$                  5      n[	        U R                   R&                  5      nSn	U R                   R(                  n
U R                   R*                  nU R                   R,                  nU R                   R/                  S5      nU R                   R0                  S:X  a  U SU S	U	 S	U S
U S3
nOU SU S
U S	U	 S	U S3
n[2        R4                  " U SU S3UXj4X{4X4X4/S9Ul        [8        R                  " SS9Ul        U$ )Nr   Fr   Tr   .rightr    r   r   r   r   r   r   )rc   rl   r   r   r   r   r   r   r   r   r   r
   r   r   r   r  r  r  r  question_tokenr  r  question_token_idconvert_tokens_to_idspadding_sider   r  r  r	   r	  )r   rl   r  r   r   r   r  r  questiondotr  r  r  dot_token_idr   s                  rX   r   SplinterConverter.converted  s   ''--iT=T=T=^=^9_`a	!&4**,=>>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334t..==>..;;..;; 33EE..DDSI""//7:U(8*AcU!C5RHDU(3%xz3%qRHD#-#@#@U(3%r*##-#		$
	  %..d;	rZ   rj   Nr  rj   rZ   rX   r  r    s    .9 .rZ   r  c                   "    \ rS rSrS\4S jrSrg)FunnelConverteri=  r\   c           	      b   U R                   R                  n[        [        U[	        U R                   R
                  5      S95      nSnSnSn[        U R                   S5      (       a`  U R                   R                  R                  nU R                   R                  R                  nU R                   R                  R                  n[        R                  " SUUUS9Ul        [        R                  " 5       Ul        [	        U R                   R"                  5      n[	        U R                   R$                  5      nU R                   R&                  nU R                   R(                  n	[*        R,                  " U SU S3U SU SU S	3Xh4Xy4/S
9Ul        [0        R                  " SS9Ul        U$ )Nr   Fr   Tr   z:2 $A:0 r   r   r   r   r   r   r   r
  s
             rX   r   FunnelConverter.converted>  r  rZ   rj   Nr  rj   rZ   rX   r!  r!  =  r  rZ   r!  c                   "    \ rS rSrS\4S jrSrg)MPNetConverterid  r\   c                 h   U R                   R                  n[        [        U[	        U R                   R
                  5      S95      nSnSnSn[        U R                   S5      (       a`  U R                   R                  R                  nU R                   R                  R                  nU R                   R                  R                  n[        R                  " SUUUS9Ul        [        R                  " 5       Ul        [	        U R                   R"                  5      n[	        U R                   R$                  5      nU R                   R&                  nU R                   R(                  n	[*        R,                  " U SU S3U SU SU S	U S
3Xh4Xy4/S9Ul        [0        R                  " SS9Ul        U$ )Nr   Fr   Tr   r   r   z:0 r   r   r   r   r   r   r
  s
             rX   r   MPNetConverter.convertede  s   ''--iT=T=T=^=^9_`a	!&4**,=>>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5SXcU"=##$
	  %..d;	rZ   rj   Nr  rj   rZ   rX   r%  r%  d  r  rZ   r%  c                   "    \ rS rSrS\4S jrSrg)OpenAIGPTConverteri  r\   c                    U R                   R                  n[        U R                   R                  R	                  5       5      nU R                   R
                  n[        [        UUS [        U5      SSS95      nUR                  [        U5      5      b  UR                  [        U5      /5        [        R                  " SS9Ul        [        R                  " 5       Ul        ["        R$                  " SS9Ul        U$ )N</w>F)rl   r   dropoutr   end_of_word_suffixfuse_unkT)r   suffix)rc   encoderr   	bpe_rankskeysr   r   r   r   token_to_idadd_special_tokensr
   r   r   r   r  r  r	   
BPEDecoderr	  r   rl   r   r   r  s        rX   r   OpenAIGPTConverter.converted  s    ''//d--77<<>?++55	i.#)	
	   Y0<((#i.)9:*99DI	"0"A"A"C	$//v>	rZ   rj   Nr  rj   rZ   rX   r)  r)    s    9 rZ   r)  c                   T    \ rS rSrSS\\\4   S-  S\\\\4      S-  S\	4S jjr
Srg)	GPT2Converteri  Nrl   r   r\   c                 Z   U(       d  U R                   R                  nU(       d  [        U R                   R                  5      n[	        [        UUS SSSS95      n[        U R                   SS5      n[        R                  " US9Ul	        [        R                  " 5       Ul        [        U R                   SS5      (       aQ  U R                   R                  nU R                   R                  n[        R                  " U S3U S3XV4/S	9Ul        U$ [        R                  " SS
9Ul        U$ )N Frl   r   r,  continuing_subword_prefixr-  r.  r[   r[   add_bos_tokenz:0 $A:0z:0 $A:0 $B:1r   trim_offsets)rc   r1  r   r2  r   r   rb   r   	ByteLevelr  r	   r	  	bos_tokenbos_token_idr   r  r  )r   rl   r   r  r[   bosrE  s          rX   r   GPT2Converter.converted  s   ++33E$11;;<F*,#%	
	 #4#:#:<NPUV"0":":L\"]	$..0	4**OUCC))33C22??L'1'D'DguL)' (I$  (2';';'OI$rZ   rj   NNr   r   r   r   ry   r   r   r   r   r   r   r   rj   rZ   rX   r:  r:    sE    "tCH~4 "T%PSUXPX/EZ]aEa "mv " "rZ   r:  c                   "    \ rS rSrS\4S jrSrg)HerbertConverteri  r\   c           
      ~   SnSnU R                   R                  n[        U R                   R                  R	                  5       5      nXS   S   ;   a  USS  n[        [        UUS U R                   R                  US95      n[        R                  " SSS9Ul
        [        R                  " 5       Ul        [        R                  " US9Ul        ["        R$                  " U R                   R&                  U R                   R(                  4U R                   R*                  U R                   R,                  4S	9Ul        U$ )
Nz	#version:r+  r   r   )r,  r   r-  F)r   r   r/  )r  r  )rc   r1  r   r2  r3  r   r   r   r
   r   r   r   r  r  r	   r6  r	  r   BertProcessingr  r  r  r  r  )r   tokenizer_info_strtoken_suffixrl   r   r  s         rX   r   HerbertConverter.converted  s   (''//d--77<<>?1-ABZF11;;#/
	  +99EY^_	"0"A"A"C	$//|D	#-#<#<((22D4K4K4X4XY((22D4K4K4X4XY$
	 
 rZ   rj   Nr  rj   rZ   rX   rK  rK        9 rZ   rK  c                   T    \ rS rSrSS\\\4   S-  S\\\\4      S-  S\	4S jjr
Srg)	Qwen2Converteri  Nrl   r   r\   c                 8   U(       d  U R                   R                  nU(       d-  [        U R                   R                  R	                  5       5      n[        [        UUS S SSSSS95      n[        R                  " 5       Ul	        [        R                  " [        R                  " [        S5      SSS9[        R                  " [        U R                   SS5      SS9/5      Ul        ["        R                  " 5       Ul        [&        R                  " SS	9Ul        U$ )
Nr<  F)rl   r   r,  r   r>  r-  r.  byte_fallbackzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+isolatedbehaviorinvertr[   r[   	use_regexrA  )rc   r1  r   r2  r3  r   r   r
   NFCr   r   SequenceSplitr   rC  rb   r  r	   r	  r   r  )r   rl   r   r  s       rX   r   Qwen2Converter.converted  s   ++33E$11;;@@BCF*,#%#	
	  +0	"0"9"9$$ N (  ((%,T-D-DFXZ_%`##
	  %..0	#-#7#7U#K	 rZ   rj   rH  rI  rj   rZ   rX   rS  rS    sE    (tCH~4 (T%PSUXPX/EZ]aEa (mv ( (rZ   rS  c                   "    \ rS rSrS\4S jrSrg)RobertaConverteri  r\   c                    U R                   nUR                  n[        UR                  R	                  5       5      n[        [        UUS SSSS95      n[        R                  " UR                  S9Ul
        [        R                  " 5       Ul        [        R                  " UR                  UR                   4UR"                  UR$                  4UR                  SS9Ul        U$ )Nr<  Fr=  r?  Tr  r  r[   rB  )rc   r1  r   r2  r3  r   r   r   rC  r[   r  r	   r	  r   RobertaProcessingr  r  r  r  r  r   otrl   r   r  s        rX   r   RobertaConverter.converted  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#?#?r/r/00	$
	  rZ   rj   Nr  rj   rZ   rX   ra  ra        9 rZ   ra  c                   "    \ rS rSrS\4S jrSrg)RoFormerConverteri4  r\   c           	      J   SSK Jn  U R                  R                  n[	        [        U[        U R                  R                  5      S95      nSnSn[        U R                  S5      (       a@  U R                  R                  R                  nU R                  R                  R                  n[        R                  " SSUUS9Ul        [        R                   R#                  U" U5      5      Ul        [        U R                  R&                  5      n[        U R                  R(                  5      nU R                  R*                  nU R                  R,                  n	[.        R0                  " U SU S	3U SU S
U S3Xh4Xy4/S9Ul        [4        R
                  " SS9Ul        U$ )Nr   )JiebaPreTokenizerr   Fr   Tr   r   r   r   r   r   r   r   )"models.roformer.tokenization_utilsrl  rc   rl   r   r   r   r   r   r   r   r   r
   r   r   r   PreTokenizercustomr  r  r  r  r  r   r  r  r	   r	  )
r   rl  rl   r  r   r   r  r  r  r  s
             rX   r   RoFormerConverter.converted5  sx   I''--iT=T=T=^=^9_`a	4**,=>> 33CCQQM 33CCQQM*99!&'#	 
	 #1"="="D"DEVW\E]"^	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5##$
	  %..d;	rZ   rj   Nr  rj   rZ   rX   rj  rj  4  r  rZ   rj  c                   "    \ rS rSrS\4S jrSrg)DebertaConverteri[  r\   c                    U R                   nUR                  n[        UR                  R	                  5       5      n[        [        UUS SSSS95      n[        R                  " UR                  S9Ul
        [        R                  " 5       Ul        [        R                  " SSSU R                   R                  S5      4SU R                   R                  S5      4/S	9Ul        U$ )
Nr<  Fr=  r?  [CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]r   )rc   r1  r   r2  r3  r   r   r   rC  r[   r  r	   r	  r   r  r  r  re  s        rX   r   DebertaConverter.converted\  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#@#@)4$11GGPQ$11GGPQ$
	  rZ   rj   Nr  rj   rZ   rX   rr  rr  [  rQ  rZ   rr  c                      ^  \ rS rSrSr\r0 r\SS j5       r	\
SS j5       rU 4S jrS rS rS rS	 rS
 rS rS rS\4S jrSrU =r$ )SpmConverteriz  Fc                    U R                   R                  nU R                   R                  nU R                  R                  n[        U[        5      (       a!  [        [        UU=(       d    / USUSS95      nOe[        U[        5      (       aO  U(       aH  [        US   [        [        -  5      (       a)  [        [        UU R                   R                  US95      nOg[        R                  " SS5      /nU(       a&  UR                  S[        R                   " U5      5        [        R"                  " U5      Ul        U(       a[  [&        R"                  " [&        R                  " SS5      [&        R(                  " 5       [&        R*                  " 5       /5      Ul        U$ [&        R"                  " [&        R                  " SS5      /5      Ul        U$ )z
Similar to convert_from_spm method, but used only when there is no `model_type` class, i.e. there is no matching class in `TOKENIZERS_MAPPING` and we just create a tokenizer instead of extracting stuff from the sentencepiece file
TN)rl   r   r   r.  rU  r,  r   )rl   r   rU  r     ▁)r   rU  	unk_piecer   r   
isinstancery   r   r   r   r   r   r   r
   ReplaceinsertPrecompiledr]  r   r	   ByteFallbackFuser	  )r   rl   r   rU  r}  r   r  _normalizerss           rX   build_tokenizer_from_spm_proto+SpmConverter.build_tokenizer_from_spm_proto  s{   
 **88&&00	$44II eT""!!<R'!"/ 	I t$$:eAhPT3U3U! --44"/I  $++C78;#:#:;O#PQ*33LA	  ( 1 1!!%-x/D/D/FX!I  !) 1 183C3CE33O2P QIrZ   c                     Ub  XS'   U$ )z
Hook used when converting directly from a SentencePiece model without a slow tokenizer instance.
By default, return kwargs unchanged.
rl   rj   )r  rl   r   s      rX   convert_from_spmSpmConverter.convert_from_spm  s     #7OrZ   c                   > [        U S5        [        TU ]  " U6   [        5       nUR	                  5       n[        U R                  R                  S5       nUR                  UR                  5       5        S S S 5        X0l
        U R                  R                  R                  (       a)  U R                  (       d  [        R                  " S5        g g g ! , (       d  f       Nc= f)NrQ   r   a  The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)r   superr   rY   r   r   rc   
vocab_filer   r   r   r   rU  handle_byte_fallbackwarningswarn)r   argsr   r   r   	__class__s        rX   r   SpmConverter.__init__  s    $
+$ $%	  "$))44d;qaffh' <
::""009R9RMMe :S0	 <;s    C
C c                 p    UR                    Vs/ s H  o"R                  UR                  4PM     sn$ s  snf r   r   r   r   r   r   r   s      rX   rl   SpmConverter.vocab  s)    8=Euekk*EEEs   !3c                 .    UR                   R                  $ r   )r   r   r   r   s     rX   r   SpmConverter.unk_id  s    !!(((rZ   c                    UR                   R                  nU R                  U5      nUS:X  a.  [        [	        UU R                  U5      U R                  S95      nOUS:X  a  U R                  U R                  R                  5      R                  U5      u  pV[        U5       VVV	s0 s H
  u  nu  pX_M     n
nnn	[        [        U
UUR                   R                  SU R                  S S95      nO[        S5      e[        UR                  5       VVs/ s HR  u  pUR                   S;   d  M  XR"                  UR                   S:H  =(       d    UR"                  U R$                  ;   4PMT     nnnUR'                  [)        US	 S
9 VVVs/ s H  u  pn[+        USUS9PM     snnn5        U$ s  sn	nnf s  snnf s  snnnf )Nr   r   rU  rr   Tr   r.  rU  r,  z]You're trying to run a `Unigram` model but you're file was trained with a different algorithmr   r   c                     U S   $ r   rj   r   s    rX   rm   (SpmConverter.tokenizer.<locals>.<lambda>       QRSTQUrZ   ro   Fr   )r   r   rl   r   r   r   r  SpmExtractorrc   r  r   r   r   r}  	Exceptionr   r   r   r   
add_tokensr}   r   )r   r   r   r   r  _r   r   r   r   	bpe_vocabr   r   r   r   r   s                   rX   r  SpmConverter.tokenizer  s   ''22
zz%(?! ;;u-"&";";I 1_))$*A*A*L*LMUUVbcIA9B<9PQ9P%5Q9PIQ!#00::!"&";"; 	I o  #5<<0
0vv IR!&&A+GD4G4G)GH0 	 

 	 +11A~*V*V&Bw 5UGD*V	
 C R*
s   'F.F5/?F5F;c                 .   UR                   R                  n[        R                  " SSS9[        R                  " [        S5      S5      /nU(       d  [        R                  " U5      $ [        R                  " [        R                  " U5      /U-   5      $ )NFT)leftr   {2,}r|  )r   r   r
   Stripr  r   r]  r  r   r   r   r  s       rX   r   SpmConverter.normalizer  s{    $44II55g6
 $''55'')@)@AU)V(WZf(fggrZ   c                 T    [        X R                  5      n[        R                  " XS9$ Nreplacementrd   )re   rc   r   	Metaspacer   r  r[   rd   s       rX   r  SpmConverter.pre_tokenizer  s$    ,-=?V?VW''K__rZ   c                     g r   rj   r   s    rX   r  SpmConverter.post_processor  s    rZ   c                 T    [        X R                  5      n[        R                  " XS9$ r  )re   rc   r	   r  r  s       rX   r	  SpmConverter.decoder  s$    ,-=?V?VW!!kYYrZ   r\   c                    U R                  U R                  5      nU R                  U R                  5      nUb  X!l        SnSn[        U R                  S5      (       a  U R                  R
                  nU R                  X45      nUb  XQl        U R                  X45      Ul        U R                  5       nU(       a  Xal        U$ )Nr|  Tr[   )	r  r   r   r   rc   r[   r  r	  r  )r   r  r   r  r[   r  r  s          rX   r   SpmConverter.converted  s    NN4::.	 __TZZ0
!#- 4**,>??#66GG**;I$&3# LLG	,,.'5$rZ   r   r   )r   r   r   r   r  r   r  r   staticmethodr  classmethodr  r   rl   r   r  r   r  r  r	  r   r   r   __classcell__)r  s   @rX   rz  rz  z  sx     )LN- -^  *F)0d	h`Z9  rZ   rz  c                   &    \ rS rSrS rS rS rSrg)AlbertConverteri5  c                     UR                    Vs/ s HP  n[        UR                  5      (       a  UR                  UR                  4OUR                  UR                  S-
  4PMR     sn$ s  snf Nd   r   r   r   r   r  s      rX   rl   AlbertConverter.vocab6  f     
% +=U[[*I*IU[[%++&PUP[P[]b]h]hkn]nOoo%
 	
 
   AA)c                    [         R                  " SS5      [         R                  " SS5      /nU R                  R                  (       dH  UR	                  [         R
                  " 5       5        UR	                  [         R                  " 5       5        U R                  R                  (       a$  UR	                  [         R                  " 5       5        UR                  R                  nU(       a%  UR	                  [         R                  " U5      5        UR	                  [         R                  " [        S5      S5      5        [         R                  " U5      $ Nz``"z''r  r  r
   r  rc   keep_accentsr|   NFKDStripAccentsr   	Lowercaser   r   r  r   r]  r   r   list_normalizersr   s       rX   r   AlbertConverter.normalizer<      c*c*
 &&33##K$4$4$67##K$<$<$>?""00##K$9$9$;<$44II##K$;$;<P$QR 3 3E'NC HI##$455rZ   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ Nrt  ru  rv  rw  r   r   r  rc   r  r   s    rX   r  AlbertConverter.post_processorO  R    ,,)4$11GGPQ$11GGPQ
 	
rZ   rj   Nr   r   r   r   rl   r   r  r   rj   rZ   rX   r  r  5      
6&
rZ   r  c                        \ rS rSrS rS rSrg)BarthezConverteriZ  c                 
    SnU$ Nr   rj   r   r   r   s      rX   r   BarthezConverter.unk_id[      rZ   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ Nz<s> $A </s>z<s> $A </s> </s> $B </s><s></s>r   r  r   s    rX   r  BarthezConverter.post_processor_  R    ,, +//EEeLM00FFvNO
 	
rZ   rj   N)r   r   r   r   r   r  r   rj   rZ   rX   r  r  Z  s    
rZ   r  c                   :    \ rS rSrS rS rS r\SS j5       rSr	g)	CamembertConverterij  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nUS/-  nU$ s  snf )N)z
<s>NOTUSED        <pad>r  z</s>NOTUSEDr  <unk>r  )<unk>NOTUSEDir   <mask>r  r  r   r   rl   r   s       rX   rl   CamembertConverter.vocabk  sR    
 	,,qr:JK:J;;,:JKK/"" L   !Ac                     gr  rj   r  s     rX   r   CamembertConverter.unk_idx  s    rZ   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ r  r  r   s    rX   r  !CamembertConverter.post_processor|  r  rZ   Nc                 ,   [        UR                  SS5      5      n[        UR                  SS5      5      n[        UR                  SS5      5      nSUS4S	US4S
/nUb  UR                  [        U5      SS  5        UR	                  US45        XbS'   U$ )N	pad_tokenr  r   r  
mask_tokenr  r  r  r  )r        Yr   rl   r   r   r~   r   r|   )r  rl   r   r  r   r  
vocab_lists          rX   r  #CamembertConverter.convert_from_spm  s    

;89	

;89	L(;<
   $

 d5k!"o.:s+,$wrZ   rj   r   
r   r   r   r   rl   r   r  r  r  r   rj   rZ   rX   r  r  j  s%    
  rZ   r  c                   &    \ rS rSrS rS rS rSrg)DebertaV2Converteri  c                    / nU R                   R                  (       a#  UR                  [        R                  " SS95        [        X R                   5      nUR                  [        R                  " XS95        [        R                  " U5      $ )NrV  )rX  r  )rc   split_by_punctr|   r   Punctuationre   r  r]  )r   r  r[   list_pretokenizersrd   s        rX   r   DebertaV2Converter.pre_tokenizer  sl    ""11%%n&@&@*&UV,-=?V?VW!!.":":{"rs&&'9::rZ   c                    / nU R                   R                  (       a$  UR                  [        R                  " 5       5        UR                  [        R
                  " 5       5        UR                  R                  nU(       a%  UR                  [        R                  " U5      5        UR                  [        R                  " [        S5      S5      5        [        R                  " U5      $ )Nr  r  )rc   r   r|   r
   r  r  r   r   r  r  r   r]  r  s       rX   r   DebertaV2Converter.normalizer  s    ""00##K$9$9$;< 1 1 34$44II##K$;$;<P$QR 3 3E'NC HI##$455rZ   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ r  r  r   s    rX   r  !DebertaV2Converter.post_processor  r  rZ   rj   N)r   r   r   r   r  r   r  r   rj   rZ   rX   r  r    s    ;6
rZ   r  c                   :    \ rS rSrS rS rS r\SS j5       rSr	g)	MBartConverteri  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nU/ SQ-  nUS/-  nU$ s  snf )Nr  r  r  r  r  r  r   )r   r  r   r  r   r  r   r  r   r  r   r  r   r  r   r  r   r  r    r  r!   r  r"   r  r#   r  r$   r  r%   r  r&   r  r'   r  r(   r  r)   r  r*   r  r+   r  r,   r  r-   r  r.   r  r/   r  r  r  r  s       rX   rl   MBartConverter.vocab  sc    
 	,,qr:JK:J;;,:JKK 
 	
6 	/""; L   !Ac                     gr  rj   r  s     rX   r   MBartConverter.unk_id      rZ   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ )Nz$A </s> en_XXz$A $B </s> en_XXr   r  r   r  r   s    rX   r  MBartConverter.post_processor  R    ,,"#$11GGPQ00FFvNO
 	
rZ   Nc                    [        UR                  SS5      5      n[        UR                  SS5      5      n[        UR                  SS5      5      n[        UR                  SS5      5      n[        UR                  S	S
5      5      nUS4US4US4US4/nUb  UR                  [        U5      SS  5        UR                  S [         5       5        UR                  US45        XS'   U$ )NrD  r  r  r  	eos_tokenr  r   r  r  r  r  r   c              3   (   #    U  H  oS 4v   M
     g7fr  Nrj   .0	lang_codes     rX   	<genexpr>2MBartConverter.convert_from_spm.<locals>.<genexpr>  s     LOyc*O   rl   )r   r   r~   r   MBART_LANGUAGESr|   	r  rl   r   rD  r  r4  r   r  r  s	            rX   r  MBartConverter.convert_from_spm  s    

;67	

;89	

;78	

;89	L(;<
 	

 d5k!"o.LOLL:s+,$wrZ   rj   r   r   rj   rZ   rX   r  r    s&    $L
  rZ   r  c                   :    \ rS rSrS rS rS r\SS j5       rSr	g)	MBart50Converteri  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nU/ SQ-  nUS/-  nU$ s  snf )Nr  r   )4r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r!  r"  r#  r$  r%  r&  r'  r(  r)  r*  )r0   r  )r1   r  )r2   r  )r3   r  )r4   r  )r5   r  )r6   r  )r7   r  )r8   r  )r9   r  )r:   r  )r;   r  )r<   r  )r=   r  )r>   r  )r?   r  )r@   r  )rA   r  )rB   r  )rC   r  )rD   r  )rE   r  )rF   r  )rG   r  )rH   r  )rI   r  )rJ   r  r  r  r  s       rX   rl   MBart50Converter.vocab  sc    
 	,,qr:JK:J;;,:JKK  R  	R/"" Lr,  c                     gr  rj   r  s     rX   r   MBart50Converter.unk_id  r/  rZ   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ )Nzen_XX $A </s>zen_XX $A $B </s>r   r  r   r  r   s    rX   r  MBart50Converter.post_processor  r2  rZ   Nc                    [        UR                  SS5      5      n[        UR                  SS5      5      n[        UR                  SS5      5      n[        UR                  SS5      5      n[        UR                  S	S
5      5      nUS4US4US4US4/nUb  UR                  [        U5      SS  5        UR                  S [         5       5        UR                  US45        XS'   U$ )Nr  r  r  r  r4  r  r   r  r  r  r  r   c              3   (   #    U  H  oS 4v   M
     g7fr6  rj   r7  s     rX   r:  4MBart50Converter.convert_from_spm.<locals>.<genexpr>/  s     N<Myc*<Mr<  rl   )r   r   r~   r   MBART50_LANGUAGESr|   )	r  rl   r   r  r  r4  r   r  r  s	            rX   r  !MBart50Converter.convert_from_spm  s    

;67	

;89	

;78	

;89	L(;<
 	

 d5k!"o.N<MNN:s+,$wrZ   rj   r   r   rj   rZ   rX   rA  rA    s%    

  rZ   rA  c                   :    \ rS rSrS rS rS r\SS j5       rSr	g)	NllbConverteri5  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nU$ s  snf )Nr  r   r  r  s       rX   rl   NllbConverter.vocab6  E    
 	,,qr:JK:J;;,:JKK L   !>c                     gr  rj   r  s     rX   r   NllbConverter.unk_id@  r/  rZ   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ )Nzeng_Latn $A </s>zeng_Latn $A $B </s>eng_Latnr  r   r  r   s    rX   r  NllbConverter.post_processorC  sR    ,,%&T44JJ:VW00FFvNO
 	
rZ   Nc                    [        UR                  SS5      5      n[        UR                  SS5      5      n[        UR                  SS5      5      n[        UR                  SS5      5      nUS	US
USUS0nUbY  [        U[        5      (       a  UR	                  5       OU VV	s/ s H  u  pUPM	     sn	nn
U
 H  nX;   a  M
  [        U5      X{'   M     XrS'   U$ s  sn	nf )NrD  r  r  r  r4  r  r   r  r   r   rr   r   rl   )r   r   r~  ry   r3  rs   )r  rl   r   rD  r  r4  r   reordered_vocabtokr  tokensr   s               rX   r  NllbConverter.convert_from_spmM  s    

;67	

;89	

;78	

;89	 qqqq	
 %/t%<%<UZZ\UZB[UZ633UZB[F+),_)=&   *w C\s   $Crj   r   r   rj   rZ   rX   rN  rN  5  s%    
  rZ   rN  c                   &    \ rS rSrS rS rS rSrg)SeamlessM4TConverterid  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nU$ s  snf )N)r  r  r  r  r   r  r  s       rX   rl   SeamlessM4TConverter.vocabe  rQ  rR  c                 .    U R                   R                  $ r   )rc   unk_token_idr  s     rX   r   SeamlessM4TConverter.unk_ido  s    &&333rZ   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ )Nz__eng__ $A </s>z__eng__ $A $B </s>__eng__r  r   r  r   s    rX   r  #SeamlessM4TConverter.post_processorr  sR    ,,$%D33II)TU00FFvNO
 	
rZ   rj   Nr   r   r   r   rl   r   r  r   rj   rZ   rX   r^  r^  d  s    4
rZ   r^  c                   :    \ rS rSrS rS rS r\SS j5       rSr	g)	XLMRobertaConverteri}  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nUS/-  nU$ s  snf )Nr  r   r  r  r  s       rX   rl   XLMRobertaConverter.vocab~  sR    
 	,,qr:JK:J;;,:JKK/"" Lr  c                 
    SnU$ r  rj   r  s      rX   r   XLMRobertaConverter.unk_id  r  rZ   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ r  r  r   s    rX   r  "XLMRobertaConverter.post_processor  r  rZ   Nc                    [        UR                  SS5      5      n[        UR                  SS5      5      n[        UR                  SS5      5      n[        UR                  SS5      5      n[        UR                  S	S
5      5      nUS4US4US4US4/nUb  UR                  [        U5      SS  5        UR	                  US45        XS'   U$ )NrD  r  r  r  r4  r  r   r  r  r  r  r   rl   r  r>  s	            rX   r  $XLMRobertaConverter.convert_from_spm  s    

;67	

;89	

;78	

;89	L(;<
 	

 d5k!"o.:s+,$wrZ   rj   r   r   rj   rZ   rX   ri  ri  }  s%    	
  rZ   ri  c                   &    \ rS rSrS rS rS rSrg)XLNetConverteri  c                     UR                    Vs/ s HP  n[        UR                  5      (       a  UR                  UR                  4OUR                  UR                  S-
  4PMR     sn$ s  snf r  r  r  s      rX   rl   XLNetConverter.vocab  r  r  c                    [         R                  " SS5      [         R                  " SS5      /nU R                  R                  (       dH  UR	                  [         R
                  " 5       5        UR	                  [         R                  " 5       5        U R                  R                  (       a$  UR	                  [         R                  " 5       5        UR                  R                  nU(       a%  UR	                  [         R                  " U5      5        UR	                  [         R                  " [        S5      S5      5        [         R                  " U5      $ r  r  r  s       rX   r   XLNetConverter.normalizer  r  rZ   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ )Nz$A:0 <sep>:0 <cls>:2z!$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2z<sep>z<cls>r   r  r   s    rX   r  XLNetConverter.post_processor  r  rZ   rj   Nr  rj   rZ   rX   rs  rs    r  rZ   rs  c                       \ rS rSrSrg)ReformerConverteri  rj   Nr   r   r   r   r   rj   rZ   rX   r{  r{        rZ   r{  c                        \ rS rSrS rS rSrg)RemBertConverteri  c                    [         R                  " SS5      [         R                  " SS5      [         R                  " [        S5      S5      /nU R                  R                  (       dH  UR                  [         R                  " 5       5        UR                  [         R                  " 5       5        U R                  R                  (       a$  UR                  [         R                  " 5       5        UR                  R                  nU(       a%  UR                  [         R                  " U5      5        [         R                  " U5      $ r  )r
   r  r   rc   r  r|   r  r  r   r  r   r   r  r]  r  s       rX   r   RemBertConverter.normalizer  s    c*c*g4

 &&33##K$4$4$67##K$<$<$>?""00##K$9$9$;<$44II##K$;$;<P$QR##$455rZ   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ r  r  r   s    rX   r  RemBertConverter.post_processor  r  rZ   rj   N)r   r   r   r   r   r  r   rj   rZ   rX   r  r    s    6&
rZ   r  c                       \ rS rSrSrg)BertGenerationConverteri  rj   Nr|  rj   rZ   rX   r  r    r}  rZ   r  c                   @    \ rS rSrS r\S	S j5       rS rS rS r	Sr
g)
PegasusConverteri  c                    U R                   R                  S4U R                   R                  S4/nU R                   R                  b  X R                   R                  S4/-  nU R                   R                  bI  U R                   R
                  U R                   R                  :  a  X R                   R                  S4/-  nU[        SU R                   R                  5       Vs/ s H  nSU S3S4PM     sn-  nX!R                  SS   Vs/ s H  oDR                  UR                  4PM     sn-  nU$ s  snf s  snf )Nr  rr   <unk_>r  )rc   r  r4  mask_token_sentr  mask_token_idoffsetr{   r   r   r   )r   r   rl   r   r   s        rX   rl   PegasusConverter.vocab  s)   $$..4$$..4

 ""22>..>>DEEE ##..:''558O8O8V8VV..993?@@E%4;R;R;Y;Y2Z[2ZQU1#Q<(2Z[[,,qr:JK:J;;,:JKK \Ks   &D6!D;Nc           	      2   [        UR                  SS5      5      n[        UR                  SS5      5      n[        UR                  SS5      5      n[        UR                  SS5      5      nUS	4US	4/nUS
:w  a  UR                  US	45        US
:w  a  Xe:w  a  UR                  US	45        UR                  [	        SUR                  SS5      5       Vs/ s H  nSU S3S4PM     sn5        Ub  UR                  [        U5      SS  5        XrS'   U$ s  snf )Nr  r  r4  r  r  z<mask_1>r  z<mask_2>r  Nonerr   r  g   r  r  r  rl   )r   r   r|   r~   r{   r   )	r  rl   r   r  r4  r  r  r  r   s	            rX   r  !PegasusConverter.convert_from_spm  s   

;89	

;78	L*=>
fjj):JGH 

 z3/0f$)F455FJJxY\D];^_;^aeA3aL&1;^_`d5k!"o.$w	 `s   Dc                 \    UR                   R                  U R                  R                  -   $ r   )r   r   rc   r  r  s     rX   r   PegasusConverter.unk_id#  s%    !!((4+B+B+I+IIIrZ   c                     [        X R                  5      n[        R                  " [        R                  " 5       [        R
                  " XS9/5      $ r  )re   rc   r   r]  WhitespaceSplitr  r  s       rX   r  PegasusConverter.pre_tokenizer&  sE    ,-=?V?VW&&..0(([`
 	
rZ   c                     U R                   R                  nXR                   R                  4/n[        R                  " SU/SSU/US9$ )N$A$Br   )rc   r4  eos_token_idr   r  )r   eosr   s      rX   r  PegasusConverter.post_processor/  sP    %%//))667
 ,,T3KtTSVFWhvwwrZ   rj   r   )r   r   r   r   rl   r  r  r   r  r  r   rj   rZ   rX   r  r    s,    &  *J
xrZ   r  c                   4    \ rS rSrS rS r\SS j5       rSrg)T5Converteri7  c                     U R                   R                  nUR                   Vs/ s H  o3R                  UR                  4PM     nnU[        US-
  SS5       Vs/ s H  nSU S3S4PM     sn-  nU$ s  snf s  snf )Nr   r   
<extra_id_r  r  )rc   
_extra_idsr   r   r   r{   )r   r   num_extra_idsr   rl   r   s         rX   rl   T5Converter.vocab8  s|    //::9>F++u{{+FE-!:KRQS4TU4TqZs!$c*4TUU GUs   !A4A9c                 n    [         R                  " SS// SQSU R                  R                  S5      4/S9$ Nr  r  )r  r  r  r  r   r  r   s    rX   r  T5Converter.post_processor>  =    ,,&>-00FFvNO
 	
rZ   Nc                     UR                  SS5      n[        US-
  SS5       Vs/ s H	  nSU S3PM     nnUb  [        U5      O/ nUR                  S U 5       5        UR	                  SU5        XbS	'   U$ s  snf )
N	extra_idsr  r   r   r  r  c              3   (   #    U  H  oS 4v   M
     g7fr6  rj   )r8  r   s     rX   r:  /T5Converter.convert_from_spm.<locals>.<genexpr>L  s     AL5#,Lr<  r   rl   )r   r{   r   r~   
setdefault)r  rl   r   r  r   extra_tokensr  s          rX   r  T5Converter.convert_from_spmG  s    JJ{C0	38QB3OP3Oa*QCq)3OP$)$5T%[2
ALAA5|D$w Qs   A7rj   r   )	r   r   r   r   rl   r  r  r  r   rj   rZ   rX   r  r  7  s     
  rZ   r  c                       \ rS rSrS rSrg)UdopConverteriS  c                 n    [         R                  " SS// SQSU R                  R                  S5      4/S9$ r  r  r   s    rX   r  UdopConverter.post_processorT  r  rZ   rj   Nr   r   r   r   r  r   rj   rZ   rX   r  r  S  s    
rZ   r  c                   "    \ rS rSrS\4S jrSrg)WhisperConverteri^  r\   c                    U R                   R                  n[        U R                   R                  R	                  5       5      n[        [        UUS SSSS95      n[        R                  " U R                   R                  S9Ul
        [        R                  " 5       Ul        U R                   R                  nU R                   R                  U5      nU R                   R                  nU R                   R                   nSR#                  U Vs/ s H  o S3PM	     sn5      n	[$        R&                  " U	 SU S3U	 SU S	3Xg4/[)        XT5      QS
9Ul        U$ s  snf )Nr<  Fr=  r?  r  r   z $A:0 z $A:0 $B:1 r   r   )rc   r1  r   r2  r3  r   r   r   rC  r[   r  r	   r	  prefix_tokensconvert_ids_to_tokensr4  r  joinr   r  zipr  )
r   rl   r   r  prefix_token_idsprefixesr  r  r   prefix_templates
             rX   r   WhisperConverter.converted_  sO   ''//d--77<<>?*,#%	
	 #1":":DLcLcLtLt"u	$..0	22@@**@@AQR%%//..;;((h#GhUgRLh#GH#-#@#@%&fSE4#$KuB7#X0$
	   $Hs   Erj   Nr  rj   rZ   rX   r  r  ^  s     9  rZ   r  c                       \ rS rSrS rSrg)BigBirdConverteri  c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ r  r  r   s    rX   r  BigBirdConverter.post_processor  r  rZ   rj   Nr  rj   rZ   rX   r  r    s    
rZ   r  c                   "    \ rS rSrS\4S jrSrg)CLIPConverteri  r\   c                 j   U R                   R                  n[        U R                   R                  R	                  5       5      nU R                   R
                  n[        [        UUS SSS[        U5      S95      n[        R                  " [        R                  " 5       [        R                  " [        S5      S5      [        R                  " 5       /5      Ul        [         R                  " [         R"                  " [        S5      SS	S
9[         R$                  " SS9/5      Ul        [(        R$                  " 5       Ul        [,        R.                  " U R                   R0                  U R                   R2                  4U R                   R4                  U R                   R6                  4SSS9Ul        U$ )Nr<  r+  Frl   r   r,  r>  r-  r.  r   z\s+r  z9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedTrW  r?  rc  )rc   r1  r   r2  r3  r   r   r   r   r
   r]  r\  r  r   r  r   r   r^  rC  r  r	   r	  r   rd  r4  r  rD  rE  r  r7  s        rX   r   CLIPConverter.converted  sk   ''//d--77<<>?++55	*,#)i.

	  +33__ 3 3E&M3 GI^I^I`a 
	 #1"9"9$$Z[&
 ((%@	#
	 %..0	 $.#?#?((22D4K4K4X4XY((22D4K4K4X4XY"	$
	  rZ   rj   Nr  rj   rZ   rX   r  r    s    '9 'rZ   r  c                   "    \ rS rSrS\4S jrSrg)LayoutLMv2Converteri  r\   c           	      b   U R                   R                  n[        [        U[	        U R                   R
                  5      S95      nSnSnSn[        U R                   S5      (       a`  U R                   R                  R                  nU R                   R                  R                  nU R                   R                  R                  n[        R                  " SUUUS9Ul        [        R                  " 5       Ul        [	        U R                   R"                  5      n[	        U R                   R$                  5      nU R                   R&                  nU R                   R(                  n	[*        R,                  " U SU S3U SU SU S	3Xh4Xy4/S
9Ul        [0        R                  " SS9Ul        U$ )Nr   FTr   r   r   r   r   r   r   r   r   r   r
  s
             rX   r   LayoutLMv2Converter.converted  s   ''--iT=T=T=^=^9_`a	!&4**,=>>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5##$
	  %..d;	rZ   rj   Nr  rj   rZ   rX   r  r    r  rZ   r  c                   "    \ rS rSrS\4S jrSrg)BlenderbotConverteri  r\   c                    U R                   nUR                  n[        UR                  R	                  5       5      n[        [        UUS SSSS95      n[        R                  " UR                  S9Ul
        [        R                  " 5       Ul        [        R                  " SUR                   S3UR                  UR                   4/S9Ul        U$ )Nr<  Fr=  r?  z$A:0 r   )r   r   )rc   r1  r   r2  r3  r   r   r   rC  r[   r  r	   r	  r   r  r4  r  r  re  s        rX   r   BlenderbotConverter.converted  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#@#@2<<.+r/$
	  rZ   rj   Nr  rj   rZ   rX   r  r    rh  rZ   r  c                   &    \ rS rSrS rS rS rSrg)XGLMConverteri  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nU/ SQ-  nU$ s  snf )Nr  r   ))z<madeupword0>r  )z<madeupword1>r  )z<madeupword2>r  )z<madeupword3>r  )z<madeupword4>r  )z<madeupword5>r  )z<madeupword6>r  r  r  s       rX   rl   XGLMConverter.vocab  sV    
 	,,qr:JK:J;;,:JKK  z  	z Ls   !Ac                 
    SnU$ r  rj   r  s      rX   r   XGLMConverter.unk_id	  r  rZ   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ )Nz</s> $Az</s> $A </s> </s> $Br  r  r   r  r   s    rX   r  XGLMConverter.post_processor  sR    ,,'//EEeLM00FFvNO
 	
rZ   rj   Nrg  rj   rZ   rX   r  r    s    	
rZ   r  c                   D    \ rS rSrSr\rSS1r S rS r	S r
S rS	 rS
rg)GemmaConverteri  Tz<start_of_turn>z<end_of_turn>c                 0    [         R                  " SS5      $ Nr  r|  )r
   r  r  s     rX   r   GemmaConverter.normalizer(  s    ""3..rZ   c                    U R                   R                  S4U R                   R                  S4U R                   R                  S4/nX!R                  SS   Vs/ s H  o3R
                  UR                  4PM     sn-  n[        S U 5       5      (       d#  [        S [        U5       5       S 5      nUb  SX$'   U$ s  snf )Nr  r   c              3   0   #    U  H  oS    S:H  v   M     g7f)r   r   Nrj   )r8  rk   s     rX   r:  'GemmaConverter.vocab.<locals>.<genexpr>4  s     /AQ44<s   c              3   @   #    U  H  u  pUS    S:X  d  M  Uv   M     g7f)r   r   Nrj   )r8  r   rk   s      rX   r:  r  5  s!     "V1AQqTXEU111As   	)r   r  )
rc   r  r4  rD  r   r   r   anynextr   )r   r   rl   r   override_indexs        rX   rl   GemmaConverter.vocab+  s    $$..4$$..4$$..4

 	,,qr:JK:J;;,:JKK ////!"V51A"VX\]N)(3% Ls   !B;c                 0    [         R                  " SS5      $ )Nr  merged_with_previous)r   r^  r   r  r[   s      rX   r  GemmaConverter.pre_tokenizer;  s    ##C)?@@rZ   c                 
    SnU$ r  rj   r  s      rX   r   GemmaConverter.unk_id>  r  rZ   c                     [         R                  " [         R                  " SS5      [         R                  " 5       [         R                  " 5       /5      $ )Nr|  r  )r	   r]  r  r  r  r  s      rX   r	  GemmaConverter.decoderB  s?        ,%%'
 	
rZ   rj   N)r   r   r   r   r  r   r  r   r   rl   r  r   r	  r   rj   rZ   rX   r  r    s6    .L'9N/ A
rZ   r  c                   <    \ rS rSrSrS rS rS rS rS r	S r
S	rg
)LlamaConverteriL  Tc                 *   U R                   R                  S5      S4U R                   R                  S5      S4U R                   R                  S5      S4/nX!R                  SS   Vs/ s H  o3R                  UR                  4PM     sn-  nU$ s  snf )Nr   r  r   rr   r   )rc   r  r   r   r   r  s       rX   rl   LlamaConverter.vocabO  s    $$::1=sC$$::1=sC$$::1=sC

 	,,qr:JK:J;;,:JKK Ls   (!Bc                 
    SnU$ r   rj   r  s      rX   r   LlamaConverter.unk_idX  r  rZ   c                     [         R                  " SS5      [         R                  " 5       [         R                  " 5       /nU(       a  U[         R                  " SSS9/-  n[         R
                  " U5      $ Nr|  r  r   )contentr  r	   r  r  r  r  r]  r   r  r[   sequences       rX   r	  LlamaConverter.decoder\  \    UC(!!#MMO

 !<==H  **rZ   c                    [        U R                  SS5      (       ae  / n[        U R                  SS5      (       a  U[        R                  " SS9/-  nU[        R                  " SSS9/-  n[        R
                  " U5      $ g )Nr_   Tr[   r|  )prependr  )patternr  )rb   rc   r
   Prependr  r]  )r   r   r  s      rX   r   LlamaConverter.normalizerf  sx    4**Hd;;Ht..0BDII[00?@@,,S%HIIH''11rZ   c                     [        U R                  SS5      (       d*  [        X R                  5      n[        R                  " XSS9$ g )Nr_   TFr  rd   split)rb   rc   re   r   r  r  s       rX   r  LlamaConverter.pre_tokenizero  s?    t..$??01ACZCZ[N!++jopprZ   c                     g r   rj   r   s    rX   r  LlamaConverter.post_processoru  s    rZ   rj   N)r   r   r   r   r  rl   r   r	  r   r  r  r   rj   rZ   rX   r  r  L  s&    +rZ   r  c                   "    \ rS rSrS\4S jrSrg)MarkupLMConverteriz  r\   c                 z   U R                   nUR                  n[        UR                  R	                  5       5      n[        [        UUS SSSU R                   R                  S95      n[        R                  " UR                  S9Ul        [        R                  " 5       Ul        [        U R                   R                  5      n[        U R                   R                   5      nU R                   R"                  nU R                   R$                  n[&        R(                  " U SU 3U SU SU 3XW4Xh4/S9Ul        U$ )Nr<  Fr  r?  z $A z $B r   )rc   r1  r   r2  r3  r   r   r   r   rC  r[   r  r	   r	  r   r  r  r  r  r   r  r  )	r   rf  rl   r   r  r  r  r  r  s	            rX   r   MarkupLMConverter.converted{  s(   $$

bll'')**,#%11;;

	 #1":":BL_L_"`	$..0	$))334$))334..;;..;;#-#@#@U$se$5SEcU+##$
	  rZ   rj   Nr  rj   rZ   rX   r	  r	  z  s    "9 "rZ   r	  c                   0    \ rS rSrSrS rS rS rS rSr	g)	MoshiConverteri  Tc                    [        U S5        [        R                  X5        [        5       nUR	                  5       n[        US5       nUR                  UR                  5       5        S S S 5        X@l        g ! , (       d  f       N= fNrQ   r   	r   r   r   rY   r   r   r   r   r   r   r  r   r   r   r   s         rX   r   MoshiConverter.__init__  se    $
+4, $%	  "*d#qaffh' $
 $#    A77
Bc                     UR                   R                  n[        R                  " SS5      /nU(       d  [        R                  " U5      $ [        R                  " [        R
                  " U5      /U-   5      $ r  )r   r   r
   r  r]  r  r  s       rX   r   MoshiConverter.normalizer  sg    $44IIU+
 $''55'')@)@AU)V(WZf(fggrZ   c                     [         R                  " SS5      [         R                  " 5       [         R                  " 5       /nU(       a  U[         R                  " SSS9/-  n[         R
                  " U5      $ r  r  r  s       rX   r	  MoshiConverter.decoder  r  rZ   c                 0    Sn[         R                  " XSS9$ )Nr`   Fr  )r   r  r  s       rX   r  MoshiConverter.pre_tokenizer  s     ''KfkllrZ   r   N)
r   r   r   r   r  r   r   r	  r  r   rj   rZ   rX   r  r    s    h+mrZ   r  c                   L    \ rS rSrSrSS jrS rS rS rS r	S	 r
S
 rS rSrg)HeliumConverteri  TNc                    [        U S5        [        R                  X5        [        5       nUR	                  5       n[        US5       nUR                  UR                  5       5        S S S 5        X@l        g ! , (       d  f       N= fr  r  r  s         rX   r   HeliumConverter.__init__  sc    $
+4,#%	  "*d#qaffh' $
 $#r  c                 R   U R                  U5      n[        [        UU R                  U5      U R                  S95      n[        UR                  5       VVs/ s HR  u  pEUR                  S;   d  M  XER                  UR                  S:H  =(       d    UR                  U R                  ;   4PMT     nnnUR                  [        US S9 VVVs/ s H  u  pGn[        USUSS9PM     snnn5        UR                  [        S	SSS
9/5        UR                  SSS9  U$ s  snnf s  snnnf )Nr  r   r   c                     U S   $ r   rj   r   s    rX   rm   +HeliumConverter.tokenizer.<locals>.<lambda>  r  rZ   ro   FT)r   r   single_word
r   r  )r  pad_id)rl   r   r   r   r  r   r   r   r   r   r  r}   r   enable_padding)	r   r   r   r  r   r   r   r   r   s	            rX   r  HeliumConverter.tokenizer  s    zz%({{5)"77
	 #5<<0
0vv IR!&&A+GD4G4G)GH0 	 

 	 +11A~*V*V&Bw 5UGQUV*V	
 	j%OPQ  71 =
s   D1?DD"c                     / nUR                    HB  nUR                  S:X  a  USUR                  4/-  nM'  X#R                  UR                  4/-  nMD     U$ )Nz<0x0A>r"  r  r  s       rX   rl   HeliumConverter.vocab  sV    \\E{{h&4-..;;455	 "
 rZ   c                 
    SnU$ r   rj   r  s      rX   r   HeliumConverter.unk_id  r  rZ   c                     [         R                  " SS5      [         R                  " 5       [         R                  " 5       /nU[         R                  " SSS9/-  n[         R
                  " U5      $ r  r  r  s       rX   r	  HeliumConverter.decoder  sY    UC(!!#MMO

 	X^^Ca899  **rZ   c                     [         R                  " [         R                  " S5      [         R                  " SS5      /5      $ r  )r
   r]  r   r  r  s     rX   r   HeliumConverter.normalizer
  s2    ##[%8%8%={?R?RSWY^?_$`aarZ   c                 Z    [         R                  " [         R                  " SS5      /5      $ )Nr"  
contiguous)r   r]  r^  r  s      rX   r  HeliumConverter.pre_tokenizer  s#    &&(<(<T<(P'QRRrZ   c                 8    [         R                  " SS// SQS/S9$ )Nr  r  )r  r  r  r  )r  r   r   )r   r  r   s    rX   r  HeliumConverter.post_processor  s/    ,, 
 	
rZ   r   r   )r   r   r   r   r  r   r  rl   r   r	  r   r  r  r   rj   rZ   rX   r  r    s2    
8+bS
rZ   r  c                   (    \ rS rSrSrSS jrS rSrg)ParakeetConverteri"  TNc                    Xl         [        U S5        [        R                  X5        [	        5       nUR                  5       n[        US5       nUR                  UR                  5       5        S S S 5        X@l	        g ! , (       d  f       N= fr  )
r  r   r   r   rY   r   r   r   r   r   )r   r  r  r   r   r   s         rX   r   ParakeetConverter.__init__%  sh    $$
+4,#%	  "*d#qaffh' $
 $#s    A==
Bc                    U R                  U5      nU R                  U R                  5      R                  U5      u  p4[	        U5       VVVs0 s H
  u  nu  pgXe_M     nnnn[        [        UUUR                  R                  SU R                  S S95      n	[	        UR                  5       V
Vs/ s HR  u  pUR                  S;   d  M  XR                  UR                  S:H  =(       d    UR                  U R                  ;   4PMT     nn
nU	R                  [        US S9 V
VVs/ s H  u  pn[!        USUS9PM     snnn
5        U	$ s  snnnf s  snn
f s  snnn
f )	NTr  r   r   c                     U S   $ r   rj   r   s    rX   rm   -ParakeetConverter.tokenizer.<locals>.<lambda>K  r  rZ   ro   Fr   )rl   r  r  r   r   r   r   r   r}  r  r   r   r   r   r  r}   r   )r   r   r   r  r   r   r   r   r  r  r   r   r   r   r   s                  rX   r  ParakeetConverter.tokenizer2  sG   zz%(%%doo6>>|L	5>|5LM5L!1MTTW5L	M,,66"77	
	 #5<<0
0vv IR!&&A+GD4G4G)GH0 	 

 	 +11A~*V*V&Bw 5UGD*V	
 3 N
s   E/E	?E%E)r   r  r   )r   r   r   r   r  r   r  r   rj   rZ   rX   r4  r4  "  s    rZ   r4  c            	         [        [        [        S5      [        S5      S-   5      5      [        [        [        S5      [        S5      S-   5      5      -   [        [        [        S5      [        S5      S-   5      5      -   n U SS nS	n[        S
5       H4  nX0;  d  M
  U R                  U5        UR                  S
U-   5        US-  nM6     U Vs/ s H  n[	        U5      PM     nn[        [        X5      5      $ s  snf )a  
Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
characters the bpe code barfs on.

The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
tables between utf-8 bytes and unicode strings.
!~r      ¡   ¬   ®   ÿNr      )r   r{   ordr|   chrry   r  )bscsnbs       rX   bytes_to_unicoderI  R  s     	U3s8SX\*+d5TCIPQM3R.SSVZ[`adeiajloptluxyly[zV{{  
AB	A4[;IIaLIIdQhFA	 
 	"Q#a&"B	B 
s   C:c                   L    \ rS rSrSr    SS jrS\4S jrS rS\	4S	 jr
S
rg)TikTokenConverterij  z
A general tiktoken converter.
Nc                     Xl         X l        X0l        [        U[        5      (       a  UR                  5       U l        g UU l        g r   )r  r  r[   r~  ry   r3  extra_special_tokens)r   r  r  r[   rM  r   s         rX   r   TikTokenConverter.__init__o  sA     % 0+56JD+Q+Q %%' 	!Wk 	!rZ   tiktoken_urlc                 >  ^^  SSK Jn  U" U5      m[	        5       mU4S jn/ n0 nTR                  5        H  u  pgXuU" U5      '   [        U5      S:X  a  M   / n[        S[        U5      5       H8  n	US U	 XiS  pU
T;   d  M  UT;   d  M  X-   T;   d  M%  UR                  XU45        M:     [        UU4S jSS9nUR                  U5        M     [        US	 SS9nU Vs/ s H  o" US   5      U" US   5      4PM     nnXT4$ ! [         a    [        S5      ef = fs  snf )
Nr   )load_tiktoken_bpezY`tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`.c           	         > SR                  U R                  S5       Vs/ s H  nT[        U5         PM     sn5      $ s  snf Nr<  zlatin-1r  decoderC  rH  charbyte_encoders     rX   token_bytes_to_stringPTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string  s8    77@ST@SLT3@STUUT   ?r   c                 $   > TU S      TU S      4$ ri   rj   )rk   r2  s    rX   rm   CTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>  s    1Q4)AaD/0RrZ   Frv   c                     U S   $ Nrr   rj   rt   s    rX   rm   r]        ArZ   )tiktoken.loadrQ  r  
ValueErrorrI  rz   rs   r{   r|   r}   r~   )r   rO  rQ  rY  r   rl   r   rankr   r   r   r   ru   r2  rX  s                @@rX   extract_vocab_merges_from_model1TikTokenConverter.extract_vocab_merges_from_model~  sC   	7 &l3	')	V $??,KE26'./5zQEq#e*-#(%=%-i'Gy,@gFW\eEeLL'D!9: . 5&R\abEMM%  - $6F\bc\bUX(Q02GA2OP\bc}5  	k 	2 ds   D DDc                     U R                  U R                  5      u  p[        [        XSS95      n[	        UR
                  S5      (       a  SUR
                  l        U$ NF)r.  ignore_mergesTrd  r  r   r   r   r   rh  r   r   r   r  s       rX   r  TikTokenConverter.tokenizer  M    #CCDOOTc,GH	9??O44,0IOO)rZ   r\   c                    U R                  5       n[        R                  " [        R                  " [	        U R
                  5      SSS9[        R                  " U R                  SS9/5      Ul        [        R                  " 5       Ul
        U R                  b5  UR                  U R                   Vs/ s H  n[        USSS9PM     sn5        [        R                  " SS9Ul        U$ s  snf )NrV  FrW  rZ  Tr   rA  )r  r   r]  r^  r   r  rC  r[   r  r	   r	  rM  r5  r   r   r  )r   r  r   s      rX   r   TikTokenConverter.converted  s    NN$	"0"9"9$$U4<<%8:V[\(($:O:O[`a#
	 %..0	$$0((PTPiPijPiuEeTBPij $.#7#7U#K	  ks   2C))r[   rM  r  r  Nzs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+FN)r   r   r   r   r   r   r   rd  r  r   r   r   rj   rZ   rX   rK  rK  j  s:      K!
C >9 rZ   rK  c                   H    \ rS rSr    S
S jrS\4S jrS rS\4S jr	S	r
g)MistralConverteri  Nc                     Xl         X l        X0l        [        U[        5      (       a  UR                  5       U l        g UU l        g r   )r  r  r[   r~  ry   r3  r   )r   r  r  r[   r   r   s         rX   r   MistralConverter.__init__  sG     % 0 3T:: &**, 	& + 	&rZ   rO  c                 ,  ^^ SS K nSS Kn[        U R                  SSS9 nUR	                  U5      nS S S 5        WS   S   U l        US    Vs/ s H  n[        US   US	   S
9PM     snU l        US   n[        5       m[        U4S j5       n/ n	0 n
[        U R                  5       H  u  pXUR                  '   M     U Vs/ s H  obR                  US   5      PM     nn[        U5      n[        U5       VVs0 s H  u  pX_M	     snnm[        [        USS95       H  u  pXU" U5      '   [        U5      S:X  a  M   / n[!        S[        U5      5       H<  nUS U UUS  nnUU;   d  M  UU;   d  M  UU-   U;   d  M(  UR#                  UUU45        M>     [%        UU4S jSS9nU	R'                  U5        M     [%        U	S SS9n	U	 Vs/ s H  nU" US   5      U" US   5      4PM     n	nX4$ ! , (       d  f       GN= fs  snf s  snf s  snnf s  snf )Nr   rzutf-8)encodingconfigr  r   	token_str
is_control)r   rl   c           	         > SR                  U R                  S5       Vs/ s H  nT[        U5         PM     sn5      $ s  snf rS  rT  rV  s     rX   rY  OMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string  s8    77@ST@SLT3@STUUTr[  token_bytesz(Converting tekken.json to tokenizer.json)descr   c                 $   > TU S      TU S      4$ ri   rj   )rk   token_to_ranks    rX   rm   BMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>  s    qt1DmTUVWTXFY0ZrZ   Frv   c                     U S   $ r_  rj   rt   s    rX   rm   r    r`  rZ   )base64jsonr   r  loadr  r   r   rI  r   r   r  	b64decoderx   r   rs   r{   r|   r}   r~   )r   rO  r  r  r   untypedkr2  rY  r   rl   idxr   rank_setrc  r   r   r   r   ru   rX  r  s                       @@rX   rd  0MistralConverter.extract_vocab_merges_from_model  s*   $//39QiilG :x(3IPQaIb*
IbAJq~q?Ib*
& G$	')		V 
	V #D$B$BCJC#&%--  DAJKA%%a&67	Ky>8A)8LM8L8LM$T):d%efKD26'./5zQEq#e*-#(%=%-h&7h+>GgDUZbCbLL'7D!9: . 5&ZdijEMM%  g $6F\bc\bUX(Q02GA2OP\bc}C :9*
 LM ds#   G/HH:H
 H/
G>c                     U R                  U R                  5      u  p[        [        XSS95      n[	        UR
                  S5      (       a  SUR
                  l        U$ rg  ri  rj  s       rX   r  MistralConverter.tokenizer  rl  rZ   r\   c                    U R                  5       n[        R                  " [        R                  " [	        U R
                  5      SSS9[        R                  " U R                  SS9/5      Ul        [        R                  " 5       Ul
        UR                  U R                  5        [        R                  " SS9Ul        U$ )NrV  FrW  rZ  rA  )r  r   r]  r^  r   r  rC  r[   r  r	   r	  r  r   r   r  )r   r  s     rX   r   MistralConverter.converted  s    NN$	"0"9"9$$U4<<%8:V[\(($:O:O[`a#
	 %..0	T;;<#-#7#7U#K	 rZ   )r[   r   r  r  ro  )r   r   r   r   r   r   rd  r  r   r   r   rj   rZ   rX   rq  rq    s6      K"&
"%C %N9 rZ   rq  AlbertTokenizerBartTokenizerBarthezTokenizerBertTokenizerBigBirdTokenizerBlenderbotTokenizerCamembertTokenizerCLIPTokenizerCodeGenTokenizerConvBertTokenizerDebertaTokenizerDebertaV2TokenizerDistilBertTokenizerDPRReaderTokenizerDPRQuestionEncoderTokenizerDPRContextEncoderTokenizerElectraTokenizerFNetTokenizerFunnelTokenizerGPT2TokenizerHerbertTokenizerLayoutLMTokenizerLayoutLMv2TokenizerLayoutLMv3TokenizerLayoutXLMTokenizerLongformerTokenizerLEDTokenizerLxmertTokenizerMarkupLMTokenizerMBartTokenizerMBart50TokenizerMPNetTokenizerMobileBertTokenizerMvpTokenizerNllbTokenizerOpenAIGPTTokenizerPegasusTokenizerQwen2TokenizerReformerTokenizerRemBertTokenizerRobertaTokenizerRoFormerTokenizerSeamlessM4TTokenizerSqueezeBertTokenizerT5TokenizerUdopTokenizerWhisperTokenizerXLMRobertaTokenizerXLNetTokenizerSplinterTokenizerXGLMTokenizer)LlamaTokenizerCodeLlamaTokenizerGemmaTokenizerPhi3Tokenizerc                 <   U R                   R                  nU[        ;   a&  U(       d  [        U   nU" U 5      R                  5       $ U R                  R                  S5      (       a>  X l        [        R                  S5        [        U R                  5      R                  5       $  [        R                  S5        [        U R                  U R                  S9R                  5       $ ! [         a*    [        S[        [        R                  5       5       35      ef = f)a\  
Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

Args:
    transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
        Instance of a slow tokenizer to convert in the backend tokenizer for
        [`~tokenization_utils_base.PreTrainedTokenizerFast`].
   from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece.
        Defaults to False.

Return:
    A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
    [`~tokenization_utils_base.PreTrainedTokenizerFast`]
ztekken.jsonz#Converting from Mistral tekken.jsonzConverting from Tiktoken)r  rM  zConverting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: )r  r   SLOW_TO_FAST_CONVERTERSr   r  endswithrc   loggerinforq  rK  rM  r  rb  r   r3  )transformer_tokenizerfrom_tiktokentokenizer_class_nameconverter_classs       rX   convert_slow_tokenizerr  C  s      1::CC66}12FG45??AA		)	)	2	2=	A	A3H09: 5 @ @AKKMM	KK23$0;;%:%O%O ik  	>>BCZC_C_Ca>b=ce 	s   &A C' '4D)r<  r   )F)Wr   r  collections.abcr   	functoolsr   	packagingr   
tokenizersr   r   r   r	   r
   r   r   r   r   r   r   r   utilsr   r   r   r   utils.import_utilsr   
get_loggerr   r  r=  rK  rY   boolr   re   r   r   r   r   r   r   r  r!  r%  r)  r:  rK  rS  ra  rj  rr  rz  r  r  r  r  r  rA  rN  r^  ri  rs  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r  r  r4  rI  rK  rq  r  r  rj   rZ   rX   <module>r     s    &   f f f 5 5  ` ` 5 
		H	%8 $ '  >G"$ s jo6L 02 2j"8 "Ic Id I$ $$I $N/	 /d$i $N$Y $N 6#I #Ly >)Y )Xy :$	 $Ny >x9 xv"
l "
J
| 
 - -`
 
BG\ GT-| -`,L ,^
< 
2,, ,^"
\ "
J	 	
| 
@	l 	;x| ;x|, 8
L 
!y !H	
| 	
(I (V$) $N) :
L 
61
\ 1
h+\ +\#	 #L&m\ &mRV
l V
r- -`0K K\M M`88%8 (8 ]	8
 (8 .8 ,8 ]8 8 8 (8 ,8 =8 -8 "=8  !-!8" #8$ _%8& '8( ])8* (+8, -8. =/80 +182 -384 +586 $788 }98: *;8< n=8> (?8@ nA8B =C8D $E8F ]G8H ,I8J (K8L nM8N *O8P (Q8R (S8T *U8V 0W8X MY8Z ;[8\ ]]8^ (_8` .a8b nc8d *e8f ]g8h %($#o8 v$) $rZ   