
    Z jj                     f    S SK JrJrJrJrJr  S SKJr  S SKJ	r	  S SK
Jr   " S S5      rS\4S jrg	)
    )Regex	Tokenizerdecoderspre_tokenizers
processors)BPE)bytes_to_unicode)PreTrainedTokenizerFastc                   L    \ rS rSrSr    SS jrS\4S jrS rS\	4S	 jr
S
rg)MistralConverter   z
A general tiktoken converter.
Nc                 4    Xl         X l        X0l        X@l        g )N)vocabpatternadd_prefix_spaceadditional_special_tokens)selfr   r   r   r   kwargss         r/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/integrations/mistral.py__init__MistralConverter.__init__   s     
 0)B&    r   c                 4  ^^ Um[        5       mU4S jn/ n0 n[        TR                  5       5       H  u  nu  pVXPR                  ;  a  XAU" U5      '   [	        U5      S:X  a  M2  / n[        S[	        U5      5       H8  nUS U XXS  pU	T;   d  M  U
T;   d  M  X-   T;   d  M%  UR                  XU45        M:     [        UU4S jSS9nUR                  U5        M  XAU'   M     [        US SS9nU Vs/ s H  o" US   5      U" US   5      4PM     nnX4$ s  snf )Nc           	         > SR                  U R                  S5       Vs/ s H  nT[        U5         PM     sn5      $ s  snf )N zlatin-1)joindecodeord)bcharbyte_encoders     r   token_bytes_to_stringOMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string   s8    77@ST@SLT3@STUUTs   ?   c                 $   > TU S      TU S      4$ )Nr   r$    )x	bpe_rankss    r   <lambda>BMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>-   s    Yqt_iPQRSPTo4Vr   F)keyreversec                     U S   $ )N   r&   )vals    r   r)   r*   1   s    Ar   r   )	r	   	enumerateitemsr   lenrangeappendsortedextend)r   r   r"   mergesidxtokenranklocalindexpiece_lpiece_rr/   r(   r!   s               @@r   extract_vocab_merges_from_model0MistralConverter.extract_vocab_merges_from_model   s1   	')	V "+IOO,=">C%:::69+E23u:?"1c%j1E',Ve}eFmW)+90D'J[`iIig%=> 2 u*V`efe$"e #? $6F\bc\bUX(Q02GA2OP\bc} ds   1Dc                     U R                  U R                  5      u  p[        [        XSS95      n[	        UR
                  S5      (       a  SUR
                  l        U$ )NF)fuse_unkignore_mergesT)r?   r   r   r   hasattrmodelrC   )r   vocab_scoresr7   	tokenizers       r   rG   MistralConverter.tokenizer5   sM    #CCDJJOc,GH	9??O44,0IOO)r   returnc                    U R                  5       n[        R                  " [        R                  " [	        U R
                  5      SSS9[        R                  " U R                  SS9/5      Ul        [        R                  " 5       Ul
        UR                  U R                  5        [        R                  " SS9Ul        U$ )NisolatedF)behaviorinvert)r   	use_regex)trim_offsets)rG   r   SequenceSplitr   r   	ByteLevelr   pre_tokenizerr   decoderadd_special_tokensr   r   post_processor)r   rG   s     r   	convertedMistralConverter.converted<   s    NN$	"0"9"9$$U4<<%8:V[\(($:O:O[`a#
	 %..0	$$T%C%CD#-#7#7U#K	 r   )r   r   r   r   )Nzs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+FN)__name__
__module____qualname____firstlineno____doc__r   strr?   rG   r   rW   __static_attributes__r&   r   r   r   r      s;      K"&CS 69 r   r   tokenizer_filec                 >   SSK Jn  SSKJn  UR	                  U 5      nUR
                  R                  R                  n[        UR
                  R                  R                  S S9nU Vs/ s H  ofS   PM	     nn[        U5       VVs0 s H  u  pXh_M	     n	nnU	R                  U5        U	nUR
                  R                  R                  R                  n
[        [        XGU
S9R!                  5       S9nUR#                  S	U05        UR$                  R&                  UR(                  R&                  UR*                  R&                  UR,                  R&                  S
.nUR/                  5        H  u  pX;   d  M  UR#                  X05        M      U$ s  snf s  snnf )z1Convert a "tekken" tokenizer to a fast Tokenizer.r   )SpecialTokens)MistralTokenizerc                     U S   $ )Nr:   r&   )r'   s    r   r)   *convert_tekken_tokenizer.<locals>.<lambda>X   s    mnoumvr   )r+   	token_str)r   r   r   )tokenizer_objectr   )	bos_token	eos_token	pad_token	unk_token)%mistral_common.tokens.tokenizers.baserb   (mistral_common.tokens.tokenizers.mistralrc   	from_fileinstruct_tokenizerrG   _tekken_token2id_nospecialr5   _all_special_tokensr0   update_model_pat_strr
   r   rW   rU   bosvalueeospadunkr1   )r`   rb   rc   mistral_tokenizerr   sorted_tokensr9   all_specialr8   specials_tokensr   rG   
MAP_SPECALspecial_keyspecial_tokens                  r   convert_tekken_tokenizerr   L   s{    DI )22>B 00::UUE,??II]]cvwM3@A=%%=KA4=k4JK4Jjcuz4JOK5!E  22<<CCLLG ()

)+I   "={!KL #&&,,"&&,,"&&,,"&&,,	J '1&6&6&8"'((+)EF '9 A BKs   +F	FN)
tokenizersr   r   r   r   r   tokenizers.modelsr   #transformers.convert_slow_tokenizerr	   *transformers.tokenization_utils_tokenizersr
   r   r^   r   r&   r   r   <module>r      s-    M M ! @ NA AH-S -r   