
    Z j9                         S r SSKrSSKrSSKJrJr  \" 5       (       a  SSKrSSKJr  SSK	J
r
  \
R                  " \5      rSS0rS	 r " S
 S5      r " S S\5      rS/rg)z Tokenization classes for CPMAnt.    N)is_rjieba_availablerequires_backends   )PreTrainedTokenizer)logging
vocab_filez	vocab.txtc                     [         R                  " 5       n[        U SSS9 nUR                  5       nSSS5        [	        W5       H  u  pEUR                  S5      nXAU'   M     U$ ! , (       d  f       N9= f)z*Loads a vocabulary file into a dictionary.rutf-8encodingN
)collectionsOrderedDictopen	readlines	enumeraterstrip)r   vocabreadertokensindextokens         /root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/cpmant/tokenization_cpmant.py
load_vocabr   "   sg    ##%E	j#	0F!!# 
1!&)T"e * L 
1	0s   A%%
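
# For example, a vocab file containing the three lines "<unk>", "<s>", "</s>" loads as
# {"<unk>": 0, "<s>": 1, "</s>": 2}: each token's id is simply its line number.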
A3c                   $    \ rS rSrSS jrS rSrg)WordpieceTokenizer-   c                 (    Xl         X l        X0l        g N)r   	unk_tokenmax_input_chars_per_word)selfr   r!   r"   s       r   __init__WordpieceTokenizer.__init__.   s    
"(@%    c                    [        U5      n[        U5      U R                  :  a  U R                  /$ Sn/ nU[        U5      :  a  [        U5      nS nX5:  a1  SR	                  X#U 5      nXpR
                  ;   a  UnOUS-  nX5:  a  M1  Uc!  UR                  U R                  5        US-  nOUR                  U5        UnU[        U5      :  a  M  U$ )Nr       )listlenr"   r!   joinr   append)r#   r   charsstart
sub_tokensend
cur_substrsubstrs           r   tokenizeWordpieceTokenizer.tokenize3   s    Uu:555NN##
c%j e*CJ+S!12ZZ'!'Jq + !!!$..1
!!*- c%j   r&   )r"   r!   r   N)<unk>   )__name__
__module____qualname____firstlineno__r$   r4   __static_attributes__ r&   r   r   r   -   s    A
r&   r   c                     ^  \ rS rSrSr\rSS/rSr         SU 4S jjr	\
S 5       r\
S 5       r\
S	 5       r\
S
\4S j5       rS rS rU 4S jrS rS\\   S
\4S jrS rS rSS\S\S-  S
\\   4S jjrSrU =r$ )CpmAntTokenizerM   a^  
    """
    Construct a CPMAnt tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bod_token (`str`, *optional*, defaults to `"<d>"`):
            The beginning of document token.
        eod_token (`str`, *optional*, defaults to `"</d>"`):
            The end of document token.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        line_token (`str`, *optional*, defaults to `"</n>"`):
            The line token.
        space_token (`str`, *optional*, defaults to `"</_>"`):
            The space token.
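        padding_side (`str`, *optional*, defaults to `"left"`):
            The side on which padding is applied.

    Example (an illustrative sketch; `openbmb/cpm-ant-10b` is the public CPM-Ant
    checkpoint, and the exact pieces produced depend on its vocabulary):

    ```python
    >>> from transformers import CpmAntTokenizer

    >>> tokenizer = CpmAntTokenizer.from_pretrained("openbmb/cpm-ant-10b")
    >>> tokenizer.tokenize("今天天气真好")  # doctest: +SKIP
    ```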
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    add_prefix_space = False

    def __init__(
        self,
        vocab_file,
        bod_token="<d>",
        eod_token="</d>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        unk_token="<unk>",
        line_token="</n>",
        space_token="</_>",
        padding_side="left",
        **kwargs,
    ):
        requires_backends(self, ["rjieba"])
        self.bod_token = bod_token
        self.eod_token = eod_token
        self.encoder = load_vocab(vocab_file)
        # Remap the dedicated space/newline tokens to the literal characters so that
        # lookups during tokenization and decoding work directly on raw text.
        self.encoder[" "] = self.encoder[space_token]
        self.encoder["\n"] = self.encoder[line_token]

        del self.encoder[space_token]
        del self.encoder[line_token]

        self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
        self.decoder = {v: k for k, v in self.encoder.items()}

        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=unk_token)

        super().__init__(
            bod_token=bod_token,
            eod_token=eod_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            unk_token=unk_token,
            line_token=line_token,
            space_token=space_token,
            padding_side=padding_side,
            **kwargs,
        )

        # The space/newline entries were remapped above, so drop the corresponding
        # added tokens that the base class registered from the kwargs.
        for special_token in (space_token, line_token):
            token_id = self.added_tokens_encoder.pop(special_token, None)
            if token_id is not None:
                self._added_tokens_decoder.pop(token_id, None)
        self._update_total_vocab_size()

    @property
    def bod_token_id(self):
        return self.encoder[self.bod_token]

    @property
    def eod_token_id(self):
        return self.encoder[self.eod_token]

    @property
    def newline_id(self):
        return self.encoder["\n"]

    @property
    def vocab_size(self) -> int:
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def _tokenize(self, text):
        """Tokenize a string."""
        output_tokens = []
        for x in rjieba.cut(text, False):
            output_tokens.extend(self.wordpiece_tokenizer.tokenize(x))
        return output_tokens
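
    # Illustrative sketch of the two-stage pipeline above (assuming rjieba is
    # installed): rjieba first segments raw text into words, then each word is
    # split by the greedy WordpieceTokenizer, so e.g.
    #     tokenizer._tokenize("今天天气真好")
    # returns the concatenated wordpiece pieces of every rjieba segment.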

    def _decode(self, token_ids, **kwargs):
        """Decode ids into a string."""
        token_ids = [i for i in token_ids if i >= 0]
        token_ids = [
            x for x in token_ids if x != self.pad_token_id and x != self.eos_token_id and x != self.bos_token_id
        ]
        return super()._decode(token_ids, **kwargs)

    def check(self, token):
        return token in self.encoder

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        return "".join(tokens)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str]:
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
        index = 0
        # Restore the on-disk names for the remapped space and newline entries.
        if " " in self.encoder:
            self.encoder["</_>"] = self.encoder[" "]
            del self.encoder[" "]
        if "\n" in self.encoder:
            self.encoder["</n>"] = self.encoder["\n"]
            del self.encoder["\n"]
        self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in self.encoder.items():
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)
F)rQ   r_   r\   rR   r`   )	z<d>z</d>z<s>z</s>z<pad>r6   r   r   leftr    )r8   r9   r:   r;   __doc__r   vocab_files_namesmodel_input_namesadd_prefix_spacer$   propertyro   rs   rw   intr{   r   r   r   r   r*   strr   r   r   tupler   r<   __classcell__)rk   s   @r   r?   r?   M   s    0 *$&67
 0(d , , , , " " !C ! !?4%tCy S I7c C$J Z_`cZd  r&   r?   )r   r   r   transformers.utilsr   r   rD   tokenization_pythonr   utilsr   


__all__ = ["CpmAntTokenizer"]