
    Z j                        S r SSKrSSKrSSKJr  SSKJr  \R                  " \	5      r
SSS.r0 S	S
_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS _S!S"_S#S$_S%S&_S'S(_S)S*_0 S+S,_S-S._S/S0_S1S2_S3S4_S5S6_S7S8_S9S:_S;S<_S=S>_S?S@_SASB_SCSD_SESF_SGSH_SISJ_SKSL_E0 SMSN_SOSP_SQSR_SSST_SUSV_SWSX_SYSZ_S[S\_S]S^_S_S`_SaSb_ScSd_SeSf_SgSh_SiSj_SkSl_SmSn_ESoSpSqSrSs.ErSt r " Su Sv\5      rSv/rg)wz)Tokenization classes for Salesforce CTRL.    N   )PreTrainedTokenizer)loggingz
vocab.jsonz
merges.txt)
vocab_filemerges_file	Pregnancyi Christianityi  Explaini Fitnessi  Savingi  Aski#j  Assiv Jokei~ 	Questionsi6  Thoughtsi  Retailiv  Feminismi Writingi.  Atheismi Netflixi  	Computingiך  Opinioniͨ  Alonei  Funnyi%  Gamingi  Humani  Indiai3  JokeriR- Dietin  LegaliS.  NormaniK  Tipi Weightiw  Moviesi  Runningi[  Sciencei*  Horrori  
Confessioni  Financei/  Politicsi?  Scaryi Supportin1  Technologiesi  Teenageip Eventi  Learnedi Notioni 	Wikipediaiϒ  Booksi	  Extracti) Confessionsi- 
Conspiracyi( Linksi  	NarcissusiK Relationshipi  Relationshipsi iǢ  i  ih  i )ReviewsNewsTranslationmultilingualc                 z    [        5       nU S   nU SS  H  nUR                  X#45        UnM     [        U5      nU$ )zy
Return set of symbol pairs in a word.

Word is represented as tuple of symbols (symbols being variable-length strings).
r      N)setadd)wordpairs	prev_charchars       {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/ctrl/tokenization_ctrl.py	get_pairsrH   [   sH     EEQIQR		9#$	  JEL    c                   l   ^  \ rS rSrSr\r\rSU 4S jjr	\
S 5       rS rS rS rS rS	 rS
 rSrU =r$ )CTRLTokenizerk   a0  
Construct a CTRL tokenizer. Based on Byte-Pair-Encoding.

This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.

Args:
    vocab_file (`str`):
        Path to the vocabulary file.
    merges_file (`str`):
        Path to the merges file.
    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
c           
      ~  > [        USS9 n[        R                  " U5      U l        S S S 5        U R                  R	                  5        VVs0 s H  u  pgXv_M	     snnU l        [        USS9 nUR                  5       R                  S5      SS n	S S S 5        W	 V
s/ s H  n
[        U
R                  5       5      PM     n	n
[        [        U	[        [        U	5      5      5      5      U l        0 U l        SU l        [         TU ]D  " S
USSSS	.UD6  g ! , (       d  f       N= fs  snnf ! , (       d  f       N= fs  sn
f )Nzutf-8)encoding
r@   T	all_zerosnone)	unk_tokentoken_type_ids_pattern%token_type_ids_include_special_tokensspecial_tokens_pattern )openjsonloadencoderitemsdecoderreadsplittupledictziprangelen	bpe_rankscacheadd_bpe_version_headersuper__init__)selfr   r   rS   kwargsvocab_handlekvmerges_handlemergesmerge	__class__s              rG   ri   CTRLTokenizer.__init__   s   *w/<99\2DL 0)-););)=>)=)=>+0M"'')//5a;F 14:;F5%&F;c&%F*<=>
&*# 	
#.26#)		

 	
 0/>00;s#   DD#.#D)#D:
D )
D7c                 ,    [        U R                  5      $ N)rd   r[   rj   s    rG   
vocab_sizeCTRLTokenizer.vocab_size   s    4<<  rI   c                 B    [        U R                  40 U R                  D6$ ru   )ra   r[   added_tokens_encoderrv   s    rG   	get_vocabCTRLTokenizer.get_vocab   s    DLL>D$=$=>>rI   c                 6  ^  UT R                   ;   a  T R                   U   $ [        U5      n[        [        US S 5      US   S-   /-   5      n[        U5      nU(       d  U$  [	        UU 4S jS9nUT R
                  ;  a  OUu  pV/ nSnU[        U5      :  a   UR                  XX5      n	UR                  X(U	 5        U	nX(   U:X  a6  U[        U5      S-
  :  a$  X(S-      U:X  a  UR                  XV-   5        US-  nOUR                  X(   5        US-  nU[        U5      :  a  M  [        U5      nUn[        U5      S:X  a  O[        U5      nM  SR                  U5      nUS S	 nUT R                   U'   U$ ! [         a    UR                  X(S  5         Mq  f = f)
NrP   z</w>c                 N   > TR                   R                  U [        S5      5      $ )Ninf)re   getfloat)pairrj   s    rG   <lambda>#CTRLTokenizer.bpe.<locals>.<lambda>   s    1C1CD%PU,1WrI   )keyr   r@      @@ )rf   r`   listrH   minre   rd   indexextend
ValueErrorappendjoin)
rj   tokenrC   rD   bigramfirstsecondnew_wordijs
   `         rG   bpeCTRLTokenizer.bpe   s   DJJ::e$$U|T$s)_R6(9'::;$L$WXFT^^+"MEHAc$i-

5,A
 OOD1I.A7e#CIM(9dq5kV>SOOEN3FAOODG,FA c$i-  XHD4yA~!$9 : zz$CRy 

5- " OODH-s   E7 7FFc                     / n[         R                  " SU5      nU H;  nUR                  [        U R	                  U5      R                  S5      5      5        M=     U$ )zTokenize a string.z\S+\n? )refindallr   r   r   r_   )rj   textsplit_tokenswordsr   s        rG   	_tokenizeCTRLTokenizer._tokenize   sM    

9d+ETXXe_%:%:3%? @A rI   c                 ~    U R                   R                  XR                   R                  U R                  5      5      $ )z0Converts a token (str) in an id using the vocab.)r[   r   rS   )rj   r   s     rG   _convert_token_to_id"CTRLTokenizer._convert_token_to_id   s*    ||||'7'7'GHHrI   c                 L    U R                   R                  XR                  5      $ )z=Converts an index (integer) in a token (str) using the vocab.)r]   r   rS   )rj   r   s     rG   _convert_id_to_token"CTRLTokenizer._convert_id_to_token   s    ||~~66rI   c                 d    SR                  U5      R                  SS5      R                  5       nU$ )z:Converts a sequence of tokens (string) in a single string.r   r    )r   replacestrip)rj   tokens
out_strings      rG   convert_tokens_to_string&CTRLTokenizer.convert_tokens_to_string   s,    XXf%--eR8>>@
rI   )rg   re   rf   r]   r[   )z<unk>)__name__
__module____qualname____firstlineno____doc__VOCAB_FILES_NAMESvocab_files_namesCONTROL_CODEScontrol_codesri   propertyrw   r{   r   r   r   r   r   __static_attributes____classcell__)rr   s   @rG   rK   rK   k   sS      *!M
$ ! !?*XI7 rI   rK   )r   rY   regexr   tokenization_pythonr   utilsr   
get_loggerr   loggerr   r   rH   rK   __all__rW   rI   rG   <module>r      s   0   6  
		H	%  88D8 v8 u	8
 e8 
58 
58 F8 8 8 e8 8 u8 v8 u8  !8" u#8$ U%8& U'8( e)8* T+8, T-8. U/80 E182 U384 d586 
5788 e98: e;8< u=8> t?8@ eA8B %C8D uE8F G8H VI8J uK8L EM8N uO8P UQ8R uS8T fU8V W8X TY8Z u[8\ 6]8^ %_8` Ua8b c8d Ee8f Vg8h o8v n' nn 
rI   