
    Z j                     v    S r SSKrSSKJr  SSKJr  \R                  " \5      rSS0r	S r
 " S	 S
\5      rS
/rg)zTokenization classes for ESM.    N   )PreTrainedTokenizer)logging
vocab_file	vocab.txtc                     [        U S5       nUR                  5       R                  5       nU Vs/ s H  o3R                  5       PM     snsS S S 5        $ s  snf ! , (       d  f       g = f)Nr)openread
splitlinesstrip)r   flinesls       y/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/esm/tokenization_esm.pyload_vocab_filer      sL    	j#	!##%#()5a	5) 
	) 
	s   #AA	AA
A(c            
         ^  \ rS rSrSr\rSS/r     SU 4S jjrS\	S\
4S jrS	\
S\	4S
 jrS rS rS	\
S\	4S jrS\	S\
4S jr SS\\	   S\\	   S-  S\\	   4S jjr SS\S\S-  S\S\\	   4S jjrS r\S\	4S j5       rSrU =r$ )EsmTokenizer!   z
Constructs an ESM tokenizer.
	input_idsattention_maskc           	      Z  > [        U5      U l        [        [        U R                  5      5      U l        [        U R                  5       VV	s0 s H  u  pX_M	     sn	nU l        [        T
U ]  " SUUUUUS.UD6  U R                  U l        U R                  U R                  5        g s  sn	nf )N)	unk_token	cls_token	pad_token
mask_token	eos_token )
r   
all_tokensdict	enumerate_id_to_token_token_to_idsuper__init__unique_no_split_tokens_update_trie)selfr   r   r   r   r   r   kwargsindtok	__class__s             r   r%   EsmTokenizer.__init__)   s     **5 4??!;<6?6PQ6P(#SX6PQ 	
!	
 	
 '+oo#$556 Rs   B'indexreturnc                 L    U R                   R                  XR                  5      $ Nr"   getr   r(   r.   s     r   _convert_id_to_token!EsmTokenizer._convert_id_to_tokenE         $$UNN;;    tokenc                 ~    U R                   R                  XR                   R                  U R                  5      5      $ r1   r#   r3   r   r(   r9   s     r   _convert_token_to_id!EsmTokenizer._convert_token_to_idH   .      $$U,=,=,A,A$..,QRRr8   c                 "    UR                  5       $ r1   )split)r(   textr)   s      r   	_tokenizeEsmTokenizer._tokenizeK   s    zz|r8   c                 p    U R                   R                  5       nUR                  U R                  5        U$ r1   )r#   copyupdateadded_tokens_encoder)r(   
base_vocabs     r   	get_vocabEsmTokenizer.get_vocabN   s0    &&++-
$334r8   c                 ~    U R                   R                  XR                   R                  U R                  5      5      $ r1   r;   r<   s     r   token_to_idEsmTokenizer.token_to_idS   r?   r8   c                 L    U R                   R                  XR                  5      $ r1   r2   r4   s     r   id_to_tokenEsmTokenizer.id_to_tokenV   r7   r8   Ntoken_ids_0token_ids_1c                     U R                   /nU R                  /nUc  U R                  c  X1-   $ X1-   U-   $ U R                  c  [        S5      eX1-   U-   U-   U-   $ )Nz=Cannot tokenize multiple sequences when EOS token is not set!)cls_token_ideos_token_id
ValueError)r(   rR   rS   clsseps        r    build_inputs_with_special_tokens-EsmTokenizer.build_inputs_with_special_tokensY   sy       !  !  ((((3..&\]] 3&4s::r8   already_has_special_tokensc                     U(       a2  Ub  [        S5      eU Vs/ s H  oDU R                  ;   a  SOSPM     sn$ S/S/[        U5      -  -   S/-   nUb  US/[        U5      -  S/-   -  nU$ s  snf )at  
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

Args:
    token_ids_0 (`list[int]`):
        List of ids of the first sequence.
    token_ids_1 (`list[int]`, *optional*):
        List of ids of the second sequence.
    already_has_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not the token list is already formatted with special tokens for the model.

Returns:
    A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
zYou should not supply a second sequence if the provided sequence of ids is already formatted with special tokens for the model.   r   )rW   all_special_idslen)r(   rR   rS   r\   r9   masks         r   get_special_tokens_mask$EsmTokenizer.get_special_tokens_maskg   s    $ && R 
 LWW;%$"6"66AA=;WWsqcC,,-3"QC#k**aS00D	 Xs   A,c                    [         R                  R                  X(       a  US-   OSS-   5      n[        US5       nUR	                  SR                  U R
                  5      5        S S S 5        U4$ ! , (       d  f       U4$ = f)N- r   w
)ospathjoinr
   writer   )r(   save_directoryfilename_prefixr   r   s        r   save_vocabularyEsmTokenizer.save_vocabulary   si    WW\\.O?S3Hacgr2rs
*c"aGGDIIdoo./ #} #"}s   +A11
Bc                 ,    [        U R                  5      $ r1   )r`   r   )r(   s    r   
vocab_sizeEsmTokenizer.vocab_size   s    4??##r8   )r"   r#   r   r&   )z<unk>z<cls>z<pad>z<mask>z<eos>r1   )NF)__name__
__module____qualname____firstlineno____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr%   intstrr5   r=   rC   rJ   rM   rP   listrZ   boolrb   ro   propertyrr   __static_attributes____classcell__)r,   s   @r   r   r   !   s    *$&67
 78<# <# <S# S# S
S S S< < < GK;9;379t3C;	c; fk.2Tk^b	c> $C $ $r8   r   )rx   ri   tokenization_pythonr   utilsr   
get_loggerrt   loggerry   r   r   __all__r   r8   r   <module>r      sO    $ 	 6  
		H	%!;/ *m$& m$` 
r8   