
    Z j'                      Z    S r SSKJr  SSKJr  SSKJr  / SQr\ " S S\5      5       rS/r	g)	z
Processor class for EVOLLA.
   )BatchFeature)ProcessorMixin)auto_docstring)aa_seqfoldseekmsac                      ^  \ rS rSrSU 4S jjrSS jr SS\4S jjr\    SS\	\
   \
-  S-  S\	\	\
      \	\
   -  S-  S	\S-  S\S-  4S
 jj5       rS rS rS rS rSrU =r$ )EvollaProcessor   Nc                    > Uc  [        S5      eUc  [        S5      e[        TU ]	  X5        SU R                  l        X0l        X@l        g)aQ  
protein_tokenizer (`EsmTokenizer`):
    An instance of [`EsmTokenizer`]. The protein tokenizer is a required input.
protein_max_length (`int`, *optional*, defaults to 1024):
    The maximum length of the sequence to be generated.
text_max_length (`int`, *optional*, defaults to 512):
    The maximum length of the text to be generated.
Nz+You need to specify an `protein_tokenizer`.z"You need to specify a `tokenizer`.z<|reserved_special_token_0|>)
ValueErrorsuper__init__	tokenizer	pad_tokenprotein_max_lengthtext_max_length)selfprotein_tokenizerr   r   r   kwargs	__class__s         }/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/evolla/processing_evolla.pyr   EvollaProcessor.__init__   sO     $JKKABB*6#A "4.    c           
      P   / nU H  nUR                  S5      nUR                  S5      nSR                  [        XV5       VVs/ s H&  u  pxUR                  5       UR	                  5       -   PM(     snn5      n	UR                  U	5        M     U R                  USSUSS9n
U
$ s  snnf )Nr   r    ptT)return_tensors
truncation
max_lengthpadding)getjoinzipupperlowerappendr   )r   proteinsr   sa_sequencesproteinr   r   sfsa_sequence	sa_tokenss              r   process_proteins EvollaProcessor.process_proteins2   s    G[[*F{{:.H''SEZ"[EZTQ1779qwwy#8EZ"[\K,	   **$K]gk + 
	  #\s   -B"r   c           	          / nU H/  nU R                   R                  USSS9nUR                  U5        M1     U R                  USSSSUS9nU$ )NFT)tokenizeadd_generation_promptr   longest)add_special_tokensr   r!   r   r    )r   apply_chat_templater'   )r   textsr   promptsmessagespromptprompt_inputss          r   process_textEvollaProcessor.process_text?   sr    
 H^^77&* 8 F
 NN6"  $& ' 
 r   r(   messages_listr   c                 J   Ub  Uc  [        S5      eUb  UOU R                  nUb  UOU R                  n[        U[        5      (       a  U/n[        U[
        [        45      (       a!  [        US   [
        [        45      (       d  U/n[        U[
        [        45      (       a"  [        S U 5       5      (       d  [        S5      e[        U[
        [        45      (       a;  [        S U 5       5      (       d$  [        SSR                  [        5       SU 35      e[        U[
        [        45      (       a  U H  n[        U[
        [        45      (       d  [        S	[        U5       S
35      e[        S U 5       5      (       d  [        S5      e[        S U 5       5      (       d  [        S U 5       5      (       d  M  [        SU 35      e   O[        S[        U5       S
35      eU R                  X5      nU R                  X$5      n[        US   US   US   US   S.S9$ )a  
proteins (`Union[List[dict], dict]`):
    A list of dictionaries or a single dictionary containing the following keys:
        - `"aa_seq"` (`str`) -- The amino acid sequence of the protein.
        - `"foldseek"` (`str`) -- The foldseek string of the protein.
messages_list (`Union[List[List[dict]], List[dict]]`):
    A list of lists of dictionaries or a list of dictionaries containing the following keys:
        - `"role"` (`str`) -- The role of the message.
        - `"content"` (`str`) -- The content of the message.
protein_max_length (`int`, *optional*, defaults to 1024):
    The maximum length of the sequence to be generated.
text_max_length (`int`, *optional*, defaults to 512):
    The maximum length of the text.

Return:
    a dict with following keys:
        - `protein_input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the protein sequence.
        - `protein_attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the protein sequence.
        - `text_input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the text sequence.
        - `text_attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the text sequence.
z3You need to specify `messages_list` and `proteins`.    c              3   B   #    U  H  n[        U[        5      v   M     g 7fN
isinstancedict.0ps     r   	<genexpr>+EvollaProcessor.__call__.<locals>.<genexpr>   s     :aX`ST:a;N;NX`   zUThe proteins should be a list of dictionaries, but not all elements are dictionaries.c              3   b   #    U  H%  n[        S  UR                  5        5       5      v   M'     g7f)c              3   2   #    U  H  o[         ;   v   M     g 7frB   )PROTEIN_VALID_KEYS)rG   ks     r   rI   5EvollaProcessor.__call__.<locals>.<genexpr>.<genexpr>   s     :A''s   N)allkeysrF   s     r   rI   rJ      s'      ;
DLqC::::Hs   -/z2There should be a list of dictionaries with keys: z, z for each protein.But got: z;Each messages in messages_list should be a list instead of .c              3   B   #    U  H  n[        U[        5      v   M     g 7frB   rC   rG   ms     r   rI   rJ      s     A1:a..rK   zfEach message in messages_list should be a list of dictionaries, but not all elements are dictionaries.c              3   Z   #    U  H!  n[        UR                  5       5      S :g  v   M#     g7f)   N)lenrR   rU   s     r   rI   rJ      s     <8as1668})8s   )+c              3   ^   #    U  H#  n[        UR                  5       5      S S1:g  v   M%     g7f)rolecontentN)setrR   rU   s     r   rI   rJ      s(      DBJQCMfi%88(s   +-zlEach message in messages_list should be a list of dictionaries with two keys: 'role' and 'content'.But got: zFThe messages_list should be a list of lists of dictionaries, but it's 	input_idsattention_mask)protein_input_idsprotein_attention_maskr^   r_   )data)r   r   r   rD   rE   listtuplerQ   r#   rN   	TypeErrortypeanyr/   r<   r   )	r   r(   r>   r   r   r   r9   r.   text_tokenss	            r   __call__EvollaProcessor.__call__W   sJ   > }4RSS3E3Q/W[WnWn-<-H/dNbNb h%% zHmdE]33J}UVGWZ^`eYf<g<g*OMhu..s:aX`:a7a7atuuhu..s ;
DL;
 8
 8
 D99/01 2$:'  mdE]33)!(T5M::#&abfgobpaqqr$sttAAAA$ A  <8<<< DBJD A A %$$,:/  * XY]^kYlXmmno  ))(G	''G%.{%;*34D*E(5"-.>"?	
 	
r   c                 :    U R                   R                  " U0 UD6$ rB   )r   batch_decoder   argsr   s      r   rl   EvollaProcessor.batch_decode   s    ~~**D;F;;r   c                 :    U R                   R                  " U0 UD6$ rB   )r   decoderm   s      r   rq   EvollaProcessor.decode   s    ~~$$d5f55r   c                 :    U R                   R                  " U0 UD6$ rB   )r   rl   rm   s      r   protein_batch_decode$EvollaProcessor.protein_batch_decode   s    %%22DCFCCr   c                 :    U R                   R                  " U0 UD6$ rB   )r   rq   rm   s      r   protein_decodeEvollaProcessor.protein_decode   s    %%,,d=f==r   )r   r   )N      )ry   )rz   )NNNN)__name__
__module____qualname____firstlineno__r   r/   intr<   r   rc   rE   ri   rl   rq   rt   rw   __static_attributes____classcell__)r   s   @r   r
   r
      s    /(   # 0  .2>B)-&*T
t*t#d*T
 DJ'$t*4t;T
  $J	T

 tT
 T
l<6D> >r   r
   N)
__doc__feature_extraction_utilsr   processing_utilsr   utilsr   rN   r
   __all__ r   r   <module>r      sJ    5 $ 3  [>n [> [>| 
r   