
    Z j                        S r SSKrSSKJr  SSKJr  SSKrSSKJr  SSKJ	r	J
r
Jr  SSKJr  SS	KJrJr  SS
KJrJr  SSKJr  SSKJrJrJrJrJr  SSKJr  SSKJ r J!r!  SSK"J#r#  SSK$J%r%J&r&J'r'J(r(  SSK)J*r*J+r+  SSK,J-r-J.r.  SSK/J0r0  \(Rb                  " \25      r3S r4\" S5      SLS j5       r5S r6S r7S r8 " S S\Rr                  5      r: " S S\Rr                  5      r; " S S \Rr                  5      r<  SMS!\Rr                  S"\Rz                  S#\Rz                  S$\Rz                  S%\Rz                  S-  S&\>S-  S'\>S(\#\%   4S) jjr?\" \55       " S* S+\Rr                  5      5       r@ " S, S-\Rr                  5      rA " S. S/\Rr                  5      rB " S0 S1\Rr                  5      rC " S2 S3\Rr                  5      rD " S4 S5\5      rE " S6 S7\Rr                  5      rF " S8 S9\Rr                  5      rG\& " S: S;\!5      5       rH\& " S< S=\H5      5       rI\& " S> S?\H5      5       rJ " S@ SA\Rr                  5      rK\&" SBSC9 " SD SE\H5      5       rL\& " SF SG\H5      5       rM " SH SI\Rr                  5      rNSJ rO/ SKQrPg)NzPyTorch ESM model.    N)Callable)Optional)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)use_kernel_func_from_hubuse_kernelized_func)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentionsMaskedLMOutputSequenceClassifierOutputTokenClassifierOutput)dynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)maybe_autocastmerge_with_config_defaults)OutputRecordercapture_outputs   )	EsmConfigc                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..N   dim)shapetorchcat)xx1x2s      u/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/esm/modeling_esm.pyrotate_halfr/   0   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''    rotary_pos_embc                 b   U R                   nUR                  U5      nUR                  U5      nU R                  5       U-  [        U R                  5       5      U-  -   nUR                  5       U-  [        UR                  5       5      U-  -   nUR	                  U5      UR	                  U5      4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)dtype	unsqueezefloatr/   to)qkcossinunsqueeze_dimoriginal_dtypeq_embedk_embeds           r.   apply_rotary_pos_embr?   7   s    & WWN
--
&C
--
&Cwwy3;qwwy#9C#?@Gwwy3;qwwy#9C#?@G::n%wzz.'AAAr0   c                 n    U S-  S[         R                  " U [        R                  " S5      -  5      -   -  $ )zg
This is the gelu implementation from the original ESM repo. Using F.gelu yields subtly wrong results.
g      ?      ?g       @)r)   erfmathsqrtr+   s    r.   gelurF   R   s.     s7cEIIa$))C.&899::r0   c                 *    X R                  SS5      -   $ )zJMake layer symmetric in final two dimensions, used for contact prediction.r$   )	transposerE   s    r.   
symmetrizerJ   Y   s    {{2r"""r0   c                     U R                  SSS9nU R                  SSS9nU R                  SSS9nX-  nUR                  U5        X-
  nU$ )z=Perform average product correct, used for contact prediction.r$   T)keepdimsrH   )r$   rH   )sumdiv_)r+   a1a2a12avg
normalizeds         r.   average_product_correctrT   ^   sW    	
rD	!B	
rD	!B
%%4%
(C
'CHHSMJr0   c                      ^  \ rS rSr% Sr\R                  \S'   SS\4U 4S jjjr	\
   SS\S-  S\S   S	\S-  S
\S\4   4S jj5       r\R                   " 5       \SS j5       5       rSrU =r$ )EsmRotaryEmbeddingj   z
Rotary position embeddings.
Implementation based on [ModernBERT's RotaryEmbedding](https://github.com/huggingface/transformers/blob/aad13b87ed59f2afcfaebc985f403301887a35fc/src/transformers/models/modernbert/modeling_modernbert.py#L94).
inv_freqNconfigc                    > [         TU ]  5         Xl        0 U l        U R	                  U R                  U5      u  p4U R                  SU5        [        U SU5        g )NrX   attention_scaling)super__init__rY   	rope_typecompute_default_rope_parametersregister_buffersetattr)selfrY   devicecurr_inv_freqcurr_attention_scaling	__class__s        r.   r]   EsmRotaryEmbedding.__init__r   sU    040T0TUYU`U`bh0i-Z7)+ABr0   rc   ztorch.deviceseq_lenreturnztorch.Tensorc           	         U R                   n[        U SS5      =(       d    U R                  U R                  -  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXe4$ )aI  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
head_dimNrA   r   r%   r3   )rc   r3   )	
rope_thetagetattrhidden_sizenum_attention_headsr)   arangeint64r6   r5   )rY   rc   rh   baser'   attention_factorrX   s          r.   r_   2EsmRotaryEmbedding.compute_default_rope_parameters|   s    (   fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 ))r0   c                 @   [        U S5      n[        U S5      nUS S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS	9   UR                  5       UR                  5       -  R                  SS
5      n	[        R                  " X4SS9n
U
R                  5       U-  nU
R                  5       U-  nS S S 5        WR	                  UR                  S9WR	                  UR                  S94$ ! , (       d  f       N@= f)NrX   r[   r   r$   r!   mpscpuF)device_typeenabledr%   r&   rl   )rn   r5   expandr(   r6   rc   
isinstancetypestrr   rI   r)   r*   r9   r:   r3   )rb   r+   position_ids
layer_typerX   r[   inv_freq_expandedposition_ids_expandedry   freqsembr9   r:   s                r.   forwardEsmRotaryEmbedding.forward   sZ    4,#D*=>$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')//C'')//C	 D vvAGGv$cff177f&;;; DCs   'A.F
F)rY   r^   N)NNN)__name__
__module____qualname____firstlineno____doc__r)   Tensor__annotations__r"   r]   staticmethodr   inttupler5   r_   no_gradr   r   __static_attributes____classcell__rf   s   @r.   rV   rV   j   s    
 llCy C C #'+/"*D *(* t* 
~u$	%	* *< ]]_<  <r0   rV   c                   F   ^  \ rS rSrSr  SS\S\4U 4S jjjrS rSrU =r	$ )	EsmContactPredictionHead   zWPerforms symmetrization, apc, and computes a logistic regression on the output featuresin_featureseos_idxc                    > [         TU ]  5         Xl        X0l        [        R
                  " USU5      U l        [        R                  " 5       U l        g )Nr!   )	r\   r]   r   r   r   Linear
regressionSigmoid
activation)rb   r   biasr   rf   s       r.   r]   !EsmContactPredictionHead.__init__   s<     	&))KD9**,r0   c                 N   UR                  U R                  5      R                  U5      nUR                  S5      UR                  S5      -  nX#S S 2S S S S 2S S 24   -  nUSS S2S S24   nUSSS 2SS 24   nUR	                  5       u  pEpgnUR                  XEU-  Xw5      nUR                  U R                  R                  R                  5      n[        [        U5      5      nUR                  SSSS5      nU R                  U R                  U5      R                  S5      5      $ )Nr!   r%   .r$   r   r	   )ner   r6   r4   sizeviewr   weightrc   rT   rJ   permuter   squeeze)	rb   tokens
attentionseos_mask
batch_sizelayersheadsseqlen_s	            r.   r    EsmContactPredictionHead.forward   s   99T\\*--j9%%a(8+=+=a+@@1dD!Q+>"??
SbS#2#.
QR,
/9/@,
E1__Z%P
  ]]OO""))

 -Z
-CD
''1a3
tz:BB1EFFr0   )r   r   r   r   )Tr%   )
r   r   r   r   r   r   r]   r   r   r   r   s   @r.   r   r      s6    a
 	
'
' 	
' 
'G Gr0   r   c                   D   ^  \ rS rSrSrU 4S jr    SS jrS rSrU =r	$ )EsmEmbeddings   zN
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
c                   > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        UR                  (       a/  [        R                  " UR
                  UR                  S9U l        OS U l        [        R                  " UR                  5      U l        [        USS5      U l        U R#                  S[$        R&                  " UR(                  5      R+                  S5      SS9  UR                  U l        U R                   S:X  a9  [        R                  " UR(                  UR
                  U R,                  S9U l        UR0                  U l        UR2                  U l        g )	N)padding_idxepsposition_embedding_typeabsoluter   r!   r$   F)
persistent)r\   r]   r   	Embedding
vocab_sizero   pad_token_idword_embeddingsemb_layer_norm_before	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropoutrn   r   r`   r)   rq   max_position_embeddingsr{   r   position_embeddingstoken_dropoutmask_token_idrb   rY   rf   s     r.   r]   EsmEmbeddings.__init__   s*   !||F,=,=v?Q?Q_e_r_rs'' ll6+=+=6CXCXYDO"DOzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 "..'':5')||..0B0BPTP`P`(D$ $11#11r0   c                    Uc*  Ub  [        XR                  5      nOU R                  U5      nUc  U R                  U5      nUnU R                  (       a  Ub  UR                  XR                  :H  R                  S5      S5      nSnUb  UR                  S5      OUR                  S   nXR                  :H  R                  S5      R                  5       U-  nUSU-
  -  SU-
  S S 2S S 4   -  R                  UR                  5      nU R                  S:X  a  U R                  U5      n	XY-   nU R                  b  U R                  U5      nUb,  XRR                  S5      -  R                  UR                  5      nU$ )Nr$           gQ?r!   r   )"create_position_ids_from_input_idsr   &create_position_ids_from_inputs_embedsr   r   masked_fillr   r4   rM   r(   r5   r6   r3   r   r   r   )
rb   	input_idsattention_maskr   inputs_embeds
embeddingsmask_ratio_trainsrc_lengthsmask_ratio_observedr   s
             r.   r   EsmEmbeddings.forward   s    $A)M]M]^#JJ=Y  00;M #
 )"7#//>P>P1P0[0[\^0_adeJ)4B4N.,,R0T]TcTcdeTfK#,0B0B#B"G"G"K"Q"Q"SVa"a$,<(<=EXAXZ[]acgZg@hhll  J '':5"&":":<"H#9J??&4J%$'?'?'CCGG
HXHXYJ r0   c                    UR                  5       SS nUS   n[        R                  " U R                  S-   X0R                  -   S-   [        R                  UR
                  S9nUR                  S5      R                  U5      $ )z
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
Nr$   r!   )r3   rc   r   )r   r)   rq   r   longrc   r4   r{   )rb   r   input_shapesequence_lengthr   s        r.   r   4EsmEmbeddings.create_position_ids_from_inputs_embeds  s~     $((*3B/%a.||q /4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r0   )r   r   r   r   r   r   r   r   NNNN)
r   r   r   r   r   r]   r   r   r   r   r   s   @r.   r   r      s+    22 /b= =r0   r   modulequerykeyvaluer   scalingr   kwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr$         r%   r	   r&   )ptrainingr!   )
r   r)   matmulrI   r   
functionalsoftmaxr   r   
contiguous)
r   r   r   r   r   r   r   r   attn_weightsattn_outputs
             r.   eager_attention_forwardr   1  s     **R.D( <<}}Q':;gEL!#4==((2(>L==((6??([L,,|3K''1-88:K$$r0   c                      ^  \ rS rSrSU 4S jjr    SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\\	   S
\
\R                     4S jjrSrU =r$ )EsmSelfAttentioniM  Nc                 t  > [         TU ]  5         Xl        UR                  UR                  -  S:w  a7  [        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        UR                  U l        U=(       d    [#        USS5      U l        SU l        UR(                  U l        X0l        U R(                  =(       a    U(       + U l        g )	Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r   r   rA   )r\   r]   rY   ro   rp   hasattr
ValueErrorr   attention_head_sizeall_head_sizer   r   r   r   r   attention_probs_dropout_probr   rn   r   r   
is_decoder	layer_idx	is_causal)rb   rY   r   r   is_cross_attentionrf   s        r.   r]   EsmSelfAttention.__init__O  sg    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
::'> (
'-zC
$  ++"C1C-Cr0   hidden_statesr   encoder_hidden_statesencoder_attention_maskr   r   ri   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	US Ln
U
(       a  UOUnU
(       a  UOUnU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nXR                  S-  -  n	U R                  S:X  a  Uu  p[        XXSS9u  p[        R                  " U R                  R                  [        5      nU" U U	UUU4U R                  (       d  SOU R                  U R                   S.UD6u  nnUR"                  " / UQSP76 R%                  5       nUU4$ )	Nr$   r!   r%   r   rotary)r;   r   )r   r   )r(   r   r   r   rI   r   r   r   r?   r   get_interfacerY   _attn_implementationr   r   r   r   reshaper   )rb   r  r   r  r  r   r   r   hidden_shapequery_layerr   current_states	key_layervalue_layerr9   r:   attention_interfacer   r   s                      r.   r   EsmSelfAttention.forwardk  s    $))#2.CCbC$*B*BCjj/44\BLLQPQR2$>2D.-3E/>HH^,11,?II!QO	jj055lCMMaQRS "$<$<d$BB''83*HC%9+RUjk%l"K(?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
!\ "));;;;FFHL((r0   )r   r   rY   r   r   r   r   r   rp   r   r   r   r   )NNFr   )r   r   r   r   r]   r)   r   FloatTensorr   r   r   r   r   r   r   s   @r.   r   r   M  s    D> 48:>;?37.)||.) ))D0.)  %0047	.)
 !& 1 1D 8.) #\\D0.) +,.) 
u||	.) .)r0   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )EsmSelfOutputi  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        g r   )	r\   r]   r   r   ro   denser   r   r   r   s     r.   r]   EsmSelfOutput.__init__  sB    YYv1163E3EF
zz&"<"<=r0   c                 R    U R                  U5      nU R                  U5      nX-   nU$ r   r  r   rb   r  input_tensors      r.   r   EsmSelfOutput.forward  ,    

=1]3%4r0   r  r   r   r   r   r]   r   r   r   r   s   @r.   r  r        >
 r0   r  c                   j   ^  \ rS rSrSU 4S jjr    S	S\R                  S-  S\\   4S jjr	Sr
U =r$ )
EsmAttentioni  Nc                    > [         TU ]  5         [        XUS9U l        [	        U5      U l        [        R                  " UR                  UR                  S9U l        g )N)r   r   r   )
r\   r]   r   rb   r  outputr   r   ro   r   )rb   rY   r   r   rf   s       r.   r]   EsmAttention.__init__  sG    $VUgh	#F+f&8&8f>S>STr0   r   r   c                 ~    U R                  U5      nU R                  " U4UUUUS.UD6u  pU R                  X5      nU$ )Nr   r  r  r   )r   rb   r!  )
rb   r  r   r  r  r   r   hidden_states_lnr   r   s
             r.   r   EsmAttention.forward  sW      >>-8
)"7#9 3
 
 kk+=r0   )r   r!  rb   )NFr   )r   r   r   r   r]   r)   r   r   r   r   r   r   r   s   @r.   r  r    sD    U "#37 #\\D0 +, r0   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )EsmIntermediatei  c                    > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        g r   )r\   r]   r   r   ro   intermediate_sizer  r   s     r.   r]   EsmIntermediate.__init__  s,    YYv1163K3KL
r0   r  ri   c                 >    U R                  U5      n[        U5      nU$ r   )r  rF   )rb   r  s     r.   r   EsmIntermediate.forward  s     

=1]+r0   )r  
r   r   r   r   r]   r)   r   r   r   r   r   s   @r.   r(  r(    s)    MU\\ ell  r0   r(  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )	EsmOutputi  c                    > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  5      U l	        g r   )
r\   r]   r   r   r*  ro   r  r   r   r   r   s     r.   r]   EsmOutput.__init__  sB    YYv779K9KL
zz&"<"<=r0   c                 R    U R                  U5      nU R                  U5      nX-   nU$ r   r  r  s      r.   r   EsmOutput.forward  r  r0   r  r  r   s   @r.   r0  r0    r  r0   r0  c                   l   ^  \ rS rSrU 4S jr    S	S\R                  S-  S\\   4S jjr	S r
SrU =r$ )
EsmLayeri  c                   > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        UR                  U l        UR                  U l        U R                  (       a.  U R                  (       d  [        U  S35      e[	        USS9U l	        [        U5      U l        [        U5      U l        [        R                  " UR                   UR"                  S9U l        g )Nr!   z> should be used as a decoder model if cross attention is addedT)r   r   )r\   r]   chunk_size_feed_forwardseq_len_dimr  	attentionr   add_cross_attentionRuntimeErrorcrossattentionr(  intermediater0  r!  r   r   ro   r   r   s     r.   r]   EsmLayer.__init__  s    '-'E'E$%f- ++#)#=#= ##??"dV+i#jkk".v$"OD+F3'f&8&8f>S>STr0   Nr   r   c                     U R                   " U4UUS.UD6nU R                  (       a;  Ub8  [        U S5      (       d  [        SU  S35      eU R                  " U4UUUUS.UD6nU R                  U5      nU$ )N)r   r   r=  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r$  )r:  r   r   AttributeErrorr=  feed_forward_chunk)	rb   r  r   r  r  r   r   attention_outputlayer_outputs	            r.   r   EsmLayer.forward  s      >>
) 3
 	
 ??4@4!122$=dV D` ` 
  $22  -&;'=$7    ../?@r0   c                 l    U R                  U5      nU R                  U5      nU R                  X15      nU$ r   )r   r>  r!  )rb   rC  attention_output_lnintermediate_outputrD  s        r.   rB  EsmLayer.feed_forward_chunk  s9    "nn-=>"//0CD{{#6Ir0   )	r   r;  r:  r8  r=  r>  r   r!  r9  r   )r   r   r   r   r]   r)   r   r   r   r   rB  r   r   r   s   @r.   r6  r6    sJ    U$ "#37! #\\D0! +,!F r0   r6  c                   p   ^  \ rS rSrU 4S jr\    SS\R                  S-  S\\	   4S jj5       r
SrU =r$ )	
EsmEncoderi  c                 2  > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        [        R                  " UR                  UR                  S9U l        SU l        g s  snf )Nr   F)r\   r]   rY   r   
ModuleListrangenum_hidden_layersr6  layerr   ro   r   emb_layer_norm_aftergradient_checkpointing)rb   rY   r   rf   s      r.   r]   EsmEncoder.__init__  sq    ]]eFD\D\>]#^>]HV$4>]#^_
$&LL1C1CI^I^$_!&+# $_s   BNr   r   c           	          [        U R                  5       H  u  pxU" U4UUUUS.UD6nM     U R                  (       a  U R                  U5      n[        US9$ )Nr$  )last_hidden_state)	enumeraterP  rQ  r   )	rb   r  r   r  r  r   r   ilayer_modules	            r.   r   EsmEncoder.forward!  sg      )4OA(-&;'=$7 M  5 $$ 55mDM1MRRr0   )rY   rQ  rR  rP  r   )r   r   r   r   r]   r   r)   r   r   r   r   r   r   r   s   @r.   rK  rK    sQ    ,  "#37S #\\D0S +,S Sr0   rK  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )	EsmPooleri<  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )r\   r]   r   r   ro   r  Tanhr   r   s     r.   r]   EsmPooler.__init__=  s9    YYv1163E3EF
'')r0   r  ri   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ Nr   )r  r   )rb   r  first_token_tensorpooled_outputs       r.   r   EsmPooler.forwardB  s6     +1a40

#566r0   )r   r  r.  r   s   @r.   r[  r[  <  s(    $
U\\ ell  r0   r[  c                      ^  \ rS rSr% \\S'   SrSrSr/ SQr	S/r
SrSrSrSr\\" \SS	S
9/\" \SSS
9/S.r\R(                  " 5       U 4S j5       rS rSrU =r$ )EsmPreTrainedModeliK  rY   esmTF)r6  #EsmFoldTriangularSelfAttentionBlockr   zposition_embeddings.weightr!   r:  )index
layer_namer=  )r  r   cross_attentionsc                   > [         TU ]  U5        [        U[        5      (       a!  [        R
                  " UR                  5        g[        U[        5      (       a\  [        R                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      5        g[        U[        5      (       a?  UR                  UR                   5      u  p#[        R                  " [#        US5      U5        gg)zInitialize the weightsr$   r   rX   N)r\   _init_weightsr|   	EsmLMHeadinitzeros_r   r   copy_r   r)   rq   r(   r{   rV   r_   rY   rn   )rb   r   rd   r   rf   s       r.   rl   EsmPreTrainedModel._init_weights`  s     	f%fi((KK$..JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 233%EEfmmTMJJwvz2MB 4r0   c                     g r    rb   s    r.   get_output_embeddings(EsmPreTrainedModel.get_output_embeddingsl  s     r0   rs  )r   r   r   r   r"   r   base_model_prefixsupports_gradient_checkpointingaccepts_loss_kwargs_no_split_modules"_keys_to_ignore_on_load_unexpected_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr6  r   r   _can_record_outputsr)   r   rl  ru  r   r   r   s   @r.   re  re  K  s    &*#\*F)G&N"& "%&6aKXY+1AQR
 ]]_	C 	C r0   re  c                   h  ^  \ rS rSrSrSU 4S jjrS rS rS r\	\
\      SS\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\\   S\\R                     \-  4S jj5       5       5       rS rS rSrU =r$ )EsmModelir  a  

The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in [Attention is
all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
`add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
c                   > [         TU ]  U5        Xl        [        U5      U l        SU l        [        USS5      U l        U R                  S:X  a  [        US9U l        [        U5      U l
        U(       a  [        U5      OSU l        [        UR                  UR                  -  SS9U l        U R#                  U R$                  5        U R'                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
Nr   r   r  )rY   T)r   r   )r\   r]   rY   r   r   rotary_embeddingsrn   r   rV   rK  encoderr[  poolerr   rO  rp   contact_head"_register_load_state_dict_pre_hook	load_hook	post_init)rb   rY   add_pooling_layerrf   s      r.   r]   EsmModel.__init__  s    
 	 '/!%'.v7PR\']$''83%7v%FD"!&)+<i'$40063M3MMTX
 	//?r0   c                    ^ T S3nXA;  aH  [        U4S j[        UR                  5       5       5       5      nU(       a	  XS      X'   U H  nX	 M     gg)a  Remap per-layer rotary inv_freq keys from old checkpoints to the new model-level location.

Old checkpoints stored inv_freq per attention layer at:
    {prefix}encoder.layer.{i}.attention.self.rotary_embeddings.inv_freq
New code stores a single shared inv_freq at:
    {prefix}rotary_embeddings.inv_freq
The old checkpoint values must be preserved (not recomputed) because they may
have been saved in float16, matching the precision used during training.
zrotary_embeddings.inv_freqc              3      >#    U  H7  nUR                  T5      (       d  M  UR                  S 5      (       d  M3  Uv   M9     g7f)z*.attention.self.rotary_embeddings.inv_freqN)
startswithendswith).0r8   prefixs     r.   	<genexpr>%EsmModel.load_hook.<locals>.<genexpr>  s8      0A<<' ,-JJ7c,d 0s   AA	Ar   N)sortedlistkeys)rb   
state_dictr  argsnew_keyold_keysr8   s     `    r.   r  EsmModel.load_hook  sc     H67$ joo/0 H
 &0!&=
#M  %r0   c                 .    U R                   R                  $ r   r   r   rt  s    r.   get_input_embeddingsEsmModel.get_input_embeddings  s    ...r0   c                 $    XR                   l        g r   r  )rb   r   s     r.   set_input_embeddingsEsmModel.set_input_embeddings  s    */'r0   Nr   r   r   r   r  r  r   ri   c                    USL USL-  (       a  [        S5      eUc  U R                  UUUS9nU R                  UUUUSS9u  p&U R                  S:X  aQ  Uc<  UR                  S   n[
        R                  " XR                  S9R                  S5      nU R                  XC5      n	OSn	U R                  " U4UUUU	S	.UD6n
U
S   nU R                  b  U R                  U5      OSn[        UUS
9$ )a  
input_ids (`torch.LongTensor` of shape `((batch_size, sequence_length))`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
position_ids (`torch.LongTensor` of shape `((batch_size, sequence_length))`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `((batch_size, sequence_length), hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
Nz:You must specify exactly one of input_ids or inputs_embeds)r   r   r   )r   r  embedding_outputr  past_key_valuesr  r!   )rc   r   r$  )rU  pooler_output)r   r   _create_attention_masksr   r(   r)   rq   rc   r4   r  r  r  r   )rb   r   r   r   r   r  r  r   rh   r   encoder_outputssequence_outputrb  s                r.   r   EsmModel.forward  s2   > -t";<YZZ  !OO#-) , M 261M1M)#9*"7  2N 2
. ''83#'--a0$||G<P<PQ[[\]^"&"8"8"U"&,,
)"7#9 3
 
 *!,8<8OO4UY;-'
 	
r0   c                     U R                   R                  (       a  [        U R                   UUUS9nO[        U R                   UUS9nUb  [        U R                   UUUS9nX4$ )N)rY   r   r   r  )rY   r   r   )rY   r   r   r  )rY   r   r   r   )rb   r   r  r  r  r  s         r.   r   EsmModel._create_attention_masks  sr     ;;!!/{{.- /	N 7{{.-N "-%>{{.5&;	&" 55r0   c                 6   U " XSSS9R                   n[        R                  " USS9nX2R                  S5      R                  S5      R                  S5      -  nX2R                  S5      R                  S5      R                  S5      -  nU R	                  X5      $ )NT)r   return_dictoutput_attentionsr!   r&   r%   r	      )r   r)   stackr4   r  )rb   r   r   attnss       r.   predict_contactsEsmModel.predict_contacts#  s    V`deppEq)
 	))!,66q9CCAFF))!,66q9CCAFF  //r0   )rY   r  r   r  r  r   r  )T)NNNNNN)r   r   r   r   r   r]   r  r  r  r   r    r   r)   r   r   r   r   r   r   r  r  r   r   r   s   @r.   r  r  r  s   
6",/0   *..2,0-1596:F
<<$&F
 t+F
 llT)	F

 ||d*F
  %||d2F
 !&t 3F
 +,F
 
u||	K	KF
    F
R6@	0 	0r0   r  c                   X  ^  \ rS rSrSS0rU 4S jrS rS r\\	       SS\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\\   S\\-  4S jj5       5       rS rSrU =r$ )EsmForMaskedLMi/  zlm_head.decoder.weightz%esm.embeddings.word_embeddings.weightc                    > [         TU ]  U5        UR                  (       a  [        R	                  S5        [        USS9U l        [        U5      U l        U R                  5         g )NzjIf you want to use `EsmForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fr  )
r\   r]   r   loggerwarningr  rf  rm  lm_headr  r   s     r.   r]   EsmForMaskedLM.__init__3  sP     NN1
 Fe< (r0   c                 .    U R                   R                  $ r   r  decoderrt  s    r.   ru  $EsmForMaskedLM.get_output_embeddingsA  s    ||###r0   c                 $    XR                   l        g r   r  )rb   new_embeddingss     r.   set_output_embeddings$EsmForMaskedLM.set_output_embeddingsD  s    -r0   Nr   r   r   r   r  r  labelsr   ri   c           	      l   U R                   " U4UUUUUS.UD6n	U	S   n
U R                  U
5      nSnUba  [        5       nUR                  UR                  5      nU" UR                  SU R                  R                  5      UR                  S5      5      n[        UUU	R                  U	R                  S9$ )az  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
)r   r   r   r  r  r   Nr$   losslogitsr  r   )rf  r  r   r6   rc   r   rY   r   r   r  r   )rb   r   r   r   r   r  r  r  r   outputsr  prediction_scoresmasked_lm_lossloss_fcts                 r.   r   EsmForMaskedLM.forwardG  s    ( ((
)%'"7#9
 
 "!* LL9')HYY0778F%&7&<&<RAWAW&XZ`ZeZefhZijN$!//))	
 	
r0   c                 4    U R                   R                  XS9$ )N)r   )rf  r  )rb   r   r   s      r.   r  EsmForMaskedLM.predict_contactsu  s    xx(((OOr0   )rf  r  )NNNNNNN)r   r   r   r   _tied_weights_keysr]   ru  r  r   r   r)   
LongTensorr   r  r   r   r   r   r   r  r   r   r   s   @r.   r  r  /  s
   24[\$.  .2.20426:>6:*.*
##d**
 t+*
 &&-	*

 ((4/*
  %0047*
 !&t 3*
   4'*
 +,*
 
	*
  *
XP Pr0   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )rm  iy  z&ESM Head for masked language modeling.c                   > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  SS9U l
        [        R                  " [        R                  " UR                  5      5      U l        g )Nr   F)r   )r\   r]   r   r   ro   r  r   r   r   r   r  	Parameterr)   zerosr   r   s     r.   r]   EsmLMHead.__init__|  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FUSLLV->->!?@	r0   c                     U R                  U5      n[        U5      nU R                  U5      nU R                  U5      U R                  -   nU$ r   )r  rF   r   r  r   rb   featuresr   r+   s       r.   r   EsmLMHead.forward  sD    JJx GOOA LLOdii'r0   )r   r  r  r   	r   r   r   r   r   r]   r   r   r   r   s   @r.   rm  rm  y  s    0A r0   rm  z
    ESM Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    )custom_introc                      ^  \ rS rSrU 4S jr\\     SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\\   S
\\-  4S jj5       5       rSrU =r$ )EsmForSequenceClassificationi  c                    > [         TU ]  U5        UR                  U l        Xl        [	        USS9U l        [        U5      U l        U R                  5         g NFr  )	r\   r]   
num_labelsrY   r  rf  EsmClassificationHead
classifierr  r   s     r.   r]   %EsmForSequenceClassification.__init__  sF      ++Fe</7r0   Nr   r   r   r   r  r   ri   c                    U R                   " U4UUUS.UD6nUS   nU R                  U5      n	Sn
UGb  UR                  U	R                  5      nU R                  R
                  c  U R                  S:X  a  SU R                  l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                  l        OSU R                  l        U R                  R
                  S:X  aI  [        5       nU R                  S:X  a&  U" U	R                  5       UR                  5       5      n
OU" X5      n
OU R                  R
                  S:X  a=  [        5       nU" U	R                  SU R                  5      UR                  S5      5      n
O,U R                  R
                  S:X  a  [        5       nU" X5      n
[!        U
U	UR"                  UR$                  S	9$ )
ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
r   r   r   r   Nr!   r   single_label_classificationmulti_label_classificationr$   r  )rf  r  r6   rc   rY   problem_typer  r3   r)   r   r   r   r   r   r   r   r   r  r   rb   r   r   r   r   r  r   r  r  r  r  r  s               r.   r   $EsmForSequenceClassification.forward  s   $ ((
)%'	

 
 "!*1YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
r0   )r  rY   rf  r  NNNNN)r   r   r   r   r]   r   r   r)   r  r   r  r   r   r   r   r   r   r   r   s   @r.   r  r    s      .2.20426*.8
##d*8
 t+8
 &&-	8

 ((4/8
   4'8
 +,8
 
)	)8
  8
r0   r  c                      ^  \ rS rSrU 4S jr\\     SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\\   S
\\-  4S jj5       5       rSrU =r$ )EsmForTokenClassificationi  c                 .  > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r  )r\   r]   r  r  rf  r   r   r   r   r   ro   r  r  r   s     r.   r]   "EsmForTokenClassification.__init__  si      ++Fe<zz&"<"<=))F$6$68I8IJr0   Nr   r   r   r   r  r   ri   c                 v   U R                   " U4UUUS.UD6nUS   nU R                  U5      nU R                  U5      n	Sn
UbW  [        5       nUR	                  U	R
                  5      nU" U	R                  SU R                  5      UR                  S5      5      n
[        U
U	UR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
r  r   Nr$   r  )rf  r   r  r   r6   rc   r   r  r   r  r   r  s               r.   r   !EsmForTokenClassification.forward  s      ((
)%'	

 
 "!*,,71')HYYv}}-FFKKDOO<fkk"oND$!//))	
 	
r0   )r  r   rf  r  r  )r   r   r   r   r]   r   r   r)   r  r   r  r   r   r   r   r   r   r   r   s   @r.   r  r    s      .2.20426*.'
##d*'
 t+'
 &&-	'

 ((4/'
   4''
 +,'
 
&	&'
  '
r0   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )r  i  z-Head for sentence-level classification tasks.c                 ,  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l
        g r   )r\   r]   r   r   ro   r  r   r   r   r  out_projr   s     r.   r]   EsmClassificationHead.__init__  s`    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHr0   c                     US S 2SS S 24   nU R                  U5      nU R                  U5      n[        R                  " U5      nU R                  U5      nU R	                  U5      nU$ r`  )r   r  r)   tanhr  r  s       r.   r   EsmClassificationHead.forward  sY    Q1WLLOJJqMJJqMLLOMM!r0   )r  r   r  r  r   s   @r.   r  r    s    7I r0   r  c                     U R                  U5      R                  5       n[        R                  " USS9R	                  U5      U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    x: torch.Tensor x:

Returns: torch.Tensor
r!   r&   )r   r   r)   cumsumtype_asr   )r   r   maskincremental_indicess       r.   r   r   '  sP     <<$((*D,,t3;;DADH##%33r0   )r  r  r  r  re  )r!   )Nr   )Qr   rC   collections.abcr   typingr   r)   r   torch.nnr   r   r    r
   rn  integrationsr   r   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   r    configuration_esmr"   
get_loggerr   r  r/   r?   rF   rJ   rT   ModulerV   r   r   r   r5   r   r   r  r  r(  r0  r6  rK  r[  re  r  r  rm  r  r  r  r   __all__rs  r0   r.   <module>r     s     $    A A & I J 9  7 F & R R G E ( 
		H	%( *+B ,B4;#
	A< A<H Gryy  GF\=BII \=L !%II%<<% 
% <<	%
 LL4'% T\% % '(%8 )*K)ryy K) +K)\
BII 
299 <bii 
		 
7) 7tS SF		  # # #L y0! y0 y0x FP' FP FPR		 * E
#5 E
E
P 4
 2 4
 4
nBII &4 r0   