
    Z js                        S SK r S SKJr  S SKJr  S SKrS SKJr  S SKJrJ	r	J
r
  SSKJr  SSKJr  SS	KJrJr  SS
KJrJr  SSKJr  SSKJrJrJrJrJrJr  SSKJ r J!r!  SSK"J#r#J$r$  SSK%J&r&  SSK'J(r(J)r)  SSK*J+r+J,r,J-r-  SSK.J/r/  SSK0J1r1   " S S\Rd                  5      r3 " S S\Rd                  5      r4 " S S\Rd                  5      r5 S@S\Rd                  S\Rl                  S\Rl                  S\Rl                  S\Rl                  S-  S \7S!\74S" jjr8S# r9\" S$5      SAS% j5       r:\" \:5       " S& S'\Rd                  5      5       r; " S( S)\5      r<\) " S* S+\$5      5       r=\) " S, S-\=5      5       r> " S. S/\Rd                  5      r?\)" S0S19 " S2 S3\=5      5       r@\)" S4S19 " S5 S6\=5      5       rA\)" S7S19 " S8 S9\=5      5       rB\) " S: S;\=5      5       rC\)" S<S19 " S= S>\=5      5       rD/ S?QrEg)B    N)Callable)Optional)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)use_kernel_func_from_hubuse_kernelized_func)create_bidirectional_mask(create_bidirectional_sliding_window_mask)GradientCheckpointingLayer)BaseModelOutputMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring)can_return_tuplemaybe_autocastmerge_with_config_defaults)capture_outputs   )ModernBertConfigc                      ^  \ rS rSrSrS\4U 4S jjr SS\R                  S-  S\R                  S-  S\R                  4S	 jjr
S
rU =r$ )ModernBertEmbeddings4   zN
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
configc                 \  > [         TU ]  5         Xl        [        R                  " UR
                  UR                  UR                  S9U l        [        R                  " UR                  UR                  UR                  S9U l        [        R                  " UR                  5      U l        g )N)padding_idxepsbias)super__init__r'   r   	Embedding
vocab_sizehidden_sizepad_token_idtok_embeddings	LayerNormnorm_eps	norm_biasnormDropoutembedding_dropoutdropselfr'   	__class__s     ڃ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/modernbert/modeling_modernbert.pyr.   ModernBertEmbeddings.__init__9   su     ll6+<+<f>P>P^d^q^qrLL!3!3vO_O_`	JJv778	    N	input_idsinputs_embedsreturnc                     Ub"  U R                  U R                  U5      5      nU$ U R                  U R                  U R                  U5      5      5      nU$ N)r:   r7   r3   )r<   rA   rB   hidden_statess       r>   forwardModernBertEmbeddings.forward@   sS     $ IIdii&>?M  !IIdii0C0CI0N&OPMr@   )r'   r:   r7   r3   NN)__name__
__module____qualname____firstlineno____doc__r#   r.   torch
LongTensorTensorrG   __static_attributes____classcell__r=   s   @r>   r%   r%   4   sW    9/ 9 _c))D0HMW[H[	 r@   r%   c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	ModernBertMLPJ   a*  Applies the GLU at the end of each ModernBERT layer.

Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality.
r'   c                   > [         TU ]  5         Xl        [        R                  " UR
                  [        UR                  5      S-  UR                  S9U l	        [        UR                     U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR
                  UR                  S9U l        g )N   r,   )r-   r.   r'   r   Linearr1   intintermediate_sizemlp_biasWir   hidden_activationactr8   mlp_dropoutr:   Wor;   s     r>   r.   ModernBertMLP.__init__Q   s    ))F..F4L4L0MPQ0QX^XgXgh&223JJv112	))F44f6H6Hv_r@   rF   rC   c                     U R                  U5      R                  SSS9u  p#U R                  U R                  U R	                  U5      U-  5      5      $ )NrY   dim)r_   chunkrc   r:   ra   )r<   rF   inputgates       r>   rG   ModernBertMLP.forwardY   sG    ggm,221"2=wwtyy%4!7899r@   )r_   rc   ra   r'   r:   )rJ   rK   rL   rM   rN   r#   r.   rO   rQ   rG   rR   rS   rT   s   @r>   rV   rV   J   s7    `/ `:U\\ :ell : :r@   rV   c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\	    SS\S-  S\
S   S\S-  S	\S-  S
\S\4   4
S jj5       r\R                   " 5       \SS j5       5       rSrU =r$ )ModernBertRotaryEmbedding^   inv_freqNr'   c                 f  > [         TU ]  5         UR                  U l        UR                  U l        Xl        [        [        UR                  5      5      U l        0 U l	        U R                   H  nU R
                  R                  U   nUc  M!  US   U R                  U'   U R                  nU R                  U   S:w  a  [        U R                  U      nU" U R
                  US9u  pgU R                  U S3USS9  U R                  U S3UR                  5       SS9  [        X S3U5        M     g )	N	rope_typedefault
layer_type	_inv_freqF)
persistent_original_inv_freq_attention_scaling)r-   r.   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr'   listsetlayer_typesrr   rope_parameterscompute_default_rope_parametersr   register_bufferclonesetattr)	r<   r'   deviceru   rope_paramsrope_init_fncurr_inv_freqcurr_attention_scalingr=   s	           r>   r.   "ModernBertRotaryEmbedding.__init__a   s(   "("@"@$*$B$B!F$6$6 78**J++55jAK")4[)ADNN:&%)%I%IL~~j)Y624>>*3MN4@Yc4d1M  J<y!9=UZ [  J</A!BMDWDWDYfk lDL(:;=ST +r@   r   ztorch.deviceseq_lenru   rC   ztorch.Tensorc           	         U R                   U   S   n[        U SS5      =(       d    U R                  U R                  -  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXv4$ )	a  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
    layer_type (`str`, *optional*):
        The current layer type if the model has different RoPE parameters per type.
        Should not be used unless `config.layer_types is not None`

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetahead_dimNg      ?r   rY   dtyper   r   )	r   getattrr1   num_attention_headsrO   arangeint64tofloat)r'   r   r   ru   baserh   attention_factorrp   s           r>   r   9ModernBertRotaryEmbedding.compute_default_rope_parametersv   s    2 %%j1,?fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 ))r@   c                 H   [        X S35      n[        X S35      nUS S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS	9   UR                  5       UR                  5       -  R                  SS
5      n	[        R                  " X4SS9n
U
R                  5       U-  nU
R                  5       U-  nS S S 5        WR	                  UR                  S9WR	                  UR                  S94$ ! , (       d  f       N@= f)Nrv   ry   r   rf   r"   mpscpuF)device_typeenabledrY   rg   r   )r   r   expandshaper   r   
isinstancetypestrr   	transposerO   catcossinr   )r<   xposition_idsru   rp   attention_scalinginv_freq_expandedposition_ids_expandedr   freqsembr   r   s                r>   rG   !ModernBertRotaryEmbedding.forward   sd    4<y!9:#DL8J*KL$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')//C'')//C	 D vvAGGv$cff177f&;;; DCs   +A.F
F!)r'   r   r{   r|   rr   rE   NNNN)rJ   rK   rL   rM   rO   rQ   __annotations__r#   r.   staticmethodr   r\   r   tupler   r   no_gradr   rG   rR   rS   rT   s   @r>   rn   rn   ^   s    llU/ U U* *.+/"!%	!* 4'!*(!* t!* $J	!*
 
~u$	%!* !*F ]]_<  <r@   rn   modulequerykeyvalueattention_maskscalingdropoutc                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )NrY   r	   rf   )rh   r   )ptrainingr"   )rO   matmulr   r   
functionalsoftmaxfloat32r   r   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r>   eager_attention_forwardr      s     <<}}Q':;gEL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r@   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nrf   rY   rg   )r   rO   r   )r   x1x2s      r>   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r@   rotary_pos_embc                 b   U R                   nUR                  U5      nUR                  U5      nU R                  5       U-  [        U R                  5       5      U-  -   nUR                  5       U-  [        UR                  5       5      U-  -   nUR	                  U5      UR	                  U5      4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)r   	unsqueezer   r   r   )qkr   r   unsqueeze_dimoriginal_dtypeq_embedk_embeds           r>   apply_rotary_pos_embr      s    & WWN
--
&C
--
&Cwwy3;qwwy#9C#?@Gwwy3;qwwy#9C#?@G::n%wzz.'AAAr@   c                     ^  \ rS rSrSrSS\S\S-  4U 4S jjjr  SS\R                  S\
\R                  \R                  4   S-  S	\R                  S-  S
\\   S\
\R                  \R                  S-  4   4
S jjrSrU =r$ )ModernBertAttention   an  Performs multi-headed self attention on a batch of unpadded sequences.

If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
which requires padding and unpadding inputs, adding some overhead.

See `forward` method for additional details.
Nr'   	layer_idxc                 P  > [         TU ]  5         Xl        X l        UR                  UR
                  -  S:w  a&  [        SUR                   SUR
                   S35      eUR                  U l        UR                  U l        UR                  UR
                  -  U l	        [        R                  " UR                  SU R                  -  UR
                  -  UR                  S9U l        UR                  U   S:X  a  UR                  S-   U l        OS U l        S	U l        [        R                  " UR                  UR                  UR                  S9U l        UR                  S
:  a&  [        R$                  " UR                  5      U l        g [        R&                  " 5       U l        g )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r	   rZ   sliding_attentionr"   F        )r-   r.   r'   r   r1   r   
ValueErrorattention_dropoutdeterministic_flash_attnr   r   r[   attention_biasWqkvr   sliding_window	is_causalrc   r8   Identityout_dropr<   r'   r   r=   s      r>   r.   ModernBertAttention.__init__   sv   " : ::a?#F$6$6#77mnt  oI  oI  nJ  JK  L  "(!9!9(.(G(G%**f.H.HHIIDMM 1F4N4N NU[UjUj
	 i(,?? #)"7"7!";D"&D))F..0B0BI^I^_@F@X@X[^@^

6#;#;<dfdododqr@   rF   position_embeddingsr   r   rC   c                    UR                   S S nU R                  U5      nUR                  " / UQSPSPU R                  P76 nUR	                  SS9u  pxn	UR                  SS5      nUR                  SS5      nU	R                  SS5      n	Uu  p[        XxXSS9u  px[        R                  " U R                  R                  [        5      nU" U UUU	U4U R                  (       a  U R                  OSU R                  S	-  U R                  U R                  S
.UD6u  pUR                   " / UQSP76 R#                  5       nU R%                  U R'                  U5      5      nX4$ )Nrf   r	   rg   r"   rY   )r   r         )r   r   r   deterministic)r   r   viewr   unbindr   r   r   get_interfacer'   _attn_implementationr   r   r   r   r   reshaper   r   rc   )r<   rF   r   r   r   input_shapeqkvquery_states
key_statesvalue_statesr   r   attention_interfacer   r   s                  r>   rG   ModernBertAttention.forward  so    $))#2.ii&hh::Q::DMM:141C.,#--a3))!Q/
#--a3&#7RUjk#l (?(M(MKK,,.E)
 %8%
 /3mmD**MM4'..77%
 %
! "));;;;FFHmmDGGK$89((r@   )
rc   r   r   r'   r   r   r   r   r   r   rE   rI   )rJ   rK   rL   rM   rN   r#   r\   r.   rO   rQ   r   r   r   rG   rR   rS   rT   s   @r>   r   r      s    r/ rC$J r r@ IM.2	')||') #5<<#=>E') t+	')
 +,') 
u||U\\D00	1') ')r@   r   c                      ^  \ rS rSrSS\S\S-  4U 4S jjjr  SS\R                  S\R                  S-  S\R                  S-  S	\	\
   S
\R                  4
S jjrSrU =r$ )ModernBertEncoderLayeri7  Nr'   r   c                   > [         TU ]  5         Xl        X l        US:X  a  [        R
                  " 5       U l        O9[        R                  " UR                  UR                  UR                  S9U l        [        XS9U l        [        R                  " UR                  UR                  UR                  S9U l        [        U5      U l        UR                   U   U l        g )Nr   r*   )r'   r   )r-   r.   r'   r   r   r   	attn_normr4   r1   r5   r6   r   attnmlp_normrV   mlpr   attention_typer   s      r>   r.   ModernBertEncoderLayer.__init__8  s    ">[[]DN\\&*<*<&//X^XhXhiDN'vK	V%7%7V__SYScScd ($00;r@   rF   r   r   r   rC   c                     U R                   " U R                  U5      4UUS.UD6u  pVX-   nXR                  U R                  U5      5      -   nU$ )N)r   r   )r   r   r  r   )r<   rF   r   r   r   r   _s          r>   rG   ModernBertEncoderLayer.forwardE  s_     NN=)
 3)
 	
 &3%}1M(NNr@   )r  r   r   r'   r   r  r   rE   rI   )rJ   rK   rL   rM   r#   r\   r.   rO   rQ   r   r   rG   rR   rS   rT   s   @r>   r   r   7  s    </ <C$J < <  /337	|| t+ #\\D0	
 +, 
 r@   r   c                       \ rS rSr% \\S'   SrSrSS/rSr	Sr
SrSr\\S.r\R"                  " 5       S\R&                  4S	 j5       rS
rg)ModernBertPreTrainedModeliW  r'   modelTr%   r   )rF   
attentionsr   c                 Z  ^ U R                   R                  mTc  SmS[        R                  S[        4U4S jjnU R                   R
                  U R                   R
                  [        R                  " SU R                   R                  -  5      -  U R                   R
                  U R                   R                  S-  S.n[        U[        5      (       a  U" UR                  US   5        g [        U[        5      (       a-  U" UR                  US	   5        U" UR                  US
   5        g [        U[         5      (       a-  U" UR"                  US	   5        U" UR                  US
   5        g [        U[$        5      (       a  U" UR&                  US
   5        g [        U[(        5      (       a  U" UR*                  US
   5        g [        U[,        [.        [0        [2        45      (       a  U" UR4                  US   5        g [        U[        R6                  5      (       aO  [8        R:                  " UR<                  5        UR>                  b!  [8        R@                  " UR>                  5        g g [        U[B        5      (       a  URD                   H  nURF                  nURH                  U   S:w  a  [J        URH                  U      nU" UR                   US9u  pg[8        RL                  " [O        X S35      U5        [8        RL                  " [O        X S35      U5        M     g g )Nr	   r   stdc                    > [         R                  " U R                  SUT* U-  TU-  S9  [        U [        R
                  5      (       a/  U R                  b!  [         R                  " U R                  5        g g g )Nr   )meanr  ab)inittrunc_normal_weightr   r   r[   r,   zeros_)r   r  cutoff_factors     r>   init_weight<ModernBertPreTrainedModel._init_weights.<locals>.init_weightm  sg     .3&#% &")),,;;*KK, + -r@   g       @r   )inout	embedding	final_outr  r  r  r  rs   rt   rv   rx   )(r'   initializer_cutoff_factorr   Moduler   initializer_rangemathsqrtnum_hidden_layersr1   r   r%   r3   rV   r_   rc   r   r   ModernBertPredictionHeaddenseModernBertForMaskedLMdecoder#ModernBertForSequenceClassificationModernBertForMultipleChoice ModernBertForTokenClassificationModernBertForQuestionAnswering
classifierr4   r  ones_r  r,   r  rn   r   r   rr   r   copy_r   )	r<   r   r  stdsru   r   r   r  r  s	           @r>   _init_weights'ModernBertPreTrainedModel._init_weightsg  sq   == M	-		 	- 	- ++//;;00499S4;;C`C`=`3aa6600$6	
 f233--tK/@A..		4:.		4;/ 344T$Z0		4;/ 899d5k2 566U43+0.	
 
 ))4+<=--JJv}}%{{&FKK( ' 9::$00
%EE##J/9<#6v7G7G
7S#TL#/*#U 

76\+CDmT

76\9K+LM}] 1 ;r@    N)rJ   rK   rL   rM   r#   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsrO   r   r   r  r.  rR   r0  r@   r>   r  r  W  so    &*#/1IJN"& 0)
 ]]_:^BII :^ :^r@   r  c                      ^  \ rS rSrS\4U 4S jjrS rS r\\	\
    SS\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\\   S\4S jj5       5       5       rSrU =r$ )ModernBertModeli  r'   c           	        > [         TU ]  U5        Xl        [        U5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l
        [
        R                  " UR                  UR                  UR                  S9U l        [!        US9U l        SU l        U R'                  5         g s  snf )Nr*   )r'   F)r-   r.   r'   r%   
embeddingsr   
ModuleListranger!  r   layersr4   r1   r5   r6   
final_normrn   
rotary_embgradient_checkpointing	post_initr   s      r>   r.   ModernBertModel.__init__  s     .v6mmHMfNfNfHghHg9#F6Hgh
 ,,v'9'9vU[UeUef36B&+# is   Cc                 .    U R                   R                  $ rE   r<  r3   r<   s    r>   get_input_embeddings$ModernBertModel.get_input_embeddings  s    ---r@   c                 $    XR                   l        g rE   rF  )r<   r   s     r>   set_input_embeddings$ModernBertModel.set_input_embeddings  s    ).&r@   NrA   r   r   rB   r   rC   c                    US L US L-  (       a  [        S5      eUb  UR                  S   OUR                  S   nUb  UR                  OUR                  nUc#  [        R                  " XgS9R                  S5      nU R                  XS9n[        U=n	[        5      (       d'  U R                  UUS.n
[        S
0 U
D6[        S
0 U
D6S.n	0 n[        U R                  R                  5       H  nU R                  XU5      X'   M     U R                   H'  nU" U4XR                      XR                      S.UD6nM)     U R#                  U5      n[%        US	9$ )Nz:You must specify exactly one of input_ids or inputs_embedsr"   r   r   )rA   rB   )r'   rB   r   )full_attentionr   )r   r   )last_hidden_stater0  )r   r   r   rO   r   r   r<  r   dictr'   r   r   r~   r   rA  r?  r  r@  r   )r<   rA   r   r   rB   r   r   r   rF   attention_mask_mappingmask_kwargsr   ru   encoder_layers                 r>   rG   ModernBertModel.forward  sg    -t";<YZZ,9,E-%%a(9??[\K]%.%:!!@T@T <<?II!LL)YNB0DII++!."0K #<"Jk"J%M%\P[%\&"
 !dkk556J.2oom[e.f+ 7 "[[M)56R6RS$78T8T$U 	M ) 6??r@   )r'   r<  r@  rB  r?  rA  r   )rJ   rK   rL   rM   r#   r.   rH  rK  r    r!   r   rO   rP   rQ   r   r   r   rG   rR   rS   rT   s   @r>   r:  r:    s    
/ 
./   .2.204-1,@##d*,@ t+,@ &&-	,@
 ||d*,@ +,,@ 
,@    ,@r@   r:  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )r"  i  r'   c                 F  > [         TU ]  5         Xl        [        R                  " UR
                  UR
                  UR                  5      U l        [        UR                     U l
        [        R                  " UR
                  UR                  UR                  S9U l        g )Nr*   )r-   r.   r'   r   r[   r1   classifier_biasr#  r   classifier_activationra   r4   r5   r6   r7   r;   s     r>   r.   !ModernBertPredictionHead.__init__  so    YYv1163E3EvG]G]^
&667LL!3!3vO_O_`	r@   rF   rC   c                 `    U R                  U R                  U R                  U5      5      5      $ rE   )r7   ra   r#  )r<   rF   s     r>   rG    ModernBertPredictionHead.forward  s#    yy$**]";<==r@   )ra   r'   r#  r7   )rJ   rK   rL   rM   r#   r.   rO   rQ   rG   rR   rS   rT   s   @r>   r"  r"    s2    a/ a>U\\ >ell > >r@   r"  zd
    The ModernBert Model with a decoder head on top that is used for masked language modeling.
    )custom_introc                   P  ^  \ rS rSrSS0rS\4U 4S jjrS rS\R                  4S jr
\\     SS
\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\\   S\\R                     \-  4S jj5       5       rSrU =r$ )r$  i  zdecoder.weightz&model.embeddings.tok_embeddings.weightr'   c                 n  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        R                  " UR                  UR                  UR                  S9U l        U R                  R                  U l        U R                  R                  U l        U R                  5         g )NrZ   )r-   r.   r'   r:  r	  r"  headr   r[   r1   r0   decoder_biasr%  sparse_predictionsparse_pred_ignore_indexrC  r;   s     r>   r.   ModernBertForMaskedLM.__init__  s     $V,
,V4	yy!3!3V5F5FVM`M`a!%!>!>(,(L(L% 	r@   c                     U R                   $ rE   r%  rG  s    r>   get_output_embeddings+ModernBertForMaskedLM.get_output_embeddings  s    ||r@   new_embeddingsc                     Xl         g rE   rf  )r<   ri  s     r>   set_output_embeddings+ModernBertForMaskedLM.set_output_embeddings  s    %r@   NrA   r   r   rB   labelsr   rC   c                    U R                   " SUUUUS.UD6nUS   nU R                  (       aI  UbF  UR                  S5      nUR                  UR                  S   S5      nXPR                  :g  n	X   nXY   nU R                  U R                  U5      5      n
S nUb)  U R                  " X4SU R                  R                  0UD6n[        UU
UR                  UR                  S9$ )NrA   r   r   rB   r   rf   r0   losslogitsrF   r
  r0  )r	  rb  r   r   rc  r%  r`  loss_functionr'   r0   r   rF   r
  )r<   rA   r   r   rB   rm  r   outputsrP  mask_tokensrr  rq  s               r>   rG   ModernBertForMaskedLM.forward  s     ** 
)%'	

 
 $AJ!!f&8[[_F 1 6 6v||A K !$A$AAK 1 >(Fdii(9:;%%fbAWAWb[abD!//))	
 	
r@   )r'   r%  r`  r	  rc  rb  NNNNN)rJ   rK   rL   rM   _tied_weights_keysr#   r.   rg  r   r[   rk  r   r   rO   rP   rQ   r   r   r   r   rG   rR   rS   rT   s   @r>   r$  r$    s     +,TU/ &BII &  .2.2,0-1&*'
##d*'
 t+'
 llT)	'

 ||d*'
 t#'
 +,'
 
u||	~	-'
  '
r@   r$  z`
    The ModernBert Model with a sequence classification head on top that performs pooling.
    c                      ^  \ rS rSrS\4U 4S jjr\\     SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\\   S\\R                     \-  4S jj5       5       rSrU =r$ )r&  i>  r'   c                 n  > [         TU ]  U5        UR                  U l        Xl        [	        U5      U l        [        U5      U l        [        R                  R                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        U R!                  5         g rE   )r-   r.   
num_labelsr'   r:  r	  r"  r`  rO   r   r8   classifier_dropoutr:   r[   r1   r*  rC  r;   s     r>   r.   ,ModernBertForSequenceClassification.__init__D  s      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJ 	r@   NrA   r   r   rB   rm  r   rC   c                 b   U R                   " SUUUUS.UD6nUS   nU R                  R                  S:X  a
  USS2S4   nOU R                  R                  S:X  ao  Uc;  [        R                  " UR
                  SS UR                  [        R                  S9nXR                  S5      -  R                  S	S
9UR                  S	SS9-  nU R                  U5      n	U R                  U	5      n	U R                  U	5      n
SnUGb  U R                  R                  c  U R                  S	:X  a  SU R                  l        OoU R                  S	:  aN  UR                  [        R                   :X  d  UR                  [        R"                  :X  a  SU R                  l        OSU R                  l        U R                  R                  S:X  aI  [%        5       nU R                  S	:X  a&  U" U
R'                  5       UR'                  5       5      nOU" X5      nOU R                  R                  S:X  a=  [)        5       nU" U
R+                  SU R                  5      UR+                  S5      5      nO,U R                  R                  S:X  a  [-        5       nU" X5      n[/        UU
UR0                  UR2                  S9$ )ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
ro  r   clsNr  rY   r   rf   r"   rg   Trh   keepdim
regressionsingle_label_classificationmulti_label_classificationrp  r0  )r	  r'   classifier_poolingrO   onesr   r   boolr   sumr`  r:   r*  problem_typer{  r   longr\   r   squeezer   r   r   r   rF   r
  )r<   rA   r   r   rB   rm  r   rt  rP  pooled_outputrr  rq  loss_fcts                r>   rG   +ModernBertForSequenceClassification.forwardQ  s]   " ** 
)%'	

 
 $AJ;;))U2 1!Q$ 7[[++v5%!&%++BQ/8I8P8PX]XbXb" "35M5Mb5Q!Q V V[\ V ]`n`r`rt as a ! 		"34		-0/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
r@   )r*  r'   r:   r`  r	  r{  rw  )rJ   rK   rL   rM   r#   r.   r   r   rO   rP   rQ   r   r   r   r   rG   rR   rS   rT   s   @r>   r&  r&  >  s    /   .2.2,0-1&*C
##d*C
 t+C
 llT)	C

 ||d*C
 t#C
 +,C
 
u||	7	7C
  C
r@   r&  zv
    The ModernBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.
    c                      ^  \ rS rSrS\4U 4S jjr\\     SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\\   S\\R                     \-  4S jj5       5       rSrU =r$ )r(  i  r'   c                 b  > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        [        R                  R                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        U R                  5         g rE   r-   r.   r{  r:  r	  r"  r`  rO   r   r8   r|  r:   r[   r1   r*  rC  r;   s     r>   r.   )ModernBertForTokenClassification.__init__  s{      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJ 	r@   NrA   r   r   rB   rm  r   rC   c                 b   U R                   " SUUUUS.UD6nUS   nU R                  U5      nU R                  U5      nU R                  U5      n	Sn
Ub<  [	        5       nU" U	R                  SU R                  5      UR                  S5      5      n
[        U
U	UR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
ro  r   Nrf   rp  r0  )
r	  r`  r:   r*  r   r   r{  r   rF   r
  )r<   rA   r   r   rB   rm  r   rt  rP  rr  rq  r  s               r>   rG   (ModernBertForTokenClassification.forward  s     ** 
)%'	

 
 $AJ II&78 II&78!23')HFKKDOO<fkk"oND$!//))	
 	
r@   r*  r:   r`  r	  r{  rw  )rJ   rK   rL   rM   r#   r.   r   r   rO   rP   rQ   r   r   r   r   rG   rR   rS   rT   s   @r>   r(  r(    s    
/ 
  .2.2,0-1&*$
##d*$
 t+$
 llT)	$

 ||d*$
 t#$
 +,$
 
u||	4	4$
  $
r@   r(  c                      ^  \ rS rSrS\4U 4S jjr\\     SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\
\   S\\R                     \-  4S jj5       5       rSrU =r$ )r)  i  r'   c                 b  > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        [        R                  R                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        U R                  5         g rE   r  r;   s     r>   r.   'ModernBertForQuestionAnswering.__init__  sy      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJr@   NrA   r   r   start_positionsend_positionsr   rC   c                    U R                   " U4UUS.UD6nUS   nU R                  U5      nU R                  U5      nU R                  U5      n	U	R	                  SSS9u  pU
R                  S5      R                  5       n
UR                  S5      R                  5       nS nUb  Ub  U R                  " XXE40 UD6n[        UU
UUR                  UR                  S9$ )N)r   r   r   r"   rf   rg   )rq  start_logits
end_logitsrF   r
  )r	  r`  r:   r*  splitr  r   rs  r   rF   r
  )r<   rA   r   r   r  r  r   rt  rP  rr  r  r  rq  s                r>   rG   &ModernBertForQuestionAnswering.forward  s     **
)%
 	
 $AJ II&78 II&78!23#)<<r<#: #++B/::<''+668
&=+D%%libhiD+%!!//))
 	
r@   r  rw  )rJ   rK   rL   rM   r#   r.   r   r   rO   rQ   r   r   r   r   rG   rR   rS   rT   s   @r>   r)  r)    s    	/ 	  *..2,0/3-1#
<<$&#
 t+#
 llT)	#

 ,#
 ||d*#
 +,#
 
u||	;	;#
  #
r@   r)  z
    The ModernBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
    c                      ^  \ rS rSrS\4U 4S jjr\\     SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\\   S\\R                     \-  4S jj5       5       rSrU =r$ )r'  i	  r'   c                 8  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        R                  R                  UR                  5      U l        [        R                  " UR                  S5      U l        U R                  5         g )Nr"   )r-   r.   r'   r:  r	  r"  r`  rO   r   r8   r|  r:   r[   r1   r*  rC  r;   s     r>   r.   $ModernBertForMultipleChoice.__init__  sm     $V,
,V4	HH$$V%>%>?	))F$6$6: 	r@   NrA   r   r   rB   rm  r   rC   c                    Ub  UR                   S   OUR                   S   nUb!  UR                  SUR                  S5      5      OSnUb!  UR                  SUR                  S5      5      OSnUb!  UR                  SUR                  S5      5      OSnUb1  UR                  SUR                  S5      UR                  S5      5      OSnU R                  " SUUUUS.UD6nUS   n	U R                  R
                  S:X  a  [        R                  " U	R                   S   U	R                  S9n
Ub)  UR                  SS	9R                  U	R                  5      nO.[        R                  " S[        R                  U	R                  S
9nXU4   n	OMU R                  R
                  S:X  a3  UR                  SSS9nXR                  S5      -  R                  SS	9U-  n	U R                  U	5      nU R!                  U5      nU R#                  U5      nUR                  SU5      nSnUb  [$        R&                  " 5       nU" X5      n[)        UUUR*                  UR,                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors.
Nr"   rf   ro  r   r  rN  rg   )r   r   r  Tr  rp  r0  )r   r   sizer	  r'   r  rO   r   r   argmaxr   tensorr  r  r   r`  r:   r*  r   r   r   rF   r
  )r<   rA   r   r   rB   rm  r   num_choicesrt  rP  	indices_0cls_masknum_non_pad_tokensr  rr  reshaped_logitsrq  r  s                     r>   rG   #ModernBertForMultipleChoice.forward  sf     -6,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ** 
)%'	

 
 $AJ ;;))U2%6%<%<Q%?HYH`H`aI))00R08;;<M<T<TU !<<DUD\D\] 1X2E F [[++v5!/!3!34!3!H!25M5Mb5Q!Q V V[\ V ]`r r		"34		-0/ ++b+6**,HO4D("!//))	
 	
r@   )r*  r'   r:   r`  r	  rw  )rJ   rK   rL   rM   r#   r.   r   r   rO   rP   rQ   r   r   r   r   rG   rR   rS   rT   s   @r>   r'  r'  	  s    
/ 
  .2.2,0-1&*C
##d*C
 t+C
 llT)	C

 ||d*C
 t#C
 +,C
 
u||	8	8C
  C
r@   r'  )r:  r  r$  r&  r(  r)  r'  )r   )r"   )Fr  collections.abcr   typingr   rO   r   torch.nnr   r   r    r
   r  activationsr   integrationsr   r   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   utils.genericr   r   r    utils.output_capturingr!   configuration_modernbertr#   r  r%   rV   rn   rQ   r   r   r   r   r   r   r  r:  r"  r$  r&  r(  r)  r'  __all__r0  r@   r>   <module>r     s[  ,  $    A A & ! I ` 9  L F & 7 Y Y 5 6299 ,:BII :(L<		 L<l %II%<<% 
% <<	%
 LL4'% % %,( *+B ,B4 )*N)")) N) +N)b7 @ J^ J^ J^Z B@/ B@ B@J	>ryy 	> 
?
5 ?

?
D 
S
*C S

S
l 
3
'@ 3

3
l 1
%> 1
 1
h 
R
"; R

R
jr@   