
    Z j"                        S SK r S SKJr  S SKJrJr  S SKrS SKJr  S SKJ	r	  S SK
JrJrJr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJrJr  SSKJr  SSKJrJrJ r J!r!J"r"J#r#  SSK$J%r%  SSK&J'r'J(r(  SSK)J*r*  SSK+J,r,J-r-J.r.  SSK/J0r0J1r1  SSK2J3r3  SSK4J5r5  SSK6J7r7J8r8  \.Rr                  " \:5      r;\-" SS9\ " S S\5      5       5       r< " S S\	Rz                  5      r> " S S\	Rz                  5      r? " S  S!\75      r@\" S"5      S>S# j5       rA\" \A5       " S$ S%\	Rz                  5      5       rB " S& S'\5      rC\- " S( S)\(5      5       rD\- " S* S+\D5      5       rE " S, S-\	Rz                  5      rF\-" S.S/9 " S0 S1\D5      5       rG\-" S2S/9 " S3 S4\D5      5       rH\-" S5S/9 " S6 S7\D5      5       rI\- " S8 S9\D5      5       rJ\-" S:S/9 " S; S<\D5      5       rK/ S=QrLg)?    N)Callable)LiteralOptional)strict)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)PreTrainedConfig)use_kernel_func_from_hubuse_kernelized_func)create_bidirectional_mask(create_bidirectional_sliding_window_mask)GradientCheckpointingLayer)BaseModelOutputMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONS)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)can_return_tuplemerge_with_config_defaults)capture_outputs   )eager_attention_forward)Gemma3RotaryEmbeddingrotate_halfzanswerdotai/ModernBERT-base)
checkpointc                     ^  \ rS rSr% SrSrS/rSSS.rSr\	\
S	'   S
r\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\\
S'   Sr\	\
S'   Sr\\
S'   Sr\\
S'   Sr\\
S'   Sr\\
S'   Sr\	S-  \
S '   S!r\	\\	   -  S-  \
S"'   S#r\	S-  \
S$'   S#r\	S-  \
S%'   S!r\	S-  \
S&'   Sr\\
S''   S(r\\	-  \
S)'   Sr \\   S-  \
S*'   Sr!\"\#S+   \"4   S-  \
S,'   S-r$\	\
S.'   S(r%\\	-  \
S/'   Sr&\\
S0'   S(r'\\	-  \
S1'   S2r(\\
S3'   S4r)\#S5   \
S6'   S(r*\\	-  \
S7'   Sr+\\
S8'   Sr,\\
S9'   Sr-\\
S:'   Sr.\\
S;'   S<r/\	\
S='   S2r0\\
S>'   U 4S? jr1S@ r2U 4SA jr3\4SB 5       r5\5Rl                  SC 5       r5SDr7U =r8$ )EModernBertConfig4   a  
initializer_cutoff_factor (`float`, *optional*, defaults to 2.0):
    The cutoff factor for the truncated_normal_initializer for initializing all weight matrices.
norm_eps (`float`, *optional*, defaults to 1e-05):
    The epsilon used by the rms normalization layers.
norm_bias (`bool`, *optional*, defaults to `False`):
    Whether to use bias in the normalization layers.
local_attention (`int`, *optional*, defaults to 128):
    The window size for local attention.
mlp_dropout (`float`, *optional*, defaults to 0.0):
    The dropout ratio for the MLP layers.
decoder_bias (`bool`, *optional*, defaults to `True`):
    Whether to use bias in the decoder layers.
classifier_pooling (`str`, *optional*, defaults to `"cls"`):
    The pooling method for the classifier. Should be either `"cls"` or `"mean"`. In local attention layers, the
    CLS token doesn't attend to all tokens on long sequences.
classifier_bias (`bool`, *optional*, defaults to `False`):
    Whether to use bias in the classifier.
classifier_activation (`str`, *optional*, defaults to `"gelu"`):
    The activation function for the classifier.
deterministic_flash_attn (`bool`, *optional*, defaults to `False`):
    Whether to use deterministic flash attention. If `False`, inference will be faster but not deterministic.
sparse_prediction (`bool`, *optional*, defaults to `False`):
    Whether to use sparse prediction for the masked language model instead of returning the full dense logits.
sparse_pred_ignore_index (`int`, *optional*, defaults to -100):
    The index to ignore for the sparse prediction.

Examples:

```python
>>> from transformers import ModernBertModel, ModernBertConfig

>>> # Initializing a ModernBert style configuration
>>> configuration = ModernBertConfig()

>>> # Initializing a model from the modernbert-base style configuration
>>> model = ModernBertModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```
modernbertpast_key_valuesg     Ag     @)globallocali  
vocab_sizei   hidden_sizei  intermediate_size   num_hidden_layers   num_attention_headsgeluhidden_activationi    max_position_embeddingsg{Gz?initializer_range       @initializer_cutoff_factorgh㈵>norm_epsF	norm_biasik  Npad_token_idij  eos_token_idii  bos_token_idcls_token_idsep_token_idattention_bias        attention_dropoutlayer_typesfull_attentionsliding_attentionrope_parameters   local_attentionembedding_dropoutmlp_biasmlp_dropoutTdecoder_biascls)rR   meanclassifier_poolingclassifier_dropoutclassifier_biasclassifier_activationdeterministic_flash_attnsparse_predictionisparse_pred_ignore_indextie_word_embeddingsc                    > UR                  SS5      nU R                  c?  [        U R                  5       Vs/ s H  n[	        X2-  5      (       a  SOSPM     snU l        [
        TU ]  " S0 UD6  g s  snf )Nglobal_attn_every_n_layersr   rJ   rI    )getrG   ranger4   boolsuper__post_init__)selfkwargsr]   i	__class__s       ڂ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/modernbert/modular_modernbert.pyrc   ModernBertConfig.__post_init__   sy    %+ZZ0La%P"# t556 6A (,A,J'K'K#Qaa6 D
 	'' s    A1c                    UR                  SS 5      nSS0SS0S.nU R                  b  U R                  OUU l        Ub<  U R                  S   R                  U5        U R                  S   R                  U5        U R                  R                  S5      c  SS0U R                  S'   U R                  S   R	                  SUR                  SU R
                  S	   5      5        U R                  R                  S5      c  SS0U R                  S'   U R                  S   R	                  SUR                  S
U R
                  S   5      5        U R                  5         U$ )Nrope_scaling	rope_typedefault)rJ   rI   rI   rJ   
rope_thetaglobal_rope_thetar.   local_rope_thetar/   )poprK   updater_   
setdefaultdefault_thetastandardize_rope_params)rd   re   rk   default_rope_paramss       rh   convert_rope_params_to_dict,ModernBertConfig.convert_rope_params_to_dict   sc   zz.$7
 #.y!9*I6
 8<7K7K7Wt33]p#  !1299,G  !45<<\J ##$45=6A95MD  !12-.99&**%8$:L:LX:VW	
 ##$78@9Di8PD  !4501<<&**%79K9KG9TU	

 	$$&    c                 H   > [         TU ]  5       nUR                  SS 5        U$ )Nreference_compile)rb   to_dictrq   )rd   outputrg   s     rh   r|   ModernBertConfig.to_dict   s#    "

&-ry   c                      U R                   S-  $ )zKHalf-window size: `local_attention` is the total window, so we divide by 2.r$   rM   rd   s    rh   sliding_windowModernBertConfig.sliding_window   s     ##q((ry   c                     US-  U l         g)z<Set sliding_window by updating local_attention to 2 * value.r$   Nr   rd   values     rh   r   r      s      %qyry   )rG   rM   rK   )9__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencert   r0   int__annotations__r1   r2   r4   r6   r8   strr9   r:   floatr<   r=   r>   ra   r?   r@   listrA   rB   rC   rD   rF   rG   rK   dictr   rM   rN   rO   rP   rQ   rT   rU   rV   rW   rX   rY   rZ   r[   rc   rw   r|   propertyr   setter__static_attributes____classcell__rg   s   @rh   r*   r*   4   s   (T J#4"5(8<MJK!s!s!!#s##'S'#u#'*u*HeIt$L#*$+0L#S	/D(0$L#*$$L#*$$L#*$ ND %(us{($(KcT!(Y]OT'"GH$NORVV]OS%(us{(Hd"K"L$16.6&))!OT!!'3'%*d*#t#$(c( $$	(<
 ) ) ) )ry   r*   c                      ^  \ rS rSrSrS\4U 4S jjr SS\R                  S-  S\R                  S-  S\R                  4S	 jjr
S
rU =r$ )ModernBertEmbeddings   zN
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
configc                 \  > [         TU ]  5         Xl        [        R                  " UR
                  UR                  UR                  S9U l        [        R                  " UR                  UR                  UR                  S9U l        [        R                  " UR                  5      U l        g )N)padding_idxepsbias)rb   __init__r   r   	Embeddingr0   r1   r?   tok_embeddings	LayerNormr=   r>   normDropoutrN   droprd   r   rg   s     rh   r   ModernBertEmbeddings.__init__   su     ll6+<+<f>P>P^d^q^qrLL!3!3vO_O_`	JJv778	ry   N	input_idsinputs_embedsreturnc                     Ub"  U R                  U R                  U5      5      nU$ U R                  U R                  U R                  U5      5      5      nU$ N)r   r   r   )rd   r   r   hidden_statess       rh   forwardModernBertEmbeddings.forward   sS     $ IIdii&>?M  !IIdii0C0CI0N&OPMry   )r   r   r   r   NN)r   r   r   r   r   r*   r   torch
LongTensorTensorr   r   r   r   s   @rh   r   r      sW    9/ 9 _c))D0HMW[H[	 ry   r   c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	ModernBertMLP   a*  Applies the GLU at the end of each ModernBERT layer.

Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality.
r   c                   > [         TU ]  5         Xl        [        R                  " UR
                  [        UR                  5      S-  UR                  S9U l	        [        UR                     U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR
                  UR                  S9U l        g )Nr$   r   )rb   r   r   r   Linearr1   r   r2   rO   Wir   r8   actr   rP   r   Wor   s     rh   r   ModernBertMLP.__init__   s    ))F..F4L4L0MPQ0QX^XgXgh&223JJv112	))F44f6H6Hv_ry   r   r   c                     U R                  U5      R                  SSS9u  p#U R                  U R                  U R	                  U5      U-  5      5      $ )Nr$   dim)r   chunkr   r   r   )rd   r   inputgates       rh   r   ModernBertMLP.forward   sG    ggm,221"2=wwtyy%4!7899ry   )r   r   r   r   r   )r   r   r   r   r   r*   r   r   r   r   r   r   r   s   @rh   r   r      s7    `/ `:U\\ :ell : :ry   r   c                      ^  \ rS rSrSS\4U 4S jjjr\    SS\S-  S\S   S\S-  S\	S-  S	\
S
\4   4
U 4S jjj5       rSrU =r$ )ModernBertRotaryEmbedding   Nr   c                 $   > [         TU ]  X5        g r   )rb   r   )rd   r   devicerg   s      rh   r   "ModernBertRotaryEmbedding.__init__   s    (ry   r   ztorch.deviceseq_len
layer_typer   ztorch.Tensorc                 $   > [         TU ]  XX#5      $ r   )rb   compute_default_rope_parameters)r   r   r   r   rg   s       rh   r   9ModernBertRotaryEmbedding.compute_default_rope_parameters   s     w6vw[[ry   r^   r   NNNN)r   r   r   r   r*   r   staticmethodr   r   r   tupler   r   r   r   r   s   @rh   r   r      s    )/ ) ) *.+/"!%	\ 4'\(\ t\ $J	\
 
~u$	%\ \ry   r   rotary_pos_embc                 b   U R                   nUR                  U5      nUR                  U5      nU R                  5       U-  [        U R                  5       5      U-  -   nUR                  5       U-  [        UR                  5       5      U-  -   nUR	                  U5      UR	                  U5      4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)dtype	unsqueezer   r'   to)qkcossinunsqueeze_dimoriginal_dtypeq_embedk_embeds           rh   apply_rotary_pos_embr      s    & WWN
--
&C
--
&Cwwy3;qwwy#9C#?@Gwwy3;qwwy#9C#?@G::n%wzz.'AAAry   c                     ^  \ rS rSrSrSS\S\S-  4U 4S jjjr  SS\R                  S\
\R                  \R                  4   S-  S	\R                  S-  S
\\   S\
\R                  \R                  S-  4   4
S jjrSrU =r$ )ModernBertAttentioni  an  Performs multi-headed self attention on a batch of unpadded sequences.

If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
which requires padding and unpadding inputs, adding some overhead.

See `forward` method for additional details.
Nr   	layer_idxc                 P  > [         TU ]  5         Xl        X l        UR                  UR
                  -  S:w  a&  [        SUR                   SUR
                   S35      eUR                  U l        UR                  U l        UR                  UR
                  -  U l	        [        R                  " UR                  SU R                  -  UR
                  -  UR                  S9U l        UR                  U   S:X  a  UR                  S-   U l        OS U l        S	U l        [        R                  " UR                  UR                  UR                  S9U l        UR                  S
:  a&  [        R$                  " UR                  5      U l        g [        R&                  " 5       U l        g )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   r   rJ      FrE   )rb   r   r   r   r1   r6   
ValueErrorrF   rX   head_dimr   r   rD   WqkvrG   r   	is_causalr   r   Identityout_droprd   r   r   rg   s      rh   r   ModernBertAttention.__init__  sv   " : ::a?#F$6$6#77mnt  oI  oI  nJ  JK  L  "(!9!9(.(G(G%**f.H.HHIIDMM 1F4N4N NU[UjUj
	 i(,?? #)"7"7!";D"&D))F..0B0BI^I^_@F@X@X[^@^

6#;#;<dfdododqry   r   position_embeddingsattention_maskre   r   c                    UR                   S S nU R                  U5      nUR                  " / UQSPSPU R                  P76 nUR	                  SS9u  pxn	UR                  SS5      nUR                  SS5      nU	R                  SS5      n	Uu  p[        XxXSS9u  px[        R                  " U R                  R                  [        5      nU" U UUU	U4U R                  (       a  U R                  OSU R                  S	-  U R                  U R                  S
.UD6u  pUR                   " / UQSP76 R#                  5       nU R%                  U R'                  U5      5      nX4$ )Nr   r   r   r   r$   )r   rE         )dropoutscalingr   deterministic)shaper   viewr   unbind	transposer   r   get_interfacer   _attn_implementationr%   trainingrF   r   rX   reshape
contiguousr   r   )rd   r   r   r   re   input_shapeqkvquery_states
key_statesvalue_statesr   r   attention_interfaceattn_outputattn_weightss                  rh   r   ModernBertAttention.forward;  so    $))#2.ii&hh::Q::DMM:141C.,#--a3))!Q/
#--a3&#7RUjk#l (?(M(MKK,,.E)
 %8%
 /3mmD**MM4'..77%
 %
! "));;;;FFHmmDGGK$89((ry   )
r   r   rF   r   rX   r   r   r   r   r   r   r   )r   r   r   r   r   r*   r   r   r   r   r   r   r   r   r   r   r   s   @rh   r   r     s    r/ rC$J r r@ IM.2	')||') #5<<#=>E') t+	')
 +,') 
u||U\\D00	1') ')ry   r   c                      ^  \ rS rSrSS\S\S-  4U 4S jjjr  SS\R                  S\R                  S-  S\R                  S-  S	\	\
   S
\R                  4
S jjrSrU =r$ )ModernBertEncoderLayerie  Nr   r   c                   > [         TU ]  5         Xl        X l        US:X  a  [        R
                  " 5       U l        O9[        R                  " UR                  UR                  UR                  S9U l        [        XS9U l        [        R                  " UR                  UR                  UR                  S9U l        [        U5      U l        UR                   U   U l        g )Nr   r   )r   r   )rb   r   r   r   r   r   	attn_normr   r1   r=   r>   r   attnmlp_normr   mlprG   attention_typer   s      rh   r   ModernBertEncoderLayer.__init__f  s    ">[[]DN\\&*<*<&//X^XhXhiDN'vK	V%7%7V__SYScScd ($00;ry   r   r   r   re   r   c                     U R                   " U R                  U5      4UUS.UD6u  pVX-   nXR                  U R                  U5      5      -   nU$ )N)r   r   )r  r  r  r  )rd   r   r   r   re   r
  _s          rh   r   ModernBertEncoderLayer.forwards  s_     NN=)
 3)
 	
 &3%}1M(NNry   )r  r  r  r   r   r  r  r   r   )r   r   r   r   r*   r   r   r   r   r   r   r   r   r   r   s   @rh   r  r  e  s    </ <C$J < <  /337	|| t+ #\\D0	
 +, 
 ry   r  c                       \ rS rSr% \\S'   SrSrSS/rSr	Sr
SrSr\\S.r\R"                  " 5       S\R&                  4S	 j5       rS
rg)ModernBertPreTrainedModeli  r   modelTr   r  )r   
attentionsmodulec                 Z  ^ U R                   R                  mTc  SmS[        R                  S[        4U4S jjnU R                   R
                  U R                   R
                  [        R                  " SU R                   R                  -  5      -  U R                   R
                  U R                   R                  S-  S.n[        U[        5      (       a  U" UR                  US   5        g [        U[        5      (       a-  U" UR                  US	   5        U" UR                  US
   5        g [        U[         5      (       a-  U" UR"                  US	   5        U" UR                  US
   5        g [        U[$        5      (       a  U" UR&                  US
   5        g [        U[(        5      (       a  U" UR*                  US
   5        g [        U[,        [.        [0        [2        45      (       a  U" UR4                  US   5        g [        U[        R6                  5      (       aO  [8        R:                  " UR<                  5        UR>                  b!  [8        R@                  " UR>                  5        g g [        U[B        5      (       a  URD                   H  nURF                  nURH                  U   S:w  a  [J        URH                  U      nU" UR                   US9u  pg[8        RL                  " [O        X S35      U5        [8        RL                  " [O        X S35      U5        M     g g )Nr   r  stdc                    > [         R                  " U R                  SUT* U-  TU-  S9  [        U [        R
                  5      (       a/  U R                  b!  [         R                  " U R                  5        g g g )NrE   )rS   r  ab)inittrunc_normal_weight
isinstancer   r   r   zeros_)r  r  cutoff_factors     rh   init_weight<ModernBertPreTrainedModel._init_weights.<locals>.init_weight  sg     .3&#% &")),,;;*KK, + -ry   r;   r   )inout	embedding	final_outr-  r+  r,  r.  rm   )r   	_inv_freq_original_inv_freq)(r   r<   r   Moduler   r:   mathsqrtr4   r1   r&  r   r   r   r   r   r   r   ModernBertPredictionHeaddenseModernBertForMaskedLMdecoder#ModernBertForSequenceClassificationModernBertForMultipleChoice ModernBertForTokenClassificationModernBertForQuestionAnswering
classifierr   r#  ones_r%  r   r'  r   rG   r   rl   r   copy_getattr)	rd   r  r)  stdsr   rope_init_fncurr_inv_freqr  r(  s	           @rh   _init_weights'ModernBertPreTrainedModel._init_weights  sq   == M	-		 	- 	- ++//;;00499S4;;C`C`=`3aa6600$6	
 f233--tK/@A..		4:.		4;/ 344T$Z0		4;/ 899d5k2 566U43+0.	
 
 ))4+<=--JJv}}%{{&FKK( ' 9::$00
%EE##J/9<#6v7G7G
7S#TL#/*#U 

76\+CDmT

76\9K+LM}] 1 ;ry   r^   N)r   r   r   r   r*   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr  r   _can_record_outputsr   no_gradr   r1  rC  r   r^   ry   rh   r  r    so    &*#/1IJN"& 0)
 ]]_:^BII :^ :^ry   r  c                      ^  \ rS rSrS\4U 4S jjrS rS r\\	\
    SS\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\\   S\4S jj5       5       5       rSrU =r$ )ModernBertModeli  r   c           	        > [         TU ]  U5        Xl        [        U5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l
        [
        R                  " UR                  UR                  UR                  S9U l        [!        US9U l        SU l        U R'                  5         g s  snf )Nr   )r   F)rb   r   r   r   
embeddingsr   
ModuleListr`   r4   r  layersr   r1   r=   r>   
final_normr   
rotary_embgradient_checkpointing	post_initr   s      rh   r   ModernBertModel.__init__  s     .v6mmHMfNfNfHghHg9#F6Hgh
 ,,v'9'9vU[UeUef36B&+# is   Cc                 .    U R                   R                  $ r   rQ  r   r   s    rh   get_input_embeddings$ModernBertModel.get_input_embeddings  s    ---ry   c                 $    XR                   l        g r   rZ  r   s     rh   set_input_embeddings$ModernBertModel.set_input_embeddings  s    ).&ry   Nr   r   position_idsr   re   r   c                    US L US L-  (       a  [        S5      eUb  UR                  S   OUR                  S   nUb  UR                  OUR                  nUc#  [        R                  " XgS9R                  S5      nU R                  XS9n[        U=n	[        5      (       d'  U R                  UUS.n
[        S
0 U
D6[        S
0 U
D6S.n	0 n[        U R                  R                  5       H  nU R                  XU5      X'   M     U R                   H'  nU" U4XR                      XR                      S.UD6nM)     U R#                  U5      n[%        US	9$ )Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )r   r   )r   r   r   rH   )r   r   )last_hidden_stater^   )r   r   r   r   aranger   rQ  r&  r   r   r   r   setrG   rU  rS  r  rT  r   )rd   r   r   r`  r   re   r   r   r   attention_mask_mappingmask_kwargsr   r   encoder_layers                 rh   r   ModernBertModel.forward  sg    -t";<YZZ,9,E-%%a(9??[\K]%.%:!!@T@T <<?II!LL)YNB0DII++!."0K #<"Jk"J%M%\P[%\&"
 !dkk556J.2oom[e.f+ 7 "[[M)56R6RS$78T8T$U 	M ) 6??ry   )r   rQ  rT  rV  rS  rU  r   )r   r   r   r   r*   r   r[  r^  r"   r#   r   r   r   r   r   r   r   r   r   r   r   s   @rh   rO  rO    s    
/ 
./   .2.204-1,@##d*,@ t+,@ &&-	,@
 ||d*,@ +,,@ 
,@    ,@ry   rO  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )r4  i  r   c                 F  > [         TU ]  5         Xl        [        R                  " UR
                  UR
                  UR                  5      U l        [        UR                     U l
        [        R                  " UR
                  UR                  UR                  S9U l        g )Nr   )rb   r   r   r   r   r1   rV   r5  r   rW   r   r   r=   r>   r   r   s     rh   r   !ModernBertPredictionHead.__init__  so    YYv1163E3EvG]G]^
&667LL!3!3vO_O_`	ry   r   r   c                 `    U R                  U R                  U R                  U5      5      5      $ r   )r   r   r5  )rd   r   s     rh   r    ModernBertPredictionHead.forward!  s#    yy$**]";<==ry   )r   r   r5  r   )r   r   r   r   r*   r   r   r   r   r   r   r   s   @rh   r4  r4    s2    a/ a>U\\ >ell > >ry   r4  zd
    The ModernBert Model with a decoder head on top that is used for masked language modeling.
    )custom_introc                   P  ^  \ rS rSrSS0rS\4U 4S jjrS rS\R                  4S jr
\\     SS
\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\\   S\\R                     \-  4S jj5       5       rSrU =r$ )r6  i%  zdecoder.weightz&model.embeddings.tok_embeddings.weightr   c                 n  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        R                  " UR                  UR                  UR                  S9U l        U R                  R                  U l        U R                  R                  U l        U R                  5         g )Nr   )rb   r   r   rO  r  r4  headr   r   r1   r0   rQ   r7  rY   rZ   rW  r   s     rh   r   ModernBertForMaskedLM.__init__-  s     $V,
,V4	yy!3!3V5F5FVM`M`a!%!>!>(,(L(L% 	ry   c                     U R                   $ r   r7  r   s    rh   get_output_embeddings+ModernBertForMaskedLM.get_output_embeddings:  s    ||ry   new_embeddingsc                     Xl         g r   ru  )rd   rx  s     rh   set_output_embeddings+ModernBertForMaskedLM.set_output_embeddings=  s    %ry   Nr   r   r`  r   labelsre   r   c                    U R                   " SUUUUS.UD6nUS   nU R                  (       aI  UbF  UR                  S5      nUR                  UR                  S   S5      nXPR                  :g  n	X   nXY   nU R                  U R                  U5      5      n
S nUb)  U R                  " X4SU R                  R                  0UD6n[        UU
UR                  UR                  S9$ )Nr   r   r`  r   r   r   r0   losslogitsr   r  r^   )r  rY   r   r   rZ   r7  rr  loss_functionr   r0   r   r   r  )rd   r   r   r`  r   r|  re   outputsrc  mask_tokensr  r  s               rh   r   ModernBertForMaskedLM.forward@  s     ** 
)%'	

 
 $AJ!!f&8[[_F 1 6 6v||A K !$A$AAK 1 >(Fdii(9:;%%fbAWAWb[abD!//))	
 	
ry   )r   r7  rr  r  rZ   rY   NNNNN)r   r   r   r   _tied_weights_keysr*   r   rv  r   r   rz  r!   r   r   r   r   r   r   r   r   r   r   r   r   s   @rh   r6  r6  %  s     +,TU/ &BII &  .2.2,0-1&*'
##d*'
 t+'
 llT)	'

 ||d*'
 t#'
 +,'
 
u||	~	-'
  '
ry   r6  z`
    The ModernBert Model with a sequence classification head on top that performs pooling.
    c                      ^  \ rS rSrS\4U 4S jjr\\     SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\\   S\\R                     \-  4S jj5       5       rSrU =r$ )r8  il  r   c                 n  > [         TU ]  U5        UR                  U l        Xl        [	        U5      U l        [        U5      U l        [        R                  R                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        U R!                  5         g r   )rb   r   
num_labelsr   rO  r  r4  rr  r   r   r   rU   r   r   r1   r<  rW  r   s     rh   r   ,ModernBertForSequenceClassification.__init__r  s      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJ 	ry   Nr   r   r`  r   r|  re   r   c                 b   U R                   " SUUUUS.UD6nUS   nU R                  R                  S:X  a
  USS2S4   nOU R                  R                  S:X  ao  Uc;  [        R                  " UR
                  SS UR                  [        R                  S9nXR                  S5      -  R                  S	S
9UR                  S	SS9-  nU R                  U5      n	U R                  U	5      n	U R                  U	5      n
SnUGb  U R                  R                  c  U R                  S	:X  a  SU R                  l        OoU R                  S	:  aN  UR                  [        R                   :X  d  UR                  [        R"                  :X  a  SU R                  l        OSU R                  l        U R                  R                  S:X  aI  [%        5       nU R                  S	:X  a&  U" U
R'                  5       UR'                  5       5      nOU" X5      nOU R                  R                  S:X  a=  [)        5       nU" U
R+                  SU R                  5      UR+                  S5      5      nO,U R                  R                  S:X  a  [-        5       nU" X5      n[/        UU
UR0                  UR2                  S9$ )ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
r~  r   rR   NrS   r$   )r   r   r   r   r   Tr   keepdim
regressionsingle_label_classificationmulti_label_classificationr  r^   )r  r   rT   r   onesr   r   ra   r   sumrr  r   r<  problem_typer  r   longr   r
   squeezer	   r   r   r   r   r  )rd   r   r   r`  r   r|  re   r  rc  pooled_outputr  r  loss_fcts                rh   r   +ModernBertForSequenceClassification.forward  s]   " ** 
)%'	

 
 $AJ;;))U2 1!Q$ 7[[++v5%!&%++BQ/8I8P8PX]XbXb" "35M5Mb5Q!Q V V[\ V ]`n`r`rt as a ! 		"34		-0/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
ry   )r<  r   r   rr  r  r  r  )r   r   r   r   r*   r   r!   r   r   r   r   r   r   r   r   r   r   r   r   s   @rh   r8  r8  l  s    /   .2.2,0-1&*C
##d*C
 t+C
 llT)	C

 ||d*C
 t#C
 +,C
 
u||	7	7C
  C
ry   r8  zv
    The ModernBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.
    c                      ^  \ rS rSrS\4U 4S jjr\\     SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\\   S\\R                     \-  4S jj5       5       rSrU =r$ )r:  i  r   c                 b  > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        [        R                  R                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        U R                  5         g r   rb   r   r  rO  r  r4  rr  r   r   r   rU   r   r   r1   r<  rW  r   s     rh   r   )ModernBertForTokenClassification.__init__  s{      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJ 	ry   Nr   r   r`  r   r|  re   r   c                 b   U R                   " SUUUUS.UD6nUS   nU R                  U5      nU R                  U5      nU R                  U5      n	Sn
Ub<  [	        5       nU" U	R                  SU R                  5      UR                  S5      5      n
[        U
U	UR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
r~  r   Nr   r  r^   )
r  rr  r   r<  r	   r   r  r   r   r  )rd   r   r   r`  r   r|  re   r  rc  r  r  r  s               rh   r   (ModernBertForTokenClassification.forward  s     ** 
)%'	

 
 $AJ II&78 II&78!23')HFKKDOO<fkk"oND$!//))	
 	
ry   r<  r   rr  r  r  r  )r   r   r   r   r*   r   r!   r   r   r   r   r   r   r   r   r   r   r   r   s   @rh   r:  r:    s    
/ 
  .2.2,0-1&*$
##d*$
 t+$
 llT)	$

 ||d*$
 t#$
 +,$
 
u||	4	4$
  $
ry   r:  c                      ^  \ rS rSrS\4U 4S jjr\\     SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\
\   S\\R                     \-  4S jj5       5       rSrU =r$ )r;  i  r   c                 b  > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        [        R                  R                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        U R                  5         g r   r  r   s     rh   r   'ModernBertForQuestionAnswering.__init__  sy      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJry   Nr   r   r`  start_positionsend_positionsre   r   c                    U R                   " U4UUS.UD6nUS   nU R                  U5      nU R                  U5      nU R                  U5      n	U	R	                  SSS9u  pU
R                  S5      R                  5       n
UR                  S5      R                  5       nS nUb  Ub  U R                  " XXE40 UD6n[        UU
UUR                  UR                  S9$ )N)r   r`  r   r   r   r   )r  start_logits
end_logitsr   r  )r  rr  r   r<  splitr  r  r  r   r   r  )rd   r   r   r`  r  r  re   r  rc  r  r  r  r  s                rh   r   &ModernBertForQuestionAnswering.forward  s     **
)%
 	
 $AJ II&78 II&78!23#)<<r<#: #++B/::<''+668
&=+D%%libhiD+%!!//))
 	
ry   r  r  )r   r   r   r   r*   r   r!   r   r   r   r   r   r   r   r   r   r   r   s   @rh   r;  r;    s    	/ 	  *..2,0/3-1#
<<$&#
 t+#
 llT)	#

 ,#
 ||d*#
 +,#
 
u||	;	;#
  #
ry   r;  z
    The ModernBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
    c                      ^  \ rS rSrS\4U 4S jjr\\     SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\\   S\\R                     \-  4S jj5       5       rSrU =r$ )r9  i7  r   c                 8  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        R                  R                  UR                  5      U l        [        R                  " UR                  S5      U l        U R                  5         g )Nr   )rb   r   r   rO  r  r4  rr  r   r   r   rU   r   r   r1   r<  rW  r   s     rh   r   $ModernBertForMultipleChoice.__init__=  sm     $V,
,V4	HH$$V%>%>?	))F$6$6: 	ry   Nr   r   r`  r   r|  re   r   c                    Ub  UR                   S   OUR                   S   nUb!  UR                  SUR                  S5      5      OSnUb!  UR                  SUR                  S5      5      OSnUb!  UR                  SUR                  S5      5      OSnUb1  UR                  SUR                  S5      UR                  S5      5      OSnU R                  " SUUUUS.UD6nUS   n	U R                  R
                  S:X  a  [        R                  " U	R                   S   U	R                  S9n
Ub)  UR                  SS	9R                  U	R                  5      nO.[        R                  " S[        R                  U	R                  S
9nXU4   n	OMU R                  R
                  S:X  a3  UR                  SSS9nXR                  S5      -  R                  SS	9U-  n	U R                  U	5      nU R!                  U5      nU R#                  U5      nUR                  SU5      nSnUb  [$        R&                  " 5       nU" X5      n[)        UUUR*                  UR,                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors.
Nr   r   r~  r   rR   rb  r   )r   r   rS   Tr  r  r^   )r   r   sizer  r   rT   r   rd  r   argmaxr   tensorr  r  r   rr  r   r<  r   r	   r   r   r  )rd   r   r   r`  r   r|  re   num_choicesr  rc  	indices_0cls_masknum_non_pad_tokensr  r  reshaped_logitsr  r  s                     rh   r   #ModernBertForMultipleChoice.forwardI  sf     -6,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ** 
)%'	

 
 $AJ ;;))U2%6%<%<Q%?HYH`H`aI))00R08;;<M<T<TU !<<DUD\D\] 1X2E F [[++v5!/!3!34!3!H!25M5Mb5Q!Q V V[\ V ]`r r		"34		-0/ ++b+6**,HO4D("!//))	
 	
ry   )r<  r   r   rr  r  r  )r   r   r   r   r*   r   r!   r   r   r   r   r   r   r   r   r   r   r   r   s   @rh   r9  r9  7  s    
/ 
  .2.2,0-1&*C
##d*C
 t+C
 llT)	C

 ||d*C
 t#C
 +,C
 
u||	8	8C
  C
ry   r9  )r*   rO  r  r6  r8  r:  r;  r9  )r   )Mr2  collections.abcr   typingr   r   r   huggingface_hub.dataclassesr   r   torch.nnr   r	   r
    r   r#  activationsr   configuration_utilsr   integrationsr   r   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r    utils.genericr!   r"   utils.output_capturingr#   align.modeling_alignr%   gemma3.modeling_gemma3r&   r'   
get_loggerr   loggerr*   r1  r   r   r   r   r   r  r  rO  r4  r6  r8  r:  r;  r9  __all__r^   ry   rh   <module>r     s5     $ $  .  A A & ! 3 I ` 9  7 F & @ @ I 5 : G 
		H	% 89G)' G)  :G)T299 ,:BII :(\ 5 \ *+B ,B4 )*N)")) N) +N)b7 @ J^ J^ J^Z B@/ B@ B@J	>ryy 	> 
?
5 ?

?
D 
S
*C S

S
l 
3
'@ 3

3
l 1
%> 1
 1
h 
R
"; R

R
j	ry   