
    Z j                     l    S SK Jr  S SKJr  SSKJr  SSKJr  \" SS9\ " S S	\5      5       5       rS	/r	g
)    )Literal)strict   )PreTrainedConfig)auto_docstringzanswerdotai/ModernBERT-base)
checkpointc                     ^  \ rS rSr% SrSrS/rSSS.rSr\	\
S	'   S
r\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\\
S'   Sr\	\
S'   Sr\\
S'   Sr\\
S'   Sr\\
S'   Sr\\
S'   Sr\	S-  \
S '   S!r\	\\	   -  S-  \
S"'   S#r\	S-  \
S$'   S#r\	S-  \
S%'   S!r\	S-  \
S&'   Sr\\
S''   S(r\\	-  \
S)'   Sr \\   S-  \
S*'   Sr!\"\#S+   \"4   S-  \
S,'   S-r$\	\
S.'   S(r%\\	-  \
S/'   Sr&\\
S0'   S(r'\\	-  \
S1'   S2r(\\
S3'   S4r)\#S5   \
S6'   S(r*\\	-  \
S7'   Sr+\\
S8'   Sr,\\
S9'   Sr-\\
S:'   Sr.\\
S;'   S<r/\	\
S='   S2r0\\
S>'   U 4S? jr1S@ r2U 4SA jr3\4SB 5       r5\5Rl                  SC 5       r5SDr7U =r8$ )EModernBertConfig   a  
initializer_cutoff_factor (`float`, *optional*, defaults to 2.0):
    The cutoff factor for the truncated_normal_initializer for initializing all weight matrices.
norm_eps (`float`, *optional*, defaults to 1e-05):
    The epsilon used by the rms normalization layers.
norm_bias (`bool`, *optional*, defaults to `False`):
    Whether to use bias in the normalization layers.
local_attention (`int`, *optional*, defaults to 128):
    The window size for local attention.
mlp_dropout (`float`, *optional*, defaults to 0.0):
    The dropout ratio for the MLP layers.
decoder_bias (`bool`, *optional*, defaults to `True`):
    Whether to use bias in the decoder layers.
classifier_pooling (`str`, *optional*, defaults to `"cls"`):
    The pooling method for the classifier. Should be either `"cls"` or `"mean"`. In local attention layers, the
    CLS token doesn't attend to all tokens on long sequences.
classifier_bias (`bool`, *optional*, defaults to `False`):
    Whether to use bias in the classifier.
classifier_activation (`str`, *optional*, defaults to `"gelu"`):
    The activation function for the classifier.
deterministic_flash_attn (`bool`, *optional*, defaults to `False`):
    Whether to use deterministic flash attention. If `False`, inference will be faster but not deterministic.
sparse_prediction (`bool`, *optional*, defaults to `False`):
    Whether to use sparse prediction for the masked language model instead of returning the full dense logits.
sparse_pred_ignore_index (`int`, *optional*, defaults to -100):
    The index to ignore for the sparse prediction.

Examples:

```python
>>> from transformers import ModernBertModel, ModernBertConfig

>>> # Initializing a ModernBert style configuration
>>> configuration = ModernBertConfig()

>>> # Initializing a model from the modernbert-base style configuration
>>> model = ModernBertModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```
modernbertpast_key_valuesg     Ag     @)globallocali  
vocab_sizei   hidden_sizei  intermediate_size   num_hidden_layers   num_attention_headsgeluhidden_activationi    max_position_embeddingsg{Gz?initializer_rangeg       @initializer_cutoff_factorgh㈵>norm_epsF	norm_biasik  Npad_token_idij  eos_token_idii  bos_token_idcls_token_idsep_token_idattention_biasg        attention_dropoutlayer_types)full_attentionsliding_attentionrope_parameters   local_attentionembedding_dropoutmlp_biasmlp_dropoutTdecoder_biascls)r/   meanclassifier_poolingclassifier_dropoutclassifier_biasclassifier_activationdeterministic_flash_attnsparse_predictionisparse_pred_ignore_indextie_word_embeddingsc                    > UR                  SS5      nU R                  c?  [        U R                  5       Vs/ s H  n[	        X2-  5      (       a  SOSPM     snU l        [
        TU ]  " S0 UD6  g s  snf )Nglobal_attn_every_n_layersr   r'   r&    )getr%   ranger   boolsuper__post_init__)selfkwargsr:   i	__class__s       ڈ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/modernbert/configuration_modernbert.pyr@   ModernBertConfig.__post_init__q   sy    %+ZZ0La%P"# t556 6A (,A,J'K'K#Qaa6 D
 	'' s    A1c                    UR                  SS 5      nSS0SS0S.nU R                  b  U R                  OUU l        Ub<  U R                  S   R                  U5        U R                  S   R                  U5        U R                  R                  S5      c  SS0U R                  S'   U R                  S   R	                  SUR                  SU R
                  S	   5      5        U R                  R                  S5      c  SS0U R                  S'   U R                  S   R	                  SUR                  S
U R
                  S   5      5        U R                  5         U$ )Nrope_scaling	rope_typedefault)r'   r&   r&   r'   
rope_thetaglobal_rope_thetar   local_rope_thetar   )popr(   updater<   
setdefaultdefault_thetastandardize_rope_params)rA   rB   rH   default_rope_paramss       rE   convert_rope_params_to_dict,ModernBertConfig.convert_rope_params_to_dict|   sc   zz.$7
 #.y!9*I6
 8<7K7K7Wt33]p#  !1299,G  !45<<\J ##$45=6A95MD  !12-.99&**%8$:L:LX:VW	
 ##$78@9Di8PD  !4501<<&**%79K9KG9TU	

 	$$&    c                 H   > [         TU ]  5       nUR                  SS 5        U$ )Nreference_compile)r?   to_dictrN   )rA   outputrD   s     rE   rY   ModernBertConfig.to_dict   s#    "

&-rV   c                      U R                   S-  $ )zKHalf-window size: `local_attention` is the total window, so we divide by 2.   r*   )rA   s    rE   sliding_windowModernBertConfig.sliding_window   s     ##q((rV   c                     US-  U l         g)z<Set sliding_window by updating local_attention to 2 * value.r]   Nr^   )rA   values     rE   r_   r`      s      %qyrV   )r%   r*   r(   )9__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencerQ   r   int__annotations__r   r   r   r   r   strr   r   floatr   r   r   r>   r   r   listr    r!   r"   r#   r$   r%   r(   dictr   r*   r+   r,   r-   r.   r1   r2   r3   r4   r5   r6   r7   r8   r@   rT   rY   propertyr_   setter__static_attributes____classcell__)rD   s   @rE   r
   r
      s   (T J#4"5(8<MJK!s!s!!#s##'S'#u#'*u*HeIt$L#*$+0L#S	/D(0$L#*$$L#*$$L#*$ ND %(us{($(KcT!(Y]OT'"GH$NORVV]OS%(us{(Hd"K"L$16.6&))!OT!!'3'%*d*#t#$(c( $$	(<
 ) ) ) )rV   r
   N)
typingr   huggingface_hub.dataclassesr   configuration_utilsr   utilsr   r
   __all__r;   rV   rE   <module>ry      sK   ,  . 3 # 89G)' G)  :G)T 
rV   