from huggingface_hub.dataclasses import strict

from ...configuration_utils import PreTrainedConfig
from ...modeling_rope_utils import RopeParameters
from ...utils import auto_docstring, logging


logger = logging.get_logger(__name__)


@auto_docstring(checkpoint="meta-llama/Llama-4-Scout-17B-16E")
@strict
class Llama4VisionConfig(PreTrainedConfig):
    r"""
    vision_output_dim (`int`, *optional*, defaults to 7680):
        Dimensionality of the vision model output. Includes output of transformer
        encoder with intermediate layers and global transformer encoder.
    pixel_shuffle_ratio (`float`, *optional*, defaults to 0.5):
        Pixel-shuffle ratio for downsampling patch tokens. Smaller values produce fewer tokens (more downsampling).
    projector_input_dim (`int`, *optional*, defaults to 4096):
        Width of the vision adapter MLP before pixel shuffle. Larger value increases capacity and compute.
    projector_output_dim (`int`, *optional*, defaults to 4096):
        Output width of the vision adapter. Larger value yields higher-dimensional image features.
    projector_dropout (`float`, *optional*, defaults to 0.0):
        Dropout rate inside the vision adapter MLP. Higher value adds more regularization.
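
    Example:

    ```python
    >>> from transformers import Llama4VisionConfig

    >>> # Initializing a Llama4 vision configuration with the default values
    >>> # (an illustrative sketch mirroring the `Llama4Config` example below)
    >>> configuration = Llama4VisionConfig()

    >>> # Inspecting one of the fields documented above
    >>> configuration.vision_output_dim
    7680
    ```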
    """

    base_model_tp_plan = {
        "model.layers.*.self_attn.q_proj": "colwise",
        "model.layers.*.self_attn.k_proj": "colwise",
        "model.layers.*.self_attn.v_proj": "colwise",
        "model.layers.*.self_attn.o_proj": "rowwise",
        "vision_adapter.mlp.fc1": "colwise",
        "vision_adapter.mlp.fc2": "rowwise",
        "patch_embedding.linear": "colwise_gather_output",
    }
    model_type = "llama4_vision_model"
    base_config_key = "vision_config"

    hidden_size: int = 768
    hidden_act: str = "gelu"
    num_hidden_layers: int = 34
    num_attention_heads: int = 16
    num_channels: int = 3
    intermediate_size: int = 5632
    vision_output_dim: int = 7680
    image_size: int | list[int] | tuple[int, int] = 448
    patch_size: int | list[int] | tuple[int, int] = 14
    norm_eps: float = 1e-5
    vision_feature_select_strategy: str = "default"
    initializer_range: float = 0.02
    pixel_shuffle_ratio: float = 0.5
    projector_input_dim: int = 4096
    projector_output_dim: int = 4096
    multi_modal_projector_bias: bool = False
    projector_dropout: float = 0.0
    attention_dropout: float = 0.0
    rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None


@auto_docstring(checkpoint="meta-llama/Llama-4-Scout-17B-16E")
@strict
class Llama4TextConfig(PreTrainedConfig):
    r"""
    intermediate_size_mlp (`int`, *optional*, defaults to 16384):
        Intermediate size of dense MLP layers. Larger value increases FFN capacity and compute.
    moe_layers (`list[int]`, *optional*):
        List of layer indices that use MoE. Overrides `interleave_moe_layer_step` when set.
    interleave_moe_layer_step (`int`, *optional*, defaults to 1):
        Spacing between MoE layers when `moe_layers` is `None`. Larger value means fewer MoE layers.
    use_qk_norm (`bool`, *optional*, defaults to `True`):
        Whether to L2-normalize queries/keys on RoPE layers. Can stabilize attention when enabled.
    no_rope_layers (`list[int]`, *optional*):
        List with at least the same length as the number of layers in the model.
        A `1` at an index position indicates that the corresponding layer will use RoPE,
        while a `0` indicates that it's a NoPE layer.
    no_rope_layer_interval (`int`, *optional*, defaults to 4):
        If `no_rope_layers` is `None`, it will be created using a NoPE layer every
        `no_rope_layer_interval` layers.
    attention_chunk_size (`int`, *optional*, defaults to 8192):
        Chunk size for the attention computation. Smaller value enforces more local attention and lowers memory.
    attn_temperature_tuning (`bool`, *optional*, defaults to `True`):
        Whether to dynamically scale the attention temperature for each query token based on sequence length.
        Recommended for long sequences (e.g., >32k tokens) to maintain stable output results.
    floor_scale (`int`, *optional*, defaults to 8192):
        Base scale (in tokens) for attention temperature tuning. Larger value delays scaling to longer positions.
    attn_scale (`float`, *optional*, defaults to 0.1):
        Strength of attention temperature tuning. Larger value increases scaling at long positions.

    Example:
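
    The snippet below is an illustrative sketch that mirrors the `Llama4Config`
    example further down; `Llama4TextModel` and `Llama4TextConfig` are the public
    `transformers` exports for the text-only backbone.

    ```python
    >>> from transformers import Llama4TextModel, Llama4TextConfig

    >>> # Initializing a Llama4 text configuration
    >>> configuration = Llama4TextConfig()

    >>> # Initializing a model from the configuration
    >>> model = Llama4TextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```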
    """

    model_type = "llama4_text"
    keys_to_ignore_at_inference = ["past_key_values"]
    default_theta = 500000.0

    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.feed_forward.shared_expert.gate_proj": "colwise",
        "layers.*.feed_forward.shared_expert.up_proj": "colwise",
        "layers.*.feed_forward.shared_expert.down_proj": "rowwise",
        "layers.*.feed_forward.experts.gate_up_proj": "packed_rowwise",
        "layers.*.feed_forward.experts.down_proj": "colwise",
        "layers.*.feed_forward.gate_proj": "colwise",
        "layers.*.feed_forward.up_proj": "colwise",
        "layers.*.feed_forward.down_proj": "rowwise",
    }
    base_model_ep_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.feed_forward.shared_expert.gate_proj": "colwise",
        "layers.*.feed_forward.shared_expert.up_proj": "colwise",
        "layers.*.feed_forward.shared_expert.down_proj": "rowwise",
        "layers.*.feed_forward.experts.gate_up_proj": "grouped_gemm",
        "layers.*.feed_forward.experts.down_proj": "grouped_gemm",
        "layers.*.feed_forward.router": "ep_router",
    }

    vocab_size: int = 202048
    hidden_size: int = 5120
    intermediate_size: int = 8192
    intermediate_size_mlp: int = 16384
    num_hidden_layers: int = 48
    num_attention_heads: int = 40
    num_key_value_heads: int | None = 8
    head_dim: int | None = 128
    hidden_act: str = "silu"
    max_position_embeddings: int = 131072
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-5
    use_cache: bool = True
    pad_token_id: int | None = None
    bos_token_id: int | None = 1
    eos_token_id: int | list[int] | None = 2
    tie_word_embeddings: bool = False
    attention_dropout: float = 0.0
    num_experts_per_tok: int = 1
    num_local_experts: int = 16
    moe_layers: list[int] | None = None
    interleave_moe_layer_step: int = 1
    use_qk_norm: bool = True
    output_router_logits: bool = False
    router_aux_loss_coef: float = 0.001
    router_jitter_noise: float = 0.0
    rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None
    no_rope_layers: list[int] | None = None
    no_rope_layer_interval: int = 4
    attention_chunk_size: int = 8192
    layer_types: list[str] | None = None
    attn_temperature_tuning: bool = True
    floor_scale: int = 8192
    attn_scale: float = 0.1
    attention_bias: bool = False

    def __post_init__(self, **kwargs):
        # Fall back to plain multi-head attention when no KV-head count is given.
        if self.num_key_value_heads is None:
            self.num_key_value_heads = self.num_attention_heads
        # A 1 marks a RoPE layer, a 0 a NoPE layer; by default every
        # `no_rope_layer_interval`-th layer is a NoPE layer.
        default_no_rope_layers = [
            int((layer_idx + 1) % self.no_rope_layer_interval != 0)
            for layer_idx in range(self.num_hidden_layers)
        ]
        self.no_rope_layers = self.no_rope_layers if self.no_rope_layers else default_no_rope_layers
        self.head_dim = self.head_dim if self.head_dim is not None else self.hidden_size // self.num_attention_heads
        # An explicit `moe_layers` list overrides the interleaving step.
        self.moe_layers = (
            self.moe_layers
            if self.moe_layers is not None
            else list(
                range(self.interleave_moe_layer_step - 1, self.num_hidden_layers, self.interleave_moe_layer_step)
            )
        )
        # RoPE layers attend within chunks; NoPE layers attend over the full sequence.
        if self.layer_types is None:
            self.layer_types = [
                "chunked_attention" if no_rope else "full_attention" for no_rope in self.no_rope_layers
            ]
        super().__post_init__(**kwargs)


@auto_docstring(checkpoint="meta-llama/Llama-4-Scout-17B-16E")
@strict
class Llama4Config(PreTrainedConfig):
    r"""
    boi_token_index (`int`, *optional*, defaults to 200080):
        The begin-of-image token index to wrap the image prompt.
    eoi_token_index (`int`, *optional*, defaults to 200081):
        The end-of-image token index to wrap the image prompt.

    Example:

    ```python
    >>> from transformers import Llama4Model, Llama4Config

    >>> # Initializing a Llama4 style configuration
    >>> configuration = Llama4Config()

    >>> # Initializing a model from the Llama4 style configuration
    >>> model = Llama4Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "llama4"
    attribute_map = {
        "image_token_id": "image_token_index",
        "boi_token_id": "boi_token_index",
        "eoi_token_id": "eoi_token_index",
    }
    sub_configs = {"text_config": Llama4TextConfig, "vision_config": Llama4VisionConfig}
    base_model_tp_plan = {
        "multi_modal_projector.linear_1": "colwise_rep",
    }

    text_config: Llama4TextConfig | dict | None = None
    vision_config: Llama4VisionConfig | dict | None = None
    boi_token_index: int = 200080
    eoi_token_index: int = 200081
    image_token_index: int = 200092
    tie_word_embeddings: bool = False

    def __post_init__(self, **kwargs):
        if self.vision_config is None:
            self.vision_config = Llama4VisionConfig()
            logger.info("vision_config is None, using default llama4 vision config")
        elif isinstance(self.vision_config, dict):
            self.vision_config = Llama4VisionConfig(**self.vision_config)

        if self.text_config is None:
            self.text_config = Llama4TextConfig()
            logger.info("text_config is None, using default llama4 text config")
        elif isinstance(self.text_config, dict):
            self.text_config = Llama4TextConfig(**self.text_config)

        super().__post_init__(**kwargs)


__all__ = ["Llama4Config", "Llama4TextConfig", "Llama4VisionConfig"]