
    Z j                        S SK r S SKJr  S SKJr  S SKJr  S SKrS SKJ	r	  S SK
J	s  Jr  S SKJr  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJrJr  SSKJr  SSK J!r!  SSK"J#r#J$r$J%r%J&r&J'r'  SSK(J)r)J*r*  SSK+J,r,J-r-  SSK.J/r/  SSK0J1r1J2r2J3r3J4r4J5r5  SSK6J7r7J8r8  SSK9J:r:  SSK;J<r<J=r=  \4R|                  " \?5      r@ " S S\	R                  5      rB " S S\	R                  5      rC " S S\R                  R                  5      rD " S S\	R                  5      rE " S  S!\	R                  5      rG\" S"5       " S# S"\	R                  5      5       rH " S$ S%\	R                  5      rIS&\R                  S'\R                  S(\R                  S)\K\R                  \R                  4   4S* jrLS+\R                  S,\MS)\R                  4S- jrN S`S.\	R                  S/\R                  S0\R                  S1\R                  S2\R                  S-  S3\OS4\O4S5 jjrP S`S.\	R                  S/\R                  S0\R                  S1\R                  S2\R                  S-  S3\OS4\O4S6 jjrQ " S7 S8\	R                  5      rR " S9 S:\!5      rS\2 " S; S<\-5      5       rT\2 " S= S>\T5      5       rU " S? S@\T\5      rV\2" SASB9\ " SC SD\'5      5       5       rW " SE SF\R                  R                  5      rX " SG SH\	R                  5      rYSI rZ " SJ SK\	R                  5      r[SL\R                  S/\R                  4SM jr\S/\R                  S0\R                  SL\R                  S)\K\R                  \R                  4   4SN jr] " SO SP\	R                  5      r^ " SQ SR\	R                  5      r_ " SS ST\!5      r` " SU SV\	R                  5      ra " SW SX\	R                  5      rb " SY SZ\	R                  5      rc " S[ S\\T5      rd " S] S^\T\5      re/ S_Qrfg)a    N)Callable)	dataclass)Optional)Llama4VisionConfig   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_maskcreate_chunked_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check)maybe_autocastmerge_with_config_defaults)capture_outputs   )Llama4ConfigLlama4TextConfigc                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Llama4TextExperts8   configc                   > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        U R                  U l        [        R                  " [        R                  " U R                  U R
                  SU R                  -  5      5      U l        [        R                  " [        R                  " U R                  U R                  U R
                  45      5      U l        [        UR                     U l        g N   )super__init__num_local_expertsnum_expertsintermediate_sizehidden_size
expert_dimnn	Parametertorchzerosgate_up_projempty	down_projr	   
hidden_actact_fnselfr*   	__class__s     {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/llama4/modeling_llama4.pyr/   Llama4TextExperts.__init__9   s    !33!'!9!9!--00LLT5E5EtGWGWYZ]a]l]lYl)mnekk43C3CT__VZVfVf2g&hiV../    hidden_statesreturnc                 n   UR                  U R                  R                  S   SU R                  5      n[        R
                  " XR                  5      nUR                  SSS9u  p4[        R
                  " X@R                  U5      -  U R                  5      nUR                  SU R                  5      nU$ )a  
This should really not be run on a single machine, as we are reaching compute bound:
- the inputs are expected to be "sorted" per expert already.
- the weights are viewed with another dim, to match num_expert, 1, shape * num_tokens, shape

Args:
    hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
    selected_experts (torch.Tensor): (batch_size * token_num, top_k)
    routing_weights (torch.Tensor): (batch_size * token_num, top_k)
Returns:
    torch.Tensor
r   r-   dim)	viewr9   shaper3   r7   bmmchunkr=   r;   )r?   rD   gate_upgateupnext_statess         rA   forwardLlama4TextExperts.forwardC   s     &**4+<+<+B+B1+Er4K[K[\))M+<+<====+iikk$&7!7$..I!&&r4+;+;<rC   )r=   r;   r4   r9   r3   r2   r1   )__name__
__module____qualname____firstlineno__r&   r/   r7   TensorrR   __static_attributes____classcell__r@   s   @rA   r(   r(   8   s0    0/ 0U\\ ell  rC   r(   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )Llama4TextMLPY   c                 X  > [         TU ]  5         Uc  UR                  nXl        [        R
                  " UR                  USS9U l        [        R
                  " UR                  USS9U l        [        R
                  " X!R                  SS9U l	        [        UR                     U l        g NFbias)r.   r/   r2   r*   r5   Linearr3   	gate_projup_projr;   r	   r<   activation_fn)r?   r*   r2   r@   s      rA   r/   Llama4TextMLP.__init__Z   s    $ & 8 86#5#57HuUyy!3!35FUS#46H6HuU#F$5$56rC   c                     U R                  U R                  U5      5      U R                  U5      -  nU R                  U5      $ N)rf   rd   re   r;   )r?   xr;   s      rA   rR   Llama4TextMLP.forwardf   s7    &&t~~a'89DLLOK	~~i((rC   )rf   r*   r;   rd   re   ri   rT   rU   rV   rW   r/   rR   rY   rZ   r[   s   @rA   r]   r]   Y   s    
7) )rC   r]   c                   F   ^  \ rS rSrSS\4U 4S jjjrS rS rS rSr	U =r
$ )	Llama4TextL2Normk   epsc                 .   > [         TU ]  5         Xl        g ri   )r.   r/   rp   )r?   rp   r@   s     rA   r/   Llama4TextL2Norm.__init__l   s    rC   c                     U[         R                  " UR                  S5      R                  SSS9U R                  -   5      -  $ Nr-   rG   T)keepdimr7   rsqrtpowmeanrp   r?   rj   s     rA   _normLlama4TextL2Norm._normp   4    5;;quuQx}}R}>IJJJrC   c                 ^    U R                  UR                  5       5      R                  U5      $ ri   )r{   floattype_asrz   s     rA   rR   Llama4TextL2Norm.forwards   s"    zz!'')$,,Q//rC   c                      SU R                    3$ )Nzeps=rp   r?   s    rA   
extra_reprLlama4TextL2Norm.extra_reprv   s    dhhZ  rC   r   )gư>)rT   rU   rV   rW   r   r/   r{   rR   r   rY   rZ   r[   s   @rA   rn   rn   k   s)    E  K0! !rC   rn   c                   >   ^  \ rS rSrSU 4S jjrS rS rS rSrU =r	$ )Llama4TextRMSNormz   c                    > [         TU ]  5         X l        [        R                  " [
        R                  " U5      5      U l        g)z,
Llama4RMSNorm is equivalent to T5LayerNorm
N)r.   r/   rp   r5   r6   r7   onesweight)r?   r3   rp   r@   s      rA   r/   Llama4TextRMSNorm.__init__{   s.     	ll5::k#:;rC   c                     U[         R                  " UR                  S5      R                  SSS9U R                  -   5      -  $ rt   rv   rz   s     rA   r{   Llama4TextRMSNorm._norm   r}   rC   c                 z    U R                  UR                  5       5      R                  U5      nX R                  -  $ ri   )r{   r   r   r   )r?   rj   outputs      rA   rR   Llama4TextRMSNorm.forward   s.    AGGI&..q1##rC   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler   rK   rp   r   s    rA   r   Llama4TextRMSNorm.extra_repr   s'    ))*+6$((<<rC   )rp   r   )gh㈵>)
rT   rU   rV   rW   r/   r{   rR   r   rY   rZ   r[   s   @rA   r   r   z   s    <K$= =rC   r   c                   4   ^  \ rS rSrU 4S jrU 4S jrSrU =r$ )Llama4Router   c                    > [         TU ]  UR                  UR                  SS9  UR                  U l        UR
                  U l        g r`   )r.   r/   r3   r0   r1   num_experts_per_toktop_kr>   s     rA   r/   Llama4Router.__init__   s>    ++V-E-EER!33//
rC   c                 j  > [         TU ]  U5      n[        R                  " X R                  SS9u  p4[        R
                  " U[        S5      5      R                  SXC5      n[        R                  R                  R                  UR                  5       5      R                  UR                  5      nXR4$ )Nr$   rH   z-inf)r.   rR   r7   topkr   	full_liker   scatter_r5   
functionalsigmoidtodtype)r?   rD   router_logitsrouter_top_valuerouter_indicesrouter_scoresr@   s         rA   rR   Llama4Router.forward   s    6+0::mZZUV+W(uV}ENNqR`s++33M4G4G4IJMMmNaNab++rC   )r1   r   rl   r[   s   @rA   r   r      s    0
, ,rC   r   Llama4TextMoec                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )r      c                    > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        [        U5      U l	        [        U5      U l        [        U5      U l        g ri   )r.   r/   r   r   r3   
hidden_dimr0   r1   r(   expertsr   routerr]   shared_expertr>   s     rA   r/   Llama4TextMoe.__init__   s[    //
 ,,!33(0"6**62rC   c                    UR                  SU R                  5      nU R                  U5      u  p#UR                  UR                  S   S5      nXBR                  SS5      R                  SS5      -  nU R                  U5      nU R                  U5      nUR                  UR                  UR                  S   SUR                  S   5      R                  SS95        Xc4$ )NrG   r$   r   rH   )
reshaper   r   repeatrK   	transposer   r   add_sum)r?   rD   r   r   	routed_in
routed_outouts          rA   rR   Llama4TextMoe.forward   s    %--b$//B'+{{='A$!(()<)<Q)?C	 7 71 = E Eb! LL	\\),
  /##M$7$7$:B
@P@PQS@TUYY^_Y`a!!rC   )r   r   r1   r   r   r   rl   r[   s   @rA   r   r      s    3" "rC   c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\	   SS\S-  S\
S   S\S-  S	\S
\4   4S jj5       r\R                  " 5       \S 5       5       rSrU =r$ )Llama4TextRotaryEmbedding   inv_freqNr*   c                   > [         TU ]  5         UR                  U l        UR                  U l        Xl        U R
                  R                  S   U l        U R                  nU R                  S:w  a  [        U R                     nU" U R
                  U5      u  o@l
        U R                  SUSS9  U R                  SUR                  5       SS9  g )N	rope_typedefaultr   F
persistentoriginal_inv_freq)r.   r/   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr*   rope_parametersr   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r?   r*   devicerope_init_fnr   r@   s        rA   r/   "Llama4TextRotaryEmbedding.__init__   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L((ZeD0(..2BuUrC   r   ztorch.deviceseq_lenrE   ztorch.Tensorc           	         U R                   S   n[        U SS5      =(       d    U R                  U R                  -  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXe4$ )	aH  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetahead_dimN      ?r   r-   r   )r   r   )	r   getattrr3   num_attention_headsr7   arangeint64r   r   )r*   r   r   baserI   attention_factorr   s          rA   r   9Llama4TextRotaryEmbedding.compute_default_rope_parameters   s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 ))rC   c                    U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      nUS S 2S S S 24   R                  5       n[	        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS9   UR                  UR
                  5      U-  R                  SS5      n[        R                  " [        R                  " U5      U5      nXpR                  -  nS S S 5        U$ ! , (       d  f       W$ = f)	Nr   rG   r$   mpscpuF)device_typeenabledr-   )r   r   expandrK   
isinstancer   typestrr!   r   r   r7   polar	ones_liker   )r?   rj   position_idsinv_freq_expandedposition_ids_expandedr   freqs	freqs_ciss           rA   rR   !Llama4TextRotaryEmbedding.forward   s    !MM$4-8>>@GGHZHZ[\H]_acde ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&))!((36KKVVWXZ[\EEOOE$:EBI!$:$::I D
  DC
 s    A(D22
E)r   r*   r   r   r   ri   )NNN)rT   rU   rV   rW   r7   rX   __annotations__r&   r/   staticmethodr   intr   r   r   no_gradr   rR   rY   rZ   r[   s   @rA   r   r      s    llV/ V V  *.+/"* 4'*(* t* 
~u$	%	* *< ]]_
  
rC   r   xqxkr   rE   c           	      *   [         R                  " U R                  5       R                  " / U R                  S S QSPSP76 5      n[         R                  " UR                  5       R                  " / UR                  S S QSPSP76 5      n[         R
                  " X2S S 2S S 2S S S 24   -  5      R                  S5      n[         R
                  " XBS S 2S S 2S S S 24   -  5      R                  S5      nUR                  U 5      UR                  U5      4$ )NrG   r-   r   )r7   view_as_complexr   r   rK   view_as_realflattenr   )r   r   r   xq_xk_xq_outxk_outs          rA   apply_rotary_embr      s    
 


 2 2 IBHHSbM I2 Iq I
JC



 2 2 IBHHSbM I2 Iq I
JC1dA&> >?GGJF1dA&> >?GGJF>>"v~~b111rC   rD   n_repc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r$   N)rK   r   r   )rD   r   batchnum_key_value_headsslenr   s         rA   	repeat_kvr    s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTrC   modulequerykeyvalueattention_maskscalingdropoutc                    [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub  X-   n
[
        R                  R                  U
SS9n
[
        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr-   r   rG   rH   ptrainingr$   )r  num_key_value_groupsr7   matmulr   r5   r   softmaxr  r  
contiguousr  r  r  r  r	  r
  r  kwargs
key_statesvalue_statesattn_weightsattn_outputs               rA   eager_attention_forwardr    s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!#4==((2(>L==((6??([L,,|:K''1-88:K$$rC   c                    [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U R
                  S-  -  n
Ub  X-   n
[        R                  R                  U
SS9n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr-   r         rG   rH   r  r$   )r  r  r7   r  r   r   r5   r   r  r  r  r  r  s               rA   vision_eager_attention_forwardr  (  s     3 ; ;<JU$?$?@L<<';';Aq'ABV__VZEZZL!#4==((2(>L==((6??([L,,|:K''1-88:K$$rC   c                   "  ^  \ rS rSrSrS\4U 4S jjr SS\R                  S\	\R                  \R                  4   S\R                  S-  S	\
S-  S
\\   S\	\R                  \R                  S-  \	\R                     S-  4   4S jjrSrU =r$ )Llama4TextAttentioniA  z=Multi-headed attention from 'Attention Is All You Need' paperr*   c                   > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  U l        UR                  UR                  -  U l	        UR                  U l        U R                  S-  U l
        UR                  U l        UR                  U l        UR                  U l        UR                  U l        SU l        UR                   U   U l        [$        R&                  " UR
                  UR                  U R                  -  UR(                  S9U l        [$        R&                  " UR
                  UR                  U R                  -  UR(                  S9U l        [$        R&                  " UR
                  UR                  U R                  -  UR(                  S9U l        [$        R&                  " UR                  U R                  -  UR
                  UR(                  S9U l        U R                  R2                  (       a-  U R"                  (       a  [5        UR6                  5      U l        g g g )Nr   r  Tra   )r.   r/   r*   	layer_idxr   r3   r   r   r  r  r
  
attn_scalefloor_scaleattn_temperature_tuningattention_dropout	is_causalno_rope_layersuse_roper5   rc   attention_biasq_projk_projv_projo_projuse_qk_normrn   rms_norm_epsqk_normr?   r*   r!  r@   s      rA   r/   Llama4TextAttention.__init__D  s   "
F4F4F&JdJd4de#)#=#= $*$>$>&B\B\$\!#)#=#= }}d* ++!--'-'E'E$!'!9!9--i8ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 ;;""t}}+F,?,?@DL (5"rC   NrD   position_embeddingsr	  past_key_valuesr  rE   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      nU R	                  U5      R                  " / UQSPU R                  P76 n	U R                  U5      R                  U5      R                  SS5      n
U R                  (       a'  [        XUR                  UR                  5      5      u  p[        U S5      (       a"  U R                  U5      nU R                  U	5      n	U R                  (       Ga  U R                  (       d  Ub  UR                  U R                  5      OSn[         R"                  " UR                   S   UR                  S9U-   n[         R$                  " [         R&                  " UR)                  5       S-   U R*                  -  5      5      U R,                  -  S-   nUR                  SUS   SS45      R/                  / UQSPSP75      nX-  R                  UR0                  5      nUR                  SS5      nU	R                  SS5      n	Ub  UR3                  XU R                  5      u  p[4        R6                  " U R8                  R:                  [<        5      nU" U UU	U
U4U R>                  (       d  SOU R@                  U RB                  S	.UD6u  nnURD                  " / UQSP76 RG                  5       nU RI                  U5      nUU4$ )
NrG   r$   r-   r0  r   r   r           )r  r
  )%rK   r   r*  rJ   r+  r,  r   r(  r   r   r   hasattrr0  r$  get_seq_lengthr!  r7   r   log1pfloorr   r#  r"  r   r   updater   get_interfacer*   _attn_implementationr  r  r%  r
  r   r  r-  )r?   rD   r3  r	  r4  r  input_shapehidden_shapequery_statesr  r  past_seen_tokens	positionsattn_scalesattention_interfacer  r  s                    rA   rR   Llama4TextAttention.forwardb  s    $))#2.88b8$--8{{=166|D[[/44UkU2Ut}}U
{{=166|DNNqRST=='7*=*@*@ATAT*U($L 4##<<5Lj1J '''Q`Ql==dnnMrs]%8%8%;MDXDXY\llIEKK):S)@DDTDT(TUVY]YhYhhknn  &**A{21+EFMMNbP[Nb]^Nb`aNbcK(6::<;M;MNL#--a3))!Q/
&'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((rC   )r%  r"  r$  r*   r#  r   r&  r+  r!  r   r  r  r-  r*  r0  r
  r(  r,  ri   )rT   rU   rV   rW   __doc__r&   r/   r7   rX   r   r
   r   r   rR   rY   rZ   r[   s   @rA   r  r  A  s    GA/ AF )-8)||8) #5<<#=>8) t+	8)
 8) -.8) 
u||U\\D0%2E2LL	M8) 8)rC   r  c                   H  ^  \ rS rSrU 4S jr     SS\R                  S\R                  S-  S\R                  S-  S\S-  S\	S-  S	\
\R                  \R                  4   S-  S
\\   S\
\R                  \
\R                  \R                  4   S-  4   4S jjrSrU =r$ )Llama4TextDecoderLayeri  c                   > [         TU ]  5         UR                  U l        X l        [	        X5      U l        X!R                  ;   U l        U R                  (       a  [        U5      U l	        O[        XR                  S9U l	        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )N)r2   r   )r.   r/   r3   r!  r  	self_attn
moe_layersis_moe_layerr   feed_forwardr]   intermediate_size_mlpr   r/  input_layernormpost_attention_layernormr1  s      rA   r/   Llama4TextDecoderLayer.__init__  s    !--",V?%):):: -f 5D -fHdHd eD01C1CI\I\](9&:L:LRXReRe(f%rC   NrD   r	  r   r4  	use_cacher3  r  rE   c           	         UnU R                  U5      nU R                  " SUUUUUS.UD6u  pX-   nUnU R                  U5      nU R                  U5      nU R                  (       a  Uu  pXR                  UR                  5      -   nU$ )N)rD   r3  r	  r4  rS   )rP  rK  rQ  rN  rM  rJ   rK   )r?   rD   r	  r   r4  rS  r3  r  residualattention_states_s              rA   rR   Llama4TextDecoderLayer.forward  s     !,,]; #nn 
' 3)+
 
 !3 !55mD))-8,M #5#5hnn#EErC   )rN  r3   rP  rM  r!  rQ  rK  )NNNFN)rT   rU   rV   rW   r/   r7   rX   
LongTensorr
   boolr   r   r   FloatTensorrR   rY   rZ   r[   s   @rA   rI  rI    s    g" /304(,!&HL ||  t+  &&-	 
   $;  #5<<#=>E  -.  
u  %(9(95;L;L(L"MPT"TT	U   rC   rI  c                   ~   ^  \ rS rSr% \\S'   SrSrS/rSr	Sr
SrSrSr\R                  " 5       U 4S j5       rSrU =r$ )	Llama4PreTrainedModeli  r*   )imagetextTr4  Fc                   > [         TU ]  U5        [        U R                  S5      (       a  U R                  R                  OU R                  R
                  R                  n[        U[        5      (       aA  [        R                  " UR                  SUS9  [        R                  " UR                  SUS9  g [        U[        5      (       a;  [        R                  " UR                  UR                  UR                  5      5        g [        U[         5      (       aS  [        R                  " UR"                  UR$                  S9  [        R                  " UR&                  UR$                  S9  g g )Ninitializer_ranger7  )ry   std)rc  )r.   _init_weightsr8  r*   rb  text_configr   r(   initnormal_r9   r;   Llama4VisionRotaryEmbeddingcopy_freqs_ci_compute_freqs_ciLlama4VisionModelclass_embeddingscalepositional_embedding_vlm)r?   r  rc  r@   s      rA   rd  #Llama4PreTrainedModel._init_weights  s    f% t{{$788 KK))((:: 	
 f/00LL,,3C@LL))= ;<<JJv(@(@(OP 122LL//V\\BLL88fllK 3rC   rU  )rT   rU   rV   rW   r%   r   input_modalitiessupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr7   r   rd  rY   rZ   r[   s   @rA   r^  r^    sU    (&*##4"5 N!"&
]]_L LrC   r^  c                   8  ^  \ rS rSr% S/rSrSr\\S'   \	\
\S.rS\4U 4S jjr\\\\      SS	\R&                  S-  S
\R(                  S-  S\R&                  S-  S\S-  S\R,                  S-  S\S-  S\\   S\\-  4S jj5       5       5       5       rSrU =r$ )Llama4TextModeli  rI  model)r`  r*   )
attentionsrD   r   c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        [#        US9U l        SU l        U R)                  5         g s  snf )Nr   r*   F)r.   r/   pad_token_idpadding_idx
vocab_sizer5   	Embeddingr3   embed_tokens
ModuleListrangenum_hidden_layersrI  layersr   r/  normr   
rotary_embgradient_checkpointing	post_initr1  s      rA   r/   Llama4TextModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammHMfNfNfHghHg9#F6Hgh
 &f&8&8f>Q>QR	36B&+# 	 is   C?N	input_idsr	  r   r4  inputs_embedsrS  r  rE   c           
      L   US L US L-  (       a  [        S5      eUc>  U R                  UR                  U R                  R                  R                  5      5      nU(       a  Uc  [        U R                  S9nUcU  Ub  UR                  5       OSn[        R                  " UR                  S   UR                  S9U-   nUR                  S5      n[        U=n	[        5      (       d)  U R                  UUUUS.n
[        S
0 U
D6[        S
0 U
D6S.n	UnU R!                  X5      n[#        U R$                  S U R                  R&                   5       H-  u  pU" U4XR                  R(                  U      UUUUS.UD6nM/     U R+                  U5      n[-        UU(       a  US	9$ S S	9$ )N:You must specify exactly one of input_ids or inputs_embedsr~  r   r$   r6  )r*   r  r	  r4  r   )full_attentionchunked_attention)r	  r   r4  rS  r3  )last_hidden_stater4  rU  )
ValueErrorr  r   r   r   r   r*   r9  r7   r   rK   	unsqueezer   dictr   r   r  	enumerater  r  layer_typesr  r   )r?   r  r	  r   r4  r  rS  r  rB  causal_mask_mappingmask_kwargsrD   freq_cisidecoder_layers                  rA   rR   Llama4TextModel.forward
  s    -t";<YZZ  --ill4;L;L;S;S;Z;Z.[\M0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L ?-FF ++!."0#2 ,K #5"C{"C%?%N+%N#
 & ??=? )$++6U8U8U*V WA)2;;3J3J13MN) /#$, M !X 		-0&+/8O
 	
>B
 	
rC   )r  r  r  r  r  r  r  )NNNNNN)rT   rU   rV   rW   _no_split_modulesbase_model_prefixrq  r&   r   r  rI  r   _can_record_outputsr/   r   r"   r#   r   r7   rZ  rX   r
   r\  r[  r   r   r   r   rR   rY   rZ   r[   s   @rA   rz  rz    s   12 )/&/    .2.204(,26!%<
##d*<
 t+<
 &&-	<

 <
 ((4/<
 $;<
 +,<
 
(	(<
     <
rC   rz  c                   d  ^  \ rS rSr% S/rSrSS0rSS0r\\	S'   S\4U 4S	 jjr
\\        SS\R                  S
-  S\R                  S
-  S\R                  S
-  S\S
-  S\R"                  S
-  S\R                  S
-  S\S
-  S\\R                  -  S\\   S\\-  4S jj5       5       rSrU =r$ )Llama4ForCausalLMiM  rI  language_modelzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr*   c                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g r`   )
r.   r/   rz  r{  r  r5   rc   r3   r  r  r>   s     rA   r/   Llama4ForCausalLM.__init__T  sU     $V,
 ++yy!3!3V5F5FUS 	rC   Nr  r	  r   r4  r  labelsrS  logits_to_keepr  rE   c	           
      n   U R                   " SUUUUUUS.U	D6n
U
S   n[        U[        5      (       a  [        U* S5      OUnU R	                  USS2USS24   5      nSnUb)  U R
                  " SXU R                  R                  S.U	D6n[        UUU
R                  U
R                  U
R                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, Llama4ForCausalLM

>>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf")
>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```)r  r	  r   r4  r  rS  r   N)logitsr  r  )lossr  r4  rD   r|  rU  )r{  r   r   slicer  loss_functionr*   r  r   r4  rD   r|  )r?   r  r	  r   r4  r  r  rS  r  r  outputsrD   slice_indicesr  r  s                  rA   rR   Llama4ForCausalLM.forward]  s    H ** 
)%+'
 
  
8B>SV8W8W~ot4]kmA}a,?@A%%pVt{{OeOepiopD%#33!//))
 	
rC   )r  r{  r  )NNNNNNNr   )rT   rU   rV   rW   r  r  _tied_weights_keys_tp_planr&   r   r/   r   r   r7   rZ  rX   r
   r\  r[  r   r   r   r   r   rR   rY   rZ   r[   s   @rA   r  r  M  s&   12(*,GH23H/   .2.204(,26*.!%-.:
##d*:
 t+:
 &&-	:

 :
 ((4/:
   4':
 $;:
 ell*:
 +,:
 
'	':
  :
rC   r  zQ
    Base class for Llava causal language model (or autoregressive) outputs.
    custom_introc                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S	'   S
rg)Llama4CausalLMOutputWithPasti  a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nr  r  r4  rD   r|  image_hidden_statesrU  )rT   rU   rV   rW   rG  r  r7   r\  r   r  r4  r
   rD   r   r|  r  rY   rU  rC   rA   r  r    s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18rC   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Llama4VisionMLP2i  c                 x  > [         TU ]  5         UR                  U l        UR                  U l        [        R
                  " U R                  UR                  SS9U l        [        R
                  " UR                  UR                  SS9U l	        [        R                  " 5       U l        UR                  U l        g r`   )r.   r/   r3   r2   r5   rc   projector_input_dimfc1projector_output_dimfc2GELUrf   projector_dropoutr  r>   s     rA   r/   Llama4VisionMLP2.__init__  s    !--!'!9!999T33V5O5OV[\99V88&:U:U\abWWY//rC   c                     U R                  U5      nU R                  U5      n[        R                  " XR                  U R                  S9nU R                  U R                  U5      5      $ )Nr  )r  rf   Fr  r  r  r?   rD   s     rA   rR   Llama4VisionMLP2.forward  sR    /**=9		-<<$--X!!$((="9::rC   )rf   r  r  r  r3   r2   rl   r[   s   @rA   r  r    s    0; ;rC   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Llama4MultiModalProjectori  c                    > [         TU ]  5         [        R                  " UR                  R
                  UR                  R                  SS9U l        g r`   )	r.   r/   r5   rc   vision_configvision_output_dimre  r3   linear_1r>   s     rA   r/   "Llama4MultiModalProjector.__init__  s?    		  22**
rC   c                 (    U R                  U5      nU$ ri   r  )r?   image_featuresrD   s      rA   rR   !Llama4MultiModalProjector.forward  s    n5rC   r  rl   r[   s   @rA   r  r    s    
 rC   r  c           
      8   U R                   u  p#n[        [        R                  " U5      5      nU R	                  X%US5      n U R                  5       u  p&ptU R	                  X&[        Xq-  5      [        XA-  5      5      nUR                  SSSS5      R                  5       nUR	                  U[        Xa-  5      [        Xq-  5      [        XAS-  -  5      5      nUR                  SSSS5      R                  5       nUR	                  USUR                   S   5      n	U	$ )NrG   r   r-   r$   r   )rK   r   mathsqrtrJ   sizepermuter  )
input_tensorshuffle_ratio
batch_sizenum_patcheschannels
patch_sizeheightwidthreshaped_tensoroutput_tensors
             rA   pixel_shuffler    s   (4(:(:%JXTYY{+,J$$ZZLL*6*;*;*='J"''
C@U<VX[\d\tXuvO%--aAq9DDFO%**C./U5J1KSQYlm]mQnMoO &--aAq9DDFO#((R9N9Nr9RSMrC   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )Llama4VisionPixelShuffleMLPi  c                    > [         TU ]  5         UR                  U l        [        UR                  U R                  S-  -  5      U l        UR                  U l        [        U5      U l	        g r,   )
r.   r/   pixel_shuffle_ratior   r  	inner_dimr  
output_dimr  mlpr>   s     rA   r/   $Llama4VisionPixelShuffleMLP.__init__  sX    #)#=#= V77D<T<TVW<WXY 55#F+rC   encoded_patchesrE   c                 N    [        XR                  5      nU R                  U5      $ ri   )r  r  r  )r?   r  s     rA   rR   #Llama4VisionPixelShuffleMLP.forward  s!    '9Q9QRxx((rC   )r  r  r  r  
rT   rU   rV   rW   r/   r7   rX   rR   rY   rZ   r[   s   @rA   r  r    s(    ,)u|| ) ) )rC   r  rj  c                     UR                   n[        UR                  5       VVs/ s H  u  p4US:X  d  X2S-
  :X  a  UOSPM     nnnU R                  " U6 $ s  snnf )Nr$   )ndimr  rK   rJ   )rj  r  r  r  drK   s         rA   reshape_for_broadcastr    sT    ::D=Fu{{=ST=STQ!q&AMQq0=SET==%   Us   Ac                 >   [         R                  " U R                  5       R                  " / U R                  S S QSPSP76 5      n[         R                  " UR                  5       R                  " / UR                  S S QSPSP76 5      n[        X#S9nUR                  UR                  5      n[         R                  " X2-  5      R                  S5      n[         R                  " XB-  5      R                  S5      nUR                  U 5      UR                  U5      4$ )NrG   r-   )rj  r  r   )r7   r   r   r   rK   r  r   r   r   r   r   )r  r  rj  query_key_	query_outkey_outs          rA   vision_apply_rotary_embr    s    
 ""5;;=#8#8#R%++cr:J#RB#RPQ#RSF  !4!4!Lciin!Lb!L!!LMD$hEH{{6==)H""6#45==a@I  199!<GU#W__S%999rC   c                     ^  \ rS rSrS\4U 4S jjr  SS\R                  S\R                  S\R                  S-  S\S-  S	\	\
   S
\\R                  \R                  S-  \\R                     S-  4   4S jjrSrU =r$ )Llama4VisionAttentioni  r*   c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  UR
                  -  U l        SU l        UR                  U l	        U R                  S-  U l
        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  -  U R                  SS9U l        g )Nr$   r  Tra   )r.   r/   r*   r3   	embed_dimr   	num_headsr   r  r%  r
  r5   rc   r*  r+  r,  r-  r>   s     rA   r/   Llama4VisionAttention.__init__  s   ++33**f.H.HH$%!!'!9!9}}d*ii0NUYZii0NUYZii0NUYZii >UYZrC   NrD   rj  r	  r4  r  rE   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      nU R	                  U5      R                  U5      n	U R                  U5      R                  U5      n
[        XUS9u  pUR                  SS5      nU	R                  SS5      n	U
R                  SS5      n
[        R                  " U R                  R                  [        5      nU" U UU	U
S 4U R                  (       d  SOU R                  S SS.UD6u  pUR                  " / UQSP76 R!                  5       nU R#                  U5      nX4$ )NrG   )rj  r$   r-   r7  F)r  r
  r&  )rK   r   r*  rJ   r+  r,  r  r   r   r=  r*   r>  r  r  r%  r   r  r-  )r?   rD   rj  r	  r4  r  r?  r@  rA  r  r  rE  r  r  s                 rA   rR   Llama4VisionAttention.forward  s`    $))#2.88b8$--8{{=166|D[[/44\B
{{=166|D#:<^f#g #--a3))!Q/
#--a3(?(M(MKK,,.L)
 %8
%
  $}}C$2H2H
%
 
%
! "));;;;FFHkk+.((rC   )r%  r*   r  r   r+  r  r  r-  r*  r
  r,  NN)rT   rU   rV   rW   r   r/   r7   rX   r
   r   r   r   rR   rY   rZ   r[   s   @rA   r  r    s    [1 [& /3(,')||') ,,') t+	')
 ') -.') 
u||U\\D0%2E2LL	M') ')rC   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )Llama4VisionMLPiI  c                   > [         TU ]  5         Xl        [        R                  " 5       U l        [        R                  " UR                  UR                  SS9U l	        [        R                  " UR                  UR                  SS9U l
        g )NTra   )r.   r/   r*   r5   r  rf   rc   r3   r2   r  r  r>   s     rA   r/   Llama4VisionMLP.__init__J  sc    WWY99V//1I1IPTU99V55v7I7IPTUrC   rD   rE   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ ri   )r  rf   r  r  s     rA   rR   Llama4VisionMLP.forwardQ  s4    /**=9/rC   )rf   r*   r  r  r  r[   s   @rA   r  r  I  s)    VU\\ ell  rC   r  c            
          ^  \ rS rSrS\4U 4S jjr  SS\R                  S\R                  S\R                  S-  S\S-  4S	 jjr	S
r
U =r$ )Llama4VisionEncoderLayeriX  r*   c                   > [         TU ]  5         UR                  U l        [        U5      U l        [        U5      U l        [        R                  " UR                  5      U l	        [        R                  " UR                  5      U l
        g ri   )r.   r/   r3   r  rK  r  r  r5   	LayerNormrP  rQ  r>   s     rA   r/   !Llama4VisionEncoderLayer.__init__Y  sb    !--.v6"6*!||F,>,>?(*V5G5G(H%rC   Nhidden_staterj  r	  output_attentionsc                     UnU R                  U5      nU R                  UUUS9u  pXQ-   nUnU R                  U5      nU R                  U5      nXQ-   nU4nU(       a  Xv4-  nU$ )N)rj  r	  )rP  rK  rQ  r  )r?   r  rj  r	  r  rV  r  r  s           rA   rR    Llama4VisionEncoderLayer.forwardc  s      ++L9%)^^) &4 &
"
  .  44\Bxx-./&GrC   )r3   rP  r  rQ  rK  r  )rT   rU   rV   rW   r   r/   r7   rX   r[  rR   rY   rZ   r[   s   @rA   r	  r	  X  s_    I1 I /3)-ll ,, t+	
  $; rC   r	  c                      ^  \ rS rSrSrS\4U 4S jjr    SS\R                  S\R                  S\R                  S-  S	\	S-  S
\	S-  S\	S-  S\
\-  4S jjrSrU =r$ )Llama4VisionEncoderi  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`Llama4VisionEncoderLayer`].

Args:
    config: Llama4VisionConfig
r*   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        Xl        g s  snf )NF)
r.   r/   r*   r5   r  r  r  r	  r  r  )r?   r*   rX  r@   s      rA   r/   Llama4VisionEncoder.__init__  sY    mmuU[UmUmOn$oOn!%=f%EOn$op&+# %ps   A,NrD   rj  r	  r  output_hidden_statesreturn_dictrE   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSnU R                   H,  n	U(       a  Xq4-   nU	" UUUUS9n
U(       a  XS   4-   nU
S   nM.     U(       a  Xq4-   nU(       d  [        S XU4 5       5      $ [        XUS9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
NrU  )r  r	  r  rj  r$   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fri   rU  .0vs     rA   	<genexpr>.Llama4VisionEncoder.forward.<locals>.<genexpr>  s     e$Sq$S   	r  rD   r|  )r*   r  r  r  r  r   r   )r?   rD   rj  r	  r  r  r  encoder_statesall_attentionsencoder_layerlayer_outputss              rA   rR   Llama4VisionEncoder.forward  s    > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY30d![[M#!/2B!B)*-"3!	M !!/3C2E!E)!,M )   +.>>Ne]N$Seee+Vd
 	
rC   )r*   r  r  NNNN)rT   rU   rV   rW   rG  r   r/   r7   rX   r[  r   r   rR   rY   rZ   r[   s   @rA   r  r    s    1  /3)-,0#'?
||?
 ,,?
 t+	?

  $;?
 #Tk?
 D[?
 
	 ?
 ?
rC   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )Llama4UnfoldConvolutioni  c                 8  > [         TU ]  5         UR                  n[        U[        5      (       a  X"4n[
        R                  R                  X!R                  S9U l        [        R                  " UR                  US   -  US   -  UR                  SS9U l        g )N)kernel_sizestrider   r$   Fra   )r.   r/   r  r   r   r7   r5   Unfoldunfoldrc   num_channelsr3   linear)r?   r*   r)  r@   s      rA   r/    Llama4UnfoldConvolution.__init__  s    ''k3''&4Khhoo+FWFWoXii+a.0;q>A
rC   rD   rE   c                 p    U R                  U5      nUR                  SSS5      nU R                  U5      nU$ )Nr   r-   r$   )r,  r  r.  r  s     rA   rR   Llama4UnfoldConvolution.forward  s8    M2%--aA6M2rC   )r.  r,  r  r[   s   @rA   r'  r'    s(    

U\\ ell  rC   r'  c                   F   ^  \ rS rSrS\4U 4S jjr\S 5       rS rSr	U =r
$ )rh  i  r*   c                 n   > [         TU ]  5         Xl        U R                  SU R	                  U5      SS9  g )Nrj  Fr   )r.   r/   r*   r   rk  r>   s     rA   r/   $Llama4VisionRotaryEmbedding.__init__  s4    Z)?)?)GTYZrC   c                    U R                   U R                  -  n[        R                  " US-  [        R                  S9R                  US-  S5      n[        R                  " X"S S /SS9nSUS'   X!-  nX!-  nU R                  U R                  -  S-  nSU R                  S	   [        R                  " SUS5      S US-   R                  5       U-  -  -  nUS-   S
   US S S S 24   -  R                  SSS9nUS-   S
   US S S S 24   -  R                  SSS9n[        R                  " Xx/SS9R                  5       R                  5       SS S S24   n	U	R                  UR                  SSS5      S:  S5      n	[        R                  " [        R                  " [        R                   " U	5      [        R"                  " U	5      /SS95      n
U
$ )Nr-   r   r$   r   rH   )rG   rG   r   r   ).NrG   .)
image_sizer  r7   r   int32r   catr3   r   r   r   repeat_interleaver  masked_fillr   stackcossin)r*   idximg_idxfrequencies_xfrequencies_yfreq_dim	rope_freqfreqs_xfreqs_yr   r  s              rA   rk  -Llama4VisionRotaryEmbedding._compute_freqs_ci  s   6#4#44,,sAvU[[9AA#q&!L))Wbqk2:%%)C)CCqH""<0Q!,->A?EEG(RT
	 "A%y1IdD!m4LL__`agi_j!A%y1IdD!m4LL__`agi_j		7,"5;;=HHJ3PSRSPS8T!!'//"a";a"?C((eii6F		RWHX5Y_a)bcrC   c                 L    U R                   R                  UR                  5      $ ri   )rj  r   r   r  s     rA   rR   #Llama4VisionRotaryEmbedding.forward  s    }} 4 455rC   r~  )rT   rU   rV   rW   r   r/   r   rk  rR   rY   rZ   r[   s   @rA   rh  rh    s0    [1 [
  &6 6rC   rh  c                      ^  \ rS rSr% SrSrS/r\\S'   S\4U 4S jjr	S r
    SS	\R                  S
\R                  S-  S\S-  S\S-  S\S-  S\\\R                  S4   -  4S jjrSrU =r$ )rl  i  vision_model)r_  r	  r*   c                 ~  > [         TU ]  U5        UR                  U l        UR                  U l        UR                  U l        UR
                  U l        U R                  U R                  -  S-  S-   U l        UR                  S-  U l        [        U5      U l	        [        R                  " U R                  [        R                  " U R                  5      -  5      U l        [        R                  " U R                  [        R                  " U R                  U R                  5      -  5      U l        [!        U5      U l        [        R$                  " U R                  5      U l        [        R$                  " U R                  5      U l        [+        U5      U l        [/        U5      U l        U R3                  5         g )Nr-   r$   r  )r.   r/   r7  r  r3   r-  r  rn  r'  patch_embeddingr5   r6   r7   randnrm  ro  rh  rotary_embeddingr  layernorm_prelayernorm_postr  r{  r  vision_adapterr  r>   s     rA   r/   Llama4VisionModel.__init__  sA     ++ ++!--"// OOt>1DqH''-
6v>!||DJJTEUEU9V,VW(*TZZ%++dN^N^`d`p`pBq5q(r% ;F C  \\$*:*:; ll4+;+;< )0
9&ArC   c                     U R                   $ )zW
This function is used to fetch the first embedding layer to activate grads on inputs.
)rM  r   s    rA   get_input_embeddings&Llama4VisionModel.get_input_embeddings'  s     ###rC   Npixel_valuesr	  r  r  r  rE   .c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUR                  u  pxpSnSnU R                  U5      nUR                  u  pnUR                  X{-  U-  UU5      nU R                  R                  UR                  S   SUR                  S   5      n[        R                  " UU/SS9nUS-  nUR                  X{-  XU5      nU R                  R                  UR                  UR                  S9nUU-   nU R                  U5      nUR!                  USU5      nU R#                  U5      nU R%                  USUUUS9nUR&                  nU R)                  U5      nUSS2SS2SS24   nU R+                  U5      nU(       a  UR,                  OSnU(       a  US   nOSnU(       d  [/        S	 UUU4 5       5      $ [1        UUUS
9$ )a  

Example:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, MllamaVisionModel

>>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
>>> model = MllamaVisionModel.from_pretrained(checkpoint)
>>> processor = AutoProcessor.from_pretrained(checkpoint)

>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))
>>> inputs = processor(images=image, return_tensors="pt")

>>> output = model(**inputs)

>>> print(output.last_hidden_state.shape)
torch.Size([1, 1, 4, 1025, 7680])
```
Nr$   r   rG   rH   r   r   )r	  r  r  rj  r-   c              3   .   #    U  H  oc  M  Uv   M     g 7fri   rU  r  s     rA   r  ,Llama4VisionModel.forward.<locals>.<genexpr>  s     _$Mq$Mr  r  )r*   r  r  r  rK   rM  r   rm  r   r7   r9  ro  r   r   r   rP  rJ   rO  r{  r  rQ  rR  rD   r   r   )r?   rW  r	  r  r  r  r  batch_size_times_num_tilesr-  r  r  num_concurrent_media
num_chunksr  rX  r  r   rm  positional_embeddingrj  r   rD   r|  s                          rA   rR   Llama4VisionModel.forward-  sF   D 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY COBTBT?"& 
++L9%1%7%7"
 $++&=
JKYc
 ..55l6H6H6KQP\PbPbcePfgyy,!@aHq $++&=zXb
  $<<??lFXFXamatat?u#&::)),7#(()CRT((6!5/  
 //**<8#AssAI. **<80D,,$JJ_\=*$M___)*'!
 	
rC   )rm  r3   r7  rQ  rP  r{  r-  r  rM  r  ro  rO  rn  rR  r%  )rT   rU   rV   rW   r  rq  r  r   r   r/   rU  r7   rX   r[  r   r   rR   rY   rZ   r[   s   @rA   rl  rl    s    &!341 2$ /3)-,0#'b
llb
 t+b
  $;	b

 #Tkb
 D[b
 
$eELL#,=&>	>b
 b
rC   rl  c            #         ^  \ rS rSr% SS/r0 rSr\\S'   S\4U 4S jjr	S r
S rS	 rS
 rS rS r\\" SS9\" SS9S\R(                  S\S\\   S\\-  4S j5       5       5       rS\R6                  S\R(                  S\R(                  4S jr\\" SS9\             S'S\R6                  S-  S\R(                  S-  S\R:                  S-  S\R6                  S-  S\S-  S\R(                  S-  S\S-  S\R6                  S-  S\S-  S \S-  S!\S-  S"\S-  S#\ \R:                  -  S\\   S\\!-  4S$ jj5       5       5       r"      S(S% jr#S&r$U =r%$ ))Llama4ForConditionalGenerationi  rI  r	  r{  r*   c                   > [         TU ]  U5        [        UR                  5      U l        [        U5      U l        [        UR                  5      U l	        UR                  R                  U l
        [        U R                  S5      (       a  U R                  R                  U l        O.U R                  R                  R                  =(       d    SU l        U R                  5         g )Nr  rG   )r.   r/   rl  r  rK  r  multi_modal_projectorr  re  r  r  r8  r*   r  r  r>   s     rA   r/   'Llama4ForConditionalGeneration.__init__  s     -f.B.BC%>v%F"/0B0BC ,,774;;// $ 8 8D $ 7 7 D D JDrC   c                 6    U R                   R                  5       $ ri   )r  rU  r   s    rA   rU  3Llama4ForConditionalGeneration.get_input_embeddings  s    ""7799rC   c                 :    U R                   R                  U5        g ri   )r  set_input_embeddings)r?   r  s     rA   ri  3Llama4ForConditionalGeneration.set_input_embeddings  s    007rC   c                 6    U R                   R                  5       $ ri   )r  get_output_embeddingsr   s    rA   rl  4Llama4ForConditionalGeneration.get_output_embeddings  s    ""88::rC   c                 :    U R                   R                  U5        g ri   )r  set_output_embeddings)r?   new_embeddingss     rA   ro  4Llama4ForConditionalGeneration.set_output_embeddings  s    11.ArC   c                 :    U R                   R                  U5        g ri   )r  set_decoder)r?   decoders     rA   rs  *Llama4ForConditionalGeneration.set_decoder  s    ''0rC   c                 6    U R                   R                  5       $ ri   )r  get_decoderr   s    rA   rw  *Llama4ForConditionalGeneration.get_decoder  s    ""..00rC   F)tie_last_hidden_stateszOObtains image last hidden states from the vision tower and apply al projection.r  rW  vision_feature_select_strategyr  rE   c                     UR                  5        VVs0 s H  u  pEUc  M
  XE_M     nnnU R                  " U40 UD6$ s  snnf )a:  
pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
    The tensors corresponding to the input images.
vision_feature_select_strategy (`str`):
    The feature selection strategy used to select the vision feature from the vision backbone.
    Can be one of `"default"` or `"full"`
)itemsrK  )r?   rW  rz  r  kr  s         rA   get_image_features1Llama4ForConditionalGeneration.get_image_features  sB      $*<<>C>41Q$!$>C  888 Ds   	==r  r  r  c           	      "   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S5      R                  U5      R                  UR                  5      n[        X$   R                  5       UR                  5       :H  SU SUR                  S    35        U$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
rY  rG   z6Image features and image tokens do not match, tokens: z, features: r   )rU  r7   tensorr*   image_token_idlongr   allr   r  	expand_asr   r    numelrK   )r?   r  r  r  special_image_maskn_image_tokenss         rA   get_placeholder_mask3Llama4ForConditionalGeneration.get_placeholder_mask  s     !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1/99"=GGVYYZgZnZno-3359M9M9OOD^DTT`aoauauvwax`yz	
 "!rC   Nr	  r   r4  r  rS  r  r  r  r  c                    U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUSL USL-  (       a  [	        S5      eUb  Ub  [	        S5      eUc  U R                  5       " U5      nUb  U R                  UUSS9R                  nUR                  SUR                  S5      5      nU R                  U5      R                  UR                  UR                  5      nU R                  XUS9nUR                  UU5      nU R                   " SUUUUU	U
UUUS.	UD6nUS	   nSnUGb>  Ub  USS2UR"                  S
   S
-
  * S24   R                  UR                  5      nUSSS2SS24   UR                  UR                  5      S	:g     R%                  5       nUSS
S24   UR                  UR                  5      S	:g     R%                  5       nO1USSS2SS24   R%                  5       nUSS
S24   R%                  5       n[&        R(                  " 5       nU" UR                  SUR                  S5      5      UR                  S5      R                  UR                  5      5      nU(       d  U4US
S -   nUb  U4U-   $ U$ [+        UUUR,                  UR.                  UR0                  Ub  WS9$ SS9$ )aA  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, LlavaForConditionalGeneration

>>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
>>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

>>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, text=prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs, max_new_tokens=15)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
```Nr  zdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either oneT)rW  rz  r  rG   )r  r  )	r	  r   r4  r  rS  r  r  r  r  r   r$   .)r  r  r4  rD   r|  r  rU  )r*   r  r  r  r  rU  r~  r  rJ   r  rd  r   r   r   r  masked_scatterr  rK   r  r5   CrossEntropyLossr  r4  rD   r|  )r?   r  rW  r	  r   r4  r  rz  r  rS  r  r  r  r  r  r  vision_flatprojected_vision_flatr  r  r  r  shift_attention_maskshift_logitsshift_labelsloss_fctr   s                              rA   rR   &Llama4ForConditionalGeneration.forward  sK   d 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY-t";<YZZ#(Av    557	BM#!44)/M  5   	  )--b.2E2Eb2IJK$($>$>{$K$N$N$$m&9&9%! "&!:!:G\ "; " *889KMbcM%% 
)%+'/!5#)
 
 ) (6a6<<?Q;N9O9Q6Q'R'U'UV\VcVc'd$%c3B3k23G3J3J6==3Y]^3^_jjl%c12g/C/F/Fv}}/UYZ/Z[ffh%c3B3k2==?%c12g99;**,H!!"l&7&7&;<l>O>OPR>S>V>VWcWjWj>kD Y,F'+'7D7V#CVC+#33!//))2>2J
 	
 QU
 	
rC   c           	          U R                   R                  " U4UUUUUS.UD6n	U(       d  UR                  SS5      (       d  XIS'   U	$ )N)r4  r  r	  r  is_first_iterationrS  TrW  )r  prepare_inputs_for_generationget)
r?   r  r4  r  rW  r	  r  r  r  model_inputss
             rA   r  <Llama4ForConditionalGeneration.prepare_inputs_for_generationc  s`     **HH
+'))1
 
 VZZT%B%B
 ,8(rC   )r  rd  r  rK  r  )NNNNNNNNNNNNr   )NNNNNF)&rT   rU   rV   rW   r  r  r  r%   r   r/   rU  ri  rl  ro  rs  rw  r"   r#   r   r7   r\  r   r   r   r   r   r~  rZ  r  rX   r
   r[  r   r  rR   r  rY   rZ   r[   s   @rA   rb  rb    se   13MNH| :8;B11  E2!rs9''9 ),9 +,	9
 
+	+9 t 3  9 "))":?:K:K"]b]n]n".  E2 .215.204(,2659*.!%)-,0#'-.|
##d*|
 ''$.|
 t+	|

 &&-|
 |
 ((4/|
 ),d
|
   4'|
 $;|
  $;|
 #Tk|
 D[|
 ell*|
 +,|
  
-	-!|
  3  |
B   rC   rb  )r^  rz  rl  r  rb  )r7  )gr  collections.abcr   dataclassesr   typingr   r7   torch.nnr5   torch.nn.functionalr   r  /transformers.models.llama4.configuration_llama4r    r   rf  activationsr	   cache_utilsr
   r   
generationr   integrationsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r    utils.genericr!   r"   utils.output_capturingr#   configuration_llama4r%   r&   
get_loggerrT   loggerModuler(   r]   rn   r   rc   r   r   r   rX   r   r   r   r  r   r  r  r  rI  r^  rz  r  r  r  r  r  r  r  r  r  r  r	  r  r'  rh  rl  rb  __all__rU  rC   rA   <module>r     s    $ !      N & ! . ) 7 K B 9  G & j j G 5 @ 
		H	%		 B)BII )$!uxx !=		 =(,299 , _-"BII " .",?		 ?D	2	2	2 ||	2 5<<%&		2	UU\\ 	U# 	U%,, 	U( %II%<<% 
% <<	%
 LL4'% % %B %II%<<% 
% <<	%
 LL4'% % %2Y)")) Y)x/7 /d LO L L< [
+ [
 [
|L
- L
^ 
 9; 9 90;uxx ;"		 (
)")) 
)!ELL ! !:<<:	: ll: 5<<%&	:7)BII 7)tbii )9 )XO
")) O
dbii (6")) 6<G
- G
To%:O odrC   