
    Z j~                     l   S SK Jr  S SKJr  S SKJr  S SKrS SKJr  SSKJ	r
  SSKJr  SS	KJrJr  SS
KJr  SSKJrJrJr  SSKJr  SSKJr  SSKJr  SSKJrJrJrJ r   SSK!J"r"J#r#  SSK$J%r%J&r&  SSK'J(r(  SSK)J*r*J+r+J,r,J-r-  SSK.J/r/J0r0  SSK1J2r2  SSK3J4r4  SSK5J6r6J7r7  \" S5       " S S\Rp                  5      5       r9 " S S\Rp                  5      r: " S S \Rp                  5      r; " S! S"\Rp                  5      r< " S# S$\Rp                  5      r=S% r> " S& S'\Rp                  5      r? " S( S)\Rp                  5      r@ " S* S+\Rp                  5      rAS, rB\" S-5      SXS. j5       rCS/\R                  S0\ES1\R                  4S2 jrF SYS3\Rp                  S4\R                  S5\R                  S6\R                  S7\R                  S-  S8\GS9\GS:\(\*   4S; jjrH\" \C5       " S< S=\Rp                  5      5       rI " S> S?\5      rJ\+ " S@ SA\&5      5       rK\+ " SB SC\&5      5       rL " SD SE\Rp                  5      rM\+ " SF SG\K5      5       rN\+ " SH SI\K\5      5       rO\+" SJSK9\ " SL SM\ 5      5       5       rP\\+" SNSK9 " SO SP\5      5       5       rQ\+" SQSK9 " SR SS\L5      5       rR\+" STSK9 " SU SV\L\5      5       rS/ SWQrTg)Z    )Callable)	dataclass)OptionalN)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)maybe_autocastmerge_with_config_defaults)capture_outputs   )	AutoModel   )
AriaConfigAriaTextConfigRMSNormc                   x   ^  \ rS rSrS
S\SS4U 4S jjjrS\R                  S\R                  4S jrS r	S	r
U =r$ )AriaTextRMSNorm3   epsreturnNc                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z.
AriaTextRMSNorm is equivalent to T5LayerNorm
N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizer,   	__class__s      w/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/aria/modeling_aria.pyr0   AriaTextRMSNorm.__init__5   s/     	ll5::k#:; #    hidden_statesc                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nr#   T)keepdim)	dtypetor2   float32powmeanrsqrtr5   r4   )r6   r<   input_dtypevariances       r9   forwardAriaTextRMSNorm.forward=   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r;   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler4   shaper5   r6   s    r9   
extra_reprAriaTextRMSNorm.extra_reprD   s*    ))*+6$2G2G1HIIr;   )r5   r4   )gư>)__name__
__module____qualname____firstlineno__floatr0   r2   TensorrH   rN   __static_attributes____classcell__r8   s   @r9   r*   r*   3   sB    $ $$ $ $;U\\ ;ell ;J Jr;   r*   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )AriaProjectorMLPH   z
Feed-Forward Network module for the Aria Projector.

Args:
    in_features (`int`):
        Input embedding dimension.
    hidden_features (`int`):
        Hidden dimension of the feed-forward network.
    output_dim (`int`):
        Output dimension.
c                    > [         TU ]  5         [        R                  " XSS9U l        [        R                  " X#SS9U l        [        S   U l        g )NFbiasgelu_new)r/   r0   r   Linear	linear_in
linear_outr	   act)r6   in_featureshidden_features
output_dimr8   s       r9   r0   AriaProjectorMLP.__init__U   s>    ;eL))OeL*%r;   c                 h    U R                  U R                  U5      5      nU R                  U5      nU$ Nrc   ra   rb   )r6   r<   s     r9   rH   AriaProjectorMLP.forward[   s-    !>?6r;   rj   	rP   rQ   rR   rS   __doc__r0   rH   rV   rW   rX   s   @r9   rZ   rZ   H   s    
& r;   rZ   c                   F   ^  \ rS rSrSrSS\S\4U 4S jjjrS	S jrSr	U =r
$ )
AriaCrossAttentiona   zb
Aria Cross-Attention module.

Args:
    config (`AriaConfig`):
        The configuration to use.
configdropout_ratec                 .  > [         TU ]  5         UR                  R                  nUR                  R                  nX@l        [        R                  " X3SS9U l        [        R                  " X3SS9U l	        [        R                  " X3SS9U l
        [        R                  " X4SS9U l        [        R                  " X35      U l        [        R                  " U5      U l        [        R                   " U5      U l        [        R                   " U5      U l        g )NFr]   T)batch_first)r/   r0   vision_configr7   num_attention_heads	num_headsr   r`   q_projk_projv_projMultiheadAttentionmultihead_attnlinearDropoutdropout	LayerNorm
layer_normlayer_norm_kv)r6   rq   rr   r7   rw   r8   s        r9   r0   AriaCrossAttention.__init__j   s    **66((<<	"iiuEiiuEiiuE !33KX\]ii9zz,/,,{3\\+6r;   c                    U R                  U R                  U5      5      nU R                  U5      nU R                  U5      nU R	                  U5      nU R                  XEXcS9u  pxU R                  U R                  U5      5      nU$ )ai  
Forward pass of the AriaCrossAttention module.

Args:
    key_value_states (`torch.Tensor`):
        Input tensor for key and value.
    hidden_states (`torch.Tensor`):
        Input tensor for query.
    attn_mask (`torch.Tensor`, *optional*, defaults to None):
        Attention mask.

Returns:
    torch.Tensor:
        Output tensor after cross-attention.
	attn_mask)rx   r   r   ry   rz   r|   r   r}   )	r6   key_value_statesr<   r   querykeyvalueattn_output_s	            r9   rH   AriaCrossAttention.forward{   s      DOOM:;--.>?kk*+,-,,U,Tll4;;{#;<r;   )	r   ry   r   r   r}   r|   rw   rx   rz   )r   ri   )rP   rQ   rR   rS   rm   r&   rT   r0   rH   rV   rW   rX   s   @r9   ro   ro   a   s*    7z 7 7 7" r;   ro   c                   x   ^  \ rS rSrSrS\4U 4S jjrS
S\R                  S\R                  S-  4S jjr	S	r
U =r$ )AriaProjector   z
Aria Projector module.

This module projects vision features into the language model's embedding space, enabling interaction between vision and language components.

Args:
    config (`AriaConfig`):
        Configuration object for the model.
rq   c                   > [         TU ]  5         UR                  U l        UR                  R
                  U l        UR                  R                  U l        UR                  R
                  U l	        UR                  R
                  U l        UR                  R
                  U l        [        R                  " [        R                   " UR"                  U R                  5      5      U l        ['        U5      U l        [        R*                  " U R                  5      U l        [/        U R                  U R                  U R                  5      U l        g ri   )r/   r0   projector_patch_to_query_dictpatch_to_query_dictru   r7   rd   rv   rw   kv_dimtext_configre   rf   r   r1   r2   zeros'max_value_projector_patch_to_query_dictr   ro   
cross_attnr   r   rZ   feed_forwardr6   rq   r8   s     r9   r0   AriaProjector.__init__   s     	#)#G#G !//;;--AA**66%11== ,,88\\%++f.\.\^b^n^n"op
,V4,,t'7'78,T-=-=t?S?SUYUdUder;   Nr   r   c                 .   UR                   S   UR                   S   pCX@R                  ;  a*  [        SU SU R                  R                  5        S35      eU R                  U   nU R                  SU R                  S5      R                  USS5      nUbM  UR                  U R                  S5      nUR                  S5      R                  SUR                  S5      S5      nU R                  XUS9nU R                  U R                  U5      5      nU$ )	aH  
Forward pass of the Projector module.

Args:
    key_value_states (`torch.Tensor`):
        Input tensor of shape (batch_size, num_patches, kv_dim).
    attn_mask (`torch.Tensor`, *optional*, default is None):
        Attention mask.

Returns:
    `torch.Tensor`: Output tensor of shape (batch_size, query_number, output_dim).
r   r%   zNumber of patches z: not found in patch_to_query_dict amongst possible values .Nr>   r   )rL   r   KeyErrorkeysr   	unsqueezerepeatrepeat_interleaverw   expandsizer   r   r   )	r6   r   r   
batch_sizenum_patches	query_numqueriesattention_outouts	            r9   rH   AriaProjector.forward   s0    #3"8"8";=M=S=STU=VK666$[M1klp  mE  mE  mJ  mJ  mL  lM  MN  O  ,,[9	**Zi(2215<<ZAN !33DNNAFI!++A.55b',,q/2NI(8YW >?
r;   )
r   r   re   rd   r   r   rw   rf   r   r   ri   )rP   rQ   rR   rS   rm   r&   r0   r2   rU   rH   rV   rW   rX   s   @r9   r   r      s=    ff( PTAT  r;   r   c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ )AriaSharedExpertsMLP   a  
Shared Expert MLP for shared experts.

Unlike routed experts, shared experts process all tokens without routing.
This class reconfigures the intermediate size in comparison to the LlamaMLP.

Args:
    config (`AriaTextConfig`): Configuration object for the Aria language model.
rq   c                   > [         TU ]  5         Xl        UR                  U l        UR                  UR
                  -  U l        [        R                  " U R                  U R                  UR                  S9U l	        [        R                  " U R                  U R                  UR                  S9U l
        [        R                  " U R                  U R                  UR                  S9U l        [        UR                     U l        g )Nr]   )r/   r0   rq   r7   intermediate_sizemoe_num_shared_expertsr   r`   mlp_bias	gate_projup_proj	down_projr	   
hidden_actact_fnr   s     r9   r0   AriaSharedExpertsMLP.__init__   s    !--!'!9!9F<Y<Y!Y4#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r;   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ ri   )r   r   r   r   )r6   xr   s      r9   rH   AriaSharedExpertsMLP.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	r;   )r   rq   r   r   r7   r   r   )
rP   rQ   rR   rS   rm   r'   r0   rH   rV   rW   rX   s   @r9   r   r      s    0~ 0 r;   r   c                    U R                   S   nUR                   S   n[        R                  " X4U R                  U R                  S9n[        R
                  " USS9n[        R                  " S[        R                  UR                  S9n[        R                  " Xv45      n[        UR                   S   5       H.  nXh   n	XhS-      n
X	U
 n[        R                  " XU   5      nXX& M0     U$ )a
  
Compute the matrix multiplication (GEMM) for each expert sequentially. This approach is computationally inefficient, especially when dealing with a large number of experts.

Args:
    token_states (torch.Tensor): Input tensor of shape (num_tokens, in_features).
    expert_weights (torch.Tensor): Weight tensor of shape (num_experts, in_features, out_features).
    tokens_per_expert (torch.Tensor): Number of tokens assigned to each expert.

Returns:
    torch.Tensor: Output tensor of shape (num_tokens, out_features).
r   r>   r@   devicedimr%   )
rL   r2   r   r@   r   cumsumlongcatrangematmul)token_statesexpert_weightstokens_per_expert
num_tokensout_featuresoutputcumsum_num_tokenszero_tensor
expert_numstartendtokensr   s                r9   sequential_experts_gemmr      s     ##A&J!''+L[[9K9KT`TgTghF%6A>++auzz:K:R:RSK		;"BCN0034
!-Q/C(ll6*#=>u 5 Mr;   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )AriaGroupedExpertsGemmi  a  
Grouped GEMM (General Matrix Multiplication) module for efficient expert computation.
This module utilizes the grouped_gemm library (https://github.com/fanshiqing/grouped_gemm)
for optimized performance. If the grouped_gemm library is not installed, it gracefully
falls back to a sequential GEMM implementation, which may be slower but ensures
functionality.

Args:
    in_features (`int`):
        Number of input features.
    out_features (`int`):
        Number of output features.
    groups (`int`):
        Number of expert groups.
c                    > [         TU ]  5         Xl        X l        X0l        [
        R                  " [        R                  " X1U5      5      U l	        g ri   )
r/   r0   rd   r   groupsr   r1   r2   emptyr4   )r6   rd   r   r   r8   s       r9   r0   AriaGroupedExpertsGemm.__init__#  s:    &(ll5;;vL#QRr;   c                 L    [        UU R                  UR                  5       5      $ )a-  
Perform grouped matrix multiplication.

Args:
    input (`torch.Tensor`):
        Input tensor of shape (num_tokens, in_features).
    tokens_per_expert (`torch.Tensor`):
        Number of tokens assigned to each expert.

Returns:
    torch.Tensor: Output tensor of shape (num_tokens, out_features).
)r   r4   cpu)r6   inputr   s      r9   rH   AriaGroupedExpertsGemm.forward*  s'     'KK!!#
 	
r;   )r   rd   r   r4   rl   rX   s   @r9   r   r     s     S
 
r;   r   c                   \   ^  \ rS rSrS\SS4U 4S jjrS rS\R                  4S jr	Sr
U =r$ )	AriaExpertsi>  rq   r-   Nc                    > [         TU ]  5         Xl        [        UR                  UR
                  S-  UR                  5      U l        [        UR
                  UR                  UR                  5      U l        g )Nr#   )	r/   r0   rq   r   r7   r   moe_num_expertsfc1fc2r   s     r9   r0   AriaExperts.__init__?  s_    )&*<*<f>V>VYZ>Z\b\r\rs)&*B*BFDVDVX^XnXnor;   c                     [         R                  " XR                  R                  SS9u  p#[        R
                  R                  USS9nX44$ )Nr%   )kr   r>   r   )r2   topkrq   moe_topkr   
functionalsoftmax)r6   router_logits
top_logitstop_indicesscoress        r9   route_tokens_to_experts#AriaExperts.route_tokens_to_expertsE  sB    "'**]kk>R>RXY"Z
&&zr&:""r;   c                 $   U R                  U5      u  p4UR                  n[        R                  " UR	                  5       R                  [        R                  5      U R                  R                  SU R                  R                  S-
  S9R                  U5      nUnUR                  S5      n[        R                  " U5      n	UR                  SXR                  R                  -  5      n
U R                  X5      n[        R                  " USSS9u  p[        R                   R#                  U5      U-  nU R%                  X5      n[        R&                  " UR(                  S   U R                  R                  -  UR+                  S5      4UR                  UR,                  S9nUR/                  SX5        UR                  SU R                  R                  UR+                  S5      5      nXR1                  S5      -  R3                  SS9nU$ )Nr   r%   )binsminmaxr>   r#   r   r   )r   r@   r2   histcflattenrA   rB   rq   r   viewargsortindex_selectr   r   chunkr   r   silur   r   rL   r   r   index_copy_r   sum)r6   r<   r   top_k_indextop_k_weightsoriginal_dtyper   indicesflatten_indicessorted_indicespermuted_tokens
fc1_output
projectiongateexpert_outputunpermuted_tokensr   s                    r9   rH   AriaExperts.forwardJ  s   %)%A%A-%P"$**!KK!$$U]]3,,++a/	

 "^
 	 !,,r*7'44Q++J^J^8^_XXoA
 ;;z1"=
]]''
3d:
?!KK  #dkk&:&::M<N<Nq<QR%% ''

 	%%aG-222t{{7K7K]M_M_`aMbc#&=&=b&AAFF1FMr;   )rq   r   r   )rP   rQ   rR   rS   r'   r0   r   r2   rU   rH   rV   rW   rX   s   @r9   r   r   >  s3    p~ p$ p#
u||  r;   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )AriaTextMoELayerij  rq   c                    > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        U5      U l        [        U5      U l
        Xl        g NFr]   )r/   r0   r   r`   r7   r   routerr   expertsr   shared_expertsrq   r   s     r9   r0   AriaTextMoELayer.__init__k  sM    ii 2 2F4J4JQVW"6*26:r;   r<   r-   c                    UR                   nUR                  SUR                  S5      5      nU R                  U5      nU R	                  X5      R                  U5      nU R                  UR                  U5      5      nXE-   $ Nr>   )rL   r   r   r  r  r  )r6   r<   original_shaper   r  shared_expert_outputs         r9   rH   AriaTextMoELayer.forwardr  sv    &,,%**2}/A/A"/EFM2]BGGW#22=3E3En3UV33r;   )rq   r  r  r  )rP   rQ   rR   rS   r'   r0   r2   rU   rH   rV   rW   rX   s   @r9   r  r  j  s/    ~ 4U\\ 4ell 4 4r;   r  c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr>   r#   r   )rL   r2   r   )r   x1x2s      r9   rotate_halfr  {  sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r;   rotary_pos_embc                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXV4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)r   r  )qr   cossinunsqueeze_dimq_embedk_embeds          r9   apply_rotary_pos_embr#    sS    & --
&C
--
&Cw;q>C/0Gw;q>C/0Gr;   r<   n_repr-   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r%   N)rL   r   reshape)r<   r$  batchnum_key_value_headsslenhead_dims         r9   	repeat_kvr+    s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr;   moduler   r   r   attention_maskscalingr   kwargsc                    [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub  X-   n
[
        R                  R                  U
S[        R                  S9R                  UR                  5      n
[
        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr#   r   r>   )r   r@   )ptrainingr%   )r+  num_key_value_groupsr2   r   	transposer   r   r   rB   rA   r@   r   r2  
contiguous)r,  r   r   r   r-  r.  r   r/  
key_statesvalue_statesattn_weightsr   s               r9   eager_attention_forwardr9    s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r;   c                     ^  \ rS rSrSrS\S\4U 4S jjr   SS\R                  S\
\R                  \R                  4   S-  S	\R                  S-  S
\S-  S\\   S\
\R                  \R                  4   4S jjrSrU =r$ )AriaTextAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paperrq   	layer_idxc                 P  > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        U R                  S-  U l
        UR                  U l        SU l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR                  U R                  -  UR
                  UR                  S9U l        g )Nr*  g      Tr]   )r/   r0   rq   r<  getattrr7   rv   r*  r(  r3  r.  attention_dropout	is_causalr   r`   attention_biasrx   ry   rz   o_projr6   rq   r<  r8   s      r9   r0   AriaTextAttention.__init__  sI   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r;   Nr<   position_embeddingsr-  past_key_valuesr/  r-   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [        5      nU" U UU	U
U4U R                  (       d  SOU R                   U R"                  S.UD6u  pUR$                  " / UQSP76 R'                  5       nU R)                  U5      nX4$ )Nr>   r%   r#           )r   r.  )rL   r*  rx   r   r4  ry   rz   r#  updater<  r   get_interfacerq   _attn_implementationr9  r2  r?  r.  r&  r5  rB  )r6   r<   rE  r-  rF  r/  input_shapehidden_shapequery_statesr6  r7  r  r  attention_interfacer   r8  s                   r9   rH   AriaTextAttention.forward  s~    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFHkk+.((r;   )r?  rq   r*  r@  ry   r<  r3  rB  rx   r.  rz   NNN)rP   rQ   rR   rS   rm   r'   intr0   r2   rU   rK   r
   r   r   rH   rV   rW   rX   s   @r9   r;  r;    s    G
~ 
# 
4 IM.2(,&)||&) #5<<#=>E&) t+	&)
 &) +,&) 
u||U\\)	*&) &)r;   r;  c                     ^  \ rS rSrSrS\S\4U 4S jjr     SS\R                  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\\R                  \R                  4   S-  S\\   S\R                  4S jjrSrU =r$ )AriaTextDecoderLayeri  aG  
Aria Text Decoder Layer.

This class defines a single decoder layer in the language model, incorporating self-attention and Mixture of Experts (MoE) feed-forward network.

Args:
    config (`AriaTextConfig`):
        Configuration object for the text component of the model.
    layer_idx (`int`):
        Index of the layer.
rq   r<  c                   > [         TU ]  5         UR                  U l        [        XS9U l        [        U5      U l        [        UR                  UR                  S9U l	        [        UR                  UR                  S9U l
        g )N)rq   r<  r,   )r/   r0   r7   r;  	self_attnr  mlpr*   rms_norm_epsinput_layernormpost_attention_layernormrC  s      r9   r0   AriaTextDecoderLayer.__init__  sk    !--*&N#F+.v/A/AvGZGZ[(78J8JPVPcPc(d%r;   Nr<   r-  position_idsrF  	use_cacherE  r/  r-   c           
          UnU R                  U5      nU R                  " SUUUUUUS.UD6u  pX-   nUnU R                  U5      nU R                  U5      nX-   nU$ )N)r<   r-  r]  rF  r^  rE   )rZ  rW  r[  rX  )
r6   r<   r-  r]  rF  r^  rE  r/  residualr   s
             r9   rH   AriaTextDecoderLayer.forward  s     !,,];>> 
')%+ 3
 
 !0 !55mD/ 0r;   )r7   rZ  rX  r[  rW  )NNNFN)rP   rQ   rR   rS   rm   r'   rR  r0   r2   rU   
LongTensorr
   boolrK   r   r   rH   rV   rW   rX   s   @r9   rT  rT    s    
e~ e# e /304(,!&HL|| t+ &&-	
  $; #5<<#=>E +, 
 r;   rT  c                      ^  \ rS rSr% \\S'   SrSrSS/rSr	Sr
SrSrSr\\S	.r\R$                  " 5       U 4S
 j5       rSrU =r$ )AriaTextPreTrainedModeli;  rq   model)imagetextrT  r   TrF  r<   
attentionsc                    > [         TU ]  U5        [        U[        5      (       a5  [        R
                  " UR                  SU R                  R                  S9  g g )NrH  )rD   std)	r/   _init_weights
isinstancer   initnormal_r4   rq   initializer_ranger6   r,  r8   s     r9   rn  %AriaTextPreTrainedModel._init_weightsL  sA    f%f455LLSdkk6S6ST 6r;   r`  )rP   rQ   rR   rS   r'   __annotations__base_model_prefixinput_modalities_no_split_modulessupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_attention_backendrT  r;  _can_record_outputsr2   no_gradrn  rV   rW   rX   s   @r9   rf  rf  ;  sh    (/1IJ&*#"3N"&-'
 ]]_U Ur;   rf  c                      ^  \ rS rSr% \\S'   SrSrS/rS/r	Sr
SrSrSrSr\\S.r\R&                  " 5       U 4S	 j5       rS
rU =r$ )AriaPreTrainedModeliS  rq   rg  TAriaDecoderLayerrF  Frj  c                    > [         TU ]  U5        [        U[        5      (       a4  [        R
                  " UR                  U R                  R                  S9  g g )N)rm  )	r/   rn  ro  r   rp  trunc_normal_r   rq   rr  rs  s     r9   rn  !AriaPreTrainedModel._init_weightsd  s@    f%fm,,v||1N1NO -r;   r`  )rP   rQ   rR   rS   r&   ru  rv  ry  rx  rz  r{  r|  _supports_flex_attn_can_compile_fullgraphr}  rT  r;  r~  r2   r  rn  rV   rW   rX   s   @r9   r  r  S  sn    &*#+,#4"5N""&-'
 ]]_P Pr;   r  c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\	   SS\S-  S\
S   S\S-  S	\S
\4   4S jj5       r\R                  " 5       \S 5       5       rSrU =r$ )AriaTextRotaryEmbeddingik  inv_freqNrq   c                   > [         TU ]  5         UR                  U l        UR                  U l        Xl        U R
                  R                  S   U l        U R                  nU R                  S:w  a  [        U R                     nU" U R
                  U5      u  o@l
        U R                  SUSS9  U R                  SUR                  5       SS9  g )N	rope_typedefaultr  F)
persistentoriginal_inv_freq)r/   r0   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrq   rope_parametersr  compute_default_rope_parametersr   attention_scalingregister_bufferclone)r6   rq   r   rope_init_fnr  r8   s        r9   r0    AriaTextRotaryEmbedding.__init__n  s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L((ZeD0(..2BuUr;   r   ztorch.deviceseq_lenr-   ztorch.Tensorc           	         U R                   S   n[        U SS5      =(       d    U R                  U R                  -  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXe4$ )	aH  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetar*  Ng      ?r   r#   r@   )r   r@   )	r  r>  r7   rv   r2   arangeint64rA   rT   )rq   r   r  baser   attention_factorr  s          r9   r  7AriaTextRotaryEmbedding.compute_default_rope_parameters~  s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 ))r;   c                 L   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r>   r%   mpsr   F)device_typeenabledr#   r   r  )r  rT   r   rL   rA   r   ro  typestrr    r4  r2   r   r  r  r  r@   )
r6   r   r]  inv_freq_expandedposition_ids_expandedr  freqsembr  r  s
             r9   rH   AriaTextRotaryEmbedding.forward  sN    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   BF
F#)r  rq   r  r  r  ri   rQ  )rP   rQ   rR   rS   r2   rU   ru  r'   r0   staticmethodr   rR  rK   rT   r  r  r   rH   rV   rW   rX   s   @r9   r  r  k  s    llV~ V V  (,+/"*%*(* t* 
~u$	%	* *: ]]_<  <r;   r  c                     ^  \ rS rSrS\4U 4S jjr\\\      SS\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S	\	R                  S-  S
\S-  S\\   S\4S jj5       5       5       rSrU =r$ )AriaTextModeli  rq   c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        [#        US9U l        SU l        U R)                  5         g s  snf )NrV  rq   F)r/   r0   pad_token_idpadding_idx
vocab_sizer   	Embeddingr7   embed_tokens
ModuleListr   num_hidden_layersrT  layersr*   rY  normr  
rotary_embgradient_checkpointing	post_initrC  s      r9   r0   AriaTextModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammFKFLdLdFefFe!&4Fef
 $F$6$6F<O<OP	1@&+# 	 gs   C?N	input_idsr-  r]  rF  inputs_embedsr^  r/  r-   c           
      >   US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  [        U R                  S9nUcU  Ub  UR	                  5       OSn[
        R                  " UR                  S   UR                  S9U-   nUR                  S5      n[        U R                  UUUUS9n	Un
U R                  XS9nU R                  S U R                  R                    H  nU" U
4U	UUUUS.UD6n
M     U R                  U
5      n
[        U
US	9$ )
Nz:You must specify exactly one of input_ids or inputs_embedsr  r   r%   )r   )rq   r  r-  rF  r]  )r]  )r-  rE  r]  rF  r^  )last_hidden_staterF  )
ValueErrorr  r   rq   get_seq_lengthr2   r  rL   r   r   r   r  r  r  r  r   )r6   r  r-  r]  rF  r  r^  r/  past_seen_tokenscausal_maskr<   rE  decoder_layers                r9   rH   AriaTextModel.forward  sF    -t";<YZZ *.*;*;I*FM0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 &"oomoW![[)H4;;+H+HIM)*$7) /# M J 		-0&++
 	
r;   )r  r  r  r  r  r  r  )NNNNNN)rP   rQ   rR   rS   r'   r0   r!   r"   r   r2   rc  rU   r
   FloatTensorrd  r   r   r   rH   rV   rW   rX   s   @r9   r  r    s    ~     .2.204(,26!%2
##d*2
 t+2
 &&-	2

 2
 ((4/2
 $;2
 +,2
 
!2
    2
r;   r  c                   N  ^  \ rS rSrSS0rSS0rSS/S/40rS\4U 4S	 jjr\	        SS\
R                  S
-  S\
R                  S
-  S\
R                  S
-  S\S
-  S\
R                  S
-  S\
R                  S
-  S\S
-  S\\
R                  -  S\\   S\4S jj5       rSrU =r$ )AriaTextForCausalLMi  lm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr<   logitsrq   c                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g r  )
r/   r0   r  rg  r  r   r`   r7   r  r  r   s     r9   r0   AriaTextForCausalLM.__init__  sU     "6*
 ++yy!3!3V5F5FUS 	r;   Nr  r-  r]  rF  r  labelsr^  logits_to_keepr/  r-   c	           
      |   U R                   " SUUUUUUS.U	D6n
U
R                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nSnUb)  U R                  " SXU R                  R                  S.U	D6n[        UUU
R                  U
R                  U
R                  S9$ )a  
Example:

```python
>>> from transformers import AutoTokenizer, AriaTextForCausalLM

>>> model = AriaTextForCausalLM.from_pretrained("meta-aria_text/AriaText-2-7b-hf")
>>> tokenizer = AutoTokenizer.from_pretrained("meta-aria_text/AriaText-2-7b-hf")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```)r  r-  r]  rF  r  r^  Nr  r  r  lossr  rF  r<   rk  r`  )rg  r  ro  rR  slicer  loss_functionrq   r  r   rF  r<   rk  )r6   r  r-  r]  rF  r  r  r^  r  r/  outputsr<   slice_indicesr  r  s                  r9   rH   AriaTextForCausalLM.forward  s    < ,0:: ,
)%+',
 ,
  118B>SV8W8W~ot4]kmA}a,?@A%%pVt{{OeOepiopD%#33!//))
 	
r;   )r  rg  r  )NNNNNNNr   )rP   rQ   rR   rS   _tied_weights_keys_tp_plan_pp_planr'   r0   r   r2   rc  rU   r
   r  rd  rR  r   r   r   rH   rV   rW   rX   s   @r9   r  r    s   *,GH23H_-z:;H~   .2.204(,26*.!%-.6
##d*6
 t+6
 &&-	6

 6
 ((4/6
   4'6
 $;6
 ell*6
 +,6
 
 6
 6
r;   r  zP
    Base class for Aria causal language model (or autoregressive) outputs.
    custom_introc                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S	'   S
rg)AriaCausalLMOutputWithPasti?  a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nr  r  rF  r<   rk  image_hidden_statesr`  )rP   rQ   rR   rS   rm   r  r2   r  ru  r  rF  r
   r<   rK   rk  r  rV   r`  r;   r9   r  r  ?  s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r;   r  zI
    Base class for Aria outputs, with hidden states and attentions.
    c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)AriaModelOutputWithPasti]  a  
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nr  r`  )
rP   rQ   rR   rS   rm   r  r2   r  ru  rV   r`  r;   r9   r  r  ]  s    	 59**T18r;   r  zt
    The Aria model which consists of a vision backbone and a language model, without a language modeling head.
    c                   @  ^  \ rS rSrS\4U 4S jjrS rS r\\	\
" SS9   SS	\R                  S
\R                  S-  S\\\   -  S\S-  S\\   S\\-  4S jj5       5       5       rS\R*                  S\R                  S\R                  4S jr\	\
        SS\R*                  S-  S	\R                  S-  S
\R*                  S-  S\R.                  S-  S\R*                  S-  S\S-  S\R                  S-  S\S-  S\\   S\\-  4S jj5       5       rS rSrU =r$ )	AriaModelir  rq   c                    > [         TU ]  U5        [        R                  " UR                  5      U l        [        U5      U l        [        R                  " UR                  5      U l	        U R                  5         g ri   )r/   r0   r$   from_configru   vision_towerr   multi_modal_projectorr   language_modelr  r   s     r9   r0   AriaModel.__init__x  sY     %11&2F2FG%26%:"'33F4F4FGr;   c                 6    U R                   R                  5       $ ri   )r  get_input_embeddingsrM   s    r9   r  AriaModel.get_input_embeddings  s    ""7799r;   c                 :    U R                   R                  U5        g ri   )r  set_input_embeddingsr6   r   s     r9   r  AriaModel.set_input_embeddings  s    007r;   zWObtains image last hidden states from the vision tower and apply multimodal projection.r  Npixel_values
pixel_maskvision_feature_layeroutput_hidden_statesr/  r-   c                     U R                  U5      nU R                  " U4USSS.UD6nS nUb'  UR                  S5      n	[        R                  " U	5      nUR
                  U   n
U R                  XS9Ul        U$ )NT)patch_attention_maskr  return_dictr%   r   )_create_patch_attention_maskr  r   r2   logical_notr<   r  pooler_output)r6   r  r  r  r  r/  r  image_outputsimage_attn_maskflattened_maskselected_image_features              r9   get_image_featuresAriaModel.get_image_features  s      $@@L))
!5!%	

 
 +199!<N#//?O!.!<!<=Q!R&*&@&@AW&@&s#r;   r  r  image_featuresc           	      F   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S   UR                  S   -  nUR                  S5      R                  U5      R                  UR                  5      n[        X$   R                  5       UR                  5       :H  SU SU 35        U$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
r   r>   r   r%   z6Image features and image tokens do not match, tokens: z, features: )r  r2   tensorrq   image_token_idr   r   allr   rL   r   	expand_asrA   r   numel)r6   r  r  r  special_image_maskn_image_tokensn_image_featuress          r9   get_placeholder_maskAriaModel.get_placeholder_mask  s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno-3359M9M9OOD^DTT`aq`rs	
 "!r;   r-  r]  rF  r^  c	           	         Uc  U R                  5       " U5      nUb  UR                  S   S:w  aw  U R                  UUU R                  R                  SS9R
                  n
U
R                  UR                  UR                  5      n
U R                  XU
S9nUR                  X5      nU R                  " SUUUUUS.U	D6n[        UR                  U(       a  UR                  OS UR                  UR                   Ub  W
S9$ S S9$ )Nr%   T)r  r  r  r  )r  r  )r-  r]  rF  r  r^  )r  rF  r<   rk  r  r`  )r  rL   r  rq   r  r   rA   r   r@   r  masked_scatterr  r  r  rF  r<   rk  )r6   r  r  r  r-  r]  rF  r  r^  r/  r  r  r  s                r9   rH   AriaModel.forward  s8      557	BM #(;(;A(>!(C!44)%%)[[%E%E 	 5 
 m  ,..}/C/C]EXEXYN!%!:!:~ "; " *889K\M%% 
)%+'
 
 '%777@G33d!//))2>2J
 	

 QU
 	
r;   c                 ~   Uc  g UR                  SU R                  R                  R                  U R                  R                  R                  S9nUR                  SU R                  R                  R                  U R                  R                  R                  S9nUR	                  SS9S:  R                  5       $ )Nr%   )	dimensionr   stepr#   )r>   r   r   )unfoldr  rq   
patch_sizer   rd  )r6   r  patches_subgrids      r9   r  &AriaModel._create_patch_attention_mask  s    $++""))44""))44 , 

 *00""))44""))44 1 

  ###1A5;;==r;   )r  r  r  )Nr>   N)NNNNNNNN)rP   rQ   rR   rS   r&   r0   r  r  r!   r   r   r2   r  rR  listrd  r   r   rK   r   r  rc  r  rU   r
   r   r  rH   r  rV   rW   rX   s   @r9   r  r  r  s   z :8  n 0402,0'' %%, "DIo	
 #Tk +, 
+	+   
4"))":?:K:K"]b]n]n"0  .215.2.204(,26!%,
##d*,
 ''$.,
 $$t+	,

 t+,
 &&-,
 ,
 ((4/,
 $;,
 -.,
 
(	(,
  ,
\> >r;   r  z
    Aria model for conditional generation tasks.

    This model combines a vision tower, a multi-modal projector, and a language model
    to perform tasks that involve both image and text inputs.
    c                   H  ^  \ rS rSrSS0rS\4U 4S jjrS rS rS\	R                  4S	 jr\  SS\R                  S\R                  S
-  S\\\   -  S\\   S\\-  4
S jj5       r\\          SS\R.                  S
-  S\R                  S
-  S\R.                  S
-  S\R0                  S
-  S\R.                  S
-  S\S
-  S\R                  S
-  S\R.                  S
-  S\S
-  S\\R0                  -  S\\   S\\-  4S jj5       5       r       SU 4S jjrSrU =r$ )AriaForConditionalGenerationi  r  z(model.language_model.embed_tokens.weightrq   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g r  )r/   r0   r  rg  r   r`   r   r7   r  r  r  r   s     r9   r0   %AriaForConditionalGeneration.__init__  sS     v&
yy!3!3!?!?ASASA^A^ejkr;   c                 6    U R                   R                  5       $ ri   )rg  r  rM   s    r9   r  1AriaForConditionalGeneration.get_input_embeddings  s    zz..00r;   c                 :    U R                   R                  U5        g ri   )rg  r  r  s     r9   r  1AriaForConditionalGeneration.set_input_embeddings  s    

''.r;   r-   c                     U R                   $ ri   )r  rM   s    r9   get_output_embeddings2AriaForConditionalGeneration.get_output_embeddings  s    ||r;   Nr  r  r  r/  c                 B    U R                   R                  " SUUUS.UD6$ )N)r  r  r  r`  )rg  r  )r6   r  r  r  r/  s        r9   r  /AriaForConditionalGeneration.get_image_features  s3     zz,, 
%!!5
 	
 	
r;   r  r-  r]  rF  r  r  r^  r  c                    U R                   " SUUUUUUUU	S.UD6nUS   n[        U
[        5      (       a  [        U
* S5      OU
nU R	                  USS2USS24   5      nSnUb3  U R
                  " SXU R                  R                  R                  S.UD6n[        UUUR                  UR                  UR                  S9$ )a	  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `AriaForConditionalGeneration`).
    Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only
    computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> import httpx
>>> from io import BytesIO
>>> import torch
>>> from PIL import Image
>>> from io import BytesIO

>>> from transformers import AutoProcessor, AutoModel
>>> from transformers.image_utils import load_image

>>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
>>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
>>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
>>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")

>>> processor = AutoProcessor.from_pretrained("Rhymes-AI/Aria")
>>> model = AutoModel.from_pretrained("Rhymes-AI/Aria", dtype=torch.bfloat16, device_map="auto")

>>> # Create inputs
>>> messages = [
...     {
...         "role": "user",
...         "content": [
...             {"type": "image"},
...             {"type": "text", "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."},
...             {"type": "image"},
...             {"type": "text", "text": "What can we see in this image?"},
...         ]
...     },
...     {
...         "role": "user",
...         "content": [
...             {"type": "image"},
...             {"type": "text", "text": "In which city is that bridge located?"},
...         ]
...     }
... ]

>>> prompts = [processor.apply_chat_template([message], add_generation_prompt=True) for message in messages]
>>> images = [[image1, image2], [image3]]
>>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(model.device)

>>> # Generate
>>> generated_ids = model.generate(**inputs, max_new_tokens=256)
>>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

>>> print(generated_texts[0])
Assistant: There are buildings, trees, lights, and water visible in this image.

>>> print(generated_texts[1])
Assistant: The bridge is in San Francisco.
```)r  r  r  r-  r]  rF  r  r^  r   Nr  r  r`  )rg  ro  rR  r  r  r  rq   r   r  r  rF  r<   rk  )r6   r  r  r  r-  r]  rF  r  r  r^  r  r/  r  r<   r  r  r  s                    r9   rH   $AriaForConditionalGeneration.forward&  s    Z ** 

%!)%+'

 

  
8B>SV8W8W~ot4]kmA}a,?@A%% 9P9P9[9[_eD *#33!//))
 	
r;   c	           	         > [         TU ]  " U4UUUUUS.U	D6n
U(       d  U	R                  SS5      (       d  XJS'   XZS'   U
$ )N)rF  r  r-  r  is_first_iterationr^  Tr  r  )r/   prepare_inputs_for_generationget)r6   r  rF  r  r  r  r-  r  r/  r/  model_inputsr8   s              r9   r0  :AriaForConditionalGeneration.prepare_inputs_for_generation  sb     w<
+'))1
 
 VZZT%B%B
 ,8()3&r;   )r  rg  r  )
NNNNNNNNNr   )NNNNNNF) rP   rQ   rR   rS   r  r&   r0   r  r  r   Moduler(  r   r2   r  rR  r  r   r   rK   r   r  r   rc  rU   r
   rd  r  rH   r0  rV   rW   rX   s   @r9   r   r     s    +,VWz 1/ryy   0402	
''
 %%,
 "DIo	

 +,
 
+	+
 
  .215.2.204(,26*.!%-.h
##d*h
 ''$.h
 $$t+	h

 t+h
 &&-h
 h
 ((4/h
   4'h
 $;h
 ell*h
 +,h
 
+	+h
  h
Z   r;   r   )r   r  rf  r  r  r  )r%   )rH  )Ucollections.abcr   dataclassesr   typingr   r2   r    r   rp  activationsr	   cache_utilsr
   r   
generationr   integrationsr   r   r   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr    r!   utils.output_capturingr"   autor$   configuration_ariar&   r'   r4  r*   rZ   ro   r   r   r   r   r   r  r  r#  rU   rR  r+  rT   r9  r;  rT  rf  r  r  r  r  r  r  r  r   __all__r`  r;   r9   <module>rJ     s3  ( % !    & ! . ) f f / B 9  L F & a a G 5  : Y'Jbii J (J(ryy 24 4n>BII >B299 4>)
RYY )
X)")) )X4ryy 4"( *+ ,2	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%2 )*@)		 @) +@)F35 3l Uo U U. P/ P P.><bii ><B F
+ F
 F
R E
1? E
 E
P 
 9 9 90 
95 9 9 
C># C>
C>L k#6 kk\r;   