
    Z j_                        S SK r S SKJr  S SK Jr  S SKJr  SSKJr	  SSK
Jr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJr  SSKJr  SSKJrJrJrJrJr  SSKJr  SSK J!r!  SSK"J#r#  SSK$J%r%J&r&J'r'J(r(J)r)  SSK*J+r+J,r,  SSK-J.r.J/r/J0r0J1r1J2r2  SSK3J4r4J5r5J6r6  SSK7J8r8  SSK9J:r:J;r;J<r<J=r=J>r>J?r?J@r@  SSKAJBrBJCrCJDrDJErE  \2R                  " \G5      rHS rI\0" SS9\ " S S\85      5       5       rJ\0" SS9\ " S S\5      5       5       rK " S  S!\@5      rL " S" S#\R                  5      rN " S$ S%\R                  5      rO " S& S'\R                  5      rP " S( S)\%S*S+9rQ\0 " S, S-\5      5       rR " S. S/\%S*S+9rS " S0 S1\'S*S+9rT\0 " S2 S3\(5      5       rU " S4 S5\=5      rV " S6 S7\R                  5      rW " S8 S9\R                  5      rX " S: S;\R                  5      rY " S< S=\:5      rZ " S> S?\;5      r[\0 " S@ SA\#5      5       r\ " SB SC\?5      r] " SD SE\>5      r^ " SF SG\\\<5      r_ " SH SI\B5      r` " SJ SK\E5      ra " SL SM\D5      rb\0" SNSO9 " SP SQ\C5      5       rc/ SRQrdg)S    N)strict)nn)
functional   )initialization)ACT2FN)Cache)PreTrainedConfig)TorchvisionBackend)BatchFeatureget_patch_output_sizeselect_best_resolution)divide_to_patches)ChannelDimension
ImageInputPILImageResamplingSizeDictget_image_size)FlashAttentionKwargs)BaseModelOutputWithPooling)PreTrainedModel)ImagesKwargsMultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)
TensorTypeTransformersKwargsauto_docstringcan_return_tuplelogging   )CONFIG_MAPPING
AutoConfigAutoTokenizer)LlamaConfig)LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaMLP
LlamaModelLlamaPreTrainedModelLlamaRMSNorm)LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastc                    U R                   S   nUR                   S   n[        R                  " X4U R                  U R                  S9n[        R
                  " USS9n[        R                  " S[        R                  UR                  S9n[        R                  " Xv45      n[        UR                   S   5       H.  nXh   n	XhS-      n
X	U
 n[        R                  " XU   5      nXX& M0     U$ )a
  
Compute the matrix multiplication (GEMM) for each expert sequentially. This approach is computationally inefficient, especially when dealing with a large number of experts.

Args:
    token_states (torch.Tensor): Input tensor of shape (num_tokens, in_features).
    expert_weights (torch.Tensor): Weight tensor of shape (num_experts, in_features, out_features).
    tokens_per_expert (torch.Tensor): Number of tokens assigned to each expert.

Returns:
    torch.Tensor: Output tensor of shape (num_tokens, out_features).
r   dtypedevicedim   )
shapetorchzerosr7   r8   cumsumlongcatrangematmul)token_statesexpert_weightstokens_per_expert
num_tokensout_featuresoutputcumsum_num_tokenszero_tensor
expert_numstartendtokensouts                v/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/aria/modular_aria.pysequential_experts_gemmrR   C   s     ##A&J!''+L[[9K9KT`TgTghF%6A>++auzz:K:R:RSK		;"BCN0034
!-Q/C(ll6*#=>u 5 M    zrhymes-ai/Aria)


@auto_docstring(checkpoint="rhymes-ai/Aria")
@strict
class AriaTextConfig(LlamaConfig):
    r"""
    moe_num_experts (`int`, *optional*, defaults to 8):
        The number of experts in the MoE layer.
    moe_topk (`int`, *optional*, defaults to 2):
        The number of top experts to route to for each token.
    moe_num_shared_experts (`int`, *optional*, defaults to 2):
        The number of shared experts.
    """

    model_type = "aria_text"
    base_config_key = "text_config"
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.shared_experts.gate_proj": "colwise",
        "layers.*.mlp.shared_experts.up_proj": "colwise",
        "layers.*.mlp.shared_experts.down_proj": "rowwise",
    }
    intermediate_size: int = 4864
    moe_num_experts: int = 8
    moe_topk: int = 2
    moe_num_shared_experts: int = 2
    pad_token_id: int | None = None


@auto_docstring(checkpoint="rhymes-ai/Aria")
@strict
class AriaConfig(PreTrainedConfig):
    r"""
    projector_patch_to_query_dict (`dict`, *optional*):
        Mapping of patch sizes to query dimensions.
    """

    model_type = "aria"
    attribute_map = {"image_token_id": "image_token_index"}
    sub_configs = {"text_config": AriaTextConfig, "vision_config": AutoConfig}

    vision_config: dict | AutoConfig | None = None
    text_config: dict | AriaTextConfig | None = None
    vision_feature_layer: int | list[int] = -1
    projector_patch_to_query_dict: dict | None = None
    image_token_index: int = 9
    initializer_range: float = 0.02
    tie_word_embeddings: bool = False

    def __post_init__(self, **kwargs):
        if self.projector_patch_to_query_dict is None:
            self.projector_patch_to_query_dict = {1225: 128, 4900: 256}
        # Convert the keys and values of projector_patch_to_query_dict to integers
        self.projector_patch_to_query_dict = {int(k): int(v) for k, v in self.projector_patch_to_query_dict.items()}
        self.max_value_projector_patch_to_query_dict = max(self.projector_patch_to_query_dict.values())

        if isinstance(self.vision_config, dict):
            self.vision_config["model_type"] = "idefics3_vision"
            self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config)
        elif self.vision_config is None:
            self.vision_config = CONFIG_MAPPING["idefics3_vision"]()

        if isinstance(self.text_config, dict) and "model_type" in self.text_config:
            self.text_config = AriaTextConfig(**self.text_config)
        elif self.text_config is None:
            self.text_config = AriaTextConfig()

        super().__post_init__(**kwargs)
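
# Hedged sketch: constructing the configs by hand (values below are illustrative
# assumptions, not release defaults).
#
#     text_config = AriaTextConfig(moe_num_experts=4, moe_topk=2, moe_num_shared_experts=2)
#     config = AriaConfig(text_config=text_config)
#     # __post_init__ fills projector_patch_to_query_dict, mapping a patch count
#     # to the number of learned queries, e.g. {1225: 128, 4900: 256}.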


class AriaTextRMSNorm(LlamaRMSNorm):
    pass


class AriaProjectorMLP(nn.Module):
    """
    Feed-Forward Network module for the Aria Projector.

    Args:
        in_features (`int`):
            Input embedding dimension.
        hidden_features (`int`):
            Hidden dimension of the feed-forward network.
        output_dim (`int`):
            Output dimension.
    """

    def __init__(self, in_features, hidden_features, output_dim):
        super().__init__()
        self.linear_in = nn.Linear(in_features, hidden_features, bias=False)
        self.linear_out = nn.Linear(hidden_features, output_dim, bias=False)
        self.act = ACT2FN["gelu_new"]

    def forward(self, hidden_states):
        hidden_states = self.act(self.linear_in(hidden_states))
        hidden_states = self.linear_out(hidden_states)
        return hidden_states


class AriaCrossAttention(nn.Module):
    """
    Aria Cross-Attention module.

    Args:
        config (`AriaConfig`):
            The configuration to use.
    """

    def __init__(self, config: AriaConfig, dropout_rate: float = 0):
        super().__init__()
        hidden_size = config.vision_config.hidden_size
        num_heads = config.vision_config.num_attention_heads
        self.num_heads = num_heads
        self.q_proj = nn.Linear(hidden_size, hidden_size, bias=False)
        self.k_proj = nn.Linear(hidden_size, hidden_size, bias=False)
        self.v_proj = nn.Linear(hidden_size, hidden_size, bias=False)

        self.multihead_attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
        self.linear = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout_rate)

        self.layer_norm = nn.LayerNorm(hidden_size)
        self.layer_norm_kv = nn.LayerNorm(hidden_size)

    def forward(self, key_value_states, hidden_states, attn_mask=None):
        """
        Forward pass of the AriaCrossAttention module.

        Args:
            key_value_states (`torch.Tensor`):
                Input tensor for key and value.
            hidden_states (`torch.Tensor`):
                Input tensor for query.
            attn_mask (`torch.Tensor`, *optional*, defaults to None):
                Attention mask.

        Returns:
            torch.Tensor:
                Output tensor after cross-attention.
        """
        query = self.q_proj(self.layer_norm(hidden_states))

        key_value_states = self.layer_norm_kv(key_value_states)
        key = self.k_proj(key_value_states)
        value = self.v_proj(key_value_states)

        attn_output, _ = self.multihead_attn(query, key, value, attn_mask=attn_mask)

        attn_output = self.dropout(self.linear(attn_output))

        return attn_output
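
# Hedged sketch of the cross-attention call pattern (batch/shape values are
# assumptions): learned query states attend over vision patch features.
#
#     config = AriaConfig()
#     cross_attn = AriaCrossAttention(config)
#     hidden = config.vision_config.hidden_size
#     patches = torch.randn(1, 1225, hidden)   # key/value states from the vision tower
#     queries = torch.randn(1, 128, hidden)    # query states
#     out = cross_attn(patches, queries)       # -> (1, 128, hidden)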


class AriaProjector(nn.Module):
    """
    Aria Projector module.

    This module projects vision features into the language model's embedding space, enabling interaction between
    vision and language components.

    Args:
        config (`AriaConfig`):
            Configuration object for the model.
    """

    def __init__(self, config: AriaConfig):
        super().__init__()

        self.patch_to_query_dict = config.projector_patch_to_query_dict
        self.in_features = config.vision_config.hidden_size
        self.num_heads = config.vision_config.num_attention_heads
        self.kv_dim = config.vision_config.hidden_size
        self.hidden_features = config.text_config.hidden_size
        self.output_dim = config.text_config.hidden_size

        self.query = nn.Parameter(torch.zeros(config.max_value_projector_patch_to_query_dict, self.in_features))

        self.cross_attn = AriaCrossAttention(config)

        self.layer_norm = nn.LayerNorm(self.in_features)
        self.feed_forward = AriaProjectorMLP(self.in_features, self.hidden_features, self.output_dim)

    def forward(self, key_value_states: torch.Tensor, attn_mask: torch.Tensor | None = None):
        """
        Forward pass of the Projector module.

        Args:
            key_value_states (`torch.Tensor`):
                Input tensor of shape (batch_size, num_patches, kv_dim).
            attn_mask (`torch.Tensor`, *optional*, default is None):
                Attention mask.

        Returns:
            `torch.Tensor`: Output tensor of shape (batch_size, query_number, output_dim).
        """
        batch_size, num_patches = key_value_states.shape[0], key_value_states.shape[1]

        if num_patches not in self.patch_to_query_dict:
            raise KeyError(
                f"Number of patches {num_patches} not found in patch_to_query_dict amongst possible values"
                f" {self.patch_to_query_dict.keys()}."
            )
        query_num = self.patch_to_query_dict[num_patches]

        queries = self.query[:query_num].unsqueeze(0).repeat(batch_size, 1, 1)

        if attn_mask is not None:
            attn_mask = attn_mask.repeat_interleave(self.num_heads, 0)
            attn_mask = attn_mask.unsqueeze(1).expand(-1, queries.size(1), -1)

        attention_out = self.cross_attn(key_value_states, queries, attn_mask=attn_mask)

        out = self.feed_forward(self.layer_norm(attention_out))

        return out
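
# Hedged usage sketch for the projector (shapes are assumptions): the patch count
# of the input selects the number of learned queries through
# `projector_patch_to_query_dict` (e.g. 1225 patches -> 128 query tokens).
#
#     config = AriaConfig()
#     projector = AriaProjector(config)
#     patch_features = torch.randn(2, 1225, config.vision_config.hidden_size)
#     projected = projector(patch_features)    # -> (2, 128, text hidden size)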


class AriaImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    max_image_size (`int`, *optional*, defaults to `self.max_image_size`):
        Maximum image size. Must be either 490 or 980.
    min_image_size (`int`, *optional*, defaults to `self.min_image_size`):
        Minimum image size. Images smaller than this in any dimension will be scaled up.
    split_resolutions (`list[list[int]]`, *optional*, defaults to `self.split_resolutions`):
        A list of possible resolutions as (height, width) pairs for splitting high-resolution images into patches.
    split_image (`bool`, *optional*, defaults to `self.split_image`):
        Whether to split the image into patches using the best matching resolution from `split_resolutions`.
    """

    max_image_size: int
    min_image_size: int
    split_resolutions: list[list[int]]
    split_image: bool


@auto_docstring
class AriaImageProcessor(TorchvisionBackend):
    model_input_names = ["pixel_values", "pixel_mask", "num_crops"]
    valid_kwargs = AriaImageProcessorKwargs
    resample = PILImageResampling.BICUBIC

    image_mean = [0.5, 0.5, 0.5]
    image_std = [0.5, 0.5, 0.5]
    max_image_size = 980
    min_image_size = 336
    split_resolutions = None
    split_image = False
    do_convert_rgb = True

    def __init__(self, **kwargs: Unpack[AriaImageProcessorKwargs]):
        if kwargs.get("split_resolutions") is None:
            default_resolutions = [
                (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8),
                (2, 4), (2, 3), (2, 2), (2, 1),
                (3, 1), (3, 2),
                (4, 1), (4, 2),
                (5, 1), (6, 1), (7, 1), (8, 1),
            ]
            kwargs["split_resolutions"] = [[el[0] * 490, el[1] * 490] for el in default_resolutions]
        super().__init__(**kwargs)

    def _get_padding_size(self, original_resolution: tuple, target_resolution: tuple) -> list[int]:
        """Get padding size for patching, returns [left, top, right, bottom] for tvF.pad."""
        original_height, original_width = original_resolution
        target_height, target_width = target_resolution
        paste_x, r_x = divmod(target_width - original_width, 2)
        paste_y, r_y = divmod(target_height - original_height, 2)
        return [paste_x, paste_y, paste_x + r_x, paste_y + r_y]

    def _resize_for_patching(
        self,
        image: "torch.Tensor",
        target_resolution: tuple,
        resample: "PILImageResampling | tvF.InterpolationMode | int | None",
    ) -> "torch.Tensor":
        """Resize an image to a target resolution while maintaining aspect ratio."""
        new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format=ChannelDimension.FIRST)
        return self.resize(image, SizeDict(height=new_height, width=new_width), resample)

    def _pad_for_patching(self, image: "torch.Tensor", target_resolution: tuple) -> "torch.Tensor":
        """Pad an image to a target resolution while maintaining aspect ratio."""
        new_resolution = get_patch_output_size(image, target_resolution, input_data_format=ChannelDimension.FIRST)
        padding = self._get_padding_size(new_resolution, target_resolution)
        return tvF.pad(image, padding=padding)

    def get_image_patches(
        self,
        image: "torch.Tensor",
        grid_pinpoints: list[list[int]],
        patch_size: int,
        resample: "PILImageResampling | tvF.InterpolationMode | int | None",
    ) -> list["torch.Tensor"]:
        """
        Process an image with variable resolutions by dividing it into patches.

        Args:
            image (`torch.Tensor`):
                The input image to be processed (channels-first format).
            grid_pinpoints (`list[list[int]]`):
                A list of possible resolutions as (height, width) pairs.
            patch_size (`int`):
                Size of each square patch to divide the image into.
            resample (`PILImageResampling | tvF.InterpolationMode | int | None`):
                Resampling filter to use when resizing.

        Returns:
            `list[torch.Tensor]`: A list of image patches in channels-first format.
        """
        if not isinstance(grid_pinpoints, list):
            raise TypeError("grid_pinpoints must be a list of possible resolutions.")

        image_size = get_image_size(image, channel_dim=ChannelDimension.FIRST)
        best_resolution = select_best_resolution(image_size, grid_pinpoints)
        resized_image = self._resize_for_patching(image, best_resolution, resample)
        padded_image = self._pad_for_patching(resized_image, best_resolution)

        patches = divide_to_patches(padded_image, patch_size=patch_size)
        return patches

    def _preprocess(
        self,
        images: list["torch.Tensor"],
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: float | list[float] | None,
        image_std: float | list[float] | None,
        disable_grouping: bool | None,
        return_tensors: str | TensorType | None,
        max_image_size: int,
        min_image_size: int,
        split_resolutions: list[list[int]] | None,
        split_image: bool,
        resample: "PILImageResampling | tvF.InterpolationMode | int | None" = None,
        **kwargs,
    ) -> BatchFeature:
        if max_image_size not in [490, 980]:
            raise ValueError("max_image_size must be either 490 or 980")

        pixel_masks = []
        processed_crops = []
        num_crops = None

        for image in images:
            if split_image:
                crop_images = self.get_image_patches(image, split_resolutions, max_image_size, resample)
            else:
                crop_images = [image]
            if num_crops is None or len(crop_images) > num_crops:
                num_crops = len(crop_images)

            for crop_image in crop_images:
                # Rescale so the larger dimension hits max_image_size, then pad to a square crop
                h, w = crop_image.shape[-2], crop_image.shape[-1]
                scale = max_image_size / max(h, w)
                if w >= h:
                    new_w = max_image_size
                    new_h = max(int(h * scale), min_image_size)
                else:
                    new_h = max_image_size
                    new_w = max(int(w * scale), min_image_size)
                crop_image = self.resize(crop_image, SizeDict(height=new_h, width=new_w), resample)

                padding_bottom = max_image_size - new_h
                padding_right = max_image_size - new_w
                crop_image = tvF.pad(crop_image, [0, 0, padding_right, padding_bottom])

                pixel_mask = torch.zeros((max_image_size, max_image_size), dtype=torch.bool)
                pixel_mask[:new_h, :new_w] = True
                pixel_masks.append(pixel_mask)
                processed_crops.append(crop_image)

        stacked_images = torch.stack(processed_crops, dim=0)
        stacked_images = self.rescale_and_normalize(
            stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
        )
        stacked_masks = torch.stack(pixel_masks, dim=0)

        return BatchFeature(
            data={
                "pixel_values": stacked_images,
                "pixel_mask": stacked_masks,
                "num_crops": num_crops,
            },
            tensor_type=return_tensors,
        )

    def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
        """
        A utility that returns number of image patches for a given image size.

        Args:
            height (`int`):
                Height of the input image.
            width (`int`):
                Width of the input image.
            images_kwargs (`dict`, *optional*):
                Any kwargs to override defaults of the image processor.

        Returns:
            `int`: Number of patches per image.
        """
        split_image = images_kwargs.get("split_image", self.split_image)
        max_image_size = images_kwargs.get("max_image_size", self.max_image_size)

        resized_height, resized_width = select_best_resolution((height, width), self.split_resolutions)
        if not split_image:
            num_patches = 1
        else:
            num_patches = (resized_height // max_image_size) * (resized_width // max_image_size)
        return num_patches
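
# Hedged sketch of the crop accounting (numbers are assumptions): with
# split_image=True a 980x1960 image tiles into two 980x980 crops, which
# `get_number_of_image_patches` predicts without touching pixel data.
#
#     image_processor = AriaImageProcessor(split_image=True)
#     num_patches = image_processor.get_number_of_image_patches(980, 1960, {})   # -> 2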


class AriaImagesKwargs(ImagesKwargs, total=False):
    r"""
    split_image (`bool`, *optional*, defaults to `False`):
        Whether to split large images into multiple crops. When enabled, images exceeding the maximum size are
        divided into overlapping crops that are processed separately and then combined. This allows processing
        of very high-resolution images that exceed the model's input size limits.
    max_image_size (`int`, *optional*, defaults to 980):
        Maximum image size (in pixels) for a single image crop. Images larger than this will be split into
        multiple crops when `split_image=True`, or resized if splitting is disabled. This parameter controls
        the maximum resolution of individual image patches processed by the model.
    min_image_size (`int`, *optional*):
        Minimum image size (in pixels) for a single image crop. Images smaller than this will be upscaled to
        meet the minimum requirement. If not specified, images are processed at their original size (subject
        to the maximum size constraint).
    """

    split_image: bool
    max_image_size: int
    min_image_size: int


class AriaProcessorKwargs(ProcessingKwargs, total=False):
    images_kwargs: AriaImagesKwargs

    _defaults = {
        "text_kwargs": {
            "padding": False,
            "return_mm_token_type_ids": False,
        },
        "images_kwargs": {
            "max_image_size": 980,
            "split_image": False,
        },
        "return_tensors": TensorType.PYTORCH,
    }


class AriaProcessor(ProcessorMixin):
    def __init__(
        self,
        image_processor=None,
        tokenizer: AutoTokenizer | str | None = None,
        chat_template: str | None = None,
        size_conversion: dict[float | int, int] | None = None,
    ):
        """
        size_conversion (`Dict`, *optional*):
            A dictionary indicating size conversions for images.
        """
        if size_conversion is None:
            size_conversion = {490: 128, 980: 256}
        self.size_conversion = {int(k): v for k, v in size_conversion.items()}

        self.image_token = tokenizer.image_token
        self.image_token_id = tokenizer.image_token_id
        if tokenizer is not None and tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.unk_token

        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
        self,
        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput],
        images: ImageInput | None = None,
        **kwargs: Unpack[AriaProcessorKwargs],
    ) -> BatchFeature:
        """
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_mask** -- Pixel mask to be fed to a model. Returned when `images` is not `None`.
        """
        output_kwargs = self._merge_kwargs(
            AriaProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if isinstance(text, str):
            text = [text]
        elif not isinstance(text, list) and not isinstance(text[0], str):
            raise ValueError("Invalid input text. Please provide a string, or a list of strings")

        if images is not None:
            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
            # Expand the image token according to the number of crops and tokens per image
            tokens_per_image = self.size_conversion[image_inputs.pixel_values.shape[2]]

            prompt_strings = []
            num_crops = image_inputs.pop("num_crops") * tokens_per_image
            for sample in text:
                sample = sample.replace(self.tokenizer.image_token, self.tokenizer.image_token * num_crops)
                prompt_strings.append(sample)
        else:
            image_inputs = {}
            prompt_strings = text

        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
        text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"], return_tensors=None)
        self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])

        if return_mm_token_type_ids:
            text_inputs["mm_token_type_ids"] = self.create_mm_token_type_ids(text_inputs["input_ids"])

        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)

    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
        """
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        """
        vision_data = {}
        if image_sizes is not None:
            images_kwargs = AriaProcessorKwargs._defaults.get("images_kwargs", {})
            images_kwargs.update(kwargs)
            max_size = images_kwargs.get("max_image_size", None) or self.image_processor.max_image_size

            num_image_patches = [
                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
                for image_size in image_sizes
            ]
            num_image_tokens = [self.size_conversion[max_size] * num_patches for num_patches in num_image_patches]
            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})

        return MultiModalData(**vision_data)

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names

        # Remove `num_crops`: it is popped and used only at processing time, so the model never consumes it.
        image_processor_input_names = [name for name in image_processor_input_names if name != "num_crops"]
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
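
# Hedged end-to-end sketch of the processor (the checkpoint name and placeholder
# handling are assumptions; see the verified example in the `forward` docstring
# further below). Each 980px crop expands the image placeholder to 256 tokens:
#
#     processor = AriaProcessor.from_pretrained("rhymes-ai/Aria")
#     inputs = processor(text=prompt, images=[image], return_tensors="pt")
#     # -> input_ids / attention_mask / pixel_values / pixel_mask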


class AriaSharedExpertsMLP(LlamaMLP):
    """
    Shared Expert MLP for shared experts.

    Unlike routed experts, shared experts process all tokens without routing.
    This class reconfigures the intermediate size in comparison to the LlamaMLP.

    Args:
        config (`AriaTextConfig`): Configuration object for the Aria language model.
    """

    def __init__(self, config: AriaTextConfig):
        super().__init__(config)
        self.intermediate_size = config.intermediate_size * config.moe_num_shared_experts


class AriaGroupedExpertsGemm(nn.Module):
    """
    Grouped GEMM (General Matrix Multiplication) module for efficient expert computation.
    This module utilizes the grouped_gemm library (https://github.com/fanshiqing/grouped_gemm)
    for optimized performance. If the grouped_gemm library is not installed, it gracefully
    falls back to a sequential GEMM implementation, which may be slower but ensures
    functionality.

    Args:
        in_features (`int`):
            Number of input features.
        out_features (`int`):
            Number of output features.
        groups (`int`):
            Number of expert groups.
    """

    def __init__(self, in_features, out_features, groups):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.groups = groups
        self.weight = nn.Parameter(torch.empty(groups, in_features, out_features))

    def forward(self, input, tokens_per_expert):
        """
        Perform grouped matrix multiplication.

        Args:
            input (`torch.Tensor`):
                Input tensor of shape (num_tokens, in_features).
            tokens_per_expert (`torch.Tensor`):
                Number of tokens assigned to each expert.

        Returns:
            torch.Tensor: Output tensor of shape (num_tokens, out_features).
        """
        return sequential_experts_gemm(
            input,
            self.weight,
            tokens_per_expert.cpu(),
        )
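
# Hedged sketch (toy values): the grouped GEMM consumes tokens pre-sorted by
# expert id, with `tokens_per_expert` as the per-expert histogram:
#
#     gemm = AriaGroupedExpertsGemm(in_features=16, out_features=32, groups=2)
#     sorted_tokens = torch.randn(6, 16)            # already permuted by expert id
#     tokens_per_expert = torch.tensor([4, 2])      # histogram over the 2 experts
#     out = gemm(sorted_tokens, tokens_per_expert)  # -> (6, 32)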


class AriaExperts(nn.Module):
    def __init__(self, config: AriaTextConfig) -> None:
        super().__init__()
        self.config = config
        self.fc1 = AriaGroupedExpertsGemm(config.hidden_size, config.intermediate_size * 2, config.moe_num_experts)
        self.fc2 = AriaGroupedExpertsGemm(config.intermediate_size, config.hidden_size, config.moe_num_experts)

    def route_tokens_to_experts(self, router_logits):
        top_logits, top_indices = torch.topk(router_logits, k=self.config.moe_topk, dim=1)
        scores = nn.functional.softmax(top_logits, dim=-1)
        return top_indices, scores

    def forward(self, hidden_states, router_logits) -> torch.Tensor:
        top_k_index, top_k_weights = self.route_tokens_to_experts(router_logits)

        tokens_per_expert = torch.histc(
            top_k_index.flatten().to(torch.float32),
            bins=self.config.moe_num_experts,
            min=0,
            max=self.config.moe_num_experts - 1,
        ).to(torch.long)

        indices = top_k_index

        # Token permutation: sort tokens by their assigned expert id
        flatten_indices = indices.view(-1)
        sorted_indices = torch.argsort(flatten_indices)
        permuted_tokens = hidden_states.index_select(0, sorted_indices // self.config.moe_topk)

        # Grouped feed-forward with SwiGLU-style gating
        fc1_output = self.fc1(permuted_tokens, tokens_per_expert)
        projection, gate = torch.chunk(fc1_output, 2, dim=-1)
        fc1_output = nn.functional.silu(projection) * gate
        expert_output = self.fc2(fc1_output, tokens_per_expert)

        # Token unpermutation: restore the original token order
        unpermuted_tokens = torch.zeros(
            (indices.shape[0] * self.config.moe_topk, expert_output.size(1)),
            dtype=expert_output.dtype,
            device=expert_output.device,
        )
        unpermuted_tokens.index_copy_(0, sorted_indices, expert_output)
        unpermuted_tokens = unpermuted_tokens.view(-1, self.config.moe_topk, expert_output.size(1))

        output = (unpermuted_tokens * top_k_weights.unsqueeze(-1)).sum(dim=1)
        return output


class AriaTextMoELayer(nn.Module):
    def __init__(self, config: AriaTextConfig):
        super().__init__()
        self.router = nn.Linear(config.hidden_size, config.moe_num_experts, bias=False)
        self.experts = AriaExperts(config)
        self.shared_experts = AriaSharedExpertsMLP(config)
        self.config = config

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        original_shape = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_states.size(-1))

        router_logits = self.router(hidden_states)
        expert_output = self.experts(hidden_states, router_logits).view(original_shape)
        shared_expert_output = self.shared_experts(hidden_states.view(original_shape))
        return expert_output + shared_expert_output


class AriaTextAttention(LlamaAttention):
    """Multi-headed attention from 'Attention Is All You Need' paper"""


class AriaTextDecoderLayer(LlamaDecoderLayer):
    """
    Aria Text Decoder Layer.

    This class defines a single decoder layer in the language model, incorporating self-attention and Mixture of
    Experts (MoE) feed-forward network.

    Args:
        config (`AriaTextConfig`):
            Configuration object for the text component of the model.
        layer_idx (`int`):
            Index of the layer.
    """

    def __init__(self, config: AriaTextConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        self.mlp = AriaTextMoELayer(config)


class AriaTextPreTrainedModel(PreTrainedModel):
    config: AriaTextConfig
    base_model_prefix = "model"
    _no_split_modules = ["AriaTextDecoderLayer", "AriaGroupedExpertsGemm"]
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": AriaTextDecoderLayer,
        "attentions": AriaTextAttention,
    }

    @torch.no_grad()
    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, AriaGroupedExpertsGemm):
            init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)


class AriaPreTrainedModel(LlamaPreTrainedModel):
    config: AriaConfig
    base_model_prefix = ""
    _can_compile_fullgraph = False
    _supports_attention_backend = True

    @torch.no_grad()
    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, AriaProjector):
            init.trunc_normal_(module.query, std=self.config.initializer_range)


class AriaTextModel(LlamaModel):
    def __init__(self, config: AriaTextConfig):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [AriaTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.gradient_checkpointing = False
        self.post_init()


class AriaTextForCausalLM(AriaTextPreTrainedModel, LlamaForCausalLM):
    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}

    def __init__(self, config: AriaTextConfig):
        super().__init__(config)
        self.model = AriaTextModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.post_init()

    @can_return_tuple
    def forward(self, **super_kwargs):
        super().forward(**super_kwargs)


class AriaCausalLMOutputWithPast(LlavaCausalLMOutputWithPast):
    pass


class AriaModelOutputWithPast(LlavaModelOutputWithPast):
    pass


class AriaModel(LlavaModel):
    def __init__(self, config: AriaConfig):
        super().__init__(config)
        self.multi_modal_projector = AriaProjector(config)

    def _create_patch_attention_mask(self, pixel_mask):
        if pixel_mask is None:
            return None

        patches_subgrid = pixel_mask.unfold(
            dimension=1,
            size=self.vision_tower.config.patch_size,
            step=self.vision_tower.config.patch_size,
        )
        patches_subgrid = patches_subgrid.unfold(
            dimension=2,
            size=self.vision_tower.config.patch_size,
            step=self.vision_tower.config.patch_size,
        )
        return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        pixel_mask: torch.FloatTensor | None = None,
        vision_feature_layer: int | list[int] = -1,
        output_hidden_states: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        patch_attention_mask = self._create_patch_attention_mask(pixel_mask)
        image_outputs = self.vision_tower(
            pixel_values,
            patch_attention_mask=patch_attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )
        image_attn_mask = None
        if patch_attention_mask is not None:
            flattened_mask = patch_attention_mask.flatten(1)
            image_attn_mask = torch.logical_not(flattened_mask)

        selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
        image_outputs.pooler_output = self.multi_modal_projector(selected_image_feature, attn_mask=image_attn_mask)
        return image_outputs

    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        pixel_mask: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        use_cache: bool | None = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple | AriaModelOutputWithPast:
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        image_features = None
        if pixel_values is not None and inputs_embeds.shape[1] != 1:
            image_features = self.get_image_features(
                pixel_values=pixel_values,
                pixel_mask=pixel_mask,
                vision_feature_layer=self.config.vision_feature_layer,
                output_hidden_states=True,
            ).pooler_output
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )

        return AriaModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values if use_cache else None,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )


@auto_docstring(
    custom_intro="""
    Aria model for conditional generation tasks.

    This model combines a vision tower, a multi-modal projector, and a language model
    to perform tasks that involve both image and text inputs.
    """
)
class AriaForConditionalGeneration(LlavaForConditionalGeneration):
    _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"}

    @auto_docstring
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        pixel_mask: torch.FloatTensor | None = None,
        vision_feature_layer: int | list[int] = -1,
        **kwargs: Unpack[TransformersKwargs],
    ):
        return self.model.get_image_features(
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            vision_feature_layer=vision_feature_layer,
            **kwargs,
        )

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        pixel_mask: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | AriaCausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or `model.image_token_id` (where `model` is your instance of
            `AriaForConditionalGeneration`). Tokens with indices set to `model.image_token_id` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModel
        >>> from transformers.image_utils import load_image

        >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
        >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
        >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
        >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")

        >>> processor = AutoProcessor.from_pretrained("Rhymes-AI/Aria")
        >>> model = AutoModel.from_pretrained("Rhymes-AI/Aria", dtype=torch.bfloat16, device_map="auto")

        >>> # Create inputs
        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {"type": "image"},
        ...             {"type": "text", "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."},
        ...             {"type": "image"},
        ...             {"type": "text", "text": "What can we see in this image?"},
        ...         ]
        ...     },
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {"type": "image"},
        ...             {"type": "text", "text": "In which city is that bridge located?"},
        ...         ]
        ...     }
        ... ]

        >>> prompts = [processor.apply_chat_template([message], add_generation_prompt=True) for message in messages]
        >>> images = [[image1, image2], [image3]]
        >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(model.device)

        >>> # Generate
        >>> generated_ids = model.generate(**inputs, max_new_tokens=256)
        >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

        >>> print(generated_texts[0])
        Assistant: There are buildings, trees, lights, and water visible in this image.

        >>> print(generated_texts[1])
        Assistant: The bridge is in San Francisco.
        ```"""
        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return AriaCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        pixel_mask=None,
        attention_mask=None,
        logits_to_keep=None,
        is_first_iteration=False,
        **kwargs,
    ):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            logits_to_keep=logits_to_keep,
            is_first_iteration=is_first_iteration,
            **kwargs,
        )

        # Pixel inputs are only forwarded on the first generation step (or when caching is disabled)
        if is_first_iteration or not kwargs.get("use_cache", True):
            model_inputs["pixel_values"] = pixel_values
            model_inputs["pixel_mask"] = pixel_mask

        return model_inputs


__all__ = [
    "AriaConfig",
    "AriaTextConfig",
    "AriaImageProcessor",
    "AriaProcessor",
    "AriaForConditionalGeneration",
    "AriaModel",
    "AriaPreTrainedModel",
    "AriaTextForCausalLM",
    "AriaTextModel",
    "AriaTextPreTrainedModel",
]