
    Z jb                        S SK rS SK Jr  S SKJr  S SKrS SKJr  SSKJ	r
  SSKJr  SSKJr  SSKJr  SS	KJrJr  SS
KJrJr  SSKJr  SSKJrJrJr  SSKJrJr  SSK J!r!  SSK"J#r#  SSK$J%r%  SSK&J'r'  SSK(J)r)J*r*J+r+J,r,J-r-  SSK.J/r/J0r0   SBS\Rb                  S\Rd                  S\Rd                  S\Rd                  S\Rd                  S-  S\3S\3\4-  4S jjr5 " S S\'5      r6 " S  S!\%5      r7\\" S"S#9 " S$ S%\5      5       5       r8 " S& S'\Rb                  5      r9 " S( S)\Rb                  5      r: " S* S+\#5      r;\Rx                  \6S,.r= " S- S.\5      r> " S/ S0\Rb                  5      r?\ " S1 S2\5      5       r@\ " S3 S4\@5      5       rA " S5 S6\-5      rBSrC " S7 S8\Rb                  5      rD " S9 S:\,5      rE " S; S<\+5      rF " S= S>\)5      rG " S? S@\*5      rH/ SAQrIg)C    N)Callable)	dataclass   )initialization)ACT2FN)Cache)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )CLIPMLP)JanusVisionAttention)LlamaRMSNorm)LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModel   )InternVLConfigInternVLVisionConfigmodulequerykeyvalueattention_maskscalingdropoutc                 h   UnUn	[         R                  " XR                  SS5      5      U-  n
Ub  X-   n
[        R                  R                  U
SS9n
[        R                  R                  XU R                  S9n
[         R                  " X5      nUR                  SS5      R                  5       nX4$ )Nr   r   dim)ptrainingr   )	torchmatmul	transposenn
functionalsoftmaxr'   r-   
contiguous)r!   r"   r#   r$   r%   r&   r'   kwargs
key_statesvalue_statesattn_weightsattn_outputs               ~/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/internvl/modular_internvl.pyeager_attention_forwardr;   .   s     JL<<';';Aq'ABWLL!#4 ==((2(>L==((6??([L,,|:K''1-88:K$$    c                       \ rS rSrSrg)InternVLVisionRMSNormH    N__name__
__module____qualname____firstlineno____static_attributes__r@   r<   r:   r>   r>   H       r<   r>   c                      ^  \ rS rSrS\4U 4S jjr S
S\R                  S\R                  S-  S\\	   4S jjr
S	rU =r$ )InternVLVisionAttentionL   configc                 2  > [         TU ]  U5        U ?SU l        UR                  nU(       a  [        U R                  5      O[        R                  " 5       U l	        U(       a  [        U R                  5      U l
        g [        R                  " 5       U l
        g NF)super__init__num_key_value_groups	is_causaluse_qk_normr>   	embed_dimr1   Identityq_normk_norm)selfrK   qk_norm	__class__s      r:   rO    InternVLVisionAttention.__init__M   sd     % $$?F+DNN;BKKM?F+DNN;BKKMr<   Nhidden_statesr%   r5   c                 p   UR                  5       u  pEnU R                  U5      nU R                  U5      nU R                  U5      n	U R	                  U5      nU R                  U5      nUR                  XEU R                  U R                  5      R                  SS5      nUR                  XEU R                  U R                  5      R                  SS5      nU	R                  XEU R                  U R                  5      R                  SS5      n	[        R                  " U R                  R                  [        5      n
U
" U UUU	U4U R                   (       d  SOU R"                  U R$                  SS.UD6u  pUR                  XEU R&                  5      nU R)                  U5      nU R+                  U5      nX4$ )Nr   r           F)r'   r&   rQ   )sizeq_projk_projv_projrU   rV   reshape	num_headshead_dimr0   viewr   get_interfacerK   _attn_implementationr;   r-   attention_dropoutscalerS   projection_layerprojection_dropout)rW   r[   r%   r5   
batch_sizeseq_len_query_statesr6   r7   attention_interfacer9   r8   outputs                 r:   forwardInternVLVisionAttention.forwardX   s    "/!3!3!5
Q{{=1[[/
{{=1{{<0[[,
#++JQUQ^Q^_iijkmno''
T^^T]][eefgijk
#((dnndmm\ffghjkl(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HJJ
%
 
%
! "))*t~~N&&{3((0##r<   )rQ   rV   rU   N)rB   rC   rD   rE   r    rO   r.   Tensorr   r   rr   rF   __classcell__rY   s   @r:   rI   rI   L   sP    	Z3 	Z /3'$||'$ t+'$ +,	'$ '$r<   rI   z7
    Class for outputs of [`InternVLVisionModel`].
    custom_introc                       \ rS rSrSrSrg)$InternVLVisionModelOutputWithPooling   a2  
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
    Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
    *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
    will be returned.
r@   N)rB   rC   rD   rE   __doc__rF   r@   r<   r:   r{   r{      s    r<   r{   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )InternVLVisionPatchEmbeddings   z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
c                 H  > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pTUS   US   -  US   US   -  -  nUS   US   -  US   US   -  4nX l        X0l        X@l        X`l        Xpl        [        R                  " XEX3S9U l
        g )Nr   r   )kernel_sizestride)rN   rO   
image_size
patch_sizenum_channelshidden_sizenum_patchespatch_shaper1   Conv2d
projection)	rW   rK   r   r   r   r   r   r   rY   s	           r:   rO   &InternVLVisionPatchEmbeddings.__init__   s    !'!2!2F4E4EJ$*$7$79K9Kk!!}
15*Q-:VW=:XY!!}
15z!}
ST7UV$$(&&))L:ir<   pixel_valuesreturnc                    UR                   u  p#pEX0R                  :w  a  [        S5      eU R                  UR	                  U R                  R
                  R                  5      5      nUR                  S5      R                  SS5      nU$ )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   )	shaper   
ValueErrorr   toweightdtypeflattenr0   )rW   r   rl   r   heightwidth
embeddingss          r:   rr   %InternVLVisionPatchEmbeddings.forward   sz    2>2D2D/
&,,,w  __\__T__5K5K5Q5Q%RS
''*44Q:
r<   )r   r   r   r   r   r   )rB   rC   rD   rE   r}   rO   r.   ru   rr   rF   rv   rw   s   @r:   r   r      s.    j
ELL 
U\\ 
 
r<   r   c                      ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\	S	\	S\R                  4S
 jr
 SS\R                  S\R                  S-  S\R                  4S jjrSrU =r$ )InternVLVisionEmbeddings   z[
Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

rK   r   Nc                 ^  > [         TU ]  5         [        R                  " [        R
                  " SSUR                  5      5      U l        UR                  (       a<  [        R                  " [        R
                  " SSUR                  5      5      U l	        OS U l	        [        U5      U l        UR                  U l        [        UR                  [        R                   R"                  5      (       a  UR                  OUR                  UR                  4U l        U R                  R$                  nUR&                  (       a?  [        R                  " [        R
                  " SUS-   UR                  5      5      U l        OS U l        [        R*                  " UR,                  5      U l        g )Nr   )rN   rO   r1   	Parameterr.   zerosr   	cls_tokenuse_mask_token
mask_tokenr   patch_embeddingsr   
isinstancer   collectionsabcIterabler    use_absolute_position_embeddingsposition_embeddingsDropouthidden_dropout_probr'   )rW   rK   r   rY   s      r:   rO   !InternVLVisionEmbeddings.__init__   s'   ekk!Q8J8J&KL   ll5;;q!V=O=O+PQDO"DO =f E ++ &++[__-E-EFF ##V%6%67 	
 ++7722')||EKK;QR?TZTfTf4g'hD$'+D$zz&"<"<=r<   r   r   r   c                    UR                   S   S-
  nU R                  R                   S   S-
  n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                   S   nX R
                  S   -  n	X0R
                  S   -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU5      n[        R                  " Xg4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   Nr)   r         ?r   r   bicubicF)r^   modealign_cornersr*   )r   r   r.   jit
is_tracingr   r   rb   permuter1   r2   interpolatere   cat)rW   r   r   r   r   num_positionsclass_pos_embedpatch_pos_embedr+   
new_height	new_widthsqrt_num_positionss               r:   interpolate_pos_encoding1InternVLVisionEmbeddings.interpolate_pos_encoding   s]    !&&q)A-0066q9A= yy##%%+*F6?+++221bqb59221ab59r"q11
__Q//	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr<   r   bool_masked_posc                    UR                   u    p4nU R                  U5      nUR                  5       u  pxnUbI  U R                  R	                  XxS5      n	UR                  S5      R                  U	5      n
USU
-
  -  X-  -   nU R                  R	                  USS5      n[        R                  " X4SS9nU R                  b  X`R                  XdU5      -   nU R                  U5      nU$ )Nr)   r   r*   )r   r   r^   r   expand	unsqueezetype_asr   r.   r   r   r   r'   )rW   r   r   rn   r   r   r   rl   rm   mask_tokensw
cls_tokenss               r:   rr    InternVLVisionEmbeddings.forward   s    
 +001e**<8
!+!2
Q&//00bIK))"-55kBA#q1u-?J^^**:r2>
YY
7Q?
##/#&C&CJX]&^^J\\*-
r<   )r   r'   r   r   r   r   r   rt   )rB   rC   rD   rE   r}   r    rO   r.   ru   intr   
BoolTensorrr   rF   rv   rw   s   @r:   r   r      s    
>3 > >,&D5<< &D &DUX &D]b]i]i &DV 48ll ))D0 
	 r<   r   c                       \ rS rSrSrg)InternVLVisionMLPi  r@   NrA   r@   r<   r:   r   r     rG   r<   r   )
layer_normrms_normc                      ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\	\R                     \	\R                  \R                  4   -  4S jr
S	rU =r$ )
InternVLVisionLayeri  z?This corresponds to the Block class in the timm implementation.rK   r   Nc                   > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        UR                     " UR                  UR                  S9U l        [        UR                     " UR                  UR                  S9U l        UR                  n[        R                   " U["        R$                  " UR                  5      -  SS9U l        [        R                   " U["        R$                  " UR                  5      -  SS9U l        [        R*                  " UR,                  5      U l        g )Nr   epsT)requires_grad)rN   rO   chunk_size_feed_forwardseq_len_dimrI   	attentionr   mlpNORM2FN	norm_typer   layer_norm_epslayernorm_beforelayernorm_afterlayer_scale_init_valuer1   r   r.   oneslambda_1lambda_2r   r   r'   )rW   rK   init_valuesrY   s      r:   rO   InternVLVisionLayer.__init__  s    '-'E'E$08$V, '(8(8 9&:L:LRXRgRg h&v'7'789K9KQWQfQfg33[5::f>P>P3Q%Qaef[5::f>P>P3Q%Qaefzz&"<"<=r<   r[   c                    U R                  U R                  U5      5      u  p#U R                  U-  nX!-   nU R                  U5      nU R	                  U5      nU R                  U5      nU R                  b  U R                  U-  nXA-   nU$ rt   )r   r   r   r   r   r'   r   )rW   r[   attention_outputrn   layer_outputs        r:   rr   InternVLVisionLayer.forward-  s     #nn!!-0
  ==+;; )8 ++M:xx-||L1==$==<7L $3r<   )	r   r   r'   r   r   r   r   r   r   )rB   rC   rD   rE   r}   r    rO   r.   ru   tuplerr   rF   rv   rw   s   @r:   r   r     sZ    I>3 > >|| 
u||	uU\\5<<%?@	@ r<   r   c                   `   ^  \ rS rSrS\SS4U 4S jjrS\R                  S\\	-  4S jr
SrU =r$ )	InternVLVisionEncoderiI  rK   r   Nc                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf rM   )
rN   rO   rK   r1   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)rW   rK   irY   s      r:   rO   InternVLVisionEncoder.__init__J  sS    ]]vOgOgIh#iIhA$7$?Ih#ij
&+# $js   A&r[   c                 J    U R                    H  nU" U5      nM     [        US9$ )N)last_hidden_state)r   r
   )rW   r[   layer_modules      r:   rr   InternVLVisionEncoder.forwardP  s.     !JJL(7M ' +
 	
r<   )rK   r   r   )rB   rC   rD   rE   r    rO   r.   ru   r   r
   rr   rF   rv   rw   s   @r:   r   r   I  s<    ,3 , ,	
||	
 
	 	
 	
r<   r   c                      ^  \ rS rSr% \\S'   SrSrSrSr	S/r
SrSrSrSr\\S.r\R&                  " 5       U 4S	 j5       rS
rU =r$ )InternVLVisionPreTrainedModeli\  rK   internvl_visionr   )imagevideoTr   )r[   
attentionsc                 F  > [         TU ]  U5        [        U[        5      (       a|  [        R
                  " UR                  5        UR                  b   [        R
                  " UR                  5        UR                  b!  [        R
                  " UR                  5        gg[        U[        5      (       ak  [        R                  " UR                  U R                  R                  5        [        R                  " UR                  U R                  R                  5        gg)zInitialize the weightsN)rN   _init_weightsr   r   initzeros_r   r   r   r   	constant_r   rK   r   r   )rW   r!   rY   s     r:   r   +InternVLVisionPreTrainedModel._init_weightsn  s     	f%f677KK(()  ,F--.))5F667 6 344NN6??DKK,N,NONN6??DKK,N,NO 5r<   r@   )rB   rC   rD   rE   r    __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   rI   _can_record_outputsr.   no_gradr   rF   rv   rw   s   @r:   r   r   \  sl      )$O)&*#./N"& --
 ]]_P Pr<   r   c                      ^  \ rS rSrS\SS4U 4S jjrS r\\" SS9\	 SS	\
R                  S
\
R                  S-  S\\-  4S jj5       5       5       rSrU =r$ )InternVLVisionModeli}  rK   r   Nc                 8  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        UR                  (       a  [        R                  " 5       O([        R                  " UR                  UR                  S9U l        U R                  5         g )Nr   )rN   rO   rK   r   r   r   encoderuse_mean_poolingr1   rT   	LayerNormr   r   	layernorm	post_initrW   rK   rY   s     r:   rO   InternVLVisionModel.__init__  sm     26:,V4 $44BKKM",,vGYGY_e_t_t:u 	
 	r<   c                 .    U R                   R                  $ rt   )r   r   )rW   s    r:   get_input_embeddings(InternVLVisionModel.get_input_embeddings  s    ///r<   F)tie_last_hidden_statesr   r   c                     U R                  XS9nU R                  U5      nUS   nU R                  U5      n[        UUR                  UR
                  S9$ )z
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
)r   r   )r   r[   r   )r   r  r  r{   r[   r   )rW   r   r   r5   embedding_outputencoder_outputssequence_outputs          r:   rr   InternVLVisionModel.forward  s^      ??<?Y,,'78)!,..93-)77&11
 	
r<   )rK   r   r  r  rt   )rB   rC   rD   rE   r    rO   r  r   r   r   r.   ru   r   r   r{   rr   rF   rv   rw   s   @r:   r  r  }  su    3  0  E2UY
!LL
;@;K;Kd;R
	5	5
  3  
r<   r  c                       \ rS rSrSrSrg)InternVLPreTrainedModeli  )r   textr   r@   N)rB   rC   rD   rE   r  rF   r@   r<   r:   r"  r"    s    1r<   r"  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )InternVLMultiModalProjectori  rK   c                 0  > [         TU ]  5         [        R                  " UR                  R
                  [        SUR                  -  5      S-  -  5      U l        [        R                  " UR                  R
                  [        SUR                  -  5      S-  -  UR                  R
                  5      U l        [        UR                     U l        [        R                  " UR                  R
                  UR                  R
                  5      U l        g )Nr   r   )rN   rO   r1   r  vision_configr   r   downsample_ratior   Lineartext_configlinear_1r   projector_hidden_actactlinear_2r  s     r:   rO   $InternVLMultiModalProjector.__init__  s    ,,v';';'G'G#aRXRiRiNiJjnoJo'op		  ,,s1v7N7N3N/OST/TTV\VhVhVtVt
 &556		&"4"4"@"@&BTBTB`B`ar<   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ rt   )r   r+  r-  r.  )rW   image_featuresr[   s      r:   rr   #InternVLMultiModalProjector.forward  s@    7m4/m4r<   )r-  r   r+  r.  )	rB   rC   rD   rE   r   rO   rr   rF   rv   rw   s   @r:   r%  r%    s    b~ b r<   r%  c                       \ rS rSrSrg)InternVLModelOutputWithPasti  r@   NrA   r@   r<   r:   r4  r4    rG   r<   r4  c                      \ rS rSrSS\R
                  S\4S jjr\\	\
" SS9  SS\R                  S	\\\   -  \\   -  S-  S
\S-  S\\   S\\-  4
S jj5       5       5       r\	\
        SS\R(                  S-  S\R                  S-  S\R
                  S-  S\R(                  S-  S\S-  S\R                  S-  S	\\\   -  \\   -  S-  S
\S-  S\\   S\\-  4S jj5       5       rSrg)InternVLModeli  vision_featuresscale_factorc           
         UR                  5       u  p4pVXR-  S:w  d  XB-  S:w  a  [        S5      eUR                  X4[        XR-  5      [        Xb-  5      5      nUR	                  SSSS5      R                  5       nUR                  U[        XR-  5      [        XB-  5      [        XbS-  -  5      5      nUR	                  SSSS5      R                  5       nU$ )a  Perform pixel shuffle downsampling on vision features.

Args:
    vision_features (`torch.Tensor`):
        Input tensor of shape (batch_size, width, height, channels).
    scale_factor (`float`, *optional*, defaults to `0.5`):
        Factor by which to downsample. Default is 0.5, which halves the dimensions.

Returns:
    vision_features (`torch.Tensor`):
        Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
r   zKHeight and width must be divisible by scale_factor for proper downsampling.r   r   r   )r^   r   re   r   r   r4   )rW   r7  r8  rl   r   r   channelss          r:   pixel_shuffleInternVLModel.pixel_shuffle  s     />.B.B.D+
6 A%)=)Bjkk *..s6#893x?V;W
 *11!Q1=HHJ *..F12C8L4MsS[mn_nSoOp

 *11!Q1=HHJr<   zWObtains image last hidden states from the vision tower and apply multimodal projection.rx   Nr   vision_feature_layervision_feature_select_strategyr5   r   c                    UR                  U R                  S9nU R                  R                  nUS:w  a  SUS'   U R                  " SUSS.UD6nUS:X  a  UR
                  nOUR                  U   nUS:X  a  USS2SS2SS24   nUR                  S   n[        US	-  5      n	UR                  S
   n
UR                  XU	S5      nU R                  XuS9nUR                  U
SUR                  S   5      nU R                  U5      nXvl        U$ )z
pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
    The tensors corresponding to the input images.
vision_feature_layer (`int` or `list[int]`):
    Layer index or list of layer indices to extract features from.
)r   r)   Toutput_hidden_states)r   return_dictdefaultNr   r   r   )r8  r@   )r   r   rK   r(  vision_towerr   r[   r   r   rb   r;  multi_modal_projectorpooler_output)rW   r   r=  r>  r5   r(  vision_outputsr7  r:  feature_sizerl   s              r:   get_image_features InternVLModel.get_image_features  s1   $ $TZZ8;;772%-1F)***aRVaZ`a2%,>>O,::;OPO)Y6-aQh7O #((+8S=)$**1-
 *11*LZ\] ,,_,\ *11*b/BWBWXZB[\ 44_E'6$r<   	input_idsr%   position_idspast_key_valuesinputs_embedsc	                    US L US L-  (       a  [        S5      eUc  U R                  5       " U5      nUbc  U R                  UUUSS9R                  n
U
R	                  UR
                  UR                  5      n
U R                  XU
S9nUR                  X5      nU R                  " SUUUUS.U	D6n[        UR                  UR                  UR                  UR                  Ub  W
S9$ S S9$ )Nz:You must specify exactly one of input_ids or inputs_embedsT)r   r=  r>  rA  )rM  r1  )r%   rK  rL  rM  )r   rL  r[   r   image_hidden_statesr@   )r   r  rH  rE  r   devicer   get_placeholder_maskmasked_scatterlanguage_modelr4  r   rL  r[   r   )rW   rJ  r   r%   rK  rL  rM  r=  r>  r5   r1  special_image_maskoutputss                r:   rr   InternVLModel.forward  s1    -t";<YZZ  557	BM#!44)%9/M 	 5 
 m  ,..}/C/C]EXEXYN!%!:!:~ "; " *889K\M%% 
)%+'	

 
 +%77#33!//))2>2J
 	

 QU
 	
r<   r@   )r   )NN)NNNNNNNN)rB   rC   rD   rE   r.   ru   floatr;  r   r   r   FloatTensorr   liststrr   r   r   r   rH  
LongTensorr   r4  rr   rF   r@   r<   r:   r6  r6    s   !U\\ ! !F  n DH59	,'', "DIoS	9D@, ),d
	,
 +,, 
+	+,   
,\  .215.204(,26CG59-
##d*-
 ''$.-
 t+	-

 &&--
 -
 ((4/-
 "DIoS	9D@-
 ),d
-
 +,-
 
,	,-
  -
r<   r6  c                       \ rS rSrSrg)InternVLCausalLMOutputWithPastiM  r@   NrA   r@   r<   r:   r]  r]  M  rG   r<   r]  c                   (   ^  \ rS rSrU 4S jrSrU =r$ ) InternVLForConditionalGenerationiQ  c                  :   > [        5       R                  " S0 U D6  g)as  
Example:

```python
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText

>>> torch_device = "cuda"
>>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
>>> model = AutoModelForImageTextToText.from_pretrained(
...     "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device
... )

>>> messages = [
...     {
...         "role": "user",
...         "content": [
...             {
...                 "type": "image",
...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
...             },
...             {
...                 "type": "image",
...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
...             },
...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
...         ],
...     },
... ]

>>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
>>> generate_ids = model.generate(**inputs, max_new_tokens=200)
>>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
The images depict the Statue of Liberty and the Golden Gate Bridge.
```Nr@   )rN   rr   )super_kwargsrY   s    r:   rr   (InternVLForConditionalGeneration.forwardR  s    H 	','r<   r@   )rB   rC   rD   rE   rr   rF   rv   rw   s   @r:   r_  r_  Q  s    $( $(r<   r_  )r   r  r"  r6  r_  )r]   )Jcollections.abcr   r   dataclassesr   r.   torch.nnr1    r   r   activationsr   cache_utilsr   modeling_layersr	   modeling_outputsr
   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   utils.output_capturingr   clip.modeling_clipr   janus.modeling_janusr   llama.modeling_llamar   llava.modeling_llavar   r   r   r   r   configuration_internvlr   r    Moduleru   rW  r   r;   r>   rI   r{   r   r   r   r  r   r   r   r   r  r"  INTERNVL_INPUTS_DOCSTRINGr%  r4  r6  r]  r_  __all__r@   r<   r:   <module>rx     s     $ !   & !   9 K F & B B I 5 ( 7 /  I %II%<<% 
% <<	%
 LL4'% % S[%4	L 	3$2 3$l 
+E   BII  J[ryy [|	 	 3H
I+4 +\
BII 
& PO P P@ &
7 &
 &
R22 2 ! ")) $	": 	F
J F
R	%@ 	%('D %(Pr<   