
    Z j                     `   S SK rS SK Jr  S SKJr  S SKrS SKJr  SSKJ	r
  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJr  SSKJrJr  SSKJr  SSKJrJ r J!r!J"r"J#r#  SSK$J%r%J&r&  SSK'J(r(  SSK)J*r*  SSK+J,r,J-r-  \" S5       " S S\R\                  5      5       r/ SFS\R\                  S\R`                  S\R`                  S\R`                  S\R`                  S-  S\1S\1\2-  4S jjr3 " S  S!\R\                  5      r4\\!" S"S#9 " S$ S%\5      5       5       r5 " S& S'\R\                  5      r6 " S( S)\R\                  5      r7 " S* S+\R\                  5      r8\Rr                  \/S,.r: " S- S.\5      r; " S/ S0\R\                  5      r<\! " S1 S2\5      5       r=\! " S3 S4\=5      5       r>\! " S5 S6\5      5       r? " S7 S8\R\                  5      r@\\!" S9S#9 " S: S;\5      5       5       rA\!" S<S#9 " S= S>\?5      5       rB\!" S?S#9\ " S@ SA\5      5       5       rC\!" SBS#9 " SC SD\?\5      5       rD/ SEQrEg)G    N)Callable)	dataclass   )initialization)ACT2FN)Cache)GenerationMixin)use_kernel_forward_from_hub)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringtorch_compilable_check	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )	AutoModel   )InternVLConfigInternVLVisionConfigRMSNormc                   x   ^  \ rS rSrS
S\SS4U 4S jjjrS\R                  S\R                  4S jrS r	S	r
U =r$ )InternVLVisionRMSNorm-   epsreturnNc                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z4
InternVLVisionRMSNorm is equivalent to T5LayerNorm
N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizer#   	__class__s      /root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/internvl/modeling_internvl.pyr'   InternVLVisionRMSNorm.__init__/   s/     	ll5::k#:; #    hidden_statesc                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nr   T)keepdim)	dtypetor*   float32powmeanrsqrtr-   r,   )r.   r4   input_dtypevariances       r1   forwardInternVLVisionRMSNorm.forward7   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r3   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler,   shaper-   r.   s    r1   
extra_repr InternVLVisionRMSNorm.extra_repr>   s*    ))*+6$2G2G1HIIr3   )r-   r,   )gư>)__name__
__module____qualname____firstlineno__floatr'   r*   Tensorr@   rF   __static_attributes____classcell__r0   s   @r1   r!   r!   -   sB    $ $$ $ $;U\\ ;ell ;J Jr3   r!   modulequerykeyvalueattention_maskscalingdropoutc                 h   UnUn	[         R                  " XR                  SS5      5      U-  n
Ub  X-   n
[        R                  R                  U
SS9n
[        R                  R                  XU R                  S9n
[         R                  " X5      nUR                  SS5      R                  5       nX4$ )Nr   r   r6   dim)ptrainingr   )	r*   matmul	transposer(   
functionalsoftmaxrW   r\   
contiguous)rQ   rR   rS   rT   rU   rV   rW   kwargs
key_statesvalue_statesattn_weightsattn_outputs               r1   eager_attention_forwardrg   B   s     JL<<';';Aq'ABWLL!#4 ==((2(>L==((6??([L,,|:K''1-88:K$$r3   c                      ^  \ rS rSrSrS\4U 4S jjr SS\R                  S\R                  S-  S\	\
   4S	 jjrS
rU =r$ )InternVLVisionAttention\   z+Attention Class for InternVL Vision Encoderconfigc                 $  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l
        UR                  nUR                  nSU l        [        R                  " U R                  U R                  U R                  -  UR                   S9U l        [        R                  " U R                  U R                  U R                  -  UR                   S9U l        [        R                  " U R                  U R                  U R                  -  UR                   S9U l        [        R                  " U R                  U R                  5      U l        US:  a  [        R*                  " U5      O[        R,                  " 5       U l        U(       a  [/        U R                  5      O[        R,                  " 5       U l        U(       a  [/        U R                  5      U l        g [        R,                  " 5       U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fbiasr   )r&   r'   rk   r/   	embed_dimnum_attention_heads	num_headshead_dim
ValueErrorscaleattention_dropoutprojection_dropoutuse_qk_norm	is_causalr(   Linearattention_biasq_projk_projv_projprojection_layerDropoutIdentityr!   q_normk_norm)r.   rk   proj_dropoutqk_normr0   s       r1   r'    InternVLVisionAttention.__init___   s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta?F+DNN;BKKM?F+DNN;BKKMr3   Nr4   rU   rb   c                 p   UR                  5       u  pEnU R                  U5      nU R                  U5      nU R                  U5      n	U R	                  U5      nU R                  U5      nUR                  XEU R                  U R                  5      R                  SS5      nUR                  XEU R                  U R                  5      R                  SS5      nU	R                  XEU R                  U R                  5      R                  SS5      n	[        R                  " U R                  R                  [        5      n
U
" U UUU	U4U R                   (       d  SOU R"                  U R$                  SS.UD6u  pUR                  XEU R&                  5      nU R)                  U5      nU R+                  U5      nX4$ )Nr   r           F)rW   rV   rx   )sizer{   r|   r}   r   r   reshaperq   rr   r^   viewr   get_interfacerk   _attn_implementationrg   r\   ru   rt   ro   r~   rv   )r.   r4   rU   rb   
batch_sizeseq_len_query_statesrc   rd   attention_interfacerf   re   outputs                 r1   r@   InternVLVisionAttention.forward{   s    "/!3!3!5
Q{{=1[[/
{{=1{{<0[[,
#++JQUQ^Q^_iijkmno''
T^^T]][eefgijk
#((dnndmm\ffghjkl(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HJJ
%
 
%
! "))*t~~N&&{3((0##r3   )ru   rk   ro   rr   rx   r   r|   rq   rv   r~   r   r{   rt   r}   N)rH   rI   rJ   rK   __doc__r   r'   r*   rM   r   r   r@   rN   rO   rP   s   @r1   ri   ri   \   sS    5Z3 Z> /3'$||'$ t+'$ +,	'$ '$r3   ri   z7
    Class for outputs of [`InternVLVisionModel`].
    custom_introc                       \ rS rSrSrSrg)$InternVLVisionModelOutputWithPooling   a2  
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
    Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
    *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
    will be returned.
 N)rH   rI   rJ   rK   r   rN   r   r3   r1   r   r      s    r3   r   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )InternVLVisionPatchEmbeddings   z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
c                 H  > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pTUS   US   -  US   US   -  -  nUS   US   -  US   US   -  4nX l        X0l        X@l        X`l        Xpl        [        R                  " XEX3S9U l
        g )Nr   r   )kernel_sizestride)r&   r'   
image_size
patch_sizenum_channelsr/   num_patchespatch_shaper(   Conv2d
projection)	r.   rk   r   r   r   r/   r   r   r0   s	           r1   r'   &InternVLVisionPatchEmbeddings.__init__   s    !'!2!2F4E4EJ$*$7$79K9Kk!!}
15*Q-:VW=:XY!!}
15z!}
ST7UV$$(&&))L:ir3   pixel_valuesr$   c                    UR                   u  p#pEX0R                  :w  a  [        S5      eU R                  UR	                  U R                  R
                  R                  5      5      nUR                  S5      R                  SS5      nU$ )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   )	rD   r   rs   r   r9   r,   r8   flattenr^   )r.   r   r   r   heightwidth
embeddingss          r1   r@   %InternVLVisionPatchEmbeddings.forward   sz    2>2D2D/
&,,,w  __\__T__5K5K5Q5Q%RS
''*44Q:
r3   )r   r   r   r   r   r   )rH   rI   rJ   rK   r   r'   r*   rM   r@   rN   rO   rP   s   @r1   r   r      s.    j
ELL 
U\\ 
 
r3   r   c                      ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\	S	\	S\R                  4S
 jr
 SS\R                  S\R                  S-  S\R                  4S jjrSrU =r$ )InternVLVisionEmbeddings   z[
Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

rk   r$   Nc                 ^  > [         TU ]  5         [        R                  " [        R
                  " SSUR                  5      5      U l        UR                  (       a<  [        R                  " [        R
                  " SSUR                  5      5      U l	        OS U l	        [        U5      U l        UR                  U l        [        UR                  [        R                   R"                  5      (       a  UR                  OUR                  UR                  4U l        U R                  R$                  nUR&                  (       a?  [        R                  " [        R
                  " SUS-   UR                  5      5      U l        OS U l        [        R*                  " UR,                  5      U l        g )Nr   )r&   r'   r(   r)   r*   zerosr/   	cls_tokenuse_mask_token
mask_tokenr   patch_embeddingsr   
isinstancer   collectionsabcIterabler    use_absolute_position_embeddingsposition_embeddingsr   hidden_dropout_probrW   )r.   rk   r   r0   s      r1   r'   !InternVLVisionEmbeddings.__init__   s'   ekk!Q8J8J&KL   ll5;;q!V=O=O+PQDO"DO =f E ++ &++[__-E-EFF ##V%6%67 	
 ++7722')||EKK;QR?TZTfTf4g'hD$'+D$zz&"<"<=r3   r   r   r   c                    UR                   S   S-
  nU R                  R                   S   S-
  n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                   S   nX R
                  S   -  n	X0R
                  S   -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU5      n[        R                  " Xg4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   Nr6   r         ?r   r   bicubicF)r   modealign_cornersrY   )rD   r   r*   jit
is_tracingr   r   r   permuter(   r_   interpolater   cat)r.   r   r   r   r   num_positionsclass_pos_embedpatch_pos_embedrZ   
new_height	new_widthsqrt_num_positionss               r1   interpolate_pos_encoding1InternVLVisionEmbeddings.interpolate_pos_encoding   s]    !&&q)A-0066q9A= yy##%%+*F6?+++221bqb59221ab59r"q11
__Q//	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr3   r   bool_masked_posc                    UR                   u    p4nU R                  U5      nUR                  5       u  pxnUbI  U R                  R	                  XxS5      n	UR                  S5      R                  U	5      n
USU
-
  -  X-  -   nU R                  R	                  USS5      n[        R                  " X4SS9nU R                  b  X`R                  XdU5      -   nU R                  U5      nU$ )Nr6   r   rY   )rD   r   r   r   expand	unsqueezetype_asr   r*   r   r   r   rW   )r.   r   r   r   r   r   r   r   r   mask_tokensw
cls_tokenss               r1   r@    InternVLVisionEmbeddings.forward  s    
 +001e**<8
!+!2
Q&//00bIK))"-55kBA#q1u-?J^^**:r2>
YY
7Q?
##/#&C&CJX]&^^J\\*-
r3   )r   rW   r   r   r   r   r   r   )rH   rI   rJ   rK   r   r   r'   r*   rM   intr   
BoolTensorr@   rN   rO   rP   s   @r1   r   r      s    
>3 > >,&D5<< &D &DUX &D]b]i]i &DV 48ll ))D0 
	 r3   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )InternVLVisionMLPi7  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g r   )r&   r'   rk   r   
hidden_actactivation_fnr(   ry   r/   intermediate_sizefc1fc2r.   rk   r0   s     r1   r'   InternVLVisionMLP.__init__8  sb    #F$5$5699V//1I1IJ99V55v7I7IJr3   r4   r$   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   )r.   r4   s     r1   r@   InternVLVisionMLP.forward?  s4    /**=9/r3   )r   rk   r   r   )
rH   rI   rJ   rK   r'   r*   rM   r@   rN   rO   rP   s   @r1   r   r   7  s)    KU\\ ell  r3   r   )
layer_normrms_normc                      ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\	\R                     \	\R                  \R                  4   -  4S jr
S	rU =r$ )
InternVLVisionLayeriI  z?This corresponds to the Block class in the timm implementation.rk   r$   Nc                   > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        UR                     " UR                  UR                  S9U l        [        UR                     " UR                  UR                  S9U l        UR                  n[        R                   " U["        R$                  " UR                  5      -  SS9U l        [        R                   " U["        R$                  " UR                  5      -  SS9U l        [        R*                  " UR,                  5      U l        g )Nr   r#   T)requires_grad)r&   r'   chunk_size_feed_forwardseq_len_dimri   	attentionr   mlpNORM2FN	norm_typer/   layer_norm_epslayernorm_beforelayernorm_afterlayer_scale_init_valuer(   r)   r*   r+   lambda_1lambda_2r   r   rW   )r.   rk   init_valuesr0   s      r1   r'   InternVLVisionLayer.__init__L  s    '-'E'E$08$V, '(8(8 9&:L:LRXRgRg h&v'7'789K9KQWQfQfg33[5::f>P>P3Q%Qaef[5::f>P>P3Q%Qaefzz&"<"<=r3   r4   c                    U R                  U R                  U5      5      u  p#U R                  U-  nX!-   nU R                  U5      nU R	                  U5      nU R                  U5      nU R                  b  U R                  U-  nXA-   nU$ r   )r   r   r   r   r   rW   r   )r.   r4   attention_outputr   layer_outputs        r1   r@   InternVLVisionLayer.forward[  s     #nn!!-0
  ==+;; )8 ++M:xx-||L1==$==<7L $3r3   )	r   r   rW   r   r   r   r   r   r   )rH   rI   rJ   rK   r   r   r'   r*   rM   rC   r@   rN   rO   rP   s   @r1   r   r   I  sZ    I>3 > >|| 
u||	uU\\5<<%?@	@ r3   r   c                   `   ^  \ rS rSrS\SS4U 4S jjrS\R                  S\\	-  4S jr
SrU =r$ )	InternVLVisionEncoderiw  rk   r$   Nc                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
r&   r'   rk   r(   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r.   rk   ir0   s      r1   r'   InternVLVisionEncoder.__init__x  sS    ]]vOgOgIh#iIhA$7$?Ih#ij
&+# $js   A&r4   c                 J    U R                    H  nU" U5      nM     [        US9$ )N)last_hidden_state)r
  r   )r.   r4   layer_modules      r1   r@   InternVLVisionEncoder.forward~  s.     !JJL(7M ' +
 	
r3   )rk   r  r
  )rH   rI   rJ   rK   r   r'   r*   rM   rC   r   r@   rN   rO   rP   s   @r1   r  r  w  s<    ,3 , ,	
||	
 
	 	
 	
r3   r  c                      ^  \ rS rSr% \\S'   SrSrSrSr	S/r
SrSrSrSr\\S.r\R&                  " 5       U 4S	 j5       rS
rU =r$ )InternVLVisionPreTrainedModeli  rk   internvl_visionr   )imagevideoTr   )r4   
attentionsc                 F  > [         TU ]  U5        [        U[        5      (       a|  [        R
                  " UR                  5        UR                  b   [        R
                  " UR                  5        UR                  b!  [        R
                  " UR                  5        gg[        U[        5      (       ak  [        R                  " UR                  U R                  R                  5        [        R                  " UR                  U R                  R                  5        gg)zInitialize the weightsN)r&   _init_weightsr   r   initzeros_r   r   r   r   	constant_r   rk   r   r   )r.   rQ   r0   s     r1   r  +InternVLVisionPreTrainedModel._init_weights  s     	f%f677KK(()  ,F--.))5F667 6 344NN6??DKK,N,NONN6??DKK,N,NO 5r3   r   )rH   rI   rJ   rK   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   ri   _can_record_outputsr*   no_gradr  rN   rO   rP   s   @r1   r  r    sl      )$O)&*#./N"& --
 ]]_P Pr3   r  c                      ^  \ rS rSrS\SS4U 4S jjrS r\\" SS9\	 SS	\
R                  S
\
R                  S-  S\\-  4S jj5       5       5       rSrU =r$ )InternVLVisionModeli  rk   r$   Nc                 8  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        UR                  (       a  [        R                  " 5       O([        R                  " UR                  UR                  S9U l        U R                  5         g )Nr   )r&   r'   rk   r   r   r  encoderuse_mean_poolingr(   r   	LayerNormr/   r   	layernorm	post_initr   s     r1   r'   InternVLVisionModel.__init__  sm     26:,V4 $44BKKM",,vGYGY_e_t_t:u 	
 	r3   c                 .    U R                   R                  $ r   )r   r   rE   s    r1   get_input_embeddings(InternVLVisionModel.get_input_embeddings  s    ///r3   F)tie_last_hidden_statesr   r   c                     U R                  XS9nU R                  U5      nUS   nU R                  U5      n[        UUR                  UR
                  S9$ )z
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
)r   r   )r  r4   r  )r   r-  r0  r   r4   r  )r.   r   r   rb   embedding_outputencoder_outputssequence_outputs          r1   r@   InternVLVisionModel.forward  s^      ??<?Y,,'78)!,..93-)77&11
 	
r3   )rk   r   r-  r0  r   )rH   rI   rJ   rK   r   r'   r4  r   r   r   r*   rM   r   rC   r   r@   rN   rO   rP   s   @r1   r+  r+    su    3  0  E2UY
!LL
;@;K;Kd;R
	5	5
  3  
r3   r+  c                   D    \ rS rSr% \\S'   SrSrSrSr	Sr
SrSrSrSrSrg)	InternVLPreTrainedModeli  rk   model)r  textr  Tpast_key_valuesr   N)rH   rI   rJ   rK   r   r  r  r!  r"  _skip_keys_device_placementr%  r$  _can_compile_fullgraphr&  r'  rN   r   r3   r1   r=  r=    s=    1&*#"3N!"&r3   r=  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )InternVLMultiModalProjectori  rk   c                 0  > [         TU ]  5         [        R                  " UR                  R
                  [        SUR                  -  5      S-  -  5      U l        [        R                  " UR                  R
                  [        SUR                  -  5      S-  -  UR                  R
                  5      U l        [        UR                     U l        [        R                  " UR                  R
                  UR                  R
                  5      U l        g )Nr   r   )r&   r'   r(   r/  vision_configr/   r   downsample_ratior   ry   text_configlinear_1r   projector_hidden_actactlinear_2r   s     r1   r'   $InternVLMultiModalProjector.__init__  s    ,,v';';'G'G#aRXRiRiNiJjnoJo'op		  ,,s1v7N7N3N/OST/TTV\VhVhVtVt
 &556		&"4"4"@"@&BTBTB`B`ar3   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   rI  rK  rL  )r.   image_featuresr4   s      r1   r@   #InternVLMultiModalProjector.forward  s@    7m4/m4r3   )rK  r   rI  rL  )	rH   rI   rJ   rK   r   r'   r@   rN   rO   rP   s   @r1   rD  rD    s    b~ b r3   rD  zM
    Base class for InternVL outputs, with hidden states and attentions.
    c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)InternVLModelOutputWithPasti  a  
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nimage_hidden_statesr   )
rH   rI   rJ   rK   r   rS  r*   FloatTensorr  rN   r   r3   r1   rR  rR    s    	 59**T18r3   rR  zx
    The InternVL model which consists of a vision backbone and a language model, without a language modeling head.
    c                   Z  ^  \ rS rSrS\4U 4S jjrS rS r\\	\
" SS9  SS	\R                  S
\\\   -  \\   -  S-  S\S-  S\\   S\\-  4
S jj5       5       5       rS\R*                  S\R                  S\R                  4S jr\	\
        SS\R*                  S-  S	\R                  S-  S\R.                  S-  S\R*                  S-  S\S-  S\R                  S-  S
\\\   -  \\   -  S-  S\S-  S\\   S\\-  4S jj5       5       rSS\R.                  S\4S jjrSrU =r$ )InternVLModeli  rk   c                    > [         TU ]  U5        [        R                  " UR                  5      U l        [        U5      U l        [        R                  " UR                  5      U l	        U R                  5         g r   )r&   r'   r   from_configrF  vision_towerrD  multi_modal_projectorrH  language_modelr1  r   s     r1   r'   InternVLModel.__init__  sY     %11&2F2FG%@%H"'33F4F4FGr3   c                 6    U R                   R                  5       $ r   )r[  r4  rE   s    r1   r4  "InternVLModel.get_input_embeddings  s    ""7799r3   c                 :    U R                   R                  U5        g r   )r[  set_input_embeddingsr.   rT   s     r1   r`  "InternVLModel.set_input_embeddings  s    007r3   zWObtains image last hidden states from the vision tower and apply multimodal projection.r   Nr   vision_feature_layervision_feature_select_strategyrb   r$   c                    UR                  U R                  S9nU R                  R                  nUS:w  a  SUS'   U R                  " SUSS.UD6nUS:X  a  UR
                  nOUR                  U   nUS:X  a  USS2SS2SS24   nUR                  S   n[        US	-  5      n	UR                  S
   n
UR                  XU	S5      nU R                  XuS9nUR                  U
SUR                  S   5      nU R                  U5      nXvl        U$ )z
pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
    The tensors corresponding to the input images.
vision_feature_layer (`int` or `list[int]`):
    Layer index or list of layer indices to extract features from.
)r8   r6   Toutput_hidden_states)r   return_dictdefaultNr   r   r   )scale_factorr   )r9   r8   rk   rG  rY  r  r4   rD   r   r   pixel_shufflerZ  pooler_output)r.   r   rc  rd  rb   rG  vision_outputsvision_featureschannelsfeature_sizer   s              r1   get_image_features InternVLModel.get_image_features   s1   $ $TZZ8;;772%-1F)***aRVaZ`a2%,>>O,::;OPO)Y6-aQh7O #((+8S=)$**1-
 *11*LZ\] ,,_,\ *11*b/BWBWXZB[\ 44_E'6$r3   	input_idsinputs_embedsrO  c           	      F   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S   UR                  S   -  nUR                  S5      R                  U5      R                  UR                  5      n[        X$   R                  5       UR                  5       :H  SU SU 35        U$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
)r8   devicer6   r   r   z6Image features and image tokens do not match, tokens: z, features: )r4  r*   tensorrk   image_token_idlongru  allsumrD   r   	expand_asr9   r   numel)r.   rr  rs  rO  special_image_maskn_image_tokensn_image_featuress          r1   get_placeholder_mask"InternVLModel.get_placeholder_maskS  s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno-3359M9M9OOD^DTT`aq`rs	
 "!r3   rU   position_idsr@  c	                    US L US L-  (       a  [        S5      eUc  U R                  5       " U5      nUbc  U R                  UUUSS9R                  n
U
R	                  UR
                  UR                  5      n
U R                  XU
S9nUR                  X5      nU R                  " SUUUUS.U	D6n[        UR                  UR                  UR                  UR                  Ub  W
S9$ S S9$ )Nz:You must specify exactly one of input_ids or inputs_embedsT)r   rc  rd  rg  )rs  rO  )rU   r  r@  rs  )r  r@  r4   r  rS  r   )rs   r4  rp  rk  r9   ru  r8   r  masked_scatterr[  rR  r  r@  r4   r  )r.   rr  r   rU   r  r@  rs  rc  rd  rb   rO  r}  outputss                r1   r@   InternVLModel.forwardk  s1    -t";<YZZ  557	BM#!44)%9/M 	 5 
 m  ,..}/C/C]EXEXYN!%!:!:~ "; " *889K\M%% 
)%+'	

 
 +%77#33!//))2>2J
 	

 QU
 	
r3   rm  ri  c           
         UR                  5       u  p4pVXR-  S:w  d  XB-  S:w  a  [        S5      eUR                  X4[        XR-  5      [        Xb-  5      5      nUR	                  SSSS5      R                  5       nUR                  U[        XR-  5      [        XB-  5      [        XbS-  -  5      5      nUR	                  SSSS5      R                  5       nU$ )a  Perform pixel shuffle downsampling on vision features.

Args:
    vision_features (`torch.Tensor`):
        Input tensor of shape (batch_size, width, height, channels).
    scale_factor (`float`, *optional*, defaults to `0.5`):
        Factor by which to downsample. Default is 0.5, which halves the dimensions.

Returns:
    vision_features (`torch.Tensor`):
        Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
r   zKHeight and width must be divisible by scale_factor for proper downsampling.r   r   r   )r   rs   r   r   r   ra   )r.   rm  ri  r   r   r   rn  s          r1   rj  InternVLModel.pixel_shuffle  s     />.B.B.D+
6 A%)=)Bjkk *..s6#893x?V;W
 *11!Q1=HHJ *..F12C8L4MsS[mn_nSoOp

 *11!Q1=HHJr3   )r[  rZ  rY  NN)NNNNNNNN)r   )rH   rI   rJ   rK   r   r'   r4  r`  r   r   r   r*   rT  r   liststrr   r   rC   r   rp  
LongTensorr  rM   r   rR  r@   rL   rj  rN   rO   rP   s   @r1   rV  rV    s   ~ :8  n DH59	,'', "DIoS	9D@, ),d
	,
 +,, 
+	+,   
,\"))":?:K:K"]b]n]n"0  .215.204(,26CG59-
##d*-
 ''$.-
 t+	-

 &&--
 -
 ((4/-
 "DIoS	9D@-
 ),d
-
 +,-
 
,	,-
  -
^!U\\ ! ! !r3   rV  zT
    Base class for InternVL causal language model (or autoregressive) outputs.
    c                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S	'   S
rg)InternVLCausalLMOutputWithPasti  a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nlosslogitsr@  r4   r  rS  r   )rH   rI   rJ   rK   r   r  r*   rT  r  r  r@  r   r4   rC   r  rS  rN   r   r3   r1   r  r    s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r3   r  zV
    The INTERNVL model which consists of a vision backbone and a language model.
    c                   h  ^  \ rS rSrSS0rS\4U 4S jjrS rS rS\	R                  4S	 jr\  SS\R                  S\\\   -  \\   -  S
-  S\S
-  S\\   S\\-  4
S jj5       r\\           SS\R0                  S
-  S\R                  S
-  S\R2                  S
-  S\R0                  S
-  S\S
-  S\R                  S
-  S\\\   -  \\   -  S
-  S\S
-  S\R0                  S
-  S\\R2                  -  S\R2                  S
-  S\\   S\\-  4S jj5       5       r      SU 4S jjrSrU =r$ ) InternVLForConditionalGenerationi  zlm_head.weightz(model.language_model.embed_tokens.weightrk   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g )NFrm   )r&   r'   rV  r>  r(   ry   rH  r/   
vocab_sizelm_headr1  r   s     r1   r'   )InternVLForConditionalGeneration.__init__  sS     "6*
yy!3!3!?!?ASASA^A^ejkr3   c                 6    U R                   R                  5       $ r   )r>  r4  rE   s    r1   r4  5InternVLForConditionalGeneration.get_input_embeddings  s    zz..00r3   c                 :    U R                   R                  U5        g r   )r>  r`  ra  s     r1   r`  5InternVLForConditionalGeneration.set_input_embeddings  s    

''.r3   r$   c                     U R                   $ r   )r  rE   s    r1   get_output_embeddings6InternVLForConditionalGeneration.get_output_embeddings  s    ||r3   Nr   rc  rd  rb   c                 B    U R                   R                  " SUUUS.UD6$ )N)r   rc  rd  r   )r>  rp  )r.   r   rc  rd  rb   s        r1   rp  3InternVLForConditionalGeneration.get_image_features  s3     zz,, 
%!5+I
 	
 	
r3   rr  rU   r  r@  rs  labelslogits_to_keepimage_sizesc                    U R                   " SUUUUUUUUUS.	UD6nUS   n[        U
[        5      (       a  [        U
* S5      OU
nU R	                  USS2USS24   5      nSnU	b3  U R
                  " SUXR                  R                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                  S9$ )as  
Example:

```python
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText

>>> torch_device = "cuda"
>>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
>>> model = AutoModelForImageTextToText.from_pretrained(
...     "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device
... )

>>> messages = [
...     {
...         "role": "user",
...         "content": [
...             {
...                 "type": "image",
...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
...             },
...             {
...                 "type": "image",
...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
...             },
...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
...         ],
...     },
... ]

>>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
>>> generate_ids = model.generate(**inputs, max_new_tokens=200)
>>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
The images depict the Statue of Liberty and the Golden Gate Bridge.
```)	rr  r   rU   r  r@  rs  rc  rd  r  r   N)r  r  r  )r  r  r@  r4   r  rS  r   )r>  r   r   slicer  loss_functionrk   rH  r  r  r@  r4   r  rS  )r.   rr  r   rU   r  r@  rs  rc  rd  r  r  r  rb   r  r4   slice_indicesr  r  s                     r1   r@   (InternVLForConditionalGeneration.forward  s    h ** 
%)%+'!5+I#
 
  
8B>SV8W8W~ot4]kmA}a,?@A%% f9P9P9[9[_eD .#33!//)) ' ; ;
 	
r3   c           	      z   > [         T
U ]  " U4UUUUUS.UD6n	U(       d  UR                  SS5      (       d  XIS'   U	$ )N)r@  rs  rU   r  is_first_iteration	use_cacheTr   )r&   prepare_inputs_for_generationget)r.   rr  r@  rs  r   rU   r  r  rb   model_inputsr0   s             r1   r  >InternVLForConditionalGeneration.prepare_inputs_for_generationY  sZ     w<
+'))1
 
 VZZT%B%B
 ,8(r3   )r  r>  r  )NNNNNNNNNr   N)NNNNNF) rH   rI   rJ   rK   _tied_weights_keysr   r'   r4  r`  r(   Moduler  r   r*   rT  r   r  r  r   r   rC   r   rp  r   r  rM   r   r  r@   r  rN   rO   rP   s   @r1   r  r    s    +,VW~ 1/ryy   DH59	
''
 "DIoS	9D@
 ),d
	

 +,
 
+	+
 
  .215.204(,26CG59*.-.+/Q
##d*Q
 ''$.Q
 t+	Q

 &&-Q
 Q
 ((4/Q
 "DIoS	9D@Q
 ),d
Q
   4'Q
 ell*Q
 \\D(Q
 +,Q
 
/	/Q
  Q
l   r3   r  )r  r+  r=  rV  r  )r   )Fcollections.abcr   r   dataclassesr   r*   torch.nnr(    r   r  activationsr   cache_utilsr   
generationr	   integrationsr
   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   r   utils.output_capturingr   autor   configuration_internvlr   r   r  r!   rM   rL   r   rg   ri   r   r   r   r   r/  r   r   r  r  r+  r=  rD  rR  rV  r  r  __all__r   r3   r1   <module>r     s  ,  $ !   & !   ) 7 9 d d F & g g I 5  H Y'JBII J (J6 %II%<<% 
% <<	%
 LL4'% % S[%4F$bii F$R 
+E   BII  J[ryy [|		  3H
I+4 +\
BII 
& PO P P@ &
7 &
 &
R 'o ' '")) $ 
9"9 9 9 
l+ l
l^ 
 9[ 9 90 
T'> T
Tnr3   