
    Z j                        S r SSKrSSKJr  SSKJr  SSKrSSKJr  SSKJ	r
  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJr  SSKJr  SSKJrJrJrJr  SSKJrJr  SSK J!r!  SSK"J#r#  SSK$J%r%J&r&J'r'J(r(J)r)  SSK*J+r+  SSK,J-r-  SSK.J/r/  SSK0J1r1J2r2  \(Rf                  " \45      r5\\'" SS9 " S S\%5      5       5       r6S\Rn                  S\4S jr8\+" SS S!S"9  S\S#\S!\Rn                  S$\Rn                  S-  S%\S-  S&\Rn                  S-  S'\Rn                  S-  S(\9S-  S\:4S) jj5       r; " S* S+\Rx                  5      r= " S, S-\Rx                  5      r> " S. S/\Rx                  5      r?S0\>0r@ " S1 S2\Rx                  5      rA " S3 S4\Rx                  5      rB " S5 S6\Rx                  5      rC " S7 S8\5      rD " S9 S:\Rx                  5      rE\' " S; S<\5      5       rF " S= S>\Rx                  5      rG " S? S@\Rx                  5      rH S]SA\Rx                  SB\Rn                  SC\Rn                  SD\Rn                  S$\Rn                  S-  SE\ISF\I4SG jjrJ " SH SI\Rx                  5      rK " SJ SK\5      rL " SL SM\Rx                  5      rM " SN SO\Rx                  5      rN\'" SPS9 " SQ SR\F5      5       rO " SS ST\Rx                  5      rP\'" SUS9 " SV SW\F5      5       rQ\'" SXS9 " SY SZ\F\5      5       rR/ S[QrSg)^zPyTorch GIT model.    N)Callable)	dataclass)nn   )initialization)ACT2FN)CacheDynamicCache)PreTrainedConfig)GenerationMixin)create_masks_for_generate)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringlogging	torch_int)deprecate_kwarg)merge_with_config_defaults)capture_outputs   )	GitConfigGitVisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   S	rg)
GitVisionModelOutput8   z
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
    The image embeddings obtained by applying the projection layer to the pooler_output.
Nimage_embedslast_hidden_state.hidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r&   torchFloatTensor__annotations__r'   r(   tupler)   __static_attributes__r*       u/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/git/modeling_git.pyr$   r$   8   sr    
 .2L%##d*126u((4/6:>M5**C/047>7;Je'',-4;r5   r$   	group_idsreturnc           
      T   ^  S[         S[         S[         S[         S[        4
U 4S jjnU$ )aY  
This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
not start and end indices.
Args:
    group_ids (`torch.Tensor`):
        A tensor of shape `(bs, len)` assigning each token to a vision group. Tokens with the same group
        come from the same input image. Text is denoted by `-1`.
	batch_idxhead_idxq_idxkv_idxr8   c                    > T	R                   S   nUR                  US-
  S9nUR                  US-
  S9nT	X4   nT	X4   n[        R                  " X$:  US5      n[        R                  " X4:  US5      nXx:H  US:  -  $ )Nr   )maxr   )shapeclampr0   where)
r:   r;   r<   r=   
seq_lengthq_idx_clampedkv_idx_clampedq_groupkv_groupr7   s
            r6   
inner_mask0token_type_ids_mask_function.<locals>.inner_maskV   s    __R(
 
Q7*q.9 I45Y67++e0'2>;;v2HbA#155r5   )intbool)r7   rI   s   ` r6   token_type_ids_mask_functionrM   L   s3    6c 6S 6 6c 6d 6 r5   input_embedsz5.6.0inputs_embeds)versionnew_nameconfigattention_maskpast_key_valuesposition_idstoken_type_idsis_first_iterationc                 v   U R                  5       UUUUS.nUb  US:H  R                  UR                  5      n	[        R                  R                  U	SSS9SS2SS24   n
X) -  n[        R                  " UR                  5       SS9S-
  n[        R                  " XS5      n[        U5      US	'   [        S
0 UD6$ )a  
Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
for all kinds of forward passes. Gemma3 uses a bidirectional mask for images.

Uses `pixel_values` as an optional input to disambiguate edge cases.
)rR   rO   rS   rT   rU   Nr   )r   r   r   )valuer?   dimor_mask_functionr*   )get_text_configtodevicer   
functionalpadr0   cumsumrK   rC   rM   r   )rR   rO   rS   rT   rU   rV   rW   kwargsmask_kwargsis_imageis_previous_imagenew_image_startr7   s                r6   create_causal_mask_mappingrh   g   s    & ((*&(*$K ! #a'++M,@,@AMM--ha-HCRCP"%77LL!4!4!6A>B	KKR8	*Fy*Q&'$3{33r5   c                      ^  \ rS rSrSrU 4S jr    SS\R                  S-  S\R                  S-  S\R                  S-  S\	S	\R                  4
S
 jjrSrU =r$ )GitEmbeddings   z;Construct the embeddings from word and position embeddings.c                   > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR
                  UR                  S9U l
        [        R                  " UR                  5      U l        U R                  S[         R"                  " UR                  5      R%                  S5      SS9  g )N)padding_idxepsrU   r   r?   F
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferr0   arangeexpandselfrR   	__class__s     r6   rt   GitEmbeddings.__init__   s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXej 	 	
r5   N	input_idsrU   rO   past_key_values_lengthr8   c                 .   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2XFU-   24   nUc  U R                  U5      nOUnU R                  U5      nXx-  nU R	                  U5      nU R                  U5      nU$ )Nr?   r   )sizerU   ry   r{   r|   r   )	r   r   rU   rO   r   input_shaperD   
embeddingsr{   s	            r6   forwardGitEmbeddings.forward   s      #..*K',,.s3K ^
,,Q0FVlIl0l-lmL --i8J&J"66|D)
^^J/
\\*-
r5   )r|   r   r{   ry   )NNNr   )r+   r,   r-   r.   r/   rt   r0   
LongTensorr1   rK   Tensorr   r4   __classcell__r   s   @r6   rj   rj      sx    E

 .20426&'##d* &&- ((4/	
 !$ 
 r5   rj   c                      ^  \ rS rSrSU 4S jjr  SS\R                  S\R                  S-  S\S-  S\	\
   S\\R                     4
S	 jjrS
rU =r$ )GitSelfAttention   Nc                 &  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eX l        Uc-  [        R                  SU R                  R                   S35        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        UR                  R                  UR                  R                   -  S-  S	-   5      U l        UR$                  b  U =R"                  UR$                  -  sl        [&        R(                  " UR                  U R                  5      U l        [&        R(                  " UR                  U R                  5      U l        [&        R(                  " UR                  U R                  5      U l        [&        R0                  " UR2                  5      U l        g )
Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.   r   )rs   rt   rw   num_attention_headshasattr
ValueError	layer_idxloggerwarning_oncer   r+   rK   attention_head_sizeall_head_sizevision_config
image_size
patch_sizeimage_patch_tokensnum_image_with_embeddingr   LinearquerykeyrY   r~   attention_probs_dropout_probr   r   rR   r   r   s      r6   rt   GitSelfAttention.__init__   s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8  # !8!8 9 :, , $*#=#= #&v'9'9F<V<V'V#W !558P8PP"%v';';'F'FI]I]IhIh'hmn&nqr&r"s**6##v'F'FF#YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EFr5   r(   rS   rT   rc   r8   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	Ub  UR                  XU R                  5      u  p[        R                  " XxR	                  SS5      5      n
U
[        R                  " U R                  5      -  n
Ub  X-   n
[        R                  R                  U
SS9nU R!                  U5      n[        R                  " X5      nUR#                  SSSS5      R%                  5       nUR'                  5       S S U R(                  4-   nUR                  U5      nX4$ )Nr?   r   r   rZ   r   r   )rA   r   r   view	transposer   rY   updater   r0   matmulmathsqrtr   r`   softmaxr   permute
contiguousr   r   )r   r(   rS   rT   rc   r   hidden_shapequery_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapes                 r6   r   GitSelfAttention.forward   s    $))#2.CCbC$*B*BCjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR&%4%;%;ITXTbTb%c"I !<<5H5HR5PQ+dii8P8P.QQ%/@ --//0@b/I ,,7_B%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC--r5   )	r   r   r   r   r   r   r   r   rY   NNNr+   r,   r-   r.   rt   r0   r   r1   r	   r   r   r3   r   r4   r   r   s   @r6   r   r      sm    G> 48(,	%.||%. ))D0%. 	%.
 +,%. 
u||	%. %.r5   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )GitSelfOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nrn   )rs   rt   r   r   rw   denser|   r}   r~   r   r   r   s     r6   rt   GitSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r5   r(   input_tensorr8   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   r|   r   r(   r   s      r6   r   GitSelfOutput.forward
  5    

=1]3}'CDr5   r|   r   r   
r+   r,   r-   r.   rt   r0   r   r   r4   r   r   s   @r6   r   r     6    >U\\  RWR^R^  r5   r   eagerc                      ^  \ rS rSrSU 4S jjr  SS\R                  S\R                  S-  S\S-  S\	\
   S\\R                     4
S	 jjrS
rU =r$ )GitAttentioni  Nc                 z   > [         TU ]  5         [        UR                     " XS9U l        [        U5      U l        g )Nr   )rs   rt   GIT_SELF_ATTENTION_CLASSES_attn_implementationr   r   outputr   s      r6   rt   GitAttention.__init__  s1    .v/J/JKFh	#F+r5   r(   rS   rT   rc   r8   c                 V    U R                   " UUU40 UD6u  pVU R                  XQ5      nU$ r   )r   r   )r   r(   rS   rT   rc   attn_output_attention_outputs           r6   r   GitAttention.forward  s>     
 	
  ;;{Br5   )r   r   r   r   r   r   s   @r6   r   r     sl    , 48(,	 ||  ))D0  	 
 +,  
u||	   r5   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )GitIntermediatei.  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )rs   rt   r   r   rw   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr   s     r6   rt   GitIntermediate.__init__/  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r5   r(   r8   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   r   r(   s     r6   r   GitIntermediate.forward7  s&    

=100?r5   r   r   r   s   @r6   r   r   .  s(    9U\\ ell  r5   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )	GitOutputi>  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )rs   rt   r   r   r   rw   r   r|   r}   r~   r   r   r   s     r6   rt   GitOutput.__init__?  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r5   r(   r   r8   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   s      r6   r   GitOutput.forwardE  r   r5   r   r   r   s   @r6   r   r   >  r   r5   r   c                      ^  \ rS rSrSU 4S jjr  SS\R                  S\R                  S-  S\S-  S\	\
   S\\R                     4
S	 jjrS
 rSrU =r$ )GitLayeriL  Nc                    > [         TU ]  5         UR                  U l        SU l        [	        XS9U l        [        U5      U l        [        U5      U l	        g )Nr   r   )
rs   rt   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   r   s      r6   rt   GitLayer.__init__M  sI    '-'E'E$%fB+F3'r5   r(   rS   rT   rc   r8   c                     U R                   " UU4SU0UD6n[        U R                  U R                  U R                  U5      nU$ )NrT   )r   r   feed_forward_chunkr   r   )r   r(   rS   rT   rc   r   layer_outputs          r6   r   GitLayer.forwardU  s]      >>
 ,
 	
 1##T%A%A4CSCSUe
 r5   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r   r   )r   r   intermediate_outputr   s       r6   r   GitLayer.feed_forward_chunkh  s)    "//0@A{{#6Ir5   )r   r   r   r   r   r   r   )r+   r,   r-   r.   rt   r0   r   r1   r	   r   r   r3   r   r   r4   r   r   s   @r6   r   r   L  sq    ( 48(,	|| ))D0 	
 +, 
u||	& r5   r   c                      ^  \ rS rSrU 4S jr   SS\R                  S\R                  S-  S\S-  S\	S-  S\
\   S	\4S
 jjrSrU =r$ )
GitEncoderin  c           	         > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        SU l	        g s  snf NF)
rs   rt   rR   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r   rR   ir   s      r6   rt   GitEncoder.__init__o  sR    ]]vG_G_A`#aA`AHV$7A`#ab
&+# $b   A&Nr(   rS   rT   	use_cacherc   r8   c                 R    U R                    H  nU" UUU40 UD6nM     [        UUS9$ )Nr'   rT   )r  r   )r   r(   rS   rT   r  rc   layer_modules          r6   r   GitEncoder.forwardu  sD     !JJL( 	M ' '++
 	
r5   )rR   r  r  )NNN)r+   r,   r-   r.   rt   r0   r   r1   r	   rL   r   r   r   r   r4   r   r   s   @r6   r  r  n  st    , 48(,!%
||
 ))D0
 	

 $;
 +,
 
!
 
r5   r  c                   Z    \ rS rSr% \\S'   SrSrSr\	R                  " 5       S 5       rSrg)	GitPreTrainedModeli  rR   git)imagetextTc                 @   [        U[        5      (       Ga	  [        R                  " UR                  SU R
                  R                  S9  [        R                  " UR                  R                  U R
                  R                  S9  [        R                  " UR                  R                  U R
                  R                  S9  [        R                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      5        [        U[         R"                  5      (       ac  [        R                  " UR                  SU R
                  R                  S9  UR$                  b!  [        R&                  " UR$                  5        gg[        U[         R(                  5      (       a  [        R                  " UR                  SU R
                  R                  S9  UR*                  bK  [-        UR                  SS5      (       d.  [        R&                  " UR                  UR*                     5        ggg[        U[         R.                  5      (       aA  [        R&                  " UR$                  5        [        R0                  " UR                  5        g[        U[2        5      (       a\  [        R                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      5        gg)	zInitialize the weights        )meanstd)r  r?   rp   N_is_hf_initializedF)r   GitVisionEmbeddingsinitnormal_class_embeddingrR   initializer_rangepatch_embeddingweightposition_embeddingcopy_rU   r0   r   rA   r   r   r   biaszeros_ru   rm   getattrr|   ones_rj   )r   modules     r6   _init_weights GitPreTrainedModel._init_weights  s    f122LL//ct{{?\?\]LL//66DKK<Y<YZLL2299t{{?\?\]JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghfbii((LLSdkk6S6ST{{&FKK( '--LLSdkk6S6ST!!-gfmmMach6i6iFMM&*<*<=> 7j---KK$JJv}}%..JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh /r5   r*   N)r+   r,   r-   r.   r    r2   base_model_prefixinput_modalitiessupports_gradient_checkpointingr0   no_gradr-  r4   r*   r5   r6   r  r    s4    (&*#
]]_i ir5   r  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S\R                  4S
 jjrSrU =r$ )r  i  rR   c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " U R                  5      5      U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R"                  " U R                   U R                  5      U l        U R'                  S[        R(                  " U R                   5      R+                  S5      SS9  g )NF)in_channelsout_channelskernel_sizestrider(  r   r   rU   rp   rq   )rs   rt   rR   rw   	embed_dimr   r   r   	Parameterr0   randnr"  Conv2dnum_channelsr$  num_patchesnum_positionsru   r&  r   r   r   r   s     r6   rt   GitVisionEmbeddings.__init__  s   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr5   r   heightwidthr8   c                    UR                   S   S-
  nU R                  R                  R                  S5      nUR                   S   S-
  n[        R
                  R                  5       (       d%  XF:X  a   X#:X  a  U R                  U R                  5      $ USS2SS24   nUSS2SS24   nUR                   S   n	X R                  -  n
X0R                  -  n[        US-  5      nUR                  SXU	5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU	5      n[        R                   " Xx4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   r   Nr?   g      ?r   r   bicubicF)r   modealign_cornersrZ   )rA   r&  r%  	unsqueezer0   jit
is_tracingrU   r   r   reshaper   r   r`   interpolater   cat)r   r   rA  rB  r>  r&  r?  class_pos_embedpatch_pos_embedr[   
new_height	new_widthsqrt_num_positionss                r6   interpolate_pos_encoding,GitVisionEmbeddings.interpolate_pos_encoding  si    !&&q)A-!44;;EEaH*003a7 yy##%%+*F6?**4+<+<==,QU3,QU3r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr5   pixel_valuesc                 ^   UR                   u  p4pVU(       dJ  XPR                  :w  d  X`R                  :w  a,  [        SU SU SU R                   SU R                   S3	5      eU R                  R                  R
                  nU R                  UR                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      n	[        R                  " X/SS	9n
U(       a  XR                  XU5      -   n
U
$ XR                  U R                  5      -   n
U
$ )
NzInput image size (*z) doesn't match model ().dtyper   r   r?   rZ   )rA   r   r   r$  r%  rY  r^   flattenr   r"  r   r0   rL  rR  r&  rU   )r   rT  rR  
batch_sizer   rA  rB  target_dtypepatch_embedsclass_embedsr   s              r6   r   GitVisionEmbeddings.forward  s$   '3'9'9$
v'V-F%SbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYY;C
##&C&CJX]&^^J  $&=&=d>O>O&PPJr5   )	r"  rR   r9  r   r>  r?  r$  r   r&  )F)r+   r,   r-   r.   r!   rt   r0   r   rK   rR  r1   r   r4   r   r   s   @r6   r  r    si    q q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf  r5   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )GitVisionMLPi  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g r   )rs   rt   rR   r   r   activation_fnr   r   rw   r   fc1fc2r   s     r6   rt   GitVisionMLP.__init__  sb    #F$5$5699V//1I1IJ99V55v7I7IJr5   r(   r8   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )rd  rc  re  r   s     r6   r   GitVisionMLP.forward  s4    /**=9/r5   )rc  rR   rd  re  r   r   s   @r6   ra  ra    s)    KU\\ ell  r5   ra  r,  r   r   rY   scalingr   c                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr?   r   )r[   rY  )ptrainingr   r   )r0   r   r   r   r`   r   float32r^   rY  r   rl  r   )
r,  r   r   rY   rS   ri  r   rc   attn_weightsr   s
             r6   eager_attention_forwardro    s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r5   c                      ^  \ rS rSrSrU 4S jr SS\R                  S\R                  S-  S\\	   S\
\R                  \R                  S-  4   4S	 jjrS
rU =r$ )GitVisionAttentioni%  z=Multi-headed attention from 'Attention Is All You Need' paperc                    > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: rW  g      F)rs   rt   rR   rw   r9  r   	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   r   k_projv_projq_projout_projr   s     r6   rt   GitVisionAttention.__init__(  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar5   Nr(   rS   rc   r8   c                    UR                   SS n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n[        R                  " U R                  R                  [        5      n	U	" U UUUU4U R                  U R                  U R                  (       d  SOU R                  S.UD6u  pU
R                   " / UQSP76 R#                  5       n
U R%                  U
5      n
X4$ )z#Input shape: Batch x Time x ChannelNr?   r   r   r  )rw  ri  r   )rA   rt  rz  r   r   rx  ry  r   get_interfacerR   r   ro  rw  ru  rl  r   rJ  r   r{  )r   r(   rS   rc   r   r   querieskeysvaluesattention_interfacer   rn  s               r6   r   GitVisionAttention.forward<  sG    $))#2.88b8$--8++m,11,?II!QO{{=)..|<FFq!L]+00>HHAN(?(M(MKK,,.E)
 %8
%
 nnJJ#}}C$,,
%
 
%
! "));;;;FFHmmK0((r5   )rR   r   r9  rt  rw  rx  rs  r{  rz  ru  ry  r   )r+   r,   r-   r.   r/   rt   r0   r   r   r   r3   r   r4   r   r   s   @r6   rq  rq  %  sk    GB. /3)||) t+) +,	)
 
u||U\\D00	1) )r5   rq  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\\	   S\R                  4S jrS	rU =r$ )
GitVisionEncoderLayeri_  rR   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g r   )rs   rt   rw   r9  rq  	self_attnr   r|   r}   layer_norm1ra  mlplayer_norm2r   s     r6   rt   GitVisionEncoderLayer.__init__`  sm    +++F3<<F<Q<QR'<<F<Q<QRr5   r(   rS   rc   r8   c                     UnU R                  U5      nU R                  " SUUS.UD6u  pXA-   nUnU R                  U5      nU R                  U5      nXA-   nU$ )N)r(   rS   r*   )r  r  r  r  )r   r(   rS   rc   residualr   s         r6   r   GitVisionEncoderLayer.forwardh  sz     !((7>> 
')
 

 !0 ((7/ 0r5   )r9  r  r  r  r  )r+   r,   r-   r.   r!   rt   r0   r   r   r   r1   r   r4   r   r   s   @r6   r  r  _  sU    S S||  +,	
 
		 r5   r  c                   p   ^  \ rS rSrSrS\4U 4S jjr SS\R                  S-  S\	\
   S\4S	 jjrS
rU =r$ )GitVisionEncoderi  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`GitVisionEncoderLayer`].

Args:
    config: GitVisionConfig
rR   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf r  )
rs   rt   rR   r   r  r	  r
  r  layersr  )r   rR   r   r   s      r6   rt   GitVisionEncoder.__init__  sT    mmERXRjRjLk$lLkq%:6%BLk$lm&+# %mr  NrS   rc   r8   c                 R    UnU R                    H  nU" UU40 UD6nM     [        US9$ )Nr'   )r  r   )r   rO   rS   rc   r(   encoder_layers         r6   r   GitVisionEncoder.forward  sC     &![[M) M ) +
 	
r5   )rR   r  r  r   )r+   r,   r-   r.   r/   r!   rt   r0   r   r   r   r   r   r4   r   r   s   @r6   r  r    sP    , , /3
 t+
 +,	

 

 
r5   r  c            
          ^  \ rS rSrS\4U 4S jjr\  SS\R                  S-  S\	S-  S\
\   S\4S	 jj5       rS
rU =r$ )GitVisionTransformeri  rR   c                   > [         TU ]  5         Xl        UR                  n[	        U5      U l        [        R                  " X!R                  S9U l	        [        U5      U l        [        R                  " X!R                  S9U l        g r   )rs   rt   rR   rw   r  r   r   r|   r}   pre_layrnormr  encoderpost_layernorm)r   rR   r9  r   s      r6   rt   GitVisionTransformer.__init__  sd    &&	-f5LL8M8MN'/ ll9:O:OPr5   NrT  rR  rc   r8   c                     Uc  [        S5      eU R                  XS9nU R                  U5      nU R                  " SSU0UD6nUR                  nU R                  U5      n[        US9$ )Nz You have to specify pixel_valuesrR  rO   r  r*   )r   r   r  r  r'   r  r   )r   rT  rR  rc   r(   encoder_outputsr'   s          r6   r   GitVisionTransformer.forward  s     ?@@h))-8,, 
'


 ,== //0AB/
 	
r5   )rR   r   r  r  r  r  )r+   r,   r-   r.   r!   rt   r   r0   r1   rL   r   r   r   r   r4   r   r   s   @r6   r  r    sh    Q Q  2605
''$.
 #'+
 +,	

 

 
r5   r  zY
    The vision model from CLIP, used in GIT, without any head or projection on top.
    c                      ^  \ rS rSr% \\S'   SrSr\\	S.r
S\4U 4S jjrS\R                  4S jr\\" S	S
9\  SS\R&                  S-  S\S\\   S\\-  4S jj5       5       5       rSrU =r$ )GitVisionModeli  rR   rT  )r  r(   r)   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r   )rs   rt   r  vision_model	post_initr   s     r6   rt   GitVisionModel.__init__  s'     08r5   r8   c                 B    U R                   R                  R                  $ r   )r  r   r$  r   s    r6   get_input_embeddings#GitVisionModel.get_input_embeddings  s      ++;;;r5   F)tie_last_hidden_statesNrR  rc   c                 ,    U R                   " SUUS.UD6$ )aT  
Examples:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, GitVisionModel

>>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
>>> model = GitVisionModel.from_pretrained("microsoft/git-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
```)rT  rR  r*   r  )r   rT  rR  rc   s       r6   r   GitVisionModel.forward  s,    <    
%%=
 
 	
r5   r  r  )r+   r,   r-   r.   r!   r2   main_input_namer0  r  rq  _can_record_outputsrt   r   Moduler  r   r   r   r0   r1   rL   r   r   r3   r   r   r4   r   r   s   @r6   r  r    s     $O!.(
 <bii <  E2 26).
''$.
 #'
 +,	

 
	 
  3  
r5   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )GitProjectioni  rR   c                 .  > [         TU ]  5         Xl        [        R                  " [        R
                  " UR                  R                  UR                  5      [        R                  " UR                  UR                  R                  S95      U l
        g r   )rs   rt   rR   r   
Sequentialr   r   rw   r|   r}   visual_projectionr   s     r6   rt   GitProjection.__init__  sd    !#IIf**668J8JKLL++1E1E1T1TU"
r5   r   r8   c                 $    U R                  U5      $ r   )r  )r   r   s     r6   r   GitProjection.forward  s    %%j11r5   )rR   r  )r+   r,   r-   r.   r    rt   r0   r   r   r4   r   r   s   @r6   r  r    s/    
y 
2%,, 25<< 2 2r5   r  zy
    The bare GIT Model transformer consisting of a CLIP image encoder and text decoder outputting raw hidden-states
    c                   V  ^  \ rS rSr\\S.rU 4S jrS rS r	\
\\        SS\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\S-  S\S-  S\S\\   S\\R                     \-  4S jj5       5       5       rSrU =r$ )GitModeli  r  c                 r  >^ [         TU ]  T5        TU l        [        T5      U l        [        TR                  5      U l        [        T5      U l	        [        T5      U l        TR                  b8  [        R                  " U4S j[        TR                  5       5       5      U l        U R#                  5         g )Nc              3      >#    U  HE  n[         R                  " [        R                  " S S TR                  R
                  5      5      v   MG     g7f)r   N)r   r:  r0   zerosr   rw   ).0r   rR   s     r6   	<genexpr>$GitModel.__init__.<locals>.<genexpr>(  s=      ;?A U[[Av/C/C/O/OPQQ?s   AA)rs   rt   rR   rj   r   r  r   image_encoderr  r  r  r  r   r   ParameterListr	  img_temporal_embeddingr  r   s    `r6   rt   GitModel.__init__  s     '/+F,@,@A!&)!.v!6**6*,*:*: ;v>>?; +D' 	r5   c                 .    U R                   R                  $ r   r   ry   r  s    r6   r  GitModel.get_input_embeddings0  s    ...r5   c                 $    XR                   l        g r   r  )r   rY   s     r6   set_input_embeddingsGitModel.set_input_embeddings3  s    */'r5   Nr   rS   rU   rT  rO   rT   r  rR  rc   r8   c	           	      <   USL USL-  (       a  [        S5      eU(       a  Uc  [        U R                  S9nSn
Ub5  [        U[        5      (       d  UR                  5       OUR                  5       n
Uc  Ub  UR                  S   S:X  a  X:-   nU R                  UUUU
S9n[        R                  " U[        R                  S9S   nUGb  UR                  S	:X  a  U R                  XHS
9R                  nOUR                  S:X  a  / n[        UR                  S   5       HL  nU R                  USS2USS2SS24   US
9R                  nXR                  U   -  nUR!                  U5        MN     [        R"                  " USS9nO[        S5      eU R%                  U5      nUR'                  UR)                  S5      UR)                  S5      -  SS5      n[        R"                  " UU4SS9n[        R*                  " U[        R                  S9S   n[        R"                  " UU/SS9nUb+  [        R"                  " [        R*                  " U5      U/SS9nOxUbu  UR                  S   S:X  ab  [        R,                  " UR                  S   XR                  S   -
  S-   4UR.                  UR0                  S9n[        R"                  " UU/SS9n[3        U R                  UUUSUS9nUnU R4                  " U4UUUS.U	D6n[7        UR                  UR8                  S9$ )a  
Examples:

```python
>>> from transformers import AutoProcessor, AutoModel
>>> import httpx
>>> from io import BytesIO
>>> from PIL import Image

>>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
>>> model = AutoModel.from_pretrained("microsoft/git-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> text = "this is an image of two cats"

>>> inputs = processor(images=image, text=text, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
```Nz:You must specify exactly one of input_ids or inputs_embeds)rR   r   r   )r   rU   rO   r   rX  ).r      r     rZ   z#pixel_values must be of rank 4 or 5r?   )rY  r_   )rO   rS   rT   rU   rV   )rS   rT   r  r  )r   r
   rR   r   r	   get_seq_lengthrA   r   r0   
zeros_likerK   ndimr  r'   r	  r  appendrL  r  repeatr   	ones_likeonesrY  r_   rh   r  r   rT   )r   r   rS   rU   rT  rO   rT   r  rR  rc   r   embedding_outputrV   visual_features	frame_idxvisual_features_frameprojected_visual_featuresimage_token_type_idsextended_attention_maskcausal_maskr(   r  s                         r6   r   GitModel.forward6  se   L -t";<YZZ0*$++>O "#& "/599  ..0$335 # O$?IOOTUDVZ[D['@L??%'#9	 + 
 ))*:%))LVT#  A%"&"4"4  #5 ###   ""a'"$!&|'9'9!'<!=I,0,>,>$Q	1a%78Sk -? -'' * *-H-H-SS)#**+@A "> #())O"C !!FGG(,(>(>(O% )B(H(H %%a(,E,J,J1,MMqRS)%
  %yy*CEU)V\]^#(??3LTYT]T]#^_e#f "YY(<n'MSUVN)!&EOO<P,QSa+bhj!k(Y__Q-?1-D ',jj%%a(*@CWCWXYCZ*Z]^*^_$**%,,'#
 #YY(?'PVXYN 1KK*)+)
 )37<<4
&+	4

 4
 '-??+;;
 	
r5   )rR   r   r  r  r  r  )NNNNNNNF)r+   r,   r-   r.   r   r   r  rt   r  r  r   r   r   r0   r   r	   rL   r   r   r3   r   r   r4   r   r   s   @r6   r  r    s    "&
&/0   *..2,0,0-1(,!%).E
<<$&E
 t+E
 llT)	E

 llT)E
 ||d*E
 E
 $;E
 #'E
 +,E
 
u||	9	9E
    E
r5   r  z`
    GIT Model with a `language modeling` head on top for autoregressive language modeling.
    c                     ^  \ rS rSrSS0rU 4S jrS rS r\\	\
          SS\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\S\\R                  -  S\\   S\\R                     \-  4S jj5       5       5       r     SU 4S jjrSrU =r$ )GitForCausalLMi  zoutput.weightz%git.embeddings.word_embeddings.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  5      U l        U R                  5         g r   )
rs   rt   r  r  r   r   rw   rv   r   r  r   s     r6   rt   GitForCausalLM.__init__  sF     F#ii 2 2F4E4EF 	r5   c                     U R                   $ r   r   r  s    r6   get_output_embeddings$GitForCausalLM.get_output_embeddings  s    {{r5   c                     Xl         g r   r  )r   new_embeddingss     r6   set_output_embeddings$GitForCausalLM.set_output_embeddings  s    $r5   Nr   rS   rU   rT  rO   labelsrT   r  rR  logits_to_keeprc   r8   c                    Ub  SnU R                   " U4UUUUUUU	S.UD6nUR                  n[        U
[        5      (       a  [	        U
* S5      OU
nU R                  USS2USS24   5      nSnUb  U R                   R                  R                  S   R                  R                  R                  nUSS2US2SS24   R                  5       nUSS2SS24   R                  5       nU R                  " UR                  SU R                  R                  5      UR                  S5      4SU R                  R                  0UD6n[!        UUUR"                  UR$                  UR&                  S9$ )	a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`

Examples:

Image captioning example:

```python
>>> from transformers import AutoProcessor, AutoModelForCausalLM
>>> import httpx
>>> from io import BytesIO
>>> from PIL import Image

>>> processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
>>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

>>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
>>> generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> print(generated_caption)
two cats sleeping on a pink blanket next to remotes.
```

Visual question answering (VQA) example:

```python
>>> from transformers import AutoProcessor, AutoModelForCausalLM
>>> from huggingface_hub import hf_hub_download
>>> from PIL import Image

>>> processor = AutoProcessor.from_pretrained("microsoft/git-base-textvqa")
>>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-textvqa")

>>> file_path = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset")
>>> image = Image.open(file_path).convert("RGB")

>>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

>>> question = "what does the front of the bus say at the top?"

>>> input_ids = processor(text=question, add_special_tokens=False).input_ids
>>> input_ids = [processor.tokenizer.cls_token_id] + input_ids
>>> input_ids = torch.tensor(input_ids).unsqueeze(0)

>>> generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)
>>> print(processor.batch_decode(generated_ids, skip_special_tokens=True))
['what does the front of the bus say at the top? special']
```

Video captioning example:

```python
>>> import av
>>> import numpy as np
>>> from PIL import Image
>>> from huggingface_hub import hf_hub_download
>>> from transformers import AutoProcessor, AutoModelForCausalLM

>>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
>>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")

>>> # set seed for reproducibility
>>> np.random.seed(45)


>>> def read_video_pyav(container, indices):
...     '''
...     Decode the video with PyAV decoder.
...     Args:
...         container (`av.container.input.InputContainer`): PyAV container.
...         indices (`list[int]`): List of frame indices to decode.
...     Returns:
...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
...     '''
...     frames = []
...     container.seek(0)
...     start_index = indices[0]
...     end_index = indices[-1]
...     for i, frame in enumerate(container.decode(video=0)):
...         if i > end_index:
...             break
...         if i >= start_index and i in indices:
...             frames.append(frame)
...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


>>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
...     '''
...     Sample a given number of frame indices from the video.
...     Args:
...         clip_len (`int`): Total number of frames to sample.
...         frame_sample_rate (`int`): Sample every n-th frame.
...         seg_len (`int`): Maximum allowed index of sample's last frame.
...     Returns:
...         indices (`list[int]`): List of sampled frame indices
...     '''
...     converted_len = int(clip_len * frame_sample_rate)
...     end_idx = np.random.randint(converted_len, seg_len)
...     start_idx = end_idx - converted_len
...     indices = np.linspace(start_idx, end_idx, num=clip_len)
...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
...     return indices


>>> # load video
>>> file_path = hf_hub_download(
...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)

>>> # sample frames
>>> num_frames = model.config.num_image_with_embedding
>>> indices = sample_frame_indices(
...     clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames
... )
>>> frames = read_video_pyav(container, indices)

>>> pixel_values = processor(images=list(frames), return_tensors="pt").pixel_values

>>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)

>>> print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True))
Generated caption: ['a woman is sitting at a table and she is talking about the food she is holding.']
```
NF)rS   rU   rT  rO   rT   r  rR  r   r?   r   rv   )losslogitsrT   r(   r)   )r  r'   r   rK   slicer   r  r  r   r   r   r   loss_functionr   rR   rv   r   rT   r(   r)   )r   r   rS   rU   rT  rO   r  rT   r  rR  r  rc   outputsr(   slice_indicesr  r  num_image_tokensshifted_logitss                      r6   r   GitForCausalLM.forward  s}   l I+/88
,
)%%'+%=
,
 
,
  118B>SV8W8W~ot4]k]1mQ+>?@#xx//55a8BBGGZZ#A'7':A$=>IIKNAqrE]--/F%%##B(>(>?B  ;;11 	D &#33!//))
 	
r5   c                 X   > [         T	U ]  " U4UUUUS.UD6nU(       d  U(       d  X8S'   U$ )N)rT   rS   r  rW   rT  )rs   prepare_inputs_for_generation)
r   r   rT   rT  rS   r  rW   rc   model_inputsr   s
            r6   r  ,GitForCausalLM.prepare_inputs_for_generation  sG     w<
+)1
 
 Y+7(r5   )r  r   )
NNNNNNNNFr   )NNNNF)r+   r,   r-   r.   _tied_weights_keysrt   r  r  r   r   r   r0   r   r	   rL   rK   r   r   r3   r   r   r  r4   r   r   s   @r6   r  r    s`    *+RS%   *..2,0,0-1&*(,!%).-.z
<<$&z
 t+z
 llT)	z

 llT)z
 ||d*z
 t#z
 z
 $;z
 #'z
 ell*z
 +,z
 
u||	5	5z
    z
~   r5   r  )r  r  r  r  r   )r  )Tr/   r   collections.abcr   dataclassesr   r0   r    r   r   activationsr   cache_utilsr	   r
   configuration_utilsr   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   r   utils.deprecationr   utils.genericr   utils.output_capturingr   configuration_gitr    r!   
get_loggerr+   r   r$   r   rM   rL   dictrh   r  rj   r   r   r   r   r   r   r   r  r  r  ra  floatro  rq  r  r  r  r  r  r  r  __all__r*   r5   r6   <module>r     sZ     $ !   & ! . 3 ) 6 9  G & 6  1 7 5 9 
		H	% 	<; 	< 	<ELL X 6 ?K +/&*$4$4<<$4 LL4'$4 T\	$4
 ,,%$4 LL4'$4 t$4 
$4 L$4N*BII *ZB.ryy B.LBII   
 299  0bii  		 ) D
 
: i i i>P")) Pf299 . %II%<<% 
% <<	%
 LL4'% % %.6) 6)t6 D
ryy 
D#
299 #
L 
4
' 4

4
n
2BII 
2 
g
! g

g
T 
i' i
iX Qr5   