
    Z j;>                     ^   S r SSKrSSKJr  SSKJr  SSKrSSKJr  SSKJ	r	  SSK
Jr  SS	KJrJr  SS
KJr  SSKJr  SSKJrJrJr  SSKJr  \R2                  " \5      r\ " S S\5      5       r " S S\R:                  5      r S%S\R:                  S\R>                  S\R>                  S\R>                  S\R>                  S-  S\ S\ 4S jjr! " S S\R:                  5      r" " S S\R:                  5      r# " S S \5      r$ " S! S"\R:                  5      r% " S# S$\R:                  5      r&g)&zTPyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object    N)Callable)	dataclass)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONS)Unpack)ModelOutputTransformersKwargslogging   )IdeficsVisionConfigc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   S	rg)
IdeficsVisionModelOutput'   a  
Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

Args:
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Nimage_embedslast_hidden_state.hidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r   torchFloatTensor__annotations__r   r   tupler   __static_attributes__r       s/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/idefics/vision.pyr   r   '   sr    * .2L%##d*126u((4/6:>M5**C/047>7;Je'',-4;r$   r   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S
\S\R                  4S jjrSrU =r$ )IdeficsVisionEmbeddingsE   configc                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " U R                  5      5      U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R"                  " U R                   U R                  5      U l        U R'                  S[        R(                  " U R                   5      R+                  S5      SS9  g )NF)in_channelsout_channelskernel_sizestridebias   r   position_ids)r   )
persistent)super__init__r)   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferarangeexpandselfr)   	__class__s     r%   r5    IdeficsVisionEmbeddings.__init__F   s   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr$   
embeddingsheightwidthreturnc                    UR                   S   S-
  nU R                  U R                  5      nUR                   S   S-
  nXF:X  a  X#:X  a  U$ USS2S4   nUSS2SS24   nUR                   S   n	X R                  R                  -  n
X0R                  R                  -  nU
S-   US-   p[
        R                  " U5      nUR                  S[        U5      [        U5      U	5      nUR                  SSSS5      nUR                  [        R                  :H  nU(       a4  [        R                  S5        UR                  [        R                   5      n["        R$                  R'                  UX-  X-  4S	S
S9nU(       a  UR                  [        R                  5      n[        U
5      UR                   S   :w  d  [        U5      UR                   S   :w  aB  [)        S[        U
5      [        U5      4 SUR                   S   UR                   S   4 S35      eUR                  SSSS5      R+                  SSU	5      n[        R,                  " UR/                  S5      U4SS9$ )z
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
resolution images.

Source:
https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
r   Nr   r2   g?r   r0   zUpcasting patch_pos_embed to fp32 for interpolation since `upsample_bicubic2d_out_frame` in nn.functional.interpolate is not implemented for 'torch.bfloat16' dtype. This will result in a slight overhead.bicubicF)scale_factormodealign_cornerszNumber of patches for images (z/) don't match the shape of position embedding ()dim)shaperC   r1   r)   r9   mathsqrtreshapeintpermutedtyper   bfloat16loggerwarning_oncetofloatr   
functionalinterpolate
ValueErrorviewcat	unsqueeze)rH   rK   rL   rM   r@   	pos_embedrA   class_pos_embedpatch_pos_embedr7   num_h_patchesnum_w_patchessqrt_num_positionsfp32_upcastings                 r%   interpolate_pos_encoding0IdeficsVisionEmbeddings.interpolate_pos_encoding]   sV    !&&q)A-++D,=,=>	!*Q.'FO#AqD/#AqrE*$$R(	++"8"88!7!77 (5s':MC<O}!YY}5)11!S9K5LcRdNegpq)11!Q1=(..%..@h .00=O--33'<m>`a	 4 
 -00@O}!6!6r!::c->PTcTiTijlTm>m0]1CSEW1W0X Y00?0E0Eb0I?K`K`acKd0d/eefh  *11!Q1=BB1b)Tyy/33A6HaPPr$   pixel_valuesrq   c                 ^   UR                   u  p4pVU(       dJ  XPR                  :w  d  X`R                  :w  a,  [        SU SU SU R                   SU R                   S3	5      eU R                  R                  R
                  nU R                  UR                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      n	[        R                  " X/SS	9n
U(       a  XR                  XU5      -   n
U
$ XR                  U R                  5      -   n
U
$ )
NzInput image size (*z) doesn't match model (z8). You should try to set `interpolate_pos_encoding=True`)r^   r0   r   r2   rV   )rX   r8   rf   r?   weightr^   rb   flatten	transposer<   rF   r   rh   rq   rC   r1   )rH   rs   rq   
batch_sizer>   rL   rM   target_dtypepatch_embedsclass_embedsrK   s              r%   forwardIdeficsVisionEmbeddings.forward   s'   2>2D2D/
&'(E__,D (% 9)4??*;;su 
 ++2288++LOO,O,OP#++A.88A>++22:q"EYY;C
 $#&C&CJX]&^^J  $&=&=d>O>O&PPJr$   )	r<   r)   r7   r8   r@   rA   r?   r9   rC   )F)r   r   r   r   r   r5   r   Tensorr\   rq   r    boolr}   r#   __classcell__rI   s   @r%   r'   r'   E   sr    q2 q./Q5<< /Q /QUX /Q]b]i]i /QbE$5$5 QU bgbnbn  r$   r'   modulequerykeyvalueattention_maskscalingdropoutc                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr2   rT   )rW   r^   )ptrainingr   r0   )r   matmulrx   r   rd   softmaxfloat32rb   r^   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r%   eager_attention_forwardr      s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r$   c                      ^  \ rS rSrSrS\4U 4S jjr SS\R                  S\R                  S-  S\	\
   S	\\R                  \R                  S-  4   4S
 jjrSrU =r$ )IdeficsVisionAttention   z=Multi-headed attention from 'Attention Is All You Need' paperr)   c                    > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      F)r4   r5   r)   r6   r7   num_attention_heads	num_headshead_dimrf   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projrG   s     r%   r5   IdeficsVisionAttention.__init__   s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar$   Nr   r   r   rN   c                    UR                   SS n/ UQSPU R                  P7nU R                  U5      nU R                  U5      nU R	                  U5      nUR                  U5      R                  SS5      nUR                  U5      R                  SS5      nUR                  U5      R                  SS5      n[        R                  " U R                  R                  [        5      n	U	" U UUUU4U R                  U R                  U R                  (       d  SOU R                  S.UD6u  pU
R                   " / UQSP76 R#                  5       n
U R%                  U
5      n
X4$ )z#Input shape: Batch x Time x ChannelNr2   r   r0           )r   r   r   )rX   r   r   r   r   rg   rx   r   get_interfacer)   _attn_implementationr   r   r   r   r   r[   r   r   )rH   r   r   r   input_shapehidden_shapequerieskeysvaluesattention_interfacer   r   s               r%   r}   IdeficsVisionAttention.forward   sP    $))#2.88b8$--8++m,{{=)]+,,|,66q!<yy&00A6\*44Q:(?(M(MKK,,.E)
 %8
%
 nnJJ#}}C$,,
%
 
%
! "));;;;FFHmmK0((r$   )r)   r   r7   r   r   r   r   r   r   r   r   N)r   r   r   r   r   r   r5   r   r   r   r   r"   r}   r#   r   r   s   @r%   r   r      st    GB2 B. /3%)||%) t+%) +,	%)
 
u||U\\D00	1%) %)r$   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )IdeficsVisionMLP   c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g r   )r4   r5   r)   r   
hidden_actactivation_fnr   r   r6   intermediate_sizefc1fc2rG   s     r%   r5   IdeficsVisionMLP.__init__  sb    #F$5$5699V//1I1IJ99V55v7I7IJr$   r   rN   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   )rH   r   s     r%   r}   IdeficsVisionMLP.forward  s4    /**=9/r$   )r   r)   r   r   )
r   r   r   r   r5   r   r   r}   r#   r   r   s   @r%   r   r      s)    KU\\ ell  r$   r   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\\	   S\R                  4S jrS	rU =r$ )
IdeficsVisionEncoderLayeri  r)   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g N)eps)r4   r5   r6   r7   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2rG   s     r%   r5   "IdeficsVisionEncoderLayer.__init__  sm    ++/7<<F<Q<QR#F+<<F<Q<QRr$   r   r   r   rN   c                     UnU R                  U5      nU R                  " SUUS.UD6u  pXA-   nUnU R                  U5      nU R                  U5      nXA-   nU$ )N)r   r   r   )r   r   r   r   )rH   r   r   r   residual_s         r%   r}   !IdeficsVisionEncoderLayer.forward  sz     !((7>> 
')
 

 !0 ((7/ 0r$   )r7   r   r   r   r   )r   r   r   r   r   r5   r   r   r   r   r    r}   r#   r   r   s   @r%   r   r     sV    S2 S||  +,	
 
		 r$   r   c                   p   ^  \ rS rSrSrS\4U 4S jjr SS\R                  S-  S\	\
   S\4S	 jjrS
rU =r$ )IdeficsVisionEncoderi2  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`IdeficsVisionEncoderLayer`].

Args:
    config: IdeficsVisionConfig
r)   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf NF)
r4   r5   r)   r   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointing)rH   r)   r   rI   s      r%   r5   IdeficsVisionEncoder.__init__;  sU    mmPUV\VnVnPo$pPo1%>v%FPo$pq&+# %qs   A&Nr   r   rN   c                 R    UnU R                    H  nU" UU40 UD6nM     [        US9$ )N)r   )r   r	   )rH   inputs_embedsr   r   r   encoder_layers         r%   r}   IdeficsVisionEncoder.forwardA  sC     &![[M) M ) +
 	
r$   )r)   r   r   r   )r   r   r   r   r   r   r5   r   r   r   r   r	   r}   r#   r   r   s   @r%   r   r   2  sQ    ,2 , /3
 t+
 +,	

 

 
r$   r   c                   t   ^  \ rS rSrS\4U 4S jjr  S
S\R                  S-  S\S-  S\	\
-  4S jjrS	rU =r$ )IdeficsVisionTransformeriU  r)   c                   > [         TU ]  5         Xl        UR                  n[	        U5      U l        [        R                  " X!R                  S9U l	        [        U5      U l        [        R                  " X!R                  S9U l        g r   )r4   r5   r)   r6   r'   rK   r   r   r   pre_layrnormr   encoderpost_layernorm)rH   r)   r7   rI   s      r%   r5   !IdeficsVisionTransformer.__init__V  sd    &&	1&9LL8M8MN+F3 ll9:O:OPr$   Nrs   rq   rN   c                     Uc  [        S5      eU R                  XS9nU R                  U5      nU R                  " SSU0UD6nUR                  nUSS2SSS24   nU R                  U5      n[        UUS9$ )z
Returns:

Nz You have to specify pixel_values)rq   r   r   )r   pooler_outputr   )rf   rK   r   r   r   r   r
   )rH   rs   rq   r   r   encoder_outputsr   pooled_outputs           r%   r}    IdeficsVisionTransformer.forwarda  s     ?@@h))-8+/<< ,
',
,

 ,==)!Q'2++M:)/'
 	
r$   )r)   rK   r   r   r   r   )r   r   r   r   r   r5   r   r    r   r"   r
   r}   r#   r   r   s   @r%   r   r   U  sU    Q2 Q 2605
''$.
 #'+

 
+	+
 
r$   r   )r   )'r   rY   collections.abcr   dataclassesr   r   r   activationsr   modeling_layersr   modeling_outputsr	   r
   modeling_utilsr   processing_utilsr   utilsr   r   r   configuration_ideficsr   
get_loggerr   r`   r   Moduler'   r   rc   r   r   r   r   r   r   r   r$   r%   <module>r      s-   [  $ !   ! 9 K 5 & 
 7 
		H	% <{ < <:`bii `V %II%<<% 
% <<	%
 LL4'% % %.<)RYY <)@ryy   : D
299 
F(
ryy (
r$   