
    Z j                     0   S SK Jr  S SKJr  S SKJr  S SKrS SKJr  SSK	J
r  SSKJr  SSKJr  SS	KJr  SS
KJrJrJr  SSKJrJr  SSKJr  SSKJr  SSKJrJrJ r J!r!J"r"  SSK#J$r$  SSK%J&r&  SSK'J(r(J)r)J*r*  \\  " S S\5      5       5       r+ " S S\RX                  5      r- SNS\RX                  S\R\                  S\R\                  S\R\                  S\R\                  S-  S\/S\/4S jjr0 " S S \RX                  5      r1 " S! S"\RX                  5      r2 " S# S$\RX                  5      r3 " S% S&\RX                  5      r4 " S' S(\RX                  5      r5 " S) S*\5      r6 " S+ S,\RX                  5      r7 " S- S.\RX                  5      r8 " S/ S0\RX                  5      r9 " S1 S2\RX                  5      r: " S3 S4\5      r; " S5 S6\RX                  5      r< " S7 S8\RX                  5      r=\  " S9 S:\5      5       r>\ " S;S<9 " S= S>\>5      5       r?\ " S?S<9 " S@ SA\>5      5       r@ " SB SC\>5      rASD\R\                  SE\R\                  4SF jrBSG\R\                  SE\R\                  4SH jrCSI\R\                  SE\R\                  4SJ jrD\  " SK SL\>5      5       rE/ SMQrFg)O    )Callable)	dataclass)AnyN   )initialization)ACT2FN)create_bidirectional_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling'BaseModelOutputWithPoolingAndProjection)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringcan_return_tuple	torch_int)merge_with_config_defaults)capture_outputs   )AltCLIPConfigAltCLIPTextConfigAltCLIPVisionConfigc                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\\S	'   Sr\\S
'   S\\   4S jrSrg)AltCLIPOutput)   aq  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
    The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
    similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
    The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
    similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The text embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPTextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The image embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPVisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
    The output of the [`AltCLIPTextModel`].
vision_model_output (`BaseModelOutputWithPooling`):
    The output of the [`AltCLIPVisionModel`].
Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputreturnc                 B    [        S U R                  5        5       5      $ )Nc              3   p   #    U  H,  n[        U[        5      (       a  UR                  5       OUv   M.     g 7fN)

    def to_tuple(self) -> tuple[Any]:
        return tuple(v.to_tuple() if isinstance(v, BaseModelOutputWithPooling) else v for v in self.values())


class AltRobertaEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids; any padded tokens remain padded.
                position_ids = self.create_position_ids_from_input_ids(
                    input_ids, self.padding_idx, past_key_values_length
                )
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, self.padding_idx)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        batch_size, seq_length = input_shape

        # If token_type_ids is not provided, fall back to the registered all-zeros buffer,
        # which helps users when tracing the model without passing token_type_ids.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1)
                buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids)
                token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length)
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        position_embeddings = self.position_embeddings(position_ids)
        embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    @staticmethod
    def create_position_ids_from_inputs_embeds(inputs_embeds, padding_idx):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            padding_idx + 1, sequence_length + padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)

    @staticmethod
    def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
        """
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            input_ids: torch.Tensor

        Returns: torch.Tensor
        """
        mask = input_ids.ne(padding_idx).int()
        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
        return incremental_indices.long() + padding_idx
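

# Worked example for the helper above (illustrative values, not taken from this file):
# input_ids = [[5, 7, 1, 1]] with padding_idx = 1 gives mask = [1, 1, 0, 0], a masked
# cumulative sum of [1, 2, 0, 0], and position ids [2, 3, 1, 1] -- real tokens count up
# from padding_idx + 1 while every padding slot maps back onto padding_idx itself.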


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights
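

# Note on dispatch: eager_attention_forward above is only the reference path. Each
# attention module below first resolves the configured backend (e.g. "sdpa" or
# "flash_attention_2") through ALL_ATTENTION_FUNCTIONS and falls back to the eager
# implementation only when `config._attn_implementation` is "eager".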


class AltRobertaSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.attention_dropout = config.attention_probs_dropout_prob
        self.scaling = self.attention_head_size**-0.5
        self.is_causal = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.attention_head_size)

        query_states = self.query(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.key(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.value(hidden_states).view(hidden_shape).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        return attn_output, attn_weights


class AltRobertaSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class AltRobertaAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.self = AltRobertaSelfAttention(config)
        self.output = AltRobertaSelfOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        residual = hidden_states
        hidden_states, _ = self.self(hidden_states, attention_mask=attention_mask, **kwargs)
        hidden_states = self.output(hidden_states, residual)
        return hidden_states


class AltRobertaIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class AltRobertaOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class AltRobertaLayer(GradientCheckpointingLayer):
    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = AltRobertaAttention(config)
        self.intermediate = AltRobertaIntermediate(config)
        self.output = AltRobertaOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        attention_output = self.attention(hidden_states, attention_mask=attention_mask, **kwargs)
        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        return layer_output

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output
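

# apply_chunking_to_forward above is a memory/speed trade-off: with
# chunk_size_feed_forward > 0 the feed-forward block runs on slices of that size
# along seq_len_dim and the results are concatenated. Because the MLP treats each
# position independently, the chunked result is identical to a single full pass.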


class AltRobertaEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`AltRobertaEncoderLayer`].

    Args:
        config: AltCLIPTextConfig
    """

    def __init__(self, config: AltCLIPTextConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([AltRobertaLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(
                hidden_states,
                attention_mask,
                **kwargs,
            )
        return BaseModelOutput(last_hidden_state=hidden_states)


class AltRobertaPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class AltCLIPAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: AltCLIPTextConfig | AltCLIPVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Input shape: Batch x Time x Channel"""
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(hidden_shape).transpose(1, 2)
        keys = keys.view(hidden_shape).transpose(1, 2)
        values = values.view(hidden_shape).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.out_proj(attn_output)
        return attn_output, attn_weights


class AltCLIPMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class AltCLIPEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: AltCLIPConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = AltCLIPAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = AltCLIPMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


class AltCLIPEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`AltCLIPEncoderLayer`].

    Args:
        config: AltCLIPConfig
    """

    def __init__(self, config: AltCLIPConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([AltCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(
                hidden_states,
                attention_mask,
                **kwargs,
            )
        return BaseModelOutput(last_hidden_state=hidden_states)


class AltCLIPVisionEmbeddings(nn.Module):
    def __init__(self, config: AltCLIPVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
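
    # Shape sketch for forward below (illustrative: a 224x224 input with patch_size 14
    # is assumed): the Conv2d yields [batch, embed_dim, 16, 16], flatten(2).transpose(1, 2)
    # turns that into 256 patch tokens, and prepending the class token gives the
    # [batch, 257, embed_dim] sequence that the position embeddings are added to.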
    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


@auto_docstring
class AltCLIPPreTrainedModel(PreTrainedModel):
    config: AltCLIPConfig
    base_model_prefix = "altclip"
    input_modalities = ("image", "text")
    _no_split_modules = ["AltCLIPTextEmbeddings", "AltCLIPEncoderLayer", "AltCLIPVisionEmbeddings"]
    supports_gradient_checkpointing = True
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True

    _can_record_outputs = {
        "hidden_states": AltCLIPEncoderLayer,
        "attentions": AltCLIPAttention,
    }

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, AltCLIPVisionEmbeddings):
            init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
            init.copy_(module.position_ids, torch.arange(module.num_positions).expand((1, -1)))
        elif isinstance(module, AltCLIPAttention):
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            init.normal_(module.q_proj.weight, std=in_proj_std)
            init.normal_(module.k_proj.weight, std=in_proj_std)
            init.normal_(module.v_proj.weight, std=in_proj_std)
            init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, AltCLIPMLP):
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            init.normal_(module.fc1.weight, std=fc_std)
            init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, AltCLIPModel):
            init.normal_(module.text_projection.weight, std=module.text_embed_dim**-0.5 * factor)
            init.normal_(module.visual_projection.weight, std=module.vision_embed_dim**-0.5 * factor)
        elif isinstance(module, nn.LayerNorm):
            init.zeros_(module.bias)
            init.ones_(module.weight)
        elif isinstance(module, nn.Linear):
            init.normal_(module.weight, mean=0.0, std=factor)
            if module.bias is not None:
                init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            init.normal_(module.weight, mean=0.0, std=factor)
            if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                init.zeros_(module.weight[module.padding_idx])
        elif isinstance(module, AltRobertaEmbeddings):
            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
            init.zeros_(module.token_type_ids)


@auto_docstring(
    custom_intro="""
    The vision model from ALTCLIP without any head or projection on top.
    """
)
class AltCLIPVisionModel(AltCLIPPreTrainedModel):
    config: AltCLIPVisionConfig
    main_input_name = "pixel_values"
    input_modalities = "image"
    _input_embed_layer = "embeddings"

    def __init__(self, config: AltCLIPVisionConfig):
        super().__init__(config)
        embed_dim = config.hidden_size

        self.embeddings = AltCLIPVisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = AltCLIPEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.post_init()

    @merge_with_config_defaults
    @capture_outputs
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor | None = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, AltCLIPVisionModel

        >>> model = AltCLIPVisionModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs: BaseModelOutput = self.encoder(inputs_embeds=hidden_states, **kwargs)

        last_hidden_state = encoder_outputs.last_hidden_state
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
        )


@auto_docstring(
    custom_intro="""
    The model behaves as an encoder following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    """
)
class AltRobertaModel(AltCLIPPreTrainedModel):
    config: AltCLIPTextConfig
    input_modalities = "text"
    _can_record_outputs = {
        "hidden_states": AltRobertaLayer,
        "attentions": AltRobertaSelfAttention,
    }

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.embeddings = AltRobertaEmbeddings(config)
        self.encoder = AltRobertaEncoder(config)
        self.pooler = AltRobertaPooler(config) if add_pooling_layer else None

        self.post_init()

    @merge_with_config_defaults
    @capture_outputs
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AltRobertaModel

        >>> model = AltRobertaModel.from_pretrained("openai/alt_roberta-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/alt_roberta-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
        )
        attention_mask = create_bidirectional_mask(
            config=self.config,
            inputs_embeds=embedding_output,
            attention_mask=attention_mask,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=attention_mask,
            **kwargs,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
        )


class AltCLIPTextModel(AltCLIPPreTrainedModel):
    config: AltCLIPTextConfig
    input_modalities = "text"
    _input_embed_layer = "word_embedding"
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)
        self.roberta = AltRobertaModel(config, add_pooling_layer=False)
        self.transformation = nn.Linear(config.hidden_size, config.project_dim)
        self.pre_LN = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.post_init()

    @merge_with_config_defaults
    @capture_outputs
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPoolingAndProjection:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoProcessor, AltCLIPTextModel

        >>> model = AltCLIPTextModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> texts = ["it's a cat", "it's a dog"]

        >>> inputs = processor(text=texts, padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            **kwargs,
        )

        sequence_output = outputs[0]
        sequence_output = self.pre_LN(sequence_output)

        projection_state = self.transformation(sequence_output)
        pooler_output = projection_state[:, 0]

        return BaseModelOutputWithPoolingAndProjection(
            last_hidden_state=projection_state,
            pooler_output=pooler_output,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0
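

# The symmetric objective above: clip_loss averages two cross-entropies computed from
# one similarity matrix -- texts-to-images over the rows and images-to-texts over the
# transposed matrix -- with arange(batch_size) as the targets, so each matched
# image-text pair sits on the diagonal.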


def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor:
    """
    This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
    model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
    """
    square_tensor = torch.pow(tensor, 2)
    sum_tensor = torch.sum(square_tensor, dim=-1, keepdim=True)
    normed_tensor = torch.pow(sum_tensor, 0.5)
    return normed_tensor


@auto_docstring
class AltCLIPModel(AltCLIPPreTrainedModel):
    config: AltCLIPConfig

    def __init__(self, config: AltCLIPConfig):
        super().__init__(config)
        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.project_dim
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = AltCLIPTextModel._from_config(self.config.text_config)
        self.vision_model = AltCLIPVisionModel._from_config(self.config.vision_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        self.post_init()

    @auto_docstring
    def get_text_features(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        r"""
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```"""
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            **kwargs,
        )
        pooled_output = text_outputs[1]
        text_features = self.text_projection(pooled_output)

        return text_features

    @auto_docstring
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        r"""
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AltCLIPModel
        >>> from transformers.image_utils import load_image

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```"""
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )
        pooled_output = vision_outputs.pooler_output
        image_features = self.visual_projection(pooled_output)

        return image_features

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        return_loss: bool | None = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> AltCLIPOutput:
        r"""
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AltCLIPModel
        >>> from transformers.image_utils import load_image

        >>> model = AltCLIPModel.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16")
        >>> processor = AutoProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16")

        >>> url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg"
        >>> image = load_image(url)

        >>> inputs = processor(text=["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"], images=image, return_tensors="pt", padding=True)

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )

        text_outputs: BaseModelOutputWithPoolingAndProjection = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            **kwargs,
        )

        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / _get_vector_norm(image_embeds)
        text_embeds = text_embeds / _get_vector_norm(text_embeds)

        # cosine similarity as logits
        logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device))
        logits_per_text = logits_per_text * self.logit_scale.exp().to(text_embeds.device)
        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            loss = clip_loss(logits_per_text)

        return AltCLIPOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


__all__ = ["AltCLIPPreTrainedModel", "AltCLIPModel", "AltCLIPTextModel", "AltCLIPVisionModel"]