
    Z jMO                     .   S r SSKrSSKrSSKJs  Jr  SSKJr  SSKJr  SSK	J
r  SSKJr  SSKJr  SS	KJr  SS
KJrJr  SSKJr  SSKJr  SSKJrJrJr  SSKJr  SSKJ r   SSK!J"r"J#r#J$r$  SSK%J&r&J'r'  SSK(J)r)J*r*J+r+  SSK,J-r-J.r.J/r/  \" SS9\ " S S\+5      5       5       r0\" SS9\ " S S\*5      5       5       r1\" SS9\ " S S\)5      5       5       r2 " S S\/5      r3 " S S \'5      r4 " S! S"\&5      r5 " S# S$\Rl                  5      r7 " S% S&\#5      r8 " S' S(\-5      r9 " S) S*\5      r: " S+ S,\.5      r; " S- S.\Rl                  5      r<\ " S/ S0\5      5       r=\" S1S29 " S3 S4\=5      5       r>\" S5S29 " S6 S7\=5      5       r?\ " S8 S9\"5      5       r@/ S:QrAg);z%Pytorch implementation of AIMv2 Model    N)strict)nn   )initialization)PreTrainedConfig)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)merge_with_config_defaults)capture_outputs   )	CLIPModelCLIPTextEmbeddings_get_vector_norm)LlamaMLPLlamaRMSNorm)SiglipConfigSiglipTextConfigSiglipVisionConfig)SiglipAttentionSiglipEncoderSiglipOutputz!apple/aimv2-large-patch14-224-lit)
checkpointc                       \ rS rSr% SrSr\\S'   Sr\\S'   Sr	\\S'   S	r
\\S
'   Sr\\\   -  \\\4   -  \S'   Sr\\S'   Sr\\-  \S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   \" 5       rSrg)Aimv2VisionConfig(   a  
use_head (`str`, *optional*, defaults to `True`):
    Whether to use Attention Pooling Head or Not.
is_native (`str`, *optional*, defaults to `False`):
    Whether to use ckpt trained for image native resolution or not.

Example:

```python
>>> from transformers import SiglipVisionConfig, SiglipVisionModel

>>> # Initializing a Aimv2VisionConfig with apple/aimv2-large-patch14-224 style configuration
>>> configuration = Aimv2VisionConfig()

>>> # Initializing a Aimv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
>>> model = Aimv2VisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```i   hidden_sizei   intermediate_size   num_hidden_layers   num_attention_heads   
patch_sizeh㈵>rms_norm_eps        attention_dropoutFqkv_biasmlp_biassilu
hidden_act{Gz?initializer_rangeTuse_head	is_native N)__name__
__module____qualname____firstlineno____doc__r#   int__annotations__r$   r&   r(   r*   listtupler,   floatr.   r/   boolr0   r2   strr4   r5   r6   AttributeErrorlayer_norm_eps__static_attributes__r7       x/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/aimv2/modular_aimv2.pyr!   r!   (   s    * K!s!s  46Jd3i%S/16L%%(us{(HdHdJ#u#HdIt#%NrG   r!   c                       \ rS rSr% Sr\\S'   Sr\\S'   Sr\\S'   Sr	\\S	'   S
r
\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   \" 5       r\" 5       r\" 5       r\" 5       rS rSrg)Aimv2TextConfigQ   i   
vocab_sizei   r#   i   r$      r&      r(   M   max_position_embeddingsr1   r2   r+   r,   Fr/   r0   r3   r4   c                 0    [         R                  " S0 UD6  g )Nr7   )r   __post_init__)selfkwargss     rH   rR   Aimv2TextConfig.__post_init__d   s    &&00rG   r7   N)r8   r9   r:   r;   rL   r=   r>   r#   r$   r&   r(   rP   r2   rC   r,   rA   r/   rB   r0   r4   rD   bos_token_idpad_token_idrE   projection_sizerR   rF   r7   rG   rH   rJ   rJ   Q   s     JK!s!s  #%S%JL%HdHd#u#!#L!#L#%N$&O1rG   rJ   c                   D    \ rS rSr% SrSr\\S'   Sr\	\S'   Sr
\	\S'   S	rg
)Aimv2Configh   a  
max_logit_scale (`float`, *optional*, defaults to `100.0`):
    The maximum logit scale to use

Example:

```python
>>> from transformers import Aimv2Config, Aimv2Model

>>> # Initializing a Aimv2Config with apple/aimv2-large-patch14-224-lit style configuration
>>> configuration = Aimv2Config()

>>> # Initializing a Aimv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration
>>> model = Aimv2Model(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

>>> # We can also initialize a Aimv2Config from a Aimv2TextConfig and a Aimv2VisionConfig
>>> from transformers import Aimv2TextConfig, Aimv2VisionConfig

>>> # Initializing a AIMv2Text and AIMv2Vision configuration
>>> config_text = Aimv2TextConfig()
>>> config_vision = Aimv2VisionConfig()

>>> config = Aimv2Config(text_config=config_text, vision_config=config_vision)
```i   projection_dimg/L
F@logit_scale_init_valueg      Y@max_logit_scaler7   N)r8   r9   r:   r;   r<   r\   r=   r>   r]   rA   r^   rF   r7   rG   rH   rZ   rZ   h   s(    8 NC$*E*"OU"rG   rZ   c                       \ rS rSrSrg)Aimv2Output   r7   Nr8   r9   r:   r;   rF   r7   rG   rH   r`   r`          rG   r`   c                       \ rS rSrSrg)Aimv2RMSNorm   r7   Nrb   r7   rG   rH   re   re      rc   rG   re   c                       \ rS rSrSrg)Aimv2MLP   r7   Nrb   r7   rG   rH   rh   rh      rc   rG   rh   c                      ^  \ rS rSrS\4U 4S jjr\SSS\R                  4S\R                  4S jj5       r
S	\R                  S\R                  4S
 jrSrU =r$ )Aimv2VisionEmbeddings   configc                 B  > [         TU ]  5         Xl        UR                  U l        [        R
                  " UR                  UR                  UR                  UR                  S9U l        [        UR                  UR                  5      U l        UR                  UR                  -  S-  nU R                  R                  (       d%  [        R                  " X!R                  5      U l        U R!                  S["        R$                  " U5      R'                  S5      SS9  g )N)kernel_sizestrider   position_ids   F)
persistent)super__init__rm   r*   r   Conv2dnum_channelsr#   patch_embedre   r,   rms_norm
image_sizer6   	Embeddingposition_embeddingregister_buffertorcharangeexpand)rS   rm   num_patches	__class__s      rH   rw   Aimv2VisionEmbeddings.__init__   s     ++99!3!3ARAR[a[l[l
 %V%7%79L9LM((F,=,==!C{{$$&(ll;@R@R&SD#^U\\+-F-M-Mg-VchirG      g     @cpureturnc                    [         R                  " [        U5      XTS9n[         R                  " [        U 5      XTS9n[         R                  " XgSS9u  pvUS-  n[         R                  " XUS9U-  n	SX9-  -  n	UR	                  5       S   U	S S S 24   -  n
UR	                  5       S   U	S S S 24   -  n[         R
                  " U
R                  5       U
R                  5       UR                  5       UR                  5       /SS9S S S 2S S 24   $ )	Ndtypedevicexy)indexing   g      ?).Nrs   dim)r   r   r=   meshgridflattenconcatsincos)heightwidth	embed_dimtemperaturer   r   grid_wgrid_hpos_dimomegaout_hout_ws               rH   "build_2d_sincos_position_embedding8Aimv2VisionEmbeddings.build_2d_sincos_position_embedding   s     c%jEc&kFFq.W&AGK{)* +eD!Gn< +eD!Gn<||UYY[%))+uyy{EIIKPVWXY]_`bcYcddrG   pixel_valuesc                    UR                  5       u    p#nU R                  U5      R                  S5      R                  SS5      nU R	                  U5      nU R
                  R                  (       aT  U R                  X0R                  -  X@R                  -  U R
                  R                  UR                  UR                  S9nOU R                  U R                  5      nXV-   nU$ )Nr   rs   )r   r   r   )sizerz   r   	transposer{   rm   r6   r   r*   r#   r   r   r~   rq   )rS   r   _r   r   hidden_states	pos_embeds          rH   forwardAimv2VisionEmbeddings.forward   s    *//11e((6>>qAKKAqQm4;;  ??//)(++11$++#)) @ I //0A0ABI%1rG   )rm   rz   r*   r~   r{   )r8   r9   r:   r;   r!   rw   staticmethodr   float32Tensorr   r   rF   __classcell__r   s   @rH   rk   rk      sb    j0 j !$'%u}}e	e e ELL U\\  rG   rk   c                       \ rS rSrSrg)Aimv2TextEmbeddings   r7   Nrb   r7   rG   rH   r   r      rc   rG   r   c                   (   ^  \ rS rSrU 4S jrSrU =r$ )Aimv2Attention   c                   > [         TU ]  U5        [        R                  " U R                  U R                  UR
                  S9U l        [        R                  " U R                  U R                  UR
                  S9U l        [        R                  " U R                  U R                  UR
                  S9U l        [        R                  " U R                  U R                  UR
                  S9U l	        g )Nbias)
rv   rw   r   Linearr   r/   k_projv_projq_projout_projrS   rm   r   s     rH   rw   Aimv2Attention.__init__   s     iiV__UiiV__UiiV__U		$..$..vWrG   )r   r   r   r   )r8   r9   r:   r;   rw   rF   r   r   s   @rH   r   r      s    X XrG   r   c            	          ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S-  S\\	   S\R                  4S	 jjr
S
rU =r$ )Aimv2EncoderLayer   rm   c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        UR                  UR                  5      U l	        [        UR                  UR                  5      U l
        g N)rv   rw   r   	attentionrh   ffnre   r#   r,   	rms_norm1	rms_norm2r   s     rH   rw   Aimv2EncoderLayer.__init__   sZ    '/F#%f&8&8&:M:MN%f&8&8&:M:MNrG   Nr   attention_maskrT   r   c                     U R                  U5      nU R                  " SXBS.UD6u  pVX-   nU R                  U5      nU R                  U5      nX-   nU$ )N)r   r   r7   )r   r   r   r   )rS   r   r   rT   norm_hidden_statesattn_outputr   
mlp_outputs           rH   r   Aimv2EncoderLayer.forward   sa     "^^M:r6Hrkqr%3!^^M:XX01
%2rG   )r   r   r   r   r   )r8   r9   r:   r;   r!   rw   r   r   r   r   r   rF   r   r   s   @rH   r   r      s^    O0 O /3|| t+ +,	
 
 rG   r   c                       \ rS rSrSrg)Aimv2Encoder   r7   Nrb   r7   rG   rH   r   r      rc   rG   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Aimv2AttentionPoolingHead   rm   c                   > [         TU ]  5         UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " [        R                  " SSU R                  5      5      U l        [
        R                  " U R                  U R                  SS9U l        g )Nr   rs   T)rv   rw   r#   r(   	num_headsr   r   r/   r   r   	Parameterr   zeros	cls_tokenoutput_projr   s     rH   rw   "Aimv2AttentionPoolingHead.__init__   s    !--33ii 0 0$2B2BYii 0 0$2B2BYekk!Q8H8H&IJ99T%5%5t7G7GdSrG   r   r   c                    UR                   u  p#nU R                  R                  USS5      nU R                  U5      R	                  X#U R
                  X@R
                  -  5      nU R                  U5      R	                  X#U R
                  X@R
                  -  5      nUR	                  USU R
                  X@R
                  -  5      nUR                  SSSS5      nUR                  SSSS5      nUR                  SSSS5      n[        R                  " XU5      n	U	R                  SS5      R	                  USU5      n	U	R                  SS9n	U R                  U	5      n
U
$ )Nrt   rs   r   r   r   r   )shaper   r   r   reshaper   r   permuteFscaled_dot_product_attentionr   meanr   )rS   r   
batch_sizeseq_len
hidden_dimr   keyvaluequeryr   outputs              rH   r   !Aimv2AttentionPoolingHead.forward  s8   *7*=*='
ZNN))*b"=	kk-(00dnnV`drdrVrsM*22:XbftftXtu!!*a~~A]^kk!Q1%aAq)aAq)44UG!++Aq199*aT!&&1&-!!+.rG   )r   r#   r   r   r   r   )r8   r9   r:   r;   r!   rw   r   r   r   rF   r   r   s   @rH   r   r      s2    	T0 	TU\\ ell  rG   r   c                      ^  \ rS rSr% Sr\\S'   SrSrSr	/ SQr
SrSrSr\R                  " 5       U 4S j5       rS	rU =r$ )
Aimv2PreTrainedModeli  z
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models. The model is only intended for inference and doesn't support finetuning.
rm   aimv2)imageT)r   r   rk   r   c                 ^  > [         TU ]  U5        [        US5      (       a`  [        UR                  [
        R                  5      (       a6  [        R                  " UR                  [        R                  " S5      5        g g [        U[        5      (       a5  [        R                  " UR                  SU R                  R                  S9  g [        U[         5      (       a\  [        R"                  " UR$                  [&        R(                  " UR$                  R*                  S   5      R-                  S5      5        g [        U[.        5      (       a\  [        R"                  " UR$                  [&        R(                  " UR$                  R*                  S   5      R-                  S5      5        g g )Nlogit_scaleg$I$I,@r-   )r   stdrt   rr   )rv   _init_weightshasattr
isinstancer   r   r   init	constant_mathlogr   normal_r   rm   r4   rk   copy_rq   r   r   r   r   r   )rS   moduler   s     rH   r   "Aimv2PreTrainedModel._init_weights-  s   f%6=))&,,bll;;v11488H3EF < 9::LL))9V9VW 566JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 344JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 5rG   r7   )r8   r9   r:   r;   r<   rZ   r>   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attnr   no_gradr   rF   r   r   s   @rH   r   r     sW    
 !&*# N
]]_
i 
irG   r   zL
    The Vision model from AIMv2 without any head or projection on top.
    )custom_introc                      ^  \ rS rSr% \\S'   Sr\\S.r	S\4U 4S jjr
S\R                  4S jr\\" SS	9\S
\\   S\4S j5       5       5       rSrU =r$ )Aimv2VisionModeli;  rm   r   r   
attentionsc                 >  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        UR                  UR                  5      U l
        UR                  U l        U R                  (       a  [        U5      U l        U R                  5         g r   )rv   rw   rm   rk   
embeddingsr   encoderre   r#   r,   r{   r5   r   head	post_initr   s     rH   rw   Aimv2VisionModel.__init__H  so     /7#F+$V%7%79L9LM==1&9DIrG   r   c                 .    U R                   R                  $ r   )r  rz   rS   s    rH   get_input_embeddings%Aimv2VisionModel.get_input_embeddingsV  s    ***rG   Ftie_last_hidden_statesrT   c                     U R                  U5      nU R                  " SSU0UD6nUR                  nU R                  U5      nU R                  (       a  U R                  U5      OSn[        UUS9$ )a  
Examples:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Siglip2VisionModel

>>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
>>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled features
```inputs_embedsNlast_hidden_statepooler_outputr7   )r  r  r  r{   r5   r  r   )rS   r   rT   r   encoder_outputsr  r  s          rH   r   Aimv2VisionModel.forwardY  sx    < 5+/<< ,
',
,

 ,== MM*;<8<		"344)/'
 	
rG   )rm   r  r  r  r{   r5   )r8   r9   r:   r;   r!   r>   main_input_namer   r   _can_record_outputsrw   r   Moduler  r   r   r   r   r   r   r   rF   r   r   s   @rH   r	  r	  ;  s~     $O*$
0 +bii +  E2*
 +,*
 
$	*
  3  *
rG   r	  zJ
    The text model from AIMv2 without any head or projection on top.
    c            
          ^  \ rS rSrSr\\S.rS\4U 4S jjr	S\
R                  4S jrS r\\" S	S
9\ SS\R$                  S-  S\\   S\4S jj5       5       5       rSrU =r$ )Aimv2TextModeli  	input_idsr
  rm   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        UR                  UR                  5      U l
        UR                  U l        U R                  5         g r   )rv   rw   rm   r   r  r   r  re   r#   r,   r{   eos_token_idr  r   s     rH   rw   Aimv2TextModel.__init__  s_     -f5#F+$V%7%79L9LM"//rG   r   c                 .    U R                   R                  $ r   r  token_embeddingr  s    rH   r  #Aimv2TextModel.get_input_embeddings  s    ...rG   c                 $    XR                   l        g r   r)  )rS   r   s     rH   set_input_embeddings#Aimv2TextModel.set_input_embeddings  s    */'rG   Fr  Nr   rT   c                    U R                  U5      nUR                  u  pVn[        R                  " U[        R                  UR
                  S9nUR                  S5      R                  US5      nUb  [        U R                  UUUS S9nU R                  " S	UUS.UD6n	U	R                  n
U R                  U
5      n
U
[        R                  " U
R                  S   U
R
                  S9UR                  [        R                  U
R
                  S9U R                  :H  R                  5       R!                  SS94   n[#        U
US9$ )
Nr   r   rt   )rm   r  rq   r   past_key_values)r  r   )r   r   r  r7   )r  r   r   r   longr   	unsqueezer   r   rm   r  r  r{   tor=   r&  argmaxr   )rS   r$  r   rT   r   r   r   r   rq   r  r  pooled_outputs               rH   r   Aimv2TextModel.forward  sG    	2!.!4!4
Q||G5::mFZFZ[#--a077
BG%/{{+)- $N ,, 
')
 
 ,== MM*;< *LL*003<M<T<TU\\		2C2J2J\KtO`O``eegnnsunvx

 */'
 	
rG   )rm   r  r  r&  r{   r   )r8   r9   r:   r;   r  r   r   r   rJ   rw   r   r!  r  r-  r   r   r   r   r   r   r   r   r   rF   r   r   s   @rH   r#  r#    s     "O +$
	 	/bii /0  E2 /3&
 t+&
 +,	&

 
$&
  3  &
rG   r#  c                       \ rS rSrSrS\4S jr\\   SS\	R                  S-  S\	R                  S-  S\	R                  S-  S	\\   S
\4
S jj5       5       rSrg)
Aimv2Modeli  Trm   c                    [         R                  " X5        UR                  U l        UR                  R                  U l        UR                  R                  U l        [        R                  UR                  5      U l
        [        R                  UR                  5      U l        [        R                  " U R
                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [        R"                  " [$        R&                  " U R(                  R*                  5      5      U l        [.        R0                  " UR2                  5      U l        U R7                  5         g )NFr   )r   rw   r\   vision_configr#   vision_embed_dimtext_configtext_embed_dimr	  _from_configvision_modelr#  
text_modelr   r   visual_projectiontext_projectionr   r   tensorrm   r]   r   r   r   r^   max_log_logit_scaler  )rS   rm   s     rH   rw   Aimv2Model.__init__  s      .$33 & 4 4 @ @$00<<,99&:N:NO(55f6H6HI!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY#'88F,B,B#C rG   Nr$  r   r   rT   r   c           	          U R                   " SSU0UD6nU R                  " SUUS.UD6nUR                  nU R                  U5      nUR                  nU R	                  U5      nU[        U5      -  nU[        U5      -  nU R                  R                  SU R                  5      R                  5       R                  UR                  5      n	X-  UR                  5       -  n
U
R                  5       n[        UU
UUUUS9$ )aD  
Examples:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Aimv2Model

>>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
>>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(
...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
... )

>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```r   )r$  r   r-   )logits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr7   )r?  r@  r  rA  rB  r   r   clamprD  expr3  r   tr`   )rS   r$  r   r   rT   vision_outputstext_outputsrJ  rI  r   rH  rG  s               rH   r   Aimv2Model.forward  s   B 6:5F5F 6
%6
6

 48?? 4
)4
 4
 &33--l;"00**;7 $&6|&DD!$4[$AA&&,,S$2J2JKOOQTTU`UgUgh&48HH*,,.-+#%* .
 	
rG   )	r   rD  r\   r=  r@  rB  r;  r?  rA  )NNN)r8   r9   r:   r;   r  rZ   rw   r   r   r   
LongTensorFloatTensorr   r   r   r`   r   rF   r7   rG   rH   r8  r8    s    { $  .215.2	?
##d*?
 ''$.?
 t+	?

 +,?
 
?
  ?
rG   r8  )rZ   r!   rJ   r	  r8  r   r#  )Br<   r   r   torch.nn.functionalr   
functionalr   huggingface_hub.dataclassesr    r   r   configuration_utilsr   masking_utilsr   modeling_layersr	   modeling_outputsr
   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   clip.modeling_clipr   r   r   llama.modeling_llamar   r   siglip.configuration_siglipr   r   r   siglip.modeling_siglipr   r   r   r!   rJ   rZ   r`   re   rh   r!  rk   r   r   r   r   r   r   r	  r#  r8  __all__r7   rG   rH   <module>rg     s   ,     .  & 3 / 9 K - & I I 7 5 P P 9 \ \ Q Q >?$&* $&  @$&N >?1& 1  @1* >?#, #  @#D	, 		< 		x 	1BII 1h	, 	X_ X2 2	= 			 D i? i iD 
F
+ F

F
R 
B
) B

B
J V
 V
 V
rrG   