
"""PyTorch AltCLIP model."""

import torch
import torch.nn as nn
from huggingface_hub.dataclasses import strict

from ... import initialization as init
from ...masking_utils import create_bidirectional_mask
from ...modeling_outputs import BaseModelOutputWithPooling, BaseModelOutputWithPoolingAndProjection
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
from ...utils.generic import merge_with_config_defaults
from ...utils.output_capturing import capture_outputs
from ..chinese_clip.modeling_chinese_clip import (
    ChineseCLIPModel,
    ChineseCLIPTextAttention,
    ChineseCLIPTextLayer,
    ChineseCLIPTextSelfAttention,
)
from ..clip.configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
from ..clip.modeling_clip import (
    CLIPMLP,
    CLIPAttention,
    CLIPEncoder,
    CLIPEncoderLayer,
    CLIPOutput,
    CLIPPreTrainedModel,
    CLIPVisionEmbeddings,
    CLIPVisionModel,
    _get_vector_norm,
    image_text_contrastive_loss,
)
from ..roberta.modeling_roberta import (
    RobertaEmbeddings,
    RobertaIntermediate,
    RobertaOutput,
    RobertaPooler,
    RobertaSelfOutput,
)


@strict
@auto_docstring(checkpoint="BAAI/AltCLIP")
class AltCLIPTextConfig(CLIPTextConfig):
    r"""
project_dim (`int`, *optional*, defaults to 768):
    The dimensions of the teacher model before the mapping layer.

Examples:

```python
>>> from transformers import AltCLIPTextModel, AltCLIPTextConfig

>>> # Initializing an AltCLIPTextConfig with BAAI/AltCLIP style configuration
>>> configuration = AltCLIPTextConfig()

>>> # Initializing an AltCLIPTextModel (with random weights) from the BAAI/AltCLIP style configuration
>>> model = AltCLIPTextModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
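
>>> # Quick check of the documented default: the mapping layer projects to `project_dim`
>>> configuration.project_dim
768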
```"""

    vocab_size: int = 250002
    hidden_size: int = 1024
    num_hidden_layers: int = 24
    num_attention_heads: int = 16
    intermediate_size: int = 4096
    hidden_act: str = "gelu"
    hidden_dropout_prob: float = 0.1
    attention_probs_dropout_prob: float = 0.1
    max_position_embeddings: int = 514
    type_vocab_size: int = 1
    initializer_factor: float = 0.02
    pad_token_id: int | None = 1
    bos_token_id: int | None = 0
    eos_token_id: int | None = 2
    project_dim: int = 768

    # Attributes of the parent `CLIPTextConfig` that do not exist on this config
    projection_dim = AttributeError()
    attention_dropout = AttributeError()


@strict
@auto_docstring(checkpoint="BAAI/AltCLIP")
class AltCLIPVisionConfig(CLIPVisionConfig):
    r"""
Example:

```python
>>> from transformers import AltCLIPVisionConfig, AltCLIPVisionModel

>>> # Initializing an AltCLIPVisionConfig with BAAI/AltCLIP style configuration
>>> configuration = AltCLIPVisionConfig()

>>> # Initializing an AltCLIPVisionModel (with random weights) from the BAAI/AltCLIP style configuration
>>> model = AltCLIPVisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```"""


@strict
@auto_docstring(checkpoint="BAAI/AltCLIP")
class AltCLIPConfig(CLIPConfig):
    r"""
Example:

```python
>>> from transformers import AltCLIPConfig, AltCLIPModel

>>> # Initializing an AltCLIPConfig with BAAI/AltCLIP style configuration
>>> configuration = AltCLIPConfig()

>>> # Initializing an AltCLIPModel (with random weights) from the BAAI/AltCLIP style configuration
>>> model = AltCLIPModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

>>> # We can also initialize an AltCLIPConfig from an AltCLIPTextConfig and an AltCLIPVisionConfig

>>> # Initializing an AltCLIPText and an AltCLIPVision configuration
>>> config_text = AltCLIPTextConfig()
>>> config_vision = AltCLIPVisionConfig()

>>> config = AltCLIPConfig(text_config=config_text, vision_config=config_vision)
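
>>> # Sketch of a consistency check: with default values, the text tower's `project_dim`
>>> # matches the joint `projection_dim` of the composite config (both 768)
>>> config.text_config.project_dim
768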
```"""

    projection_dim: int = 768


class AltCLIPOutput(CLIPOutput):
    pass


class AltRobertaEmbeddings(RobertaEmbeddings):
    pass


class AltRobertaSelfAttention(ChineseCLIPTextSelfAttention):
    def __init__(self, config):
        super().__init__(config)
        self.is_causal = False


class AltRobertaSelfOutput(RobertaSelfOutput):
    pass


class AltRobertaAttention(ChineseCLIPTextAttention):
    def __init__(self, config):
        super().__init__()
        self.self = AltRobertaSelfAttention(config)
        self.output = AltRobertaSelfOutput(config)


class AltRobertaIntermediate(RobertaIntermediate):
    pass


class AltRobertaOutput(RobertaOutput):
    pass


class AltRobertaLayer(ChineseCLIPTextLayer):
    def __init__(self, config):
        super().__init__()
        self.attention = AltRobertaAttention(config)
        self.intermediate = AltRobertaIntermediate(config)
        self.output = AltRobertaOutput(config)


class AltRobertaEncoder(CLIPEncoder):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is an
    [`AltRobertaLayer`].

    Args:
        config: AltCLIPTextConfig
    """

    def __init__(self, config: AltCLIPTextConfig):
        super().__init__(config)
        self.layers = nn.ModuleList([AltRobertaLayer(config) for _ in range(config.num_hidden_layers)])


class AltRobertaPooler(RobertaPooler):
    pass


class AltCLIPAttention(CLIPAttention):
    pass


class AltCLIPMLP(CLIPMLP):
    pass


class AltCLIPEncoderLayer(CLIPEncoderLayer):
    def __init__(self, config: AltCLIPVisionConfig):
        super().__init__(config)


class AltCLIPEncoder(CLIPEncoder):
    pass


class AltCLIPVisionEmbeddings(CLIPVisionEmbeddings):
    pass


class AltCLIPPreTrainedModel(CLIPPreTrainedModel):
    _can_record_outputs = {
        "hidden_states": AltCLIPEncoderLayer,
        "attentions": AltCLIPAttention,
    }

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, AltCLIPVisionEmbeddings):
            init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
            init.copy_(module.position_ids, torch.arange(module.num_positions).expand((1, -1)))
        elif isinstance(module, AltCLIPAttention):
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            init.normal_(module.q_proj.weight, std=in_proj_std)
            init.normal_(module.k_proj.weight, std=in_proj_std)
            init.normal_(module.v_proj.weight, std=in_proj_std)
            init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, AltCLIPMLP):
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            init.normal_(module.fc1.weight, std=fc_std)
            init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, AltCLIPModel):
            init.normal_(module.text_projection.weight, std=module.text_embed_dim**-0.5 * factor)
            init.normal_(module.visual_projection.weight, std=module.vision_embed_dim**-0.5 * factor)
        elif isinstance(module, nn.LayerNorm):
            init.zeros_(module.bias)
            init.ones_(module.weight)
        elif isinstance(module, nn.Linear):
            init.normal_(module.weight, mean=0.0, std=factor)
            if module.bias is not None:
                init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            init.normal_(module.weight, mean=0.0, std=factor)
            if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                init.zeros_(module.weight[module.padding_idx])
        elif isinstance(module, AltRobertaEmbeddings):
            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
            init.zeros_(module.token_type_ids)


class AltCLIPVisionModel(CLIPVisionModel):
    def forward(self, **super_kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling:
        r"""
Examples:

```python
>>> import httpx
>>> from io import BytesIO
>>> from PIL import Image
>>> from transformers import AutoProcessor, AltCLIPVisionModel

>>> model = AltCLIPVisionModel.from_pretrained("BAAI/AltCLIP")
>>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled CLS states
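
>>> # Illustrative only: with the ViT-L/14 tower shipped in BAAI/AltCLIP (224x224 input,
>>> # 16x16 patches plus one class token), the hidden states have this shape
>>> list(last_hidden_state.shape)
[1, 257, 1024]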
```"""
        return super().forward(**super_kwargs)


@auto_docstring(
    custom_intro="""
    The model behaves as an encoder following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    """
)
class AltRobertaModel(AltCLIPPreTrainedModel):
    config: AltCLIPTextConfig
    input_modalities = "text"
    _input_embed_layer = "word_embeddings"
    _can_record_outputs = {
        "hidden_states": AltRobertaLayer,
        "attentions": AltRobertaSelfAttention,
    }

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)

        self.embeddings = AltRobertaEmbeddings(config)
        self.encoder = AltRobertaEncoder(config)
        self.pooler = AltRobertaPooler(config) if add_pooling_layer else None

        self.post_init()

    @merge_with_config_defaults
    @capture_outputs
    @can_return_tuple
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        r"""
Examples:

```python
>>> from transformers import AutoTokenizer, AltRobertaModel

>>> model = AltRobertaModel.from_pretrained("openai/alt_roberta-vit-base-patch32")
>>> tokenizer = AutoTokenizer.from_pretrained("openai/alt_roberta-vit-base-patch32")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
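
>>> # Note: `pooler_output` comes from `AltRobertaPooler`, a dense + tanh head applied
>>> # to the first token of `last_hidden_state`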
```"""
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
        )
        attention_mask = create_bidirectional_mask(
            config=self.config,
            input_embeds=embedding_output,
            attention_mask=attention_mask,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=attention_mask,
            **kwargs,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
        )


class AltCLIPTextModel(AltCLIPPreTrainedModel):
    config: AltCLIPTextConfig
    input_modalities = "text"
    _input_embed_layer = "word_embedding"
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)
        self.roberta = AltRobertaModel(config, add_pooling_layer=False)
        self.transformation = nn.Linear(config.hidden_size, config.project_dim)
        self.pre_LN = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.post_init()

    @merge_with_config_defaults
    @can_return_tuple
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPoolingAndProjection:
        r"""
Examples:

```python
>>> from transformers import AutoProcessor, AltCLIPTextModel

>>> model = AltCLIPTextModel.from_pretrained("BAAI/AltCLIP")
>>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

>>> texts = ["it's a cat", "it's a dog"]

>>> inputs = processor(text=texts, padding=True, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled CLS states
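
>>> # The Roberta hidden states are passed through `pre_LN` and the `transformation`
>>> # linear layer, so the returned states have width `config.project_dim`
>>> last_hidden_state.shape[-1]
768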
```"""
        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            **kwargs,
        )

        sequence_output = outputs[0]
        sequence_output = self.pre_LN(sequence_output)
        projection_state = self.transformation(sequence_output)
        pooler_output = projection_state[:, 0]

        return BaseModelOutputWithPoolingAndProjection(
            last_hidden_state=projection_state,
            pooler_output=pooler_output,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class AltCLIPModel(ChineseCLIPModel, AltCLIPPreTrainedModel):
    config: AltCLIPConfig

    def __init__(self, config: AltCLIPConfig):
        super().__init__(config)
        text_config = config.text_config

        self.project_dim = text_config.project_dim
        self.text_model = AltCLIPTextModel._from_config(self.config.text_config)
        self.vision_model = AltCLIPVisionModel._from_config(self.config.vision_config)

    def get_text_features(self, **super_kwargs):
        r"""
Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, AltCLIPModel

>>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
>>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> with torch.inference_mode():
...     text_features = model.get_text_features(**inputs)
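
>>> # Text features are projected into the joint image-text space; for BAAI/AltCLIP
>>> # the projection width is 768
>>> text_features.shape[-1]
768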
```"""
        return super().get_text_features(**super_kwargs)

    def get_image_features(self, **super_kwargs):
        r"""
Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, AltCLIPModel
>>> from transformers.image_utils import load_image

>>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
>>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> inputs = processor(images=image, return_tensors="pt")
>>> with torch.inference_mode():
...     image_features = model.get_image_features(**inputs)
```"""
        return super().get_image_features(**super_kwargs)

    @merge_with_config_defaults
    @can_return_tuple
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        return_loss: bool | None = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | AltCLIPOutput:
        r"""
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.

Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, AltCLIPModel
>>> from transformers.image_utils import load_image

>>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
>>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

>>> url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg"
>>> image = load_image(url)

>>> inputs = processor(text=["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"], images=image, return_tensors="pt", padding=True)

>>> with torch.inference_mode():
...     outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
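
>>> # One row per image, one column per candidate caption
>>> list(probs.shape)
[1, 4]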
```"""
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            **kwargs,
        )

        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)
        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / _get_vector_norm(image_embeds)
        text_embeds = text_embeds / _get_vector_norm(text_embeds)

        # cosine similarity as logits
        logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device))
        logits_per_text = logits_per_text * self.logit_scale.exp().to(text_embeds.device)
        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            loss = image_text_contrastive_loss(logits_per_text)

        return AltCLIPOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


__all__ = [
    "AltCLIPPreTrainedModel",
    "AltCLIPVisionModel",
    "AltCLIPTextModel",
    "AltCLIPModel",
    "AltCLIPTextConfig",
    "AltCLIPVisionConfig",
    "AltCLIPConfig",
]