
    Z jB                        S SK Jr  S SKrS SKJr  S SKJr  SSKJr	  SSK
Jr  SSKJrJr  SSKJrJr  SS	KJr  SS
KJrJrJr  SSKJrJrJrJrJrJr  SSKJ r   SSK!J"r"J#r#  \RH                  " \%5      r&\" SS9\ " S S\5      5       5       r' " S S\5      r( " S S\"5      r) " S S\5      r* " S S\5      r+ " S S\5      r, " S S\5      r-\ " S S \5      5       r. " S! S"\5      r// S#Qr0g)$    )CallableN)strict   )initialization)PreTrainedConfig)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging   )CLIPMLPCLIPAttentionCLIPEncoderCLIPEncoderLayerCLIPVisionEmbeddingsCLIPVisionModel)eager_attention_forward)VisionRotaryEmbeddingapply_rotary_pos_emb_visionz&DeepGlint-AI/mlcd-vit-bigG-patch14-336)
@auto_docstring(checkpoint="DeepGlint-AI/mlcd-vit-bigG-patch14-336")
@strict(accept_kwargs=True)
class MLCDVisionConfig(PreTrainedConfig):
    r"""
num_key_value_groups (`int`, *optional*, defaults to 1):
    Number of key-value groups used in Attention.

Example:

```python
>>> from transformers import MLCDVisionConfig, MLCDVisionModel

>>> # Initializing a MLCDVisionConfig with DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
>>> configuration = MLCDVisionConfig()

>>> # Initializing a MLCDVisionModel (with random weights) from the DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
>>> model = MLCDVisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
    ```"""

    model_type = "mlcd_vision_model"
    base_config_key = "vision_config"

    hidden_size: int = 1664
    intermediate_size: int = 8192
    num_hidden_layers: int = 48
    num_attention_heads: int = 16
    num_key_value_groups: int = 1
    num_channels: int = 3
    image_size: int | list[int] | tuple[int, int] = 336
    patch_size: int | list[int] | tuple[int, int] = 14
    hidden_act: str = "gelu"
    layer_norm_eps: float = 1e-5
    attention_dropout: float = 0.0
    initializer_range: float = 0.02
    initializer_factor: float = 1.0


class MLCDMLP(CLIPMLP):
    pass


class MLCDRotaryEmbedding(VisionRotaryEmbedding):
    def forward(self, num_patches_height: int, num_patches_width: int) -> torch.Tensor:
        """
Calculate the Rotary Position Embedding (RoPE) for MLCDVisionModel based on the grid size.

Args:
    num_patches_height (int): Number of patches in the height dimension.
    num_patches_width (int): Number of patches in the width dimension.

Returns:
            torch.Tensor: Rotary positional embeddings for the given grid size.
        """
        # Generate one (height, width) position id per patch, flattened row-major.
        hpos_ids = (
            torch.arange(num_patches_height, device=self.inv_freq.device)
            .unsqueeze(1)
            .expand(-1, num_patches_width)
        )
        wpos_ids = (
            torch.arange(num_patches_width, device=self.inv_freq.device)
            .unsqueeze(0)
            .expand(num_patches_height, -1)
        )
        pos_ids = torch.stack([hpos_ids.flatten(), wpos_ids.flatten()], dim=-1)

        # Build the frequency table once for the larger grid side, then index it per patch.
        max_grid_size = max(num_patches_height, num_patches_width)
        seq = torch.arange(max_grid_size, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
        rotary_pos_emb_full = torch.outer(seq, self.inv_freq)

        # Select the height/width rows for every patch and lay them side by side.
        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
        return rotary_pos_emb


U =r$ )MLCDVisionEmbeddingsw   configc                 (   > [         TU ]  U5        U ?g N)super__init__position_embeddingr]   rk   	__class__s     rD   ro   MLCDVisionEmbeddings.__init__x   s     #rC   pixel_valuesrM   c                 H   UR                   S   nU R                  R                  R                  nU R                  UR	                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      n[        R                  " XT/SS9nU$ )Nr   rS   r   r&   rP   rQ   )shapepatch_embeddingweightrS   torZ   	transposeclass_embeddingrX   rT   cat)r]   rt   
batch_sizetarget_dtypepatch_embedsclass_embeds
embeddingss          rD   re   MLCDVisionEmbeddings.forward|   s    !''*
++2288++LOO,O,OP#++A.88A>++22:q"EYY;C
rC   r4   )r5   r6   r7   r8   r   ro   rT   FloatTensorrg   re   rB   __classcell__rr   s   @rD   ri   ri   w   s2    $/ $
E$5$5 
%,, 
 
rC   ri   c                      ^  \ rS rSrSrS\4U 4S jjr SS\R                  S\	\R                  \R                  4   S\R                  S-  S	\
\   S
\	\R                  \R                  S-  4   4
S jjrSrU =r$ )MLCDAttention   zMulti-headed attention with RoPE. Refer to papers:
- Attention is all you need:
    https://huggingface.co/papers/1706.03762
- RoFormer: Enhanced Transformer with Rotary Position Embedding:
        https://huggingface.co/papers/2104.09864
    """

    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        self.num_key_value_groups = config.num_key_value_groups
        self.is_causal = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        batch_size, seq_length = hidden_states.shape[:-1]

        # Each of shape: [batch_size, seq_length, num_heads, head_dim]
        query_states = self.q_proj(hidden_states).reshape((batch_size, seq_length, self.num_heads, self.head_dim))
        key_states = self.k_proj(hidden_states).reshape((batch_size, seq_length, self.num_heads, self.head_dim))
        value_states = self.v_proj(hidden_states).reshape((batch_size, seq_length, self.num_heads, self.head_dim))

        # Apply the rotary position embeddings to queries and keys.
        cos = position_embeddings[0].unsqueeze(0).float()
        sin = position_embeddings[1].unsqueeze(0).float()
        query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)

        # Each of shape: [batch_size, num_heads, seq_length, head_dim]
        query_states = query_states.permute(0, 2, 1, 3).contiguous()
        key_states = key_states.permute(0, 2, 1, 3).contiguous()
        value_states = value_states.permute(0, 2, 1, 3).contiguous()

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scale,
            is_causal=self.is_causal,
            **kwargs,
        )

        # [batch_size, num_heads, seq_length, head_dim] -> [seq_length, batch_size, embed_dim]
        attn_output = attn_output.permute(2, 0, 1, 3).contiguous()
        attn_output = attn_output.reshape(seq_length, batch_size, -1)
        attn_output = self.out_proj(attn_output)
        # Back to [batch_size, seq_length, embed_dim] for the residual stream.
        attn_output = attn_output.permute(1, 0, 2).contiguous()
        return attn_output, attn_weights


class MLCDEncoderLayer(CLIPEncoderLayer):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        self.self_attn = MLCDAttention(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        """
Args:
    hidden_states (`torch.FloatTensor`):
        Input to the layer of shape `(batch, seq_len, embed_dim)`.
        Represents the hidden states from the previous layer or the input embeddings.
    position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
        A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
        Represents absolute positional embeddings for the query and key in the attention mechanism.
    attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


class MLCDEncoder(CLIPEncoder):
    """
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`MLCDEncoderLayer`].

Args:
        config: MLCDVisionConfig
    """

    def __init__(self, config: MLCDVisionConfig):
        """Overwrite dummy `MLCDConfig` to `MLCDVisionConfig`."""
        super().__init__(config)

    def forward(
        self,
        inputs_embeds: torch.FloatTensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        r"""
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
        A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
        Represents absolute positional embeddings for the query and key in the attention mechanism.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        """
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(
                hidden_states,
                position_embeddings,
                attention_mask,
                **kwargs,
            )

        return BaseModelOutput(last_hidden_state=hidden_states)


@auto_docstring
class MLCDPreTrainedModel(PreTrainedModel):
    config: MLCDVisionConfig
    base_model_prefix = "vision_model"
    _no_split_modules = ["MLCDEncoderLayer"]
    supports_gradient_checkpointing = True
    accepts_loss_kwargs = False
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": MLCDEncoderLayer,
        "attentions": MLCDAttention,
    }

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, MLCDVisionEmbeddings):
            factor = self.config.initializer_factor
            init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
        elif isinstance(module, MLCDAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            init.normal_(module.q_proj.weight, std=in_proj_std)
            init.normal_(module.k_proj.weight, std=in_proj_std)
            init.normal_(module.v_proj.weight, std=in_proj_std)
            init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, MLCDMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            init.normal_(module.fc1.weight, std=fc_std)
            init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, MLCDVisionModel):
            factor = self.config.initializer_factor
            pos_emb_std = (module.config.hidden_size // module.config.num_attention_heads // 2) ** -0.5 * factor
            init.normal_(module.class_pos_emb, mean=0.0, std=pos_emb_std)
        elif isinstance(module, nn.LayerNorm):
            init.zeros_(module.bias)
            init.ones_(module.weight)
        elif isinstance(module, nn.Linear):
            if module.bias is not None:
                init.zeros_(module.bias)
        elif isinstance(module, MLCDRotaryEmbedding):
            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float32) / module.dim))
            init.copy_(module.inv_freq, inv_freq)


class MLCDVisionModel(CLIPVisionModel):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
        self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2))

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
Example:

```python
>>> import httpx
>>> from io import BytesIO
>>> from PIL import Image
>>> from transformers import AutoProcessor, MLCDVisionModel
>>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")
>>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))
>>> inputs = processor(images=image, return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(**inputs, output_attentions=True)

>>> features = outputs.last_hidden_state
>>> print(f"Extracted features shape: {features.shape}")
>>> print(f"Number of attention layers: {len(outputs.attentions)}")
>>> print(f"Attention shape: {outputs.attentions[0].shape}")
        ```"""
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Infer the patch grid from the input resolution and build the rotary tables.
        num_patches_height = pixel_values.shape[-2] // self.config.patch_size
        num_patches_width = pixel_values.shape[-1] // self.config.patch_size
        rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width)
        rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device)
        rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0)
        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
        position_embeddings = (emb.cos(), emb.sin())

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            position_embeddings=position_embeddings,
            **kwargs,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
        )


__all__ = ["MLCDVisionConfig", "MLCDVisionModel", "MLCDPreTrainedModel"]