
    Z j!                     ,   S r SSKrSSKJr  SSKJr  SSKJr  SSKrSSKJ	r	  SSK
Jr  SS	KJr  SS
KJr  SSKJrJrJrJr  SSKJrJr  SSKJr  SSKJr  SSKJrJrJ r J!r!  SSK"J#r#J$r$  SSK%J&r&  SSK'J(r(J)r)J*r*  \!RV                  " \,5      r-\ " SS9\ " S S\5      5       5       r.\ " SS9\ " S S\5      5       5       r/\\  " S S\5      5       5       r0S\Rb                  S\Rb                  4S jr2S \Rb                  S\Rb                  4S! jr3S"\*S#\44S$ jr5S]S%\4\6-  S&\74S' jjr8 " S( S)\	Rr                  5      r: " S* S+\	Rv                  5      r< " S, S-\	Rr                  5      r= " S. S/\	Rr                  5      r> " S0 S1\	Rr                  5      r? " S2 S3\	Rr                  5      r@ " S4 S5\	Rr                  5      rA " S6 S7\	Rr                  5      rB " S8 S9\	Rr                  5      rC S^S:\	Rr                  S;\Rb                  S<\Rb                  S=\Rb                  S>\Rb                  S-  S?\DS@\D4SA jjrE " SB SC\	Rr                  5      rF " SD SE\	Rr                  5      rG " SF SG\	Rr                  5      rH " SH SI\	Rr                  5      rI " SJ SK\	Rr                  5      rJ " SL SM\5      rK " SN SO\	Rr                  5      rL " SP SQ\	Rr                  5      rM\  " SR SS\5      5       rN\ " STS9 " SU SV\N5      5       rO\ " SWS9 " SX SY\N5      5       rP\  " SZ S[\N5      5       rQ/ S\QrRg)_zPyTorch ALIGN model.    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithNoAttentionBaseModelOutputWithPooling(BaseModelOutputWithPoolingAndNoAttention)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringlogging)can_return_tuplemerge_with_config_defaults)capture_outputs   )AlignConfigAlignTextConfigAlignVisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                     S-  \S'   Srg)AlignVisionModelOutput-   z
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
    The image embeddings obtained by applying the projection layer to the pooler_output.
Nimage_embedslast_hidden_statehidden_states )__name__
__module____qualname____firstlineno____doc__r"   torchFloatTensor__annotations__r#   r$   tuple__static_attributes__r%       y/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/align/modeling_align.pyr    r    -   sN    
 .2L%##d*126u((4/659M5**+d29r0   r    ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                     S-  \S'   Sr\\R                     S-  \S'   Srg)	AlignTextModelOutput>   z
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
    The text embeddings obtained by applying the projection layer to the pooler_output.
Ntext_embedsr#   r$   
attentionsr%   )r&   r'   r(   r)   r*   r5   r+   r,   r-   r#   r$   r.   r6   r/   r%   r0   r1   r3   r3   >   sh    
 -1K""T)026u((4/659M5**+d2926Je''(4/6r0   r3   c                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\\S	'   Sr\\S
'   S\\   4S jrSrg)AlignOutputP   a.  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
    The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
    similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
    The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
    similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The text embeddings obtained by applying the projection layer to the pooled output of [`AlignTextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The output of [`AlignVisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
    The output of the [`AlignTextModel`].
vision_model_output (`BaseModelOutputWithPoolingAndNoAttention`):
    The output of the [`AlignVisionModel`].
Nlosslogits_per_imagelogits_per_textr5   r"   text_model_outputvision_model_outputreturnc                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   n   >#    U  H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f))r=   r>   N)getattrto_tuple).0kselfs     r1   	<genexpr>'AlignOutput.to_tuple.<locals>.<genexpr>o   s<      
   LLDGRYZ^`aRbRkRkRmm s   25)r.   keysrF   s   `r1   rC   AlignOutput.to_tuplen   s#     
YY[
 
 	
r0   r%   )r&   r'   r(   r)   r*   r:   r+   r,   r-   r;   r<   r5   r"   r=   r   r>   r   r.   r   rC   r/   r%   r0   r1   r8   r8   P   s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*14818DHAH
%* 
r0   r8   logitsr?   c                     [         R                  R                  U [        R                  " [        U 5      U R                  S9SS9$ )Ndeviceg?)label_smoothing)r   
functionalcross_entropyr+   arangelenrO   )rL   s    r1   contrastive_lossrU   w   s5    ==&&vu||CKPVP]P]/^ps&ttr0   
similarityc                 X    [        U 5      n[        U R                  5       5      nX-   S-  $ )Ng       @)rU   t)rV   caption_loss
image_losss      r1   
align_lossr[   {   s*    #J/L!*,,.1J%,,r0   confignum_channelsc                     U R                   nXR                  -  n[        U[        XS-  -   5      U-  U-  5      nUSU-  :  a  X2-  n[        U5      $ )z4
Round number of filters based on depth multiplier.
   g?)depth_divisorwidth_coefficientmaxint)r\   r]   divisornew_dims       r1   round_filtersrf      s`     ""G,,,L'3|k9:gEOPG |##w<r0   kernel_sizeadjustc                     [        U [        5      (       a  X 4n U S   S-  U S   S-  4nU(       a  US   S-
  US   US   S-
  US   4$ US   US   US   US   4$ )a.  
Utility function to get the tuple padding value for the depthwise convolution.

Args:
    kernel_size (`int` or `tuple`):
        Kernel size of the convolution layers.
    adjust (`bool`, *optional*, defaults to `True`):
        Adjusts padding value to apply to right and bottom sides of the input.
r   r_   r   )
isinstancerc   )rg   rh   corrects      r1   correct_padrl      s~     +s##"01~"KNa$78G
Q
GAJNGAJGG
GAJ
GAJ??r0   c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	AlignVisionEmbeddings   zD
A module that corresponds to the stem module of the original work.
r\   c           	      |  > [         TU ]  5         [        US5      U l        [        R
                  " SS9U l        [        R                  " UR                  U R                  SSSSS9U l	        [        R                  " U R                  UR                  UR                  S	9U l        [        UR                     U l        g )
N    )r   r   r   r   paddingr   r_   validFrg   striders   bias)epsmomentum)super__init__rf   out_dimr   	ZeroPad2drs   Conv2dr]   convolutionBatchNorm2dbatch_norm_epsbatch_norm_momentum	batchnormr	   
hidden_act
activationrF   r\   	__class__s     r1   r{   AlignVisionEmbeddings.__init__   s    $VR0||L9991QPW^c
 &:O:OZ`ZtZtu !2!23r0   pixel_valuesr?   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ N)rs   r   r   r   )rF   r   featuress      r1   forwardAlignVisionEmbeddings.forward   sA    <<-##H->>(+??8,r0   )r   r   r   r|   rs   )r&   r'   r(   r)   r*   r   r{   r+   Tensorr   r/   __classcell__r   s   @r1   rn   rn      s5    	40 	4ELL U\\  r0   rn   c                   :   ^  \ rS rSr       SU 4S jjrSrU =r$ )AlignVisionDepthwiseConv2d   c	                 8   > X-  n	[         T
U ]  UU	UUUUUUUS9	  g )N)	in_channelsout_channelsrg   rv   rs   dilationgroupsrw   padding_mode)rz   r{   )rF   r   depth_multiplierrg   rv   rs   r   rw   r   r   r   s             r1   r{   #AlignVisionDepthwiseConv2d.__init__   s:     #5#%#% 	 
	
r0   r%   )r   r   r   r   r   Tzeros)r&   r'   r(   r)   r{   r/   r   r   s   @r1   r   r      s$     
 
r0   r   c                   z   ^  \ rS rSrSrS\S\S\S\4U 4S jjrS\R                  S	\R                  4S
 jrSrU =r$ )AlignVisionExpansionLayer   zW
This corresponds to the expansion phase of each block in the original implementation.
r\   in_dimr|   rv   c                    > [         TU ]  5         [        R                  " UUSSSS9U l        [        R
                  " X1R                  S9U l        [        UR                     U l
        g )Nr   sameFr   r   rg   rs   rw   )num_featuresrx   )rz   r{   r   r~   expand_convr   r   	expand_bnr	   r   
expand_act)rF   r\   r   r|   rv   r   s        r1   r{   "AlignVisionExpansionLayer.__init__   sX    99 
 WBWBWX !2!23r0   r$   r?   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   rF   r$   s     r1   r   !AlignVisionExpansionLayer.forward   s4    ((7}56r0   )r   r   r   )r&   r'   r(   r)   r*   r   rc   r{   r+   r,   r   r   r/   r   r   s   @r1   r   r      sM    
40 
4# 
4 
4UX 
4U%6%6 5<<  r0   r   c            
       ~   ^  \ rS rSrSrS\S\S\S\S\4
U 4S jjrS	\	R                  S
\	R                  4S jrSrU =r$ )AlignVisionDepthwiseLayer   zc
This corresponds to the depthwise convolution phase of each block in the original implementation.
r\   r   rv   rg   adjust_paddingc                 F  > [         TU ]  5         X0l        U R                  S:X  a  SOSn[        XES9n[        R
                  " US9U l        [        X$X6SS9U l        [        R                  " X!R                  UR                  S9U l        [        UR                     U l        g )	Nr_   rt   r   )rh   rr   Fru   r   rx   ry   )rz   r{   rv   rl   r   r}   depthwise_conv_padr   depthwise_convr   r   r   depthwise_normr	   r   depthwise_act)	rF   r\   r   rv   rg   r   conv_padrs   r   s	           r1   r{   "AlignVisionDepthwiseLayer.__init__   s     	"kkQ.7FkA"$,,w"?8FSX
 !nn%:%:VE_E_
 $F$5$56r0   r$   r?   c                     U R                   S:X  a  U R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ )Nr_   )rv   r   r   r   r   r   s     r1   r   !AlignVisionDepthwiseLayer.forward  sT    ;;! 33MBM++M:++M:**=9r0   )r   r   r   r   rv   r&   r'   r(   r)   r*   r   rc   boolr{   r+   r,   r   r   r/   r   r   s   @r1   r   r      s_    7!7 7 	7
 7 7,	U%6%6 	5<< 	 	r0   r   c            	       ~   ^  \ rS rSrSrSS\S\S\S\4U 4S jjjrS\	R                  S	\	R                  4S
 jrSrU =r$ )AlignVisionSqueezeExciteLayeri  zd
This corresponds to the Squeeze and Excitement phase of each block in the original implementation.
r\   r   
expand_dimexpandc                   > [         TU ]  5         U(       a  UOUU l        [        S[	        X!R
                  -  5      5      U l        [        R                  " SS9U l	        [        R                  " U R                  U R                  SSS9U l        [        R                  " U R                  U R                  SSS9U l        [        UR                     U l        [        R                   " 5       U l        g )Nr   )output_sizer   )r   r   rg   rs   )rz   r{   dimrb   rc   squeeze_expansion_ratiodim_ser   AdaptiveAvgPool2dsqueezer~   reducer   r	   r   
act_reduceSigmoid
act_expand)rF   r\   r   r   r   r   s        r1   r{   &AlignVisionSqueezeExciteLayer.__init__$  s    !':V!S*H*H!HIJ++:ii	
 ii	
 !!2!23**,r0   r$   r?   c                     UnU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      n[
        R                  " X!5      nU$ r   )r   r   r   r   r   r+   mul)rF   r$   inputss      r1   r   %AlignVisionSqueezeExciteLayer.forward9  sa    ]3M26M26		&8r0   )r   r   r   r   r   r   r   )Fr   r   s   @r1   r   r     sR    '0 '# '3 'X\ ' '*
U%6%6 
5<< 
 
r0   r   c                      ^  \ rS rSrSrS\S\S\S\S\S\4U 4S	 jjr	S
\
R                  S\
R                  S\
R                  4S jrSrU =r$ )AlignVisionFinalBlockLayeriF  zS
This corresponds to the final phase of each block in the original implementation.
r\   r   r|   rv   	drop_rateid_skipc                   > [         TU ]  5         US:H  =(       a    U(       + U l        [        R                  " UUSSSS9U l        [        R                  " X1R                  UR                  S9U l	        [        R                  " US9U l        g )Nr   r   Fr   r   )p)rz   r{   apply_dropoutr   r~   project_convr   r   r   
project_bnDropoutdropout)rF   r\   r   r|   rv   r   r   r   s          r1   r{   #AlignVisionFinalBlockLayer.__init__K  sx     	#q[8[II 
 .. &;&;fF`F`
 zzI.r0   
embeddingsr$   r?   c                     U R                  U5      nU R                  U5      nU R                  (       a  U R                  U5      nX!-   nU$ r   )r   r   r   r   )rF   r   r$   s      r1   r   "AlignVisionFinalBlockLayer.forward\  sE    ))-86 LL7M)6Mr0   )r   r   r   r   r&   r'   r(   r)   r*   r   rc   floatr   r{   r+   r,   r   r   r/   r   r   s   @r1   r   r   F  so    /'/14/?B/LO/\a/lp/"%"3"3 EDUDU Z_ZfZf  r0   r   c                      ^  \ rS rSrSrS\S\S\S\S\S\S	\S
\S\4U 4S jjr	S\
R                  S\
R                  4S jrSrU =r$ )AlignVisionBlockig  a1  
This corresponds to the block module of original the EfficientNet vision encoder implementation.

Args:
    config ([`AlignVisionConfig`]):
        Model configuration class.
    in_dim (`int`):
        Number of input channels.
    out_dim (`int`):
        Number of output channels.
    stride (`int`):
        Stride size to be used in convolution layers.
    expand_ratio (`int`):
        Expand ratio to set the output dimensions for the expansion and squeeze-excite layers.
    kernel_size (`int`):
        Kernel size for the depthwise convolution layer.
    drop_rate (`float`):
        Dropout rate to be used in the final phase of each block.
    id_skip (`bool`):
        Whether to apply dropout and sum the final hidden states with the input embeddings during the final phase
        of each block. Set to `True` for the first block of each stage.
    adjust_padding (`bool`):
        Whether to apply padding to only right and bottom side of the input kernel before the depthwise convolution
        operation, set to `True` for inputs with odd input sizes.
r\   r   r|   rv   expand_ratiorg   r   r   r   c
           	      f  > [         TU ]  5         XPl        U R                  S:g  U l        X%-  n
U R                  (       a  [	        XXS9U l        [        UU R                  (       a  U
OUUUU	S9U l        [        XXR                  S9U l	        [        UU R                  (       a  U
OUUUUUS9U l        g )Nr   )r\   r   r|   rv   )r\   r   rv   rg   r   )r\   r   r   r   )r\   r   r|   rv   r   r   )rz   r{   r   r   r   	expansionr   r   r   squeeze_exciter   
projection)rF   r\   r   r|   rv   r   rg   r   r   r   expand_in_dimr   s              r1   r{   AlignVisionBlock.__init__  s     	(''1,-;;6mDN 8$(KK=V#)
 <];;
 5$(KK=V
r0   r$   r?   c                     UnU R                   S:w  a  U R                  U5      nU R                  U5      nU R                  U5      nU R	                  X!5      nU$ Nr   )r   r   r   r   r   )rF   r$   r   s      r1   r   AlignVisionBlock.forward  sY    "
! NN=9M++M: ++M:
Br0   )r   r   r   r   r   r   r   r   s   @r1   r   r   g  s    4'
!'
 '
 	'

 '
 '
 '
 '
 '
 '
R
U%6%6 
5<< 
 
r0   r   c                   d   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\	\
   S\4S jrS	rU =r$ )
AlignVisionEncoderi  z
Forward propagates the embeddings through each vision encoder (EfficientNet) block.

Args:
    config ([`AlignVisionConfig`]):
        Model configuration class.
r\   c                   >^ ^ [         TT ]  5         UR                  T l        U 4S jm[        UR                  5      n[        U4S jUR                   5       5      nSn/ n[        U5       H  n[        XR                  U   5      n[        XR                  U   5      nUR                  U   n	UR                  U   n
UR                  U   n[        T" UR                  U   5      5       Hc  nUS:H  nUS:  a  SOU	n	US:  a  UOUnXAR                  ;  nUR                  U-  U-  n[        UUUU	U
UUUUS9	nUR!                  U5        US-  nMe     M     ["        R$                  " U5      T l        g )Nc                 \   > [        [        R                  " TR                  U -  5      5      $ r   )rc   mathceildepth_coefficient)repeatsrF   s    r1   round_repeats2AlignVisionEncoder.__init__.<locals>.round_repeats  s"    tyy!7!7'!ABCCr0   c              3   4   >#    U  H  nT" U5      v   M     g 7fr   r%   )rD   nr   s     r1   rG   .AlignVisionEncoder.__init__.<locals>.<genexpr>  s     L3Kaq))3Ks   r   r   )	r\   r   r|   rv   rg   r   r   r   r   )rz   r{   r   rT   r   sumnum_block_repeatsrangerf   r   strideskernel_sizesexpand_ratiosdepthwise_paddingdrop_connect_rater   appendr   
ModuleListblocks)rF   r\   num_base_blocks
num_blockscurr_block_numr  ir   r|   rv   rg   r   jr   r   r   blockr   r   s   `                @r1   r{   AlignVisionEncoder.__init__  sp   !'!9!9	D f001L63K3KLL
'A"6+=+=a+@AF#F,?,?,BCG^^A&F --a0K!//2L=)A)A!)DEFq&!e$%Ev!/7O7O!O"44~E
R	(!!#! +!-'##1
 e$!#' G (8 mmF+r0   r$   kwargsr?   c                 J    U R                    H  nU" U5      nM     [        US9$ N)r#   )r  r   )rF   r$   r  r  s       r1   r   AlignVisionEncoder.forward  s.    
 [[E!-0M ! .+
 	
r0   )r  r   )r&   r'   r(   r)   r*   r   r{   r+   r,   r   r   r   r   r/   r   r   s   @r1   r   r     sH    ),0 ),V

((

 +,

 
(	

 

r0   r   c                      ^  \ rS rSrSrU 4S jr    SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  4
S
 jjr
SrU =r$ )AlignTextEmbeddingsi  zGConstruct the embeddings from word, position and token_type embeddings.c                 
  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R#                  S[$        R&                  " UR                  5      R)                  S5      SS9  U R#                  S[$        R*                  " U R,                  R/                  5       [$        R0                  S9SS9  g )	N)padding_idxrx   position_idsr   F)
persistenttoken_type_ids)dtype)rz   r{   r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsr   hidden_dropout_probr   register_bufferr+   rS   r   r   r  sizelongr   s     r1   r{   AlignTextEmbeddings.__init__  s   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
r0   N	input_idsr  r  inputs_embedsr?   c                 @   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2S U24   nUcv  [        U S5      (       a-  U R                  S S 2S U24   nUR	                  US   U5      nUnO8[
        R                  " U[
        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n	XI-   n
U R                  U5      nX-  n
U R                  U
5      n
U R                  U
5      n
U
$ )Nr  r   r  r   )r  rO   )r)  r  hasattrr  r   r+   r   r*  rO   r   r$  r"  r%  r   )rF   r,  r  r  r-  input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr$  r   r"  s               r1   r   AlignTextEmbeddings.forward  s.     #..*K',,.s3K ^
,,Q^<L
 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
"66|D)
^^J/
\\*-
r0   )r%  r   r"  r$  r   )NNNN)r&   r'   r(   r)   r*   r{   r+   
LongTensorr,   r   r   r/   r   r   s   @r1   r  r    s    Q
$ .2260426&##d*& ((4/& &&-	&
 ((4/& 
& &r0   r  modulequerykeyvalueattention_maskscalingr   c                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr_   r   r  )r   r  )r   trainingr   )r+   matmul	transposer   rQ   softmaxfloat32tor  r   r=  
contiguous)
r6  r7  r8  r9  r:  r;  r   r  attn_weightsattn_outputs
             r1   eager_attention_forwardrF  5  s     <<}}Q':;gEL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r0   c                      ^  \ rS rSrU 4S jr S
S\R                  S\R                  S-  S\\	   S\
\R                  \R                  S-  4   4S jjrS	rU =r$ )AlignTextSelfAttentioniK  c                 6  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                   5      U l        UR                   U l        U R                  S-  U l        g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()g      )rz   r{   r  num_attention_headsr/  
ValueErrorr\   rc   attention_head_sizeall_head_sizer   Linearr7  r8  r9  r   attention_probs_dropout_probr   attention_dropoutr;  r   s     r1   r{   AlignTextSelfAttention.__init__L  sD    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF!'!D!D//5r0   Nr$   r:  r  r?   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n[        R                  " U R                  R                  [        5      n	U	" U UUUU4U R                  (       d  SOU R                  U R                  S.UD6u  pU
R                  " / UQSP76 R!                  5       n
X4$ )Nr  r   r_           )r   r;  )shaperN  r7  viewr?  r8  r9  r   get_interfacer\   _attn_implementationrF  r=  rR  r;  reshaperC  )rF   r$   r:  r  r0  hidden_shapequery_states
key_statesvalue_statesattention_interfacerE  rD  s               r1   r   AlignTextSelfAttention.forwarda  s8    $))#2.CCbC$*B*BCzz-055lCMMaQRSXXm,11,?II!QO
zz-055lCMMaQRS(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFH((r0   )
rO  rR  rN  r\   r   r8  rL  r7  r;  r9  r   )r&   r'   r(   r)   r{   r+   r   r,   r   r   r.   r   r/   r   r   s   @r1   rH  rH  K  si    60 48)||) ))D0) +,	)
 
u||U\\D00	1) )r0   rH  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )AlignTextSelfOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr  )rz   r{   r   rP  r  denser%  r&  r   r'  r   r   s     r1   r{   AlignTextSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r0   r$   input_tensorr?   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   re  r   r%  rF   r$   rg  s      r1   r   AlignTextSelfOutput.forward  5    

=1]3}'CDr0   r%  re  r   
r&   r'   r(   r)   r{   r+   r   r   r/   r   r   s   @r1   rb  rb    6    >U\\  RWR^R^  r0   rb  c            	          ^  \ rS rSrU 4S jr S
S\R                  S\R                  S-  S\\	   S\R                  4S jjr
S	rU =r$ )AlignTextAttentioni  c                 b   > [         TU ]  5         [        U5      U l        [	        U5      U l        g r   )rz   r{   rH  rF   rb  outputr   s     r1   r{   AlignTextAttention.__init__  s&    *62	)&1r0   Nr$   r:  r  r?   c                 Z    UnU R                   " U4SU0UD6u  pU R                  X5      nU$ Nr:  )rF   rs  )rF   r$   r:  r  residual_s         r1   r   AlignTextAttention.forward  sE     !99
)
 

 M<r0   )rs  rF   r   )r&   r'   r(   r)   r{   r+   r   r,   r   r   r   r/   r   r   s   @r1   rq  rq    sV    2 48|| ))D0 +,	
 
 r0   rq  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )AlignTextIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )rz   r{   r   rP  r  intermediate_sizere  rj   r   strr	   intermediate_act_fnr   s     r1   r{   AlignTextIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r0   r$   r?   c                 J    U R                  U5      nU R                  U5      nU$ r   re  r  r   s     r1   r   AlignTextIntermediate.forward  s&    

=100?r0   r  rn  r   s   @r1   r{  r{    s(    9U\\ ell  r0   r{  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )AlignTextOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g rd  )rz   r{   r   rP  r}  r  re  r%  r&  r   r'  r   r   s     r1   r{   AlignTextOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r0   r$   rg  r?   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   ri  rj  s      r1   r   AlignTextOutput.forward  rl  r0   rm  rn  r   s   @r1   r  r    ro  r0   r  c            	          ^  \ rS rSrU 4S jr SS\R                  S\R                  S-  S\\	   S\R                  4S jjr
S	 rS
rU =r$ )AlignTextLayeri  c                    > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        U5      U l	        g r   )
rz   r{   chunk_size_feed_forwardseq_len_dimrq  	attentionr{  intermediater  rs  r   s     r1   r{   AlignTextLayer.__init__  sI    '-'E'E$+F31&9%f-r0   Nr$   r:  r  r?   c                     U R                   " U4SU0UD6n[        U R                  U R                  U R                  U5      nU$ rv  )r  r   feed_forward_chunkr  r  )rF   r$   r:  r  s       r1   r   AlignTextLayer.forward  sW     
)
 
 2##T%A%A4CSCSUb
 r0   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r  rs  )rF   attention_outputintermediate_outputlayer_outputs       r1   r  !AlignTextLayer.feed_forward_chunk  s)    "//0@A{{#6Ir0   )r  r  r  rs  r  r   )r&   r'   r(   r)   r{   r+   r   r,   r   r   r   r  r/   r   r   s   @r1   r  r    s[    . 48|| ))D0 +,	
 
$ r0   r  c            	       |   ^  \ rS rSrU 4S jr S
S\R                  S\R                  S-  S\\	   S\
4S jjrS	rU =r$ )AlignTextEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
rz   r{   r\   r   r  r   num_hidden_layersr  layergradient_checkpointing)rF   r\   r	  r   s      r1   r{   AlignTextEncoder.__init__  sR    ]]E&JbJbDc#dDcqN6$:Dc#de
&+# $es   A&Nr$   r:  r  r?   c                 N    U R                    H  nU" UU40 UD6nM     [        US9$ r  )r  r   )rF   r$   r:  r  layer_modules        r1   r   AlignTextEncoder.forward  s>     !JJL( M ' +
 	
r0   )r\   r  r  r   )r&   r'   r(   r)   r{   r+   r   r,   r   r   r   r   r/   r   r   s   @r1   r  r    sR    , 48
||
 ))D0
 +,	

 

 
r0   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )AlignTextPooleri   c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )rz   r{   r   rP  r  re  Tanhr   r   s     r1   r{   AlignTextPooler.__init__  s9    YYv1163E3EF
'')r0   r$   r?   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )re  r   )rF   r$   first_token_tensorpooled_outputs       r1   r   AlignTextPooler.forward  s6     +1a40

#566r0   )r   re  rn  r   s   @r1   r  r     s(    $
U\\ ell  r0   r  c                   v    \ rS rSr% \\S'   SrSrSr\	R                  " 5       S\R                  4S j5       rSrg	)
AlignPreTrainedModeli  r\   align)imagetextTr6  c                 t   U R                   R                  n[        U[        R                  [        R
                  45      (       aO  [        R                  " UR                  SUS9  UR                  b   [        R                  " UR                  5        GO4[        U[        5      (       a  [        R                  " UR                  R                  5        [        R                  " UR                  R                  5        [        R                  " UR                  U R                   R                   5        O[        U[        R"                  5      (       av  [        R                  " UR                  SUS9  UR$                  bI  ['        UR                  SS5      (       d-  [        R                  " UR                  UR$                     5        [        U[        R(                  [        R*                  45      (       a  [        R                  " UR                  5        [        R,                  " UR                  5        ['        USS5      ba  [        R                  " UR.                  5        [        R,                  " UR0                  5        [        R                  " UR2                  5        gg[        U[4        5      (       a|  [        R6                  " UR8                  [:        R<                  " UR8                  R>                  S   5      RA                  S5      5        [        R                  " URB                  5        gg)	zInitialize the weightsrU  )meanstdN_is_hf_initializedFrunning_meanr  r  )"r\   initializer_rangerj   r   rP  r~   initnormal_weightrw   zeros_
AlignModelxavier_uniform_text_projection	constant_temperaturetemperature_init_valuer  r  rB   r%  r   ones_r  running_varnum_batches_trackedr  copy_r  r+   rS   rV  r   r  )rF   r6  r  s      r1   _init_weights"AlignPreTrainedModel._init_weights  s     kk++fryy"))455LLSc:{{&FKK(
++  !7!7!>!>?KK..334NN6--t{{/Q/QR--LLSc:!!-gfmmMach6i6iFMM&*<*<=>fr||R^^<==KK$JJv}}%v~t4@F//0

6--.F667 A  344JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 5r0   r%   N)r&   r'   r(   r)   r   r-   base_model_prefixinput_modalitiessupports_gradient_checkpointingr+   no_gradr   Moduler  r/   r%   r0   r1   r  r    s=    (&*#
]]_/BII / /r0   r  zJ
    The text model from ALIGN without any head or projection on top.
    c                   D  ^  \ rS rSr% \\S'   SrS/r\\	S.r
SS\S\4U 4S jjjrS rS	 r\\\     SS\R&                  S
-  S\R&                  S
-  S\R&                  S
-  S\R&                  S
-  S\R&                  S
-  S\\   S\\-  4S jj5       5       5       rSrU =r$ )AlignTextModeli3  r\   )r  r  )r$   r6   add_pooling_layerc                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OSU l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)
rz   r{   r\   r  r   r  encoderr  pooler	post_init)rF   r\   r  r   s      r1   r{   AlignTextModel.__init__A  sK    
 	 -f5'/1Bof- 	r0   c                 .    U R                   R                  $ r   r   r   rJ   s    r1   get_input_embeddings#AlignTextModel.get_input_embeddingsQ  s    ...r0   c                 $    XR                   l        g r   r  )rF   r9  s     r1   set_input_embeddings#AlignTextModel.set_input_embeddingsT  s    */'r0   Nr,  r:  r  r  r-  r  r?   c                    Ub  Ub  [        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       SS nO[        S5      eUu  pUb  UR                  OUR                  n
Uc  [        R
                  " X4U
S9nU R                  X'5      nU R                  UUUUS9nU R                  " U4SU0UD6nUS   nU R                  b  U R                  U5      OSn[        UUS	9$ )
a  
Examples:

```python
>>> from transformers import AutoTokenizer, AlignTextModel

>>> model = AlignTextModel.from_pretrained("kakaobrain/align-base")
>>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
```NzDYou cannot specify both input_ids and inputs_embeds at the same timer  z5You have to specify either input_ids or inputs_embedsrN   )r,  r  r  r-  r:  r   r#   pooler_output)rM  %warn_if_padding_and_no_attention_maskr)  rO   r+   onesget_extended_attention_maskr   r  r  r   )rF   r,  r:  r  r  r-  r  r0  
batch_sizer1  rO   extended_attention_maskembedding_outputencoder_outputssequence_outputr  s                   r1   r   AlignTextModel.forwardW  s-   6  ]%>cdd"66yQ#..*K&',,.s3KTUU!,
%.%:!!@T@T!"ZZ*)A6RN 150P0PQ_0m??%)'	 + 
 ,,
2
 

 *!,8<8OO4UY)-'
 	
r0   r\   r   r  r  TNNNNN)r&   r'   r(   r)   r   r-   r  _no_split_modulesr  rH  _can_record_outputsr   r{   r  r  r   r   r   r+   r   r   r   r.   r   r   r/   r   r   s   @r1   r  r  3  s      ./',
 4   /0   *..2.2,0-1=
<<$&=
 t+=
 t+	=

 llT)=
 ||d*=
 +,=
 
+	+=
    =
r0   r  zL
    The vision model from ALIGN without any head or projection on top.
    c                      ^  \ rS rSr% \\S'   SrSrSrSr	S/r
S\0rS\4U 4S	 jjr\\\ SS\R$                  S
-  S\\   S\\-  4S jj5       5       5       rSrU =r$ )AlignVisionModeli  r\   r   )r  Fr   r   r$   c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        UR                  S:X  a%  [        R                  " UR                  SS9U l        OMUR                  S:X  a%  [        R                  " UR                  SS9U l        O[        SUR                   35      eU R                  5         g )Nr  T)	ceil_moderb   z2config.pooling must be one of ['mean', 'max'] got )rz   r{   r\   rn   r   r   r  pooling_typer   	AvgPool2d
hidden_dimr  	MaxPool2drM  poolingr  r   s     r1   r{   AlignVisionModel.__init__  s     /7)&1 &(,,v'8'8DIDK  E),,v'8'8DIDKQRXR`R`Qabcc 	r0   Nr  r?   c                     Uc  [        S5      eU R                  U5      nU R                  " U40 UD6nUS   nU R                  U5      nUR	                  UR
                  SS 5      n[        UUS9$ )a  
Examples:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, AlignVisionModel

>>> model = AlignVisionModel.from_pretrained("kakaobrain/align-base")
>>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled CLS states
```Nz You have to specify pixel_valuesr   r_   r  )rM  r   r  r  rZ  rV  r   )rF   r   r  r  r  r#   r  s          r1   r   AlignVisionModel.forward  s    < ?@@??<8,,

 ,A.$56%--m.A.A"1.EF7/'
 	
r0   r  r   )r&   r'   r(   r)   r   r-   main_input_namer  r  _input_embed_layerr  r   r  r{   r   r   r   r+   r,   r   r   r.   r   r   r/   r   r   s   @r1   r  r    s     $O!&+#&+,)0 "   26*
''$.*
 +,*
 
9	9	*
    *
r0   r  c                   ^  ^  \ rS rSr% \\S'   S\4U 4S jjr\\     SS\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S	\	R                  S-  S
\\   S\\-  4S jj5       5       r\\S\	R                   S
\\   S\\-  4S j5       5       r\\       SS\	R$                  S-  S\	R                   S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S	\	R                  S-  S\S-  S
\\   S\\-  4S jj5       5       rSrU =r$ )r  i  r\   c                   > [         TU ]  U5        [        UR                  [        5      (       d"  [        S[        UR                  5       S35      e[        UR                  [        5      (       d"  [        S[        UR                  5       S35      eUR                  nUR                  nUR                  U l	        UR                  U l        [        U5      U l        [        U5      U l        [         R"                  " U R                  U R                  5      U l        [         R&                  " [(        R*                  " U R,                  R.                  5      5      U l        U R3                  5         g )NzLconfig.text_config is expected to be of type AlignTextConfig but is of type .zPconfig.vision_config is expected to be of type AlignVisionConfig but is of type )rz   r{   rj   text_configr   	TypeErrortypevision_configr   projection_dimr  text_embed_dimr  
text_modelr  vision_modelr   rP  r  	Parameterr+   tensorr\   r  r  r  )rF   r\   r  r  r   s       r1   r{   AlignModel.__init__  s)    &,,o>>++,-Q0 
 &..0ABB--./q2 
 ((,,$33)55(5,];!yy)<)<d>Q>QR<<T[[5W5W(XY 	r0   Nr,  r:  r  r  r-  r  r?   c           	          U R                   " SUUUUUS.UD6nUS   SS2SSS24   nU R                  U5      Ul        U$ )a  
Examples:

```python
>>> import torch
>>> from transformers import AutoTokenizer, AlignModel

>>> model = AlignModel.from_pretrained("kakaobrain/align-base")
>>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> with torch.inference_mode():
...     text_features = model.get_text_features(**inputs)
```r,  r:  r  r  r-  r   Nr%   )r  r  r  )	rF   r,  r:  r  r  r-  r  text_outputsr#   s	            r1   get_text_featuresAlignModel.get_text_features  sa    2 48?? 4
))%'4
 4
 )OAq!G4%)%9%9:K%L"r0   r   c                 *    U R                   " SSU0UD6$ )a  
Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, AlignModel
>>> from transformers.image_utils import load_image

>>> model = AlignModel.from_pretrained("kakaobrain/align-base")
>>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> inputs = processor(images=image, return_tensors="pt")
>>> with torch.inference_mode():
...     image_features = model.get_image_features(**inputs)
```r   r%   )r  )rF   r   r  s      r1   get_image_featuresAlignModel.get_image_features3  s    .   ElEfEEr0   return_lossc           
         U R                   " SSU0UD6n	U R                  " SUUUUUS.UD6n
U	S   nU
S   SS2SSS24   nU R                  U5      nXR                  SSSS	9-  nXR                  SSSS	9-  n[        R
                  " XR                  5       5      U R                  -  nUR                  5       nSnU(       a  [        U5      n[        UUUUUU
U	S
9$ )a^  
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.

Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, AlignModel
>>> from transformers.image_utils import load_image

>>> model = AlignModel.from_pretrained("kakaobrain/align-base")
>>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> inputs = processor(
...     images=image, text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True
... )

>>> with torch.inference_mode():
...     outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```r   r
  r   r   Nr_   r  T)r   r   keepdim)r:   r;   r<   r5   r"   r=   r>   r%   )
r  r  r  normr+   r>  rX   r  r[   r8   )rF   r,  r   r:  r  r  r-  r  r  vision_outputsr  r"   r5   r<   r;   r:   s                   r1   r   AlignModel.forwardL  s   N ** 
%


  
))%'
 
 &a("1oaAg.**;7 $&7&7!T&7&RR!$4$4qb$$4$OO  ,,{NN4DEHXHXX*,,.o.D-+#%* .
 	
r0   )r  r  r  r  r  r  r  )NNNNNNN)r&   r'   r(   r)   r   r-   r{   r   r   r+   r   r   r   r.   r   r  r,   r  r5  r   r8   r   r/   r   r   s   @r1   r  r    s   { <  *..2.2,0-1"<<$&" t+" t+	"
 llT)" ||d*" +," 
+	+"  "H F!--F9?@R9SF	+	+F  F.  .215.2.2,0-1#'K
##d*K
 ''$.K
 t+	K

 t+K
 llT)K
 ||d*K
 D[K
 +,K
 
	K
  K
r0   r  )r  r  r  r  r  )rU  )Sr*   r   collections.abcr   dataclassesr   typingr   r+   r    r   r  activationsr	   modeling_layersr
   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_alignr   r   r   
get_loggerr&   loggerr    r3   r8   r   rU   r[   rc   rf   r.   r   rl   r  rn   r~   r   r   r   r   r   r   r   r  r   rF  rH  rb  rq  r{  r  r  r  r  r  r  r  r  __all__r%   r0   r1   <module>r(     s\     $ !    & ! 9  G & 6 M M I 5 P P 
		H	% 
 :[ : : 
 	7; 	7 	7  
+  
   
JuU\\ uell u-5<< -ELL -+ 3  @S5[ @$ @*BII 4
 
6		 6$		 $P$BII $N BNryy Nb>
 >
B9")) 9F %II%<<% 
% <<	%
 LL4'% % %,3)RYY 3)n"))  .BII  bii / B
ryy 
4bii   /?  /  /F 
_
) _

_
D 
I
+ I

I
X m
% m
 m
` Wr0   