
    Z j(                       S r SSKJr  SSKJr  SSKJr  SSKrSSKJr  SSK	J
r
  SS	KJr  SS
KJrJr  SSKJrJrJr  SSKJrJr  SSKJr  SSKJrJrJrJrJr  SSK J!r!J"r"  SSK#J$r$  SSK%J&r&  SSK'J(r(J)r)J*r*J+r+  SSK,J-r-J.r.  SSK/J0r0  SSK1J2r2J3r3J4r4  \*Rj                  " \65      r7Sr8\)" SS9\ " S S\5      5       5       r9\)" SS9\ " S S\5      5       5       r: " S S \Rv                  5      r< " S! S"\Rv                  5      r= " S# S$\Rv                  5      r> " S% S&\Rv                  5      r? " S' S(\Rv                  5      r@ " S) S*\Rv                  5      rA " S+ S,\Rv                  5      rB " S- S.\Rv                  5      rC " S/ S0\Rv                  5      rD  SdS1\Rv                  S2\R                  S3\R                  S4\R                  S5\R                  S-  S6\FS-  S7\FS8\$\(   4S9 jjrG " S: S;\Rv                  5      rH " S< S=\Rv                  5      rI " S> S?\Rv                  5      rJ " S@ SA\Rv                  5      rK " SB SC\5      rL " SD SE\Rv                  5      rM " SF SG\Rv                  5      rN\) " SH SI\"5      5       rO " SJ SK\O5      rP\)" SLS9 " SM SN\O5      5       rQ\)" SOS9 " SP SQ\O5      5       rR " SR SS\Rv                  5      rS " ST SU\Rv                  5      rT " SV SW\Rv                  5      rU\)" SXS9 " SY SZ\O5      5       rV\)" S[S9 " S\ S]\O5      5       rW " S^ S_\Rv                  5      rX\)" S`S9 " Sa Sb\O5      5       rY/ ScQrZg)ezPyTorch BridgeTower Model    )OrderedDict)Callable)	dataclassN)nn)CrossEntropyLoss   )initialization)ACT2FNQuickGELUActivation)CacheDynamicCacheEncoderDecoderCache)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentionsMaskedLMOutputModelOutputSequenceClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)TransformersKwargsauto_docstringlogging	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )BridgeTowerConfigBridgeTowerTextConfigBridgeTowerVisionConfigRobertaTokenizerz.
    Output type of [`BridgeTowerModel`].
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   S	rg)
BridgeTowerModelOutput2   a  
text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_size)`):
    Sequence of hidden-states at the text output of the last layer of the model.
image_features (`torch.FloatTensor` of shape `(batch_size, image_sequence_length, hidden_size)`):
    Sequence of hidden-states at the image output of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size x 2)`):
    Concatenation of last layer hidden-state of the first token of the text and image sequence (classification
    token), respectively, after further processing through layers used for auxiliary pretraining tasks.
Ntext_featuresimage_featurespooler_outputhidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r+   torchFloatTensor__annotations__r,   r-   r.   tupler/   __static_attributes__r0       څ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/bridgetower/modeling_bridgetower.pyr)   r)   2   s|     /3M5$$t+2/3NE%%,3.2M5$$t+259M5**+d2926Je''(4/6r;   r)   z>
    Output type of ['BridgeTowerForContrastiveLearning']
    c                   P   \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S	'   Sr\\R                     S-  \S
'   Srg)BridgeTowerContrastiveOutputJ   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Image-text contrastive loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
text_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
    The text embeddings obtained by applying the projection layer to the pooler_output.
image_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
    The image embeddings obtained by applying the projection layer to the pooler_output.
cross_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
    The text-image cross-modal embeddings obtained by applying the projection layer to the pooler_output.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.
Nlosslogitstext_embedsimage_embedscross_embedsr.   r/   r0   )r1   r2   r3   r4   r5   r@   r6   r7   r8   rA   rB   r9   rC   rD   r.   r/   r:   r0   r;   r<   r>   r>   J   s      &*D%

d
")'+FE$+37Ku(()D0748L%))*T1848L%))*T1859M5**+d2926Je''(4/6r;   r>   c                      ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrS	S\R                  S\R                  S-  4S jjrSr	U =r
$ )
BridgeTowerResidualAttentionj   c                 h  > [         TU ]  5         [        R                  " UR                  UR                  S-  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " [        S[        R                  " UR                  UR                  S-  5      4S[        5       4S[        R                  " UR                  S-  UR                  5      4/5      5      U l        [        R                  " UR                  UR                  S9U l        S U l        g )N@   epsc_fc   geluc_proj)super__init__r   MultiheadAttentionhidden_sizeattn	LayerNormlayer_norm_epsln_1
ModuleDictr   Linearr   mlpln_2	attn_maskselfconfig	__class__s     r<   rQ   %BridgeTowerResidualAttention.__init__k   s    ))&*<*<f>P>PTV>VW	LL!3!39N9NO	==RYYv'9'96;M;MPQ;QRS023ryy););a)?ASASTU
 LL!3!39N9NO	r;   hidden_stateattention_maskc           	         Ub(  UR                  [        R                  UR                  S9nU R                  b.  U R                  R                  UR
                  UR                  S9OS U l        U R                  UUUSU R                  US9S   $ )NdtypedeviceF)need_weightsr\   key_padding_maskr   )tor6   boolrg   r\   rf   rT   )r^   rb   rc   s      r<   	attention&BridgeTowerResidualAttention.attention|   s    %+..UZZH[H[.\N ~~) NNL$6$6|?R?RS 	
 yynn+  
  	r;   Nc                     XR                  U R                  U5      U5      -   nU R                  U5      nU R                  R	                  5        H  nU" U5      nM     X1-   nU$ N)rl   rW   r[   rZ   values)r^   rb   rc   residual_statelayers        r<   forward$BridgeTowerResidualAttention.forward   sZ    %tyy7NP^(__yy0XX__&E .L '%4r;   )rT   r\   rW   r[   rZ   ro   )r1   r2   r3   r4   rQ   r6   Tensorrl   rs   r:   __classcell__r`   s   @r<   rF   rF   j   sI    "ell ELL "ELL %,,QUBU  r;   rF   c                   l   ^  \ rS rSrU 4S jrSS\R                  S\R                  S-  4S jjrSrU =r	$ )	BridgeTowerTransformer   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  (       aL  [
        R                  " [        U R                  S-
  5       Vs/ s H  n[        U5      PM     sn5      U l	        OH[
        R                  " [        U R                  5       Vs/ s H  n[        U5      PM     sn5      U l	        UR                  U l
        g s  snf s  snf )Nr"   )rP   rQ   rS   num_hidden_layersremove_last_layerr   
ModuleListrangerF   	resblocksstop_gradientr^   r_   _r`   s      r<   rQ   BridgeTowerTransformer.__init__   s    !--!'!9!9##]]?DTE[E[^_E_?`a?`!-f5?`aDN  ]]?DTE[E[?\]?\!-f5?\]DN $11 b ^s   -C)6C.Nrb   rc   c                     / nU R                    HN  nU" X5      nU R                  (       a!  UR                  UR                  5       5        M=  UR                  U5        MP     U$ ro   )r   r   appenddetach)r^   rb   rc   r.   blocks        r<   rs   BridgeTowerTransformer.forward   sU    ^^E >L!!$$\%8%8%:;$$\2 $ r;   )rS   r|   r   r   ro   
r1   r2   r3   r4   rQ   r6   ru   rs   r:   rv   rw   s   @r<   ry   ry      s.    2ELL %,,QUBU  r;   ry   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S\R                  4S
 jjrSrU =r$ )BridgeTowerVisionEmbeddings   r_   c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " U R                  5      5      U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R"                  " U R                   U R                  5      U l        U R'                  S[        R(                  " U R                   5      R+                  S5      SS9  g )NF)in_channelsout_channelskernel_sizestridebias   r"   position_idsr"   
persistent)rP   rQ   r_   rS   	embed_dim
image_size
patch_sizer   	Parameterr6   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferarangeexpandr]   s     r<   rQ   $BridgeTowerVisionEmbeddings.__init__   s   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr;   
embeddingsheightwidthreturnc                    UR                   S   S-
  nU R                  R                  R                  S5      nUR                   S   S-
  n[        R
                  R                  5       (       d%  XF:X  a   X#:X  a  U R                  U R                  5      $ USS2SS24   nUSS2SS24   nUR                   S   n	X R                  -  n
X0R                  -  n[        US-  5      nUR                  SXU	5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU	5      n[        R                   " Xx4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r"   r   Nr         ?r   r   bicubicF)sizemodealign_cornersdim)shaper   weight	unsqueezer6   jit
is_tracingr   r   r   reshapepermuter   
functionalinterpolateviewcat)r^   r   r   r   r   r   r   class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r<   interpolate_pos_encoding4BridgeTowerVisionEmbeddings.interpolate_pos_encoding   si    !&&q)A-!44;;EEaH*003a7 yy##%%+*F6?**4+<+<==,QU3,QU3r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr;   pixel_valuesc                 ^   UR                   u  p4pVU(       dJ  XPR                  :w  d  X`R                  :w  a,  [        SU SU SU R                   SU R                   S3	5      eU R                  R                  R
                  nU R                  UR                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      n	[        R                  " X/SS	9n
U(       a  XR                  XU5      -   n
U
$ XR                  U R                  5      -   n
U
$ )
NzInput image size (*z) doesn't match model (z).rf   r   r"   r   r   )r   r   
ValueErrorr   r   rf   rj   flatten	transposer   r   r6   r   r   r   r   )r^   r   r   
batch_sizer   r   r   target_dtypepatch_embedsclass_embedsr   s              r<   rs   #BridgeTowerVisionEmbeddings.forward   s$   '3'9'9$
v'V-F%SbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYY;C
##&C&CJX]&^^J  $&=&=d>O>O&PPJr;   )	r   r_   r   r   r   r   r   r   r   F)r1   r2   r3   r4   r%   rQ   r6   ru   intr   r7   rs   r:   rv   rw   s   @r<   r   r      sj    q6 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf  r;   r   c                      ^  \ rS rSrU 4S jr S
S\R                  S\4S jjr S
S\R                  S\4S jjr	S\R                  4S jr
S	rU =r$ )BridgeTowerVisionTransformeri  c           
      6  > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  S9U l        [        U5      U l
        [        R
                  " UR                  UR                  S9U l        UR                  U l        UR                  (       dg  [        R                  " [        UR                  5       Vs/ s H,  n[        R
                  " UR                  UR                  S9PM.     sn5      U l        g g s  snf NrJ   )rP   rQ   r   r   r   rU   rS   rV   ln_prery   transformerln_postshare_layernormr~   r   r|   ln_separater   s      r<   rQ   %BridgeTowerVisionTransformer.__init__  s    5f=ll6#5#56;P;PQ1&9||F$6$6F<Q<QR%55%%!}}V[\b\t\tVuvVuQRf00f6K6KLVuv D &vs   3Dr   r   c                    U R                  X5      nU R                  U5      nUR                  SSS5      nU R                  XB5      n[        R
                  " USS9nUR                  SSSS5      nU R                  (       a  U R                  U5      nU$ / n[        X@R                  5       H  u  pFU" U5      nUR                  U5        M      [        R
                  " USS9nU$ )Nr"   r   r   r   r   )r   r   r   r   r6   stackr   r   zipr   r   )r^   r   rc   r   r.   hidden_states_stacklns          r<   rs   $BridgeTowerVisionTransformer.forward  s     OM2%--aA6((GMq9%--aAq9 LL7M  #%%(8H8H%I! "= 1#**=9 &J "KK(;CMr;   c                 l    U R                  XS9nU R                  U5      nUR                  SSS5      nU$ )Nr   r"   r   r   )r   r   r   )r^   r   r   r.   s       r<   forward_pre(BridgeTowerVisionTransformer.forward_pre-  s<    
 hM2%--aA6r;   rb   c                 N    UR                  SSS5      nU R                  U5      nU$ )Nr"   r   r   )r   r   )r^   rb   visual_output_posts      r<   forward_post)BridgeTowerVisionTransformer.forward_post8  s-    )11!Q:!\\*<=!!r;   )r   r   r   r   r   r   r   )r1   r2   r3   r4   rQ   r6   ru   rk   rs   r   r   r:   rv   rw   s   @r<   r   r     s]    " */	ll #'	< */	ll	 #'	" " "r;   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BridgeTowerLinkToweri>  c                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  S;   a  UR                  S:X  a0  [        R
                  " [        R                  " S5      5      U l        O?UR                  S:X  a/  [        R
                  " [        R                  " S5      5      U l	        [        R                  " U R                  UR                  S9U l
        g [        SUR                   S35      e)	N)add
scaled_addr   r   g      ?r   r   rJ   link_tower_type  is not implemented)rP   rQ   link_tower_typerS   r   r   r6   tensorscaled_factorbetarU   rV   NotImplementedErrorr]   s     r<   rQ   BridgeTowerLinkTower.__init__?  s    %55!--!!%II%%5%'\\%,,s2C%D"''=8LLc):;	\\$*:*:@U@UVDN%(89O9O8PPc&deer;   c                 Z   U R                   S:X  a  U R                  X-   5      $ U R                   S:X  a   U R                  XR                  -  U-   5      $ U R                   S:X  a0  U R                  USU R                  -
  -  X R                  -  -   5      $ [	        SU R                    S35      e)Nr   r   r   r"   r   r   )r   rU   r   r   r   )r^   r.   cross_modal_hidden_statesrc   s       r<   rs   BridgeTowerLinkTower.forwardL  s    5(>>-"KLL!!\1>>-2D2D"DG`"`aa!!]2>>-1tyy="AD]`i`iDi"ijj%(89M9M8NNa&bccr;   )rU   r   rS   r   r   r1   r2   r3   r4   rQ   rs   r:   rv   rw   s   @r<   r   r   >  s    fd dr;   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )BridgeTowerSelfOutputiX  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g r   )rP   rQ   r   rY   rS   denserU   rV   Dropouthidden_dropout_probdropoutr]   s     r<   rQ   BridgeTowerSelfOutput.__init__Y  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r;   r.   input_tensorr   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ ro   r  r  rU   r^   r.   r  s      r<   rs   BridgeTowerSelfOutput.forward_  5    

=1]3}'CDr;   rU   r  r  r   rw   s   @r<   r  r  X  6    >U\\  RWR^R^  r;   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BridgeTowerIntermediateig  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g ro   )rP   rQ   r   rY   rS   intermediate_sizer  
isinstance
hidden_actstrr
   intermediate_act_fnr]   s     r<   rQ    BridgeTowerIntermediate.__init__h  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r;   r.   r   c                 J    U R                  U5      nU R                  U5      nU$ ro   r  r  r^   r.   s     r<   rs   BridgeTowerIntermediate.forwardp  s&    

=100?r;   r  r   rw   s   @r<   r  r  g  s(    9U\\ ell  r;   r  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )BridgeTowerOutputiw  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )rP   rQ   r   rY   r  rS   r  rU   rV   r  r  r  r]   s     r<   rQ   BridgeTowerOutput.__init__x  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r;   r.   r  r   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ ro   r
  r  s      r<   rs   BridgeTowerOutput.forward~  r  r;   r  r   rw   s   @r<   r  r  w  r  r;   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BridgeTowerPooleri  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g ro   )rP   rQ   r   rY   rS   r  Tanh
activationr]   s     r<   rQ   BridgeTowerPooler.__init__  s9    YYv1163E3EF
'')r;   r.   r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r  r'  )r^   r.   first_token_tensorpooled_outputs       r<   rs   BridgeTowerPooler.forward  s6     +1a40

#566r;   )r'  r  r   rw   s   @r<   r$  r$    s(    $
U\\ ell  r;   r$  modulequerykeyvaluerc   scalingr  kwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr         r   r   r   )ptrainingr"   )
r   r6   matmulr   r   r   softmaxr  r6  
contiguous)
r-  r.  r/  r0  rc   r1  r  r2  attn_weightsattn_outputs
             r<   eager_attention_forwardr<    s     **R.D( <<}}Q':;gEL!#4==((2(>L==((6??([L,,|3K''1-88:K$$r;   c                      ^  \ rS rSrSU 4S jjr  SS\R                  S\R                  S-  S\S-  S\	\
   S\\R                     4
S	 jjrS
rU =r$ )BridgeTowerSelfAttentioni  Nc                 N  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        U R                  S-  U l
        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                   " UR"                  5      U l        UR&                  U l        X l        X0l        g Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r4  )rP   rQ   rS   num_attention_headshasattrr   r_   r   attention_head_sizeall_head_sizer1  r   rY   r.  r/  r0  r  attention_probs_dropout_probr  
is_decoder	is_causal	layer_idxr^   r_   rI  rJ  r`   s       r<   rQ   !BridgeTowerSelfAttention.__init__  sM    : ::a?PVXhHiHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF ++""r;   r.   rc   past_key_valuesr2  r   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  " U6 R	                  SS5      nU R                  U5      R                  " U6 R	                  SS5      nU R                  U5      R                  " U6 R	                  SS5      n	UbA  Un
[        U[        5      (       a  UR                  n
U
R                  XU R                  5      u  p[        R                  " U R                  R                  [         5      nU" U UUU	U4U R"                  (       d  SOU R$                  R&                  U R(                  S.UD6u  pUR*                  " / UQSP76 R-                  5       nX4$ )Nr   r"   r           r  r1  )r   rE  r.  r   r   r/  r0  r  r   self_attention_cacheupdaterJ  r   get_interfacer_   _attn_implementationr<  r6  r  r5  r1  r   r9  )r^   r.   rc   rM  r2  input_shapehidden_shapequery_layer	key_layervalue_layercurrent_past_key_valuesattention_interfacer;  r:  s                 r<   rs    BridgeTowerSelfAttention.forward  s    $))#2.CCbC$*B*BC jj/44lCMMaQRSHH]+00,?II!QO	jj/44lCMMaQRS&&5#/+>??*9*N*N' &=%C%CI\`\j\j%k"I(?(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
! "));;;;FFH((r;   )rF  rE  r_   r  rI  rH  r/  rJ  rC  r.  r1  r0  FN)NNr1   r2   r3   r4   rQ   r6   ru   r7   r   r   r   r9   rs   r:   rv   rw   s   @r<   r>  r>    sl    #6 48(,	')||') ))D0') 	')
 +,') 
u||	') ')r;   r>  c                      ^  \ rS rSrSU 4S jjr   SS\R                  S\R                  S-  S\R                  S-  S\S-  S\	\
   S	\\R                     4S
 jjrSrU =r$ )BridgeTowerCrossAttentioni  Nc                 ,  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        U R                  S-  U l
        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                   " UR"                  5      U l        X l        X0l        g r@  )rP   rQ   rS   rC  rD  r   r_   r   rE  rF  r1  r   rY   r.  r/  r0  r  rG  r  rI  rJ  rK  s       r<   rQ   "BridgeTowerCrossAttention.__init__  s@    : ::a?PVXhHiHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF""r;   r.   encoder_hidden_statesrc   rM  r2  r   c                 z   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nUb%  UR
                  R                  U R                  5      OSn	Ubb  U	(       a[  UR                  R                  U R                     R                  n
UR                  R                  U R                     R                  nO/ UR                   S S QSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUbA  UR                  R                  XU R                  5      u  pSUR
                  U R                  '   [        R                   " U R"                  R$                  [&        5      nU" U UU
UU4U R(                  (       d  SOU R*                  R,                  U R.                  S.UD6u  pUR0                  " / UQSP76 R3                  5       nX4$ )Nr   r"   r   FTrO  rP  )r   rE  r.  r   r   
is_updatedgetrJ  cross_attention_cachelayerskeysrp   r/  r0  rR  r   rS  r_   rT  r<  r6  r  r5  r1  r   r9  )r^   r.   rc  rc   rM  r2  rU  rV  rW  re  rX  rY  kv_shaper[  r;  r:  s                   r<   rs   !BridgeTowerCrossAttention.forward  s    $))#2.CCbC$*B*BC jj/44\BLLQPQRGVGb_//33DNNChm
&:'==DDT^^TYYI)??FFt~~V]]KX.44Sb9X2Xt?W?WXH!67<<XFPPQRTUVI**%:;@@JTTUVXYZK*)8)N)N)U)UDNN*&	 >B**4>>:(?(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
! "));;;;FFH((r;   )rF  rE  r_   r  rI  r/  rJ  rC  r.  r1  r0  r]  NNN)r1   r2   r3   r4   rQ   r6   ru   r7   r   r   r   r9   rs   r:   rv   rw   s   @r<   r`  r`    s    #4 ;?376:1)||1)  %00471) ))D0	1)
 -t31) +,1) 
u||	1) 1)r;   r`  c                      ^  \ rS rSrSU 4S jjr    SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\	\
   S
\\R                     4S jjrSrU =r$ )BridgeTowerAttentioniD  Nc                    > [         TU ]  5         X@l        U(       a  [        O[        nU" XUS9U l        [        U5      U l        g )NrI  rJ  )rP   rQ   is_cross_attentionr`  r>  r^   r  output)r^   r_   rI  rJ  rq  attention_classr`   s         r<   rQ   BridgeTowerAttention.__init__E  s9    "47I3Og#F9U	+F3r;   r.   rc   rc  encoder_attention_maskrM  r2  r   c                     U R                   (       d  UOUnU R                  " U4UUUS.UD6u  pxU R                  Xq5      nXx4$ )N)rc  rc   rM  )rq  r^   rr  )	r^   r.   rc   rc  ru  rM  r2  attention_outputr:  s	            r<   rs   BridgeTowerAttention.forwardL  s\     04/F/FLb)-*
"7)+	*

 *
&  ;;'7G--r;   )rq  rr  r^   )FNFNNNNr^  rw   s   @r<   rn  rn  D  s    4 48:>;?(,.||. ))D0.  %0047	.
 !& 1 1D 8. . +,. 
u||	. .r;   rn  c                   P   ^  \ rS rSrSU 4S jjr   SS\\   4S jjrS rSr	U =r
$ )	BridgeTowerBertCrossLayeria  c                   > [         TU ]  5         UR                  U l        SU l        [	        USUS9U l        UR                  U l        UR                  U l        [	        USUSS9U l        [        U5      U l
        [        U5      U l        g )Nr"   Trp  FrI  rJ  rq  )rP   rQ   chunk_size_feed_forwardseq_len_dimrn  rl   rH  add_cross_attentioncrossattentionr  intermediater  rr  r^   r_   rJ  r`   s      r<   rQ   "BridgeTowerBertCrossLayer.__init__b  s    '-'E'E$-fPYZ ++#)#=#= 2#	
 4F;'/r;   r2  c                     U R                   " U4US S.UD6u  pxUn	U R                  " U	4UUUUS.UD6u  pU
n	[        U R                  U R                  U R
                  U	5      nUUU4$ )N)rc   rM  )rc   rc  ru  rM  )rl   r  r   feed_forward_chunkr~  r  )r^   r.   rc  rc   ru  rM  r2  self_attention_outputself_attn_weightsrw  cross_attention_outputcross_attn_weightslayer_outputs                r<   rs   !BridgeTowerBertCrossLayer.forwardr  s     48>>4
) 4
 	4
0 1595H5H6
)"7#9+6
 6
2 20##T%A%A4CSCSUe
 
 	
r;   c                 J    U R                  U5      nU R                  X!5      nU$ ro   r  rr  r^   rw  intermediate_outputr  s       r<   r  ,BridgeTowerBertCrossLayer.feed_forward_chunk  )    "//0@A{{#6Ir;   r  rl   r~  r  r  rH  rr  r  ro   rl  )r1   r2   r3   r4   rQ   r   r   rs   r  r:   rv   rw   s   @r<   r{  r{  a  s3    0( #"
 +,"
H r;   r{  c                      ^  \ rS rSrSU 4S jjr    SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\	\
   S
\R                  4S jjrS rSrU =r$ )BridgeTowerTextLayeri  Nc                   > [         TU ]  5         UR                  U l        SU l        [	        XR
                  US9U l        UR
                  U l        UR                  U l        U R                  (       a0  U R
                  (       d  [        U  S35      e[	        USUSS9U l	        [        U5      U l        [        U5      U l        g )Nr"   rp  z> should be used as a decoder model if cross attention is addedFTr}  )rP   rQ   r~  r  rn  rH  rl   r  r   r  r  r  r  rr  r  s      r<   rQ   BridgeTowerTextLayer.__init__  s    '-'E'E$-f@Q@Q]fg ++#)#=#= ##?? D6)g!hii"6##'	#D 4F;'/r;   r.   rc   rc  ru  rM  r2  r   c                 2   U R                   " UU4SU0UD6u  pxUn	U R                  (       a?  Ub<  [        U S5      (       d  [        SU  S35      eU R                  " US UU4SU0UD6u  pU
n	[        U R                  U R                  U R                  U	5      nU$ )NrM  r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)	rl   rH  rD  r   r  r   r  r~  r  )r^   r.   rc   rc  ru  rM  r2  r  r   rw  r  r  s               r<   rs   BridgeTowerTextLayer.forward  s     $(>>$
 ,$
 	$
  1??4@4!122 =dV DD D 
 )-(;(;%%&	)
 !0) )%"  60##T%A%A4CSCSUe
 r;   c                 J    U R                  U5      nU R                  X!5      nU$ ro   r  r  s       r<   r  'BridgeTowerTextLayer.feed_forward_chunk  r  r;   r  ro   ry  )r1   r2   r3   r4   rQ   r6   ru   r7   r   r   r   rs   r  r:   rv   rw   s   @r<   r  r    s    0. 48:>;?(,%||% ))D0%  %0047	%
 !& 1 1D 8% % +,% 
%N r;   r  c                      ^  \ rS rSrU 4S jr     SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\	S-  S
\
\   S\4S jjrSrU =r$ )BridgeTowerTextEncoderi  c           
         > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        g s  snf )NrJ  )	rP   rQ   r_   r   r~   r   r|   r  rr   )r^   r_   ir`   s      r<   rQ   BridgeTowerTextEncoder.__init__  sK    ]]@EfF^F^@_`@_1!&6@_`

`s   ANr.   rc   rc  ru  rM  	use_cacher2  r   c                 n    U R                    H  nU" UUU4UUS.UD6nM     [        UU(       a  US9$ S S9$ )N)ru  rM  )last_hidden_staterM  )rr   r   )	r^   r.   rc   rc  ru  rM  r  r2  layer_modules	            r<   rs   BridgeTowerTextEncoder.forward  s`     !JJL(% (> / M ' 9+/8O
 	
>B
 	
r;   )r_   rr   )NNNNN)r1   r2   r3   r4   rQ   r6   ru   r7   r   rk   r   r   r   rs   r:   rv   rw   s   @r<   r  r    s    
 48:>;?(,!%
||
 ))D0
  %0047	

 !& 1 1D 8
 
 $;
 +,
 
3
 
r;   r  c                      ^  \ rS rSrSrU 4S jr     SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\	S
\R                  4S jjr\S 5       r\SS j5       rSrU =r$ )BridgeTowerTextEmbeddingsi  zGConstruct the embeddings from word, position and token_type embeddings.c                 >  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR
                  UR                  S9U l
        [        R                  " UR                  5      U l        U R                  S[         R"                  " UR$                  5      R'                  S5      SS9  U R                  S[         R(                  " U R*                  R-                  5       [         R.                  S9SS9  UR                  U l        [        R                  " UR$                  UR
                  U R0                  S9U l        g )	N)padding_idxrJ   r   r   Fr   token_type_idsr   )rP   rQ   r   r   
vocab_sizerS   pad_token_idword_embeddingstype_vocab_sizetoken_type_embeddingsrU   rV   r  r  r  r   r6   r   max_position_embeddingsr   zerosr   r   longr  position_embeddingsr]   s     r<   rQ   "BridgeTowerTextEmbeddings.__init__  s4   !||F,=,=v?Q?Q_e_r_rs%'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
 "..#%<<**F,>,>DL\L\$
 r;   N	input_idsr  r   inputs_embedspast_key_values_lengthr   c                    Uc;  Ub  U R                  XR                  U5      nOU R                  X@R                  5      nUb  UR                  5       nOUR                  5       S S nUu  pxUc  [	        U S5      (       aQ  U R
                  R                  UR                  S   S5      n	[        R                  " U	SUS9n	U	R                  Xx5      nO8[        R                  " U[        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n
XJ-   nU R!                  U5      nX-   nU R#                  U5      nU R%                  U5      nU$ )Nr   r  r   r"   )r   indexre   )"create_position_ids_from_input_idsr  &create_position_ids_from_inputs_embedsr   rD  r  r   r   r6   gatherr  r  r   rg   r  r  r  rU   r  )r^   r  r  r   r  r  rU  r   
seq_lengthbuffered_token_type_idsr  r   r  s                r<   rs   !BridgeTowerTextEmbeddings.forward  sb    $#FF//1G   $JJ=ZjZjk #..*K',,.s3K!,

 !t-..*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
!W!&[

SWSdSdSkSk!l  00;M $ : :> J":
"66|D5
^^J/
\\*-
r;   c                     U R                  5       SS nUS   n[        R                  " US-   X1-   S-   [        R                  U R                  S9nUR                  S5      R                  U5      $ )z
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
Nr   r"   re   r   )r   r6   r   r  rg   r   r   )r  r  rU  sequence_lengthr   s        r<   r  @BridgeTowerTextEmbeddings.create_position_ids_from_inputs_embedsI  sn     $((*3B/%a.||!O_:Q>ejjYfYmYm
 %%a(//<<r;   c                     U R                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    x: torch.Tensor x:

Returns: torch.Tensor
r"   r   )ner   r6   cumsumtype_asr  )r  r  r  maskincremental_indicess        r<   r  <BridgeTowerTextEmbeddings.create_position_ids_from_input_ids[  sW     ||K(,,.$||Da8@@FI__cgg"'')K77r;   )rU   r  r  r  r  r  )NNNNr   )r   )r1   r2   r3   r4   r5   rQ   r6   
LongTensorr7   r   ru   rs   staticmethodr  r  r:   rv   rw   s   @r<   r  r    s    Q
, .2260426&'.##d*. ((4/. &&-	.
 ((4/. !$. 
.` = =" 8 8r;   r  c                       \ rS rSr% \\S'   SrSrSrSS/r	Sr
\\\S	.r\R                   " 5       S
\R$                  4S j5       rSrg)BridgeTowerPreTrainedModelil  r_   bridgetower)imagetextFr>  rF   rM  )r.   r/   cross_attentionsr-  c                    U R                   R                  n[        U[        5      (       Ga  U R                   R                  S-  SU R                   R
                  -  S-  -  nU R                   R                  S-  nSU R                   R                  -  S-  nUR                  R                   H  n[        R                  " UR                  R                  XB-  S9  [        R                  " UR                  R                  5        [        R                  " UR                  R                  R                  X2-  S9  [        R                  " UR                   R"                  R                  XR-  S9  [        R                  " UR                   R$                  R                  X2-  S9  M     [        R                  " UR&                  R(                  XB-  S9  [        R                  " UR&                  R*                  R                  XB-  S9  GO[        U[,        R.                  [,        R0                  [,        R2                  45      (       a%  [        R                  " UR                  SSU-  S9  GO[        U[,        R4                  5      (       aB  [        R                  " UR6                  5        [        R8                  " UR                  5        GO?[        U[:        5      (       a6  [        R<                  " UR>                  U R                   R@                  5        O[        U[B        5      (       aO  [        RD                  " URF                  [H        RJ                  " URL                  5      RO                  S5      5        O[        U[P        5      (       a{  [        RD                  " URF                  [H        RJ                  " URF                  RR                  S   5      RO                  S5      5        [        R                  " URT                  5        [        U[,        R.                  [V        45      (       a/  UR6                  b!  [        R                  " UR6                  5        g g g )	Nr4  r   )stdrO  g?)meanr  r   r   ),r_   initializer_factorr  r   rS   r|   r   r   initnormal_rT   in_proj_weightzeros_in_proj_biasout_projr   rZ   rL   rO   r   r   r   r   rY   r   r   rU   r   ones_!BridgeTowerForContrastiveLearning	constant_logit_scalelogit_scale_init_valuer   copy_r   r6   r   r   r   r  r   r  BridgeTowerMLMHead)r^   r-  r  proj_stdattn_stdfc_stdr   s          r<   _init_weights(BridgeTowerPreTrainedModel._init_weightsz  s   kk,,f:;;//51t{{?\?\;\ae:efH{{..4H$++111d:F++55UZZ66HNKEJJ334UZZ0077X^LUYY^^22EUYY--44(.I 6 LL**::OLL**==DD(.YBIIr|| DEELLSdSjA--KK$JJv}}% ABBNN6--t{{/Q/QR ;<<JJv**ELL9M9M,N,U,UV],^_ 9::JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--.fryy*<=>>6;;CZKK$ D[>r;   r0   N)r1   r2   r3   r4   r#   r8   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placementr  r>  r`  _can_record_outputsr6   no_gradr   Moduler  r:   r0   r;   r<   r  r  l  sb    %(&+#35ST"3-.5 ]]_%BII % %r;   r  c                   R   ^  \ rS rSr% \\S'   SrU 4S jr\S 5       r	SS jr
SrU =r$ )	BridgeTowerVisionModeli  r_   )r  c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g ro   )rP   rQ   r   visual	post_initr]   s     r<   rQ   BridgeTowerVisionModel.__init__  s&     26:r;   c                 j    U R                   R                  R                  R                  R                  $ ro   )r  r   r   r   rf   r^   s    r<   rf   BridgeTowerVisionModel.dtype  s$    {{%%55<<BBBr;   c                 X    U R                  UR                  U R                  5      X#5      $ ro   )r  typerf   )r^   r  
image_maskr   r2  s        r<   rs   BridgeTowerVisionModel.forward  s     {{5::djj1:XXr;   )r  )NF)r1   r2   r3   r4   r%   r8   r  rQ   propertyrf   rs   r:   rv   rw   s   @r<   r  r    s6    ##!
 C CY Yr;   r  a0  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    c                     ^  \ rS rSr% \\S'   SrSU 4S jjrS rS r	\
\\         SS\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\\   S\4S jj5       5       5       rS rSrU =r$ )BridgeTowerTextModeli  r_   )r  c                    > [         TU ]  U5        Xl        SU l        [	        U5      U l        [        U5      U l        U(       a  [        U5      OSU l	        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
FN)rP   rQ   r_   gradient_checkpointingr  r   r  encoderr$  poolerr  )r^   r_   add_pooling_layerr`   s      r<   rQ   BridgeTowerTextModel.__init__  sT    
 	 &+#3F;-f53D'/$ 	r;   c                 .    U R                   R                  $ ro   r   r  r  s    r<   get_input_embeddings)BridgeTowerTextModel.get_input_embeddings  s    ...r;   c                 $    XR                   l        g ro   r  r^   r0  s     r<   set_input_embeddings)BridgeTowerTextModel.set_input_embeddings  s    */'r;   Nr  rc   r  r   r  rc  ru  rM  r  r2  r   c
           
          US L US L-  (       a  [        S5      eU R                  R                  (       d  Sn	U	(       a1  Uc.  [        [	        U R                  S9[	        U R                  S95      nUb  UR                  5       OSnU R                  UUUUUS9nU R                  UUUUUS9u  p'U R                  " U4UUUUU	US.U
D6nUS   nU R                  b  U R                  U5      OS n[        UUUR                  S9$ )	Nz:You must specify exactly one of input_ids or inputs_embedsF)r_   r   )r  r   r  r  r  )rc   ru  embedding_outputrc  rM  )rc   rc  ru  rM  r  r   )r  r-   rM  )r   r_   rH  r   r   get_seq_lengthr   _create_attention_masksr  r  r   rM  )r^   r  rc   r  r   r  rc  ru  rM  r  r2  r  r  encoder_outputssequence_outputr+  s                   r<   rs   BridgeTowerTextModel.forward  s8   & -t";<YZZ{{%%I01,dkk2RT`hlhshsTtuOETE`!?!?!Afg??%)'#9 + 
 261M1M)#9-"7+ 2N 2
. ,,	
)"7#9+%	
 	
 *!,8<8OO4UY;-'+;;
 	
r;   c                     U R                   R                  (       a  [        U R                   UUUS9nO[        U R                   UUS9nUb  [        U R                   UUUS9nX4$ )N)r_   r  rc   rM  )r_   r  rc   )r_   r  rc   rc  )r_   rH  r   r   )r^   rc   ru  r  rc  rM  s         r<   r  ,BridgeTowerTextModel._create_attention_masks  sr     ;;!!/{{.- /	N 7{{.-N "-%>{{.5&;	&" 55r;   )r_   r   r  r  r  )T)	NNNNNNNNN)r1   r2   r3   r4   r$   r8   r  rQ   r	  r  r    r!   r   r6   ru   r   rk   r   r   r   rs   r  r:   rv   rw   s   @r<   r   r     s%    "! "/0   *..2.2,0-1596:(,!%9
<<$&9
 t+9
 t+	9

 llT)9
 ||d*9
  %||d29
 !&t 39
 9
 $;9
 +,9
 
69
	    9
x6 6r;   r   zv
    The bare BridgeTower Model transformer outputting BridgeTowerModelOutput object without any specific head on
    c                     ^  \ rS rSrU 4S jrS rS rS\R                  S\	S\R                  4S jr
S\R                  S\	S\R                  4S	 jr\\          SS\R                  S
-  S\R                  S
-  S\R                  S
-  S\R                  S
-  S\R                  S
-  S\R                  S
-  S\R                  S
-  S\	S
-  S\R                  S
-  S\S\\   S\\R                     \-  4S jj5       5       rS rSrU =r$ )BridgeTowerModeli8  c           	      	  > [         TU ]  U5        Xl        UR                  nUR                  nUR
                  (       aa  [        R                  " UR                  UR                  5      U l	        [        R                  " UR                  UR                  5      U l
        O[        R                  " [        UR                  5       Vs/ s H.  n[        R                  " UR                  UR                  5      PM0     sn5      U l	        [        R                  " [        UR                  5       Vs/ s H.  n[        R                  " UR                  UR                  5      PM0     sn5      U l
        [        R                  " SUR                  5      U l        [!        U5      U l        [%        U5      U l        UR(                  (       d  UR*                  (       a  U R"                  R,                  R.                   H  nU R"                  R,                  R0                  R2                  R4                  UR2                  l        U R"                  R,                  R0                  R6                  R4                  UR6                  l        M     [        R                  " [        UR                  5       Vs/ s H  n[9        U5      PM     sn5      U l        [        R                  " [        UR                  5       Vs/ s H  n[9        U5      PM     sn5      U l        [?        U5      U l         [?        U5      U l!        [        RD                  " UR                  URF                  S9U l$        [        RD                  " UR                  URF                  S9U l%        URL                  (       a!  [O        U5      U l(        [O        U5      U l)        O[        R                  " [        UR                  S-
  5       Vs/ s H  n[O        U5      PM     sn5      U l(        [        R                  " [        UR                  S-
  5       Vs/ s H  n[O        U5      PM     sn5      U l)        U RU                  5         g s  snf s  snf s  snf s  snf s  snf s  snf )Nr   rJ   r"   )+rP   rQ   r_   vision_configtext_config$share_cross_modal_transformer_layersr   rY   rS   cross_modal_text_transformcross_modal_image_transformr~   r   r|   r   r  r  vision_modelr   
text_modelr   "init_layernorm_from_vision_encoderr  cross_modal_ln_separater   r   datar   r{  cross_modal_image_layerscross_modal_text_layersr$  cross_modal_image_poolercross_modal_text_poolerrU   rV   cross_modal_text_layernormcross_modal_image_layernormshare_link_tower_layersr   cross_modal_text_link_towercross_modal_image_link_towerr  )r^   r_   r  r  r   r   r`   s         r<   rQ   BridgeTowerModel.__init__>  sU    ,,((66.0ii8O8OQWQcQc.dD+/1yy9R9RTZTfTf/gD,.0mmQVW]WoWoQpqQpA;22F4F4FGQpq/D+ 02}}SXY_YqYqSrsSra=44f6H6HISrs0D, &(\\!V5G5G%H"2=A.{;,,1Z1Z''..FF!%!2!2!9!9!A!A!H!H!M!M		#0077??DDII G )+=B6C[C[=\]=\&{3=\])
% (*}}=B6C[C[=\]=\&{3=\](
$
 ):&(A%'8'@$ +-,,v7I7IvOdOd*e'+-<<8J8JPVPePe+f())/CF/KD,0DV0LD-/1}}7<V=U=UXY=Y7Z[7Z!%f-7Z[0D, 137<V=U=UXY=Y7Z[7Z!%f-7Z[1D- 	W r t ^ ^  \ \s$   5Q+05Q0-Q55Q:.Q?9Rc                 6    U R                   R                  5       $ ro   )r!  r	  r  s    r<   r	  %BridgeTowerModel.get_input_embeddingsv  s    3355r;   c                 :    U R                   R                  U5        g ro   )r!  r  r  s     r<   r  %BridgeTowerModel.set_input_embeddingsy  s    ,,U3r;   r.   rJ  r   c                     U R                   R                  (       a  U R                  U5      $ U R                  U   " U5      $ ro   )r_   r  r  r^   r.   rJ  s      r<   _apply_text_transform&BridgeTowerModel._apply_text_transform|  s6    ;;;;22=AA..y9-HHr;   c                     U R                   R                  (       a  U R                  U5      $ U R                  U   " U5      $ ro   )r_   r  r  r4  s      r<   _apply_image_transform'BridgeTowerModel._apply_image_transform  s6    ;;;;33MBB//	:=IIr;   Nr  rc   r  r   
pixel_maskr  rC   image_token_type_idxlabelsr   r2  c           
         / n/ n/ n/ nUb  Uc  [        S5      eU=(       d    SnUR                  5       nU R                  R                  US9nUR	                  U5        Uc.  [
        R                  " U[
        R                  UR                  S9nU R                  R                  UU5      R                  UR                  5      n[        U R                  R                  R                  5      U R                  R                  -
  S-   nU R                  R                  R                  SU  H  nU" UU5      nUR	                  U5        M     UcH  U R                   R"                  R%                  UR'                  U R                   R(                  5      U
S9nOUR+                  SSS5      nUR	                  U5        U R                   R"                  R,                  R.                  SU  H  nU" U5      nUR	                  U5        M     U R                   R"                  R1                  UR'                  U R                   R(                  5      5      nU R3                  USS	9nU R5                  [
        R6                  " S[
        R                  UR                  S95      R9                  U5      nU R;                  UU-   5      nU R=                  USS	9nU R5                  [
        R>                  " S
U[
        R                  UR                  S95      R9                  U5      nUU-   nU RA                  U5      n[
        R                  " UR                  S5      UR                  S5      4[
        R                  UR                  S9nU R                  R                  XUR                  5       5      R                  UR                  5      nU RB                  S   " UUUUS9nUS   nU RD                  S   " UUUUS9nUS   nUR	                  UU45        UR	                  US   US   45        Sn [G        U[        U R                  R                  R                  5      5       GH  n!U R                  R                  R                  U!   " UU5      nU R                   R"                  R,                  R.                  U!   " U5      R'                  U R                   R(                  5      nU R=                  U R                   R"                  R1                  U5      U S-   5      U-   nU RH                  U    n"U RJ                  U    n#U R3                  UU S-   5      n$U"" U$U-   UU5      n%U#" UUU5      n&U RB                  U S-      " U%U&UUS9nUS   nU RD                  U S-      " U&U%UUS9nUS   nU S-  n UR	                  U5        UR	                  U5        UR	                  UU45        UR	                  US   US   45        GM     UUn(n'U RM                  U'U(5      n)[O        U'U(U)[Q        U5      [Q        U5      [Q        U5      4[Q        U5      S9$ )a  
image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
    Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
    This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
image_token_type_idx (`int`, *optional*):
    - The token type ids for images.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels are currently not supported.

Examples:

```python
>>> from transformers import BridgeTowerProcessor, BridgeTowerModel
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> # prepare image and text
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))
>>> text = "hello world"
>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base")
>>> model = BridgeTowerModel.from_pretrained("BridgeTower/bridgetower-base")

>>> inputs = processor(image, text, return_tensors="pt")
>>> outputs = model(**inputs)
>>> outputs.keys()
odict_keys(['text_features', 'image_features', 'pooler_output'])
```NzYBridgeTowerModel does not use `inputs_embeds`.  Make sure to pass in `input_ids` instead.r"   )r  re   r   r   r   r  r"   )rc   ru  )r+   r,   r-   r.   r/   ))r   r   r!  r   r   r6   onesr  rg   get_extended_attention_maskrj   lenr  rr   r_   r|   r   r  r   r  rf   r   r   r   r   r5  r  r  	expand_asr)  r8  fullr*  r&  r%  r   r,  r-  get_cls_featuresr)   r9   )*r^   r  rc   r  r   r:  r  rC   r;  r<  r   r2  all_hidden_states_textall_hidden_states_imageall_hidden_states_crossall_self_attentionsrU  rB   extend_text_maskssplit_indexrr   r   image_embeds_with_lncross_modal_texttext_token_type_embeddingsimage_token_type_embeddingscross_modal_imageextend_image_maskslayer_outputs_textcross_text_featureslayer_outputs_imagecross_image_featureslink_layer_indexr  text_link_towerimage_link_towertransformed_text_embedscross_text_features_cross_image_features_r+   r,   cls_featuress*                                             r<   rs   BridgeTowerModel.forward  sf   \ "$"$"$ $):%k   48qnn&oo0090E%%k2!"ZZ5::iN^N^_N OOGGXcdgg

 $//117784;;;X;XX[\\ __,,22<K@E->?K"))+6 A ,,33??!!$"3"3"9"9:Um @ L
 (//1a8L&&|4 &&--99CCL[QE .L#**<8 R  $0077DD\EVEVW[WhWhWnWnEop  55kQ5O%)%?%?KKI4D4DE&

)$
% 	#  ::;KNh;hi#::;O[\:]&*&@&@JJt1IL\L\]'

)(
) 	$  46QQ <<=QRZZ##A&(9(>(>q(AB**##


 "__HHUdUdUfgjj
 "99!<,#5	
 13";;A>-#4	
  315&&(;=Q'RS""$6q$9;Nq;Q#RS {C(?(?(E(E$FGA//1177:;HYZK,,33??II!L\Z__!!''L ++D,=,=,D,D,Q,QR^,_aqtuauv-. !
 #>>?OPO#@@AQR '+&@&@N^abNb&c##2'*DD#!$ 
 %55IK_as$t! "&!=!=>NQR>R!S$%0'9	" #5Q"7"&"?"?@PST@T"U%$1'8	# $7q#9 !"))+6#**<8#**,?AU+VW&&(:1(=?RST?U'VW[ H` )<=Q~,,]NK%')&,--.-.
 01

 
	
r;   c                 r    U R                  U5      nU R                  U5      n[        R                  " X4/SS9$ )Nr   r   )r(  r'  r6   r   )r^   r+   r,   cls_features_textcls_features_images        r<   rD  !BridgeTowerModel.get_cls_featuresS  s9     88G!::>Jyy+@bIIr;   )r_   r*  r%  r-  r'  r  r)  r&  r,  r(  r  r!  r  r   )
NNNNNNNNNF)r1   r2   r3   r4   rQ   r	  r  r6   ru   r   r5  r8  r   r   r  r7   rk   r   r   r9   r)   rs   rD  r:   rv   rw   s   @r<   r  r  8  s   6p64I5<< IC ITYT`T` I
JELL JS JUZUaUa J
  .2372615.22615+/*.).I
##d*I
 ))D0I
 ((4/	I

 ''$.I
 $$t+I
 ((4/I
 ''$.I
 "DjI
   4'I
 #'I
 +,I
 
u||	5	5I
  I
VJ Jr;   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )"BridgeTowerPredictionHeadTransformiZ  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g r   )rP   rQ   r   rY   rS   r  r  r  r  r
   transform_act_fnrU   rV   r]   s     r<   rQ   +BridgeTowerPredictionHeadTransform.__init__[  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr;   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ ro   )r  rd  rU   r  s     r<   rs   *BridgeTowerPredictionHeadTransform.forwardd  s4    

=1--m<}5r;   )rU   r  rd  r   rw   s   @r<   rb  rb  Z  s    U r;   rb  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )r  ik  c                 n  > [         TU ]  5         Xl        [        U5      U l        [
        R                  " UR                  UR                  R                  SS9U l
        [
        R                  " [        R                  " UR                  R                  5      5      U l        Ub  X R                  l        g g )NF)r   )rP   rQ   r_   rb  	transformr   rY   rS   r  r  decoderr   r6   r  r   r   )r^   r_   r   r`   s      r<   rQ   BridgeTowerMLMHead.__init__l  s    ;FCyy!3!3V5G5G5R5RY^_LLV-?-?-J-J!KL	"(LL r;   c                 d    U R                  U5      nU R                  U5      U R                  -   nU$ ro   )rj  rk  r   )r^   x	mlm_scores      r<   rs   BridgeTowerMLMHead.forwardu  s-    NN1%	LL+dii7	r;   )r   r_   rk  rj  ro   r   rw   s   @r<   r  r  k  s    ) r;   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BridgeTowerITMHeadi{  c                 Z   > [         TU ]  5         [        R                  " US5      U l        g Nr   rP   rQ   r   rY   fc)r^   rS   r`   s     r<   rQ   BridgeTowerITMHead.__init__|  s     ))K+r;   c                 (    U R                  U5      nU$ ro   rv  )r^   rn  	itm_scores      r<   rs   BridgeTowerITMHead.forward  s    GGAJ	r;   ry  r   rw   s   @r<   rr  rr  {  s    , r;   rr  z\
    BridgeTower Model with a language modeling head on top as done during pretraining.
    c                   l  ^  \ rS rSrSS0rU 4S jrS rS r\\	        SS\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\\   S\4S jj5       5       rSrU =r$ )BridgeTowerForMaskedLMi  zmlm_score.decoder.weightz8bridgetower.text_model.embeddings.word_embeddings.weightc                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g ro   )rP   rQ   r  r  r  ro  r  r]   s     r<   rQ   BridgeTowerForMaskedLM.__init__  s5     +F3+F3 	r;   c                 .    U R                   R                  $ ro   ro  rk  r  s    r<   get_output_embeddings,BridgeTowerForMaskedLM.get_output_embeddings  s    ~~%%%r;   c                 $    XR                   l        g ro   r  )r^   new_embeddingss     r<   set_output_embeddings,BridgeTowerForMaskedLM.set_output_embeddings  s    !/r;   Nr  rc   r  r   r:  r  rC   r<  r2  r   c	                    U R                   " SUUUUUUUS.U	D6n
U R                  U
R                  5      nSnUbk  [        5       nUR	                  UR
                  5      nU" UR                  SU R                  R                  R                  5      UR                  S5      5      n[        UUU
R                  U
R                  S9$ )a  
image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
    Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
    This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

Examples:

```python
>>> from transformers import BridgeTowerProcessor, BridgeTowerForMaskedLM
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> url = "http://images.cocodataset.org/val2017/000000360943.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read())).convert("RGB")
>>> text = "a <mask> looking out of the window"

>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
>>> model = BridgeTowerForMaskedLM.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")

>>> # prepare inputs
>>> encoding = processor(image, text, return_tensors="pt")

>>> # forward pass
>>> outputs = model(**encoding)

>>> results = processor.decode(outputs.logits.argmax(dim=-1).squeeze(0).tolist())

>>> print(results)
.a cat looking out of the window.
```r  rc   r  r   r:  r  rC   Nr   r@   rA   r.   r/   r0   )r  ro  r+   r   rj   rg   r   r_   r  r  r   r.   r/   )r^   r  rc   r  r   r:  r  rC   r<  r2  outputs
mlm_logitsmasked_lm_lossloss_fcts                 r<   rs   BridgeTowerForMaskedLM.forward  s    d "" 	
))%!'%	
 	
 ^^G$9$9:
')HYYz001F%joob$++:Q:Q:\:\&]_e_j_jkm_noN!//))	
 	
r;   )r  ro  NNNNNNNN)r1   r2   r3   r4   _tied_weights_keysrQ   r  r  r   r   r6   r  r7   r   r   r   rs   r:   rv   rw   s   @r<   r}  r}    s"    56pq&0  .2372615.22615*.H
##d*H
 ))D0H
 ((4/	H

 ''$.H
 $$t+H
 ((4/H
 ''$.H
   4'H
 +,H
 
H
  H
r;   r}  z
    BridgeTower Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the
    [CLS] token) for image-to-text matching.
    c                   X  ^  \ rS rSrU 4S jr\\        SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\
\   S\4S jj5       5       rSrU =r$ )#BridgeTowerForImageAndTextRetrievali  c                    > [         TU ]  U5        [        U5      U l        [	        UR
                  S-  5      U l        U R                  5         g rt  )rP   rQ   r  r  rr  rS   rz  r  r]   s     r<   rQ   ,BridgeTowerForImageAndTextRetrieval.__init__  s@     +F3+F,>,>,BC 	r;   Nr  rc   r  r   r:  r  rC   r<  r2  r   c	                    U R                   " SUUUUUUUS.U	D6n
U
R                  nU R                  U5      nSnUb-  [        5       nUR	                  UR
                  5      nU" X5      n[        UUU
R                  U
R                  S9$ )a  
image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
    Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
    This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
labels (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*):
    Labels for computing the image-text matching loss. 0 means the pairs don't match and 1 means they match.
    The pairs with 0 will be skipped for calculation.

Examples:

```python
>>> from transformers import BridgeTowerProcessor, BridgeTowerForImageAndTextRetrieval
>>> import httpx
>>> from io import BytesIO
>>> from PIL import Image

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))
>>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"]

>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
>>> model = BridgeTowerForImageAndTextRetrieval.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")

>>> # forward pass
>>> scores = dict()
>>> for text in texts:
...     # prepare inputs
...     encoding = processor(image, text, return_tensors="pt")
...     outputs = model(**encoding)
...     scores[text] = outputs.logits[0, 1].item()
```r  Nr  r0   )	r  r-   rz  r   rj   rg   r   r.   r/   )r^   r  rc   r  r   r:  r  rC   r<  r2  r  r-   rA   itm_lossr  s                  r<   rs   +BridgeTowerForImageAndTextRetrieval.forward  s    \ "" 	
))%!'%	
 	
  --.')HYYv}}-F/H'!//))	
 	
r;   )r  rz  r  )r1   r2   r3   r4   rQ   r   r   r6   r  r7   r   r   r   rs   r:   rv   rw   s   @r<   r  r    s
     .2372615.22615*.G
##d*G
 ))D0G
 ((4/	G

 ''$.G
 $$t+G
 ((4/G
 ''$.G
   4'G
 +,G
 
"G
  G
r;   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BridgeTowerContrastiveHeadiF  c                 X   > [         TU ]  5         [        R                  " X5      U l        g ro   ru  )r^   rS   
embed_sizer`   s      r<   rQ   #BridgeTowerContrastiveHead.__init__G  s    ))K4r;   c                 (    U R                  U5      nU$ ro   ry  )r^   rn  s     r<   rs   "BridgeTowerContrastiveHead.forwardK  s    GGAJr;   ry  r   rw   s   @r<   r  r  F  s    5 r;   r  zl
    BridgeTower Model with a image-text contrastive head on top computing image-text contrastive loss.
    c                   D  ^  \ rS rSrU 4S jr\\        SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\
S-  S\\   S\4S jj5       5       rSrU =r$ )r  iP  c                   > [         TU ]  U5        [        U5      U l        [	        UR
                  UR                  5      U l        [	        UR
                  UR                  5      U l        [	        UR
                  S-  UR                  5      U l	        [        R                  " [        R                  " U R                  R                  5      5      U l        U R#                  5         g rt  )rP   rQ   r  r  r  rS   contrastive_hidden_sizeitc_text_headitc_image_headitc_cross_modal_headr   r   r6   r   r_   r  r  r  r]   s     r<   rQ   *BridgeTowerForContrastiveLearning.__init__V  s     +F378J8JFLjLjk89K9KVMkMkl$>v?Q?QTU?UW]WuWu$v!<<T[[5W5W(XYr;   Nr  rc   r  r   r:  r  rC   return_lossr2  r   c	                    U	R                  SS5        U R                  " SUUUUUUUS.U	D6n
U
R                  nU
R                  u  pnUS   nUS   nU R                  R                  R
                  R                  U5      nU R                  R                  [        R                  " SS[        R                  U R                  R                  R                  R                  S95      R                  U5      nU R                  R                  U5      U-   n[        R                   R#                  U R%                  USS2S	SS24   5      SS
S9n[        R                   R#                  U R'                  USS2S	SS24   5      SS
S9R)                  UR                  S9n[        R                   R#                  U R+                  U5      SS
S9R)                  UR                  S9n[        R,                  " XU/SS9nU R.                  R1                  5       R)                  UR                  S9n[        R2                  " XR5                  5       5      U-  n[        R2                  " UUR5                  5       5      U-  n[        R2                  " UUR5                  5       5      U-  nSnU(       a  [        R6                  " [9        U5      UR                  S9n[        R                   R;                  UU5      n[        R                   R;                  UU5      n[        R                   R;                  UU5      nUU-   U-   S-  n[=        UUUUUU
R                  U
R>                  S9$ )a  
image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
    Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
    This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.

Examples:

```python
>>> from transformers import BridgeTowerProcessor, BridgeTowerForContrastiveLearning
>>> import httpx
>>> from io import BytesIO
>>> from PIL import Image
>>> import torch

>>> image_urls = [
...     "https://farm4.staticflickr.com/3395/3428278415_81c3e27f15_z.jpg",
...     "http://images.cocodataset.org/val2017/000000039769.jpg",
... ]
>>> texts = ["two dogs in a car", "two cats sleeping on a couch"]

>>> with httpx.stream("GET", urls[0]) as response:
...     image1 = Image.open(BytesIO(response.read()))

>>> with httpx.stream("GET", urls[1]) as response:
...     image2 = Image.open(BytesIO(response.read()))

>>> images = [image1, image2]

>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")
>>> model = BridgeTowerForContrastiveLearning.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")

>>> inputs = processor(images, texts, padding=True, return_tensors="pt")
>>> loss = model(**inputs, return_loss=True).loss

>>> inputs = processor(images, texts[::-1], padding=True, return_tensors="pt")
>>> loss_swapped = model(**inputs, return_loss=True).loss

>>> print("Loss", round(loss.item(), 4))
Loss 0.0019

>>> print("Loss with swapped images", round(loss_swapped.item(), 4))
Loss with swapped images 2.126
```output_hidden_statesTr  r   r>  r"   re   Nr   r   )r   r5  )rg   r   g      @)r@   rA   rB   rC   rD   r.   r/   r0   ) 
setdefaultr  r-   r.   r   r  r   r  r6   rC  r  r   rg   rB  r  r   r   	normalizer  r  rj   r  r   r  expr7  tr   rA  cross_entropyr>   r/   )r^   r  rc   r  r   r:  r  rC   r  r2  r  r-   hidden_states_txthidden_states_imghidden_states_cross_modalrB   rK  rN  rD   rA   r  logits_text_to_imagelogits_text_to_crosslogits_image_to_crossitc_lossr<  text_to_image_losstext_to_cross_lossimage_to_cross_losss                                r<   rs   )BridgeTowerForContrastiveLearning.forwardc  s"   v 	0$7"" 	
))%!'%	
 	
  --JQJ_J_G.G'+(,#//<<CCPPQ]^&*&6&6&L&LJJtQejj9I9I9_9_9f9f9m9mn'

)(
) 	$ ''CCDXY\ww mm--d.@.@QPQSTWAU.V\^bc-d}}..t/B/B<PQSTVWPWCX/Y_aef.gjj%% k 
 }}..t/H/H/W]_cd.ehh%% i 
 kFBO&&**,//{7I7I/J$||K9IJ[X$||K9IJ[X %\<>>;K L{ Z\\#f+fmmDF!#!<!<=QSY!Z!#!<!<=QSY!Z"$--"="=>SU["\*-??BUUY\\H+#%%!//))
 	
r;   )r  r  r  r  r  r  )r1   r2   r3   r4   rQ   r   r   r6   r  r7   rk   r   r   r>   rs   r:   rv   rw   s   @r<   r  r  P  s     .2372615.22615#'s
##d*s
 ))D0s
 ((4/	s

 ''$.s
 $$t+s
 ((4/s
 ''$.s
 D[s
 +,s
 
&s
  s
r;   r  )r  r  r}  r  r  )NrO  )[r5   collectionsr   collections.abcr   dataclassesr   r6   r   torch.nnr    r	   r  activationsr
   r   cache_utilsr   r   r   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   utils.genericr   r    utils.output_capturingr!   configuration_bridgetowerr#   r$   r%   
get_loggerr1   logger_TOKENIZER_FOR_DOCr)   r>   r  rF   ry   r   r   r   r  r  r  r$  ru   floatr<  r>  r`  rn  r{  r  r  r  r  r  r   r  rb  r  rr  r}  r  r  r  __all__r0   r;   r<   <module>r     s     # $ !   % & 6 C C J 9  G & 6 K K I 5 h h 
		H	%'  
 7[ 7 7$ 
 7; 7 74)299 )XRYY 6P")) Pf7"299 7"td299 d4BII bii  		 		 , !%II%<<% 
% <<	%
 LL4'% T\% % '(%:@)ryy @)HI)		 I)Z.299 .:8		 8v?5 ?F
RYY 
Fg8		 g8T +% +% +%\Y7 Y" {65 {6{6| 
YJ1 YJ
YJz "    
\
7 \

\
~ T
*D T
T
n  
C
(B C

C
Lr;   