
    Z j#                       S r SSKrSSKrSSKJr  SSKJr  SSKJr  SSK	r	SSK
Js  Jr  SSK	Jr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJrJr  SSKJr  SSKJr  SSKJ r J!r!J"r"J#r#J$r$J%r%  SSK&J'r'  SSK(J)r)  SSK*J+r+J,r,J-r-  \$R\                  " \/5      r0S r1S r2S r3S\	Rh                  S\	Rh                  4S jr5\\"" SS9 " S S\ 5      5       5       r6\"" SS9\ " S S \ 5      5       5       r7\"\ " S! S"\ 5      5       5       r8 " S# S$\Rr                  5      r: " S% S&\Rr                  5      r; " S' S(\Rr                  5      r< " S) S*\Rr                  5      r= " S+ S,\Rr                  5      r> " S- S.\Rr                  5      r? " S/ S0\Rr                  5      r@ " S1 S2\Rr                  5      rA " S3 S4\Rr                  5      rB " S5 S6\5      rC " S7 S8\Rr                  5      rD " S9 S:\Rr                  5      rE " S; S<\Rr                  5      rF " S= S>\Rr                  5      rG SeS?\Rr                  S@\	Rh                  SA\	Rh                  SB\	Rh                  SC\	Rh                  S-  SD\HSE\H4SF jjrI " SG SH\Rr                  5      rJ " SI SJ\Rr                  5      rK " SK SL\Rr                  5      rL " SM SN\Rr                  5      rM " SO SP\Rr                  5      rN " SQ SR\5      rO " SS ST\Rr                  5      rP " SU SV\Rr                  5      rQ\" " SW SX\5      5       rR " SY SZ\R5      rS\"" S[S9 " S\ S]\R5      5       rT\" " S^ S_\R5      5       rU\" " S` Sa\R5      5       rV\" " Sb Sc\R5      5       rW/ SdQrXg)fzPyTorch CLAP model.    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)merge_with_config_defaults)capture_outputs   )ClapAudioConfig
ClapConfigClapTextConfigc                     U R                   u  p#nU SS2SS2SSS24   R                  SSUS5      nUR                  X#U-  U5      nU$ )aI  
Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN.

Args:
    hidden_states (`torch.FloatTensor` of shape (batch_size, time_length, classes_num)):
        Input hidden states
    ratio (`int`):
        The ratio of the length of the output to the length of the input.
Nr   )shaperepeatreshape)hidden_statesratio
batch_sizetime_lengthclasses_num	upsampleds         w/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/clap/modeling_clap.pyinterpolater(   .   sT     .;-@-@*ZkaD!m,33Aq%CI!!*E.A;OI    c                     U R                   u  p#pEU R                  X#U-  XU-  X5      n U R                  SSSSSS5      R                  5       R                  SXU5      nU$ )a2  
Returns the resized hidden states. The output shape should be `(batch_size * num_windows, window_size, window_size,
num_channels)`

Args:
    hidden_states (`torch.FloatTensor` of shape `(batch_size, height, width, num_channels)`):
        Input hidden states
    window_size (`int`):
        Window size
r   r   r            r   viewpermute
contiguous)r!   window_sizer#   heightwidthnum_channelswindowss          r'   window_partitionr8   ?   so     /<.A.A+J!&&k);8LkM ##Aq!Q15@@BGGKfrsGNr)   c                     U R                   S   nU R                  SX!-  X1-  XU5      n U R                  SSSSSS5      R                  5       R                  SX#U5      n U $ )a_  
Merges windows to produce higher resolution features.
Args:
    windows (`torch.FloatTensor` of shape `(num_windows * batch_size, window_size, window_size, num_channels)`):
        Input windows
    window_size (`int`):
        Window size
    height (`int`):
        Height of the resized audio
    width (`int`):
        Width of the resized audio
r.   r   r   r   r+   r,   r-   r/   )r7   r3   r4   r5   r6   s        r'   window_reverser:   T   se     ==$Lll2v4e6JKfrsGooaAq!Q/::<AA"fUabGNr)   logitsreturnc                     [         R                  " [        U 5      U R                  S9n[        R
                  R                  X5      $ )Ndevice)torcharangelenr?   r   
functionalcross_entropy)r;   labelss     r'   contrastive_lossrF   i   s/    \\#f+fmm<F==&&v66r)   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   S	rg)
ClapTextModelOutputn   z
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
    The text embeddings obtained by applying the projection layer to the pooler_output.
Ntext_embedslast_hidden_state.r!   
attentions )__name__
__module____qualname____firstlineno____doc__rK   r@   FloatTensor__annotations__rL   r!   tuplerM   __static_attributes__rN   r)   r'   rI   rI   n   sr    
 -1K""T)026u((4/6:>M5**C/047>7;Je'',-4;r)   rI   zT
    ClapAudio model output to mimic the output of the original implementation.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   S	rg)
ClapAudioModelOutput   z
audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
    The Audio embeddings obtained by applying the projection layer to the pooler_output.
Naudio_embedsrL   .r!   rM   rN   )rO   rP   rQ   rR   rS   r[   r@   rT   rU   rL   r!   rV   rM   rW   rN   r)   r'   rY   rY      sr    
 .2L%##d*126u((4/6:>M5**C/047>7;Je'',-4;r)   rY   c                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\\S	'   Sr\\S
'   S\\   4S jrSrg)
ClapOutput   ab  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Contrastive loss for audio-text similarity.
logits_per_audio (`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`):
    The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text
    similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`):
    The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio
    similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`].
audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`].
text_model_output (`BaseModelOutputWithPooling`):
    The output of the [`ClapTextModel`].
audio_model_output (`BaseModelOutputWithPooling`):
    The output of the [`ClapAudioModel`].
Nlosslogits_per_audiologits_per_textrK   r[   text_model_outputaudio_model_outputr<   c                 B    [        S U R                  5        5       5      $ )Nc              3   p   #    U  H,  n[        U[        5      (       a  UR                  5       OUv   M.     g 7fN)
isinstancer   to_tuple).0vs     r'   	<genexpr>&ClapOutput.to_tuple.<locals>.<genexpr>   s)     ^P]1Z;%?%?QZZ\QFP]s   46)rV   valuesselfs    r'   rh   ClapOutput.to_tuple   s    ^PTP[P[P]^^^r)   rN   )rO   rP   rQ   rR   rS   r_   r@   rT   rU   r`   ra   rK   r[   rb   r   rc   rV   r   rh   rW   rN   r)   r'   r]   r]      s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*148185929_%* _r)   r]   c                   6   ^  \ rS rSrSrSU 4S jjrS rSrU =r$ )ClapDropPath   z
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is a slightly
refactored version of the `SwinDropPath` implementation.
c                 .   > [         TU ]  5         Xl        g rf   )super__init__	drop_prob)ro   rw   	__class__s     r'   rv   ClapDropPath.__init__   s    "r)   c                 P   U R                   S:X  d  U R                  (       d  U$ SU R                   -
  nUR                  S   4SUR                  S-
  -  -   nU[        R
                  " X1R                  UR                  S9-   nUR                  5         UR                  U5      U-  nU$ )N        r   r   )r   dtyper?   )
rw   trainingr   ndimr@   randr}   r?   floor_div)ro   r!   	keep_probr   random_tensoroutputs         r'   forwardClapDropPath.forward   s    >>S   &	$$Q')DM4F4F4J,KK!EJJu<O<OXeXlXl$mm""9-=r)   )rw   rf   )	rO   rP   rQ   rR   rS   rv   r   rW   __classcell__rx   s   @r'   rr   rr      s    
# r)   rr   c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ )ClapAudioAFFBlock   z
ATTENTIONAL FEATURE FUSION Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to implement
the 1D version.
configc                   > [         TU ]  5         UR                  nUR                  n[	        X#-  5      n[
        R                  " [
        R                  " X$SSSS9[
        R                  " U5      [
        R                  " SS9[
        R                  " XBSSSS9[
        R                  " U5      5      U l
        [
        R                  " [
        R                  " S5      [
        R                  " X$SSSS9[
        R                  " U5      [
        R                  " SS9[
        R                  " XBSSSS9[
        R                  " U5      5      U l        [
        R                  " 5       U l        g )Nr   r   kernel_sizestridepaddingT)inplace)ru   rv   patch_embeds_hidden_sizeaff_block_rintr   
SequentialConv2dBatchNorm2dReLU	local_attAdaptiveAvgPool2d
global_attSigmoidsigmoid)ro   r   channelsdownsize_ratiointer_channelsrx   s        r'   rv   ClapAudioAFFBlock.__init__   s   22++X78IIhAaQRSNN>*GGD!IInAaQRSNN8$
 --  #IIhAaQRSNN>*GGD!IInAaQRSNN8$
 zz|r)   c                     X-   nU R                  U5      U R                  U5      -   nU R                  U5      nSU-  U-  SU-  SU-
  -  -   nU$ )Nr+   r   )r   r   r   )ro   r!   residualattention_inputfused_layer_outputr   s         r'   r   ClapAudioAFFBlock.forward   s`    '2!^^O<t?__!\\*<=]"%77!h,!N`J`:aar)   )r   r   r   
rO   rP   rQ   rR   rS   r   rv   r   rW   r   r   s   @r'   r   r      s    
$ $0 r)   r   c                   >   ^  \ rS rSrSrS\4U 4S jjrSS jrSrU =r	$ )ClapAudioPatchEmbed   z~
This module converts the hidden states reshaped as an image to patch embeddings ready to be passed to the
Transformer block.
r   c                   > [         TU ]  5         [        UR                  [        5      (       a  UR                  UR                  4OUR                  n[        UR
                  [        5      (       a  UR
                  UR
                  4OUR
                  n[        UR                  [        5      (       a  UR                  UR                  4OUR                  nX l        X@l        US   US   -  US   US   -  4U l        U R                  S   U R                  S   -  U l	        UR                  U l        UR                  U l        US   US   -
  S-  US   US   -
  S-  4nU R                  (       a  UR                  S:X  a  SOSn[        R                  " UR                   U-  UR"                  UUUS9U l        UR&                  (       a   [        R(                  " UR"                  5      O[        R*                  " 5       U l        U R                  (       aX  [/        U5      U l        [        R                  " UR                   UR"                  US   US   S-  4US   US   S-  4US9U l        g g )Nr   r   r+   channel_mapr,   r   r   )ru   rv   rg   	spec_sizer   
patch_sizepatch_strideimg_size	grid_sizenum_patchesflatten_patch_embedsflattenenable_fusionfusion_typer   r   patch_embed_input_channelsr   projenable_patch_layer_norm	LayerNormIdentitynormr   fusion_model
mel_conv2d)ro   r   r   r   r   r   scale_factorrx   s          r'   rv   ClapAudioPatchEmbed.__init__   s0   ;EfFVFVX[;\;\F$$f&6&67bhbrbr6@ARARTW6X6XV 1 12^d^o^o 	 ;EVEXEXZ]:^:^V  &"5"56djdwdw 	 !("1+a8(1+VW:XY>>!,t~~a/@@22#11qMLO39JqMLYZO<[`a;ab ..63E3E3Vq\]II--<++"
	 FLEcEcBLL!@!@Aikititiv	 1& 9D ii11//']JqMA,=>$Qa1)<=DO r)   c                    U R                   (       Ga  US S 2SS2S S 2S S 24   nUR                  u  pEpgX`R                  S   :w  d  XpR                  S   :w  a2  [        SU SU SU R                  S    SU R                  S    S3	5      eU R	                  U5      nUR                  S5      n[        U5      S:  a  XSS 2S S 2S S 24   R                  5       n	U	R                  u  pEpgU	R                  XE-  SXg5      n	U R                  U	5      n	U	R                  u  ppgU	R                  XEXU5      n	U	R                  S5      R                  5       R                  S	5      n	U	R                  S5      n[        R                  R                  R                  U	SX-
  4S
S5      n	U R!                  X2   U	5      X2'   UnOwUR                  u    pnX`R                  S   :w  d  XpR                  S   :w  a2  [        SU SU SU R                  S    SU R                  S    S3	5      eU R	                  U5      nU R                  (       a!  UR                  S5      R#                  SS5      nU R%                  U5      nU$ )Nr   r   zInput audio size (*z) doesn't match model (z).r.   )r   r+   r   r   r,   r   constantr+   )r   r   r   
ValueErrorr   sizerB   r2   r0   r   r1   r   r@   r   rC   padr   	transposer   )ro   r!   is_longer_idxglobal_hidden_statesr#   r6   r4   r5   output_widthlocal_hidden_states_featureslocal_widths                r'   r   ClapAudioPatchEmbed.forward(  s   #0AaCA#>  7K6P6P3Jfq))UmmA6F-F (%8OPTP]P]^_P`Oaabcgcpcpqrcsbttvw  $(99-A#B /44R8L=!A%&312q!4K&L&W&W&Y#:M:S:S7
&&9&>&>z?XZ[]c&k#&*oo6I&J#-@-F-F*V&9&>&>zYakp&q#&9&A&A/&R&]&]&_&g&ghi&j#166r:&+hh&9&9&=&='!\-G)H*VW'# 7;6G6G(79L7$3 1M"/"5"5Aq%q))UmmA6F-F (%8OPTP]P]^_P`Oaabcgcpcpqrcsbttvw  !IIm4M<<)11!4>>q!DM		-0r)   )
r   r   r   r   r   r   r   r   r   r   rf   r   r   s   @r'   r   r      s    
( (T/ /r)   r   c            
          ^  \ rS rSrU 4S jr  SS\R                  S\R                  S-  S\S-  S\	\R                     4S jjr
S	 rS
rU =r$ )ClapAudioSelfAttentioni[  c                   > [         TU ]  5         X#-  S:w  a  [        SU SU S35      eX0l        [	        X#-  5      U l        U R                  U R
                  -  U l        [        U[        R                  R                  5      (       a  UOXD4U l        [        R                  " [        R                  " SU R                  S   -  S-
  SU R                  S   -  S-
  -  U5      5      U l        U R#                  SU R%                  5       5        [        R&                  " U R                  U R                  UR(                  S9U l        [        R&                  " U R                  U R                  UR(                  S9U l        [        R&                  " U R                  U R                  UR(                  S9U l        [        R0                  " UR2                  5      U l        g )	Nr   The hidden size (6) is not a multiple of the number of attention heads ()r+   r   relative_position_indexbias)ru   rv   r   num_attention_headsr   attention_head_sizeall_head_sizerg   collectionsabcIterabler3   r   	Parameterr@   zerosrelative_position_bias_tableregister_buffercreate_relative_position_indexLinearqkv_biasquerykeyvalueDropoutattention_probs_dropout_probdropoutro   r   dim	num_headsr3   rx   s        r'   rv   ClapAudioSelfAttention.__init__\  s   ?a#C5(^_h^iijk  $- #&s#7 !558P8PP%k;??3K3KLLKS^Rl 	 -/LLKKT--a0014T=M=Ma=P9PST9TUW`a-
) 	68[8[8]^YYt1143E3EFOO\
99T//1C1C&//ZYYt1143E3EFOO\
zz&"E"EFr)   Nr!   attention_maskoutput_attentionsr<   c                 v   UR                   u  pEnXESU R                  4nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
[        R                  " XR	                  SS5      5      nU[        R                  " U R                  5      -  nU R                  U R                  R                  S5         nUR                  U R                  S   U R                  S   -  U R                  S   U R                  S   -  S5      nUR                  SSS5      R                  5       nXR!                  S5      -   nUbm  UR                   S   nUR                  XM-  XR"                  XU5      nXR!                  S5      R!                  S5      -   nUR                  SU R"                  XU5      n[$        R&                  R)                  USS9nU R+                  U5      n[        R                  " X5      nUR                  SSSS5      R                  5       nUR-                  5       S S U R.                  4-   nUR                  U5      nU(       a  X4nU$ U4nU$ )Nr.   r   r+   r   r   r   )r   r   r   r0   r   r   r   r@   matmulmathsqrtr   r   r3   r1   r2   	unsqueezer   r   rC   softmaxr   r   r   )ro   r!   r   r   r#   r   r6   hidden_shapequery_layer	key_layervalue_layerattention_scoresrelative_position_bias
mask_shapeattention_probscontext_layernew_context_layer_shapeoutputss                     r'   r   ClapAudioSelfAttention.forwardv  s    )6(;(;%
"T-E-EFjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR !<<5H5HR5PQ+dii8P8P.QQ!%!B!B4C_C_CdCdegCh!i!7!<!<Q$"2"21"55t7G7G7JTM]M]^_M`7`bd"
 "8!?!?1a!H!S!S!U+.N.Nq.QQ%'--a0J/44(*6N6NPS   02J2J12M2W2WXY2ZZ/44R9Q9QSV\ --//0@b/I ,,7_B%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=2 O\M]r)   c                    [         R                  " U R                  S   5      n[         R                  " U R                  S   5      n[         R                  " [         R                  " X/SS95      n[         R
                  " US5      nUS S 2S S 2S 4   US S 2S S S 24   -
  nUR                  SSS5      R                  5       nUS S 2S S 2S4==   U R                  S   S-
  -  ss'   US S 2S S 2S4==   U R                  S   S-
  -  ss'   US S 2S S 2S4==   SU R                  S   -  S-
  -  ss'   UR                  S5      nU$ )Nr   r   ij)indexingr+   r.   )	r@   rA   r3   stackmeshgridr   r1   r2   sum)ro   coords_hcoords_wcoordscoords_flattenrelative_coordsr   s          r'   r   5ClapAudioSelfAttention.create_relative_position_index  s+   << 0 0 34<< 0 0 34U^^X,@4PQvq1(At4~aqj7QQ)11!Q:EEG1a D$4$4Q$7!$;; 1a D$4$4Q$7!$;; 1a A(8(8(;$;a$?? "1"5"5b"9&&r)   )	r   r   r   r   r   r   r   r   r3   NF)rO   rP   rQ   rR   rv   r@   TensorrT   boolrV   r   r   rW   r   r   s   @r'   r   r   [  sc    G: 48).	1||1 ))D01  $;	1
 
u||	1f' 'r)   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )ClapAudioSelfOutputi  c                    > [         TU ]  5         [        R                  " X"5      U l        [        R
                  " UR                  5      U l        g rf   )ru   rv   r   r   denser   r   r   ro   r   r   rx   s      r'   rv   ClapAudioSelfOutput.__init__  s4    YYs(
zz&"E"EFr)   r!   input_tensorr<   c                 J    U R                  U5      nU R                  U5      nU$ rf   r  r   ro   r!   r  s      r'   r   ClapAudioSelfOutput.forward  s$    

=1]3r)   r  
rO   rP   rQ   rR   rv   r@   r  r   rW   r   r   s   @r'   r  r    s7    G
U\\  RWR^R^  r)   r  c            
          ^  \ rS rSrU 4S jr  S
S\R                  S\R                  S-  S\S-  S\	\R                     4S jjr
S	rU =r$ )ClapAudioAttentioni  c                 d   > [         TU ]  5         [        XX45      U l        [	        X5      U l        g rf   )ru   rv   r   ro   r  r   r   s        r'   rv   ClapAudioAttention.__init__  s(    *6	O	)&6r)   Nr!   r   r   r<   c                 f    U R                  XU5      nU R                  US   U5      nU4USS  -   nU$ )Nr   r   ro   r   )ro   r!   r   r   self_outputsattention_outputr  s          r'   r   ClapAudioAttention.forward  sC     yy@QR;;|AF#%QR(88r)   r   ro   r  )rO   rP   rQ   rR   rv   r@   r  rT   r  rV   r   rW   r   r   s   @r'   r$  r$    s\    7 48).		||	 ))D0	  $;		
 
u||		 	r)   r$  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )ClapAudioIntermediatei  c                   > [         TU ]  5         [        R                  " U[	        UR
                  U-  5      5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g rf   )ru   rv   r   r   r   	mlp_ratior  rg   
hidden_actstrr	   intermediate_act_fnr  s      r'   rv   ClapAudioIntermediate.__init__  sd    YYsC(8(83(>$?@
f''--'-f.?.?'@D$'-'8'8D$r)   r!   r<   c                 J    U R                  U5      nU R                  U5      nU$ rf   r  r3  ro   r!   s     r'   r   ClapAudioIntermediate.forward  &    

=100?r)   r6  r"  r   s   @r'   r.  r.    (    9U\\ ell  r)   r.  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )ClapAudioOutputi  c                    > [         TU ]  5         [        R                  " [	        UR
                  U-  5      U5      U l        [        R                  " UR                  5      U l	        g rf   )
ru   rv   r   r   r   r0  r  r   hidden_dropout_probr   r  s      r'   rv   ClapAudioOutput.__init__  sF    YYs6#3#3c#9:C@
zz&"<"<=r)   r!   r<   c                 J    U R                  U5      nU R                  U5      nU$ rf   r  r7  s     r'   r   ClapAudioOutput.forward  s$    

=1]3r)   r  r"  r   s   @r'   r<  r<    s(    >
U\\ ell  r)   r<  c                      ^  \ rS rSrSU 4S jjrS rS rS r  SS\R                  S\
\\4   S\S	-  S
\S	-  S\
\R                  \R                  4   4
S jjrSrU =r$ )ClapAudioLayeri  c                   > [         TU ]  5         UR                  U l        X`l        UR                  U l        X0l        [        R                  " X!R                  S9U l	        [        XX@R                  S9U l        US:  a  [        U5      O[        R                  " 5       U l        [        R                  " X!R                  S9U l        [!        X5      U l        [%        X5      U l        g )Neps)r3   r{   )ru   rv   chunk_size_feed_forward
shift_sizer3   input_resolutionr   r   layer_norm_epslayernorm_beforer$  	attentionrr   r   	drop_pathlayernorm_afterr.  intermediater<  r   )ro   r   r   rI  r   drop_path_raterH  rx   s          r'   rv   ClapAudioLayer.__init__  s    '-'E'E$$!-- 0 "S6K6K L+FP`P`a9G#9Mn5SUS^S^S`!||C5J5JK1&>%f2r)   c                    [        U5      U R                  ::  an  [        S5      U l        [        R
                  R                  5       (       a*  [        R                   " [        R                  " U5      5      O
[        U5      U l        g g Nr   )minr3   r   rH  r@   jit
is_tracingtensor)ro   rI  s     r'   set_shift_and_window_size(ClapAudioLayer.set_shift_and_window_size  s_     D$4$44'lDO=BYY=Q=Q=S=S		%,,'789Y\]mYn  5r)   c           	         U R                   S:  Gae  [        R                  " SXS4X4S9n[        SU R                  * 5      [        U R                  * U R                   * 5      [        U R                   * S 5      4n[        SU R                  * 5      [        U R                  * U R                   * 5      [        U R                   * S 5      4nSnU H  n	U H  n
XS S 2XS S 24'   US-  nM     M     [        XPR                  5      nUR                  SU R                  U R                  -  5      nUR                  S5      UR                  S5      -
  nUR                  US:g  S5      R                  US:H  S5      nU$ S nU$ )Nr   r   r|   r.   r+   g      Yr{   )	rH  r@   r   slicer3   r8   r0   r   masked_fill)ro   r4   r5   r}   r?   img_maskheight_sliceswidth_slicescountheight_slicewidth_slicemask_windows	attn_masks                r'   get_attn_maskClapAudioLayer.get_attn_mask  sy   ??Q{{Ava#8UHa$***+t'''$//)9:t&-M a$***+t'''$//)9:t&-L
 E -#/K@EQ1<=QJE $0 !.
 ,H6F6FGL',,R1A1ADDTDT1TUL$..q1L4J4J14MMI!--i1nfEQQR[_`R`befI  Ir)   c                     U R                   X0R                   -  -
  U R                   -  nU R                   X R                   -  -
  U R                   -  nSSSUSU4n[        R                  R                  X5      nX4$ rS  )r3   r   rC   r   )ro   r!   r4   r5   	pad_right
pad_bottom
pad_valuess          r'   	maybe_padClapAudioLayer.maybe_pad)  sy    %%0@0@(@@DDTDTT	&&2B2B)BBdFVFVV
Ay!Z8
))-D((r)   r!   input_dimensionsr   Nalways_partitionr<   c                    U(       d  U R                  U5        O Uu  pVUR                  5       u  pxn	Un
U R                  U5      nUR                  XuXi5      nU R	                  XU5      u  pUR
                  u  ppU R                  S:  a.  [        R                  " XR                  * U R                  * 4SS9nOUn[        XR                  5      nUR                  SU R                  U R                  -  U	5      nU R                  XUR                  UR                  S9nU R                  UUUS9nUS   nUR                  SU R                  U R                  U	5      n[        UU R                  X5      nU R                  S:  a-  [        R                  " UU R                  U R                  4SS9nOUnUS   S:  =(       d    US   S:  nU(       a  US S 2S U2S U2S S 24   R!                  5       nUR                  XuU-  U	5      nXR#                  U5      -   nU R%                  U5      nU R'                  U5      nXR)                  U5      -   nU(       a	  UUS	   4nU$ U4nU$ )
Nr   )r   r+   )shiftsdimsr.   r|   )r   r   r-   r   )rX  r   rK  r0   rk  r   rH  r@   rollr8   r3   re  r}   r?   rL  r:   r2   rM  rN  rO  r   )ro   r!   rm  r   rn  r4   r5   r#   r   r   shortcutrj  
height_pad	width_padshifted_hidden_stateshidden_states_windowsrd  attention_outputsr*  attention_windowsshifted_windows
was_paddedlayer_outputlayer_outputss                           r'   r   ClapAudioLayer.forward0  s{     **+;<("/"4"4"6
x --m<%**:uO %)NN=%$P!&3&9&9#y??Q$)JJ}FVY]YhYhXhEipv$w!$1! !11FHXHX Y 5 : :2t?O?ORVRbRb?bdl m&&)<)<EZEaEa ' 
	 !NN+@)_pNq,Q/,11"d6F6FHXHXZbc():D<L<Ljd ??Q %

?DOOUYUdUdCelr s /]Q&;*Q-!*;
 1!WfWfufa2G H S S U-22:~xX >>2C#DD++M:((6${{<'@@@Q'8';< YeWfr)   )
rL  rG  rM  rI  rO  rN  rK  r   rH  r3   )r{   r   FF)rO   rP   rQ   rR   rv   rX  re  rk  r@   r  rV   r   r  r   rW   r   r   s   @r'   rC  rC    s    38) */(->||>  S/>  $;	>
 +> 
u||U\\)	*> >r)   rC  c                      ^  \ rS rSrU 4S jr  SS\R                  S\\\4   S\	S-  S\	S-  S\\R                     4
S	 jjr
S
rU =r$ )ClapAudioStageir  c                 R  > [         T	U ]  5         Xl        X l        [        R
                  " [        U5       Vs/ s H+  n[        UUUUXh   US-  S:X  a  SOUR                  S-  S9PM-     sn5      U l	        Ub  U" X2[        R                  S9U l        OS U l        SU l        g s  snf )Nr+   r   )r   r   rI  r   rP  rH  )r   
norm_layerF)ru   rv   r   r   r   
ModuleListrangerC  r3   blocksr   
downsamplepointing)
ro   r   r   rI  depthr   rM  r  irx   s
            r'   rv   ClapAudioStage.__init__s  s    mm u
 &A !%5'#,<%&UaZqf6H6HA6M &

 !()9r||\DO"DO'
s   2B$r!   rm  r   Nrn  r<   c                     Uu  pV[        U R                  5       H  u  pxU" XX45      n	U	S   nM     Un
U R                  b%  US-   S-  US-   S-  pXVX4nU R                  X5      nOXVXV4nXU4nU(       a  UW	SS  -  nU$ )Nr   r   r+   )	enumerater  r  )ro   r!   rm  r   rn  r4   r5   r  layer_moduler}  !hidden_states_before_downsamplingheight_downsampledwidth_downsampledoutput_dimensionsstage_outputss                  r'   r   ClapAudioStage.forward  s     )(5OA(J[nM)!,M  6
 -:)??&5;aZA4EPQ	VWGW 1!'0B V OO,M`M!' >&K\]]12..Mr)   )r  r   r   r  r  r  )rO   rP   rQ   rR   rv   r@   r  rV   r   r  r   rW   r   r   s   @r'   r  r  r  sg    < */(-||  S/  $;	
 + 
u||	 r)   r  c            	          ^  \ rS rSrSr\R                  4S\\   S\S\R                  SS4U 4S jjjr
S	 rS
\R                  S\\\4   S\R                  4S jrSrU =r$ )ClapAudioPatchMergingi  a  
Patch Merging Layer.

Args:
    input_resolution (`tuple[int]`):
        Resolution of input feature.
    dim (`int`):
        Number of input channels.
    norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
        Normalization layer class.
rI  r   r  r<   Nc                    > [         TU ]  5         Xl        X l        [        R
                  " SU-  SU-  SS9U l        U" SU-  5      U l        g )Nr,   r+   Fr   )ru   rv   rI  r   r   r   	reductionr   )ro   rI  r   r  rx   s       r'   rv   ClapAudioPatchMerging.__init__  sE     01s7AG%@q3w'	r)   c                     US-  S:H  =(       d    US-  S:H  nU(       a-  SSSUS-  SUS-  4n[         R                  R                  X5      nU$ )Nr+   r   r   )r   rC   r   )ro   input_featurer4   r5   
should_padrj  s         r'   rk  ClapAudioPatchMerging.maybe_pad  sS    qjAo:519>
Q519a!<JMM--mHMr)   r  rm  c                    Uu  p4UR                   u  pVnUR                  XSXG5      nU R                  XU5      nUS S 2SS S2SS S2S S 24   nUS S 2SS S2SS S2S S 24   n	US S 2SS S2SS S2S S 24   n
US S 2SS S2SS S2S S 24   n[        R                  " XX/S5      nUR                  USSU-  5      nU R                  U5      nU R                  U5      nU$ )Nr   r+   r   r.   r,   )r   r0   rk  r@   catr   r  )ro   r  rm  r4   r5   r#   r   r6   input_feature_0input_feature_1input_feature_2input_feature_3s               r'   r   ClapAudioPatchMerging.forward  s   ((5(;(;%
%**:uS}eD'14a4Aq(89'14a4Aq(89'14a4Aq(89'14a4Aq(89		?_"fhjk%**:r1|;KL		-0}5r)   )r   rI  r   r  )rO   rP   rQ   rR   rS   r   r   rV   r   Modulerv   rk  r@   r  r   rW   r   r   s   @r'   r  r    s|    
 XZWcWc (s (# (299 (hl ( (U\\ U3PS8_ Y^YeYe  r)   r  c                      ^  \ rS rSrU 4S jrS r\      SS\R                  S-  S\	S-  S\	S-  S\	S-  S	\	S-  S
\	S-  S\
\-  4S jj5       rSrU =r$ )ClapAudioEncoderi  c                   > [         TU ]  5         [        UR                  5      U l        Xl        [        U5      U l        UR                  U l        U R                  R                  U l	        UR                  U l
        UR                  UR                  -  U l        [        UR                  SU R                  S-
  -  -  5      U l        [         R"                  " SUR$                  ['        UR                  5      SS9 Vs/ s H  o"R)                  5       PM     nnU R                  R*                  n[-        U R                  5       Vs/ s H  oTS   SU-  -  US   SU-  -  4PM     snU l        [0        R2                  " [-        U R                  5       Vs/ s H  n[5        U[        UR                  SU-  -  5      U R.                  U   UR                  U   UR6                  U   U['        UR                  S U 5      ['        UR                  S US-    5       X`R                  S-
  :  a  [8        OS S9PM     sn5      U l        SU l        [0        R>                  " UR                  5      U l         [0        RB                  " U R                  5      U l"        UR                  U l        [0        RF                  " S5      U l$        g s  snf s  snf s  snf )Nr+   r   r   cpur>   )r   r   rI  r  r   rM  r  F)%ru   rv   rB   depths
num_layersr   r   patch_embedr   r   r   num_mel_bins
freq_ratior   r   num_featuresr@   linspacerP  r  itemr   r  input_resolutionsr   r  r  r   r  layersgradient_checkpointingr   
batch_normr   r   AdaptiveAvgPool1davgpool)ro   r   xrP  r   r  i_layerrx   s          r'   rv   ClapAudioEncoder.__init__  s_   fmm,.v6#11 ,,99)) **f.A.AA ? ?!Z[H[B\ \],1NN1f>S>SUXY_YfYfUgpu,vw,vq&&(,vw$$..	\abfbqbq\r!s\rWXQ<AqD#99Q<AqD;Q"R\r!smm  %T__5  6G !F;;ajHI%)%;%;G%D --0$88A,Sx1H-ICPVP]P]^k`gjk`kPlLmn9@??UVCV9V4]a  6
 ',#..)<)<=LL!2!23	mm++A.3 x "ts   >KKB#Kc                 n   UR                   u    p#n[        U R                  U R                  -  5      nU R                  U R                  -  nX5:  d  XF:  a  [	        S5      eX5:  a!  [
        R                  R                  XU4SSS9nXF:  a!  [
        R                  R                  XU4SSS9nUR                   u  pxpUR                  XxU R                  -  XR                  -  U
5      nUR                  SSSS5      R                  5       nUR                  XxXR                  -  XR                  -  5      nU$ )	z
The input is 4 normalized log mel spectrograms. It is reshape to the common shape of images. Each channel
should represent 1 of the 4 crops of the spectrogram. For more details, refer to the [`ClapFeatureExtractor`].
z@the wav size should be less than or equal to the swin input sizebicubicT)modealign_cornersr   r   r   r+   )r   r   r   r  r   r   rC   r(   r    r1   r2   )ro   normalized_input_featuresr   r$   freq_length
spec_widthspec_heightbatchr   timefreqs              r'   reshape_mel2img ClapAudioEncoder.reshape_mel2img	  sD   
 *C)H)H&1;$//9:
nn7#{'@_`` #(*(A(A)+D9dh )B )% $(*(A(A)+EIei )B )% '@&E&E# %>$E$Edoo-t/F%
! %>$E$EaAq$Q$\$\$^!$=$E$ETOO3T__5L%
! )(r)   N	is_longerr   output_hidden_states(output_hidden_states_before_downsamplingrn  return_dictr<   c                    U=(       d    U R                   R                  nU=(       d    U R                   R                  nUR                  SS5      nU R	                  U5      nUR                  SS5      nS n	U R
                  (       a7  UR                  UR                  5      n
[        R                  " U
S:H  5      S   n	U R                  U5      nUR                  S   nU R                  X5      nU(       a  SOS nU(       a  SOS nU(       a  SOS nU R                  S   nU(       aD  UR                  u  nnnUR                  " U/UQUP76 nUR                  SSSS5      nX4-  nUU4-  n[!        U R"                  5       H  u  nnU R                  U   nU" UUX65      nUS   nUS   nUS   nUS   US   4nU(       aU  U(       aN  UR                  u  nnnUR                  " U/US   US   4QUP76 nUR                  SSSS5      nUU4-  nUU4-  nORU(       aK  U(       dD  UR                  u  nnnUR                  " U/UQUP76 nUR                  SSSS5      nX4-  nUU4-  nU(       d  M  UUSS  -  nM     U R%                  U5      nUR                  u  nnnUS['        U R(                  5      S-
  -  -  U R*                  S   -  nUS['        U R(                  5      S-
  -  -  U R*                  S   -  nUR                  SSS5      R-                  5       R/                  UUUU5      nUR                  u  nnnnUU R0                  -  n UR/                  UUUU -  U U5      nUR                  SSSSS5      R-                  5       R/                  UUU S5      nU R3                  [        R4                  " US5      5      n![        R4                  " U!S5      n![7        UU!UUS	9$ )
Nr   r   r   r+   rN   r   r.   r,   )rL   pooler_outputr!   rM   )r   r  r   r   r  r   tor?   r@   wherer  r   r  r  r0   r1   r  r  r   rB   r  r   r2   r    r  r  r   r   )"ro   input_featuresr  r   r  r  rn  r  r  is_longer_list_idxis_longer_listr!   
frames_numall_hidden_statesall_reshaped_hidden_statesall_self_attentionsrm  r#   r   hidden_sizereshaped_hidden_stater  r  r}  r  r  rL   
n_channels
freq_shapetemporal_shapen_frequenciesn_temp
c_freq_binlatent_outputs"                                     r'   r   ClapAudioEncoder.forward-  sP     4Wt{{7W7W-N1N1N'11!Q7$(OON$C!$=$G$G1$M!!&\\.*?*?@N!&^q-@!A!!D,,-FG"((+
((K"6BD+?RT"$5b411!4)6)<)<&J;$1$6$6z$bDT$bVa$b!$9$A$A!Q1$M!!11&+@*BB&(5OA|#55a8(8HJ[nM)!,M0=a0@- -a 0 1" 57H7LM#(P-N-T-T*
A{ )J(N(N)"3A"68I!8L!M)OZ)% )>(E(EaAq(Q%!&G%II!*/D.FF*%.V-:-@-@*
A{(5(:(::(fHX(fZe(f%(=(E(EaAq(Q%!%55!*/D.FF*  #}QR'88#?  6B !IIm4$5$;$;!
AzA#dkk*:Q*>$?@DDUDUVWDXX
#c$++.>.B(CDHYHYZ[H\\ %%aA.99;CCJPZ\fhvw 	 9J8O8O5
Jv"doo5
-55
MZ$?V
 %%aAq!4??AII*V`blnpq 	 U]]3Da%HImQ7)/'4*	
 	
r)   )r  r  r   r  r   r  r  r  r  r   r  r  r  r   r   )NFFFFT)rO   rP   rQ   rR   rv   r  r   r@   rT   r  rV   rY   r   rW   r   r   s   @r'   r  r    s    &/P")H  /3).,1@E(-#'h
 $$t+h
  $;	h

 #Tkh
 37+h
 +h
 D[h
 
%	%h
 h
r)   r  c                   <   ^  \ rS rSrS\\-  4U 4S jjrS rSrU =r	$ )ClapProjectionLayeri  r   c                    > [         TU ]  5         Xl        UR                  nUR                  n[
        R                  " X#5      U l        [        UR                     U l
        [
        R                  " X35      U l        g rf   )ru   rv   r   r  projection_dimr   r   linear1r	   projection_hidden_act
activationlinear2)ro   r   r  r  rx   s       r'   rv   ClapProjectionLayer.__init__  s[    ((..yy= !=!=>yy@r)   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rf   )r  r  r  r7  s     r'   r   ClapProjectionLayer.forward  s2    ]36]3r)   )r  r   r  r  )
rO   rP   rQ   rR   r   r   rv   r   rW   r   r   s   @r'   r  r    s     A? A r)   r  c                      ^  \ rS rSrSrU 4S jr     SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\	S
\R                  4S jjr\S 5       r\SS j5       rSrU =r$ )ClapTextEmbeddingsi  zGConstruct the embeddings from word, position and token_type embeddings.c                 >  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR
                  UR                  S9U l
        [        R                  " UR                  5      U l        U R                  S[         R"                  " UR$                  5      R'                  S5      SS9  U R                  S[         R(                  " U R*                  R-                  5       [         R.                  S9SS9  UR                  U l        [        R                  " UR$                  UR
                  U R0                  S9U l        g )	N)padding_idxrE  position_idsr   r.   T)
persistenttoken_type_ids)r}   )ru   rv   r   	Embedding
vocab_sizer  pad_token_idword_embeddingstype_vocab_sizetoken_type_embeddingsr   rJ  r   r>  r   r   r@   rA   max_position_embeddingsexpandr   r  r   longr  position_embeddingsro   r   rx   s     r'   rv   ClapTextEmbeddings.__init__  s4   !||F,=,=v?Q?Q_e_r_rs%'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXei 	 	
 	ekk$*;*;*@*@*B%**Ubf 	 	
 "..#%<<**F,>,>DL\L\$
 r)   N	input_idsr  r  inputs_embedspast_key_values_lengthr<   c                    Uc;  Ub  U R                  XR                  U5      nOU R                  X@R                  5      nUb  UR                  5       nOUR                  5       S S nUu  pxUc  [	        U S5      (       aQ  U R
                  R                  UR                  S   S5      n	[        R                  " U	SUS9n	U	R                  Xx5      nO8[        R                  " U[        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n
XJ-   nU R!                  U5      nX-   nU R#                  U5      nU R%                  U5      nU$ )Nr.   r  r   r   )r   indexr|   )"create_position_ids_from_input_idsr  &create_position_ids_from_inputs_embedsr   hasattrr  r  r   r@   gatherr   r  r  r?   r  r  r  r   r   )ro   r   r  r  r  r  input_shaper#   
seq_lengthbuffered_token_type_idsr  
embeddingsr  s                r'   r   ClapTextEmbeddings.forward  sb    $#FF//1G   $JJ=ZjZjk #..*K',,.s3K!,

 !t-..*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
!W!&[

SWSdSdSkSk!l  00;M $ : :> J":
"66|D5
^^J/
\\*-
r)   c                     U R                  5       SS nUS   n[        R                  " US-   X1-   S-   [        R                  U R                  S9nUR                  S5      R                  U5      $ )z
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
Nr.   r   r|   r   )r   r@   rA   r  r?   r   r  )r  r  r	  sequence_lengthr  s        r'   r  9ClapTextEmbeddings.create_position_ids_from_inputs_embeds  sn     $((*3B/%a.||!O_:Q>ejjYfYmYm
 %%a(//<<r)   c                     U R                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    x: torch.Tensor x:

Returns: torch.Tensor
r   r   )ner   r@   cumsumtype_asr  )r   r  r  maskincremental_indicess        r'   r  5ClapTextEmbeddings.create_position_ids_from_input_ids  sW     ||K(,,.$||Da8@@FI__cgg"'')K77r)   )r   r   r  r  r  r  )NNNNr   )r   )rO   rP   rQ   rR   rS   rv   r@   
LongTensorrT   r   r  r   staticmethodr  r  rW   r   r   s   @r'   r  r    s    Q
, .2260426&'.##d*. ((4/. &&-	.
 ((4/. !$. 
.` = =" 8 8r)   r  moduler   r   r   r   scalingr   c                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr+   r   r.   )r   r}   )pr~   r   )r@   r   r   r   rC   r   float32r  r}   r   r~   r2   )
r  r   r   r   r   r  r   kwargsattn_weightsattn_outputs
             r'   eager_attention_forwardr"    s     <<}}Q':;gEL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r)   c                      ^  \ rS rSrU 4S jr S
S\R                  S\R                  S-  S\\	   S\
\R                  \R                  S-  4   4S jjrS	rU =r$ )ClapTextSelfAttentioni.  c                 6  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                   5      U l        UR                   U l        U R                  S-  U l        g )Nr   embedding_sizer   r   r         )ru   rv   r  r   r  r   r   r   r   r   r   r   r   r   r   r   r   r   attention_dropoutr  r  s     r'   rv   ClapTextSelfAttention.__init__/  sD    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF!'!D!D//5r)   Nr!   r   r  r<   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n[        R                  " U R                  R                  [        5      n	U	" U UUUU4U R                  (       d  SOU R                  U R                  S.UD6u  pU
R                  " / UQSP76 R!                  5       n
X4$ )Nr.   r   r+   r{   )r   r  )r   r   r   r0   r   r   r   r   get_interfacer   _attn_implementationr"  r~   r(  r  r    r2   )ro   r!   r   r  r	  r   query_states
key_statesvalue_statesattention_interfacer!  r   s               r'   r   ClapTextSelfAttention.forwardD  s8    $))#2.CCbC$*B*BCzz-055lCMMaQRSXXm,11,?II!QO
zz-055lCMMaQRS(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFH((r)   )
r   r(  r   r   r   r   r   r   r  r   rf   )rO   rP   rQ   rR   rv   r@   r  rT   r   r   rV   r   rW   r   r   s   @r'   r$  r$  .  si    60 48)||) ))D0) +,	)
 
u||U\\D00	1) )r)   r$  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )ClapTextSelfOutputie  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g NrE  )ru   rv   r   r   r  r  r   rJ  r   r>  r   r  s     r'   rv   ClapTextSelfOutput.__init__f  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r)   r!   r  r<   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ rf   r  r   r   r   s      r'   r   ClapTextSelfOutput.forwardl  5    

=1]3}'CDr)   r   r  r   r"  r   s   @r'   r3  r3  e  6    >U\\  RWR^R^  r)   r3  c            	          ^  \ rS rSrU 4S jr S
S\R                  S\R                  S-  S\\	   S\R                  4S jjr
S	rU =r$ )ClapTextAttentionit  c                 b   > [         TU ]  5         [        U5      U l        [	        U5      U l        g rf   )ru   rv   r$  ro   r3  r   r  s     r'   rv   ClapTextAttention.__init__u  s&    )&1	(0r)   Nr!   r   r  r<   c                 Z    UnU R                   " U4SU0UD6u  pU R                  X5      nU$ Nr   r(  )ro   r!   r   r  r   r   s         r'   r   ClapTextAttention.forwardz  sE     !99
)
 

 M<r)   r,  rf   )rO   rP   rQ   rR   rv   r@   r  rT   r   r   r   rW   r   r   s   @r'   r>  r>  t  sV    1 48|| ))D0 +,	
 
 r)   r>  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )ClapTextIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g rf   )ru   rv   r   r   r  intermediate_sizer  rg   r1  r2  r	   r3  r  s     r'   rv   ClapTextIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r)   r!   r<   c                 J    U R                  U5      nU R                  U5      nU$ rf   r6  r7  s     r'   r   ClapTextIntermediate.forward  r9  r)   r6  r"  r   s   @r'   rE  rE    r:  r)   rE  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )ClapTextOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r5  )ru   rv   r   r   rG  r  r  r   rJ  r   r>  r   r  s     r'   rv   ClapTextOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r)   r!   r  r<   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ rf   r8  r   s      r'   r   ClapTextOutput.forward  r:  r)   r;  r"  r   s   @r'   rL  rL    r<  r)   rL  c            	          ^  \ rS rSrU 4S jr SS\R                  S\R                  S-  S\\	   S\R                  4S jjr
S	 rS
rU =r$ )ClapTextLayeri  c                    > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        U5      U l	        g )Nr   )
ru   rv   rG  seq_len_dimr>  rL  rE  rO  rL  r   r  s     r'   rv   ClapTextLayer.__init__  sI    '-'E'E$*6208$V,r)   Nr!   r   r  r<   c                     U R                   " U4SU0UD6n[        U R                  U R                  U R                  U5      nU$ rB  )rL  r   feed_forward_chunkrG  rT  )ro   r!   r   r  s       r'   r   ClapTextLayer.forward  sW     
)
 
 2##T%A%A4CSCSUb
 r)   c                 J    U R                  U5      nU R                  X!5      nU$ rf   )rO  r   )ro   r*  intermediate_outputr|  s       r'   rW   ClapTextLayer.feed_forward_chunk  s)    "//0@A{{#6Ir)   )rL  rG  rO  r   rT  rf   )rO   rP   rQ   rR   rv   r@   r  rT   r   r   r   rW  rW   r   r   s   @r'   rR  rR    s[    - 48|| ))D0 +,	
 
$ r)   rR  c            	       |   ^  \ rS rSrU 4S jr S
S\R                  S\R                  S-  S\\	   S\
4S jjrS	rU =r$ )ClapTextEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf r  )
ru   rv   r   r   r  r  num_hidden_layersrR  layerr  )ro   r   r  rx   s      r'   rv   ClapTextEncoder.__init__  sR    ]]5IaIaCb#cCbaM&$9Cb#cd
&+# $ds   A&Nr!   r   r  r<   c                 N    U R                    H  nU" UU40 UD6nM     [        US9$ )N)rL   )r`  r   )ro   r!   r   r  r  s        r'   r   ClapTextEncoder.forward  s>     !JJL( M ' +
 	
r)   )r   r  r`  rf   )rO   rP   rQ   rR   rv   r@   r  rT   r   r   r   r   rW   r   r   s   @r'   r]  r]    sR    , 48
||
 ))D0
 +,	

 

 
r)   r]  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )ClapTextPooleri  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g rf   )ru   rv   r   r   r  r  Tanhr  r  s     r'   rv   ClapTextPooler.__init__  s9    YYv1163E3EF
'')r)   r!   r<   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ rS  )r  r  )ro   r!   first_token_tensorpooled_outputs       r'   r   ClapTextPooler.forward  s6     +1a40

#566r)   )r  r  r"  r   s   @r'   re  re    s(    $
U\\ ell  r)   re  c                   v    \ rS rSr% \\S'   SrSrSr\	R                  " 5       S\R                  4S j5       rSrg	)
ClapPreTrainedModeli  r   clap)audiotextFr  c                    U R                   R                  n[        U[        5      (       a  [        R
                  " UR                  R                  SUS-  S9  [        R
                  " UR                  R                  SUS-  S9  [        R                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      5        [        R                  " UR                   5        g[        U["        5      (       a  [        R$                  " UR&                  [(        R*                  " U R                   R,                  5      5        [        R$                  " UR.                  [(        R*                  " U R                   R,                  5      5        g[        U[0        R2                  5      (       a$  [        R
                  " UR                  SUS-  S9  g[        U[0        R4                  [0        R6                  45      (       a  [        R                  " UR8                  5        [        R:                  " UR                  5        [=        USS5      ba  [        R                  " UR>                  5        [        R:                  " UR@                  5        [        R                  " URB                  5        gg[        U[0        RD                  [0        RF                  45      (       a  U R                   RH                  S-  S	U R                   RJ                  -  S-  -  U-  n[        R
                  " UR                  US
9  UR8                  b!  [        R                  " UR8                  5        gg[        U[L        5      (       aP  [        R                  " URN                  5        [        R                  " URP                  URS                  5       5        gg)zInitialize the weightsr{   g{Gz?)meanstdr.   r  running_meanNr'  r+   )rt  )*r   initializer_factorrg   r  initnormal_r  weightr  copy_r  r@   rA   r   r  zeros_r  	ClapModel	constant_logit_scale_ar   loglogit_scale_init_valuelogit_scale_tr   r  r   r   r   ones_getattrru  running_varnum_batches_trackedr   r   r  r_  r   r   r   r   )ro   r  factorin_proj_stds       r'   _init_weights!ClapPreTrainedModel._init_weights  s|    //f011LL33::&SW-XLL55<<3FUYMZJJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--.	**NN6//$++:\:\1]^NN6//$++:\:\1]^--LLSftmDr~~ >??KK$JJv}}%v~t4@F//0

6--.F667 A BII 677;;22D8a$++B_B_>_dh=hilrrKLLK8{{&FKK( ' 677KK;;<JJv55v7\7\7^_ 8r)   rN   N)rO   rP   rQ   rR   r   rU   base_model_prefixinput_modalitiessupports_gradient_checkpointingr@   no_gradr   r  r  rW   rN   r)   r'   rn  rn    s@    (&+#
]]_`BII ` `r)   rn  c                      ^  \ rS rSr% \\S'   SrSrS\4U 4S jjrS\	R                  4S jr\  SS\R                  S-  S	\R                  S-  S
\\   S\\-  4S jj5       rSrU =r$ )ClapAudioModeli  r   r  rp  c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g rf   )ru   rv   r  audio_encoder	post_initr  s     r'   rv   ClapAudioModel.__init__!  s'     -f5r)   r<   c                 B    U R                   R                  R                  $ rf   )r  r  r   rn   s    r'   get_input_embeddings#ClapAudioModel.get_input_embeddings'  s    !!--222r)   Nr  r  c                 ,    U R                   " SUUS.UD6$ )a  
is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
    Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
    the features.

Examples:

```python
>>> from datasets import load_dataset
>>> from transformers import AutoProcessor, ClapAudioModel

>>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
>>> audio_sample = dataset["train"]["audio"][0]["array"]

>>> model = ClapAudioModel.from_pretrained("laion/clap-htsat-fused")
>>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-fused")

>>> inputs = processor(audio=audio_sample, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
```r  r  rN   r  )ro   r  r  r  s       r'   r   ClapAudioModel.forward*  s,    : !! 
)
 
 	
r)   r  NN)rO   rP   rQ   rR   r   rU   main_input_namer  rv   r   r  r  r   r@   rT   
BoolTensorr   r   rV   r   r   rW   r   r   s   @r'   r  r    s    &O 3bii 3  48-1 
))D0 
 ##d* 
 +,	 

 
+	+ 
  
r)   r  a0  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    c                   ,  ^  \ rS rSr% \\S'   Sr\\S.r	SU 4S jjr
S rS r\\\     SS	\R"                  S-  S
\R"                  S-  S\R"                  S-  S\R"                  S-  S\R"                  S-  S\\   S\4S jj5       5       5       rSrU =r$ )ClapTextModeliN  r   rq  r!   rM   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OSU l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)
ru   rv   r   r  r  r]  encoderre  poolerr  )ro   r   add_pooling_layerrx   s      r'   rv   ClapTextModel.__init__d  sK    
 	 ,V4&v.0AnV,t 	r)   c                 .    U R                   R                  $ rf   r  r  rn   s    r'   r  "ClapTextModel.get_input_embeddingst  s    ...r)   c                 $    XR                   l        g rf   r  ro   r   s     r'   set_input_embeddings"ClapTextModel.set_input_embeddingsw  s    */'r)   Nr   r   r  r  r  r  r<   c                    Ub  Ub  [        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       S S nO[        S5      eUu  pUb  UR                  OUR                  n
Uc  [        R
                  " X4U
S9nU R                  X'5      nU R                  UUUUS9nU R                  " U4SU0UD6nUS   nU R                  b  U R                  U5      OS n[        UUS9$ )	NzDYou cannot specify both input_ids and inputs_embeds at the same timer.   z5You have to specify either input_ids or inputs_embedsr>   )r   r  r  r  r   r   )rL   r  )r   %warn_if_padding_and_no_attention_maskr   r?   r@   onesget_extended_attention_maskr  r  r  r   )ro   r   r   r  r  r  r  r	  r#   r
  r?   extended_attention_maskembedding_outputencoder_outputssequence_outputrk  s                   r'   r   ClapTextModel.forwardz  s-     ]%>cdd"66yQ#..*K&',,.s3KTUU!,
%.%:!!@T@T!"ZZ*)A6RN 150P0PQ_0m??%)'	 + 
 ,,
2
 

 *!,8<8OO4UY)-'
 	
r)   )r   r  r  r  )T)NNNNN)rO   rP   rQ   rR   r   rU   r  rR  r$  _can_record_outputsrv   r  r  r   r   r   r@   r  r   r   r   r   rW   r   r   s   @r'   r  r  N  s      &+
 /0   *..2.2,0-1.
<<$&.
 t+.
 t+	.

 llT).
 ||d*.
 +,.
 
$.
    .
r)   r  c                   :  ^  \ rS rSr% \\S'   S\4U 4S jjr\\  SS\	R                  S\	R                  S-  S\	R                  S-  S\\   S	\\-  4
S
 jj5       5       r\\  SS\	R                  S\	R                  S-  S\	R                  S-  S\\   S	\\-  4
S jj5       5       r\\      SS\	R"                  S-  S\	R$                  S-  S\	R&                  S-  S\	R                  S-  S\	R"                  S-  S\S-  S\\   S	\\-  4S jj5       5       rSrU =r$ )r|  i  r   c                 N  > [         TU ]  U5        [        UR                  [        5      (       d"  [        S[        UR                  5       S35      e[        UR                  [        5      (       d"  [        S[        UR                  5       S35      eUR                  nUR                  n[        R                  " [        R                  " [        R                  " UR                  5      5      5      U l        [        R                  " [        R                  " [        R                  " UR                  5      5      5      U l        UR$                  U l        ['        U5      U l        [+        U5      U l        [/        U5      U l        [+        U5      U l        U R5                  5         g )NzKconfig.text_config is expected to be of type ClapTextConfig but is of type .zMconfig.audio_config is expected to be of type ClapAudioConfig but is of type )ru   rv   rg   text_configr   	TypeErrortypeaudio_configr   r   r   r@   rW  r   r  r  r~  r  r  r  
text_modelr  text_projectionr  audio_modelaudio_projectionr  )ro   r   r  r  rx   s       r'   rv   ClapModel.__init__  sC    &,,n==++,-Q0 
 &--??,,-.a1 
 ((**\\%,,txx@]@]7^*_`\\%,,txx@]@]7^*_`$33'42;?),7 3L A 	r)   Nr   r   r  r  r<   c                     U R                   " SUUUS.UD6nU R                  UR                  5      n[        R                  " USS9Ul        U$ )a  
Examples:

```python
>>> import torch
>>> from transformers import AutoTokenizer, ClapModel

>>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
>>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

>>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt")
>>> with torch.inference_mode():
...     text_features = model.get_text_features(**inputs)
```r   r   r  r.   r   rN   )r  r  r  F	normalize)ro   r   r   r  r  text_outputstext_featuress          r'   get_text_featuresClapModel.get_text_features  s[    . 48?? 4
)%4
 	4
 ,,\-G-GH%&[[B%G"r)   r  r  c                     U R                   " SXS.UD6nU R                  UR                  5      n[        R                  " USS9Ul        U$ )a  
is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
    Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
    the features.

Examples:

```python
>>> import torch
>>> from transformers import AutoFeatureExtractor, ClapModel

>>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")
>>> random_audio = torch.rand((16_000))

>>> inputs = feature_extractor(random_audio, return_tensors="pt")
>>> with torch.inference_mode():
...     audio_features = model.get_audio_features(**inputs)
```r  r.   r   rN   )r  r  r  r  r  )ro   r  r  r   r  audio_outputsaudio_featuress          r'   get_audio_featuresClapModel.get_audio_features  sU    8 594D4D 5
)5
BH5
 ..}/J/JK&'kk.b&I#r)   return_lossc           
         U R                   " S
UUS.UD6nU R                  " S
UUUS.UD6n	UR                  n
U R                  U
5      n
U	R                  nU R	                  U5      nXR                  SSSS9-  n
XR                  SSSS9-  nU R                  R                  5       nU R                  R                  5       n[        R                  " XR                  5       5      U-  n[        R                  " XR                  5       5      U-  nSnU(       a,  [        U5      n[        UR                  5       5      nUU-   S-  n[        UUUUU
U	US	9$ )a*  
is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
    Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
    the features.
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.

Examples:

```python
>>> from datasets import load_dataset
>>> from transformers import AutoProcessor, ClapModel

>>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
>>> audio_sample = dataset["train"]["audio"][0]["array"]

>>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
>>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused")

>>> input_text = ["Sound of a dog", "Sound of vacuum cleaner"]

>>> inputs = processor(text=input_text, audio=audio_sample, return_tensors="pt", padding=True)

>>> outputs = model(**inputs)
>>> logits_per_audio = outputs.logits_per_audio  # this is the audio-text similarity score
>>> probs = logits_per_audio.softmax(dim=-1)  # we can take the softmax to get the label probabilities
```r  r  r+   r.   T)r  r   keepdimNg       @)r_   r`   ra   rK   r[   rb   rc   rN   )r  r  r  r  r  r   r  expr~  r@   r   trF   r]   )ro   r   r  r  r   r  r  r  r  r  r[   rK   logit_scale_textlogit_scale_audiora   r`   r_   caption_loss
audio_losss                      r'   r   ClapModel.forward  sy   N (( 
)
 
  
)%
 	
 %22,,\:"00**;7 $&7&7!T&7&RR!$4$4qb$$4$OO  --113 ..224,,{NN4DEHXX <<mmoFIZZ+O<L)*:*<*<*>?J :-4D-+#%*,
 	
r)   )r  r  r~  r  r  r  r  r  )NNNNNN)rO   rP   rQ   rR   r   rU   rv   r   r   r@   r  r   r   rV   r   r  r  r  rT   r  r  r]   r   rW   r   r   s   @r'   r|  r|    s   z @  /3,0	<< t+ llT)	
 +, 
+	+  @  *..2	   <<$&  t+	 
 +,  
+	+    D  .237-1.204#'P
##d*P
 ))D0P
 ##d*	P

 t+P
 &&-P
 D[P
 +,P
 
	P
  P
r)   r|  c                     ^  \ rS rSr% \\S'   Sr\\S.r	S\4U 4S jjr
S\R                  4S jrS r\\   SS
\R$                  S	-  S\R$                  S	-  S\R$                  S	-  S\\   S\\-  4
S jj5       5       rSrU =r$ )ClapTextModelWithProjectionim  r   r  r  c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g rf   )ru   rv   r  r  r  r  r  r  s     r'   rv   $ClapTextModelWithProjection.__init__v  s3     '/26:r)   r<   c                 B    U R                   R                  R                  $ rf   r  r  r  rn   s    r'   r  0ClapTextModelWithProjection.get_input_embeddings}  s    ))999r)   c                 8    XR                   R                  l        g rf   r  r  s     r'   r  0ClapTextModelWithProjection.set_input_embeddings  s    5:""2r)   Nr   r   r  r  c                     U R                   " SUUUS.UD6nUR                  nU R                  U5      n[        UUR                  UR
                  UR                  S9$ )a  
Examples:

```python
>>> from transformers import AutoTokenizer, ClapTextModelWithProjection

>>> model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
>>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

>>> inputs = tokenizer(["a sound of a cat", "a sound of a dog"], padding=True, return_tensors="pt")

>>> outputs = model(**inputs)
>>> text_embeds = outputs.text_embeds
```r  )rK   rL   r!   rM   rN   )r  r  r  rI   rL   r!   rM   )ro   r   r   r  r  r  rk  rK   s           r'   r   #ClapTextModelWithProjection.forward  sr    . 48?? 4
)%4
 	4
 %22**=9"#*<<&44#..	
 	
r)   )r  r  )NNN)rO   rP   rQ   rR   r   rU   r  rR  r$  r  rv   r   r  r  r  r   r   r@   r  r   r   rV   rI   r   rW   r   r   s   @r'   r  r  m  s     &+
~ :bii :;  *..2,0	#
<<$&#
 t+#
 llT)	#

 +,#
 
$	$#
  #
r)   r  c                      ^  \ rS rSr% \\S'   SrSrS\4U 4S jjrS\	R                  4S jr\\  SS\R                  S-  S	\R                   S-  S
\\   S\\-  4S jj5       5       rSrU =r$ )ClapAudioModelWithProjectioni  r   r  rp  c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g rf   )ru   rv   r  r  r  r  r  r  s     r'   rv   %ClapAudioModelWithProjection.__init__  s4     )&1 3F ;r)   r<   c                 V    U R                   R                  R                  R                  $ rf   )r  r  r  r   rn   s    r'   r  1ClapAudioModelWithProjection.get_input_embeddings  s     --99>>>r)   Nr  r  c                     U R                   " SUUS.UD6nU R                  UR                  5      n[        UUR                  UR
                  UR                  S9$ )a  
is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
    Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
    the features.

Examples:

```python
>>> from datasets import load_dataset
>>> from transformers import ClapAudioModelWithProjection, ClapProcessor

>>> model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused")
>>> processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")

>>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
>>> audio_sample = dataset["train"]["audio"][0]["array"]

>>> inputs = processor(audio=audio_sample, return_tensors="pt")
>>> outputs = model(**inputs)
>>> audio_embeds = outputs.audio_embeds
```r  )r[   rL   rM   r!   rN   )r  r  r  rY   rL   rM   r!   )ro   r  r  r  r  r[   s         r'   r   $ClapAudioModelWithProjection.forward  sl    : 594D4D 5
)5
 5
 ,,]-H-HI#%+==$//'55	
 	
r)   )r  r  r  )rO   rP   rQ   rR   r   rU   r  r  rv   r   r  r  r   r   r@   rT   r  r   r   rV   rY   r   rW   r   r   s   @r'   r  r    s    &O ?bii ?  48-1(
))D0(
 ##d*(
 +,	(

 
%	%(
  (
r)   r  )r|  rn  r  r  r  r  )r{   )YrS   r   r   collections.abcr   dataclassesr   typingr   r@   torch.nn.functionalr   rC   r   r   rw  activationsr	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   r   r   utils.genericr   utils.output_capturingr   configuration_clapr   r   r   
get_loggerrO   loggerr(   r8   r:   r  rF   rI   rY   r]   r  rr   r   r   r   r  r$  r.  r<  rC  r  r  r  r  r  floatr"  r$  r3  r>  rE  rL  rR  r]  re  rn  r  r  r|  r  r  __all__rN   r)   r'   <module>r     s      $ !      & ! 9 G & 6 j j 7 5 K K 
		H	%"**7U\\ 7ell 7
 	<+ 	< 	< 
 	<; 	< 	< 
_ _  _B299 2%		 %P_")) _FZ'RYY Z'|
")) 
 &BII  	bii 	wRYY wv4/ 4p3BII 3lv
ryy v
r")) &g8 g8d %II%<<% 
% <<	%
 LL4'% % %.3)BII 3)n 		 .299  RYY . D
bii 
4RYY  #`/ #` #`L/
( /
d O
' O
O
d {
# {
 {
| :
"5 :
 :
z 9
#6 9
 9
xr)   