
    Z j8                     6   S SK r S SKJr  S SKJr  S SKrS SKrS SKJ	r	  S SKJ
r
  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJrJrJrJrJr  SSK J!r!J"r"J#r#  SSK$J%r%  SSK&J'r'J(r(J)r)  SSK*J+r+  \)RX                  " \-5      r.\(" SS9\ " S S\5      5       5       r/ " S S\	R`                  5      r1 " S S\	R`                  5      r2 " S S\5      r3 " S S\5      r4 " S S \5      r5 " S! S"\	R`                  5      r6 " S# S$\	R`                  5      r7  SSS%\	R`                  S&\Rp                  S'\Rp                  S(\Rp                  S)\Rp                  S-  S*\9S-  S+\9S,\%\'   4S- jjr: " S. S/\	R`                  5      r; " S0 S1\	R`                  5      r< " S2 S3\5      r= " S4 S5\	R`                  5      r> " S6 S7\	R`                  5      r? " S8 S9\5      r@ " S: S;\	R`                  5      rA " S< S=\	R`                  5      rB\( " S> S?\"5      5       rC  STS@\D\E\E4   SA\9SB\ES)\R                  S-  SC\ESD\R                  4SE jjrH\rI\( " SF SG\C5      5       rJ\(" SHS9 " SI SJ\C5      5       rKSKrL\(" SLS9 " SM SN\C5      5       rM\(" SOS9 " SP SQ\C5      5       rN/ SRQrOg)U    N)Callable)	dataclass)CrossEntropyLoss   )initialization)ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)create_bidirectional_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputCausalLMOutputModelOutputSequenceClassifierOutputWav2Vec2BaseModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel*get_torch_context_manager_or_global_device)Unpack)TransformersKwargsauto_docstringlogging   )UniSpeechConfigzh
    Output type of [`UniSpeechForPreTrainingOutput`], with potential hidden states and attentions.
    )custom_introc                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\R                  S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S	'   S
rg)UniSpeechForPreTrainingOutput5   a  
loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
    Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
    paper](https://huggingface.co/papers/2006.11477).
projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
    Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
    projected quantized states.
projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
    Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
    target vectors for contrastive loss.
codevector_perplexity (`torch.FloatTensor` of shape `(1,)`):
    The perplexity of the codevector distribution, used to measure the diversity of the codebook.
Nlossprojected_statesprojected_quantized_statescodevector_perplexityhidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r    torchFloatTensor__annotations__r!   r"   r#   r$   tupler%   __static_attributes__r&       ځ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/unispeech/modeling_unispeech.pyr   r   5   s     &*D%

d
")15e''$.5;? 1 1D 8?6:5,,t3:59M5**+d2926Je''(4/6r1   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )UniSpeechSamePadLayerR   c                 R   > [         TU ]  5         US-  S:X  a  SU l        g SU l        g )N   r   r   )super__init__num_pad_remove)selfnum_conv_pos_embeddings	__class__s     r2   r9   UniSpeechSamePadLayer.__init__S   s)    #:Q#>!#Car1   c                 X    U R                   S:  a  US S 2S S 2S U R                   * 24   nU$ Nr   r:   r;   r$   s     r2   forwardUniSpeechSamePadLayer.forwardW   s6    ")!Q0F43F3F2F0F*FGMr1   rA   r'   r(   r)   r*   r9   rC   r0   __classcell__r=   s   @r2   r4   r4   R   s    K r1   r4   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ ) UniSpeechPositionalConvEmbedding]   c                   > [         TU ]  5         [        R                  " UR                  UR                  UR
                  UR
                  S-  UR                  S9U l        [        R                  R                  n[        [        R                  R                  S5      (       a$  [        R                  R                  R                  n[        5       (       Ga%  SS KnUR                  R                  U R                  R                   SS9   U" U R                  SSS9U l        S S S 5        [        U R                  S5      (       aU  U R                  R                  R                   R"                  nU R                  R                  R                   R$                  nO,U R                  R&                  nU R                  R(                  nUR                  R+                  X5        UR                  R+                  X5        OU" U R                  SSS9U l        [-        UR
                  5      U l        [0        UR2                     U l        g ! , (       d  f       GN,= f)	Nr7   )kernel_sizepaddinggroupsweight_normr   )modifier_rankweight)namedimparametrizations)r8   r9   nnConv1dhidden_sizer<   num_conv_pos_embedding_groupsconvutilsrO   hasattrrT   r	   	deepspeedzeroGatheredParametersrQ   	original0	original1weight_gweight_vregister_external_parameterr4   rM   r   feat_extract_activation
activation)r;   configrO   r\   ra   rb   r=   s         r2   r9   )UniSpeechPositionalConvEmbedding.__init__^   s   II6622a777
	 hh**288,,m<<((33??K%''224993C3CST2U'		aH	 Vtyy"4559955<<FF9955<<FF99--99--NN66tFNN66tF#DIIH!DDI,V-K-KL !?!?@ VUs   I
Ic                     UR                  SS5      nU R                  U5      nU R                  U5      nU R                  U5      nUR                  SS5      nU$ )Nr   r7   )	transposerY   rM   re   rB   s     r2   rC   (UniSpeechPositionalConvEmbedding.forward   sV    %//15		-0]36%//15r1   )re   rY   rM   rE   rG   s   @r2   rI   rI   ]   s    AB r1   rI   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )UniSpeechNoLayerNormConvLayer   c                 b  > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [        UR                     U l        g )Nr   r   rL   stridebias)r8   r9   conv_dimin_conv_dimout_conv_dimrU   rV   conv_kernelconv_stride	conv_biasrY   r   rd   re   r;   rf   layer_idr=   s      r2   r9   &UniSpeechNoLayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@r1   c                 J    U R                  U5      nU R                  U5      nU$ N)rY   re   rB   s     r2   rC   %UniSpeechNoLayerNormConvLayer.forward   s$    		-06r1   )re   rY   rs   rt   r   rE   rG   s   @r2   rl   rl      s    A r1   rl   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )UniSpeechLayerNormConvLayer   c                   > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [
        R                  " U R                  SS9U l        [        UR                     U l        g )Nr   r   ro   T)elementwise_affine)r8   r9   rr   rs   rt   rU   rV   ru   rv   rw   rY   	LayerNorm
layer_normr   rd   re   rx   s      r2   r9   $UniSpeechLayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@r1   c                     U R                  U5      nUR                  SS5      nU R                  U5      nUR                  SS5      nU R                  U5      nU$ )N)rY   ri   r   re   rB   s     r2   rC   #UniSpeechLayerNormConvLayer.forward   sV    		-0%//B76%//B76r1   re   rY   rs   r   rt   r~   rE   rG   s   @r2   r   r      s    A r1   r   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )UniSpeechGroupNormConvLayer   c                   > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [        UR                     U l        [
        R                  " U R                  U R                  SS9U l        g )Nr   r   ro   T)
num_groupsnum_channelsaffine)r8   r9   rr   rs   rt   rU   rV   ru   rv   rw   rY   r   rd   re   	GroupNormr   rx   s      r2   r9   $UniSpeechGroupNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@,,$2C2CRVRcRclpqr1   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r|   )rY   r   re   rB   s     r2   rC   #UniSpeechGroupNormConvLayer.forward   s2    		-066r1   r   r~   rE   rG   s   @r2   r   r      s    r  r1   r   c                   8   ^  \ rS rSrSrU 4S jrS rS rSrU =r	$ )UniSpeechFeatureEncoder   z.Construct the features from raw audio waveformc           	        > [         TU ]  5         UR                  S:X  a@  [        USS9/[	        UR
                  S-
  5       Vs/ s H  n[        XS-   S9PM     sn-   nOVUR                  S:X  a-  [	        UR
                  5       Vs/ s H  n[        XS9PM     nnO[        SUR                   S35      e[        R                  " U5      U l        SU l        S	U l        g s  snf s  snf )
Ngroupr   )ry   r   layerz`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r8   r9   feat_extract_normr   rangenum_feat_extract_layersrl   r   
ValueErrorrU   
ModuleListconv_layersgradient_checkpointing_requires_grad)r;   rf   ir   r=   s       r2   r9    UniSpeechFeatureEncoder.__init__   s    ##w.6vJKv==ABOBA .f1uEBO K %%0INvOmOmInInA+F?In  K 01I1I0JJst  ==5&+#"O
s   C C%c                 N    U R                  5        H
  nSUl        M     SU l        g NF)
parametersrequires_gradr   r;   params     r2   _freeze_parameters*UniSpeechFeatureEncoder._freeze_parameters   s#    __&E"'E '#r1   c                     US S 2S 4   nU R                   (       a  U R                  (       a  SUl        U R                   H  nU" U5      nM     U$ )NT)r   trainingr   r   )r;   input_valuesr$   
conv_layers       r2   rC   UniSpeechFeatureEncoder.forward   sK    $QW- 4==*.M'**J&}5M + r1   )r   r   r   )
r'   r(   r)   r*   r+   r9   r   rC   r0   rF   rG   s   @r2   r   r      s    8#($

 
r1   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )UniSpeechFeatureProjection   c                 4  > [         TU ]  5         [        R                  " UR                  S   UR
                  S9U l        [        R                  " UR                  S   UR                  5      U l	        [        R                  " UR                  5      U l        g )Nr   eps)r8   r9   rU   r   rr   layer_norm_epsr   LinearrW   
projectionDropoutfeat_proj_dropoutdropoutr;   rf   r=   s     r2   r9   #UniSpeechFeatureProjection.__init__   sf    ,,vr':@U@UV))FOOB$79K9KLzz&":":;r1   c                 n    U R                  U5      nU R                  U5      nU R                  U5      nX4$ r|   )r   r   r   )r;   r$   norm_hidden_statess      r2   rC   "UniSpeechFeatureProjection.forward  s7    !__];(:;]300r1   )r   r   r   rE   rG   s   @r2   r   r      s    <1 1r1   r   modulequerykeyvalueattention_maskscalingr   kwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr         r7   r   rS   )pr   r   )
sizer,   matmulri   rU   
functionalsoftmaxr   r   
contiguous)
r   r   r   r   r   r   r   r   attn_weightsattn_outputs
             r2   eager_attention_forwardr   
  s     **R.D( <<}}Q':;gEL!#4==((2(>L==((6??([L,,|3K''1-88:K$$r1   c                   :  ^  \ rS rSrSr     SS\S\S\S\S\S	\S
\S-  4U 4S jjjr	   SS\
R                  S\
R                  S-  S\
R                  S-  S\S-  S\\   S\\
R                  \
R                  S-  \\
R                     S-  4   4S jjrSrU =r$ )UniSpeechAttentioni&  z=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsr   
is_decoderrq   	is_causalrf   c                   > [         TU ]  5         Xl        X l        X0l        X-  U l        Xpl        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l	        X`l
        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   )rq   )r8   r9   r   r   r   head_dimrf   r   r   r   r   rU   r   k_projv_projq_projout_proj)	r;   r   r   r   r   rq   r   rf   r=   s	           r2   r9   UniSpeechAttention.__init__)  s     	""!.MMI%$..8MdnnM]$YKr3  }}d*$"ii	4@ii	4@ii	4@		)TBr1   r$   key_value_statesr   output_attentionsr   returnc                    USLnUR                   SS n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U(       a  UOUn
/ U
R                   SS QSPU R                  P7nU R                  U
5      R                  U5      R	                  SS5      nU R                  U
5      R                  U5      R	                  SS5      n[        R                  " U R                  R                  [        5      nU" U U	UUU4U R                  (       d  SOU R                  U R                  US.UD6u  nnUR                  " / UQSP76 R!                  5       nU R#                  U5      nUUS4$ )z#Input shape: Batch x Time x ChannelNr   r   r7           )r   r   r   )shaper   r   viewri   r   r   r   get_interfacerf   _attn_implementationr   r   r   r   reshaper   r   )r;   r$   r   r   r   r   is_cross_attentioninput_shapehidden_shapequery_statescurrent_stateskv_shape
key_statesvalue_statesattention_interfacer   r   s                    r2   rC   UniSpeechAttention.forwardH  s    .T9 $))#2.88b8$--8 {{=166|DNNqRST-?)]B^))#2.BBDMMB[[055h?II!QO
{{>277AKKAqQ(?(M(MKK,,.E)
 %8
%
  $}}C$,,LL/
%
 
%
!\ "));;;;FFHmmK0L$..r1   )rf   r   r   r   r   r   r   r   r   r   r   r   )r   FTFN)NNF)r'   r(   r)   r*   r+   intfloatboolr   r9   r,   Tensorr   r   r/   rC   r0   rF   rG   s   @r2   r   r   &  s
   G  )-CC C 	C
 C C C  $&C CD 15.2).0/||0/  ,,-0/ t+	0/
  $;0/ -.0/ 
u||U\\D0%2E2LL	M0/ 0/r1   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )UniSpeechFeedForwardi{  c                   > [         TU ]  5         [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l	        [        UR                  [        5      (       a  [        UR                     U l        OUR                  U l        [        R                  " UR                  UR                  5      U l        [        R                  " UR                   5      U l        g r|   )r8   r9   rU   r   activation_dropoutintermediate_dropoutr   rW   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutr   s     r2   r9   UniSpeechFeedForward.__init__|  s    $&JJv/H/H$I!"$))F,>,>@X@X"Yf''--'-f.?.?'@D$'-'8'8D$IIf&>&>@R@RS jj)>)>?r1   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ r|   )r  r  r  r  r
  rB   s     r2   rC   UniSpeechFeedForward.forward  sX    //>00?11-@))-8++M:r1   )r  r  r  r  r
  rE   rG   s   @r2   r   r   {  s    @ r1   r   c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )UniSpeechEncoderLayeri  c                   > [         TU ]  5         [        UR                  UR                  UR
                  SUS9U l        [        R                  " UR                  5      U l
        [        R                  " UR                  UR                  S9U l        [        U5      U l        [        R                  " UR                  UR                  S9U l        g )NFr   r   r   r   rf   r   )r8   r9   r   rW   num_attention_headsattention_dropout	attentionrU   r   r	  r   r   r   r   r   feed_forwardfinal_layer_normr   s     r2   r9   UniSpeechEncoderLayer.__init__  s    +((00,,
 zz&"7"78,,v'9'9v?T?TU08 "V-?-?VEZEZ [r1   c                     UnU R                  XUS9u  pnU R                  U5      nXA-   nU R                  U5      nXR                  U5      -   nU R	                  U5      nU4nU(       a  Xu4-  nU$ Nr   r   )r  r   r   r  r  r;   r$   r   r   attn_residualr   _outputss           r2   rC   UniSpeechEncoderLayer.forward  s    %)-L] *8 *
&Q ]3%56%(9(9-(HH--m< "&Gr1   )r  r   r  r  r   r   rE   rG   s   @r2   r  r    s    \ r1   r  c                      ^  \ rS rSrU 4S jr    SS\R                  S\R                  S-  S\S\S\4
S	 jjr	S
r
U =r$ )UniSpeechEncoderi  c                   > [         TU ]  5         Xl        [        U5      U l        [
        R                  " UR                  UR                  S9U l	        [
        R                  " UR                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[!        U5      PM     sn5      U l        SU l        g s  snf Nr   F)r8   r9   rf   rI   pos_conv_embedrU   r   rW   r   r   r   r	  r   r   r   num_hidden_layersr  layersr   r;   rf   r  r=   s      r2   r9   UniSpeechEncoder.__init__  s    >vF,,v'9'9v?T?TUzz&"7"78mmERXRjRjLk$lLkq%:6%BLk$lm&+# %m    C	Nr$   r   r   output_hidden_statesreturn_dictc                 ,   U(       a  SOS nU(       a  SOS nUb4  UR                  S5      R                  SSUR                  S   5      nSX) '   [        U R                  UUS9nU R                  U5      n	XR                  UR                  5      -   nU R                  U5      nU R                  U5      n[        5       =(       d    [        U 5      n
U R                   H  nU(       a  Xa4-   n[        R                  " / 5      nU R                  =(       a    XR                  R                   :  nU(       a  U
(       a  U" XUS9nUS   nU(       a  SnU(       d  M|  UWS   4-   nM     U(       a  Xa4-   nU(       d  [#        S	 XU4 5       5      $ [%        UUUS
9$ )Nr&   r   r   r7   r   rf   inputs_embedsr   r  NNc              3   .   #    U  H  oc  M  Uv   M     g 7fr|   r&   .0vs     r2   	<genexpr>+UniSpeechEncoder.forward.<locals>.<genexpr>       m$[q$[   	last_hidden_stater$   r%   )	unsqueezerepeatr   r   rf   r$  todevicer   r   r	   r
   r&  r,   randr   	layerdropr/   r   r;   r$   r   r   r*  r+  all_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusr   dropout_probabilityskip_the_layerlayer_outputss                  r2   rC   UniSpeechEncoder.forward  s    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!45M012;;')
 #11-@%(>(>}?S?S(TT6]302R6LT6R[[E#$58H$H! #(**R.!]]Z/B[[EZEZ/ZN![ %!Te! !.a 0 ,  &9]1=M<O&O#' !*   14D Dm]GZ$[mmm++*
 	
r1   rf   r   r   r   r&  r$  NFFT)r'   r(   r)   r*   r9   r,   tensorr   r   rC   r0   rF   rG   s   @r2   r!  r!    s]    , /3"'%* ;
||;
 t+;
  	;

 #;
 ;
 ;
r1   r!  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )UniSpeechAttnAdapterLayeri  c                   > [         TU ]  5         UR                  U l        UR                  U l        [        R                  " U R
                  5      U l        [        R                  " U R
                  U R                  5      U l
        [        R                  " 5       U l        [        R                  " U R                  U R
                  5      U l        g)z
Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed
up training throughput.
N)r8   r9   adapter_attn_dim	input_dimrW   
hidden_dimrU   r   normr   linear_1ReLUact_fnlinear_2r   s     r2   r9   "UniSpeechAttnAdapterLayer.__init__   s    
 	00 ,,LL1			$//4>>Bggi		$..$//Br1   r$   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r|   )rS  rT  rV  rW  rB   s     r2   rC   !UniSpeechAttnAdapterLayer.forward  s@    		-0m4M2m4r1   )rV  rR  rQ  rT  rW  rS  )
r'   r(   r)   r*   r9   r,   r-   rC   r0   rF   rG   s   @r2   rN  rN    s     CU%6%6  r1   rN  c                   t   ^  \ rS rSrU 4S jr  S	S\R                  S\R                  S-  S\4S jjrSr	U =r
$ )
$UniSpeechEncoderLayerStableLayerNormi  c                   > [         TU ]  5         [        UR                  UR                  UR
                  SUS9U l        [        R                  " UR                  5      U l
        [        R                  " UR                  UR                  S9U l        [        U5      U l        [        R                  " UR                  UR                  S9U l        [#        USS 5      b  [%        U5      U l        g S U l        g )NFr  r   rP  )r8   r9   r   rW   r  r  r  rU   r   r	  r   r   r   r   r   r  r  getattrrN  adapter_layerr   s     r2   r9   -UniSpeechEncoderLayerStableLayerNorm.__init__  s    +((00,,
 zz&"7"78,,v'9'9v?T?TU08 "V-?-?VEZEZ [6-t4@!:6!BD!%Dr1   Nr$   r   r   c                    UnU R                  U5      nU R                  XUS9u  pnU R                  U5      nXA-   nXR                  U R	                  U5      5      -   nU R
                  b  XR                  U5      -   nU4nU(       a  Xu4-  nU$ r  )r   r  r   r  r  r_  r  s           r2   rC   ,UniSpeechEncoderLayerStableLayerNorm.forward,  s     &6)-L] *8 *
&Q ]3%5%(9(9$:O:OP]:^(__)),>,>},MMM "&Gr1   )r_  r  r   r  r  r   r   )r'   r(   r)   r*   r9   r,   r   r   rC   r0   rF   rG   s   @r2   r\  r\    sC    &, /3"'	|| t+  	 r1   r\  c                   :   ^  \ rS rSrU 4S jr    SS jrSrU =r$ )UniSpeechEncoderStableLayerNormiF  c                   > [         TU ]  5         Xl        [        U5      U l        [
        R                  " UR                  UR                  S9U l	        [
        R                  " UR                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[!        U5      PM     sn5      U l        SU l        g s  snf r#  )r8   r9   rf   rI   r$  rU   r   rW   r   r   r   r	  r   r   r   r%  r\  r&  r   r'  s      r2   r9   (UniSpeechEncoderStableLayerNorm.__init__G  s    >vF,,v'9'9v?T?TUzz&"7"78mmCHIaIaCbcCba1&9Cbc
 ',# dr)  c                    U(       a  SOS nU(       a  SOS nUb4  UR                  S5      R                  SSUR                  S   5      nSX) '   [        U R                  UUS9nU R                  U5      n	X-   nU R                  U5      n[        5       =(       d    [        U 5      n
U R                   H  nU(       a  Xa4-   n[        R                  " / 5      nU R                  =(       a    XR                  R                  :  nU(       a  U
(       a  U" XUS9nUS   nU(       a  SnU(       d  M|  UWS   4-   nM     U R                  U5      nU(       a  Xa4-   nU(       d  [        S	 XU4 5       5      $ [!        UUUS
9$ )Nr&   r   r   r7   r   r-  r  r/  c              3   .   #    U  H  oc  M  Uv   M     g 7fr|   r&   r1  s     r2   r4  :UniSpeechEncoderStableLayerNorm.forward.<locals>.<genexpr>  r6  r7  r8  )r:  r;  r   r   rf   r$  r   r	   r
   r&  r,   r>  r   r?  r   r/   r   r@  s                  r2   rC   'UniSpeechEncoderStableLayerNorm.forwardR  s    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!45M012;;')
 #11-@%;]302R6LT6R[[E#$58H$H! #(**R.!]]Z/B[[EZEZ/ZN![ !&!Te! !.a 0 ,  &9]1=M<O&O#) !, 6 14D Dm]GZ$[mmm++*
 	
r1   rJ  rK  rE   rG   s   @r2   rd  rd  F  s     	, "=
 =
r1   rd  c                   B   ^  \ rS rSrSrU 4S jr\S 5       rS rSr	U =r
$ )UniSpeechGumbelVectorQuantizeri  z
Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH
GUMBEL-SOFTMAX](https://huggingface.co/papers/1611.01144) for more information.
c                 8  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U R                  -  S:w  a&  [        SUR                   SU R                   S35      e[        R                  " [        R                  " SU R                  U R
                  -  UR                  U R                  -  5      5      U l        [        R                  " UR                  S   U R                  U R
                  -  5      U l        SU l        g )Nr   z`config.codevector_dim z5 must be divisible by `config.num_codevector_groups` z for concatenationr   r   r7   )r8   r9   num_codevector_groupsr   num_codevectors_per_groupnum_varscodevector_dimr   rU   	Parameterr,   r-   codevectorsr   rr   weight_projtemperaturer   s     r2   r9   'UniSpeechGumbelVectorQuantizer.__init__  s     6688  4??2a7)&*?*?)@ A559__4EEWY  <<a4==!@&BWBW[_[j[jBjk
 99V__R%8$//DMM:YZ r1   c                     U R                  SS9n[        R                  " [        R                  " [        R                  " X5      SS9* 5      R                  5       nU$ )Nr   r   r   )meanr,   expsumxlogy)probsmarginal_probs
perplexitys      r2   _compute_perplexity2UniSpeechGumbelVectorQuantizer._compute_perplexity  sG    *YY		%++n*U[] ^^_cce
r1   c                    UR                   u  p#nU R                  U5      nUR                  X#-  U R                  -  S5      nU R                  (       a  [
        R                  R                  UR                  5       U R                  SS9R                  U5      n[        R                  " UR                  X#-  U R                  S5      R                  5       SS9nU R                  U5      nOyUR                  SS9nUR                  " UR                   6 R!                  SUR                  SS5      S5      nUR                  X#-  U R                  S5      nU R                  U5      nUR                  X#-  S5      nUR#                  S5      U R$                  -  n	U	R                  X#-  U R                  U R&                  S5      n
U
R)                  S5      R                  X#S5      n
X4$ )Nr   T)tauhardr   r   g      ?r   )r   rt  r   r   r   rU   r   gumbel_softmaxr   ru  type_asr,   r   r  argmax	new_zerosscatter_r:  rs  rp  rz  )r;   r$   
batch_sizesequence_lengthrW   codevector_probscodevector_soft_distr~  codevector_idxcodevectors_per_grouprs  s              r2   rC   &UniSpeechGumbelVectorQuantizer.forward  s   3@3F3F0
[ ((7%**:+G$//+Y[]^==!}};;##%4+;+;$  <  gm$ 
 $)=="":#?RTU[[]ce$  112FGJ +11b19N,668K8KLUUN''A.   044Z5QSWSbSbdfg112BCJ+001MrR 0 : :2 >AQAQ Q+001Mt`d`m`moqr!oob)..zBO&&r1   )rs  r   rp  ru  rt  )r'   r(   r)   r*   r+   r9   staticmethodr  rC   r0   rF   rG   s   @r2   rl  rl    s+    
(  
#' #'r1   rl  c                       \ rS rSr% \\S'   SrSrSrSr	Sr
SrSr\R                  " 5       S 5       rS\R                   \-  4S	 jrS
\S\R                   4S jrSrg)UniSpeechPreTrainedModeli  rf   	unispeechr   audioTc           
         [        U[        5      (       au  [        R                  " UR                  R
                  SSS9  [        R                  " UR                  R                  5        [        R                  " UR                  5        g[        U[        5      (       a  [        R                  " UR                  R
                  SS[        R                  " SUR                  R                  S   UR                  R                  -  -  5      -  S9  [        R                   " UR                  R                  S5        g[        U["        5      (       a  [        R                  " SUR$                  R&                  -  5      n[        R                  " UR$                  R
                  U* US9  [        R                  " UR$                  R                  U* US9  g[        U[(        R*                  5      (       ac  [        R                  " UR
                  SU R,                  R.                  S9  UR                  b!  [        R                  " UR                  5        gg[        U[(        R0                  [(        R2                  45      (       aA  [        R                  " UR                  5        [        R4                  " UR
                  5        g[        U[(        R6                  5      (       a  [        R8                  " UR
                  5        UR                  b_  [        R                  " UR:                  UR                  UR                  S   -  -  5      n[        R                  " UR                  U* US9  ggg)zInitialize the weightsr   r   )rx  stdr   r7   )abN)r  rl  initnormal_rt  rQ   zeros_rq   uniform_rs  rI   rY   mathsqrtrL   in_channels	constant_r   r   in_featuresrU   r   rf   initializer_ranger   r   ones_rV   kaiming_normal_rN   )r;   r   ks      r2   _init_weights&UniSpeechPreTrainedModel._init_weights  s<    f<==LL++22!DKK**//0MM&,,- @AALL""		!v{{'>'>q'AFKKD[D['["\]]
 NN6;;++Q/ :;;		!f//;;;<AMM&++22qbA>MM&++00QB!<		**LLSdkk6S6ST{{&FKK( 'r|| <==KK$JJv}}%		**  /{{&IIfmmv/A/AFDVDVWXDY/YZ[fkkaR15 ' +r1   input_lengthsc                     S n[        U R                  R                  U R                  R                  5       H  u  p4U" XU5      nM     U$ )z8
Computes the output length of the convolutional layers
c                 8    [         R                  " X-
  USS9S-   $ )Nfloor)rounding_moder   )r,   div)input_lengthrL   rp   s      r2   _conv_out_lengthSUniSpeechPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length
  s      99\7wWZ[[[r1   )ziprf   ru   rv   )r;   r  r  rL   rp   s        r2    _get_feat_extract_output_lengths9UniSpeechPreTrainedModel._get_feat_extract_output_lengths  sG    
	\
 $'t{{'>'>@W@W#XK,]PM $Y r1   feature_vector_lengthr   c                    UR                  SS9S S 2S4   nU R                  U5      R                  [        R                  5      nUR
                  S   n[        R                  " XQ4UR                  UR                  S9nSU[        R                  " UR
                  S   UR                  S9US-
  4'   UR                  S/5      R                  S5      R                  S/5      R                  5       nU$ )Nr   r   r   )dtyper=  r   )r=  )cumsumr  r<  r,   longr   zerosr  r=  arangeflipr   )r;   r  r   non_padded_lengthsoutput_lengthsr  s         r2   "_get_feature_vector_attention_mask;UniSpeechPreTrainedModel._get_feature_vector_attention_mask  s     ,22r2:1b5A>>?QRUUV[V`V`a#))!,
/~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr1   r&   N)r'   r(   r)   r*   r   r.   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr,   no_gradr  
LongTensorr   r  r  r0   r&   r1   r2   r  r    s{    #$O&*#N
]]_6 6Be>N>NQT>T  ]b]m]m r1   r  r   	mask_probmask_length	min_masksr   c           	        ^^^^^ U u  nmTS:  a  [        S5      eTT:  a  [        ST ST S35      e[        R                  R                  S5      R	                  5       mUUUUU4S jnUb-  UR                  5       R                  S5      R                  5       O[        U5       Vs/ s H  nTPM     snn[        R                  " UT4[        S	9n	/ n
U" T5      nUS
:X  a  U	$ U H  nU" U5      n[        R                  R                  [        R                  " UTS-
  -
  5      USS9n[        U5      S
:X  a  TS-
  nOUS
   n[        R                  " U[        R                  " X-
  [        R                   S	9U-  /5      nU
R#                  U5        M     [        R$                  " U
5      n
[        R&                  " U
SS2SS2S4   X[T45      n
U
R)                  X[T-  5      n
[        R                  " T5      SSSS24   n[        R&                  " UX[T45      R)                  X[T-  5      nU
U-   n
U
R+                  5       TS-
  :  a  TS-
  XTS-
  :  '   [        R,                  " XSS5        U	$ s  snf )a2  
Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
CPU as part of the preprocessing during training.

Args:
    shape: The shape for which to compute masks. This should be of a tuple of size 2 where
           the first element is the batch size and the second element is the length of the axis to span.
    mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                independently generated mask spans of length `mask_length` is computed by
                `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                actual percentage will be smaller.
    mask_length: size of the mask
    min_masks: minimum number of masked spans
    attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                    each batch dimension.
r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    > [        TU -  T-  T-   5      n[        UT5      nUT-  T:  a  TT-  nU TS-
  -
  U:  a  [        U TS-
  -
  S5      nU$ )z;Given input length, compute how many spans should be maskedr   r   )r   max)r  num_masked_spanepsilonr  r  r  r  s     r2   compute_num_masked_span6_compute_mask_indices.<locals>.compute_num_masked_spanJ  so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr1   Nr   r  r   F)replace)r   nprandomr>  itemdetachrz  tolistr   r  r   choicer  lenconcatenateonesint32appendarraybroadcast_tor   r  put_along_axis)r   r  r  r   r  r  r  r  r  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr  r  spec_aug_mask_idxdummy_mask_idxoffsetsr  r  s    `` `            @@r2   _compute_mask_indicesr  $  s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89'8!o'89  HHj/:$GM1/Ba%1,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;(MUWU]U] ^ao op
 	!!"34/ &2 "45 1a:&+(V ,33JVa@ab ii$T4]3Goog
'UV^^+5G ,g5 /A"55GVYZGZ!0CCD mB?w :s   (I0c                   :  ^  \ rS rSrS\4U 4S jjr  SS\R                  S\R                  S-  S\R                  S-  4S jjr	\
     SS	\R                  S-  S\R                  S-  S\R                  S-  S
\S-  S\S-  S\S-  S\\-  4S jj5       rSrU =r$ )UniSpeechModeli  rf   c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        UR                  S:  d  UR                  S:  aG  [        R                  " [        R                  " UR                  5      R                  5       5      U l        UR                   (       a  [#        U5      U l        O['        U5      U l        U R)                  5         g Nr   )r8   r9   rf   r   feature_extractorr   feature_projectionmask_time_probmask_feature_probrU   rr  r,   r   rW   r  masked_spec_embeddo_stable_layer_normrd  encoderr!  	post_initr   s     r2   r9   UniSpeechModel.__init__  s     !8!@"<V"D  3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"&&:6BDL+F3DL 	r1   Nr$   mask_time_indicesr   c                    [        U R                  SS5      (       d  U$ UR                  5       u  pEnUb(  U R                  R	                  UR
                  5      X'   OU R                  R                  S:  a  U R                  (       a  [        XE4U R                  R                  U R                  R                  UU R                  R                  S9n[        R                  " X!R                  [        R                  S9nU R                  R	                  UR
                  5      X'   U R                  R                  S:  a  U R                  (       a  [        XF4U R                  R                  U R                  R                   U R                  R"                  S9n[        R                  " XqR                  [        R                  S9nUSS2S4   R%                  SUS5      nSX'   U$ )	z
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://huggingface.co/papers/1904.08779).
apply_spec_augmentTNr   )r  r  r   r  )r=  r  )r  r  r  r   )r^  rf   r   r  r<  r  r  r   r  mask_time_lengthmask_time_min_masksr,   rL  r=  r   r  mask_feature_lengthmask_feature_min_masksexpand)r;   r$   r  r   r  r  rW   mask_feature_indicess           r2   _mask_hidden_states"UniSpeechModel._mask_hidden_states  s    t{{$8$??   4A3E3E3G0
[(/3/E/E/H/HI\I\/]M,[[''!+ 5-++44 KK88-++99! !&->G[G[chcmcm n/3/E/E/H/HI\I\/]M,;;((1,#8)++77 KK;;++<<	$  $)<<0DMaMainisis#t #74#@#G#GO]_#` 23M/r1   r   r   r*  r+  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  U5      nUR                  SS5      nUb  U R                  UR                  S   U5      nU R                  U5      u  pU R                  XUS9n	U R                  U	UUUUS9n
U
S   n	U(       d	  X4U
SS -   $ [        U	UU
R                  U
R                  S9$ )a  
mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
    masked extracted features in *config.proj_codevector_dim* space.
Nr   r7   )r  r   r   r   r*  r+  r   )r9  extract_featuresr$   r%   )rf   r   r*  r+  r  ri   r  r   r  r  r  UniSpeechBaseModelOutputr$   r%   )r;   r   r   r  r   r*  r+  r   r  r$   encoder_outputss              r2   rC   UniSpeechModel.forward  s7     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY11,?+55a;%!DDEUE[E[\]E^`noN*.*A*ABR*S'00~ 1 
 ,,)/!5# ' 
 (*!4qr7JJJ'+-)77&11	
 	
r1   )rf   r  r  r  r  r/  NNNNN)r'   r(   r)   r*   r   r9   r,   r-   r  r  r   r   r   r/   r  rC   r0   rF   rG   s   @r2   r  r    s     ( 7;26	,((, !,,t3, ((4/	,\  /36:)-,0#'3
llT)3
 t+3
 !,,t3	3

  $;3
 #Tk3
 D[3
 
)	)3
 3
r1   r  zZ
    UniSpeech Model with a vector-quantization module and ctc loss for pre-training.
    c                   .  ^  \ rS rSrS\4U 4S jjrS\4S jrS r\	 SS\
R                  S\
R                  S	\
R                  S\4S
 jj5       r\    SS\
R                  S-  S\
R                  S-  S\S-  S\S-  S\S-  S\\-  4S jj5       rSrU =r$ )UniSpeechForPreTrainingi  rf   c                 8  > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        [        U5      U l	        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                   5      U l        [        R
                  " UR$                  5      U l        U R)                  5         g r|   )r8   r9   r  r  rU   r   feat_quantizer_dropoutdropout_featuresrl  	quantizerr   rq  proj_codevector_dim	project_qrW   project_hidnum_ctc_classesctc_projfinal_dropoutr   r  r   s     r2   r9    UniSpeechForPreTraining.__init__  s     '/ "

6+H+H I7?6#8#8&:T:TU99V%?%?ASAST		&"4"4f6L6LMzz&"6"67 	r1   ru  c                 $    XR                   l        g)zR
Set the Gumbel softmax temperature to a given value. Only necessary for training
N)r  ru  )r;   ru  s     r2   set_gumbel_temperature.UniSpeechForPreTraining.set_gumbel_temperature+  s     &1"r1   c                 L    U R                   R                  R                  5         gz
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
Nr  r  r   r;   s    r2   freeze_feature_encoder.UniSpeechForPreTraining.freeze_feature_encoder1      
 	((;;=r1   target_featuresnegative_featurespredicted_featuresc                     [         R                  " X/SS9n [         R                  " UR                  5       U R                  5       SS9nUR	                  U 5      nXC-  nU$ )z
Compute logits for contrastive loss based using cosine similarity as the distance measure between
`[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
r   r   r   )r,   catcosine_similarityr   r  )r!  r"  r#  ru  logitss        r2   compute_contrastive_logits2UniSpeechForPreTraining.compute_contrastive_logits8  s\      ))_$HaP(();)A)A)C_EZEZE\bde0 %r1   Nr   r   r   r*  r+  r   c           	         Ub  UOU R                   R                  nU R                  UUUUUS9nUS   nU R                  US   5      n	U R	                  U	5      u  pU R                  U
R                  U R
                  R                  R                  5      5      n
U R                  U
5      n
[        R                  " UR                  S5      UR                  S5      5      R                  U R                   R                  5      nUR                  SS5      n[        R                   " U5      R#                  5       R                  UR$                  5      nUR                  SS5      nUR'                  S5      nUR)                  US5      U
R)                  U) S5      -   nU R+                  U5      nU R-                  U5      nSnU(       d  Ub
  XX4USS -   $ XU4USS -   $ [/        UUU
UUR0                  UR2                  S9$ )	a[  
Example:

```python
>>> import torch
>>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining

>>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
>>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
>>> # TODO: Add full pretraining example
```Nr  r   r   r   r   r7   )r    r!   r"   r#   r$   r%   )rf   r+  r  r  r  r  r<  rQ   r  r  r,   emptyr   fill_replace_probri   	bernoullir   r=  r:  masked_fillr   r  r   r$   r%   )r;   r   r   r   r*  r+  r   r  transformer_featuresr  quantized_featuresr#   prob_replace_matrixsampled_replace_matrixr'  r    s                   r2   rC   UniSpeechForPreTraining.forwardL  s   , &1%<k$++BYBY..)/!5# ! 
  'qz  00<48NNCS4T1 "^^,>,A,A$..BWBWB]B],^_!--.@A#kk*>*C*CA*FH\HaHabcHdekkKK$$
 2;;AqA!&1D!E!J!J!L!O!OPdPkPk!l!7!A!A!Q!G!7!A!A"!E%112H#N**,B+BCH

 f%v& 4F^ahijikalll(>STW^_`_aWbbb,1'9"7!//))
 	
r1   )r  r   r  r  r  r  r  )r   )NNNN)r'   r(   r)   r*   r   r9   r   r  r  r  r,   r-   r(  r   r   r   r/   r   rC   r0   rF   rG   s   @r2   r  r    s     1# 1> 
 	** ,, "-- 	 &  /3)-,0#'E
llT)E
 t+E
  $;	E

 #TkE
 D[E
 
.	.E
 E
r1   r  r7   zq
    UniSpeech Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    c                      ^  \ rS rSrSS\S-  4U 4S jjjrS rS rS r\	     SS\
R                  S-  S	\
R                  S-  S
\S-  S\S-  S\S-  S\
R                  S-  S\\-  4S jj5       rSrU =r$ )UniSpeechForCTCi  Ntarget_langc                   > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        X l        UR                  c  [        SU R                   S35      e[        US5      (       a  UR                  (       a  UR                  OUR                  n[        R                   " X1R                  5      U l        U R%                  5         g)a  
target_lang (`str`, *optional*):
    Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
    adapter.<lang>.bin. Only relevant when using an instance of [`UniSpeechForCTC`] with adapters. Uses 'eng' by
    default.
NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `UniSpeechForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.add_adapter)r8   r9   r  r  rU   r   r  r   r7  
vocab_sizer   r=   r[   r9  output_hidden_sizerW   r   lm_headr  )r;   rf   r7  r;  r=   s       r2   r9   UniSpeechForCTC.__init__  s     	 '/zz&"6"67&$00@ AH H  *1)G)GFL^L^F%%djdvdv 	 yy!35F5FG 	r1   c                 @   [        5       [        R                  " S5      :X  a  gU R                  nUb'  [	        U R
                  SS5      c  [        SU S35      eUc.  [	        U R
                  SS5      b  [        R                  S5        gUb  U R                  USS9  gg)	a  
This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
passing `target_lang=...` to `from_pretrained(...)`.

This method is **not** supposed to be called by the user and is prone to be changed in the future.
metaNrP  zCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)
force_load)
r   r,   r=  r7  r^  rf   r   loggerinfoload_adapter)r;   r   r7  s      r2   tie_weightsUniSpeechForCTC.tie_weights  s     675<<;OO &&"wt{{<NPT'U']:;-Gtuvv WT[[:Ld%S%_KKCD$kd; %r1   c                 L    U R                   R                  R                  5         gr  r  r  s    r2   r  &UniSpeechForCTC.freeze_feature_encoder  r   r1   c                 T    U R                   R                  5        H
  nSUl        M     gz
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
FNr  r   r   r   s     r2   freeze_base_model!UniSpeechForCTC.freeze_base_model  #    
 ^^..0E"'E 1r1   r   r   r   r*  r+  labelsr   c                    Ub  UOU R                   R                  nUbJ  UR                  5       U R                   R                  :  a"  [	        SU R                   R                   35      eU R                  UUUUUS9nUS   n	U R                  U	5      n	U R                  U	5      n
SnUGbX  Ub  UO"[        R                  " U[        R                  S9nU R                  UR                  S5      5      R                  [        R                  5      nUS:  nUR                  S5      nUR                  U5      n[        R                   R#                  U
S[        R$                  S9R'                  SS5      n[        R(                  R*                  R-                  S	S
9   [        R                   R/                  UUUUU R                   R0                  U R                   R2                  U R                   R4                  S9nSSS5        U(       d  U
4U[6        S -   nUb  U4U-   $ U$ [9        XUR:                  UR<                  S9$ ! , (       d  f       NL= f)a  
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
    Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
    the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
    All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
    config.vocab_size - 1]`.
Nz$Label values must be <= vocab_size: r  r   r  r   )rS   r  r   F)enabled)blank	reductionzero_infinityr    r'  r$   r%   )rf   r+  r  r:  r   r  r   r<  r,   	ones_liker  r  rz  r<  masked_selectrU   r   log_softmaxfloat32ri   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   r$   r%   )r;   r   r   r   r*  r+  rN  r   r  r$   r'  r    r  labels_masktarget_lengthsflattened_targets	log_probsoutputs                     r2   rC   UniSpeechForCTC.forward  s   $ &1%<k$++BYBY&**,$++2H2H"HCDKKDZDZC[\]]..)/!5# ! 
  
]3m, #1"<%//R^fkfpfpBq  !AA.BTBTUWBXY\\]b]g]ghM !A+K(__R0N & 4 4[ A 11&b1V``abdefI%%++E+:}}--%!"++22"kk<<"&++"?"? .  ; Y)F)G!HHF)-)9TGf$EvEG4I4IV]VhVh
 	
 ;:s   A H??
I)r   r<  r7  r  r|   r	  )r'   r(   r)   r*   r  r9   rD  r  rK  r   r,   r   r   r/   r   rC   r0   rF   rG   s   @r2   r6  r6    s    C$J  :<0>(  /3)-,0#'&*E
llT)E
 t+E
  $;	E

 #TkE
 D[E
 t#E
 
	E
 E
r1   r6  z
    UniSpeech Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                      ^  \ rS rSrU 4S jrS rS r\     SS\R                  S-  S\R                  S-  S\
S-  S	\
S-  S
\
S-  S\R                  S-  S\\-  4S jj5       rSrU =r$ )"UniSpeechForSequenceClassificationi+  c                 "  > [         TU ]  U5        [        US5      (       a  UR                  (       a  [	        S5      e[        U5      U l        UR                  S-   nUR                  (       a2  [        R                  " [        R                  " U5      U-  5      U l        [        R                  " UR                  UR                   5      U l        [        R                  " UR                   UR$                  5      U l        U R)                  5         g )Nr9  z`Sequence classification does not support the use of UniSpeech adapters (config.add_adapter=True)r   )r8   r9   r[   r9  r   r  r  r%  use_weighted_layer_sumrU   rr  r,   r  layer_weightsr   rW   classifier_proj_size	projector
num_labels
classifierr  )r;   rf   
num_layersr=   s      r2   r9   +UniSpeechForSequenceClassification.__init__2  s     6=))f.@.@r  (/--1
((!#ejj.Dz.Q!RD6#5#5v7R7RS))F$?$?ARARS 	r1   c                 L    U R                   R                  R                  5         gr  r  r  s    r2   r  9UniSpeechForSequenceClassification.freeze_feature_encoderC  r   r1   c                 T    U R                   R                  5        H
  nSUl        M     grI  rJ  r   s     r2   rK  4UniSpeechForSequenceClassification.freeze_base_modelJ  rM  r1   Nr   r   r   r*  r+  rN  r   c                 0   Ub  UOU R                   R                  nU R                   R                  (       a  SOUnU R                  UUUUUS9nU R                   R                  (       ai  U[           n	[
        R                  " U	SS9n	[        R                  R                  U R                  SS9n
XR                  SSS5      -  R                  SS9n	OUS   n	U R                  U	5      n	Uc  U	R                  SS9nOU R                  U	R                   S   U5      nUR#                  S5      R%                  SSU	R                   S   5      nS	X) '   U	R                  SS9UR                  SS9R                  SS5      -  nU R'                  U5      nSnUbF  [)        5       nU" UR                  SU R                   R*                  5      UR                  S5      5      nU(       d  U4U[        S -   nUb  U4U-   $ U$ [-        UUUR.                  UR0                  S
9$ )a  
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
    Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
    into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
    (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
    To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
    into a tensor of type `torch.FloatTensor`. See [`UniSpeechProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
NTr  r   r   r   r   r7   r   rT  )rf   r+  rj  r  r`  r,   stackrU   r   r   rk  r   rz  rm  rx  r  r   r:  r;  ro  r   rn  r   r$   r%   )r;   r   r   r   r*  r+  rN  r   r  r$   norm_weightspooled_outputpadding_maskexpand_padding_maskr'  r    loss_fctre  s                     r2   rC   *UniSpeechForSequenceClassification.forwardR  s   0 &1%<k$++BYBY'+{{'I'ItOc..)/!5# ! 
 ;;--#$ABM!KK1=M==001C1C0LL*->->r1a-HHMMRSMTM#AJM}5!)..1.5MBB=CVCVWXCY[ijL"."8"8"<"C"CAq-J]J]^_J`"a25M./)--!-4|7G7GA7G7N7S7STVXY7ZZM/')HFKKDKK,B,BCV[[QS_UDY)F)G!HHF)-)9TGf$EvE'!//))	
 	
r1   )ro  rk  rm  r  r	  )r'   r(   r)   r*   r9   r  rK  r   r,   r   r   r/   r   rC   r0   rF   rG   s   @r2   rh  rh  +  s    ">(  /3)-,0#'&*C
llT)C
 t+C
  $;	C

 #TkC
 D[C
 t#C
 
)	)C
 C
r1   rh  )r6  r  rh  r  r  r  r@   )Pr  collections.abcr   dataclassesr   numpyr  r,   torch.nnrU   r    r   r  activationsr   integrations.deepspeedr	   integrations.fsdpr
   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   r   processing_utilsr   rZ   r   r   r   configuration_unispeechr   
get_loggerr'   rA  r   Moduler4   rI   rl   r   r   r   r   r   r   r   r   r   r  r!  rN  r\  rd  rl  r  r/   r   r  ndarrayr  r  r  r  r`  r6  rh  __all__r&   r1   r2   <module>r     s6  *  $ !    % & ! @ 7 6 B 9  s r & @ @ 4 
		H	% 
 7K 7 7.BII *ryy *Z$> *"< 6"< 0&bii &R1 1* !%II%<<% 
% <<	%
 LL4'% T\% % '(%8R/ R/j299 0!6 !HE
ryy E
P		 2++E +\I
bii I
XC'RYY C'L H H H^ /3tc?tt t $$t+	t
 t ZZtn 3  t
- t
 t
n 
w
6 w

w
t !"  
K
. K

K
\ e
)A e
e
Pr1   