
    Z jD                        S r SSKrSSKJr  SSKrSSKJr  SSKJr	  SSK
JrJr  SSKJr  SSKJrJr  S	S
KJrJrJrJrJrJrJrJrJr  SSKJr  \R<                  " \5      r \" SS9\ " S S\5      5       5       r! " S S\5      r" " S S\5      r# " S S\5      r$ " S S\5      r% " S S\5      r& " S S\5      r'\ " S S\5      5       r(\r) " S S \(\5      r*\" S!S9 " S" S#\(5      5       r+ " S$ S%\5      r, " S& S'\5      r-/ S(Qr.g))zPyTorch UniSpeech model.    N)	dataclass   )initialization)ModelOutputWav2Vec2BaseModelOutput)PreTrainedModel)auto_docstringlogging   )	Wav2Vec2EncoderWav2Vec2EncoderStableLayerNormWav2Vec2FeatureEncoderWav2Vec2FeatureProjectionWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2GumbelVectorQuantizerWav2Vec2ModelWav2Vec2PositionalConvEmbedding   )UniSpeechConfigzh
    Output type of [`UniSpeechForPreTrainingOutput`], with potential hidden states and attentions.
    )custom_introc                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\R                  S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S	'   S
rg)UniSpeechForPreTrainingOutput+   a  
loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
    Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
    paper](https://huggingface.co/papers/2006.11477).
projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
    Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
    projected quantized states.
projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
    Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
    target vectors for contrastive loss.
codevector_perplexity (`torch.FloatTensor` of shape `(1,)`):
    The perplexity of the codevector distribution, used to measure the diversity of the codebook.
Nlossprojected_statesprojected_quantized_statescodevector_perplexityhidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r   torchFloatTensor__annotations__r   r   r   r   tupler    __static_attributes__r!       ڀ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/unispeech/modular_unispeech.pyr   r   +   s     &*D%

d
")15e''$.5;? 1 1D 8?6:5,,t3:59M5**+d2926Je''(4/6r,   r   c                       \ rS rSrSrg) UniSpeechPositionalConvEmbeddingH   r!   Nr"   r#   r$   r%   r+   r!   r,   r-   r/   r/   H       r,   r/   c                       \ rS rSrSrg)UniSpeechFeatureEncoderL   r!   Nr1   r!   r,   r-   r4   r4   L   r2   r,   r4   c                       \ rS rSrSrg)UniSpeechFeatureProjectionP   r!   Nr1   r!   r,   r-   r7   r7   P   r2   r,   r7   c                       \ rS rSrSrg)UniSpeechEncoderT   r!   Nr1   r!   r,   r-   r:   r:   T   r2   r,   r:   c                       \ rS rSrSrg)UniSpeechEncoderStableLayerNormX   r!   Nr1   r!   r,   r-   r=   r=   X   r2   r,   r=   c                   *    \ rS rSr\S 5       rS rSrg)UniSpeechGumbelVectorQuantizer\   c                     U R                  SS9n[        R                  " [        R                  " [        R                  " X5      SS9* 5      R                  5       nU$ )Nr   dim)meanr'   expsumxlogy)probsmarginal_probs
perplexitys      r-   _compute_perplexity2UniSpeechGumbelVectorQuantizer._compute_perplexity]   sG    *YY		%++n*U[] ^^_cce
r,   c                    UR                   u  p#nU R                  U5      nUR                  X#-  U R                  -  S5      nU R                  (       a  [
        R                  R                  UR                  5       U R                  SS9R                  U5      n[        R                  " UR                  X#-  U R                  S5      R                  5       SS9nU R                  U5      nOyUR                  SS9nUR                  " UR                   6 R!                  SUR                  SS5      S5      nUR                  X#-  U R                  S5      nU R                  U5      nUR                  X#-  S5      nUR#                  S5      U R$                  -  n	U	R                  X#-  U R                  U R&                  S5      n
U
R)                  S5      R                  X#S5      n
X4$ )NrE   T)tauhardrC   r   g      ?)shapeweight_projview
num_groupstrainingnn
functionalgumbel_softmaxfloattemperaturetype_asr'   softmaxrM   argmax	new_zerosscatter_	unsqueezecodevectorsnum_varsrH   )selfr   
batch_sizesequence_lengthhidden_sizecodevector_probscodevector_soft_distrL   codevector_idxcodevectors_per_grouprc   s              r-   forward&UniSpeechGumbelVectorQuantizer.forwardc   s   3@3F3F0
[ ((7%**:+G$//+Y[]^==!}};;##%4+;+;$  <  gm$ 
 $)=="":#?RTU[[]ce$  112FGJ +11b19N,668K8KLUUN''A.   044Z5QSWSbSbdfg112BCJ+001MrR 0 : :2 >AQAQ Q+001Mt`d`m`moqr!oob)..zBO&&r,   r!   N)r"   r#   r$   r%   staticmethodrM   rm   r+   r!   r,   r-   r@   r@   \   s     
#'r,   r@   c                       \ rS rSr% \\S'   SrSrSrSr	Sr
SrSr\R                  " 5       S 5       rS\R                   \-  4S	 jrS
\S\R                   4S jrSrg)UniSpeechPreTrainedModel   config	unispeechinput_valuesaudioTc           
         [        U[        5      (       au  [        R                  " UR                  R
                  SSS9  [        R                  " UR                  R                  5        [        R                  " UR                  5        g[        U[        5      (       a  [        R                  " UR                  R
                  SS[        R                  " SUR                  R                  S   UR                  R                  -  -  5      -  S9  [        R                   " UR                  R                  S5        g[        U["        5      (       a  [        R                  " SUR$                  R&                  -  5      n[        R                  " UR$                  R
                  U* US9  [        R                  " UR$                  R                  U* US9  g[        U[(        R*                  5      (       ac  [        R                  " UR
                  SU R,                  R.                  S9  UR                  b!  [        R                  " UR                  5        gg[        U[(        R0                  [(        R2                  45      (       aA  [        R                  " UR                  5        [        R4                  " UR
                  5        g[        U[(        R6                  5      (       a  [        R8                  " UR
                  5        UR                  b_  [        R                  " UR:                  UR                  UR                  S   -  -  5      n[        R                  " UR                  U* US9  ggg)zInitialize the weights        r   )rF   stdr   r   )abN)
isinstancer@   initnormal_rT   weightzeros_biasuniform_rc   r/   convmathsqrtkernel_sizein_channels	constant_r7   
projectionin_featuresrX   Linearrs   initializer_range	LayerNorm	GroupNormones_Conv1dkaiming_normal_groups)re   moduleks      r-   _init_weights&UniSpeechPreTrainedModel._init_weights   s<    f<==LL++22!DKK**//0MM&,,- @AALL""		!v{{'>'>q'AFKKD[D['["\]]
 NN6;;++Q/ :;;		!f//;;;<AMM&++22qbA>MM&++00QB!<		**LLSdkk6S6ST{{&FKK( 'r|| <==KK$JJv}}%		**  /{{&IIfmmv/A/AFDVDVWXDY/YZ[fkkaR15 ' +r,   input_lengthsc                     S n[        U R                  R                  U R                  R                  5       H  u  p4U" XU5      nM     U$ )z8
Computes the output length of the convolutional layers
c                 8    [         R                  " X-
  USS9S-   $ )Nfloor)rounding_moder   )r'   div)input_lengthr   strides      r-   _conv_out_lengthSUniSpeechPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length   s      99\7wWZ[[[r,   )ziprs   conv_kernelconv_stride)re   r   r   r   r   s        r-    _get_feat_extract_output_lengths9UniSpeechPreTrainedModel._get_feat_extract_output_lengths   sG    
	\
 $'t{{'>'>@W@W#XK,]PM $Y r,   feature_vector_lengthattention_maskc                    UR                  SS9S S 2S4   nU R                  U5      R                  [        R                  5      nUR
                  S   n[        R                  " XQ4UR                  UR                  S9nSU[        R                  " UR
                  S   UR                  S9US-
  4'   UR                  S/5      R                  S5      R                  S/5      R                  5       nU$ )NrE   rC   r   )dtypedevicer   )r   )cumsumr   tor'   longrS   zerosr   r   arangeflipbool)re   r   r   non_padded_lengthsoutput_lengthsrf   s         r-   "_get_feature_vector_attention_mask;UniSpeechPreTrainedModel._get_feature_vector_attention_mask   s     ,22r2:1b5A>>?QRUUV[V`V`a#))!,
/~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr,   r!   N)r"   r#   r$   r%   r   r)   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr'   no_gradr   
LongTensorintr   r   r+   r!   r,   r-   rq   rq      s{    #$O&*#N
]]_6 6Be>N>NQT>T  ]b]m]m r,   rq   c                       \ rS rSrS\4S jrS r     SS\R                  S-  S\R                  S-  S\R                  S-  S	\
S-  S
\
S-  S\
S-  S\\-  4S jjrSrg)UniSpeechModel   rs   c                    [         R                  X5        Xl        [        U5      U l        [        U5      U l        UR                  S:  d  UR                  S:  aG  [        R                  " [        R                  " UR                  5      R                  5       5      U l        UR                   (       a  [#        U5      U l        O['        U5      U l        U R)                  5         g )Nrx   )rq   __init__rs   r4   feature_extractorr7   feature_projectionmask_time_probmask_feature_probrX   	Parameterr'   Tensorrh   r   masked_spec_embeddo_stable_layer_normr=   encoderr:   	post_init)re   rs   s     r-   r   UniSpeechModel.__init__   s     ))$7!8!@"<V"D  3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"&&:6BDL+F3DL 	r,   c                     [        S5      e)NzNot needed for UniSpeech)AttributeErrorre   s    r-   freeze_feature_encoder%UniSpeechModel.freeze_feature_encoder   s    788r,   Nru   r   mask_time_indicesoutput_attentionsoutput_hidden_statesreturn_dictreturnc                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  U5      nUR                  SS5      nUb  U R                  UR                  S   U5      nU R                  U5      u  pU R                  XUS9n	U R                  U	UUUUS9n
U
S   n	U(       d	  X4U
SS -   $ [        U	UU
R                  U
R                  S9$ )a  
mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
    masked extracted features in *config.proj_codevector_dim* space.
Nr   r   )r   r   r   r   r   r   r   )last_hidden_stateextract_featuresr   r    )rs   r   r   r   r   	transposer   rS   r   _mask_hidden_statesr   UniSpeechBaseModelOutputr   r    )re   ru   r   r   r   r   r   kwargsr   r   encoder_outputss              r-   rm   UniSpeechModel.forward   s7    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY11,?+55a;%!DDEUE[E[\]E^`noN*.*A*ABR*S'00~ 1 
 ,,)/!5# ' 
 (*!4qr7JJJ'+-)77&11	
 	
r,   )rs   r   r   r   r   )NNNNN)r"   r#   r$   r%   r   r   r   r'   r   r(   r   r*   r   rm   r+   r!   r,   r-   r   r      s     "9 /36:)-,0#'3
llT)3
 t+3
 !,,t3	3

  $;3
 #Tk3
 D[3
 
)	)3
 3
r,   r   zZ
    UniSpeech Model with a vector-quantization module and ctc loss for pre-training.
    c                   .  ^  \ rS rSrS\4U 4S jjrS\4S jrS r\	 SS\
R                  S\
R                  S	\
R                  S\4S
 jj5       r\    SS\
R                  S-  S\
R                  S-  S\S-  S\S-  S\S-  S\\-  4S jj5       rSrU =r$ )UniSpeechForPreTrainingi#  rs   c                 8  > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        [        U5      U l	        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                   5      U l        [        R
                  " UR$                  5      U l        U R)                  5         g )N)superr   r   rt   rX   Dropoutfeat_quantizer_dropoutdropout_featuresr@   	quantizerr   codevector_dimproj_codevector_dim	project_qrh   project_hidnum_ctc_classesctc_projfinal_dropoutdropoutr   )re   rs   	__class__s     r-   r    UniSpeechForPreTraining.__init__)  s     '/ "

6+H+H I7?6#8#8&:T:TU99V%?%?ASAST		&"4"4f6L6LMzz&"6"67 	r,   r\   c                 $    XR                   l        g)zR
Set the Gumbel softmax temperature to a given value. Only necessary for training
N)r   r\   )re   r\   s     r-   set_gumbel_temperature.UniSpeechForPreTraining.set_gumbel_temperature8  s     &1"r,   c                 L    U R                   R                  R                  5         g)z
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
N)rt   r   _freeze_parametersr   s    r-   r   .UniSpeechForPreTraining.freeze_feature_encoder>  s    
 	((;;=r,   target_featuresnegative_featurespredicted_featuresc                     [         R                  " X/SS9n [         R                  " UR                  5       U R                  5       SS9nUR	                  U 5      nXC-  nU$ )z
Compute logits for contrastive loss based using cosine similarity as the distance measure between
`[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
r   rC   rE   )r'   catcosine_similarityr[   r]   )r   r   r   r\   logitss        r-   compute_contrastive_logits2UniSpeechForPreTraining.compute_contrastive_logitsE  s\      ))_$HaP(();)A)A)C_EZEZE\bde0 %r,   Nru   r   r   r   r   r   c           	         Ub  UOU R                   R                  nU R                  UUUUUS9nUS   nU R                  US   5      n	U R	                  U	5      u  pU R                  U
R                  U R
                  R                  R                  5      5      n
U R                  U
5      n
[        R                  " UR                  S5      UR                  S5      5      R                  U R                   R                  5      nUR                  SS5      n[        R                   " U5      R#                  5       R                  UR$                  5      nUR                  SS5      nUR'                  S5      nUR)                  US5      U
R)                  U) S5      -   nU R+                  U5      nU R-                  U5      nSnU(       d  Ub
  XX4USS -   $ XU4USS -   $ [/        UUU
UUR0                  UR2                  S9$ )	a[  
Example:

```python
>>> import torch
>>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining

>>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
>>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
>>> # TODO: Add full pretraining example
```Nr   r   r   rE   rx   r   )r   r   r   r   r   r    )rs   r   rt   r   r   r   r   r   r   r   r'   emptysizefill_replace_probr   	bernoullir   r   rb   masked_fillr   r   r   r   r    )re   ru   r   r   r   r   r   outputstransformer_featuresr   quantized_featuresr   prob_replace_matrixsampled_replace_matrixr  r   s                   r-   rm   UniSpeechForPreTraining.forwardY  s   , &1%<k$++BYBY..)/!5# ! 
  'qz  00<48NNCS4T1 "^^,>,A,A$..BWBWB]B],^_!--.@A#kk*>*C*CA*FH\HaHabcHdekkKK$$
 2;;AqA!&1D!E!J!J!L!O!OPdPkPk!l!7!A!A!Q!G!7!A!A"!E%112H#N**,B+BCH

 f%v& 4F^ahijikalll(>STW^_`_aWbbb,1'9"7!//))
 	
r,   )r   r   r   r   r   r   rt   )r   )NNNN)r"   r#   r$   r%   r   r   r   r   r   ro   r'   r(   r  r	   r   r   r*   r   rm   r+   __classcell__)r   s   @r-   r   r   #  s     1# 1> 
 	** ,, "-- 	 &  /3)-,0#'E
llT)E
 t+E
  $;	E

 #TkE
 D[E
 
.	.E
 E
r,   r   c                       \ rS rSrSrg)UniSpeechForCTCi  r!   Nr1   r!   r,   r-   r  r    r2   r,   r  c                       \ rS rSrSrg)"UniSpeechForSequenceClassificationi  r!   Nr1   r!   r,   r-   r  r    r2   r,   r  )r  r   r  r   rq   )/r&   r   dataclassesr   r'   torch.nnrX    r   r}   modeling_outputsr   r   modeling_utilsr   utilsr	   r
   wav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   r   r   configuration_unispeechr   
get_loggerr"   loggerr   r/   r4   r7   r:   r=   r@   rq   r   r   r   r  r  __all__r!   r,   r-   <module>r"     sH     !   & D - ,
 
 
 5 
		H	% 
 7K 7 7.	'F 		4 		!: 		 		&D 	*'%B *'Z H H HV 3 H
-} H
V 
w
6 w

w
t	n 		)J 	r,   