
    Z jXd                        S r SSKrSSKJr  SSKJr  SSKrSSKJr  SSK	Js  J
r  SSKJr  SSKJr  SSKJr  SS	KJrJr  S
SKJr  SSKJr  \ " S S\5      5       r\ " S S\5      5       r\ " S S\5      5       r " S S\R:                  5      r " S S\R:                  5      r " S S\R:                  5      r  " S S\R:                  5      r! " S S\R:                  5      r" " S S\R:                  5      r# " S  S!\R:                  5      r$ " S" S#\R:                  5      r%\ " S$ S%\5      5       r&\" S&S'9 " S( S)\&5      5       r'S)S%/r(g)*zTransformers Xcodec model.    N)	dataclass)	lru_cache   )initialization)conv1d_output_length)PreTrainedAudioTokenizerBase)ModelOutputauto_docstring   )	AutoModel   )XcodecConfigc                   j    \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Srg)XcodecOutput    aW  
Args:
    audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
        Discrete code indices computed using `model.encode`.
    audio_values (`torch.FloatTensor` of shape `(batch_size, channels, num_samples)`, *optional*)
        Decoded audio values obtained using the decoder part of Xcodec.
Naudio_codesaudio_values )__name__
__module____qualname____firstlineno____doc__r   torch
LongTensor__annotations__r   FloatTensor__static_attributes__r       {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/xcodec/modeling_xcodec.pyr   r       s3     ,0K!!D(/-1L%##d*1r   r   c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)XcodecEncoderOutput.   z
Args:
    audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
        Discrete code indices computed using `model.encode`.
Nr   r   )
r   r   r   r   r   r   r   r   r   r   r   r   r    r"   r"   .   s     ,0K!!D(/r   r"   c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)XcodecDecoderOutput9   z
Args:
    audio_values (`torch.FloatTensor`  of shape `(batch_size, channels, num_samples)`, *optional*):
        Decoded audio values obtained using the decoder part of Xcodec.
Nr   r   )
r   r   r   r   r   r   r   r   r   r   r   r   r    r%   r%   9   s     .2L%##d*1r   r%   c                   z   ^  \ rS rSrSrS\S\S\S\4U 4S jjrS\R                  S	\R                  4S
 jr
SrU =r$ )XcodecResidualUnitD   zFResidual block for SemanticEncoder and SemanticDecoder used in Xcodec.configin_channelsout_channelsdilationc                 
  > [         TU ]  5         [        R                  " 5       U l        UR
                  S-
  S-  U-  n[        R                  " UUUR
                  SUUSSS9U l        [        R                  " X3SSS9U l        g )Nr   r   F)stridepaddingr-   groupsbias)r+   r,   kernel_sizer2   )	super__init__nnELU
activationunit_kernel_sizeConv1dconv1conv2)selfr*   r+   r,   r-   r0   	__class__s         r    r5   XcodecResidualUnit.__init__G   s{    &&(++a/A5AYY##	

 YY<`ahmn
r   hidden_statereturnc                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nX-   $ Nr8   r;   r<   )r=   r@   output_tensors      r    forwardXcodecResidualUnit.forwardW   sC    5

=16

=1++r   rD   )r   r   r   r   r   r   intr5   r   TensorrF   r   __classcell__r>   s   @r    r(   r(   D   sM    Po| o# oS o\_ o ,ELL ,U\\ , ,r   r(   c                   v   ^  \ rS rSrS\S\S\S\4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )XcodecSemanticEncoderBlock_   r*   r+   r,   r/   c                   > [         TU ]  5         [        R                  " UR                   Vs/ s H  n[        XX%5      PM     sn5      U l        US:X  a  SOSU-  nUS-
  S-  n[        R                  " X#XdUSS9U l        g s  snf )Nr   r   r   Tr3   r/   r0   r2   )	r4   r5   r6   
ModuleListblock_dilationsr(   	res_unitsr:   conv)	r=   r*   r+   r,   r/   r-   kernelr0   r>   s	           r    r5   #XcodecSemanticEncoderBlock.__init__`   s    \b\r\rs\rPX[K\rs

 kF
A:!#IIkVdkrvw	 ts   Br@   rA   c                 ^    U R                    H  nU" U5      nM     U R                  U5      nU$ rC   )rS   rT   r=   r@   units      r    rF   "XcodecSemanticEncoderBlock.forwardk   s.    NND-L #yy.r   rT   rS   r   r   r   r   r   rH   r5   r   rI   rF   r   rJ   rK   s   @r    rM   rM   _   sJ    	x| 	x# 	xS 	xZ] 	xELL U\\  r   rM   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )SemanticEncoderr   c           	        > [         TU ]  5         [        UR                  5      [        UR                  5      :w  a  [        S5      e[        R                  " UR                  UR                  UR                  SUR                  S-  SS9U l
        UR                  n/ n[        UR                  5       H<  u  pE[        UR                  UR                  U   -  5      nU[        XXe5      /-  nUnM>     [        R                  " U5      U l        g )Nz:Number of strides must match the number of channel_ratios.r   r   Fr2   )r4   r5   lenstrideschannel_ratios
ValueErrorr6   r:   semantic_hidden_sizer3   rT   	enumeraterH   rM   rQ   conv_blocks)r=   r*   r+   rh   ir/   r,   r>   s          r    r5   SemanticEncoder.__init__s   s    v~~#f&;&;"<<YZZII''''!#
	 11"6>>2IAv::V=R=RST=UUVL6vLabbK&K 3
 ==5r   r@   rA   c                 ^    U R                  U5      nU R                   H  nU" U5      nM     U$ rC   rT   rh   r=   r@   blocks      r    rF   SemanticEncoder.forward   s0    yy.%%E .L &r   rl   
r   r   r   r   r5   r   rI   rF   r   rJ   rK   s   @r    r^   r^   r   s(    6,ELL U\\  r   r^   c                   v   ^  \ rS rSrS\S\S\S\4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )SemanticDecoderBlock   r*   r+   r,   r/   c                 b  > [         T	U ]  5         US:X  a  [        R                  " UUSSSSS9U l        O6SU-  nUS-   S-  nUS-  S:X  a  SOSn[        R
                  " X#XTXgSS9U l        [        R                  " UR                   Vs/ s H  n[        XX85      PM     sn5      U l	        g s  snf )	Nr   r   TrP   r   r   Fra   )
r4   r5   r6   r:   rT   ConvTranspose1drQ   rR   r(   rS   )
r=   r*   r+   r,   r/   r3   r0   output_paddingr-   r>   s
            r    r5   SemanticDecoderBlock.__init__   s    Q;		DI f*Kza'G"(1*/QqN**;^cDI ^d^t^tu^tRZlM^tu
us   	B,r@   rA   c                 ^    U R                  U5      nU R                   H  nU" U5      nM     U$ rC   r[   rX   s      r    rF   SemanticDecoderBlock.forward   s.    yy.NND-L #r   r[   r\   rK   s   @r    rr   rr      sE    
| 
# 
S 
Z] 
.ELL U\\  r   rr   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )SemanticDecoder   c           	        > [         TU ]  5         [        R                  " UR                  [        UR                  UR                  S   -  5      UR                  SUR                  S-  SS9U l        / n[        UR                  5       H  u  p4[        UR                  UR                  U   -  5      nU[        UR                  5      S-
  :  a)  [        UR                  UR                  US-      -  5      nOUR                  nU[        XXd5      /-  nM     [        R                  " U5      U l        [        R                  " UR                  UR                  UR                  SUR                  S-  SS9U l        g )Nr   r   r   F)r+   r,   r3   r/   r0   r2   )r/   r0   r2   )r4   r5   r6   r:   rf   rH   rd   r3   r;   rg   rc   rb   rr   rQ   rh   r<   )r=   r*   rh   ri   r/   r+   r,   r>   s          r    r5   SemanticDecoder.__init__   sQ   YY33V886;P;PQR;SST**&&!+

 "6>>2IAf99F<Q<QRS<TTUKC--.23"6#>#>AVAVWX[\W\A]#]^%::0l[\\K 3 ==5YY''''&&!+

r   r@   rA   c                     U R                  U5      nU R                   H  nU" U5      nM     U R                  U5      nU$ rC   )r;   rh   r<   rm   s      r    rF   SemanticDecoder.forward   s>    zz,/%%E .L &zz,/r   )r;   r<   rh   rp   rK   s   @r    r{   r{      s(    
>ELL U\\  r   r{   c                   >   ^  \ rS rSrSrU 4S jrS rS rS rSr	U =r
$ )XcodecEuclideanCodebook   z!Codebook with Euclidean distance.c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      nUR                  U l        U R                  S[        R                  " S/5      5        U R                  S[        R                  " UR                  5      5        U R                  SU5        U R                  SUR                  5       5        g )NinitedTcluster_sizeembed	embed_avg)	r4   r5   r   zeroscodebook_sizecodebook_dimregister_bufferrI   clone)r=   r*   r   r>   s      r    r5    XcodecEuclideanCodebook.__init__   s    F00&2E2EF#11Xu||TF';<^U[[9M9M-NOWe,[%++-8r   c                    U R                   R                  5       nUR                  S5      R                  SSS9nUSU-  U-  -
  UR                  S5      R                  SSS9-   * nUR	                  SS9R
                  nU$ )Nr   r   T)keepdimr   dim)r   tpowsummaxindices)r=   hidden_statesr   scaled_statesdist	embed_inds         r    quantize XcodecEuclideanCodebook.quantize   s    

%))!,00D0A]!2U!::UYYq\=M=MaY]=M=^^_HHH$,,	r   c                     UR                   nUR                  SUS   45      nU R                  U5      nUR                  " US S 6 nU$ )Nr   )shapereshaper   view)r=   r   r   r   s       r    encodeXcodecEuclideanCodebook.encode   sM    ##%--r59o>MM-0	NNE#2J/	r   c                     [         R                  " UR                  U R                  R                  5      U R                  5      nU$ rC   )F	embeddingtor   device)r=   r   	quantizeds      r    decodeXcodecEuclideanCodebook.decode   s/    KK	TZZ->-> ?L	r   )r   )r   r   r   r   r   r5   r   r   r   r   rJ   rK   s   @r    r   r      s    +9 r   r   c                   @   ^  \ rS rSrSrS\4U 4S jjrS rS rSr	U =r
$ )XcodecVectorQuantization   zQ
Vector quantization implementation. Currently supports only euclidean distance.
r*   c                 B   > [         TU ]  5         [        U5      U l        g rC   )r4   r5   r   codebook)r=   r*   r>   s     r    r5   !XcodecVectorQuantization.__init__   s    /7r   c                 b    UR                  SSS5      nU R                  R                  U5      nU$ Nr   r   r   )permuter   r   )r=   r   embed_ins      r    r   XcodecVectorQuantization.encode  s/    %--aA6==''6r   c                 b    U R                   R                  U5      nUR                  SSS5      nU$ r   )r   r   r   )r=   r   r   s      r    r   XcodecVectorQuantization.decode  s/    ==''	2##Aq!,r   )r   )r   r   r   r   r   r   r5   r   r   r   rJ   rK   s   @r    r   r      s#    8| 8
 r   r   c                      ^  \ rS rSrSrS\4U 4S jjrS rSS\4S jjr	SS\
R                  S\
R                  4S	 jjrS
\
R                  S\
R                  4S jrSrU =r$ ) XcodecResidualVectorQuantizationi  zn
Residual vector quantization implementation. Follows Algorithm 1 in https://huggingface.co/papers/2107.03312
r*   c                 "  > [         TU ]  5         [        R                  " [	        UR
                  5       Vs/ s H  n[        U5      PM     sn5      U l        UR                  U l        UR                  U l	        UR
                  U l        g s  snf rC   )
r4   r5   r6   rQ   rangenum_quantizersr   
quantizers
frame_rater   )r=   r*   _r>   s      r    r5   )XcodecResidualVectorQuantization.__init__  sq    --SXY_YnYnSo(pSoa)A&)ISo(pq ++#11$33 )qs   Bc                 b    [         R                  " U R                  5      U R                  -  S-  $ )zReturn bandwidth per quantizer.i  )mathlog2r   r   )r=   s    r    get_bandwidth_per_quantizer<XcodecResidualVectorQuantization.get_bandwidth_per_quantizer  s%    yy++,t>EEr   rA   c           	          U R                  5       nU R                  nUb1  US:  a+  [        [        S[        R
                  " X-  5      5      5      nU$ )z:Return num_quantizers based on specified target bandwidth.        r   )r   r   rH   r   r   floor)r=   	bandwidthbw_per_qr   s       r     get_num_quantizers_for_bandwidthAXcodecResidualVectorQuantization.get_num_quantizers_for_bandwidth  sJ    335,, Y_ Q

93G(H!IJNr   
embeddingsc                     U R                  U5      nUn/ nU R                  SU  H:  nUR                  U5      nUR                  U5      nXH-
  nUR	                  U5        M<     [
        R                  " U5      n	U	$ )z
Encode the input tensor into discrete indices using RVQ, with the number of quantizers selected based on the given bandwidth.
Each quantizer /codebook residually quantizes the input and returns the nearest indices in terms of Euclidian distance.
N)r   r   r   r   appendr   stack)
r=   r   r   r   residualall_indices	quantizerr   r   out_indicess
             r    r   'XcodecResidualVectorQuantization.encode%  s    
 >>yI.9I&&x0G!((1I+Hw'	 :
 kk+.r   codesc                     [         R                  " SUR                  S9n[        U5       HB  u  p4U R                  U   nUR                  U5      nX&R                  UR                  5      -   nMD     U$ )z9Decode the given codes to their quantized representation.r   )r   )r   tensorr   rg   r   r   r   )r=   r   quantized_outri   r   r   r   s          r    r   'XcodecResidualVectorQuantization.decode5  sb    S>#E*JA*I!((1I)LL,FFM + r   )r   r   r   r   rC   )r   r   r   r   r   r   r5   r   rH   r   r   rI   r   r   r   rJ   rK   s   @r    r   r     sa    4| 4F#  %,,  ELL U\\  r   r   c                       \ rS rSrSr\rSrSrSr	S/r
\R                  " 5       S 5       rS rS	 r\S
 5       rSS jrSrg)XcodecPreTrainedModeli?  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
xcodecinput_valuesaudior   c                 $   [        U[        R                  5      (       ac  [        R                  " UR
                  SU R                  R                  S9  UR                  b!  [        R                  " UR                  5        gg[        U[        R                  [        R                  45      (       aA  [        R                  " UR                  5        [        R                  " UR
                  5        g[        U[        R                  5      (       a  [        R                  " UR
                  5        UR                  b_  [        R                   " UR"                  UR$                  UR&                  S   -  -  5      n[        R(                  " UR                  U* US9  ggUR*                  R,                  S:X  a!  [        R                  " UR.                  5        g[        U[        R0                  5      (       a  UR3                  5         g[        U[        R4                  5      (       a!  [        R                  " UR
                  SSS9  g[        U[6        5      (       Ga  UR8                  R;                  5        Hd  n[        U[        R                  5      (       d  M$  [        R<                  " UR
                  SS9  [        R>                  " UR                  S5        Mf     UR@                  R;                  5        Hd  n[        U[        R                  5      (       d  M$  [        R<                  " UR
                  SS9  [        R>                  " UR                  S5        Mf     g[        U[B        5      (       a  [        RD                  " URF                  [H        RJ                  " S	/5      5        [        R                  " URL                  5        [        R                  " URN                  5        [        R                  " URP                  5        gg)
zInitialize the weightsr   )meanstdNr   )abSnake1dg{Gz?)r   T))
isinstancer6   Linearinitnormal_weightr*   initializer_ranger2   zeros_	LayerNorm	GroupNormones_r:   kaiming_normal_r   sqrtr1   r+   r3   uniform_r>   r   alpharu   reset_parameters	EmbeddingXcodecModelacoustic_encodermodulestrunc_normal_	constant_acoustic_decoderr   copy_r   r   rI   r   r   r   )r=   modulek	submodules       r    _init_weights#XcodecPreTrainedModel._init_weightsL  s    fbii((LLSdkk6S6ST{{&FKK( 'r|| <==KK$JJv}}%		**  /{{&IIfmmv/A/AFDVDVWXDY/YZ[fkkaR15 ' &&)3JJv||$ 2 233##%--LLSd;,, $44<<>	i33&&y'7'7TBNN9>>15 ? $44<<>	i33&&y'7'7TBNN9>>15 ?  788JJv}}ellD6&:;KK++,KK%KK(()	 9r   c                 *   [         R                  R                  R                  R                  nU" U R
                  R                  5        U" U R
                  R                  5        U R
                  R                   Hc  nU" UR                  5        UR                  UR                  UR                  4 H'  nU" UR                  5        U" UR                  5        M)     Me     U" U R                  R                  SS9  U" U R                  R                  SS9  U R                  R                   H`  nU" UR                  SS9  UR                  UR                  UR                  4 H%  nU" UR                  SS9  U" UR                  SS9  M'     Mb     g)znApply weight norm in the acoustic encoder and decoder because the original checkpoint has weight norm applied.r   nameN)r   r6   utilsparametrizationsweight_normr   r;   r<   rn   	res_unit1	res_unit2	res_unit3r   conv_t1)r=   r	  rn   res_units       r    apply_weight_norm'XcodecPreTrainedModel.apply_weight_normr  s'   hhnn55AAD))//0D))//0**00E$"__eoouOHNN+HNN+ P 1 	D))//h?D))//h?**00EH5"__eoouOHNN:HNN: P 1r   c                    U R                   U R                  4 H  nUR                  5        H  n [        R                  R
                  R                  USS9  [        US5      (       d  M?  SUR                  ;   d  MQ  [        R                  R
                  R                  R                  USSS9  M     M     g! [        [        4 a     Nsf = f)z=Remove the weight norm from the acoustic encoder and decoder.r   r  r  T)leave_parametrizedN)r   r   r   r   r6   r  remove_weight_normre   AttributeErrorhasattrr  parametrizeremove_parametrizations)r=   r   ms      r    r  (XcodecPreTrainedModel.remove_weight_norm  s    ,,d.C.CDF^^%HHNN55ah5G 1011h!BTBT6THHNN..FFq(gkFl & E #N3 s   (B99CCc                 V   ^ S[         R                  4U4S jjm[        T" U5      5      $ )z1
Recursively iterate to fetch all Conv1d layers.
r   c                    > / n[        U [        R                  5      (       a  UR                  U 5        U R	                  5        H  nUR                  T" U5      5        M     U$ rC   )r   r6   r:   r   childrenextend)r   params_listchildget_conv1d_layers_recursives      r    r   MXcodecPreTrainedModel._get_conv1d_layers.<locals>.get_conv1d_layers_recursive  sV    K&")),,""6*  *""#>u#EF + r   )r6   Moduletuple)r=   r   r   s     @r    _get_conv1d_layers(XcodecPreTrainedModel._get_conv1d_layers  s&    
			 
	 0899r   Nc                 Z    Uc  U nU R                  U5      nU H  n[        XA5      nM     U$ )z_
For a given module, compute the output length that would be obtained after all Conv1d layers.
)r$  r   )r=   input_lengthr   conv1d_layerslayers        r    _get_conv1d_output_lengths0XcodecPreTrainedModel._get_conv1d_output_lengths  s:     >F//7"E/DL # r   r   rC   )r   r   r   r   r   r   config_classbase_model_prefixmain_input_nameinput_modalities_no_split_modulesr   no_gradr  r  r  r   r$  r*  r   r   r   r    r   r   ?  se    
  L $O;<
]]_#* #*J;,	m : :&r   r   z$The Xcodec neural audio codec model.)custom_introc                     ^  \ rS rSrU 4S jr\S\R                  4S j5       rS\	R                  S\	R                  4S jr\  SS\	R                  S	\S-  S
\S-  S\	R                  \-  4S jj5       r\ SS\	R                  S
\S-  S\	R                  \-  4S jj5       r\   SS\	R                  S\	R                  S-  S	\S-  S
\S-  S\\	R                  \	R                  4   \-  4
S jj5       rSrU =r$ )r   i  c                 @  > [         TU ]  U5        Xl        UR                  S-  U l        [
        R                  " UR                  5      nUR                  U l	        UR                  U l        U R                  U R                  5        [        U5      U l        [        U5      U l        [
        R                  " UR"                  5      R%                  5       U l        [(        R*                  " UR,                  UR,                  5      U l        [(        R*                  " UR,                  UR"                  R,                  5      U l        [(        R*                  " UR,                  UR                  R,                  5      U l        [5        U5      U l        U R9                  5         g )Nr   )r4   r5   r*   
hop_lengthpadr   from_configacoustic_model_configencoderr   decoderr   _adjust_dac_decoderr^   encoder_semanticr{   decoder_semanticsemantic_model_configevalsemantic_modelr6   r   hidden_sizefcfc1fc2r   r   	post_init)r=   r*   acoustic_modelr>   s      r    r5   XcodecModel.__init__  s'    $$)"..v/K/KL . 6 6 . 6 6  !6!67 / 7 / 7'33F4P4PQVVX))F..0B0BC99V//1M1M1Y1YZ99V//1M1M1Y1YZ9&A 	r   r:  c                    U R                  5        Hi  n[        U[        R                  5      (       d  M$  [        UR                  [
        5      (       a  UR                  S   OUR                  nUS-  4Ul        Mk     [        U S5      (       aE  [        U R                  [        R                  5      (       a  [        R                  " 5       U l        ggg)z
DAC implemented in Xcodec is slightly different from the HF version.
DAC in Xcodec adjusts the output padding in every ConvTranspose1d in the decoder and removes
the final `nn.Tanh` activation function.
r   r   tanhN)r   r   r6   ru   r/   r#  rv   r  rI  TanhIdentity)r:  r   r/   s      r    r;  XcodecModel._adjust_dac_decoder  s     oo'F&""4"455-7u-M-Mq)SYS`S`)/!% ( 7F##
7<<(I(I;;=GL )J#r   r   rA   c                 P   US S 2SS S 24   n[         R                  " XR                  U R                  45      n[        R                  " 5          U R	                  USS9nUR
                  nS S S 5        [        R                  " WSS9nUR                  SS9$ ! , (       d  f       N2= f)Nr   T)output_hidden_statesr   r   )r   r6  r   r1  r@  r   r   r   )r=   r   outputsr   stackeds        r    _extract_semantic_features&XcodecModel._extract_semantic_features  s    #Aq!G,uu\HHdhh+?@]]_)),T)RG#11M  ++m3|||"" _s   B
B%Nr   return_dictc                    Ub  UOU R                   R                  nUR                  S   nUS:w  a  [        SU 35      eUc  U R                   R                  S   nO?X R                   R                  ;  a&  [        SU SU R                   R                   S35      eU R                  U5      R                  5       nU R                  UR                  SS5      5      nU R                  UR                  S   U R                  5      UR                  S   :w  a<  U R                  [        R                  " XR                  U R                  45      5      nOU R                  U5      n[        R                  " UR                  UR                   5      U/SS9nU R#                  UR                  SS5      5      R                  SS5      nU R$                  R'                  X5      n	U	R                  S	S5      n	U(       d  U	$ [)        U	5      $ )
a  
input_values (`torch.FloatTensor` of shape `(batch_size, channels, num_samples)`):
    Float values of the input audio waveform.
bandwidth (`float`, *optional*):
    The target bandwidth in (kbps) supports only values in `config.target_bandwidths`.
    Defaults to the highest available bandwidth `4.0` kbps.
return_dict (`bool`, *optional*):
    Whether or not to return a [`~utils.ModelOutput`].

Returns:
    `torch.LongTensor` of shape `(batch_size, num_quantizers, codes_length)` containing the discrete encoded audio codes.
r   zAudio must be mono, but got r   z)This model doesn't support the bandwidth z. Select one of .r   r   r   )r*   rS  r   re   target_bandwidthsrQ  detachr<  	transposer*  r   r   r6  r   catr   r   rB  r   r   r"   )
r=   r   r   rS  channelse_semantic_input
e_semantic
e_acousticr   r   s
             r    r   XcodecModel.encode  s   & &1%<k$++BYBY%%a(q=;H:FGG55b9Ikk;;;;I;FVW[WbWbWtWtVuuvw   ::<HOOQ**+;+E+Ea+KL
 **<+=+=a+@$BWBWX\f\l\lmn\oo..quu\HHdhhCW/XYJ..|<JYY
j.?.? @*MSTU
WWZ11!Q78BB1aH
nn++JB!++Aq1";//r   r   c                 >   Ub  UOU R                   R                  nUR                  SS5      nU R                  R	                  U5      nU R                  UR                  SS5      5      R                  SS5      nU R                  U5      nU(       d  U$ [        U5      $ )al  
audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`):
    Discrete code indices computed using `model.encode`.
return_dict (`bool`, *optional*):
    Whether or not to return a [`~utils.ModelOutput`]

Returns:
    Decoded audio values of shape `(batch_size, channels, num_samples)` obtained using the decoder part of
    Xcodec.
r   r   r   )r*   rS  rX  r   r   rD  r   r%   )r=   r   rS  r   quantized_acousticr   s         r    r   XcodecModel.decode  s      &1%<k$++BYBY!++Aq1NN))+6	!XXi&9&9!Q&?@JJ1aP,,-?@"<00r   c                     Ub  UOU R                   R                  nUR                  S   nUc  U R                  XSS9nU R	                  X$S9S   SSU24   nU(       d  X&4$ [        X&S9$ )a  
input_values (`torch.FloatTensor` of shape `(batch_size, channels, num_samples)`):
    The raw float values of the input audio waveform.
audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`:
    Discrete code indices computed using `model.encode`.
bandwidth (`float`, *optional*):
    Target bandwidth in kbps. Must be one of `config.target_bandwidths`. Defaults to the highest available bandwidth.
bandwidth (`float`, *optional*):
    Target bandwidth in kbps. Must be one of `config.target_bandwidths`. Defaults to the highest available bandwidth.
return_dict (`bool`, *optional*):
    Whether to return a [`XcodecOutput`] instead of a plain tuple.

Returns:
    `XcodecOutput` or tuple `(audio_codes, audio_values)`:
    - `audio_codes` of shape `(batch_size, num_quantizers, codes_length)`: the quantized discrete codes.
    - `audio_values` of shape `(batch_size, channels, num_samples)`: the reconstructed audio waveform given the codes.

Example:

```python
>>> from datasets import load_dataset
>>> from transformers import AutoFeatureExtractor, XcodecModel

>>> model_id = "hf-audio/xcodec-hubert-librispeech"
>>> model = XcodecModel.from_pretrained(model_id)
>>> feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)

>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
>>> audio_sample = dataset[0]['audio']['array']

>>> inputs = feature_extractor(raw_audio=audio_sample, return_tensors="pt")

>>> outputs = model(**inputs)
>>> audio_codes = outputs.audio_codes
>>> audio_values = outputs.audio_values
```
Nr   F)rS  r   .)r   r   )r*   rS  r   r   r   r   )r=   r   r   r   rS  lengthr   s          r    rF   XcodecModel.forward3  s    \ &1%<k$++BYBY##B'++l5+QK{{;{HKCQXRXQXLY..OOr   )r   r   r*   r=  r<  rB  rC  rD  r6  r   r@  )NNrC   )NNN)r   r   r   r   r5   staticmethodr6   r"  r;  r   r   rQ  r
   rI   floatboolr"   r   r%   r   r#  r   rF   r   rJ   rK   s   @r    r   r     sh   & )RYY ) )#u7H7H #UM^M^ #  #'#'	10ll10 4<10 D[	10
 
+	+10 10f  $(1\\1 D[1 
+	+	1 16  ,0"&#'8Pll8P \\D(8P 4<	8P
 D[8P 
u||U\\)	*\	98P 8Pr   r   ))r   r   dataclassesr   	functoolsr   r   torch.nnr6   torch.nn.functional
functionalr    r   r   audio_utilsr   modeling_utilsr   r  r	   r
   autor   configuration_xcodecr   r   r"   r%   r"  r(   rM   r^   rr   r{   r   r   r   r   r   __all__r   r   r    <module>rs     s\   !  !      & / : 0  . 
2; 
2 
2 0+ 0 0 2+ 2 2, ,6 &bii <299 >%bii %Pbii @ryy ,/ryy /d s8 s sl GHuP' uP IuPp 1
2r   