
    Z j5                        S SK r S SKJr  S SKJr  S SKrSSKJrJ	r	J
r
  \
" 5       (       a  S SKr\	" 5       (       a  S SKrSSKJrJr  SSKJr  SSKJrJrJrJr  SS	KJrJr   " S
 S\SS9r " S S\SS9r\ " S S\5      5       rS/rg)    N)Path)Any   )auto_docstringis_soundfile_availableis_torch_available)
AudioInputmake_list_of_audio)BatchFeature)AudioKwargsProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInputc                   4    \ rS rSr% Sr\\\4   S-  \S'   Sr	g)CsmAudioKwargs$   a  
encoded_length_kwargs (`dict[str, Any]`, *optional*):
    Dictionary of keyword arguments used to compute the encoded audio sequence length. This includes parameters
    such as `kernel_sizes`, `strides`, `dilations`, and `use_causal_conv` that define the convolutional layers
    used in audio encoding. The encoded length is used to determine how many audio tokens to generate for each
    audio input in the text sequence.
Nencoded_length_kwargs )
__name__
__module____qualname____firstlineno____doc__dictstrr   __annotations____static_attributes__r       w/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/csm/processing_csm.pyr   r   $   s      S>D00r    r   F)totalc                   T    \ rS rSr% \\S'   SSSS./ SQ/ SQ/ S	QSS
.SS.SS0S.rSrg)CsmProcessorKwargs0   audio_kwargsTleftF)paddingpadding_sideadd_special_tokens)   r         r   r,   
   r   r,      r   r,      r      )r,   r,   r,   r1   r,   r,      r,   r,      r,   r,   r-   r,      )r,   r,   r,   r,   r,   r,   r,   r,   r,   r,   r,   r,   r,   r,   r,   )kernel_sizesstrides	dilationsuse_causal_convi]  )r   sampling_ratereturn_tensorspt)text_kwargsr&   common_kwargsr   N)r   r   r   r   r   r   	_defaultsr   r   r    r!   r$   r$   0   sG       ""'
 !QHJ#'	& #
 +D1Ir    r$   c                      ^  \ rS rSr SU 4S jjr\SS j5       rS\S\\	-  \
\\	-     -  S\\   4S jr\   SS	\\-  \
\   -  \
\   -  S-  S\S-  S
\S-  S\S-  S\\   4
S jj5       r\S 5       rSrU =r$ )CsmProcessorE   Nc                   > [        US5      (       d(  SU l        UR                  U R                  5      U l        O"UR                  U l        UR                  U l        [        US5      (       d(  SU l        UR                  U R                  5      U l        O"UR                  U l        UR
                  U l        [        TU ]  XUS9  g )Naudio_tokenz	<|AUDIO|>audio_eos_tokenz<|audio_eos|>)chat_template)hasattrrC   convert_tokens_to_idsaudio_token_idrD   audio_eos_token_idsuper__init__)selffeature_extractor	tokenizerrE   	__class__s       r!   rK   CsmProcessor.__init__G   s     y-00*D"+"A"A$BRBR"SD(44D"+":":Dy"344#2D &/&E&EdFZFZ&[D##,#<#<D &/&B&BD#*]Sr    c                 >   U nUb	  Ub  Ub  Uc  U$ [        XU5       H|  u  pgnUS-
  U-  S-   n	Xg-
  n
U
S-  nX-
  nXY-
  U
-   U-  S-   n[        R                  " U5      S-
  nX-  U-   U
-
  nX-
  nU(       a  U
nUnOX-   nX\-   U-   nXXUS-
  -  -
  S-
  U-  S-   nM~     U$ )aD  
Compute the length of the encoded audio sequence.

Args:
    audio_length (int): The length of the audio sequence.
    kernel_sizes (list[int]): The kernel sizes for the convolutional layers.
    strides (list[int]): The strides for the convolutional layers.
    use_causal_conv (bool): Whether to use causal convolutions.
r,   r4   )zipmathceil)audio_lengthr5   r6   r7   r8   
cur_lengthkernel_sizestridedilationeffective_kernel_sizepadding_totalpadding_rightpadding_leftn_framesideal_lengthextra_paddings                   r!   _get_encoded_length CsmProcessor._get_encoded_length]   s     "
7?i6G?Kb-0	-R)K%01_$@1$D!'0M)Q.M(8L":]JfTWXXHyy*Q.H#,{:]JL(5M, - - =#2]BJ$;?'CCaGFRUVVJ% .S( r    audiosaving_pathkwargsc                 z   [        5       (       d  [        S5      e[        U5      n[        U[        [
        45      (       a  U/nO=[        U[        [        45      (       a  [        S U 5       5      (       d  [        S5      e[        U5      [        U5      :w  a  [        S5      eU R                  " [        40 UD6nUS   nUS   n[        X5       Hg  u  px[        U[        R                  5      (       a,  UR!                  5       R#                  5       R%                  5       n[&        R(                  " XU5        Mi     g )Nz/Please install `soundfile` to save audio files.c              3   N   #    U  H  n[        U[        [        45      v   M     g 7fN)
isinstancer   r   ).0ps     r!   	<genexpr>*CsmProcessor.save_audio.<locals>.<genexpr>   s#     @qep`aAPSUY{A[A[eps   #%zAInvalid input path. Please provide a string, or a list of stringsz5The number of audio and saving paths must be the samer&   r9   )r   ImportErrorr
   ri   r   r   listtupleall
ValueErrorlen_merge_kwargsr$   rR   torchTensorcpufloatnumpysfwrite)	rL   rc   rd   re   output_kwargsr&   r9   audio_valuerk   s	            r!   
save_audioCsmProcessor.save_audio   s	    &''OPP #5) kC;//&-K[4-88S@qep@q=q=q`aau:[))TUU**

 %^4$_5!%5NK+u||44)oo/557==?HHQ]3 6r    textoutput_labelsdepth_decoder_labels_ratioc                 
   U R                   " [        4SU R                  R                  0UD6nUS   nUS   nUR	                  SS5      n	U	S:w  a"  [        U R                  R                   S35      e[        U[        5      (       a  U/nO=[        U[        [        45      (       a  [        S U 5       5      (       d  [        S	5      eU V
s/ s H  oR                  U R                  5      PM     nn
S
nUb  [        U5      n[!        U5      n[#        U5      S
:  a/  U[#        U5      :w  a   Uc  [        S5      e[        SU SU S35      eUGb  UR%                  S0 5      nU Vs/ s H"  oR&                  " UR(                  S   40 UD6PM$     nnUR+                  5       n/ nU H  n/ nU R                  U;   a`  UR%                  S
5      nU R                  U-  nUR-                  U5        UR/                  U R                  SS5      nU R                  U;   a  M`  SU;   a*  UR/                  SUR%                  S
5      S5      nSU;   a  M*  UR-                  U5        M     UnU R                  " U40 UD6n0 nUR1                  U5        UGb  UR%                  SS5        / / nnS
nU GH*  nUS
:X  aM  UR-                  [2        R4                  " S
5      5        UR-                  [6        R8                  " S/5      5        MW  UR-                  [2        R:                  " UUUU-     Vs/ s HB  n[        U[6        R<                  5      (       a  UR?                  5       RA                  5       OUPMD     snSS95        UR-                  [6        R8                  " UUUU-     Vs/ s H  nUR(                  S   PM     sn5      RC                  SS95        UU-  nGM-     U RD                  " U40 UD6nUR%                  SS5        UR1                  U5        [G        S U 5       5      nU Vs/ s H>  n[6        RH                  RJ                  RM                  US
UUR(                  S   -
  4SS9PM@     nn[6        RN                  " US
S9US'   U(       a  US   U RP                  :H  RS                  5       nUR(                  S
   n US::  a.  [6        RT                  " U 5      S[W        U SU-
  -  5       n!UU!   n"OUn"[6        RX                  " US   U RP                  :H  US   U RZ                  :H  -  US   S5      n#SU#U"SS2S
4   U"SS2S4   4'   U#US'   []        UU	S9$ s  sn
f s  snf s  snf s  snf s  snf ) a  
output_labels (bool, *optional*, default=False):
    Whether to return labels for training. Indices will be in `[config.audio_token_id, -100, -101]`.
    - `config.audio_token_id` indicates an audio frame (considering sequence length elements as frames)
    - `-100` will be ignored in the loss computation
    - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels)
depth_decoder_labels_ratio (float, *optional*, default=1.0):
    The ratio of audio frames to keep for the depth decoder labels.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
    - **input_values** -- List of audio values to be fed to a model. Returned when `audio` is not `None`.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **labels** -- List of labels for the audio frames. Returned when `output_labels=True`.
tokenizer_init_kwargsr<   r&   r:   Nr;   z% only supports `return_tensors='pt'`.c              3   B   #    U  H  n[        U[        5      v   M     g 7frh   )ri   r   )rj   ts     r!   rl   (CsmProcessor.__call__.<locals>.<genexpr>   s     9[VZQR*Q:L:LVZs   zAInvalid input text. Please provide a string, or a list of stringsr   z@No audio were provided, but there are audio tokens in the promptz)The number of audio tokens in each text (z7) should be the same as the number of provided audios (z).r   z<placeholder>r,   return_attention_mask)axis)dimpadding_maskc              3   >   #    U  H  oR                   S    v   M     g7f)r   N)shape)rj   cut_idxss     r!   rl   r     s     R=Q..,=Qs   )valueinput_values_cutoffs	input_ids      ?iilabels)datatensor_type)/rt   r$   rN   init_kwargsgetrr   rO   r   ri   r   ro   rp   rq   countrC   r
   rs   sumpopra   r   copyappendreplaceupdatenpzerosru   tensorconcatenaterv   rw   ry   cumsumrM   maxnn
functionalpadstackrH   nonzerorandpermintwhererI   r   )$rL   r   rc   r   r   re   r|   r<   r&   r:   r   n_audio_in_textn_audior   audio_arraynum_audio_tokens_listnum_audio_tokens_list_copyexpanded_textsamplereplace_strnum_audio_tokensexpanded_audio_tokenencodingr   concatenated_audior   offsetelaudio_inputsmax_lenr   audio_frame_idxsn_audio_frames	rand_idxsskip_frames_idxsr   s$                                       r!   __call__CsmProcessor.__call__   s   : **
"&.."<"<
 
 $M2$^4$)94@T! 7 788]^__dC  6DTD%=11c9[VZ9[6[6[`aa>BCd774#3#34dC&u-E%jG!#33G(G} !cdd ??P Q229"> 
 $0$4$45Lb$Q!lq%lq]h(():):2)>XBWXlq " % *?)C)C)E& M &&&0'A'E'Ea'H$+/+;+;>N+N(&&';<#^^D,<,<oqQF &&&0 &/#^^O[__Q=OQRSF &/$$V,  !D>>$6+6H4d;792 4F*a<&--bhhqk:(//bT0BC&-- +0'9I*J*JB 5?r5<<4P4P 0VX X*J "$ )//U6FU\L\=]%^=]rbhhrl=]%^_ffkmfn g%F# +&  112DUUL^T2KK% R=QRRG !5$ 4H ##''1gr@R6R2S[]'^ 4 ! $ ,1;;7KQR+SD'( $[ 1T5H5H HQQS-33A6N)S0!NN>:;sSSTWqSqAr=st	#3I#> #3 [[k"d&9&99d;>OSWSjSj>jk[!F
 FJF#AqD)+;AqD+AAB#DN>BBG D$%L &_$s    $U%)U A	UUAU c                     U R                   R                  nU R                  R                  nU Vs/ s H  o3S:w  d  M
  UPM     nn[        X-   S/-   5      $ s  snf )Nr   r   )rN   model_input_namesrM   ro   )rL   tokenizer_input_namesfeature_extractor_input_namesnames       r!   r   CsmProcessor.model_input_names7  sb     $ @ @(,(>(>(P(P% ;X(r:W$cq[q:W%(r)IMcLddee )ss
   	AA)rD   rI   rC   rH   rh   )NNNN)NFr   )r   r   r   r   rK   staticmethodra   r	   r   r   ro   r   r$   r~   r   r   r   boolrx   r   propertyr   r   __classcell__)rO   s   @r!   r@   r@   E   s    	T, # #J 4 4 4Z$sTz"22 4 +,	 4D  $(%*36OC++d9o=EV@WWZ^^OC D OC d{	OC
 %*DLOC +,OC OCb f fr    r@   )rS   pathlibr   typingr   ry   r   utilsr   r   r   ru   	soundfilerz   audio_utilsr	   r
   feature_extraction_utilsr   processing_utilsr   r   r   r   tokenization_utils_baser   r   r   r$   r@   __all__r   r    r!   <module>r      s        O O  9 4 U U C	1[ 	1) * yf> yf yfx 
r    