
    Z j9                         S SK r S SKrSSKJrJr  SSKJr  SSKJ	r	J
r
Jr  SSKJr  SSKJrJr  \" 5       (       a  S SKr\R$                  " \5      r " S S	\	S
S9r " S S\
5      rS/rg)    N   )
AudioInputmake_list_of_audio)BatchFeature)ProcessingKwargsProcessorMixinUnpack)	TextInput)is_torch_availableloggingc                   2    \ rS rSrSS0SSSS.SSS	.S
.rSrg)GlmAsrProcessorKwargs(   paddingTi>  
max_length)sampling_ratereturn_attention_maskr   ptleft)return_tensorspadding_side)text_kwargsaudio_kwargscommon_kwargs N)__name__
__module____qualname____firstlineno__	_defaults__static_attributes__r       }/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/glmasr/processing_glmasr.pyr   r   (   s2     t
 #%)#
 #"
Ir"   r   F)totalc                      ^  \ rS rSrSr    SU 4S jjrSS jrS rS r  SS
\	\
\	   -  S\S-  S\S-  S\\   S\4
S jjr\S\
\   4S j5       r SS\\
\   -  \-  S\\
\   -  S-  S\\   S\4S jjrS	S.S jrS rS
\S\4S jrSrU =r$ )GlmAsrProcessor9   aA  
Constructs an GlmAsr processor which wraps an GlmAsr feature extractor and an GlmAsr
tokenizer into a single processor.

[`GlmAsrProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and
[`Qwen2TokenizerFast`]. See the [`~GlmAsrProcessor.__call__`] for more information.

Args:
        feature_extractor ([`WhisperFeatureExtractor`]):
            The feature extractor is a required input.
        tokenizer ([`Qwen2TokenizerFast`]):
            The tokenizer is a required input.
        chat_template (`Optional[str]`, *optional*):
            The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
            template will be used.
        audio_token (`Optional[str]`, *optional*, defaults to `"<|pad|>`"):
            Special token used to represent audio inputs in the chat template.
        default_transcription_prompt (`str`, *optional*, defaults to `"Please transcribe this audio into text"`):
            Default prompt to use for transcription tasks when applying transcription requests.
        max_audio_len (`int`, *optional*, defaults to 655):
            Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
            655 gives approximately 8192 tokens, corresponding to the maximum sequence length of the text model.
Nc                 r   > X@l         UR                  U5      U l        XPl        X`l        [
        TU ]  XUS9  g )N)chat_template)audio_tokenconvert_tokens_to_idsaudio_token_iddefault_transcription_promptmax_audio_lensuper__init__)selffeature_extractor	tokenizerr)   r*   r-   r.   	__class__s          r#   r0   GlmAsrProcessor.__init__R   s=     ''==kJ,H)**]Sr"   returnc                 d    SnS H  u  p4nUSU-  -   US-
  -
  S-
  U-  S-   nM     X-
  U-  S-   nU$ )N   ))   r   r9   )r9   r      r:   r9   r   )r1   audio_lengthsmerge_factorr   kernel_sizestride
num_tokenss          r#   _get_audio_token_length'GlmAsrProcessor._get_audio_token_lengtha   sY    ,B(G&*Q[8K!OLqPU[[^__M -C $2|CaG
r"   c                    [         R                  " [         R                  " UR                  S5      U5       Vs/ s H  oDR                  5       PM     sn5      nU R	                  U5      n[
        R                  " [
        R                  " U R                  5      5      n[        U5       H(  u  pUR                  U R                  U	-  X   5      X'   M*     U$ s  snf )N)torchstacksplitsumr@   recompileescaper*   	enumeratesub)
r1   textpadding_maskper_sample_windowssr;   audio_tokens_lengthsaudio_token_patterniaudio_lengths
             r#   _expand_audio_tokens$GlmAsrProcessor._expand_audio_tokensi   s    ekk,BRBRSUBVXj6k$l6kUUW6k$lm#;;MJ jj43C3C)DE()=>OA)--d.>.>.MtwWDG  ? %ms   Cc                     XR                   :H  $ N)r,   )r1   	input_idss     r#   _get_audio_tokens_mask&GlmAsrProcessor._get_audio_tokens_maskq   s    ////r"   FrM   audiooutput_labelskwargsc           
         U R                   " [        4SU R                  R                  0UD6nUS   nUS   nUR	                  S5      nUS:w  a"  [        U R                  R                   S35      e[        U[        5      (       a  U/nO=[        U[        [        45      (       a  [        S U 5       5      (       d  [        S5      e0 n	UGb  [        U5      n[        U5      [        U5      :w  a$  [        S	[        U5       S
[        U5       S35      e[        US   U R                   R"                  -  5      n
[        U R$                  U R                   R"                  -  5      n/ n/ nU H  n[        UR&                  S   5      n[)        SX-   S-
  U
-  5      nUU:  a;  [*        R-                  SXS   -  S SU R$                   SU R$                   S35        UnUR/                  U5        [1        UUU
-  5      n[3        U5       H.  nUU
-  n[1        US-   U
-  U5      nUR/                  UUU 5        M0     M     U R                   " U40 UD6n	U	R5                  S5      nUU	S'   U R7                  UUU5      nU R                  " U40 UD6n0 UEU	EnU(       aH  US   R9                  5       nSUU R;                  U5      '   SUUU R                  R<                  :H  '   UUS'   [?        UUS9$ )a  
Main method to prepare one or several text sequence(s) and audio waveform(s) for the model. This
method expands `<sound>` placeholders in the text based on the post-pool frame counts of the
audio windows, then tokenizes the provided strings as-is, and extracts log-mel features
with [`WhisperFeatureExtractor`]. If `audio` is `None`, no audio processing is performed and
the text is tokenized as-is (LM-only behavior).

Args:
    text (`str` or `list[str]`):
        Input sequence or batch of sequences.
    audio (`np.ndarray` or `list[np.ndarray]`):
        Input audio or batch of audios as NumPy arrays. If provided, there must be as many `text` inputs as
        `audio` inputs.
    output_labels (bool, *optional*, default=False):
        Whether to return labels for training.

Returns:
    [`BatchFeature`]: A dictionary with tokenized text (`input_ids`, `attention_mask`) and
    audio features (`input_features`, `input_features_mask`).
tokenizer_init_kwargsr   r   r   r   z% only supports `return_tensors='pt'`.c              3   B   #    U  H  n[        U[        5      v   M     g 7frX   
isinstancestr).0ts     r#   	<genexpr>+GlmAsrProcessor.__call__.<locals>.<genexpr>   s     9[VZQR*Q:L:LVZ   zAInvalid input text. Please provide a string, or a list of stringszGot z
 text but z audios; they must match 1:1.r   r   r9   zAudio duration (z.1fzs) exceeds zs; truncating to first zs.attention_maskinput_features_maskrY   ilabels)datatensor_type) _merge_kwargsr   r3   init_kwargsget
ValueErrorr4   r   rc   rd   listtupleallr   lenintr2   chunk_lengthr.   shapemaxloggerwarningappendminrangepoprU   clonerZ   pad_token_idr   )r1   rM   r\   r]   r^   call_kwargsr   r   r   audio_inputswindow_sizemax_windowsrO   flat_chunksaudio_el	n_samplesn_wintime_caprS   startendrN   text_inputsrm   rl   s                            r#   __call__GlmAsrProcessor.__call__t   s!   : ((!
"&.."<"<
 
 "-0">2$)9:T! 7 788]^__dC  6DTD%=11c9[VZ9[6[6[`aa&u-E4yCJ& 4D	{*SZLHe!fgg l?;d>T>T>a>aabKd00D4J4J4W4WWXK,.,.K!q 12	A	 7! ;KL;&NN*9O7T+TUX*YYdeiewewdx  yP  QU  Qc  Qc  Pd  df  g (E"))%0y%+*=>uAOEq1u3X>C&&xc':; & ""  11+NNL'++,<=L2>L./ ,,T<ASTD nnT9[9.+..+&,,.F:>F4..v67<@F6T^^8889#DN>BBr"   c                     U R                   R                  nU R                  R                  n[        [        R                  X-   S/-   5      5      $ )Nrk   )r3   model_input_namesr2   rs   dictfromkeys)r1   	tok_names	fea_namess      r#   r   !GlmAsrProcessor.model_input_names   sB    NN44	**<<	DMM)"7;P:Q"QRSSr"   promptc           	      ~   [        U[        5      (       a  U/nO[        U[        [        45      (       a*  U(       a#  [	        S U 5       5      (       a  [        U5      nO[        [        U5      5      n[        5       (       a]  U Vs/ s HP  n[        U[        R                  5      (       a,  UR                  5       R                  5       R                  5       OUPMR     nn[        U5      nUS:X  a  [        S5      eUc  U R                  /U-  nO[        U[        5      (       a  U/U-  nO[        U[        [        45      (       a  [        U5      U:w  a  [        S[        U5       SU S35      e/ nU HT  nUc  UR                  U R                  5        M#  [        U[        5      (       a  UR                  U5        MK  [!        S5      e   O[!        S5      e[#        Xt5       V	V
s/ s H-  u  pS	[        U
[        5      (       a  S
U
S.OS
U
S.SU	S./S./PM/     nn	n
U R$                  " U4SSSS.UD6$ s  snf s  sn
n	f )a  
Prepare inputs for automatic speech recognition without manually writing the default transcription prompt.

Args:
    audio (`str`, `list[str]`, `np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
        Audio to transcribe. Strings are interpreted as local paths or URLs and will be loaded automatically by
        the chat template loader; NumPy arrays and PyTorch tensors are forwarded directly.
    prompt (`str` or `list[str]`, *optional*):
        Custom prompt(s) to include in the user turn. A list must be the same length as the batch. When `None`,
        each sample uses `"Transcribe the input speech."`.
    **kwargs:
        Additional keyword arguments forwarded to [`~GlmAsrProcessor.apply_chat_template`] (for example
        `text_kwargs`, `audio_kwargs`, ...).

Returns:
    [`BatchFeature`]: Processor outputs ready to be passed to [`GlmAsrForConditionalGeneration.generate`].

c              3   B   #    U  H  n[        U[        5      v   M     g 7frX   rb   )re   els     r#   rg   >GlmAsrProcessor.apply_transcription_request.<locals>.<genexpr>   s     ?d^cXZ
2s@S@S^cri   r   z)`audio` must contain at least one sample.z	Received z prompt(s) for z$ audio sample(s); counts must match.z'Each prompt must be a string or `None`.z<`prompt` must be a string, a sequence of strings, or `None`.userr\   )typepath)r   r\   rM   )r   rM   )rolecontentT)tokenizeadd_generation_promptreturn_dict)rc   rd   rs   rt   ru   r   r   rD   Tensordetachcpunumpyrv   rr   r-   r}   	TypeErrorzipapply_chat_template)r1   r\   r   r^   audio_itemsr   
batch_sizepromptsitemprompt_text
audio_itemconversationss               r#   apply_transcription_request+GlmAsrProcessor.apply_transcription_request   s/   2 eS!!38'Ke}--%C?d^c?d<d<du+K1%89K!##kvwkvegJr5<<<X<Xryy{0668^``kvw%
?HII>889JFG$$h+Gu..6{j( F}OJ<Gkl  G<NN4#D#DEc**NN4(#$MNN  Z[[ ,/w+D
 ,E' # &j#66 ")*=&-
C!'=	 
 ,E 	 
 ''
"&	

 
 	
S x4
s   AH4&4H9)strip_prefixc                    U R                   R                  " U0 UD6nU(       a   U Vs/ s H  oPR                  U5      PM     nnU$ s  snf )aB  
Forward arguments to [`~PreTrainedTokenizer.decode`] and optionally remove the assistant framing the model
was trained to produce.

AF3 transcription requests respond with sentences such as `"The spoken content of the audio is "..."."`.
Setting `strip_prefix=True` trims the fixed prefix for just the transcription text.
)r3   decode"_strip_assistant_prefix_and_quotes)r1   r   argsr^   decodedrM   s         r#   r   GlmAsrProcessor.decode*  sJ     ..''88QXYQX>>tDQXGY Zs   Ac                 &    U R                   " U0 UD6$ )z)BC as previous examples used batch_decode)r   )r1   r   r^   s      r#   batch_decodeGlmAsrProcessor.batch_decode7  s    {{D+F++r"   c                 `   UR                  5       nS H7  nUR                  U5      (       d  M  U[        U5      S R                  5       n  O   UR                  S5      (       a  USS R                  5       n[        U5      S:  a(  US   US   :X  a  US   S;   a  USS R                  5       nU$ )	zY
Remove the assistant prefix and surrounding quotes from a decoded transcription string.
)z"The spoken content of the audio isz!The transcription of the audio isz!The content of the input audio isN.rC   r:   r   >   "'r9   )strip
startswithrv   endswith)r1   rM   strippedprefixs       r#   r   2GlmAsrProcessor._strip_assistant_prefix_and_quotes;  s    
 ::<
F
 ""6**#CKM288:
 S!!}**,Hx=A(1+""=(1+Q[B["~++-Hr"   )r*   r,   r-   r.   )Nz<|pad|>z&Please transcribe this audio into texti  )r;   torch.Tensorr6   r   )NFrX   )r   r   r   r   __doc__r0   r@   rU   rZ   r
   rs   r   boolr	   r   r   r   propertyrd   r   r   r   r   r   r!   __classcell__)r4   s   @r#   r&   r&   9   s,   8 %MT0 $(%*	]C$y/)]C D ]C d{	]C
 ./]C 
]C~ T49 T T *.O
T#Y+O
 d3i$&O
 ./	O

 
O
b */ ,s s  r"   r&   )rH   r   npaudio_utilsr   r   feature_extraction_utilsr   processing_utilsr   r   r	   tokenization_utils_baser
   utilsr   r   rD   
get_loggerr   r{   r   r&   __all__r   r"   r#   <module>r      sg   , 
  9 4 H H 0 0  
		H	%,E "Xn Xv 
r"   