
    Z jmI                     l   S SK Jr  S SKrSSKJr  SSKJrJr  SSK	J
r
  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJrJrJrJr  SSKJrJr  SSKJr  SSK J!r!J"r"J#r#  SSK$J%r%J&r&  SSK'J(r(  SSK)J*r*J+r+J,r,  SSK-J.r.J/r/  \" 5       (       a
  S SK0r0S SK0J1r1  \Rd                  " \35      r4 " S S\&5      r5 " S S\%5      r6 " S S\(5      r7S/S jr8 " S S\*5      r9 " S  S!\1Rt                  5      r; " S" S#\5      r< " S$ S%\#5      r= " S& S'\=5      r> " S( S)\"5      r?\" S*S+9 " S, S-\!5      5       r@/ S.QrAg)0    )CallableN   )ACT2FN)
AudioInputmake_list_of_audio)Cache)BatchFeature)GradientCheckpointingLayer)BaseModelOutputWithPoolingCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringis_torch_availablelogging)can_return_tuplemerge_with_config_defaults)capture_outputs   )&AudioFlamingo3ForConditionalGeneration!AudioFlamingo3MultiModalProjectorAudioFlamingo3PreTrainedModel)AudioFlamingo3ProcessorAudioFlamingo3ProcessorKwargs)GlmRotaryEmbedding)LlamaAttentioneager_attention_forwardrotate_half   )GlmAsrConfigGlmAsrEncoderConfig)nnc                       \ rS rSrSrg)GlmAsrProcessorKwargs1    N__name__
__module____qualname____firstlineno____static_attributes__r'       z/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/glmasr/modular_glmasr.pyr%   r%   1       Cr.   r%   c            	          ^  \ rS rSrSr    SU 4S jjrSS jr SS\\\   -  \	-  S\\\   -  S-  S	\
\   S\4S
 jjrSrU =r$ )GlmAsrProcessor4   aA  
Constructs an GlmAsr processor which wraps an GlmAsr feature extractor and an GlmAsr
tokenizer into a single processor.

[`GlmAsrProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and
[`Qwen2TokenizerFast`]. See the [`~GlmAsrProcessor.__call__`] for more information.

Args:
        feature_extractor ([`WhisperFeatureExtractor`]):
            The feature extractor is a required input.
        tokenizer ([`Qwen2TokenizerFast`]):
            The tokenizer is a required input.
        chat_template (`Optional[str]`, *optional*):
            The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
            template will be used.
        audio_token (`Optional[str]`, *optional*, defaults to `"<|pad|>`"):
            Special token used to represent audio inputs in the chat template.
        default_transcription_prompt (`str`, *optional*, defaults to `"Please transcribe this audio into text"`):
            Default prompt to use for transcription tasks when applying transcription requests.
        max_audio_len (`int`, *optional*, defaults to 655):
            Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
            655 gives approximately 8192 tokens, corresponding to the maximum sequence length of the text model.
Nc           	      *   > [         TU ]  UUUUUUS9  g )N)chat_templateaudio_tokendefault_transcription_promptmax_audio_len)super__init__)selffeature_extractor	tokenizerr5   r6   r7   r8   	__class__s          r/   r:   GlmAsrProcessor.__init__M   s)     	'#)E' 	 	
r.   returnc                 d    SnS H  u  p4nUSU-  -   US-
  -
  S-
  U-  S-   nM     X-
  U-  S-   nU$ )N   )r    r   r    )r    r   r   r   r    r'   )r;   audio_lengthsmerge_factorpaddingkernel_sizestride
num_tokenss          r/   _get_audio_token_length'GlmAsrProcessor._get_audio_token_length_   sY    ,B(G&*Q[8K!OLqPU[[^__M -C $2|CaG
r.   audiopromptkwargsc           	      ~   [        U[        5      (       a  U/nO[        U[        [        45      (       a*  U(       a#  [	        S U 5       5      (       a  [        U5      nO[        [        U5      5      n[        5       (       a]  U Vs/ s HP  n[        U[        R                  5      (       a,  UR                  5       R                  5       R                  5       OUPMR     nn[        U5      nUS:X  a  [        S5      eUc  U R                  /U-  nO[        U[        5      (       a  U/U-  nO[        U[        [        45      (       a  [        U5      U:w  a  [        S[        U5       SU S35      e/ nU HT  nUc  UR                  U R                  5        M#  [        U[        5      (       a  UR                  U5        MK  [!        S5      e   O[!        S5      e[#        Xt5       V	V
s/ s H-  u  pS	[        U
[        5      (       a  S
U
S.OS
U
S.SU	S./S./PM/     nn	n
U R$                  " U4SSSS.UD6$ s  snf s  sn
n	f )a  
Prepare inputs for automatic speech recognition without manually writing the default transcription prompt.

Args:
    audio (`str`, `list[str]`, `np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
        Audio to transcribe. Strings are interpreted as local paths or URLs and will be loaded automatically by
        the chat template loader; NumPy arrays and PyTorch tensors are forwarded directly.
    prompt (`str` or `list[str]`, *optional*):
        Custom prompt(s) to include in the user turn. A list must be the same length as the batch. When `None`,
        each sample uses `"Transcribe the input speech."`.
    **kwargs:
        Additional keyword arguments forwarded to [`~GlmAsrProcessor.apply_chat_template`] (for example
        `text_kwargs`, `audio_kwargs`, ...).

Returns:
    [`BatchFeature`]: Processor outputs ready to be passed to [`GlmAsrForConditionalGeneration.generate`].

c              3   B   #    U  H  n[        U[        5      v   M     g 7fN)
isinstancestr).0els     r/   	<genexpr>>GlmAsrProcessor.apply_transcription_request.<locals>.<genexpr>   s     ?d^cXZ
2s@S@S^cs   r   z)`audio` must contain at least one sample.z	Received z prompt(s) for z$ audio sample(s); counts must match.z'Each prompt must be a string or `None`.z<`prompt` must be a string, a sequence of strings, or `None`.userrL   )typepath)rY   rL   text)rY   r[   )rolecontentT)tokenizeadd_generation_promptreturn_dict)rR   rS   listtupleallr   r   torchTensordetachcpunumpylen
ValueErrorr7   append	TypeErrorzipapply_chat_template)r;   rL   rM   rN   audio_itemsrU   
batch_sizepromptsitemprompt_text
audio_itemconversationss               r/   apply_transcription_request+GlmAsrProcessor.apply_transcription_requestg   s/   2 eS!!38'Ke}--%C?d^c?d<d<du+K1%89K!##kvwkvegJr5<<<X<Xryy{0668^``kvw%
?HII>889JFG$$h+Gu..6{j( F}OJ<Gkl  G<NN4#D#DEc**NN4(#$MNN  Z[[ ,/w+D
 ,E' # &j#66 ")*=&-
C!'=	 
 ,E 	 
 ''
"&	

 
 	
S x4
s   AH4&4H9r'   )Nz<|pad|>z&Please transcribe this audio into texti  )rD   torch.Tensorr@   rx   rQ   )r)   r*   r+   r,   __doc__r:   rJ   rS   ra   r   r   r%   r	   rv   r-   __classcell__r>   s   @r/   r2   r2   4   s{    8 %M
$ *.O
T#Y+O
 d3i$&O
 ./	O

 
O
 O
r.   r2   c                       \ rS rSrSrg)GlmAsrRotaryEmbedding   r'   Nr(   r'   r.   r/   r}   r}      s    r.   r}   c                 R   UR                  U5      nUR                  U5      nUR                  S   nU SS U24   U SUS 24   pUSS U24   USUS 24   pXr-  [        U5      U-  -   nX-  [        U	5      U-  -   n[        R                  " X/SS9n[        R                  " X/SS9nX4$ )N.)dim)	unsqueezeshaper   rd   cat)qkcossinposition_idsunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds                r/   apply_rotary_pos_embr      s    
--
&C
--
&C2Jc;J;&'3
+;)<6c;J;&'3
+;)<6 {{51C78G{{51C78G ii)r2Gii)r2Gr.   c                      ^  \ rS rSrS\S\4U 4S jjr SS\R                  S\	\R                  \R                  4   S-  S\
\   S	\	\R                  \R                  4   4S
 jjrSrU =r$ )GlmAsrAttention   config	layer_idxc                   > [         TU ]  X5        SU l        [        R                  " UR
                  UR                  U R                  -  SS9U l        [        R                  " UR
                  UR                  U R                  -  SS9U l
        [        R                  " UR
                  UR                  U R                  -  SS9U l        [        R                  " UR                  U R                  -  UR
                  SS9U l        g )NFT)bias)r9   r:   	is_causalr#   Linearhidden_sizenum_attention_headshead_dimq_projnum_key_value_headsk_projv_projo_projr;   r   r   r>   s      r/   r:   GlmAsrAttention.__init__   s    +ii 2 2F4N4NQUQ^Q^4^eijii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^eijii : :T]] JFL^L^eijr.   Nhidden_statesposition_embeddingsrN   r@   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nUu  p[        XgX5      u  pg[        R                  " U R                  R                  [        5      nU" U UUU4S U R                  (       d  SOU R                  U R                  S.UD6u  pUR                   " / UQSP76 R#                  5       nU R%                  U5      nX4$ )Nr   r    r   g        )attention_maskdropoutscaling)r   r   r   view	transposer   r   r   r   get_interfacer   _attn_implementationr   trainingattention_dropoutr   reshape
contiguousr   )r;   r   r   rN   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   attention_interfaceattn_outputattn_weightss                 r/   forwardGlmAsrAttention.forward   s\    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&#7RU#[ (?(M(MKK,,.E)
 %8		%

  #}}C$2H2HLL	%
 	%
! "));;;;FFHkk+.((r.   )r   r   r   r   r   rQ   r)   r*   r+   r,   r!   intr:   rd   re   rb   r   r   r   r-   rz   r{   s   @r/   r   r      s    k| k k IM!)||!) #5<<#=>E!) +,	!)
 
u||U\\)	*!) !)r.   r   c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )	GlmAsrMLP   c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  5      U l        [        UR                     U l
        g rQ   )r9   r:   r#   r   r   intermediate_sizefc1fc2r   
hidden_actact_fnr;   r   r>   s     r/   r:   GlmAsrMLP.__init__   s\    99V//1I1IJ99V55v7I7IJV../r.   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rQ   )r   r   r   )r;   r   s     r/   r   GlmAsrMLP.forward  s2    /M2/r.   )r   r   r   )
r)   r*   r+   r,   r:   rd   re   r   r-   rz   r{   s   @r/   r   r      s    0U\\  r.   r   c            	          ^  \ rS rSrS\S\4U 4S jjr SS\R                  S\	\R                  \R                  4   S-  S\
\   S	\R                  4S
 jjrSrU =r$ )GlmAsrEncoderLayeri	  r   r   c                   > [         TU ]  5         UR                  U l        [        XS9U l        [        U5      U l        [        R                  " UR                  5      U l	        [        R                  " UR                  5      U l
        g )N)r   r   )r9   r:   r   r   	self_attnr   mlpr#   	LayerNorminput_layernormpost_attention_layernormr   s      r/   r:   GlmAsrEncoderLayer.__init__
  sb    !--(LV$!||F,>,>?(*V5G5G(H%r.   Nr   r   rN   r@   c                     UnU R                  U5      nU R                  " SUUS.UD6u  pXA-   nUnU R                  U5      nU R                  U5      nXA-   nU$ )N)r   r   r'   )r   r   r   r   )r;   r   r   rN   residual_s         r/   r   GlmAsrEncoderLayer.forward  s|     !,,];>> 
' 3
 

 !0 !55mD/ 0r.   )r   r   r   r   r   rQ   r   r{   s   @r/   r   r   	  su    I| I I IM|| #5<<#=>E +,	
 
 r.   r   c                       \ rS rSrSrg)GlmAsrPreTrainedModeli,  r'   Nr(   r'   r.   r/   r   r   ,  r0   r.   r   c                      ^  \ rS rSr% \\S'   SrSrS/r\	\
S.rS\4U 4S jjr\\\S\\   4S	 j5       5       5       rS
rU =r$ )GlmAsrEncoderi0  r   input_featuresrL   r   )r   
attentionsc           	        > [         TU ]  U5        [        R                  " UR                  UR
                  SSS9U l        [        R                  " UR
                  UR
                  SSSS9U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        R                  " UR
                  5      U l        [        US9U l        SU l        U R%                  5         g s  snf )Nr   r    )rG   rF   r   )rG   rH   rF   )r   F)r9   r:   r#   Conv1dnum_mel_binsr   conv1conv2
ModuleListrangenum_hidden_layersr   layersr   normr}   
rotary_embgradient_checkpointing	post_initr   s      r/   r:   GlmAsrEncoder.__init__:  s     YYv22F4F4FTU_`a
YYv1163E3EST]^hij
mmDI&JbJbDcdDcy2Dcd
 LL!3!34	/v>&+# es   DrN   c                    [         R                  R                  U R                  U5      5      n[         R                  R                  U R	                  U5      5      nUR                  SS5      nUnU R                  U[        R                  " UR                  S   UR                  S9S S S 24   S9nU R                   H  nU" U4SU0UD6nM     U R                  U5      n[        US9$ )Nr    r   device)r   r   )last_hidden_state)r#   
functionalgelur   r   r   r   rd   aranger   r   r   r   r   )r;   r   rN   inputs_embedsr   r   encoder_layers          r/   r   GlmAsrEncoder.forwardG  s     **4::n+EF**4::m+DE%//15%"oo]5H5H5KTaThTh(ijnpqjq(r . 
 "[[M)-kM`kdjkM ) 		-0)MJJr.   )r   r   r   r   r   r   )r)   r*   r+   r,   r"   __annotations__main_input_nameinput_modalities_no_split_modulesr   r   _can_record_outputsr:   r   r   r   r   r   r   r-   rz   r{   s   @r/   r   r   0  sl    &O-.+%
2   K7I0J K    Kr.   r   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )GlmAsrMultiModalProjectori[  r   c                 >  > [         TU ]  5         [        R                  " UR                  R
                  UR                  R                  S-  5      U l        [        R                  " UR                  R                  S-  UR                  R                  5      U l	        g )Nr   )
r9   r:   r#   r   audio_configr   text_configr   linear_1linear_2r   s     r/   r:   "GlmAsrMultiModalProjector.__init__\  sm    		&"5"5"G"GI[I[IgIgjkIkl		&"4"4"@"@1"DfFXFXFdFder.   )r  r  )r)   r*   r+   r,   r!   r:   r-   rz   r{   s   @r/   r   r   [  s    f| f fr.   r   z~
    The GlmAsr model which consists of a fine-tuned Whisper encoder, a multi-modal projector and a Llama language model.
    custom_introc                     ^  \ rS rSrSr\\" SS9S\R                  S\R                  S\
\   S\\-  4S	 j5       5       r          SS\R                  S
-  S\R                  S
-  S\R                  S
-  S\R                  S
-  S\R                  S
-  S\S
-  S\R                  S
-  S\R                  S
-  S\S
-  S\\R                  -  S\
\   S\4U 4S jjjrSrU =r$ )GlmAsrForConditionalGenerationib  TzgCompute audio embeddings from log-mel input features using the audio encoder and multi-modal projector.r  r   input_features_maskrN   r@   c                 $   U R                   " U4SS0UD6nUR                  nUR                  UR                  S   SU R                  R
                  R                  5      nU R                  U5      nUR                  S5      nS H  u  pn
USU-  -   U	S-
  -
  S-
  U
-  S-   nM     SnX{-
  U-  S-   n[        R                  " UR                  S   UR                  S	9S S S 24   US S 2S 4   :  nXmR                  UR                  5         Ul        U$ )
Nr`   Tr   r   rC   r   r    rB   r   )audio_towerr   r   r   r   r   r   multi_modal_projectorsumrd   r   r   topooler_output)r;   r   r	  rN   audio_outputsaudio_hidden_statesaudio_embedsrD   rF   rG   rH   rE   post_lengths
valid_masks                 r/   get_audio_features1GlmAsrForConditionalGeneration.get_audio_featuresj  s.    ((TTTVT+==199  #R)A)A)S)S
 112EF+//3,B(G&*Q[8K!OLqPU[[^__M -C%4EI\\,"4"4Q"7@S@STUY[\U\]`lmnptmt`uu
&2==ATAT3U&V#r.   N	input_idsr   r   past_key_valuesr   labels	use_cachelogits_to_keepc                 6   > [         TU ]  " SUUUUUUU	U
S.UD6$ )ap  
input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
    Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import GlmAsrForConditionalGeneration, AutoProcessor

>>> model_id = "zai-org/GLM-ASR-Nano-2512"
>>> processor = AutoProcessor.from_pretrained(model_id)
>>> model = GlmAsrForConditionalGeneration.from_pretrained(model_id, dtype="auto", device_map="auto")
>>> inputs = processor.apply_transcription_request("https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3")

>>> inputs = inputs.to(model.device, dtype=model.dtype)

>>> outputs = model.generate(**inputs, do_sample=False, max_new_tokens=500)

>>> decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1] :], skip_special_tokens=True)
>>> print(decoded_outputs)
```)r  r   r   r  r   r  r  r  r'   )r9   r   )r;   r  r   r	  r   r   r  r   r  r  r  rN   r>   s               r/   r   &GlmAsrForConditionalGeneration.forward  s>    T w 

)%+')

 

 
	
r.   r'   )
NNNNNNNNNr   )r)   r*   r+   r,   _supports_attention_backendr   r   rd   FloatTensorre   r   r   rb   r   r  
LongTensorr   boolr   r   r   r-   rz   r{   s   @r/   r  r  b  si    #'~)) #\\ +,	
 
+	+ 4 .23737.204(,26*.!%-.4
##d*4
 ))D04
 #\\D0	4

 t+4
 &&-4
 4
 ((4/4
   4'4
 $;4
 ell*4
 +,4
 
 4
 4
r.   r  )r   r  r2   r   )Nr    )Bcollections.abcr   rh   npactivationsr   audio_utilsr   r   cache_utilsr   feature_extraction_utilsr	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   &audioflamingo3.modeling_audioflamingo3r   r   r   (audioflamingo3.processing_audioflamingo3r   r   glm.modeling_glmr   llama.modeling_llamar   r   r   configuration_glmasrr!   r"   rd   r#   
get_loggerr)   loggerr%   r2   r}   r   r   Moduler   r   r   r   r   r  __all__r'   r.   r/   <module>r8     s$   %  ! 9   4 9 R 5 & T T I 5 
 n 1 W W C  
		H	% @9 ?B
- B
J 5. 4$*)n *)Z		  3  F @9 ?(K) (KVf A f 
S
%K S

S
l jr.   