
    Z j                        S SK Jr  S SKrS SKrS SKJr  S SKJs  Jr	  S SK
Jr  S SKJr  SSKJr  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJr  SSKJ r   SSK!J"r"J#r#  SSK$J%r%  SSK&J'r'J(r(  SSK)J*r*  SSK+J,r,J-r-  SSK.J/r/J0r0J1r1J2r2J3r3  SSK4J5r5J6r6  SSK7J8r8  SSK9J:r:  SSK;J<r<J=r=J>r>J?r?  SSK@JArAJBrBJCrCJDrDJErEJFrFJGrGJHrHJIrIJJrJ  SSKKJLrL  SSKMJNrNJOrO  \2R                  " \Q5      rR\0" SS9\ " S S \5      5       5       rS\0" SS9\ " S! S"\5      5       5       rT\0" SS9\ " S# S$\5      5       5       rU " S% S&\=5      rV " S' S(\E5      rW " S) S*\A5      rX " S+ S,\B5      rY " S- S.\R                  5      r[ " S/ S0\R                  5      r\ " S1 S2\I5      r] " S3 S4\J5      r^ " S5 S6\>5      r_S7 r`SRS8 jra " S9 S:\R                  5      rb " S; S<\<5      rc " S= S>\ 5      rd " S? S@\F5      re " SA SB\G5      rf " SC SD\f5      rg " SE SF\H5      rh " SG SH\L5      ri " SI SJ\C5      rj " SK SL\D5      rk " SM SN\O5      rl " SO SP\N5      rm/ SQQrng)S    )CallableN)strict)	LayerNorm   )initialization)ACT2FN)CacheDynamicCache)PreTrainedConfig)BatchFeature)
ImageInput)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPooling)RopeParameters)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)PreTokenizedInput	TextInput)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check)maybe_autocastmerge_with_config_defaults)capture_outputs)
VideoInput   )Glm4MLPGlm4RMSNormGlm4RotaryEmbeddingeager_attention_forward)
Qwen2_5_VisionPatchEmbedQwen2_5_VisionRotaryEmbedding Qwen2_5_VLCausalLMOutputWithPast"Qwen2_5_VLForConditionalGenerationQwen2_5_VLMLPQwen2_5_VLModelOutputWithPastQwen2_5_VLPreTrainedModelQwen2_5_VLTextModelQwen2_5_VLVisionAttentionQwen2_5_VLVisionBlock)Qwen2VLModel)Qwen2VLProcessorQwen2VLProcessorKwargszzai-org/GLM-4.1V-9B-Thinking)
checkpointc                   N   \ rS rSr% SrSrSrSr\\	S'   Sr
\\	S'   S	r\\	S
'   Sr\\	S'   Sr\\-  \	S'   Sr\\	S'   Sr\\	S'   Sr\\\   -  \\\4   -  \	S'   Sr\\\   -  \\\4   -  \	S'   Sr\\	S'   Sr\\	S'   Sr\\\   -  \\\4   -  \	S'   Sr\\	S'   Sr\\	S'   S r\\	S!'   S"rg#)$Glm4vVisionConfigF   a  
out_hidden_size (`int`, *optional*, defaults to 4096):
    The output hidden size of the vision model.

Example:

```python
>>> from transformers import Glm4vVisionConfig, Glm4vVisionModel

>>> # Initializing a Glm4vVisionConfig GLM-4.1V-9B style configuration
>>> configuration = Glm4vVisionConfig()

>>> # Initializing a model (with random weights) from the GLM-4.1V-9B configuration
>>> model = Glm4vVisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```glm4v_visionvision_config   depthi   hidden_sizesilu
hidden_actFattention_bias        attention_dropout   	num_headsr   in_channelsiP  
image_size   
patch_sizeh㈵>rms_norm_epsr"   spatial_merge_sizetemporal_patch_size   out_hidden_size5  intermediate_size{Gz?initializer_range N)__name__
__module____qualname____firstlineno____doc__
model_typebase_config_keyr;   int__annotations__r<   r>   strr?   boolrA   floatrC   rD   rE   listtuplerG   rI   rJ   rK   rM   rO   rQ   __static_attributes__rR       x/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/glm4v/modular_glm4v.pyr6   r6   F   s    &  J%OE3OKJ ND %(us{(IsK47Jd3i%S/1746Jd3i%S/16L%=>tCy5c?:>OS"s"#u#rb   r6   c                   \  ^  \ rS rSr% SrSrSrS/rSSSSSS	S
.rS/S/4SS/S/4S/S/4S.r	S1r
Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\S-  \S'   Sr\\S'   S r\\S!'   S"r\\S#'   S$r\\S%'   S&r\\S''   S(r\\-  \S)'   Sr\\-  S-  \S*'   Sr\S-  \S+'   U 4S, jr S-r!U =r"$ ).Glm4vTextConfigp   a\  
Example:

```python
>>> from transformers import Glm4vTextModel, Glm4vConfig

>>> # Initializing a GLM-4.1V style configuration
>>> configuration = Glm4vConfig()

>>> # Initializing a model from the GLM-4.1V style configuration
>>> model = Glm4vTextModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```
glm4v_texttext_configpast_key_valuescolwiserowwisecolwise_gather_outputrowwise_split_input)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormmrope_sectioni P 
vocab_sizerL   r<   rN   rO   (   num_hidden_layers    num_attention_headsr"   Nnum_key_value_headsr=   r>   i   max_position_embeddingsrP   rQ   rH   rI   T	use_cacher@   rA   rope_parameterspad_token_idc                 b   > U R                   c  U R                  U l         [        TU ]  " S0 UD6  g )NrR   )r{   rz   super__post_init__selfkwargs	__class__s     rc   r   Glm4vTextConfig.__post_init__   s-    ##+'+'?'?D$''rb   )r{   )#rS   rT   rU   rV   rW   rX   rY   keys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planignore_keys_at_rope_validationrv   rZ   r[   r<   rO   rx   rz   r{   r>   r\   r|   rQ   r^   rI   r}   r]   rA   r~   r   dictr   r   ra   __classcell__r   s   @rc   re   re   p   s-     J#O#4"5 &/%.%.%.%<"7 &(9:#%568IJ!"_$56
 '6%6"JK"s"s!!&'t'J#(S(#u#L%It%(us{(48O^d*T18#L#*#( (rb   re   c                      ^  \ rS rSr% SrSr\\S.rS/r	Sr
\\-  S-  \S'   Sr\\-  S-  \S'   S	r\\S
'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   U 4S jrSrU =r$ )Glm4vConfig   aU  
image_start_token_id (`int`, *optional*, defaults to 151339):
    The image start token index to encode the start of image.
image_end_token_id (`int`, *optional*, defaults to 151340):
    The image end token index to encode the end of image.
video_start_token_id (`int`, *optional*, defaults to 151341):
    The video start token index to encode the start of video.
video_end_token_id (`int`, *optional*, defaults to 151342):
    The video end token index to encode the end of video.

```python
>>> from transformers import Glm4vForConditionalGeneration, Glm4vConfig

>>> # Initializing a GLM-4.1V style configuration
>>> configuration = Glm4vConfig()

>>> # Initializing a model from the GLM-4.1V style configuration
>>> model = Glm4vForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```glm4v)r9   rh   ri   Nrh   r9   i/O image_token_idi0O video_token_idi+O image_start_token_idi,O image_end_token_idi-O video_start_token_idi.O video_end_token_idFtie_word_embeddingsc                   > [        U R                  [        5      (       a%  U R                  S   " S0 U R                  D6U l        O'U R                  c  U R                  S   " S0 UD6U l        [        U R                  [        5      (       a%  U R                  S   " S0 U R                  D6U l        O'U R                  c  U R                  S   " S0 UD6U l        [
        TU ]  " S0 UD6  g )Nr9   rh   rR   )
isinstancer9   r   sub_configsrh   r   r   r   s     rc   r   Glm4vConfig.__post_init__   s    d(($//!%!1!1/!B!XTEWEW!XD'!%!1!1/!B!LV!LDd&&--#//>RAQAQRD%#//>HHD''rb   )rh   r9   )rS   rT   rU   rV   rW   rX   r6   re   r   r   rh   r   r   r[   r9   r   rZ   r   r   r   r   r   r   r]   r   ra   r   r   s   @rc   r   r      s    . J$5oVK#4"526K((4/648M4**T18 NC  NC  &#&$$ &#&$$ %%( (rb   r   c                       \ rS rSrSrg)Glm4vRMSNorm   rR   NrS   rT   rU   rV   ra   rR   rb   rc   r   r          rb   r   c                   4   ^  \ rS rSrSS\4U 4S jjjrSrU =r$ )Glm4VisionMlp   biasc                 F   > [         TU ]  X5        UR                  U l        g N)r   __init__rM   rO   )r   configr   r   s      rc   r   Glm4VisionMlp.__init__   s    &!'!7!7rb   )rO   F)rS   rT   rU   rV   r]   r   ra   r   r   s   @rc   r   r      s    8T 8 8rb   r   c                   &    \ rS rSrS\SS4S jrSrg)Glm4vVisionPatchEmbed   r   returnNc                 n   [         R                  R                  U 5        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        U R                  U R                  U R                  /n[         R                  " U R
                  U R                  X"S9U l	        g )N)kernel_sizestride)
nnModuler   rG   rK   rD   r<   	embed_dimConv3dproj)r   r   r   s      rc   r   Glm4vVisionPatchEmbed.__init__   s    
		4  ++#)#=#= !--++//$//RIId..Kl	rb   )r   rD   rG   r   rK   )rS   rT   rU   rV   r6   r   ra   rR   rb   rc   r   r      s    m0 mT mrb   r   c                       \ rS rSrSrg)Glm4vVisionRotaryEmbedding   rR   Nr   rR   rb   rc   r   r      r   rb   r   c                   ~   ^  \ rS rSrSS\S\S\S\SS4
U 4S jjjrS	\R                  S\R                  4S
 jr
SrU =r$ )Glm4vVisionPatchMerger   dimcontext_dimr>   r   r   Nc                 b  > [         TU ]  5         [        R                  " XUS9U l        [        U5      U l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " X!US9U l	        [        R                  " 5       U l        [        U   U l        g )Nr   )r   r   r   Linearr   r   post_projection_norm	gate_projup_proj	down_projGELUact1r   act_fn)r   r   r   r>   r   r   s        rc   r   Glm4vVisionPatchMerger.__init__   s{    IIcT2	$-cN!3$?yy=;$?GGI	Z(rb   hidden_statec                     U R                  U5      nU R                  U R                  U5      5      nU R                  U R	                  U R                  U5      5      U R                  U5      -  5      $ r   )r   r   r   r   r   r   r   )r   r   s     rc   forwardGlm4vVisionPatchMerger.forward  sY    yy.yy!:!:<!HI~~dkk$..*FG$,,WcJddeerb   )r   r   r   r   r   r   r   r   )rS   rT   rU   rV   rZ   r\   r]   r   torchTensorr   ra   r   r   s   @rc   r   r      sU    )C )c )s )$ )[_ ) )fELL fU\\ f frb   r   c                   R   ^  \ rS rSrS\4U 4S jjrS\R                  4S jrSr	U =r
$ )Glm4vVisionEmbeddingsi  r   c                 f  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        U R
                  U R                  -  S-  U l        U R                  U l        [        R                  " U R                  U R                  5      U l        SU l        g )Nr"   bicubic)r   r   r   r<   r   rE   rG   num_patchesnum_positionsr   	Embeddingposition_embeddinginterpolated_methodr   r   r   s     rc   r   Glm4vVisionEmbeddings.__init__  s    ++ ++ ++ OOt>1D!--"$,,t/A/A4>>"R#, rb   r   c           	      "   U R                   R                  nUR                  S   nUR                  n[	        U[
        5      (       a#  [        R                  " X([        R                  S9nUR                  S   n	[        U	S-  5      n
UR                  XU5      R                  SSS5      R                  S5      R                  U[        R                  S9n[        R                  " [!        [#        U5      5       Vs/ s H  oUS4   R%                  X,   5      PM     sn5      R                  U[        R                  S9n[        R                  " [!        [#        U5      5       Vs/ s H  oUS4   R%                  X,   5      PM     sn5      R                  U[        R                  S9nUS-   U-  S-  S-
  nUS-   U-  S-  S-
  n[        R&                  " UU4SS9R                  S5      R                  S5      n[(        R*                  " UUU R,                  SS	S
9nUR/                  S5      R/                  S5      R                  SS5      nUR                  UR0                  5      R                  UR                  5      nUU-   nU$ s  snf s  snf )aQ  
Forward pass with integrated position encoding adaptation using 2D interpolation.

Args:
    embeddings: Input embeddings tensor
    lengths (torch.Tensor): Sequence lengths for each image in the batch.
    image_shapes (torch.Tensor): Tensor of shape [batch_size, 3] representing the image shapes (t, h, w).
    h_coords (torch.Tensor): Tensor of shape [total_seq] representing the h coordinate for each patch.
    w_coords (torch.Tensor): Tensor of shape [total_seq] representing the w coordinate for each patch.

Returns:
    torch.Tensor: Embeddings with adapted position encoding added.
   )devicedtyper   g      ?r"   r   Fborder)modealign_cornerspadding_mode)r   weightshaper   r   r_   r   tensorlongrZ   viewpermute	unsqueezetofloat32catrangelenrepeatstackFgrid_sampler   squeezer   )r   
embeddingslengthsimage_shapesh_coordsw_coordspos_embed_weightr<   r   orig_size_sq	orig_sizepos_embed_2ditarget_htarget_wnorm_wnorm_hgridinterpolated_embed_fp32adapted_pos_embed_fp32adapted_pos_embeds                        rc   r   Glm4vVisionEmbeddings.forward  sa     2299&,,Q/!(( gt$$ll7LG (--a0c)*	!!)DWQ1Yq\RvU]]R3	 	 99USVW^S_M`aM`1a4077
CM`abee f 
 99USVW^S_M`aM`1a4077
CM`abee f 

 c>X-2Q6c>X-2Q6 {{FF+4>>qAKKAN #$--$T%=%=Uai#

 "9!@!@!C!K!KB!O!W!WXY[\!]2556F6L6LMPPQ[QbQbc  "33
3 b bs   ;!J%!J)r   r   rE   r   r   r   rG   r   )rS   rT   rU   rV   r6   r   r   r   r   ra   r   r   s   @rc   r   r     s(    
-0 
-;PUP\P\ ; ;rb   r   c                   4   ^  \ rS rSrS\SS4U 4S jjrSrU =r$ )Glm4vVisionAttentioniY  r   r   Nc                   > [         TU ]  U5        UR                  U l        [        R                  " UR
                  UR
                  S-  UR                  S9U l        [        R                  " UR
                  UR
                  SS9U l        g )Nr   r   F)	r   r   rA   r   r   r<   r?   qkvr   r   s     rc   r   Glm4vVisionAttention.__init__Z  si     !'!9!999V//1C1Ca1GfNcNcdIIf00&2D2D5Q	rb   )rA   r   r  )rS   rT   rU   rV   r6   r   ra   r   r   s   @rc   r  r  Y  s     R0 RT R Rrb   r  c                   ,   ^  \ rS rSrSU 4S jjrSrU =r$ )Glm4vVisionBlockia  c                    > [         TU ]  U5        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        [        U5      U l        [        USS9U l
        g )NepsFr   )r   r   r   r<   rI   norm1norm2r  attnr   mlpr   s     rc   r   Glm4vVisionBlock.__init__b  s^     !&"4"4&:M:MN
!&"4"4&:M:MN
(0	 e4rb   )r  r  r  r  r   N)rS   rT   rU   rV   r   ra   r   r   s   @rc   r  r  a  s    5 5rb   r  c                   @   ^  \ rS rSrSS\4U 4S jjjrS rS rSrU =r	$ )Glm4vTextRotaryEmbeddingij  r   c                 h   > [         TU ]  5         UR                  R                  S/ SQ5      U l        g )Nru   )   rB   rB   )r   r   r~   getru   )r   r   r   r   s      rc   r   !Glm4vTextRotaryEmbedding.__init__k  s)    #3377Urb   c                 Z   U R                   S S S S 2S 4   R                  5       R                  SUR                  S   SS5      nUS S 2S S 2S S S 24   R                  5       n[	        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS9   UR                  5       UR                  5       -  R                  SS5      nU R                  X`R                  5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR!                  5       U R                  -  n	S S S 5        WR#                  UR$                  S
9W	R#                  UR$                  S
94$ ! , (       d  f       N@= f)Nr   r   r   mpscpuF)device_typeenabledr"   r   r   )inv_freqr^   expandr   r   r   typer\   r   	transposeapply_mroperu   r   r   cosattention_scalingsinr   r   )
r   xposition_idsinv_freq_expandedposition_ids_expandedr%  freqsembr-  r/  s
             rc   r    Glm4vTextRotaryEmbedding.forwardo  sZ    !MM$a*=>DDFMMaQ]QcQcdeQfhjlmn ,Q4] ; A A C'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E$$U,>,>?E))UN3C'')d444C'')d444C D vvAGGv$cff177f&;;; DCs   BF
F*c           	          UnUR                  USS9n[        R                  " [        U5       VVs/ s H  u  pVXeS-     PM     snnSS9nU$ s  snnf )Nr   r   r   )splitr   r   	enumerate)r   r4  ru   sectionchunksr  chunkresults           rc   r,  $Glm4vTextRotaryEmbedding.apply_mrope  sS    W"-69JK9JXQEa%L9JKQST Ls   A
)ru   r   )
rS   rT   rU   rV   re   r   r   r,  ra   r   r   s   @rc   r  r  j  s&    V V V<  rb   r  c                 x    U SSSS24   nU SSSS24   n[         R                  " U* U4SS9R                  S5      $ )	z*Rotates half the hidden dims of the input..r   Nr"   r   r   r   )r   r   flatten)r0  x1x2s      rc   rotate_half_llmrD    sJ    	
319B	
319B;;Ryb)11"55rb   c                    UR                  U5      nUR                  U5      nUSSUR                  S   S-  24   R                  SSS9nUSSUR                  S   S-  24   R                  SSS9nUR                  S   nU SSU24   U SUS24   pvUSSU24   USUS24   pXb-  [        U5      U-  -   n
X-  [        U5      U-  -   n[        R
                  " X/SS9n
[        R
                  " X/SS9nX4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
.Nr   r"   r   )r   r   repeat_interleaverD  r   r   )qkr-  r/  unsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds               rc   apply_rotary_pos_embrQ    s6   $ --
&C
--
&C c'SYYr]a'''
(
:
:1"
:
EC
c'SYYr]a'''
(
:
:1"
:
EC 2Jc;J;&'3
+;)<6c;J;&'3
+;)<6 {u5;<G{u5;<G ii)r2Gii)r2Grb   c                   :  ^  \ rS rSrSrSS\S\S-  4U 4S jjjr   SS\R                  S\
\R                  \R                  4   S-  S	\R                  S-  S
\S-  S\\   S\
\R                  \R                  S-  \
\R                     S-  4   4S jjrSrU =r$ )Glm4vTextAttentioni  zz
Multi-headed attention from 'Attention Is All You Need' paper.
and "Generating Long Sequences with Sparse Transformers".
Nr   	layer_idxc                 r  > [         TU ]  5         Xl        X l        UR                  U l        UR
                  U l        U R                  U R                  -  U l        UR                  U l        U R                  U R                  -  U l	        SU l
        UR                  U l        UR                  U l        U R                  S-  U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  -  U R                  SS9U l        g )NTg      r   F)r   r   r   rT  r<   rz   rC   head_dimr{   num_key_value_groups	is_causalrA   r~   scalingr   r   q_projk_projv_projo_projr   r   rT  r   s      rc   r   Glm4vTextAttention.__init__  sE   "!--33((DNN:#)#=#= $(NNd6N6N$N!!'!9!9%55}}d*ii 0 0$..4==2PW[\ii 0 0$2J2JT]]2Zaefii 0 0$2J2JT]]2Zaefii >@P@PW\]rb   rp   position_embeddingsrq   ri   r   r   c                 >   UR                  5       u  pgnU R                  U5      n	U R                  U5      n
U R                  U5      nU	R	                  XgSU R
                  5      R                  SS5      n	U
R	                  XgSU R
                  5      R                  SS5      n
UR	                  XgSU R
                  5      R                  SS5      nUu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [        5      nU" U U	U
UU4U R                  (       d  SOU R                   U R"                  S.UD6u  nnUR%                  XgS5      R'                  5       nU R)                  U5      nUU4$ )Nr   r   r"   r@   )dropoutrY  )sizerZ  r[  r\  r   rV  r+  rQ  updaterT  r   get_interfacer   _attn_implementationr&   trainingrA   rY  reshape
contiguousr]  )r   rp   r`  rq   ri   r   bszq_len_query_states
key_statesvalue_statesr-  r/  attention_interfaceattn_outputattn_weightss                    rc   r   Glm4vTextAttention.forward  s    &**,A{{=1[[/
{{=1#((RGQQRSUVW__ST]]CMMaQRS
#((RGQQRSUVW&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ "))#b9DDFkk+.L((rb   )rA   r   rV  r<   rX  r[  rT  rC   rW  r{   r]  rZ  r~   rY  r\  r   NNN)rS   rT   rU   rV   rW   re   rZ   r   r   r   r`   r	   r   r   r   ra   r   r   s   @rc   rS  rS    s    
^ ^3: ^ ^. IM.2(,))||)) #5<<#=>E)) t+	))
 )) -.)) 
u||U\\D0%2E2LL	M)) ))rb   rS  c                       \ rS rSrSrg)Glm4vTextMLPi  rR   Nr   rR   rb   rc   rv  rv    r   rb   rv  c                   T  ^  \ rS rSrS\S\4U 4S jjr\     SS\R                  S\
\R                  \R                  4   S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\
\R                  \
\R                  \R                  4   S-  4   4S jj5       rSrU =r$ )Glm4vTextDecoderLayeri  r   rT  c                   > [         TU ]  5         UR                  U l        [        X5      U l        [        U5      U l        [        UR                  UR                  S9U l	        [        UR                  UR                  S9U l
        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )Nr  )r   r   r<   rS  	self_attnrv  r  r   rI   input_layernormpost_attention_layernormpost_self_attn_layernormpost_mlp_layernormr^  s      rc   r   Glm4vTextDecoderLayer.__init__   s    !--+F>'+F,>,>FDWDWX(4V5G5GVM`M`(a%(4V5G5GVM`M`(a%".v/A/AvGZGZ"[rb   Nrp   r`  rq   r1  ri   r}   r   c           
          UnU R                  U5      nU R                  " SUUUUUUS.UD6u  pU R                  U5      nX-   nUnU R                  U5      nU R	                  U5      nU R                  U5      nX-   nU$ )N)rp   r`  rq   r1  ri   r}   rR   )r{  rz  r}  r|  r  r~  )
r   rp   r`  rq   r1  ri   r}   r   residualrl  s
             rc   r   Glm4vTextDecoderLayer.forward
  s     !,,];  >> 
' 3)%+
 
 55mD 0 !55mD///> 0rb   )r<   r{  r  r|  r~  r}  rz  )NNNNF)rS   rT   rU   rV   re   rZ   r   r   r   r   r`   
LongTensorr	   r]   FloatTensorr   ra   r   r   s   @rc   rx  rx    s    \ \3 \  IM.204(,!&#||# #5<<#=>E# t+	#
 &&-# # $;# 
u  %(9(95;L;L(L"MPT"TT	U# #rb   rx  c                       \ rS rSrSrg)Glm4vModelOutputWithPasti1  rR   Nr   rR   rb   rc   r  r  1  r   rb   r  c                   "    \ rS rSrSS/rS rSrg)Glm4vPreTrainedModeli5  rx  r  c           	      6   [         R                  " X5        [        U[        5      (       an  SUR                  [
        R                  " SUR                  S[
        R                  S9UR                  -  -  -  n[        R                  " UR                  U5        g g )Ng      ?r   r"   r'  )r   _init_weightsr   r   thetar   aranger   r^   initcopy_r(  )r   moduler(  s      rc   r  "Glm4vPreTrainedModel._init_weights8  sn    %%d3f899fllu||Avzz1TYT_T_/`cicmcm/mnoHJJv1 :rb   rR   N)rS   rT   rU   rV   _no_split_modulesr  ra   rR   rb   rc   r  r  5  s    02DE2rb   r  c                      ^  \ rS rSr% \\S'   SrS/r\\	S.r
SU 4S jjrS r\\\S	\R"                  S
\R"                  S\\   S\\-  4S j5       5       5       rSrU =r$ )Glm4vVisionModeli?  r   )imagevideor  rp   
attentionsr   c                 8  > [         TU ]  U5        UR                  U l        UR                  U l        [	        U5      U l        [        U5      U l        UR                  UR                  -  n[        US-  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[!        U5      PM     sn5      U l        [%        UR&                  UR(                  UR*                  S9U l        [/        UR                  UR0                  S9U l        [        R4                  " UR                  UR&                  UR                  UR                  S9U l        [/        UR                  UR0                  S9U l        SU l        U R=                  5         g s  snf )Nr"   )r   r   r>   r  )rD   out_channelsr   r   F)r   r   rJ   rG   r   r   r   patch_embedr<   rC   r   rotary_pos_embr   
ModuleListr   r;   r  blocksr   rM   rO   r>   mergerr   rI   post_conv_layernormConv2d
downsamplepost_layernormgradient_checkpointing	post_init)r   r   rV  rl  r   s       rc   r   Glm4vVisionModel.__init__H  sG    "(";"; ++/708%%)9)998QGmmuV\\GZ$[GZ!%5f%=GZ$[\,&&F4L4LY_YjYj
 $00B0BH[H[#\ ))**//11,,	
 +6+=+=6CVCVW&+# %\s   &Fc                    / nU GHn  u  p4n[         R                  " U5      R                  S5      R                  SU5      nUR	                  X@R
                  -  U R
                  XPR
                  -  U R
                  5      nUR                  SSSS5      nUR                  5       n[         R                  " U5      R                  S5      R                  US5      nUR	                  X@R
                  -  U R
                  XPR
                  -  U R
                  5      nUR                  SSSS5      nUR                  5       nUR                  [         R                  " Xg/SS9R                  US5      5        GMq     [         R                  " USS9nUS S 2SS 24   R                  5       nU R                  U5      n	X   R                  S5      n
X4$ )Nr   r   r   r"   r   r   )r   r  r   r)  rh  rJ   r   rA  appendr   r   r   maxr  )r   grid_thwpos_idsthwhpos_idswpos_idsmax_grid_sizerotary_pos_emb_fullr  s              rc   rot_pos_embGlm4vVisionModel.rot_pos_embd  s   GA!||A003::2qAH'',,,'',,,''	H  ''1a3H'')H||A003::1bAH'',,,'',,,''	H  ''1a3H'')HNN5;;';DKKAqQR)  * ))G+ AB++-"11-@,5==a@&&rb   rp   r  r   c           	      f   U R                  U5      nU R                  U5      nU R                  U5      u  pE[        R                  " XD4SS9nUR                  5       UR                  5       4n[        R                  " USS2S4   USS2S4   -  USS2S4   5      R                  S[        R                  R                  5       (       a  UR                  O[        R                  S9n[        R                  " USSS	9nUSS USS -
  R                  5       n	U R!                  UU	UUSS2S4   R#                  UR$                  5      USS2S4   R#                  UR$                  5      5      nU R&                   H  n
U
" U4UUS
.UD6nM     U R)                  U5      nUR+                  SU R,                  U R,                  UR.                  S   5      nUR1                  SSSS5      nU R3                  U5      R+                  SU R4                  R6                  5      nU R9                  U5      n[;        UUS9$ )a$  
hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
    The final hidden states of the model.
grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
    The temporal, height and width of feature shape of each image in LLM.

Returns:
    `torch.Tensor`: hidden_states.
r   r   Nr   r"   r   )r   r   )r   r   )value)
cu_seqlensr`  r   )last_hidden_statepooler_output)r  r  r  r   r   r-  r/  rF  cumsumjit
is_tracingr   int32r   padtolistr   r   r   r  r  r   rJ   r   r   r  r   rM   r  r   )r   rp   r  r   r  image_type_idsr5  r`  r  seqlensblkmerged_hidden_statess               rc   r   Glm4vVisionModel.forward  s     ((700?)-)9)9()C&ii8bA"wwy#'')4,,Xad^hq!tn-LhWXZ[W[n]dd
 %*II$8$8$:$:(.. e 

 UU:vQ7
ab>JsO3;;=1a4 ##M$8$891a4 ##M$8$89
 ;;C%$7 	M  ++M:%**'')@)@-BUBUVXBY
 &--aAq96;;B@[@[\#{{=9)+.
 	
rb   )r  r  r   r  r  r  rG   r  r  r  rJ   r  )rS   rT   rU   rV   r6   r[   input_modalitiesr  r  r  _can_record_outputsr   r  r   r    r   r   r   r   r   r`   r   r   ra   r   r   s   @rc   r  r  ?  s    )+,)*
8':  9
"\\9
5:\\9
MSTfMg9
	+	+9
    9
rb   r  c                     ^  \ rS rSr\\S.rS\4U 4S jjr\	\
\      SS\R                  S-  S\R                  S-  S\R                  S-  S	\S-  S
\R                   S-  S\S-  S\\   S\\-  4S jj5       5       5       rSrU =r$ )Glm4vTextModeli  r  r   c           	      (  > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        [        US9U l        U ?U ?g s  snf )Nr  r   )r   r   r   r  r   rx   rx  rs   r   r<   rI   rt   r  
rotary_embrf  has_sliding_layersr^  s      rc   r   Glm4vTextModel.__init__  s     mmGLVMeMeGfgGf)"65Gfg
 !!3!39L9LM	2&A%# hs   BNrn   rq   r1  ri   ro   r}   r   r   c           	      l   US L US L-  (       a  [        S5      eU(       a9  Uc6  [        R                  R                  5       (       d  [	        U R
                  S9nUc  U R                  U5      nUcv  Ub  UR                  5       OSn[        R                  " UR                  S   UR                  S9U-   nUR                  SSS5      R                  SUR                  S   S5      nO3UR                  S:X  a#  US	   R                  SUR                  S   S5      nUR                  S:X  a  UR                  S   S
:X  a  US   n	USS  nOS n	U R
                  UUUU	S.n
[        S0 U
D6nUnU R                  XS9nU R                    H  nU" U4UU	UUS.UD6nUnM     U R#                  U5      n[%        UUS9$ )N:You must specify exactly one of input_ids or inputs_embedsr  r   r   )r   r   r   r"   )N.   )r   ro   rq   ri   r1  )r1  )rq   r1  ri   r`  )r  ri   rR   )
ValueErrorr   r  r  r
   r   rr   get_seq_lengthr  r   r   r   r)  ndimr   r  rs   rt   r   )r   rn   rq   r1  ri   ro   r}   r   past_seen_tokenstext_position_idsmask_kwargscausal_maskrp   r`  decoder_layerlayer_outputss                   rc   r   Glm4vTextModel.forward  s    -t";<YZZ 09M9M9O9O*$++>O  --i8M CRC^==?de <<(;(;A(>}G[G[\_ooL',,Q26==aATATUVAWY[\L!#'	299!\=O=OPQ=RTVWL !l&8&8&;q&@ ,Q'+L !% kk*,.-
 )7;7%"oomoW![[M)*. /$7 M *M ) 		-0&++
 	
rb   )rs   rt   r  )NNNNNN)rS   rT   rU   rV   rx  rS  r  re   r   r   r   r    r   r  r   r	   r  r]   r   r   r`   r   r   ra   r   r   s   @rc   r  r    s    .(
$ $  .2.204(,26!%J
##d*J
 t+J
 &&-	J

 J
 ((4/J
 $;J
 -.J
 
(	(J
    J
rb   r  c                     ^  \ rS rSrSS/rU 4S jr\\ SS\R                  S\R                  S-  S\\   S	\\-  4S
 jj5       5       r  SS\R                  S\R                  S\R                  S-  S\R                  S-  4S jjr SS\R                  S-  S	\\R"                  \R"                  4   4U 4S jjjr\\           SS\R                  S-  S\R"                  S-  S\R                  S-  S\S-  S\R                  S-  S\R"                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R(                  S-  S\\   S	\\-  4S jj5       5       rSrU =r$ )
Glm4vModeli   rx  r  c                 l   > [         TU ]  U5        [        R                  UR                  5      U l        g r   )r   r   r  _from_configr9   visualr   s     rc   r   Glm4vModel.__init__#  s(     &33F4H4HIrb   Npixel_values_videosvideo_grid_thwr   r   c                 2   UR                  U R                  R                  5      n/ nUR                  5       nU HN  u  pgn[        R
                  " SXx/5      R                  S5      R                  US5      n	UR                  U	5        MP     [        R                  " USS9n
U R                  " U4U
SS.UD6nUR                  S5      U R                  R                  S-  -  R                  5       n[        R                  " UR                  U5      nXl        U$ )a3  
pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
    The tensors corresponding to the input videos.
video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
    The temporal, height and width of feature shape of each video in LLM.
r   r   r   T)r  return_dictr   r"   )r*  r  r   r  r   r   r   r   r  r   prodrJ   r8  r  )r   r  r  r   temp_frames_hwvideo_grid_thw_listr  r  r  repeated_rowflattened_video_grid_thwvision_outputssplit_sizesvideo_embedss                 rc   get_video_featuresGlm4vModel.get_video_features'  s    266t{{7H7HI,335*GA! <<A	2<<Q?FFq!LL!!,/ + $)99^#C 
*BPT
X^
 &**2.$++2P2PRS2SS[[]{{>#?#?M'3$rb   rn   ro   image_featuresvideo_featuresc           	      D   Uc  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nX R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nO0XR                  R                  :H  nXR                  R                  :H  nUR                  5       nUR                  S5      R                  U5      R                  UR                  5      nUb@  [        X%   R                  5       UR                  5       :H  SU SUR                  S    35        UR                  5       nUR                  S5      R                  U5      R                  UR                  5      nUb@  [        X&   R                  5       UR                  5       :H  SU SUR                  S    35        XV4$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
r   r   r   z6Image features and image tokens do not match, tokens: z, features: r   z6Video features and video tokens do not match, tokens: )get_input_embeddingsr   r   r   r   r   r   allr   sumr   	expand_asr   r   numelr   )	r   rn   ro   r  r  special_image_maskspecial_video_maskn_image_tokensn_video_tokenss	            rc   get_placeholder_maskGlm4vModel.get_placeholder_maskF  s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!; "+kk.H.H!H!*kk.H.H!H+//1/99"=GGVYYZgZnZno%"1779^=Q=Q=SSHHXXdeseyeyz{e|d}~
 ,//1/99"=GGVYYZgZnZno%"1779^=Q=Q=SSHHXXdeseyeyz{e|d}~ "55rb   c                 x   > Ub%  [         R                  " XSS2S4   SS9nSUSS2S4'   [        TU ]  " SSU0UD6$ )a9  
Difference from Qwen2VL/Qwen2.5VL's get_rope_index:
- GLM4V uses timestamps to seperate each video frame, so the video_grid_thw should also be split too.

Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
        it.
    mm_token_type_ids (`torch.IntTensor` of shape `(batch_size, sequence_length)`):
        Token type ids matching each modality to a different value in the input sequence, i.e. text (0), image (1), video (2).
    image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
        The temporal, height and width of feature shape of each image in LLM.
    video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
        The temporal, height and width of feature shape of each video in LLM.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

Returns:
    position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
    mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
Nr   r   r   r  rR   )r   rF  r   get_rope_index)r   r  super_kwargsr   s      rc   r  Glm4vModel.get_rope_indexp  sP    > %"44^TUWXTXEY_`aN#$N1a4 w%T^T|TTrb   rq   r1  ri   pixel_valuesimage_grid_thwrope_deltasmm_token_type_idsc           
         USL USL-  (       a  [        S5      eUc  U R                  5       " U5      nUbv  U R                  XhSS9R                  n[        R
                  " USS9R                  UR                  UR                  5      nU R                  XUS9u  pUR                  X5      nUbx  U R                  XySS9R                  n[        R
                  " USS9R                  UR                  UR                  5      nU R                  XUS9u  nnUR                  UU5      nUc  U R                  UUU	UUUUS	9nU R                  " SSUUUUS
.UD6n[        S0 UDSU R                  0D6$ )a  
image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
    The temporal, height and width of feature shape of each image in LLM.
video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
    The temporal, height and width of feature shape of each video in LLM.
rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
    The rope index difference between sequence length and multimodal rope.
Nr  T)r  r   r   )r  )r  )rn   r  r  ro   rq   ri   r  )rn   r1  rq   ri   ro   r  rR   )r  r  get_image_featuresr  r   r   r   r   r   r  masked_scatterr  compute_3d_position_idslanguage_modelr  r  )r   rn   rq   r1  ri   ro   r  r  r  r  r  r  r   image_embeds
image_maskrl  r  
video_maskoutputss                      rc   r   Glm4vModel.forward  s   2 -t";<YZZ  557	BM#22<]a2bppL 99\q9<<]=Q=QS`SfSfgL 55i_k5lMJ)88RM*223Fdh2iwwL 99\q9<<]=Q=QS`SfSfgL 55i_k5lMAz)88\RM77#--+- /"3 8 L %% 
%)+'
 
 ( 

((
 	
rb   )r  r   )NN)NNNNNNNNNNN)rS   rT   rU   rV   r  r   r   r   r   r  r  r   r   r`   r   r  r  r   r  r	   	IntTensorr  r   ra   r   r   s   @rc   r  r     sY   02DEJ  37".. ((4/ +,	
 
+	+  B 4837(6##(6 (((6 ))D0	(6
 ))D0(6X 37#U((4/#U 
u||U\\)	*	#U #UJ  .2.204(,26,08<2626/348@
##d*@
 t+@
 &&-	@

 @
 ((4/@
 llT)@
 #..5@
 ((4/@
 ((4/@
 %%,@
 !??T1@
 +,@
 
)	)@
  @
rb   r  c                       \ rS rSrSrg)Glm4vCausalLMOutputWithPasti  rR   Nr   rR   rb   rc   r  r    r   rb   r  c                   P  ^  \ rS rSr            SS\R
                  S-  S\R                  S-  S\R
                  S-  S\S-  S\R                  S-  S\R
                  S-  S	\R                  S-  S
\R                  S-  S\R
                  S-  S\R
                  S-  S\R                  S-  S\
\R                  -  S\\   S\\-  4S jjr          SU 4S jjr SS\R
                  S-  S\R                  S-  S\\R                  \R                  4   4S jjrSrU =r$ )Glm4vForConditionalGenerationi  Nrn   rq   r1  ri   ro   labelsr  r  r  r  r  logits_to_keepr   r   c                    U R                   " SUUUU	U
UUUUUS.
UD6nUS   n[        U[        5      (       a  [        U* S5      OUnU R	                  USS2USS24   5      nSnUb.  U R                  UX`R                  R                  R                  S9n[        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
    The temporal, height and width of feature shape of each image in LLM.
video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
    The temporal, height and width of feature shape of each video in LLM.

Example:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Glm4vForConditionalGeneration

>>> model = Glm4vForConditionalGeneration.from_pretrained("zai-org/GLM-4.1V-9B-Thinking")
>>> processor = AutoProcessor.from_pretrained("zai-org/GLM-4.1V-9B-Thinking")

>>> messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
            {"type": "text", "text": "What is shown in this image?"},
        ],
    },
]
>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
>>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
```)
rn   r  r  r  r  r  r1  rq   ri   ro   r   N)logitsr  rv   )lossr  ri   rp   r  r  rR   )modelr   rZ   slicelm_headloss_functionr   rh   rv   r  ri   rp   r  r  )r   rn   rq   r1  ri   ro   r  r  r  r  r  r  r  r   r  rp   slice_indicesr  r  s                      rc   r   %Glm4vForConditionalGeneration.forward  s    t ** 
% 3))/%)+'
 
  
 9C>SV8W8W~ot4]kmA}a,?@A%%VF{{OfOfOqOq%rD*#33!//))++
 	
rb   c                 p   > [         TU ]  " U4UUUUUUU	U
UUS.
UD6nU(       d  U(       a
  S US'   S US'   U$ )N)
ri   rq   ro   r1  r  r  r  r  r}   is_first_iterationr  r  )r   prepare_inputs_for_generation)r   rn   ri   rq   ro   r1  r}   r  r  r  r  r   r   model_inputsr   s                 rc   r!  ;Glm4vForConditionalGeneration.prepare_inputs_for_generation:  sf    " w<
+)'%% 3))1
 
 "i+/L(26L./rb   c           	         UGb  UU R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  S   nUU R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  S   nUU R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  S   nOHXR                  R                  :H  nXR                  R                  :H  nXR                  R                  :H  n[        R                  " UR                  5       UR                  5       -
  SS9nUS:  nX7) -  nUR                  SS9n	UR                  SS9n
X4$ )a  
Get the number of images and videos for each sample to calculate the separation length of the sample tensor.
These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications.

Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary.

Returns:
    image_nums (`torch.LongTensor` of shape `(batch_size, num_images_sample)`)
    video_nums (`torch.LongTensor` of shape `(batch_size, num_videos_sample)`)
r  ).r   r   r   r   )r  r   r   r   r   r   r   r   r   r  rZ   r  )r   rn   ro   is_imageis_video_startis_video_endvideo_levelinside_videostandalone_imagesimage_countsvideo_countss              rc   _get_image_nums_and_video_nums<Glm4vForConditionalGeneration._get_image_nums_and_video_nums`  s   $ $,,.LL!A!A\i\p\pq H ,,.LL!A!A\i\p\pq N ,,.LL!?!?uzzZgZnZno L !KK$D$DDH&++*J*JJN$(F(FFL ll>#5#5#7,:J:J:L#LRST"Q %6 ),,,3%))a)0))rb   rR   )NNNNNNNNNNNr   )
NNNNTNNNNFr   )rS   rT   rU   rV   r   r  r   r	   r  r  rZ   r   r   r`   r  r   r!  r-  ra   r   r   s   @rc   r  r    s    .2.204(,26*.,08<262648-.Y
##d*Y
 t+Y
 &&-	Y

 Y
 ((4/Y
   4'Y
 llT)Y
 #..5Y
 ((4/Y
 ((4/Y
 !??T1Y
 ell*Y
 +,Y
 
,	,Y
|   $R .26*##d*6* ||d*6* 
u||U\\)	*	6* 6*rb   r  c                   *    \ rS rSrSSSS.SS0S.rSrg)	Glm4vProcessorKwargsi  FT)paddingreturn_token_type_idsreturn_mm_token_type_idsreturn_metadata)text_kwargsvideos_kwargsrR   N)rS   rT   rU   rV   	_defaultsra   rR   rb   rc   r0  r0    s#     %*(,

 ,T2Irb   r0  c                      ^  \ rS rSrSU 4S jjr   SS\S-  S\\-  \\   -  \\   -  S\	S-  S\
\   S\4
S	 jjrS
\S\\\      4S jrS rSrU =r$ )Glm4vProcessori  Nc                 
  > [         TU ]  XX4S9  [        US5      (       d  SOUR                  U l        [        US5      (       d  SOUR                  U l        UR                  S5      U l        UR                  S5      U l        g )N)chat_templateimage_tokenz	<|image|>video_tokenz	<|video|>z<|begin_of_video|>z<|end_of_video|>)r   r   hasattrr<  r=  convert_tokens_to_idsvideo_start_idvideo_end_id)r   image_processor	tokenizervideo_processorr;  r   r   s         rc   r   Glm4vProcessor.__init__  sw    _b.5i.O.O;U^UjUj.5i.O.O;U^UjUj'==>RS%;;<NOrb   imagestextvideosr   r   c                 :   U R                   " [        4SU R                  R                  0UD6nUb  U R                  " SSU0US   D6nUS   nO0 nSnUbJ  U R
                  " SSU0US   D6nUR                  S5      (       d  UR                  S	5      n	OUS	   n	US
   n
O0 nSn
[        U[        5      (       d  U/nUR                  5       nUb  U R                  R                  S-  nSn[        [        U5      5       H  nU R                  X-   ;   aR  X|   R                  5       U-  nX-   R!                  U R                  SU-  S5      X-'   US-  nU R                  X-   ;   a  MR  X-   R!                  SU R                  5      X-'   M     U
Gb  U R
                  R                  S-  nSn[        [        U5      5       GH  nU R"                  X-   ;   Ga  X   S   nSnW	U   nUR$                  c  [&        R)                  S5        UR$                  c  SOUR$                  Ul        UR*                  SSS2   n/ n[        S[        U5      5       H  nUR-                  UU   5        M     USU n[        U5      U:  a.  UR-                  U(       a  US   OS5        [        U5      U:  a  M.  [        U5       H  nUU   nU R/                  U5      nUU-  nM      X-   R!                  U R"                  US5      X-'   X   R                  5       U-  X   S   -  n[        U5       H;  nU R                  X-   ;   d  M  X-   R!                  U R                  SU-  S5      X-'   M=     US-  nU R"                  X-   ;   a  GM  X-   R!                  SU R                  5      X-'   GM     US   R                  SS5      nUS   R                  SS5      nU R                  " U40 US   D6nU R1                  UUSS/S9  U(       a  U R3                  US   5      US'   [5        0 UEUEUEUS9$ )a5  
Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
    - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
    - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
    - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
tokenizer_init_kwargsNrF  images_kwargsr  rH  r6  r4  video_metadatar  r"   r   z<|placeholder|>r    a  SmolVLM requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results.r:   r   r5  return_tensorsr3  Fr  r  )
modalitiesrn   r  )datatensor_typerR   )_merge_kwargsr0  rC  init_kwargsrB  rD  r   popr   r_   copy
merge_sizer   r   r<  r  replacer=  fpsloggerwarning_once
timestampsr  replace_frame_token_id_check_special_mm_tokenscreate_mm_token_type_idsr   )r   rF  rG  rH  r   output_kwargsimage_inputsr  videos_inputsrL  r  merge_lengthindexr  num_image_tokensvideo_index
num_framesvideo_structuremetadatar[  unique_timestampsidxselected_timestamps	frame_idxtimestamp_secframe_structurerN  r3  text_inputss                                rc   __call__Glm4vProcessor.__call__  sj   ( ** 
"&.."<"<
 

 //`v`A_`L)*:;NL!N 00aa-P_B`aM::/00!.!2!23C!D!./?!@*+;<NM!N$%%6Dyy{%//::A=LE3t9%&&$'1'5'<'A'A'C|'S$"good.>.>@QTd@dfghDGQJE &&$'1 '//*;T=M=MN & %//::A=LK3t9%&&$'1!/!<Q!?J&(O-k:H||+++q
 *2)=28<<HL!)!4!4SqS!9J(*%$QJ8)00CA  9 +<KZ*H'12Z?+22Na3Fr3Jghi 12Z? &+:%6	(;I(F*.*E*Em*T'?: &7
 #good.>.>QRSDG&388:lJnNijkNll % &+:%6	++tw6&*good6F6FHY\lHlno&pDG &7  1$KG &&$'1J '//*;T=M=MNM &N '}599:JDQ#0#?#C#CD^`e#f nnTJ]=-IJ%%dKWgDV%W#/3/L/L[YdMe/fK+,!QK!Q<!Q=!Q_mnnrb   rn   c                    / nU H  n[         R                  " U5      n[         R                  " U5      n[         R                  " X@R                  :H  SS9n[         R                  " X@R
                  :H  SS9nXg:  nSXTU R                  :H  U-  '   SXTU R                  :H  U) -  '   UR                  UR                  5       5        M     U$ )Nr   )axisr"   r   )	nparray
zeros_liker  r@  rA  r   r  r  )	r   rn   r  input	array_idsmm_token_typesstartsendsis_video_modalitys	            rc   r^  'Glm4vProcessor.create_mm_token_type_ids  s     EI]]51N
 YYy,?,??aHF99Y*;*;;!DD &UVN)<)<<@QQRXYN)<)<<BSASTU$$^%:%:%<=  ! rb   c                 8    SU R                    S[        U5       3$ )Nz<|begin_of_image|>z<|end_of_image|>)r<  rZ   )r   rm  s     rc   r\  %Glm4vProcessor.replace_frame_token_id.  s#    #D$4$4#55Ec-FXEYZZrb   )r<  rA  r@  r=  )NNNNrt  )rS   rT   rU   rV   r   r   r   r   r_   r!   r   r0  r   rp  rZ   r^  r\  ra   r   r   s   @rc   r9  r9    s    P %)Z^$(	koT!ko ++d9o=EV@WWko T!	ko
 -.ko 
koZ!$ !4S	? !*[ [rb   r9  )	r   re   r6   r  r  r  r9  r  r  )r   )ocollections.abcr   numpyrt  r   torch.nnr   torch.nn.functional
functionalr   huggingface_hub.dataclassesr   r   rM  r   r  activationsr   cache_utilsr	   r
   configuration_utilsr   feature_extraction_utilsr   image_utilsr   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   tokenization_utils_baser   r   utilsr   r   r   r   r   utils.genericr   r   utils.output_capturingr    video_utilsr!   glm4.modeling_glm4r#   r$   r%   r&   qwen2_5_vl.modeling_qwen2_5_vlr'   r(   r)   r*   r+   r,   r-   r.   r/   r0   qwen2_vl.modeling_qwen2_vlr1   qwen2_vl.processing_qwen2_vlr2   r3   
get_loggerrS   rY  r6   re   r   r   r   r   r   r   r   r   r  r  r  rD  rQ  rS  rv  rx  r  r  r  r  r  r  r  r0  r9  __all__rR   rb   rc   <module>r     sW   %      .  & ! . 3 4 % / B 9 S 1 F & C  H 5 % c c   6 
		H	% 9:%$( %$  ;%$P 9:7(& 7(  ;7(t 9:1(" 1(  ;1(j	; 	8M 8	m4 	m	!> 	fRYY f"HBII HVR4 R5, 52 86%PC) C)L	7 	/6 /d	< 	24 2~
+ ~
B]
( ]
@w
 w
t	"B 	x*$F x*v1 K[% K[\
rb   