
    Z j)1                       S SK r S SKJr  S SKJr  S SKJrJr  S SKrS SK	J
r
  S SKJ
s  Jr  S SK	Jr  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJr  SSKJr  SSKJ r   SSK!J"r"J#r#J$r$  SSK%J&r&J'r'  SSK(J)r)J*r*  SSK+J,r,  SSK-J.r.J/r/J0r0J1r1  SSK2J3r3J4r4J5r5  SSK6J7r7  SSK8J9r9J:r:J;r;  \" S5       " S S\
Rx                  5      5       r= " S S\
Rx                  5      r> " S S\
Rx                  5      r? " S S \
Rx                  5      r@ " S! S"\
Rx                  5      rA " S# S$\
Rx                  5      rBS% rCS&\R                  S'\R                  S(\R                  S)\R                  S*\E\R                  \R                  4   4
S+ jrFS,\R                  S-\GS*\R                  4S. jrH SXS/\
Rx                  S0\R                  S1\R                  S2\R                  S3\R                  S-  S4\IS5\IS6\,\.   4S7 jjrJ " S8 S9\
Rx                  5      rK " S: S;\ 5      rL " S< S=\
Rx                  5      rMS> rNSYS? jrO " S@ SA\
Rx                  5      rP " SB SC\
Rx                  5      rQ " SD SE\ 5      rR\/" SFSG9\ " SH SI\$5      5       5       rS\/ " SJ SK\*5      5       rT " SL SM\T5      rU\/ " SN SO\T5      5       rV\/ " SP SQ\T5      5       rW\/" SRSG9\ " SS ST\$5      5       5       rX " SU SV\T\5      rY/ SWQrZg)Z    N)Callable)	dataclass)AnyOptional)	LayerNorm   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)is_flash_attention_requestedmaybe_autocastmerge_with_config_defaults)capture_outputs   )Glm4vConfigGlm4vTextConfigGlm4vVisionConfigRMSNormc                   x   ^  \ rS rSrS
S\SS4U 4S jjjrS\R                  S\R                  4S jrS r	S	r
U =r$ )Glm4vRMSNorm0   epsreturnNc                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z+
Glm4vRMSNorm is equivalent to T5LayerNorm
N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizer*   	__class__s      y/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/glm4v/modeling_glm4v.pyr.   Glm4vRMSNorm.__init__2   s/     	ll5::k#:; #    hidden_statesc                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )N   T)keepdim)	dtypetor1   float32powmeanrsqrtr4   r3   )r5   r;   input_dtypevariances       r8   forwardGlm4vRMSNorm.forward:   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r:   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler3   shaper4   r5   s    r8   
extra_reprGlm4vRMSNorm.extra_reprA   s*    ))*+6$2G2G1HIIr:   )r4   r3   )gư>)__name__
__module____qualname____firstlineno__floatr.   r1   TensorrH   rN   __static_attributes____classcell__r7   s   @r8   r(   r(   0   sB    $ $$ $ $;U\\ ;ell ;J Jr:   r(   c                   :   ^  \ rS rSrSS\4U 4S jjjrS rSrU =r$ )Glm4VisionMlpE   biasc                   > [         TU ]  5         UR                  U l        UR                  U l        [
        R                  " U R                  U R                  US9U l        [
        R                  " U R                  U R                  US9U l        [
        R                  " U R                  U R                  US9U l	        [        UR                     U l        g Nr\   )r-   r.   r6   out_hidden_sizeintermediate_sizer/   Linear	gate_projup_proj	down_projr
   
hidden_actact_fn)r5   configr\   r7   s      r8   r.   Glm4VisionMlp.__init__F   s    !--!'!7!74#3#3T5K5KRVWyy!1!143I3IPTU4#9#94;K;KRVWV../r:   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      $ N)re   rg   rc   rd   r5   hidden_states     r8   rH   Glm4VisionMlp.forwardO   s2    ~~dkk$..*FG$,,WcJddeer:   )rg   re   rc   r6   ra   rd   F)	rP   rQ   rR   rS   boolr.   rH   rV   rW   rX   s   @r8   rZ   rZ   E   s     0T 0 0f fr:   rZ   c                   n   ^  \ rS rSrS\SS4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	Glm4vVisionPatchEmbedS   rh   r+   Nc                 N  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR
                  U l        U R                  U R                  U R                  /n[        R                  " U R                  U R                  X"S9U l	        g )N)kernel_sizestride)
r-   r.   
patch_sizetemporal_patch_sizein_channelsr6   	embed_dimr/   Conv3dproj)r5   rh   ru   r7   s      r8   r.   Glm4vVisionPatchEmbed.__init__T   s|     ++#)#=#= !--++//$//RIId..Kl	r:   r;   c                 0   U R                   R                  R                  nUR                  SU R                  U R
                  U R                  U R                  5      nU R                  UR                  US95      R                  SU R                  5      nU$ )Nr>   r@   )	r|   r3   r@   viewry   rx   rw   rA   rz   )r5   r;   target_dtypes      r8   rH   Glm4vVisionPatchEmbed.forward^   s~    yy''--%**  $":":DOOT__
 		-"2"2"2"FGLLRQUQ_Q_`r:   )rz   ry   rw   r|   rx   rP   rQ   rR   rS   r%   r.   r1   rU   rH   rV   rW   rX   s   @r8   rr   rr   S   s:    m0 mT mU\\ ell  r:   rr   c                      ^  \ rS rSr% \R
                  \S'   SS\S\SS4U 4S jjjr	S\S\R
                  4S	 jr
S
rU =r$ )Glm4vVisionRotaryEmbeddingg   inv_freqdimthetar+   Nc           	         > [         TU ]  5         Xl        X l        SU[        R
                  " SUS[        R                  S9U-  -  -  nU R                  SUSS9  g )N      ?r   r=   r   r   F
persistent)r-   r.   r   r   r1   arangerT   register_buffer)r5   r   r   r   r7   s       r8   r.   #Glm4vVisionRotaryEmbedding.__init__j   sU    
%ELLC%++$NQT$TUVZeDr:   seqlenc                     [         R                  " XR                  R                  U R                  R                  S9n[         R
                  " X R                  5      nU$ )Ndevicer@   )r1   r   r   r   r@   outer)r5   r   seqfreqss       r8   rH   "Glm4vVisionRotaryEmbedding.forwardq   s=    ll6--*>*>dmmFYFYZC/r:   )r   r   )g     @)rP   rQ   rR   rS   r1   rU   __annotations__intrT   r.   rH   rV   rW   rX   s   @r8   r   r   g   sM    llEC E ED E Ec ell  r:   r   c                   ~   ^  \ rS rSrSS\S\S\S\SS4
U 4S jjjrS	\R                  S\R                  4S
 jr
SrU =r$ )Glm4vVisionPatchMergerw   r   context_dimrf   r\   r+   Nc                 b  > [         TU ]  5         [        R                  " XUS9U l        [        U5      U l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " X!US9U l	        [        R                  " 5       U l        [        U   U l        g r^   )r-   r.   r/   rb   r|   r   post_projection_normrc   rd   re   GELUact1r
   rg   )r5   r   r   rf   r\   r7   s        r8   r.   Glm4vVisionPatchMerger.__init__x   s{    IIcT2	$-cN!3$?yy=;$?GGI	Z(r:   rm   c                     U R                  U5      nU R                  U R                  U5      5      nU R                  U R	                  U R                  U5      5      U R                  U5      -  5      $ rk   )r|   r   r   re   rg   rc   rd   rl   s     r8   rH   Glm4vVisionPatchMerger.forward   sY    yy.yy!:!:<!HI~~dkk$..*FG$,,WcJddeer:   )r   rg   re   rc   r   r|   rd   ro   )rP   rQ   rR   rS   r   strrp   r.   r1   rU   rH   rV   rW   rX   s   @r8   r   r   w   sU    )C )c )s )$ )[_ ) )fELL fU\\ f fr:   r   c                   R   ^  \ rS rSrS\4U 4S jjrS\R                  4S jrSr	U =r
$ )Glm4vVisionEmbeddings   rh   c                 f  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        U R
                  U R                  -  S-  U l        U R                  U l        [        R                  " U R                  U R                  5      U l        SU l        g )Nr=   bicubic)r-   r.   rh   r6   rz   
image_sizerw   num_patchesnum_positionsr/   	Embeddingposition_embeddinginterpolated_methodr5   rh   r7   s     r8   r.   Glm4vVisionEmbeddings.__init__   s    ++ ++ ++ OOt>1D!--"$,,t/A/A4>>"R#, r:   r+   c           	      "   U R                   R                  nUR                  S   nUR                  n[	        U[
        5      (       a#  [        R                  " X([        R                  S9nUR                  S   n	[        U	S-  5      n
UR                  XU5      R                  SSS5      R                  S5      R                  U[        R                  S9n[        R                  " [!        [#        U5      5       Vs/ s H  oUS4   R%                  X,   5      PM     sn5      R                  U[        R                  S9n[        R                  " [!        [#        U5      5       Vs/ s H  oUS4   R%                  X,   5      PM     sn5      R                  U[        R                  S9nUS-   U-  S-  S-
  nUS-   U-  S-  S-
  n[        R&                  " UU4SS9R                  S5      R                  S5      n[(        R*                  " UUU R,                  SS	S
9nUR/                  S5      R/                  S5      R                  SS5      nUR                  UR0                  5      R                  UR                  5      nUU-   nU$ s  snf s  snf )aQ  
Forward pass with integrated position encoding adaptation using 2D interpolation.

Args:
    embeddings: Input embeddings tensor
    lengths (torch.Tensor): Sequence lengths for each image in the batch.
    image_shapes (torch.Tensor): Tensor of shape [batch_size, 3] representing the image shapes (t, h, w).
    h_coords (torch.Tensor): Tensor of shape [total_seq] representing the h coordinate for each patch.
    w_coords (torch.Tensor): Tensor of shape [total_seq] representing the w coordinate for each patch.

Returns:
    torch.Tensor: Embeddings with adapted position encoding added.
r"   r   r   g      ?r=   r>   r   Fborder)modealign_cornerspadding_mode)r   r3   rL   r   
isinstancelistr1   tensorlongr   r   permute	unsqueezerA   rB   catrangelenrepeatstackFgrid_sampler   squeezer@   )r5   
embeddingslengthsimage_shapesh_coordsw_coordspos_embed_weightr6   r   orig_size_sq	orig_sizepos_embed_2ditarget_htarget_wnorm_wnorm_hgridinterpolated_embed_fp32adapted_pos_embed_fp32adapted_pos_embeds                        r8   rH   Glm4vVisionEmbeddings.forward   sa     2299&,,Q/!(( gt$$ll7LG (--a0c)*	!!)DWQ1Yq\RvU]]R3	 	 99USVW^S_M`aM`1a4077
CM`abee f 
 99USVW^S_M`aM`1a4077
CM`abee f 

 c>X-2Q6c>X-2Q6 {{FF+4>>qAKKAN #$--$T%=%=Uai#

 "9!@!@!C!K!KB!O!W!WXY[\!]2556F6L6LMPPQ[QbQbc  "33
3 b bs   ;!J%!J)rh   rz   r   r   r   r   rw   r   r   rX   s   @r8   r   r      s(    
-0 
-;PUP\P\ ; ;r:   r   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )*Rotates half the hidden dims of the input..Nr>   r=   r   )rL   r1   r   xx1x2s      r8   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r:   qkcossinr+   c                    U R                   nUR                   nU R                  5       UR                  5       pUR                  S5      R                  5       UR                  S5      R                  5       p2X-  [        U 5      U-  -   nX-  [        U5      U-  -   nUR	                  U5      nUR	                  U5      nXg4$ )N)r@   rT   r   r   rA   )r   r   r   r   orig_q_dtypeorig_k_dtypeq_embedk_embeds           r8   apply_rotary_pos_emb_visionr      s     77L77L779aggiq}}R &&(#--*;*A*A*Cw;q>C/0Gw;q>C/0Gjj&Gjj&Gr:   r;   n_repc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r"   N)rL   expandreshape)r;   r   batchnum_key_value_headsslenhead_dims         r8   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr:   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub  X-   n
[
        R                  R                  U
S[        R                  S9R                  UR                  5      n
[
        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr=   r   r>   r   r@   )ptrainingr"   )r   num_key_value_groupsr1   matmul	transposer/   
functionalsoftmaxrB   rA   r@   r   r  
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsattn_outputs               r8   eager_attention_forwardr     s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r:   c                      ^  \ rS rSrS\SS4U 4S jjr  SS\R                  S\R                  S\R                  S-  S	\\R                  \R                  4   S-  S\R                  4
S
 jjr	Sr
U =r$ )Glm4vVisionAttentioni  rh   r+   Nc                   > [         TU ]  5         UR                  U l        UR                  U l        U R                  U R                  -  U l        SU l        [        R                  " UR                  UR                  S-  UR                  S9U l
        [        R                  " UR                  UR                  SS9U l        U R
                  S-  U l        Xl        UR                  U l        SU l        g )Nr"   r   r_   F      )r-   r.   r6   r   	num_headsr   r  r/   rb   attention_biasqkvr|   r   rh   attention_dropout	is_causalr   s     r8   r.   Glm4vVisionAttention.__init__  s    %%))DNN2$%!99V//1C1Ca1GfNcNcdIIf00&2D2D5Q	}}d*!'!9!9r:   r;   
cu_seqlensrotary_pos_embposition_embeddingsc                    UR                   S   nU R                  U5      R                  USU R                  S5      R	                  SSSS5      R                  S5      u  pxn	Uu  p[        XxX5      u  pxUR                  SS5      R                  S5      nUR                  SS5      R                  S5      nU	R                  SS5      R                  S5      n	[        R                  " U R                  R                  [        5      n[        U R                  5      (       aX  USS  US S -
  R                  5       nU" U UUU	4S U R                   U R"                  (       d  SOU R$                  UUUUSS.UD6u  pOUSS  US S -
  nXxU	4 Vs/ s H'  n[&        R(                  " UUR+                  5       SS	9PM)     nn[-        U6  VVVs/ s HB  u  nnnU" U UUU4S U R                   U R"                  (       d  SOU R$                  SS
.UD6S   PMD     nnnn[&        R.                  " USS	9nUR                  US5      R1                  5       nU R3                  U5      nU$ s  snf s  snnnf )Nr   r   r>   r"   r=           F)r   r   r   cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kr  r   )r   r   r   r  )rL   r  r   r  r   unbindr   r  r   r   get_interfacerh   _attn_implementationr  r   maxr   r  r  r1   splittolistzipr   r  r|   )r5   r;   r  r  r  r   
seq_lengthquery_statesr  r	  r   r   attention_interface
max_seqlenr  _r   r   splitsr   r   vattn_outputss                          r8   rH   Glm4vVisionAttention.forward  s    #((+
HH]#++J4>>2NVVWXZ[]^`abiijkl 	/, '#>|Y\#b #--a3==a@))!Q/99!<
#--a3==a@(?(M(MKK,,.E)
 (44$QR.:cr?:??AJ0	
  $#'==d6L6L(('' NK" !nz#26GLXfrKsKsFGNN$4!<Ks     #F|  ,GAq! $	

 $( LL'+}}C$:P:P#
 
 
  ,    ))La8K!))*b9DDFii,-s   .IA	I)
r  rh   r   r   r  r  r  r|   r  r   NN)rP   rQ   rR   rS   r%   r.   r1   rU   rK   rH   rV   rW   rX   s   @r8   r  r    s    0 T " /3HLB||B LLB t+	B
 #5<<#=>EB 
B Br:   r  c                      ^  \ rS rSrSU 4S jjr\  SS\R                  S\R                  S\R                  S-  S\\R                  \R                  4   S-  S\R                  4
S	 jj5       r	S
r
U =r$ )Glm4vVisionBlocki`  r+   Nc                    > [         TU ]  5         [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        [        U5      U l        [        USS9U l
        g )Nr*   Fr_   )r-   r.   r(   r6   rms_norm_epsnorm1norm2r  attnrZ   mlpr   s     r8   r.   Glm4vVisionBlock.__init__a  s\    !&"4"4&:M:MN
!&"4"4&:M:MN
(0	 e4r:   r;   r  r  r  c                     XR                   " U R                  U5      4UUUS.UD6-   nXR                  U R                  U5      5      -   nU$ )a  
cu_seqlens (`torch.Tensor`):
    Cumulative sequence lengths used for packed variable-length attention in Flash Attention kernels.
rotary_pos_emb (`torch.Tensor`, *optional*):
    Precomputed rotary positional embeddings applied to the vision attention query/key states.
)r  r  r  )r8  r6  r9  r7  )r5   r;   r  r  r  r   s         r8   rH   Glm4vVisionBlock.forwardh  s]     &		JJ}%)
!) 3	)

 )
 
 &M1J(KKr:   )r8  r9  r6  r7  r+   Nr0  )rP   rQ   rR   rS   r.   r   r1   rU   rK   rH   rV   rW   rX   s   @r8   r2  r2  `  s    5 
 /3HL|| LL t+	
 #5<<#=>E 
 r:   r2  c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\	   SS\S-  S\
S   S\S-  S	\S
\4   4S jj5       r\R                  " 5       \S 5       5       rS rSrU =r$ )Glm4vTextRotaryEmbeddingi  r   Nrh   c                   > [         TU ]  5         UR                  U l        UR                  U l        Xl        U R
                  R                  S   U l        U R                  nU R                  S:w  a  [        U R                     nU" U R
                  U5      u  o@l
        U R                  SUSS9  U R                  SUR                  5       SS9  UR                  R                  S/ SQ5      U l        g )	N	rope_typedefaultr   Fr   original_inv_freqmrope_section)      rF  )r-   r.   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrh   rope_parametersrA  compute_default_rope_parametersr   attention_scalingr   clonegetrD  )r5   rh   r   rope_init_fnr   r7   s        r8   r.   !Glm4vTextRotaryEmbedding.__init__  s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L((ZeD0(..2BuU#3377Ur:   r   ztorch.deviceseq_lenr+   ztorch.Tensorc           	      j   U R                   S   nU R                   R                  SS5      n[        U SS5      =(       d    U R                  U R                  -  n[        XT-  5      nSnSU[        R                  " SUS[        R                  S9R                  U[        R                  S	9U-  -  -  nX4$ )
aH  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetapartial_rotary_factorr   r   Nr   r=   r   r   )rJ  rN  getattrr6   num_attention_headsr   r1   r   int64rA   rT   )	rh   r   rQ  baserT  r   r   attention_factorr   s	            r8   rK  8Glm4vTextRotaryEmbedding.compute_default_rope_parameters  s    & %%l3 & 6 6 : :;RTW X6:t4h8J8JfNhNh8h(23 U\\!S!5;;?BB&X]XcXcBdgjjk
 ))r:   c                 Z   U R                   S S S S 2S 4   R                  5       R                  SUR                  S   SS5      nUS S 2S S 2S S S 24   R                  5       n[	        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS9   UR                  5       UR                  5       -  R                  SS5      nU R                  X`R                  5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR!                  5       U R                  -  n	S S S 5        WR#                  UR$                  S
9W	R#                  UR$                  S
94$ ! , (       d  f       N@= f)Nr   r"   r>   mpscpuF)device_typeenabledr=   r   r   )r   rT   r   rL   r   r   typer   r   r  apply_mroperD  r1   r   r   rL  r   rA   r@   )
r5   r   position_idsinv_freq_expandedposition_ids_expandedr^  r   embr   r   s
             r8   rH    Glm4vTextRotaryEmbedding.forward  sZ   
 !MM$a*=>DDFMMaQ]QcQcdeQfhjlmn ,Q4] ; A A C'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E$$U,>,>?E))UN3C'')d444C'')d444C D vvAGGv$cff177f&;;; DCs   BF
F*c           	          UnUR                  USS9n[        R                  " [        U5       VVs/ s H  u  pVXeS-     PM     snnSS9nU$ s  snnf )Nr>   r   r   )r$  r1   r   	enumerate)r5   r   rD  sectionchunksr   chunkresults           r8   ra  $Glm4vTextRotaryEmbedding.apply_mrope  sS    W"-69JK9JXQEa%L9JKQST Ls   A
)rL  rh   rH  rD  rI  rA  rk   NNN)rP   rQ   rR   rS   r1   rU   r   r$   r.   staticmethodr   r   rK   rT   rK  no_gradr   rH   ra  rV   rW   rX   s   @r8   r?  r?    s    llV V V" )-+/"*$&*(* t* 
~u$	%	* *> ]]_<  <  r:   r?  c                 x    U SSSS24   nU SSSS24   n[         R                  " U* U4SS9R                  S5      $ )	r   .r   Nr=   r"   r>   r   r   )r1   r   flattenr   s      r8   rotate_half_llmrs    sJ    	
319B	
319B;;Ryb)11"55r:   c                    UR                  U5      nUR                  U5      nUSSUR                  S   S-  24   R                  SSS9nUSSUR                  S   S-  24   R                  SSS9nUR                  S   nU SSU24   U SUS24   pvUSSU24   USUS24   pXb-  [        U5      U-  -   n
X-  [        U5      U-  -   n[        R
                  " X/SS9n
[        R
                  " X/SS9nX4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
.Nr>   r=   r   )r   rL   repeat_interleavers  r1   r   )r   r   r   r   unsqueeze_dim
rotary_dimq_rotq_passk_rotk_passr   r   s               r8   apply_rotary_pos_embr|    s6   $ --
&C
--
&C c'SYYr]a'''
(
:
:1"
:
EC
c'SYYr]a'''
(
:
:1"
:
EC 2Jc;J;&'3
+;)<6c;J;&'3
+;)<6 {u5;<G{u5;<G ii)r2Gii)r2Gr:   c                   :  ^  \ rS rSrSrSS\S\S-  4U 4S jjjr   SS\R                  S\
\R                  \R                  4   S-  S	\R                  S-  S
\S-  S\\   S\
\R                  \R                  S-  \
\R                     S-  4   4S jjrSrU =r$ )Glm4vTextAttentioni  zz
Multi-headed attention from 'Attention Is All You Need' paper.
and "Generating Long Sequences with Sparse Transformers".
Nrh   	layer_idxc                 r  > [         TU ]  5         Xl        X l        UR                  U l        UR
                  U l        U R                  U R                  -  U l        UR                  U l        U R                  U R                  -  U l	        SU l
        UR                  U l        UR                  U l        U R                  S-  U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  -  U R                  SS9U l        g )NTr  r_   F)r-   r.   rh   r  r6   rV  r  r   r   r  r  r  rJ  r   r/   rb   q_projk_projv_projo_projr5   rh   r  r7   s      r8   r.   Glm4vTextAttention.__init__  sE   "!--33((DNN:#)#=#= $(NNd6N6N$N!!'!9!9%55}}d*ii 0 0$..4==2PW[\ii 0 0$2J2JT]]2Zaefii 0 0$2J2JT]]2Zaefii >@P@PW\]r:   r;   r  r   past_key_valuesr   r+   c                 >   UR                  5       u  pgnU R                  U5      n	U R                  U5      n
U R                  U5      nU	R	                  XgSU R
                  5      R                  SS5      n	U
R	                  XgSU R
                  5      R                  SS5      n
UR	                  XgSU R
                  5      R                  SS5      nUu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [        5      nU" U U	U
UU4U R                  (       d  SOU R                   U R"                  S.UD6u  nnUR%                  XgS5      R'                  5       nU R)                  U5      nUU4$ )Nr>   r"   r=   r  )r   r   )sizer  r  r  r   r   r  r|  updater  r   r!  rh   r"  r  r  r  r   r   r  r  )r5   r;   r  r   r  r   bszq_lenr+  r(  r  r	  r   r   r)  r  r
  s                    r8   rH   Glm4vTextAttention.forward  s    &**,A{{=1[[/
{{=1#((RGQQRSUVW__ST]]CMMaQRS
#((RGQQRSUVW&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ "))#b9DDFkk+.L((r:   )r  rh   r   r6   r  r  r  r  r  r   r  r  rJ  r   r  rk   rn  )rP   rQ   rR   rS   __doc__r$   r   r.   r1   rU   rK   r   r   r   rH   rV   rW   rX   s   @r8   r~  r~    s    
^ ^3: ^ ^. IM.2(,))||)) #5<<#=>E)) t+	))
 )) -.)) 
u||U\\D0%2E2LL	M)) ))r:   r~  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )Glm4vTextMLPiD  c                    > [         TU ]  5         Xl        [        R                  " UR
                  SUR                  -  SS9U l        [        R                  " UR                  UR
                  SS9U l        [        UR                     U l        g )Nr=   Fr_   )r-   r.   rh   r/   rb   r6   ra   gate_up_projre   r
   rf   activation_fnr   s     r8   r.   Glm4vTextMLP.__init__E  sn    IIf&8&8!f>V>V:V]bc6#;#;V=O=OV[\#F$5$56r:   r;   r+   c                     U R                  U5      nUR                  SSS9u  p2X R                  U5      -  nU R                  U5      $ )Nr=   r>   r   )r  rk  r  re   )r5   r;   	up_statesgates       r8   rH   Glm4vTextMLP.forwardM  sH    %%m4	#//!/4 2 24 88	~~i((r:   )r  rh   re   r  )
rP   rQ   rR   rS   r.   r1   FloatTensorrH   rV   rW   rX   s   @r8   r  r  D  s,    7)U%6%6 )5;L;L ) )r:   r  c                   T  ^  \ rS rSrS\S\4U 4S jjr\     SS\R                  S\
\R                  \R                  4   S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\
\R                  \
\R                  \R                  4   S-  4   4S jj5       rSrU =r$ )Glm4vTextDecoderLayeriV  rh   r  c                   > [         TU ]  5         UR                  U l        [        X5      U l        [        U5      U l        [        UR                  UR                  S9U l	        [        UR                  UR                  S9U l
        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )Nr4  )r-   r.   r6   r~  	self_attnr  r9  r(   r5  input_layernormpost_attention_layernormpost_self_attn_layernormpost_mlp_layernormr  s      r8   r.   Glm4vTextDecoderLayer.__init__W  s    !--+F>'+F,>,>FDWDWX(4V5G5GVM`M`(a%(4V5G5GVM`M`(a%".v/A/AvGZGZ"[r:   Nr;   r  r   rb  r  	use_cacher+   c           
          UnU R                  U5      nU R                  " SUUUUUUS.UD6u  pU R                  U5      nX-   nUnU R                  U5      nU R	                  U5      nU R                  U5      nX-   nU$ )N)r;   r  r   rb  r  r   )r  r  r  r  r9  r  )
r5   r;   r  r   rb  r  r  r   residualr+  s
             r8   rH   Glm4vTextDecoderLayer.forwarda  s     !,,];  >> 
' 3)%+
 
 55mD 0 !55mD///> 0r:   )r6   r  r9  r  r  r  r  )NNNNF)rP   rQ   rR   rS   r$   r   r.   r   r1   rU   rK   
LongTensorr   rp   r  rH   rV   rW   rX   s   @r8   r  r  V  s    \ \3 \  IM.204(,!&#||# #5<<#=>E# t+	#
 &&-# # $;# 
u  %(9(95;L;L(L"MPT"TT	U# #r:   r  zJ
    Base class for Llava outputs, with hidden states and attentions.
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\
S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S'   S	rg)
Glm4vModelOutputWithPasti  a?  
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
    The rope index difference between sequence length and multimodal rope.
Nlast_hidden_stater  r;   
attentionsrope_deltasr  )rP   rQ   rR   rS   r  r  r1   r  r   r  r   r;   rK   r  r  r  rV   r  r:   r8   r  r    sv     37u((4/6$(OUT\(59M5**+d2926Je''(4/6+/K!!D(/r:   r  c                   \   ^  \ rS rSr% \\S'   SrSrSrSS/r	Sr
SrSrSrSrU 4S	 jrS
rU =r$ )Glm4vPreTrainedModeli  rh   model)imagevideotextTr  r2  r  c           	      *  > [         TU ]  U5        [        U[        5      (       an  SUR                  [
        R                  " SUR                  S[
        R                  S9UR                  -  -  -  n[        R                  " UR                  U5        g g )Nr   r   r=   r   )r-   _init_weightsr   r   r   r1   r   r   rT   initcopy_r   )r5   r   r   r7   s      r8   r  "Glm4vPreTrainedModel._init_weights  sn    f%f899fllu||Avzz1TYT_T_/`cicmcm/mnoHJJv1 :r:   r  )rP   rQ   rR   rS   r#   r   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendr  rV   rW   rX   s   @r8   r  r    sN    1&*#02DE"3N!"&2 2r:   r  c                      ^  \ rS rSr% \\S'   SrS/r\\	S.r
SU 4S jjrS r\\\S	\R"                  S
\R"                  S\\   S\\-  4S j5       5       5       rSrU =r$ )Glm4vVisionModeli  rh   )r  r  r2  r;   r  r+   c                 8  > [         TU ]  U5        UR                  U l        UR                  U l        [	        U5      U l        [        U5      U l        UR                  UR                  -  n[        US-  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[!        U5      PM     sn5      U l        [%        UR&                  UR(                  UR*                  S9U l        [/        UR                  UR0                  S9U l        [        R4                  " UR                  UR&                  UR                  UR                  S9U l        [/        UR                  UR0                  S9U l        SU l        U R=                  5         g s  snf )Nr=   )r   r   rf   r4  )ry   out_channelsru   rv   F)r-   r.   spatial_merge_sizerw   r   r   rr   patch_embedr6   r  r   r  r/   
ModuleListr   depthr2  blocksr   r`   ra   rf   mergerr(   r5  post_conv_layernormConv2d
downsamplepost_layernormgradient_checkpointing	post_init)r5   rh   r   r+  r7   s       r8   r.   Glm4vVisionModel.__init__  sG    "(";"; ++/708%%)9)998QGmmuV\\GZ$[GZ!%5f%=GZ$[\,&&F4L4LY_YjYj
 $00B0BH[H[#\ ))**//11,,	
 +6+=+=6CVCVW&+# %\s   &Fc                    / nU GHn  u  p4n[         R                  " U5      R                  S5      R                  SU5      nUR	                  X@R
                  -  U R
                  XPR
                  -  U R
                  5      nUR                  SSSS5      nUR                  5       n[         R                  " U5      R                  S5      R                  US5      nUR	                  X@R
                  -  U R
                  XPR
                  -  U R
                  5      nUR                  SSSS5      nUR                  5       nUR                  [         R                  " Xg/SS9R                  US5      5        GMq     [         R                  " USS9nUS S 2SS 24   R                  5       nU R                  U5      n	X   R                  S5      n
X4$ )Nr"   r>   r   r=   r   r   )r1   r   r   r   r   r  r   rr  appendr   r   r   r#  r  )r5   grid_thwpos_idsthwhpos_idswpos_idsmax_grid_sizerotary_pos_emb_fullr  s              r8   rot_pos_embGlm4vVisionModel.rot_pos_emb  s   GA!||A003::2qAH'',,,'',,,''	H  ''1a3H'')H||A003::1bAH'',,,'',,,''	H  ''1a3H'')HNN5;;';DKKAqQR)  * ))G+ AB++-"11-@,5==a@&&r:   r;   r  r   c           	      f   U R                  U5      nU R                  U5      nU R                  U5      u  pE[        R                  " XD4SS9nUR                  5       UR                  5       4n[        R                  " USS2S4   USS2S4   -  USS2S4   5      R                  S[        R                  R                  5       (       a  UR                  O[        R                  S9n[        R                  " USSS	9nUSS USS -
  R                  5       n	U R!                  UU	UUSS2S4   R#                  UR$                  5      USS2S4   R#                  UR$                  5      5      nU R&                   H  n
U
" U4UUS
.UD6nM     U R)                  U5      nUR+                  SU R,                  U R,                  UR.                  S   5      nUR1                  SSSS5      nU R3                  U5      R+                  SU R4                  R6                  5      nU R9                  U5      n[;        UUS9$ )a$  
hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
    The final hidden states of the model.
grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
    The temporal, height and width of feature shape of each image in LLM.

Returns:
    `torch.Tensor`: hidden_states.
r>   r   Nr"   r=   r   r   )r"   r   )r   )r  r  r   )r  pooler_output)r  r  r  r1   r   r   r   ru  cumsumjit
is_tracingr@   int32r   padr%  r   rA   r   r  r  r   r  rL   r   r  rh   r`   r  r   )r5   r;   r  r   r  image_type_idsre  r  r  seqlensblkmerged_hidden_statess               r8   rH   Glm4vVisionModel.forward  s     ((700?)-)9)9()C&ii8bA"wwy#'')4,,Xad^hq!tn-LhWXZ[W[n]dd
 %*II$8$8$:$:(.. e 

 UU:vQ7
ab>JsO3;;=1a4 ##M$8$891a4 ##M$8$89
 ;;C%$7 	M  ++M:%**'')@)@-BUBUVXBY
 &--aAq96;;B@[@[\#{{=9)+.
 	
r:   )r  r  r   r  r  r  rw   r  r  r  r  r=  )rP   rQ   rR   rS   r%   r   r  r  r2  r  _can_record_outputsr.   r  r    r!   r   r1   rU   r   r   rK   r   rH   rV   rW   rX   s   @r8   r  r    s    )+,)*
8':  9
"\\9
5:\\9
MSTfMg9
	+	+9
    9
r:   r  c                   "  ^  \ rS rSr% \\S'   Sr\\S.r	S\4U 4S jjr
\\\      SS\R                  S-  S\R                   S-  S	\R                  S-  S
\S-  S\R$                  S-  S\S-  S\\   S\\-  4S jj5       5       5       rSrU =r$ )Glm4vTextModeli6  rh   )r  r  c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        [#        US9U l        SU l        U R)                  5         g s  snf )Nr4  rh   F)r-   r.   pad_token_idpadding_idx
vocab_sizer/   r   r6   embed_tokensr  r   num_hidden_layersr  layersr(   r5  normr?  
rotary_embr  r  r  s      r8   r.   Glm4vTextModel.__init__?  s     !.. ++LL):):F<N<NPTP`P`ammGLVMeMeGfgGf)"65Gfg
 !!3!39L9LM	2&A&+# hs   C?N	input_idsr   rb  r  inputs_embedsr  r   r+   c           	      l   US L US L-  (       a  [        S5      eU(       a9  Uc6  [        R                  R                  5       (       d  [	        U R
                  S9nUc  U R                  U5      nUcv  Ub  UR                  5       OSn[        R                  " UR                  S   UR                  S9U-   nUR                  SSS5      R                  SUR                  S   S5      nO3UR                  S:X  a#  US	   R                  SUR                  S   S5      nUR                  S:X  a  UR                  S   S
:X  a  US   n	USS  nOS n	U R
                  UUUU	S.n
[        S0 U
D6nUnU R                  XS9nU R                    H  nU" U4UU	UUS.UD6nUnM     U R#                  U5      n[%        UUS9$ )N:You must specify exactly one of input_ids or inputs_embedsr  r   r"   r   r>   r   r=   N.   )rh   r  r   r  rb  )rb  )r   rb  r  r  )r  r  r  )
ValueErrorr1   r  r  r   rh   r  get_seq_lengthr   rL   r   r   r   ndimr   r  r  r  r   )r5   r  r   rb  r  r  r  r   past_seen_tokenstext_position_idsmask_kwargscausal_maskr;   r  decoder_layerlayer_outputss                   r8   rH   Glm4vTextModel.forwardO  s    -t";<YZZ 09M9M9O9O*$++>O  --i8M CRC^==?de <<(;(;A(>}G[G[\_ooL',,Q26==aATATUVAWY[\L!#'	299!\=O=OPQ=RTVWL !l&8&8&;q&@ ,Q'+L !% kk*,.-
 )7;7%"oomoW![[M)*. /$7 M *M ) 		-0&++
 	
r:   )r  r  r  r  r  r  r  )NNNNNN)rP   rQ   rR   rS   r$   r   r  r  r~  r  r.   r   r    r!   r1   r  rU   r   r  rp   r   r   rK   r   rH   rV   rW   rX   s   @r8   r  r  6  s     .(
    .2.204(,26!%J
##d*J
 t+J
 &&-	J

 J
 ((4/J
 $;J
 -.J
 
(	(J
    J
r:   r  c                   p  ^  \ rS rSrSrSrSS/rU 4S jrS rS r	    S'S
\
S\\
\
\
4   \R                  -  S\
S\
S\
S\\R                  -  S	-  4S jjr   S(S\R"                  S\R$                  S\R"                  S	-  S\R"                  S	-  S\R                  S	-  S\\R                  \R                  4   4S jjr\\ S)S\R.                  S\R"                  S	-  S\\   S\\-  4S jj5       5       r\\ S)S\R.                  S\R"                  S	-  S\\   S\\-  4S jj5       5       r  S*S\R"                  S\R.                  S\R.                  S	-  S\R.                  S	-  4S  jjr     S+S\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S!\R                  S	-  S\R$                  S	-  S\R                  S	-  4S" jjr\\           S,S\R"                  S	-  S\R                  S	-  S#\R"                  S	-  S!\S	-  S\R.                  S	-  S\R                  S	-  S\R.                  S	-  S\R"                  S	-  S\R"                  S	-  S$\R"                  S	-  S\R$                  S	-  S\\   S\\ -  4S% jj5       5       r!S&r"U =r#$ )-
Glm4vModeli  r  Fr  r2  c                    > [         TU ]  U5        [        R                  UR                  5      U l        [        R                  UR                  5      U l        S U l	        U R                  5         g rk   )r-   r.   r  _from_configvision_configvisualr  text_configlanguage_modelr  r  r   s     r8   r.   Glm4vModel.__init__  sU     &33F4H4HI,99&:L:LM 	r:   c                 6    U R                   R                  5       $ rk   )r  get_input_embeddingsrM   s    r8   r  Glm4vModel.get_input_embeddings  s    ""7799r:   c                 :    U R                   R                  U5        g rk   )r  set_input_embeddingsr5   r   s     r8   r  Glm4vModel.set_input_embeddings  s    007r:   Nstart_positionr  temp_merge_sizer  time_intervalr   c                    US   R                  5       U-  US   R                  5       U-  US   R                  5       U-  pn[        R                  " XvS9U-  n
[        R                  " XS9U-   n[        R                  " XS9U-   nUR                  X-  5      nUR	                  U	5      R                  U5      nU
R	                  X-  5      U-   n
[        R
                  " XU/SS9nU$ )a  
Compute 3D positional indices for vision tokens derived from a single image or video input.

The positions are generated from the input grid defined by temporal (T), height (H), and
width (W) dimensions. Temporal and spatial dimensions can be downscaled according to the
merge sizes used in the vision backbone. The resulting positions are offset by `start_position`.

Args:
    start_position (`int`):
        Offset added to all computed positional indices.
    grid_thw (`Sequence[int]` or `torch.Tensor` of shape `(3,)`):
        The (T, H, W) grid representing the feature layout of the current image or video after patch embedding.
    temp_merge_size (`int`, *optional*):
        Factor by which the temporal dimension is reduced in the backbone. The temporal grid size is divided
        by this value. Defaults to 1.
    spatial_merge_size (`int`, *optional*):
        Factor by which the spatial dimensions (H and W) are reduced in the backbone. Both H and W are divided
        by this value. Defaults to 1.
    time_interval (`int`, *optional*):
        Spacing factor applied between consecutive temporal position indices.Defaults to 1.
    device (`str` or `torch.device`, *optional*):
        Device on which the resulting tensor is allocated. If `None`, uses the current default device.

Returns:
    torch.LongTensor of shape (3, sequence_length):
        Positional indices for temporal, height, and width dimensions,
        flattened into sequence form and offset by `start_position`.
r   r"   r=   r   r   )itemr1   r   r   ru  r   )r5   r  r  r  r  r  r   
llm_grid_t
llm_grid_h
llm_grid_wposition_temporalposition_widthposition_heightvision_position_idss                 r8   get_vision_position_ids"Glm4vModel.get_vision_position_ids  s    L QK/1QK"44QK"44 !+
 "LLCmSj@>Q,,zANR (..z/FG);;JGNNzZ-??
@WX[ii#kk+<~*^def""r:   r  mm_token_type_idsimage_grid_thwvideo_grid_thwr   r+   c           
         Ub%  [         R                  " XDSS2S4   SS9nSUSS2S4'   U R                  R                  R                  n/ n[         R
                  " SUR                  S   UR                  S   UR                  UR                  S9n	Ub  [        U5      OSUb  [        U5      OSS.n
[        U5       GH  u  pX+   nUb*  XU   R                  5          nXU   R                  5          n/ n[        R                  " [        UR                  5       5      S 5       H8  u  nn[        U5      nUS   S   nUS	   S   S-   nUR!                  UUU45        M:     Sn/ nU H  u  nnnUS:X  a]  UU-
  nUR!                  [         R"                  " UUR                  S
9R%                  SS	5      R'                  SS	5      U-   5        UU-  nMj  [)        U
U   5      nU R+                  UUSXqR                  S
9nUR!                  U5        U[-        US   US   5      U-  -  nM     [         R.                  " USS9R1                  SS	5      nUb4  UR3                  U	R                  5      U	SS2XU   R                  5       4'   O"UR3                  U	R                  5      U	SS2U4'   UR!                  UR-                  5       S-   [5        U5      -
  5        GM     [         R6                  " XR                  S
9R9                  S5      nX4$ )a9  
Difference from Qwen2VL/Qwen2.5VL's get_rope_index:
- GLM4V uses timestamps to seperate each video frame, so the video_grid_thw should also be split too.

Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
        it.
    mm_token_type_ids (`torch.IntTensor` of shape `(batch_size, sequence_length)`):
        Token type ids matching each modality to a different value in the input sequence, i.e. text (0), image (1), video (2).
    image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
        The temporal, height and width of feature shape of each image in LLM.
    video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
        The temporal, height and width of feature shape of each video in LLM.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

Returns:
    position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
    mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
Nr   r   r"   r   r@   r   )r"   r=   c                     U S   $ )Nr"   r  )r   s    r8   <lambda>+Glm4vModel.get_rope_index.<locals>.<lambda>+  s    `abc`dr:   r>   r   r=   )r1   ru  rh   r  r  zerosrL   r@   r   iterrh  rp   	itertoolsgroupbyr%  r   r  r   r   r   nextr)  r#  r   r   rA   r   r   r   )r5   r  r+  r,  r-  r   r   r  mrope_position_deltasrb  
grid_iters	batch_idxcurrent_input_idsinput_token_typeinput_type_groupr   groupstart_index	end_indexcurrent_posllm_pos_ids_listmodality_type	start_idxend_idxtext_lenr  r(  llm_positionss                               r8   get_rope_indexGlm4vModel.get_rope_index  s)   F %"44^TUWXTXEY_`aN#$N1a4 ![[66II "{{OOAOOA//##
 (6'AtN#t'5'AtN#t


 -6i,@(I0;)$5Y6O6T6T6V$W!#394M4R4R4T#U !'//	:J:Q:Q:S0TVde
UU#Ahqk!"IaL1,	 ''k9(EF	 f K!5E1y' A%&2H$++Xi6F6FGLLQPRSZZ[\^`adoo  8+K  $J}$=>H*.*F*F#Xq2DM]M] +G +' %++,?@3x{HQK#@DV#VVK 6F  "II&6A>FFq"MM)O\O_O_`l`s`sOtQ	)+D+I+I+KKL-:-=-=l>Q>Q-RQ	\*!(():):)<q)@3GXCY)YZI -AJ !&-BK[K[ \ f fgh i22r:   pixel_values_videosr   c                 2   UR                  U R                  R                  5      n/ nUR                  5       nU HN  u  pgn[        R
                  " SXx/5      R                  S5      R                  US5      n	UR                  U	5        MP     [        R                  " USS9n
U R                  " U4U
SS.UD6nUR                  S5      U R                  R                  S-  -  R                  5       n[        R                  " UR                  U5      nXl        U$ )3  
pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
    The tensors corresponding to the input videos.
video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
    The temporal, height and width of feature shape of each video in LLM.
r"   r   r   T)r  return_dictr>   r=   )r`  r  r@   r%  r1   r   r   r   r  r   prodr  r$  r  )r5   rJ  r-  r   temp_frames_hwvideo_grid_thw_listr  r  r  repeated_rowflattened_video_grid_thwvision_outputssplit_sizesvideo_embedss                 r8   get_video_featuresGlm4vModel.get_video_featuresL  s    266t{{7H7HI,335*GA! <<A	2<<Q?FFq!LL!!,/ + $)99^#C 
*BPT
X^
 &**2.$++2P2PRS2SS[[]{{>#?#?M'3$r:   pixel_valuesc                 :   UR                  U R                  R                  5      nU R                  " U4SU0UD6nUR                  S5      U R                  R                  S-  -  R                  5       n[        R                  " UR                  U5      nXdl        U$ ),  
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
    The tensors corresponding to the input images.
image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
    The temporal, height and width of feature shape of each image in LLM.
r  r>   r=   )	r`  r  r@   rN  r  r%  r1   r$  r  )r5   rX  r,  r   rS  rT  image_embedss          r8   get_image_featuresGlm4vModel.get_image_featuresk  s     $(():):;\UNUfU%**2.$++2P2PRS2SS[[]{{>#?#?M'3$r:   r  image_featuresvideo_featuresc           	      D   Uc  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nX R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nO0XR                  R                  :H  nXR                  R                  :H  nUR                  5       nUR                  S5      R                  U5      R                  UR                  5      nUb@  [        X%   R                  5       UR                  5       :H  SU SUR                  S    35        UR                  5       nUR                  S5      R                  U5      R                  UR                  5      nUb@  [        X&   R                  5       UR                  5       :H  SU SUR                  S    35        XV4$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
r/  r>   z6Image features and image tokens do not match, tokens: z, features: r   z6Video features and video tokens do not match, tokens: )r  r1   r   rh   image_token_idr   r   allvideo_token_idsumr   	expand_asrA   r   numelrL   )	r5   r  r  r^  r_  special_image_maskspecial_video_maskn_image_tokensn_video_tokenss	            r8   get_placeholder_maskGlm4vModel.get_placeholder_mask  s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!; "+kk.H.H!H!*kk.H.H!H+//1/99"=GGVYYZgZnZno%"1779^=Q=Q=SSHHXXdeseyeyz{e|d}~
 ,//1/99"=GGVYYZgZnZno%"1779^=Q=Q=SSHHXXdeseyeyz{e|d}~ "55r:   r  c                    Uc  SOUR                  5       nUS L=(       d    US Ln	U	(       a  Uc  Ub  [        S5      eUS L=(       a    US L=(       a    U	n
U
(       a0  U R                  b  US:X  a  U R                  UUUUUS9u  pXl        U$ U R                  Gb8  US:  d  UGc.  UR                  u  pnUbu  UR                  5       R                  S5      S-
  nUR                  US:H  S5      nUR                  SUS5      R                  SSS5      R                  UR                  5      nOV[        R                  " XU-   5      nUR                  SSS5      R                  SUS5      R                  UR                  5      nU R                  R                  XR                  R                  S   -  SS9nUUR                  UR                  S9-   nU$ S nU$ )	Nr   a  Multimodal data was passed (via `image_grid_thw` or `video_grid_thw`) but `mm_token_type_ids` is missing. Please pass `mm_token_type_ids` to the model so that multimodal RoPE (M-RoPE) can be computed correctly. `mm_token_type_ids` is returned by the processor alongside `input_ids`.)r,  r-  r   r+  r>   r"   r   r   r   )r  r  r  rH  rL   r   r  masked_fillr   r   rA   r   r1   r   r   ru  )r5   r  r  r,  r-  r   r  r+  past_key_values_lengthhas_multimodalcan_compute_mroperb  r  
batch_sizer'  r+  deltas                    r8   compute_3d_position_ids"Glm4vModel.compute_3d_position_ids  s    '6&=?CaCaCc't3Q~T7Q/7I<Qn 
 &T1f6Gt6SfXf$"2"2":>TXY>Y(,(;(;---"3 )< )%L  +&  )/E/IYM^(5(;(;%JA)-224;;B?!C+77!8KQO+00JCJJ1aQRSVVWdWkWkl$||,B]gDgh+00Ar:AA!ZQSTWWXeXlXlm$$66zEUEUE[E[\]E^7^de6fE'%((-:N:N(*OOL   Lr:   rb  r  c           
         USL USL-  (       a  [        S5      eUc  U R                  5       " U5      nUbv  U R                  XhSS9R                  n[        R
                  " USS9R                  UR                  UR                  5      nU R                  XUS9u  pUR                  X5      nUbx  U R                  XySS9R                  n[        R
                  " USS9R                  UR                  UR                  5      nU R                  XUS9u  nnUR                  UU5      nUc  U R                  UUU	UUUUS	9nU R                  " SSUUUUS
.UD6n[        S0 UDSU R                  0D6$ )a  
image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
    The temporal, height and width of feature shape of each image in LLM.
video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
    The temporal, height and width of feature shape of each video in LLM.
rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
    The rope index difference between sequence length and multimodal rope.
Nr  T)rM  r   r   )r^  )r_  )r  r,  r-  r  r   r  r+  )r  rb  r   r  r  r  r  )r  r  r\  r  r1   r   rA   r   r@   rk  masked_scatterrV  rt  r  r  r  )r5   r  r   rb  r  r  rX  rJ  r,  r-  r  r+  r   r[  
image_maskr+  rU  
video_maskoutputss                      r8   rH   Glm4vModel.forward  s   2 -t";<YZZ  557	BM#22<]a2bppL 99\q9<<]=Q=QS`SfSfgL 55i_k5lMJ)88RM*223Fdh2iwwL 99\q9<<]=Q=QS`SfSfgL 55i_k5lMAz)88\RM77#--+- /"3 8 L %% 
%)+'
 
 ( 

((
 	
r:   )r  r  r  )r"   r"   r"   Nrn  rk   r0  )NNNNN)NNNNNNNNNNN)$rP   rQ   rR   rS   r  accepts_loss_kwargsr  r.   r  r  r   r   r1   rU   r   r   r)  r  	IntTensorrK   rH  r   r   r  r   r   r   rV  r\  rk  rt  r   r  rH   rV   rW   rX   s   @r8   r  r    s8   02DE:8  !"#,08#8# sC}%48# 	8#
  8# 8# ell"T)8#| 3726.2[3##[3 !??[3 ((4/	[3
 ((4/[3 t+[3 
u||U\\)	*[3z  37".. ((4/ +,	
 
+	+  :  37'' ((4/ +,	
 
+	+  0 4837(6##(6 (((6 ))D0	(6
 ))D0(6\ /3.2.2/348/<<$&/ ||d*/ t+	/
 t+/ t+/ ,/ !??T1/ 
	/b  .2.204(,26,08<2626/348@
##d*@
 t+@
 &&-	@

 @
 ((4/@
 llT)@
 #..5@
 ((4/@
 ((4/@
 %%,@
 !??T1@
 +,@
 
)	)@
  @
r:   r  zQ
    Base class for Glm4v causal language model (or autoregressive) outputs.
    c                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                   S-  \S	'   S
rg)Glm4vCausalLMOutputWithPasti!  a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
    The rope index difference between sequence length and multimodal rope.
Nlosslogitsr  r;   r  r  r  )rP   rQ   rR   rS   r  r  r1   r  r   r  r  r   r;   rK   r  r  r  rV   r  r:   r8   r  r  !  s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/6+/K!!D(/r:   r  c                      ^  \ rS rSrSS0rSrU 4S jrS rS r\	 S!S	\
R                  S
\
R                  S-  S\\   S\\-  4S jj5       r\	 S!S\
R                  S\
R                  S-  S\\   S\\-  4S jj5       r\\	            S"S\
R                  S-  S\
R(                  S-  S\
R                  S-  S\S-  S\
R                  S-  S\
R                  S-  S\
R(                  S-  S	\
R                  S-  S\
R                  S-  S
\
R                  S-  S\
R,                  S-  S\\
R(                  -  S\\   S\\-  4S jj5       5       r          S#U 4S jjrU 4S jr S!S\
R                  S-  S\
R(                  S-  S\\
R(                  \
R(                  4   4S jjr   S$S\S\S\
R                  S-  S\\
R                  \\\ 4   4   4S jjr!S r"U =r#$ )%Glm4vForConditionalGenerationi>  zlm_head.weightz(model.language_model.embed_tokens.weightFc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g )NFr_   )r-   r.   r  r  r/   rb   r  r6   r  lm_headr  r   s     r8   r.   &Glm4vForConditionalGeneration.__init__C  sS     '
yy!3!3!?!?ASASA^A^ejkr:   c                 6    U R                   R                  5       $ rk   )r  r  rM   s    r8   r  2Glm4vForConditionalGeneration.get_input_embeddingsJ  s    zz..00r:   c                 :    U R                   R                  U5        g rk   )r  r  r  s     r8   r  2Glm4vForConditionalGeneration.set_input_embeddingsM  s    

''.r:   NrJ  r-  r   r+   c                 >    U R                   R                  " SXS.UD6$ )rL  )rJ  r-  r  )r  rV  )r5   rJ  r-  r   s       r8   rV  0Glm4vForConditionalGeneration.get_video_featuresP  s+     zz,, 
 3
V\
 	
r:   rX  r,  c                 >    U R                   R                  " SXS.UD6$ )rZ  )rX  r,  r  )r  r\  )r5   rX  r,  r   s       r8   r\  0Glm4vForConditionalGeneration.get_image_featuresa  s"     zz,,p,pioppr:   r  r   rb  r  r  labelsr+  logits_to_keepc                    U R                   " SUUUU	U
UUUUUS.
UD6nUS   n[        U[        5      (       a  [        U* S5      OUnU R	                  USS2USS24   5      nSnUb.  U R                  UX`R                  R                  R                  S9n[        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
    The temporal, height and width of feature shape of each image in LLM.
video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
    The temporal, height and width of feature shape of each video in LLM.

Example:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Glm4vForConditionalGeneration

>>> model = Glm4vForConditionalGeneration.from_pretrained("zai-org/GLM-4.1V-9B-Thinking")
>>> processor = AutoProcessor.from_pretrained("zai-org/GLM-4.1V-9B-Thinking")

>>> messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
            {"type": "text", "text": "What is shown in this image?"},
        ],
    },
]
>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
>>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
```)
r  rX  rJ  r,  r-  r+  rb  r   r  r  r   N)r  r  r  )r  r  r  r;   r  r  r  )r  r   r   slicer  loss_functionrh   r  r  r  r  r;   r  r  )r5   r  r   rb  r  r  r  rX  rJ  r,  r-  r+  r  r   rz  r;   slice_indicesr  r  s                      r8   rH   %Glm4vForConditionalGeneration.forwardp  s    x ** 
% 3))/%)+'
 
  
 9C>SV8W8W~ot4]kmA}a,?@A%%VF{{OfOfOqOq%rD*#33!//))++
 	
r:   c                 p   > [         TU ]  " U4UUUUUUU	U
UUS.
UD6nU(       d  U(       a
  S US'   S US'   U$ )N)
r  r   r  rb  rX  rJ  r,  r-  r  is_first_iterationrX  rJ  )r-   prepare_inputs_for_generation)r5   r  r  r   r  rb  r  rX  rJ  r,  r-  r  r   model_inputsr7   s                 r8   r  ;Glm4vForConditionalGeneration.prepare_inputs_for_generation  sf    " w<
+)'%% 3))1
 
 "i+/L(26L./r:   c                   > [         TU ]  X5      nSnUR                  S5      =nb  UR                  5       nUS:w  a5  U R                  R
                  b  US   U R                  R
                  -   nU$ SU;   a  US   R                  S   S:  a  US   n[        UR                  5      S:H  =(       a-    UR                  [        R                  [        R                  4;   nU(       a  UR                  S5      b  UR                  S5      c  UR                  S	5      b\  UR                  5        VV	s0 s H  u  pUS:w  d  M  X_M     nnn	U R                  R                  " U40 UD6u  pXR                  l        OmUR                  S5      R                  S
SS5      n
[        R                   " UR                  S   S[        R                  UR"                  S9U R                  l        US   n[        R$                  " X:/SS9nU$ s  sn	nf )Nr   r  r  r  r"   r=   r+  r,  r-  r   r>   r/  r   )r-   $_prepare_position_ids_for_generationrN  r  r  r  rL   r   r@   r1   r   r   itemsrH  r   r   r3  r   r   )r5   inputs_tensormodel_kwargstext_positionspast_lengthcacherb  is_input_idsr   r-  vision_positionsr  r7   s               r8   r  BGlm4vForConditionalGeneration._prepare_position_ids_for_generation  s    Emb !%%&788EE..0K!

 6 6 B))4tzz7M7MML ,&<+D+J+J1+MPQ+Q(5M=../14g9L9LQVQZQZ\a\f\fPg9g  !45A!!"23?<CSCSTdCeCq-9-?-?-AV-ATQQ+EUDAD-ALV,0JJ,E,Em,dWc,d)%0JJ"-77:AA!RL%*[[##A&MDXDX&DJJ"
 (	2yy.!CK Ws   /H?Hc           	         UGb  UU R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  S   nUU R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  S   nUU R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  S   nOHXR                  R                  :H  nXR                  R                  :H  nXR                  R                  :H  n[        R                  " UR                  5       UR                  5       -
  SS9nUS:  nX7) -  nUR                  SS9n	UR                  SS9n
X4$ )a  
Get the number of images and videos for each sample to calculate the separation length of the sample tensor.
These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications.

Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary.

Returns:
    image_nums (`torch.LongTensor` of shape `(batch_size, num_images_sample)`)
    video_nums (`torch.LongTensor` of shape `(batch_size, num_videos_sample)`)
r/  ).r   r"   r   r   )r  r1   r   rh   image_start_token_idr   r   video_start_token_idvideo_end_token_idr  r   rd  )r5   r  r  is_imageis_video_startis_video_endvideo_levelinside_videostandalone_imagesimage_countsvideo_countss              r8   _get_image_nums_and_video_nums<Glm4vForConditionalGeneration._get_image_nums_and_video_nums  s   $ $,,.LL!A!A\i\p\pq H ,,.LL!A!A\i\p\pq N ,,.LL!?!?uzzZgZnZno L !KK$D$DDH&++*J*JJN$(F(FFL ll>#5#5#7,:J:J:L#LRST"Q %6 ),,,3%))a)0))r:   expand_sizeis_encoder_decoderc                    ^ ^^^^ TS:X  a  TT4$ / SQmUUUU 4S jnUU4S jnU" T5      mTb  TR                  TSS9mU" T5      mU(       a+  TR                  S5      c  [        S5      eU" TS   5      TS'   TT4$ )	Nr"   )rX  r,  rJ  r-  second_per_grid_tsc           	        > TR                  SS 5      nTR                  SS 5      nTR                  TTR                  SS 5      S9u  p4S nU  GH/  nUS:X  aa  [        R                  " U[	        U5      5      nU Vs/ s H&  n[        R
                  " USS9R                  5       PM(     n	nU" X   U	T
S	9X'   Mk  US:X  a  [	        U5      n	U" X   U	T
S	9X'   M  US
:X  aa  [        R                  " U[	        U5      5      nU Vs/ s H&  n[        R
                  " USS9R                  5       PM(     n	nU" X   U	T
S	9X'   M  US:X  a  [	        U5      n	U" X   U	T
S	9X'   GM  US:X  d  GM  U" X   [	        U5      T
S	9X'   GM2     U $ s  snf s  snf )Nr,  r-  r  )r  c                     [         R                  " X5      nU/S/U R                  5       S-
  -  -   n[         R                  " U Vs/ s H  oUR                  " U6 PM     snSS9nU$ s  snf )Nr"   r   r   )r1   r$  r   r   r   )r   r   repeat_timessamplesrepeat_argssamplerl  s          r8   _repeat_interleave_samplesڋGlm4vForConditionalGeneration._expand_inputs_for_generation.<locals>._expand_dict_for_generation_visual.<locals>._repeat_interleave_samplesi  s_    ++a1+nsaeegk/BBg#VgFMM;$?g#V\]^ $Ws   A&rX  r"   r   )r   r  rJ  r  )rN  r  r1   r$  r   rN  rd  )dict_to_expandr,  r-  
image_nums
video_numsr  r   r  r  r   r  r  r  r5   s             r8   "_expand_dict_for_generation_visualgGlm4vForConditionalGeneration._expand_inputs_for_generation.<locals>._expand_dict_for_generation_visualb  s   )--.>EN)--.>EN%)%H%H)9)9/4)P &I &"J &.(#kk.$z:JKGMTUW6uzz&a8<<>WGU*D&+W;+N' ,,":.G*D&+W;+N' 11#kk.$z:JKGMTUW6uzz&a8<<>WGU*D&+W;+N' ,,":.G*D&+W;+N' 00*D&+T*5ET_+N'7 &< "!3 V Vs   ;-F-Fc                   > U  Hw  nUS:X  a(  X   R                   S:X  a  X   R                  TSS9X'   M1  X   c  M8  [        X   [        R                  5      (       d  M[  UT;  d  Mc  X   R                  TSS9X'   My     U $ )Nrb  r   r"   r   r   )r  ru  r   r1   rU   )r  r   r  visual_keyss     r8   _expand_dict_for_generation`Glm4vForConditionalGeneration._expand_inputs_for_generation.<locals>._expand_dict_for_generation  s    %.(^-@-E-E-J*8*=*O*OP[ab*O*cN'"'3">#6EE;.*8*=*O*OP[ab*O*cN' & "!r:   r   r   encoder_outputszMIf `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.)ru  rN  r  )r5   r  r  r  r  r  r  r  s   `` ``  @r8   _expand_inputs_for_generation;Glm4vForConditionalGeneration._expand_inputs_for_generationQ  s     !l**w+	" +	"Z
	" :,G !33KQ3GI2<@ 12: !pqq.I,WhJi.jL*+,&&r:   )r  r  rk   )NNNNNNNNNNNr   )
NNNNTNNNNF)r"   FN)$rP   rQ   rR   rS   _tied_weights_keysr|  r.   r  r  r   r1   r  r  r   r   rK   r   rV  r\  r   rU   r   r}  r   r  rH   r  r  r  rp   dictr   r   r  rV   rW   rX   s   @r8   r  r  >  s   *,VW1/  37
"..
 ((4/
 +,	

 
+	+
 
   37q''q ((4/q +,	q
 
+	+q q  .2.204(,26*.,08<262648-.Y
##d*Y
 t+Y
 &&-	Y

 Y
 ((4/Y
   4'Y
 llT)Y
 #..5Y
 ((4/Y
 ((4/Y
 !??T1Y
 ell*Y
 +,Y
 
,	,Y
  Y
|   $L$R .26*##d*6* ||d*6* 
u||U\\)	*	6*t #(-1	V'V' !V' ##d*	V' 
uc3h/	0V' V'r:   r  )r  r  r  r  r  )r  )r"   )[r5  collections.abcr   dataclassesr   typingr   r   r1   torch.nnr/   torch.nn.functionalr  r   r    r	   r  activationsr
   cache_utilsr   r   
generationr   integrationsr   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   r    utils.output_capturingr!   configuration_glm4vr#   r$   r%   Moduler(   rZ   rr   r   r   r   r   rU   rK   r   r   r   rT   r  r  r2  r?  rs  r|  r~  r  r  r  r  r  r  r  r  r  __all__r  r:   r8   <module>r     s  (  $ !        & ! . ) 7 / B 9 ` ` K F & a a e e 5 P P Y'J299 J (J(fBII fBII (  fRYY f"HBII HV(||+0<<>Cll
5<<%&	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%2P299 Pf1 DJryy JZ6%PC) C)L)299 )$/6 /d 
 0{ 0 0$ 2? 2 2(~
+ ~
B e
) e
 e
P ~
% ~
 ~
B 
 0+ 0 0.i'$8/ i'X xr:   