
    Z jL5                         S SK rSSKJr  SSKJr  SSKJrJrJ	r	J
r
  SSKJrJr  SSKJrJr  SSKJr  \R&                  " \5      r " S	 S
\SS9r\ " S S\	5      5       rS/rg)    N   )BatchFeature)
ImageInput)MultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)auto_docstringlogging)
VideoInputc                   *    \ rS rSrSSSS.SS0S.rSrg)	Glm4vProcessorKwargs"   FT)paddingreturn_token_type_idsreturn_mm_token_type_idsreturn_metadata)text_kwargsvideos_kwargs N)__name__
__module____qualname____firstlineno__	_defaults__static_attributes__r       {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/glm4v/processing_glm4v.pyr   r   "   s#     %*(,

 ,T2Ir   r   F)totalc                      ^  \ rS rSrSU 4S jjr\   SS\S-  S\\-  \	\   -  \	\   -  S\
S-  S\\   S\4
S	 jj5       rSS
 jr SS jr\U 4S j5       rS\	S\	\	\      4S jrS rSrU =r$ )Glm4vProcessor-   Nc                   > [        US5      (       d  SOUR                  U l        [        US5      (       d  SOUR                  U l        [        USS 5      (       a  UR                  OUR                  U R                  5      U l        [        USS 5      (       a  UR                  OUR                  U R                  5      U l        [        TU ]!  XX4S9  UR                  S5      U l	        UR                  S	5      U l
        g )
Nimage_tokenz	<|image|>video_tokenz	<|video|>image_token_idvideo_token_id)chat_templatez<|begin_of_video|>z<|end_of_video|>)hasattrr&   r'   getattrr(   convert_tokens_to_idsr)   super__init__video_start_idvideo_end_id)selfimage_processor	tokenizervideo_processorr*   kwargs	__class__s         r    r/   Glm4vProcessor.__init__/   s    .5i.O.O;U^UjUj.5i.O.O;U^UjUj y"2D99 $$001A1AB 	 y"2D99 $$001A1AB 	
 	_b'==>RS%;;<NOr   imagestextvideosr6   returnc                 :   U R                   " [        4SU R                  R                  0UD6nUb  U R                  " SSU0US   D6nUS   nO0 nSnUbJ  U R
                  " SSU0US   D6nUR                  S5      (       d  UR                  S	5      n	OUS	   n	US
   n
O0 nSn
[        U[        5      (       d  U/nUR                  5       nUb  U R                  R                  S-  nSn[        [        U5      5       H  nU R                  X-   ;   aR  X|   R                  5       U-  nX-   R!                  U R                  SU-  S5      X-'   US-  nU R                  X-   ;   a  MR  X-   R!                  SU R                  5      X-'   M     U
Gb  U R
                  R                  S-  nSn[        [        U5      5       GH  nU R"                  X-   ;   Ga  X   S   nSnW	U   nUR$                  c  [&        R)                  S5        UR$                  c  SOUR$                  Ul        UR*                  SSS2   n/ n[        S[        U5      5       H  nUR-                  UU   5        M     USU n[        U5      U:  a.  UR-                  U(       a  US   OS5        [        U5      U:  a  M.  [        U5       H  nUU   nU R/                  U5      nUU-  nM      X-   R!                  U R"                  US5      X-'   X   R                  5       U-  X   S   -  n[        U5       H;  nU R                  X-   ;   d  M  X-   R!                  U R                  SU-  S5      X-'   M=     US-  nU R"                  X-   ;   a  GM  X-   R!                  SU R                  5      X-'   GM     US   R                  SS5      nUS   R                  SS5      nU R                  " U40 US   D6nU R1                  UUSS/S9  U(       a  U R3                  US   5      US'   [5        0 UEUEUEUS9$ )a5  
Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
    - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
    - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
    - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
tokenizer_init_kwargsNr9   images_kwargsimage_grid_thwr;   r   r   video_metadatavideo_grid_thw   r   z<|placeholder|>    a  SmolVLM requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results.   r   return_tensorsr   Fimagevideo)
modalities	input_idsmm_token_type_ids)datatensor_typer   )_merge_kwargsr   r4   init_kwargsr3   r5   getpop
isinstancelistcopy
merge_sizerangelenr&   prodreplacer'   fpsloggerwarning_once
timestampsappendreplace_frame_token_id_check_special_mm_tokenscreate_mm_token_type_idsr   )r2   r9   r:   r;   r6   output_kwargsimage_inputsr@   videos_inputsrA   rB   merge_lengthindexinum_image_tokensvideo_index
num_framesvideo_structuremetadatar_   unique_timestampsidxselected_timestamps	frame_idxtimestamp_secframe_structurerH   r   text_inputss                                r    __call__Glm4vProcessor.__call__@   sj   * ** 
"&.."<"<
 

 //`v`A_`L)*:;NL!N 00aa-P_B`aM::/00!.!2!23C!D!./?!@*+;<NM!N$%%6Dyy{%//::A=LE3t9%&&$'1'5'<'A'A'C|'S$"good.>.>@QTd@dfghDGQJE &&$'1 '//*;T=M=MN & %//::A=LK3t9%&&$'1!/!<Q!?J&(O-k:H||+++q
 *2)=28<<HL!)!4!4SqS!9J(*%$QJ8)00CA  9 +<KZ*H'12Z?+22Na3Fr3Jghi 12Z? &+:%6	(;I(F*.*E*Em*T'?: &7
 #good.>.>QRSDG&388:lJnNijkNll % &+:%6	++tw6&*good6F6FHY\lHlno&pDG &7  1$KG &&$'1J '//*;T=M=MNM &N '}599:JDQ#0#?#C#CD^`e#f nnTJ]=-IJ%%dKWgDV%W#/3/L/L[YdMe/fK+,!QK!Q<!Q=!Q_mnnr   c                    0 nUb  [         R                  R                  S0 5      nUR                  U5        UR                  SS5      =(       d    U R                  R
                  nU Vs/ s H!  nU R                  R                  " / UQUP76 PM#     nnU V	s/ s H
  oUS-  -  PM     n
n	UR                  XS.5        Ubz  [         R                  R                  S0 5      nUR                  U5        U Vs/ s H!  nU R                  R                  " / UQUP76 PM#     nnU V	s/ s H
  oWS-  -  PM     nn	XS'   [        S0 UD6$ s  snf s  sn	f s  snf s  sn	f )	a  
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
Args:
    image_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (height, width) per each image.
    video_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (num_frames, height, width) per each video.
Returns:
    `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
    input modalities, along with other useful data.
Nr?   rW   rC   )rj   num_image_patchesr   num_video_tokensr   )
r   r   rR   updater3   rW   get_number_of_image_patchesr5   get_number_of_video_patchesr   )r2   image_sizesvideo_sizesr6   vision_datar?   rW   
image_sizery   num_patchesrj   r   
video_sizenum_video_patchesrz   s                  r    _get_num_multimodal_tokens)Glm4vProcessor._get_num_multimodal_tokens   s    "0::>>PRSM  (&**<>a$BVBVBaBaJ #.!"-J $$@@\*\m\"-  ! SddRc;
A!=Rcd4Dmn"0::>>PRSM  ( #.!"-J $$@@\*\m\"-  ! SddRc;
A!=Rcd.>*+,,,#!  e!  es   *(EE6(E$Ec                 B    U R                   R                  " U4UUS.UD6$ )a*  
Post-process the output of the model to decode the text.

Args:
    generated_outputs (`torch.Tensor` or `np.ndarray`):
        The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
        or `(sequence_length,)`.
    skip_special_tokens (`bool`, *optional*, defaults to `True`):
        Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
    clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
        Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
    **kwargs:
        Additional arguments to be passed to the tokenizer's `batch_decode method`.

Returns:
    `list[str]`: The decoded text.
)skip_special_tokensclean_up_tokenization_spaces)r4   batch_decode)r2   generated_outputsr   r   r6   s        r    post_process_image_text_to_text.Glm4vProcessor.post_process_image_text_to_text   s3    ( ~~**
 3)E
 	
 	
r   c                 >   > [         TU ]  nUR                  S5        U$ )NrM   )r.   model_input_namesr`   )r2   r   r7   s     r    r    Glm4vProcessor.model_input_names   s#    !G5  !45  r   rL   c                    / nU H  n[         R                  " U5      n[         R                  " U5      n[         R                  " X@R                  :H  SS9n[         R                  " X@R
                  :H  SS9nXg:  nSXTU R                  :H  U-  '   SXTU R                  :H  U) -  '   UR                  UR                  5       5        M     U$ )Nr   )axisrC   rD   )	nparray
zeros_likecumsumr0   r1   r(   r`   tolist)	r2   rL   rM   input	array_idsmm_token_typesstartsendsis_video_modalitys	            r    rc   'Glm4vProcessor.create_mm_token_type_ids   s     EI]]51N
 YYy,?,??aHF99Y*;*;;!DD &UVN)<)<<@QQRXYN)<)<<BSASTU$$^%:%:%<=  ! r   c                 8    SU R                    S[        U5       3$ )Nz<|begin_of_image|>z<|end_of_image|>)r&   int)r2   rs   s     r    ra   %Glm4vProcessor.replace_frame_token_id
  s#    #D$4$4#55Ec-FXEYZZr   )r&   r(   r1   r0   r'   r)   )NNNN)NNN)NN)TF)r   r   r   r   r/   r   r   r   r
   rU   r   r	   r   r   rv   r   r   propertyr   r   rc   ra   r   __classcell__)r7   s   @r    r#   r#   -   s    P"  %)Z^$(	koT!ko ++d9o=EV@WWko T!	ko
 -.ko 
ko koZ$-N Y^
6 ! !
!$ !4S	? !*[ [r   r#   )numpyr   feature_extraction_utilsr   image_utilsr   processing_utilsr   r   r   r	   tokenization_utils_baser
   r   utilsr   r   video_utilsr   
get_loggerr   r]   r   r#   __all__r   r   r    <module>r      sl   *  4 % X X C , % 
		H	%+5  ][^ ][ ][@ 
r   