
    Z j<1                         S SK rSSKJr  SSKJrJrJr  SSKJ	r	J
r
JrJr  SSKJrJr  SSKJr  SSKJr   " S	 S
\
SS9r\ " S S\5      5       rS/rg)    N   )BatchFeature)
ImageInputconcatenate_listmake_flat_list_of_images)MultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)auto_docstring)
VideoInputc                   .    \ rS rSrSSS.SS0SS0S	.rS
rg)InternVLProcessorKwargs   leftF)padding_sidereturn_mm_token_type_idscrop_to_patchesTreturn_tensorspt)text_kwargsimages_kwargsvideos_kwargs N)__name__
__module____qualname____firstlineno__	_defaults__static_attributes__r       ځ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/internvl/processing_internvl.pyr   r      s.     #(-

 t
 d
Ir#   r   F)totalc                   0  ^  \ rS rSr     SS\4U 4S jjjrS\\   S\\   S\\   S\R                  S	\R                  S
\R                  4S jr
\   SS\S-  S\\-  \\   -  \\   -  S-  S\S-  S\\   S\4
S jj5       rSS jr\S 5       rSrU =r$ )InternVLProcessor)   Nimage_seq_lengthc                 x  > [         TU ]  " XU4SU0UD6  X@l        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  U l	        UR                  U l        U R                  U R
                  U R                  /U l        g)z
image_seq_length (`int`, *optional*, defaults to 256):
    The number of image token to use per image patch. it should be set so that:
    image_seq_length = (config.image_size // config.patch_size) ** 2 * (config.scale_factor**2)
chat_templateN)super__init__r)   start_image_tokenend_image_tokenstart_image_token_idend_image_token_idcontext_image_tokenimage_tokenvideo_tokencontext_image_token_idimage_token_id	image_ids)selfimage_processor	tokenizervideo_processorr)   r+   kwargs	__class__s          r$   r-   InternVLProcessor.__init__+   s     	_lTalekl 0!*!<!<(88$-$B$B!"+">">$88$00'>>--t/H/H$JaJabr#   textimage_num_patchesvideo_num_patchesimage_num_patches_indicesvideo_num_patches_indicesvideo_patch_indicesc	           	      R  ^ ^ Sn	Sn
/ n/ n/ nU GH  nUnT R                   U;   d  T R                  U;   Ga  T R                   U;   a  T R                  U;  d8  UR                  T R                   5      UR                  T R                  5      :  a  U	S:  a  XiS-
     OSnXi   nUR                  UUU 5        UR	                  T R                   SS5      nUR                  T R
                   T R                   T R                  -  XI   -   T R                   35        U	S-  n	OX   nXS-      nUU   nUU   nUR                  UUU 5        [        UUU 5      mSR                  UU 4S j[        [        T5      5       5       5      nUR                  U5        UR	                  T R                  SS5      nU
S-  n
T R                   U;   a  GM  T R                  U;   a  GM  SU;   a,  UR                  S5      nUR	                  SUS5      nSU;   a  M,  UR                  U5        GM     XX4$ )z
Processes interleaved text with <image> and <video> placeholders, replacing them with appropriate
image and video tokens while keeping track of the patches used.
r      z<placeholder>
c              3      >#    U  HE  nS US-    STR                    TR                  TR                  -  TU   -   TR                   3v   MG     g7f)FramerF   z: N)r.   r3   r)   r/   ).0inum_patchesr8   s     r$   	<genexpr>?InternVLProcessor._insert_media_placeholders.<locals>.<genexpr>w   sr      -!8A  Awb)?)?(@AQAQTXTiTiAilwxylzAz@{  }A  }Q  }Q  |R  S!8s   AA)r3   r4   indexappendreplacer.   r)   r/   listjoinrangelenpop)r8   r?   image_pixel_valuesvideo_pixel_valuesr@   rA   rB   rC   rD   image_indexvideo_indexprocessed_textimage_video_patchesreplace_stringsprompt
new_promptstart_index	end_indexcurrent_patch_indexend_patch_indexvideo_promptreplace_strrL   s   `                     @r$   _insert_media_placeholders,InternVLProcessor._insert_media_placeholdersE   sr      FJ""j0D4D4D
4R##z1$$J6!''(8(89J<L<LTM]M]<^^ Q\^_P_";!O"LefK 9 FI'../A+i/XY!+!3!3D4D4DoWX!YJ#**11243C3CdF[F[3[^o^|3|2}  C  S  S  ~T  U  1$K
 +>*J'&9/&JO";<O"PK 9/ JI'../A+i/XY"&'89L_']"^K#'99 -!&s;'7!8- $L $**<8!+!3!3D4D4DoWX!YJ1$KA ""j0D4D4D
4RB "Z/-11!4'//aP
 "Z/ !!*-M P KLLr#   imagesvideosr<   returnc           
         Uc  [        S5      eU R                  " [        4SU R                  R                  0UD6n[        U[        [        45      (       d  U/n/ nSn[        R                  " S/5      nUbu  U R                  R                  U5      n[        U5      nU R                  " SSU0US   D6n	U	R                  S5      nU	R                  S5      n[        R                  " U5      n/ n
Sn[        R                  " S/5      n[        R                  " S/5      nUb  US	   nU R                  " SS
U0UD6nUR                  S5      nUR                   tnnn[        R"                  " UU5      n[%        U5      n[        R&                  " US-   [(        5      nSUS'   [        R                  " U5      USS& S/U-  n
[        R&                  " US-   [(        5      nSUS'   [        R                  " U
5      USS& UR+                  SS5      n0 nUc  Ubd  U R-                  UUUUU
UUU5      u  nnnnUb  U[/        U5      :w  a  [        S5      eUb  U[/        W5      :w  a  [        S5      eS[1        U5      0nUS   R                  SS5      nUS   R                  SS5      nU R                  " U40 US   D6nU R3                  UUS/S9  U(       a  U R5                  US   5      US'   [7        0 UEUEUS9$ )a  
Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
NzYou have to specify text.tokenizer_init_kwargsr   rh   r   rL   pixel_valuesr   ri   pixel_values_videosrF   zONumber of image placeholders in the prompt does not match the number of images.zONumber of video placeholders in the prompt does not match the number of videos.r   r   r   image)
modalities	input_idsmm_token_type_ids)datatensor_typer   )
ValueError_merge_kwargsr   r:   init_kwargs
isinstancerR   tuplenparrayr9   fetch_imagesr   rV   cumsumr;   shapefullsumemptyintflattenrf   rU   r   _check_special_mm_tokenscreate_mm_token_type_idsr   )r8   rh   r?   ri   r<   output_kwargsr@   rW   rB   image_inputsrA   rX   rD   rC   video_kwargsvideo_inputs
batch_size
num_frames_num_frames_per_videoimage_videos_inputsr\   rY   rZ   r   r   text_inputss                              r$   __call__InternVLProcessor.__call__   sC   $ <899**#
"&.."<"<
 
 $u..6D !$&HHaSM!))66v>F-f5F//`v`A_`L , 0 0 ?!-!1!1.!A(*		2C(D%! hhsm$&HHaSM!(9L//NvNNL!-!1!12G!H);)A)A&J
Q#%77:z#B 12J"$((:>3"?%&"&(ii0D&E#!"j 0(*a(E%+,%a(,.II6G,H%ab)!3!;!;Aq!A !3BFBaBa""!!))#	C?D%{K !kS[&@ !rss!kS9M5N&N !rss $23CDW3X"Y&}599:JDQ#0#?#C#CD^`d#e nnTJ]=-IJ%%dKWI%N#/3/L/L[YdMe/fK+,!GK!G3F!GUcddr#   c                 ^   0 nUb  [         R                  R                  S0 5      nUR                  U5        U Vs/ s H!  nU R                  R
                  " / UQUP76 PM#     nnU Vs/ s H  nSU R                  U-  -   PM     nnUR                  XS.5        [        S0 UD6$ s  snf s  snf )a{  
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

Args:
    image_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (height, width) per each image.

Returns:
    `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
    input modalities, along with other useful data.
r      )num_image_tokensnum_image_patchesr   )r   r!   getupdater9   get_number_of_image_patchesr)   r   )	r8   image_sizesr<   vision_datar   
image_sizer   rL   r   s	            r$   _get_num_multimodal_tokens,InternVLProcessor._get_num_multimodal_tokens   s     "3==AA/SUVM  ( #.!"-J $$@@\*\m\"-  !
 ^oo]nkT%:%:[%H I]no4Dmn,,,!
  ps   (B%)B*c                 b    U R                   R                  nU R                  R                  nX-   $ N)r:   model_input_namesr9   )r8   tokenizer_input_namesimage_processor_input_namess      r$   r   #InternVLProcessor.model_input_names   s/     !% @ @&*&:&:&L&L#$BBr#   )	r/   r1   r7   r)   r3   r6   r.   r0   r4   )NNN   N)NNNr   )r   r   r   r    r   r-   rR   strrz   ndarrayrf   r   r   r   r   r   r   r   r   r   r   propertyr   r"   __classcell__)r=   s   @r$   r'   r'   )   s3     #c
 c c4>M3i>M
  9>M  9>M $&::>M $&::>M  ZZ>M@  %)ae$(	YeT!Ye ++d9o=EV@WWZ^^Ye T!	Ye
 01Ye 
Ye Yev-8 C Cr#   r'   )numpyrz   image_processing_utilsr   image_utilsr   r   r   processing_utilsr   r	   r
   r   tokenization_utils_baser   r   utilsr   video_utilsr   r   r'   __all__r   r#   r$   <module>r      s]      2 Q Q X X C # %.e  XC XC XCv 
r#   