
    Z j:&                        S r SSKrSSKrSSKrSSKJr  SSKJrJ	r	J
r
JrJrJr  SSKJrJr  SSKJrJrJr  SSKJrJr  SS	KJrJrJr  S
SKJr  \" 5       (       a  SSKJr    " S S\SS9r!\" S\S5       " S S\5      5       r"S/r#g)z#video processor class for GLM-4.1V.    N   )BatchFeature)OPENAI_CLIP_MEANOPENAI_CLIP_STDChannelDimensionPILImageResamplingSizeDictget_image_size)UnpackVideosKwargs)
TensorTypeadd_start_docstringsis_torchvision_available)BASE_VIDEO_PROCESSOR_DOCSTRINGBaseVideoProcessor)VideoMetadatagroup_videos_by_shapereorder_videos   )smart_resize)
functionalc                   R    \ rS rSr% \\\4   \S'   \\S'   \\S'   \\S'   \\S'   Srg)	Glm4vVideoProcessorInitKwargs)   max_image_size
patch_sizetemporal_patch_size
merge_sizemax_duration N)	__name__
__module____qualname____firstlineno__dictstrint__annotations____static_attributes__r        ځ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/glm4v/video_processing_glm4v.pyr   r   )   s&    cN"OOr*   r   F)totalzfConstructs a fast GLM-4V image processor that dynamically resizes videos based on the original videos.aj  
        patch_size (`int`, *optional*, defaults to 14):
            The spacial patch size of the vision encoder.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            The temporal patch size of the vision encoder.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size of the vision encoder to llm encoder.
    c                     ^  \ rS rSr\R
                  rSSS.rSS0r\	r
\rSrSrSrSrSrSrSrS	rSr\rS
rSrSS/rS\\   4U 4S jjrS\4U 4S jjr S'S\S\ \!-  S-  4S jjr"SSS\R
                  SSSSSSSSS4S\#\$RJ                     S\&S\&S\'S-  SSS\&S\!S\&S\!\#\!   -  S-  S \!\#\!   -  S-  S!\ S-  S"\ S-  S#\ S-  S$\(\)-  S-  4S% jjr*S&r+U =r,$ )(Glm4vVideoProcessor1   i 1  i )shortest_edgelongest_edger1   T      i,     pixel_values_videosvideo_grid_thwkwargsc                 &   > [         TU ]  " S0 UD6  g )Nr    )super__init__)selfr7   	__class__s     r+   r:   Glm4vVideoProcessor.__init__R   s    "6"r*   returnc                    > [         TU ]  " S0 UD6nUR                  SU R                  5      nUR                  (       a  UR
                  (       d  [        S5      eU$ )z
Update kwargs that need further processing before being validated
Can be overridden by subclasses to customize the processing of kwargs.
sizez:size must contain 'shortest_edge' and 'longest_edge' keys.r    )r9   _standardize_kwargsgetr@   r0   r1   
ValueError)r;   r7   r@   r<   s      r+   rA   'Glm4vVideoProcessor._standardize_kwargsU   sM    
 ,6v6zz&$)),!!):):YZZr*   Nmetadatafpsc                     Ub  [        USS5      c  [        S5      eUR                  nUb  UOU R                  nUS-
  nUR                  =(       d    [        XaR                  -  5      S-   nXpR                  ::  as  [        [        R                  " Xu-  5      5      n[        U5       V	s/ s H;  n	[        U[        [        R                  " XR                  -  U-  5      5      5      PM=     n
n	O[        U R                  U-  5      nX:  a  [        [        U5      5      n
O[[        R                  " SX{SS9nU Vs/ s H8  n[        U[        [        R                  " XR                  -  5      5      5      PM:     n
n[!        5       / pU
 H-  nUU;  d  M  UR#                  U5        UR%                  U5        M/     ['        U5      S-  (       a  UR%                  US   5        [        R(                  " U5      $ s  sn	f s  snf )	a?  
Args:
    metadata (`VideoMetadata`):
        Metadata of the video containing information about total duration, fps and total number of frames.
    fps (`int` or `float`, *optional*):
        Target frames to sample per second. Defaults to `self.fps`.
Returns:
    np.ndarray:
        Indices to sample video frames.
NrF   zAsked to sample frames per second but no video metadata was provided which is required when sampling in GLM4V. Please pass in `VideoMetadata` object or set `do_sample_frames=False`r   r   T)endpoint)getattrrC   total_num_framesrF   durationroundr   r'   mathfloorrangeminceillistnplinspacesetaddappendlenarray)r;   rE   rF   r7   total_framesrequested_fpsmax_frame_idxrL   niframe_indicesnum_samplestarget_secondstseenuniqidxs                    r+   sample_frames!Glm4vVideoProcessor.sample_frames`   s     wx=EX 
  00"DHH$q($$Omll.J(Ka(O(((DJJx789AkpqrkstksfgSDIIa,,>NQ^>^4_0`aksMtMd//-?@K* $U<%8 9!#QPT!U_m n_mZ[]C		!llBR8S4T!U_m nUBd C$C  !
 t9q=KKR!xx~% u !os   )AG6:?G;gp?videosdo_convert_rgb	do_resizer@   resamplez7PILImageResampling | tvF.InterpolationMode | int | None
do_rescalerescale_factordo_normalize
image_mean	image_stdr   r   r   return_tensorsc                    [        U5      u  nn0 nUR                  5        H  u  nnUR                  u  nnnnnUUUnnnU(       am  [        UUUUX-  UR                  UR
                  S9u  nnUR                  UU-  UUU5      nU R                  U[        UUS9US9nUR                  UUUUU5      nUUU'   M     [        UU5      n[        U5      u  nn0 n 0 n!UR                  5        GH  u  nn[        US   [        R                  S9u  nnU R                  UXgXU
5      nUn"U"R                  S   U-  S:w  a8  U"S S 2SS 24   R                  SUS-
  SSS5      n#[        R                   " U"U#/SS9n"U"R                  S S	 u  n$n%n&U%U-  n%UU-  UU-  n(n'U"R                  U$U%UU&U'U-  UUU(U-  UU5
      n"U"R#                  SSS
SSSS	SSS5
      n"U"R%                  U$U%U'-  U(-  U&U-  U-  U-  5      n)U)U U'   U%U'U(//U$-  U!U'   GM     [        U U5      n*[        U!U5      n![        R                   " U*SS9n+[        R&                  " U!5      n,U+U,S.n-[)        U-US9$ )N)
num_framesheightwidthtemporal_factorfactor
min_pixels
max_pixels)ru   rv   )r@   rl   r   )channel_dimr   rI   )dimr               r3      	   )r5   r6   )datatensor_type)r   itemsshaper   r0   r1   viewresizer	   r   r
   r   FIRSTrescale_and_normalizerepeattorchcatpermutereshapetensorr   ).r;   ri   rj   rk   r@   rl   rm   rn   ro   rp   rq   r   r   r   rr   r7   grouped_videosgrouped_videos_indexresized_videos_groupedr   stacked_videosBTCHWrt   ru   rv   resized_heightresized_widthresized_videosprocessed_videos_groupedprocessed_gridspatchesrepeats
batch_sizegrid_tchannelgrid_hgrid_wflatten_patchesprocessed_videosr5   r6   r   s.                                                 r+   _preprocessGlm4vVideoProcessor._preprocess   s   $ 0EV/L,,!#%3%9%9%;!E>*00MAq!Q()1aJ0<)!$7%2#11#001- "0!4!4QUAq!!D!%"!}M% "- "
 "0!4!4Q1nm!\,:"5)) &<* ((>@TU 0E^/T,,#% %3%9%9%;!E>,:>!;LZjZpZp,q)NM "77
LV_N %G }}Q"55:!!RS&/004G!4KQPQSTU))Wg$6A>*1--*;'J22F+z9=J;VFFll#*$*$G ooaAq!Q1aCG%oo&(--
:ZGO />$U+'-vv&>%?*%LOE"K &<N **BDXY(:NO#ii(8a@o6#6,

 >BBr*   r    )N)-r!   r"   r#   r$   r   BICUBICrl   r@   r   r   rp   r   rq   rk   rm   ro   rj   do_sample_framesr   r   r   r   r   valid_kwargsrt   rF   model_input_namesr   r:   r%   rA   r   r'   floatrg   rS   r   Tensorboolr	   r&   r   r   r)   __classcell__)r<   s   @r+   r.   r.   1   s    "))H&8KLD$&9:N!JIIJLNJLJ0LJ
C.0@A#(E!F #	t 	 #'00 5[40j  $ $N`NhNh )!1504!%*.!%26aCU\\"aC aC 	aC
 oaC LaC aC aC aC DK'$.aC 4;&-aC $JaC !4ZaC $JaC j(4/aC aCr*   r.   )$__doc__rN   numpyrT   r   image_processing_utilsr   image_utilsr   r   r   r   r	   r
   processing_utilsr   r   utilsr   r   r   video_processing_utilsr   r   video_utilsr   r   r   image_processing_glm4vr   torchvision.transforms.v2r   tvFr   r.   __all__r    r*   r+   <module>r      s    *    2  5 O O X O O 0 ;L  l"vC, vCvCr !
!r*   