
    Z j7                     F   S SK rS SKrSSKJrJr  SSKJrJrJ	r	J
r
  SSKJrJr  SSKJrJrJr  SSKJr  SSKJrJrJr  \" 5       (       a  S S	KJr  \R6                  " \5      rS
rSrSr Sr!Sr"S\#S   S\#\$   4S jr%S\$S\&\$\$4   4S jr' " S S\SS9r( " S S\5      r)S/r*g)    N   )BatchFeatureget_size_dict)IMAGENET_STANDARD_MEANIMAGENET_STANDARD_STDPILImageResamplingSizeDict)UnpackVideosKwargs)
TensorTypeis_torchvision_availablelogging)BaseVideoProcessor)VideoMetadatagroup_videos_by_shapereorder_videos)
functionalzYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.zgYou are provided the following series of {frame_count} frames from a {video_duration} [H:MM:SS] video.
z

z
Frame from {timestamp}:i   videostorch.Tensorreturnc                     [        S5      =pU  H.  nUR                  5       SS u  pE[        XA5      n[        XR5      nM0     X4$ )z@
Get the maximum height and width across all videos in a batch.
z-infN)floatsizemax)r   
max_height	max_widthvideoheightwidths         څ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/smolvlm/video_processing_smolvlm.pyget_max_height_widthr"   /   sM     #6]*J

RS),
)	  ""    resolution_max_sidec                 4   U R                  5       SS u  p#[        [        U5      nUc  [        X#5      OUnX2-  nX2:  a  Un[	        X4-  5      nUS-  S:w  a  US-  nO"X#:  a  Un[	        X$-  5      nUS-  S:w  a  US-  n[        US5      n[        US5      nX#4$ )a  
Get the output size of the video after resizing given a dictionary specifying the max and min sizes.
Args:
    video (`np.ndarray`):
        Video to resize.
    resolution_max_side (`int`):
        The longest edge of the video will be resized to this value. The shortest edge will be resized to keep the
        input aspect ratio.
Returns:
    The output size of the video after resizing.
r   N   r      )r   minMAX_IMAGE_SIZEr   int)r   r$   r   r    aspect_ratios        r!   get_resize_output_image_sizer,   ;   s     JJL%MF n.AB0C0K#f,Qd>L#U)*A:?aKF	$F)*19>QJE^FqME=r#   c                   *    \ rS rSr% \\\4   \S'   Srg)SmolVLMVideoProcessorInitKwargsc   max_image_size N)	__name__
__module____qualname____firstlineno__dictstrr*   __annotations____static_attributes__r1   r#   r!   r.   r.   c   s    cN"r#   r.   F)totalc                     ^  \ rS rSr\R
                  rSS0rSS0r\	r
\rSrSrSrSrSrSr\rSS/rS	\\   4U 4S
 jjr  S*SSS\SSS\SS4
U 4S jjjr  S+SSS\\\4   S\S\S\4
S jjr   S,S\S\S-  S\\-  S-  S\S-  4S jjr  S-S\!S   S\S \S\SSS!\S"\S#\S$\S%\\!\   -  S-  S&\\!\   -  S-  S'\"\#-  S-  4S( jjr$S)r%U =r&$ ).SmolVLMVideoProcessorg   longest_edgei  il  TFpixel_valuespixel_attention_maskkwargsc                    > [         TU ]  " S0 UD6  SU;   a  SU;   a  US   US   S'   SU;   a:  US   S   U l        US   S   U l        [	        US   S   U R
                  S9U l        g g )Nr   video_sampling
video_size
max_framesfps)default_to_squarer1   )super__init__
num_framesrF   r   rG   r   )selfrA   	__class__s     r!   rI   SmolVLMVideoProcessor.__init__v   s    "6" V 0F :5;F^F#$\2v%$%56|DDO./6DH%f-=&>|&L`d`v`vwDI &r#   Nr   r   r   resamplez7PILImageResampling | tvF.InterpolationMode | int | None	antialiasr   c                   > UR                   (       a  [        UUR                   S9nOJUR                  (       a*  UR                  (       a  UR                  UR                  4nO[	        SU S35      e[
        TU ]  U[        US   US   S9X4S9n[        U R                  S   U R                  S   S9n[
        TU ]  XX4S9nU$ )	a  
Resize an video to `(size["height"], size["width"])`.
Args:
    video (`torch.Tensor`):
        Video to resize.
    size (`SizeDict`):
        Dictionary in the format `{"height": int, "width": int}` specifying the size of the output video.
    resample (`PILImageResampling` or `InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
        Resampling filter to use when resizing the video.
Returns:
    `torch.Tensor`: The resized video.
)r$   zHSize must contain 'height' and 'width' keys, or 'longest_edge' key. Got .r   r'   )r   r    )rN   rO   r>   )	r>   r,   r   r    
ValueErrorrH   resizer	   r0   )	rK   r   r   rN   rO   rA   new_sizemax_sizerL   s	           r!   rS   SmolVLMVideoProcessor.resize   s    (  4$($5$5H [[TZZTZZ0Hghlgmmnopp88A;hqkBX  
 4#6#6~#FdNaNabpNqruWr#   padded_sizemax_num_framesfillreturn_pixel_maskc                    UR                  5       SS nUS   US   -
  nUS   US   -
  nX1R                  S   -
  n	US:  d  US:  a  [        SU SU S35      eXb:w  a  SUSUSSSU	/n
[        R                  " XUS9nSnU(       a@  [
        R                  " US	SSS2SS24   [
        R                  S
9nSUS	SUS   2SUS   24'   X4$ )a  Pads the sample with empty video to the padded_size
Args:
    video (`torch.Tensor`):
        Video to pad.
    padded_size (`tuple[int, int]`):
        Height and width to pad.
    max_num_frames (`int`):
        The maximum number of frames to which video will be padded.
    fill (`int`, *optional*):
        The value to use for the padding.
    return_pixel_mask (`bool`, *optional*, defaults to `True`):
        Whether to return a pixel mask.
r   Nr   r'   zzPadding dimensions are negative. Please make sure that the padded size is larger than the original size. Got padded size: z, original size: rQ   )rY   .dtype)r   shaperR   tvFpadtorch
zeros_likeint64)rK   r   rW   rX   rY   rZ   original_sizepadding_heightpadding_widthpadding_framepadding
pixel_masks               r!   r`   SmolVLMVideoProcessor.pad   s   * 

RS)$Q-*::#Aq)99&Q71 233>-?PQ^P__`b  '-NAq!]SGGGE6E 
))%Q1*=U[[QJFGJs.mA..0B-2B0BBC  r#   metadatarJ   rF   	skip_secsc                    Ub  [        USS5      c  [        S5      eUb  UOU R                  nUb  UOU R                  nUR                  n[        [        X1S   -  5      5      n[        Xr5      nUS:  a  SnSn	US-
  n
US:  a4  US   SU-  -
  X#-  :  a#  [        XAS   -  5      n	[        XdUS   -  -
  5      n
[        SU	5      n	[        XS-
  5      n
X:  a  SUS-
  p[        R                  " XU[
        S9n[        R                  " U5      nU$ )	a  
Video sampling function which:
    - Uses `num_frames` (if provided) or calculates it from `fps` and metadata.
    - Applies a basic center-skip if fewer frames than available, otherwise
        optionally skips `skip_secs` from both the start and end.
    - Uniformly samples the desired number of frames between the start and end indices.

Args:
    metadata (`VideoMetadata`):
        Metadata of the video containing information about total duration, fps and total number of frames.
    num_frames (`int`, *optional*):
        Maximum number of frames to sample. Defaults to `self.num_frames`.
    fps (`int` or `float`, *optional*):
        Target frames to sample per second. Defaults to `self.fps`.
    skip_secs (`float`, *optional*, defaults to `1`):
        Number of seconds to skip from the start and end if the video is long enough.

Returns:
    np.ndarray:
        Indices to sample video frames.
NrF   zAsked to sample frames per second but no video metadata was provided which is required when sampling in SmolVLM. Please pass in `VideoMetadata` object or set `do_sample_frames=False`durationr'   r   r&   r\   )getattrrR   rJ   rF   total_num_framesr*   roundr(   r   nplinspaceunique)rK   rk   rJ   rF   rl   rA   rp   estimated_framesdesired_frames	start_idxend_idxindicess               r!   sample_frames#SmolVLMVideoProcessor.sample_frames   s7   : wx=EX 
 $.#9Zt
_c$((#44 uSJ+?%?@A -:AN 	"Q&q=hz2Q]BzGWXI78I*%-HHIG9%	g!34!"$4q$8w++i.L))G$r#   r   do_convert_rgb	do_resize
do_rescalerescale_factordo_normalizedo_pad
image_mean	image_stdreturn_tensorsc           	         [        U5      u  p0 nUR                  5        H:  u  nnU(       a  U R                  U5      nU(       a  U R                  UXES9nUUU'   M<     [	        UU5      n[        U5      u  p0 nUR                  5        H  u  nnU R                  UXgXU5      nUUU'   M!     [	        UU5      nU	(       a~  [        U5      n[        S U 5       5      n[        U5      u  p0 n0 nUR                  5        H$  u  nnU R                  UUUS9u  nnUUU'   UUU'   M&     [	        UU5      n[	        UU5      nSU0nU	(       a$  U	(       a  Ub  [        R                  " WSS9OWUS'   [        UUS9$ )	N)r   rN   c              3   8   #    U  H  n[        U5      v   M     g 7fN)len).0r   s     r!   	<genexpr>4SmolVLMVideoProcessor._preprocess.<locals>.<genexpr>>  s      J9IU9Is   )rW   rX   r?   r   )dimr@   )tensor_type)r   itemsconvert_to_rgbrS   r   rescale_and_normalizer"   r   r`   ra   stackr   )rK   r   r|   r}   r   rN   r~   r   r   r   r   r   r   rA   grouped_videosgrouped_videos_indexresized_videos_groupedr^   stacked_videosresized_videosprocessed_videos_groupedprocessed_videospad_sizerX   processed_padded_mask_groupedpadded_masksr@   datas                               r!   _preprocess!SmolVLMVideoProcessor._preprocess  s     0EV/L,!#%3%9%9%;!E>!%!4!4^!D!%^$!Z,:"5) &< ((>@TU/D^/T,#% %3%9%9%;!E>!77
LV_N /=$U+	 &< **BDXY+,<=H  J9I JJN3HIY3Z0N,.)')$)7)=)=)?%~/3xx" 08 0, 3A(/7C-e4 *@  ..FH\]#12OQe#f  01 n8 0a8) '(
 Dn==r#   )rF   rJ   r   )NT)r   T)NNr'   r   )'r2   r3   r4   r5   r   LANCZOSrN   r   r0   r   r   r   r   r}   r~   r   r|   r   do_sample_framesr.   valid_kwargsmodel_input_namesr
   rI   r	   boolrS   tupler*   r`   r   r   rz   listr7   r   r   r9   __classcell__)rL   s   @r!   r<   r<   g   s   !))HG$D$c*N'J%IIJLNF2L')?@
x(G!H 
x  OS(( ( L	(
 ( 
( (^ "&)!)! 38_)! 	)!
 )!  )!\ "&"& !?? $J? 5[4	?
 :?\ 37=>^$=> => 	=>
 => L=> => => => => DK'$.=> 4;&-=> j(4/=> =>r#   r<   )+numpyrr   ra   image_processing_utilsr   r   image_utilsr   r   r   r	   processing_utilsr
   r   utilsr   r   r   video_processing_utilsr   video_utilsr   r   r   torchvision.transforms.v2r   r_   
get_loggerr2   loggerDEFAULT_SYSTEM_MESSAGEDEFAULT_VIDEO_INTRODEFAULT_MEDIA_OUTTROFRAME_TIMESTAMP_MESSAGEr)   r   r*   r"   r   r,   r.   r<   __all__r1   r#   r!   <module>r      s       A  5 B B 8 O O ; 
		H	% V n   5 	#n!5 	#$s) 	#%% 38_%P#l% #n>. n>b #
#r#   