
    Z j\                        S SK r S SKrS SKrS SKJr  SSKJr  SSK	J
r
  SSKJrJr  SSKJrJrJrJrJrJr  SSKJrJr  SS	KJrJr   " S
 S\SS9rSr S"S\S\S\S-  S\S-  S\\\4   4
S jjr S#S\S\S\ \!\4   S-  S\\\4   4S jjr"SSS\S\\\4   4S jr#S\$\$S      S\\\4   4S jr%S\$\$S      S\4S jr&S\$\$S      SS4S jr'\ " S  S!\5      5       r(S!/r)g)$    N)
functional   )TorchvisionBackend)BatchFeature)group_images_by_shapereorder_images)IMAGENET_STANDARD_MEANIMAGENET_STANDARD_STD
ImageInputPILImageResamplingSizeDictmake_nested_list_of_images)ImagesKwargsUnpack)
TensorTypeauto_docstringc                   B    \ rS rSr% Sr\\S'   \\\	4   \S'   \\S'   Sr
g)SmolVLMImageProcessorKwargs,   aZ  
do_image_splitting (`bool`, *optional*, defaults to `True`):
    Whether to split the image into sub-images concatenated with the original image. They are split into patches
    such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`.
max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`):
    Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge".
return_row_col_info (`bool`, *optional*, defaults to `False`):
    Whether to return the row and column information of the images.
do_image_splittingmax_image_sizereturn_row_col_info N)__name__
__module____qualname____firstlineno____doc__bool__annotations__dictstrint__static_attributes__r       څ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/smolvlm/image_processing_smolvlm.pyr   r   ,   s#     cN"r%   r   F)totali   heightwidthmin_lenmax_lenreturnc                     Uc  [        X5      OUnX-  nX:  a  Un[        X-  5      n U S-  S:w  a  U S-  n O"X:  a  Un [        X-  5      nUS-  S:w  a  US-  n[        X5      n [        X5      nX4$ )a  
Get the output size of the image after resizing given a dictionary specifying the max and min sizes.
Args:
    height (`int`):
        Height of the input image.
    width (`int`):
        Width of the input image.
    min_len (`int`, *optional*, defaults to 1):
        Minimum size of the output image.
    max_len (`int`, *optional*, defaults to the maximum size of the image):
        Maximum size of the output image.
Returns:
    The output size of the image after resizing.
   r      maxr#   )r(   r)   r*   r+   aspect_ratios        r&   &_resize_output_size_rescale_to_max_lenr3   ?   s    " %,Oc& G>LU)*A:?aKF	F)*19>QJE !FE=r%   c                     Uc  [        X5      OUnX-  nX:  a  X:  a  Un[        X-  5      n OX:  a  X:  a  Un [        X-  5      n[        U S5      n [        US5      nX4$ )a  
Get the output size of the image after resizing given a dictionary specifying the max and min sizes.
Args:
    height (`int`):
        Height of the input image.
    width (`int`):
        Width of the input image.
    max_len (`Dict[str, int]`, *optional*, defaults to the maximum size of the image):
        Defines the maximum dimensions of the image.
Returns:
    The output size of the image after resizing.
r/   r0   )r(   r)   r+   r2   s       r&   +_resize_output_size_scale_below_upper_boundr5   d   sr     %,Oc& G>L5?U)*	F,F)* ^FqME=r%   imagetorch.Tensorresolution_max_sidec                 b    U R                   SS u  p#[        X#US9u  p#[        X#[        S9u  p#X#4$ )a  
Get the output size of the image after resizing given a dictionary specifying the max and min sizes.
Args:
    image (`torch.Tensor`):
        Image to resize.
    resolution_max_side (`int`):
        The longest edge of the image will be resized to this value. The shortest edge will be resized to keep the
        input aspect ratio.
Returns:
    The output size of the image after resizing.
Nr+   )shaper3   r5   MAX_IMAGE_SIZE)r6   r8   r(   r)   s       r&   get_resize_output_image_sizer>      s>     KK$MF ;6RefMF?WefMF=r%   images_listztorch.Tensor|np.ndarrayc                     / nU  H*  nU H!  nUR                  UR                  SS 5        M#     M,     [        S U 5       5      n[        S U 5       5      nXE4$ )z@
Get the maximum height and width across all images in a batch.
r:   Nc              3   *   #    U  H	  oS    v   M     g7f)r   Nr   .0sizes     r&   	<genexpr>'get_max_height_width.<locals>.<genexpr>   s     5!W   c              3   *   #    U  H	  oS    v   M     g7fr/   Nr   rB   s     r&   rE   rF      s     4GrG   )appendr<   r1   )r?   image_sizesimagesr6   
max_height	max_widths         r&   get_max_height_widthrO      sa     KEu{{23/0   555J444I""r%   c                 `    U  H  nU(       d  M  US   R                   S   s  $    [        S5      e)zg
Get the number of channels across all images in a batch. Handle empty sublists like in [[], [image]].
r   zNo images found in the batch.)r<   
ValueErrorr?   rL   s     r&   get_num_channelsrS      s4     6!9??1%%  4
55r%   ztorch.devicec                 F    U  H  nU(       d  M  US   R                   s  $    g)zz
Get the device from the first non-empty element in a nested list of images.
Handle empty sublists like in [[], [image]].
r   N)devicerR   s     r&   get_device_from_imagesrV      s#    
 6!9### r%   c                    t  ^  \ rS rSr\R
                  r\r\	r
SS0rSS0rSrSrSrSrSrSrSr\rSS/rS	\\   4U 4S
 jjr\S\S	\\   S\4U 4S jj5       rS1S\S\S\4S jjr S2SSS\SSSS4U 4S jjjr  S2S\!RD                  S\#\$\4   SS4S jjr% S2S\!RD                  S\SS4S jjr&  S3S\!RD                  S\'\\4   S\S\(4S jjr)S\*\*S      S\(S\SSS \(S!\+S"\(S#\+\*\+   -  S-  S$\+\*\+   -  S-  S%\(S-  S&\(S-  S\#\$\4   S-  S'\(S-  S(\(S-  S)\$\,-  S-  S\4 S* jr-U 4S+ jr.S,\S-\S.\#4S/ jr/S0r0U =r1$ )4SmolVLMImageProcessor   longest_edgei  il  TFpixel_valuespixel_attention_maskkwargsc                 &   > [         TU ]  " S0 UD6  g )Nr   )super__init__)selfr]   	__class__s     r&   r`   SmolVLMImageProcessor.__init__   s    "6"r%   rL   r,   c                 &   > [         TU ]  " U40 UD6$ N)r_   
preprocess)ra   rL   r]   rb   s      r&   rf    SmolVLMImageProcessor.preprocess   s    w!&3F33r%   expected_ndimsc                 6    U R                  U5      n[        XS9$ )z3
Prepare a nested images structure for processing.
)rh   )fetch_imagesr   )ra   rL   rh   s      r&   _prepare_images_structure/SmolVLMImageProcessor._prepare_images_structure   s    
 ""6*)&PPr%   Nr6   r7   rD   resamplez7PILImageResampling | tvF.InterpolationMode | int | Nonec                   > UR                   (       a  [        XR                   S9nOFUR                  (       a*  UR                  (       a  UR                  UR                  4nO[	        S5      e[
        TU ]  " U[        US   US   S94SU0UD6$ )a  
Resize an image. The longest edge of the image is resized to size.longest_edge, with the shortest edge
resized to keep the input aspect ratio. Can also be used with size.height and size.width.
Args:
    image (`torch.Tensor`):
        Image to resize.
    size (`SizeDict`):
        Size of the output image.
    resample (`PILImageResampling | tvF.InterpolationMode | int | None`, *optional*):
        Resampling filter to use when resizing the image.
)r8   zJsize must be a dictionary with key 'longest_edge' or 'height' and 'width'.r   r/   r(   r)   rm   )rZ   r>   r(   r)   rQ   r_   resizer   )ra   r6   rD   rm   r]   new_sizerb   s         r&   rp   SmolVLMImageProcessor.resize   sv    $ 3EO`O`aH[[TZZTZZ0Hijjw~eXXa[QR%Tr_grkqrrr%   r   c                 "   UR                  5       u  pEpgSu  pUS   =p/ nXj:  d  X{:  a  [        R                  " Xj-  5      n[        R                  " X{-  5      nUR                  XU
S9R                  XUS9R	                  5       R                  XESX5      R                  SSSSS	5      nXnnU R                  U[        UUS
9US9n[        R                  " XR                  S5      4SS9nOSu  pUR                  S5      nU/U-  nU/U-  nXU4$ )a  
Split an image into squares of side max_image_size and the original image resized to max_image_size.
That means that a single image becomes a sequence of images.
This is a "trick" to spend more compute on each image with no changes in the vision encoder.
1) If one side of the original image is larger than `max_image_size`, resize it to `max_image_size` while preserving the aspect ratio.
2) Divide the resulting image into `ceil(height / max_image_size)` x `ceil(width / max_image_size)`
sub-images of the same size each (image_size, image_size). Typically, 364x364.
3) Returns the list of the crops and the original image, in addition to the number of splits for the height and the width.
Args:
    images (`torch.Tensor`):
        Images to split.
    max_image_size (`Dict[str, int]`):
        Maximum size of the output image. If the image is larger than this size, it will be split into
        patches of this size, and the original image will be concatenated with the patches, resized to max_size.
    resample (`PILImageResampling | tvF.InterpolationMode | int | None`, *optional*):
        Resampling filter to use when resizing the image.
)r.   r   rZ   )rD   stepr   r.   r/   r      ro   rm   )dim)r   r   )rD   mathceilunfold
contiguousviewpermuterp   r   torchcat	unsqueeze)ra   rL   r   rm   
batch_sizenum_channelsr(   r)   
height_dim	width_dimrM   rN   framesnum_splits_hnum_splits_wglobal_image_heightglobal_image_widths                    r&   split_images"SmolVLMImageProcessor.split_images   s2   . 39++-/
& $
!/!??
%"399V%89L99U%67L j
K		BjJJAq!Q'  7A!3[[(;CUVai ! F YY(8(8(;<!DF)-&L%%a(F$~
2$~
2\11r%   vision_encoder_max_sizec                    UR                  5       SS u  pEXT-  nXT:  aD  [        R                  " XR-  5      U-  n[        XV-  5      n[        R                  " XB-  5      U-  nOHXE:  aC  [        R                  " XB-  5      U-  n[        XF-  5      n[        R                  " XR-  5      U-  n[	        XES9nU R                  XUS9$ )a(  
Resize images to be multiples of `vision_encoder_max_size` while preserving the aspect ratio.
Args:
    image (`torch.Tensor`):
        Images to resize.
    vision_encoder_max_size (`int`):
        Maximum size of the output image. If the image is larger than this size, it will be split into
        patches of this size, and the original image will be concatenated with the patches, resized to max_size.
    resample (`PILImageResampling | tvF.InterpolationMode | int | None`, *optional*):
        Resampling filter to use when resizing the image.
r:   Nro   )rD   rm   )rD   ry   rz   r#   r   rp   )ra   r6   r   rm   r(   r)   r2   rq   s           r&   resize_for_vision_encoder/SmolVLMImageProcessor.resize_for_vision_encoder4  s    " 

RS)~?IIe=>AXXE-.FYYv?@CZZF^YYv?@CZZF-.EIIe=>AXXE67{{5({CCr%   padded_sizefillreturn_pixel_maskc                 ^   UR                   SS  nUS   US   -
  nUS   US   -
  nUS:  d  US:  a  [        SU SU S35      eXR:w  a  SSXv4n[        R                  " XUSS9nS n	U(       a?  [        R
                  " US	SS S 2S S 24   [        R                  S
9n	SU	S US   2S US   24'   X4$ )Nr:   r   r/   zzPadding dimensions are negative. Please make sure that the padded size is larger than the original size. Got padded size: z, original size: .constant)r   padding_mode.)dtype)r<   rQ   tvFpadr   
zeros_likeint64)
ra   r6   r   r   r   original_sizepadding_bottompadding_rightpadding
pixel_masks
             r&   r   SmolVLMImageProcessor.padS  s     BC($Q-*::#Aq)99A!233>-?PQ^P__`b  '!];GGGEJOE 
))%Q1*=U[[QJABJ)q))+=]1-=+==>  r%   	do_resize
do_rescalerescale_factordo_normalize
image_mean	image_stddo_padr   r   disable_groupingreturn_tensorsc           	         [        USUS9u  nn0 nUR                  5        H"  u  nnU(       a  U R                  UX4S9nUUU'   M$     [        UUSS9n[        USUS9u  nn0 nU(       a  0 n0 nUR                  5        H=  u  nnU R	                  UUS   US9nU R                  UXS9u  nnnUUU'   UUU'   UUU'   M?     [        UUSS9n[        UUSS9n[        UUSS9n[        U5       H(  u  nnU VV s/ s H  nU  H  n U PM     M     sn nUU'   M*     OUR                  5        H*  u  nnU R                  U[        US   US   S9US9nUUU'   M,     [        UUSS9nU Vs/ s H  nS	/[        U5      -  PM     nnU Vs/ s H  nS	/[        U5      -  PM     nn[        USUS9u  nn0 n!UR                  5        H  u  nnU R                  UXVXxU	5      nUU!U'   M!     [        U!USS9nU
(       a  [        S
 U 5       5      n"[        U5      u  n#n$[        U5      n%[        U5      n&[        R                  " [        U5      U"/U%U#U$4Q7SU&06n'[        R                  " [        U5      U"/U#U$4Q7SU&06n([        U5       H<  u  nn[        U5       H'  u  n)n U R!                  U U#U$45      u  U'UU)4'   U(UU)4'   M)     M>     U'nU
(       a  UW(S.n*OHUS:X  a>  S[        R"                  " U Vs/ s H  n[        R"                  " U5      PM     sn5      0n*OSU0n*[%        U*US9n+U(       a
  UU+S'   UU+S'   U+$ s  sn nf s  snf s  snf s  snf )z*
Process a batch of images for the model.
T)	is_nestedr   rw   )r   rZ   )r   rm   ro   )r6   rD   rm   r   c              3   8   #    U  H  n[        U5      v   M     g 7fre   )len)rC   images_s     r&   rE   4SmolVLMImageProcessor._preprocess.<locals>.<genexpr>  s      N=M'W=Ms   rU   )r[   r\   ptr[   )datatensor_typerowscols)r   itemsrp   r   r   r   	enumerater   r   rescale_and_normalizer1   rO   rS   rV   r   zerosr   stackr   ),ra   rL   r   rD   rm   r   r   r   r   r   r   r   r   r   r   r   r]   grouped_imagesgrouped_images_indexresized_images_groupedr<   stacked_imagesresized_imagessplit_images_groupedrows_groupedcols_groupedr   r   processed_imagesigroup_imagessublistr6   processed_images_groupedmax_num_imagesrM   rN   r   rU   processed_images_paddedpixel_attention_masksjr   encodings,                                               r&   _preprocess!SmolVLMImageProcessor._preprocessq  sC   . 0Ed5E0
,, "$%3%9%9%;!E>!%^T!U,:"5) &< ((>@T`de/Dd=M0
,,  "LL)7)=)=)?%~!%!?!?"N>$BX "@ " .2->->"> .? .*d /=$U+&*U#&*U# *@  ..BDXdhi!,0DPTUD!,0DPTUD#,-=#><<H&^LV]UuV]uL&^ # $? *8)=)=)?%~!%(!)G~^lOmn% "- "
 /=$U+ *@  ..BDXdhi4DE4D&QC#f+%4DDE4DE4D&QC#f+%4DDE 0E?O0
,, $& %3%9%9%;!E>!77
LV_N /=$U+ &< **BDXdhi  N=M NNN$89I$J!J	+,<=L+,<=F&+kk$%' 
I6' 	'# %*KK$%% i(% 	%! ''78	6 )& 1HAuQUQYQY
I6RN+AqD13HA3N !2 9
  7$4NcdDt#"EKKSc0dScV1DSc0d$efD"$45DT~F#HV#HVC '_ FEV 1es   5L5$L;M + M
c                 l   > [         TU ]  5       nUR                  SS 5        UR                  SS 5        U$ )N_valid_processor_keysr   )r_   to_dictpop)ra   encoder_dictrb   s     r&   r   SmolVLMImageProcessor.to_dict  s7    w(0$7.5r%   r(   r)   images_kwargsc                    UR                  SU R                  5      nUR                  SU R                  5      nUR                  SU R                  5      nS=n=pU(       Ga  [	        XUS   S9u  p[        X[        S9u  pX!-  n
X!:  aP  [        R                  " X%S   -  5      US   -  n[        X*-  5      n[        R                  " XS   -  5      US   -  nOTX:  aO  [        R                  " XS   -  5      US   -  n[        X-  5      n[        R                  " X%S   -  5      US   -  nUS   =pWU:  d  WU:  a8  [        R                  " X-  5      n[        R                  " WU-  5      n	X-  S-   nXxU	4$ )aG  
A utility that returns number of image patches for a given image size.

Args:
    height (`int`):
        Height of the input image.
    width (`int`):
        Width of the input image.
    images_kwargs (`dict`)
        Any kwargs to override defaults of the image processor.
Returns:
    `int`: Number of patches per image.
r   r   rD   r   rZ   r;   r/   )
getr   r   rD   r3   r5   r=   ry   rz   r#   )ra   r(   r)   r   r   r   rD   num_patchesnum_rowsnum_colsr2   resized_widthresized_heightrM   rN   s                  r&   get_number_of_image_patches1SmolVLMImageProcessor.get_number_of_image_patches  s    +../CTE\E\]&**+;T=P=PQ  3,---hB6Z^_mZnoMFG_mnMF >L $		%2P*P QTbcqTr r!$U%9!:!%6>4R+R!SVdesVt!t!%6>4R+R!SVdesVt!t #F$9 : $		%2P*P QTbcqTr r%3N%CCJ
*mi.G99^%@A99]Y%>?&1A5h..r%   r   )r   re   )r   T)2r   r   r   r   r   LANCZOSrm   r	   r   r
   r   rD   r   r   r   r   do_convert_rgbr   r   r   r   valid_kwargsmodel_input_namesr   r`   r   r   r   rf   r#   rk   r   rp   r   Tensorr!   r"   r   r   tupler   r   listfloatr   r   r   r   r$   __classcell__)rb   s   @r&   rX   rX      s   !))H'J%IG$D$c*NIJLNF.L')?@#(C!D # 4 4v>Y7Z 4_k 4 4Q
 QC QXb Q OS	ss s L	s 
s s> OS	9292 S#X92 L	92~ OS	D||D "%D L	DF "&!||! 38_! 	!
  !<xT.)*x x 	x
 Lx x x x DK'$.x 4;&-x tx !4Kx S#X-x "D[x +x  j(4/!x$ 
%xt(/# (/c (/RV (/ (/r%   rX   rI   re   )*ry   numpynpr   torchvision.transforms.v2r   r   image_processing_backendsr   image_processing_utilsr   image_transformsr   r   image_utilsr	   r
   r   r   r   r   processing_utilsr   r   utilsr   r   r   r=   r#   r   r3   r!   r"   r5   r>   r   rO   rS   rV   rX   __all__r   r%   r&   <module>r      s  .    7 ; 2 E  5 /,e    MQ"""&)Dj"?BTz"
38_"L ?C&*38nt&;
38_> 38_0#d40I+J&K #PUVY[^V^P_ #6$t,E'F"G 6C 6$T.-A(B $~ $ Z/. Z/ Z/z
 #
#r%   