
    Z jYB                        S r SSKrSSKrSSKJr  SSKrSSKJs  Js  J	r
  SSKJr  SSKJrJrJr  SSKJr  SSKJrJr  SS	KJrJr  SS
KJrJrJr  SSKJrJr  SSK J!r!J"r"  SSK#J$r$  Sr% " S S\SS9r&         S"S\'S\(S\'S\'S\(S\(S\(S\(S\)S-  S\'S-  S\R                  4S jjr*\RV                  RX                  S 5       r-\" " S  S!\5      5       r.S!/r/g)#z%Image processor class for Pix2Struct.    N)Union)hf_hub_download)Image	ImageDraw	ImageFont   )TorchvisionBackend)BatchFeatureget_size_dict)group_images_by_shapereorder_images)ChannelDimension
ImageInputSizeDict)ImagesKwargsUnpack)
TensorTypeauto_docstring)requires_backendszybelkada/fontsc                   ^    \ rS rSr% Sr\\S'   \\\4   \S'   \	\S'   \
\   \-  S-  \S'   Srg)	Pix2StructImageProcessorKwargs%   a]  
max_patches (`int`, *optional*):
    Maximum number of patches to extract.
patch_size (`dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
    The patch size to use for the image. According to Pix2Struct paper and code, the patch size is 16x16.
is_vqa (`bool`, *optional*, defaults to `False`):
    Whether or not the image processor is for the VQA task. If `True` and `header_text` is passed in, text is
    rendered onto the input images.
header_text (`Union[list[str], str]`, *optional*):
    Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`.
max_patches
patch_sizeis_vqaNheader_text )__name__
__module____qualname____firstlineno____doc__int__annotations__dictstrboollist__static_attributes__r       ڋ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/pix2struct/image_processing_pix2struct.pyr   r   %   s5    
 S#XLcS4''r*   r   F)totaltext	text_size
text_colorbackground_colorleft_paddingright_paddingtop_paddingbottom_padding
font_bytes	font_pathreturnc
                 H   [        [        S5        [        R                  " SS9n
U
R	                  U S9nSR                  U5      nUb  U	c  [        R                  " U5      nOU	b  U	nO[        [        S5      n[        R                  " USUS9n[        R                  " S	S
5      n[        R                  " U5      nUR!                  SXS9u    nnnUU-   U-   nUU-   U-   n[        R                  " S	UU4U5      n[        R                  " U5      nUR#                  XF4XUS9  U$ )aC  
Render text. This script is entirely adapted from the original script that can be found here:
https://github.com/google-research/pix2struct/blob/main/pix2struct/preprocessing/preprocessing_utils.py

Args:
    text (`str`, *optional*, defaults to ):
        Text to render.
    text_size (`int`, *optional*, defaults to 36):
        Size of the text.
    text_color (`str`, *optional*, defaults to `"black"`):
        Color of the text.
    background_color (`str`, *optional*, defaults to `"white"`):
        Color of the background.
    left_padding (`int`, *optional*, defaults to 5):
        Padding on the left.
    right_padding (`int`, *optional*, defaults to 5):
        Padding on the right.
    top_padding (`int`, *optional*, defaults to 5):
        Padding on the top.
    bottom_padding (`int`, *optional*, defaults to 5):
        Padding on the bottom.
    font_bytes (`bytes`, *optional*):
        Bytes of the font to use. If `None`, the default font will be used.
    font_path (`str`, *optional*):
        Path to the font to use. If `None`, the default font will be used.
visionP   )width)r-   
z	Arial.TTFzUTF-8)encodingsizeRGB)   r@   r   r   )font)fillrB   )r   render_texttextwrapTextWrapperwrapjoinioBytesIOr   DEFAULT_FONT_PATHr   truetyper   newr   Drawtextbboxr-   )r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   wrapperlineswrapped_textrB   temp_img	temp_draw_wh
text_widthtext_heightimgdraws                          r+   rD   rD   9   s    L k8, "",GLLdL#E99U#L)"3zz*%		0+>dW9ED yy'Hx(I##FL#DJAq!Q\!M1Jk/N2K ))EJ46F
GC>>#DII|)<tITJr*   c                     U R                   u  p4pV[        R                  R                  R	                  XU4X4S9nUR                  X4XS5      nUR                  SSSSS5      R                  X5U-  Xb-  XA-  U-  5      nU$ )a_  
Extract patches from image tensor. Returns tensor of shape (batch, rows, columns, patch_height*patch_width*channels).

Args:
    image_tensor (`torch.Tensor`):
        Image tensor of shape (batch, channels, height, width).
    patch_height (`int`):
        Height of patches to extract.
    patch_width (`int`):
        Width of patches to extract.
)strider         r   r@   )shapetorchnn
functionalunfoldreshapepermute)image_tensorpatch_heightpatch_width
batch_sizechannelsheightr;   patchess           r+   torch_extract_patchesro      s     +7*<*<'J&hh!!((k7R\h[v(wGoojLrRGooaAq!,44l*E,@(BY\gBgG Nr*   c                     ^  \ rS rSrSrSrSrSSS.rSrSr	\
rSS	/r S%S
\\\4   \-  S-  S\4U 4S jjjrS r  S&SSS\S\S-  S\S-  SS4
S jjrS'S jrSSS\S
\SS4S jr\ S%S\S\\\   -  S-  S\\
   S\4U 4S jjj5       rSS\R<                  S4S\S\\\   -  S-  S\S\S\ \S4   S-  S\\
   S\4S jjr!S\S   S \S\S
\S!\\"-  S-  S"\S\4S# jr#S$r$U =r%$ )(Pix2StructImageProcessor   NT   rm   r;   i   Fflattened_patchesattention_maskr   r7   c           	         > [         TU ]  " S0 UD6nUb-  [        U[        5      (       d  [        S0 [	        USS9D6US'   U$ XS'   U$ )zS
Process custom Pix2Struct kwargs, specifically converting patch_size to SizeDict.
r   )r>   
param_namer   )super_standardize_kwargs
isinstancer   r   )selfr   kwargs	__class__s      r+   rz   ,Pix2StructImageProcessor._standardize_kwargs   sY     ,6v6!*Z*J*J#+#fmXd.e#fF<   $.< r*   c                     g)zC
Skip standard validation as Pix2Struct uses custom preprocessing.
Nr   )r|   r}   s     r+   _validate_preprocess_kwargs4Pix2StructImageProcessor._validate_preprocess_kwargs   s     	r*   imagetorch.Tensorheaderr5   r6   c                 p   UR                   nUR                  nUR                  [        R                  :X  a  [        R
                  " U5      nOHUS-  R                  SS5      R                  [        R                  5      n[        R
                  " U5      n[        X#US9n	[        U	R                  UR                  5      n
[        UR                  XR                  -  -  5      n[        U	R                  XR                  -  -  5      n[        R                  " SXU-   4S5      nUR                  U	R!                  X45      S5        UR                  UR!                  X45      SU45        [        R"                  " U5      R                  U5      nU[        R                  :w  a  UR%                  5       S-  nU$ )a  
Render header text on image using torch tensors.

Args:
    image (`torch.Tensor`):
        Image tensor in channel-first format (C, H, W).
    header (`str`):
        Header text to render.
    font_bytes (`bytes`, *optional*):
        Font bytes to use for rendering.
    font_path (`str`, *optional*):
        Path to font file to use for rendering.

Returns:
    `torch.Tensor`: Image with header in channel-first format (C, H, W).
   r   r5   r6   r?   whiterA   g     o@)devicedtyperb   uint8tvFto_pil_imageclamptorD   maxr;   r#   rm   r   rM   pasteresizepil_to_tensorfloat)r|   r   r   r5   r6   r   r   	image_pilimage_uint8header_image	new_width
new_heightnew_header_height	new_imageresults                  r+   render_header&Pix2StructImageProcessor.render_header   sl   .  ;;%++%((/I !3;--a588EK((5I #6IV **IOO<	))Y-HIJ
 3 3yCUCU7U VW IIei>O1O%PRYZ	++Y,JKVT	(())@AAGXCYZ ""9-008 EKK\\^e+Fr*   imagesc                 $   UR                  SSS9nUR                  SSS9nUR                  S   UR                  S   -  UR                  S   -  nSUS-  -  n[        R                  " U[        R
                  " XSR                  S	95      nX-
  U-  $ )
z
Normalize batched images using per-image mean and standard deviation.

Args:
    images (`torch.Tensor`):
        Batched float image tensor of shape (B, C, H, W).

Returns:
    `torch.Tensor`: Normalized images of shape (B, C, H, W).
)r@   r`   r   T)dimkeepdimr@   r`   r   g      ?      ?r   )meanstdra   rb   maximumtensorr   )r|   r   r   r   num_elements_per_imagemin_stdadjusted_stddevs          r+   	normalize"Pix2StructImageProcessor.normalize   s     {{y${7jjYj5!'a6<<?!BV\\RS_!T.33--U\\'**-UV00r*   r   c           	         UR                   UR                  pTUR                  u  pgpX$U-  -  XY-  -  S-  n
[        [	        [        X-  U-  5      U5      S5      n[        [	        [        X-  U-  5      U5      S5      n[        X-  S5      n[        X-  S5      n[        XS9nU R                  X[        R                  R                  SS9n[        XU5      nUR                  u  nnnnUR                  UUU-  U5      n[        R                  " UUR                  S9R                  US5      R!                  SU5      R                  SUU-  S5      n[        R                  " UUR                  S9R                  SU5      R!                  US5      R                  SUU-  S5      nUR#                  USS5      nUR#                  USS5      nUS-   R%                  5       nUS-   R%                  5       n[        R&                  " UUU/SS9n[        R(                  R*                  R-                  US	S	S	UUU-  -
  /5      R%                  5       nU$ )
a  
Extract flattened patches from a batch of images.

Args:
    images (`torch.Tensor`):
        Batched images tensor of shape (batch, channels, height, width).
    max_patches (`int`):
        Maximum number of patches to extract.
    patch_size (`SizeDict`):
        Dictionary containing patch height and width.

Returns:
    `torch.Tensor`: Batched flattened patches with row/column IDs of shape (batch, max_patches, patch_dim).
r   r@   rt   T)r   r>   resample	antialiasr   r^   r   r   )rm   r;   ra   r   minr#   r   r   r   InterpolationModeBILINEARro   rf   rb   aranger   repeatexpandr   catrc   rd   pad)r|   r   r   r   ri   rj   rk   rl   image_heightimage_widthscalenum_feasible_rowsnum_feasible_colsresized_heightresized_widthresize_sizern   rowscolumnsdepthrow_idscol_idsr   s                          r+   extract_flattened_patches2Pix2StructImageProcessor.extract_flattened_patches  s=   ( %/$5$5z7G7Gk:@,,7
l | ;<@YZ_bbC(<|(K$Lk Z\]^C(;k(I$JK XZ[\.=qA-;Q? nJ6cF[F[FdFdptu (kJ+2==(
D'5 //*dWneD LLfmm4<<T1ELLQPWX``abdhkrdrtuv 	 LL7WQ VD!_WQw*	 	 ..R4..R4 Q;%%'Q;%%' GWg6B? $$((!Q;$QX.;Y1Z[aacr*   r   r}   c                 *   > [         TU ]  " U4SU0UD6$ )z
header_text (`Union[str, list[str]]`, *optional*):
    Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`.
r   )ry   
preprocess)r|   r   r   r}   r~   s       r+   r   #Pix2StructImageProcessor.preprocessK  s     w!&LkLVLLr*   do_convert_rgbinput_data_formatr   ztorch.devicec                    U R                  UUUUS9nUR                  SU R                  5      nU(       a  Uc  [        S5      eUR	                  SS5      nUR	                  SS5      n	[        U[        5      (       a  U/[        U5      -  n[        U5       V
Vs/ s H  u  pU R                  XU
   XS9PM     nn
nU R                  " U40 UD6$ s  snn
f )z#
Preprocess images for Pix2Struct.
)r   r   r   r   r   Nz.A header text must be provided for VQA models.r5   r6   r   )_prepare_image_like_inputsgetr   
ValueErrorpopr{   r&   len	enumerater   _preprocess)r|   r   r   r   r   r   r}   r   r5   r6   ir   s               r+   _preprocess_image_like_inputs6Pix2StructImageProcessor._preprocess_image_like_inputsX  s     00)/	 1 
 Hdkk2" !QRRL$7J

;5I+s++*mc&k9
 !*& 1 1HA ""5a.Z"e 1  
 1&11s   Cdo_normalizereturn_tensorsdisable_groupingc                    [        XS9u  p0 n
0 nUR                  5        H  u  pUR                  [        R                  :X  a  UR                  5       nU(       a  U R                  U5      nU R                  XUS9nUR                  SS9S:g  R                  5       nXU'   XU'   M     [        X5      n[        X5      nU(       a*  [        R                  " USS9n[        R                  " USS9n[        UUS.US9$ )z1
Preprocess images to extract flattened patches.
)r   )r   r   r   r^   r   r   )ru   rv   )datatensor_type)r   itemsr   rb   r   r   r   r   sumr   stackr
   )r|   r   r   r   r   r   r   r}   grouped_imagesgrouped_images_indexflattened_patches_groupedattention_masks_groupedra   stacked_imagesrn   masksru   attention_maskss                     r+   r   $Pix2StructImageProcessor._preprocess  s	    0EV/o,$&!"$%3%9%9%;!E##u{{2!/!5!5!7 !%!?44%: 5 G [[R[(A-446E/6e,-2E* &<" ++D[()@W  %,=1 E#kk/qAO'8O\&
 	
r*   r   )N)NN)r   r   r7   r   )&r   r   r    r!   rescale_factorr   r   r   r   r   r   valid_kwargsmodel_input_namesr%   r&   r#   r   rz   r   bytesr   r   r   r   r   r(   r   r
   r   r   FIRSTr'   r   r   r   r   r)   __classcell__)r~   s   @r+   rq   rq      s    NLN,JKF1L,.>? 8<cNX-4 
	   $( $66 6 DL	6
 :6 
6p1*CC C 	C
 
CJ  /3
M
M 49_t+
M 78	
M
 

M 
M /3#.>.D.D48&2&2 49_t+&2 	&2
 ,&2 c>)*T1&2 78&2 
&2P/
^$/
 /
 	/

 /
 j(4//
 /
 
/
 /
r*   rq   )	$   blackr      r   r   r   NN)0r"   rI   rE   typingr   rb   $torchvision.transforms.v2.functional
transformsv2rd   r   huggingface_hubr   PILr   r   r   image_processing_backendsr	   image_processing_utilsr
   r   image_transformsr   r   image_utilsr   r   r   processing_utilsr   r   utilsr   r   utils.import_utilsr   rK   r   r&   r#   r   rD   compilerdisablero   rq   __all__r   r*   r+   <module>r     s@   , 	    2 2 + + + ; A E A A 4 / 3 % (\ (, ## C
CC C 	C
 C C C C C TzC [[CN  * X
1 X
 X
v &
&r*   