
    Z jV                     t   S r SSKrSSKrSSKJrJr  SSKrSSKrSSK	J
r  SSKJr  SSKJr  SSKJrJr  SS	KJrJrJrJrJrJr  SS
KJrJr  SSKJrJrJ r J!r!J"r"  \ " 5       (       a  SSK#J$r$  SSK%J&r&J'r'  \(       a  SSK(J)r)  \!RT                  " \+5      r, " S S\SS9r-  S/S\.\/-  \R`                  -  S\1S\1S\2S\24
S jjr3S\2S\R`                  S\R`                  S\R`                  4S jr4S r5S \R`                  S!\.\R`                  \R`                  4   4S" jr6S0S#\R`                  S$\R`                  S%\1S!\R`                  4S& jjr7S#\R`                  S'\R`                  S(\R`                  S)\R`                  S!\R`                  4
S* jr8S+\R`                  S!\R`                  4S, jr9\ " S- S.\5      5       r:S./r;g)1z"Image processor class for VitPose.    N)TYPE_CHECKINGUnion)
functional   )TorchvisionBackend)BatchFeature)group_images_by_shapereorder_images)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STDChannelDimension
ImageInputPILImageResamplingSizeDict)ImagesKwargsUnpack)
TensorTypeauto_docstringis_scipy_availableloggingrequires_backends)inv)affine_transformgaussian_filter   )VitPoseEstimatorOutputc                   :    \ rS rSr% Sr\S-  \S'   \S-  \S'   Srg)VitPoseImageProcessorKwargs7   a5  
do_affine_transform (`bool`, *optional*):
    Whether to apply an affine transformation to the input images based on the bounding boxes.
normalize_factor (`float`, *optional*, defaults to `200.0`):
    Width and height scale factor used for normalization when computing center and scale from bounding boxes.
Ndo_affine_transformnormalize_factor )	__name__
__module____qualname____firstlineno____doc__bool__annotations__float__static_attributes__r"       څ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/vitpose/image_processing_vitpose.pyr   r   7   s     $dl"r,   r   F)totalboximage_widthimage_heightr!   padding_factorc                    U SS u  pVpxX-  n	[         R                  " XWS-  -   XhS-  -   /[         R                  S9n
XyU-  :  a	  US-  U	-  nOXyU-  :  a  X-  n[         R                  " Xs-  X-  /[         R                  S9nX-  nX4$ )aa  
Encodes a bounding box in COCO format into (center, scale).

Args:
    box (`Tuple`, `List`, or `np.ndarray`):
        Bounding box in COCO format (top_left_x, top_left_y, width, height).
    image_width (`int`):
        Image width.
    image_height (`int`):
        Image height.
    normalize_factor (`float`):
        Width and height scale factor.
    padding_factor (`float`):
        Bounding box padding factor.

Returns:
    tuple: A tuple containing center and scale.

    - `np.ndarray` [float32](2,): Center of the bbox (x, y).
    - `np.ndarray` [float32](2,): Scale of the bbox width & height.
N         ?dtype      ?)nparrayfloat32)r/   r0   r1   r!   r2   
top_left_x
top_left_ywidthheightaspect_ratiocenterscales               r-   box_to_center_and_scalerC   D   s    : -0G)JE-LXXzCK/sl1JKSUS]S]^Ff$$|+	&	&%HHe.0IJRTR\R\]E"E=r,   theta
size_inputsize_dstsize_targetc                    [         R                  " U 5      n [         R                  " S[         R                  S9nUS   US   -  nUS   US   -  n[        R
                  " U 5      U-  US'   [        R                  " U 5      * U-  US'   USUS   -  [        R
                  " U 5      -  SUS   -  [        R                  " U 5      -  -   SUS   -  -   -  US	'   [        R                  " U 5      U-  US
'   [        R
                  " U 5      U-  US'   USUS   -  [        R                  " U 5      -  SUS   -  [        R
                  " U 5      -  -
  SUS   -  -   -  US'   U$ )a  
Calculate the transformation matrix under the constraint of unbiased. Paper ref: Huang et al. The Devil is in the
Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020).

Source: https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py

Args:
    theta (`float`):
        Rotation angle in degrees.
    size_input (`np.ndarray`):
        Size of input image [width, height].
    size_dst (`np.ndarray`):
        Size of output image [width, height].
    size_target (`np.ndarray`):
        Size of ROI in input plane [w, h].

Returns:
    `np.ndarray`: A matrix for transformation.
)   r   r6   r   r   r   r   r   r   g      r5   r   rI   r   r   r   r   r   rI   )r9   deg2radzerosr;   mathcossin)rD   rE   rF   rG   matrixscale_xscale_ys          r-   get_warp_matrixrX   p   sX   ( JJuEXXfBJJ/FqkKN*GqkKN*G88E?W,F4LHHUO#g-F4Lz!}txx.z!}1DtxxPU1VVY\_jkl_mYmmF4L 88E?W,F4L88E?W,F4Lz!}txx.z!}1DtxxPU1VVY\_jkl_mYmmF4L Mr,   c           
         [        U R                  S   5       Vs/ s H	  o0SU4   PM     nn[        R                  " U/ SQ/5      n[	        U5      nUS   US   US   US   US   US	   4u  US'   US'   US'   US'   US	'   US'   U Vs/ s H  n[        XvUS
S9PM     nn[        R                  " USS9nU$ s  snf s  snf )aO  
This function implements cv2.warpAffine function using affine_transform in scipy. See https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.affine_transform.html and https://docs.opencv.org/4.x/d4/d61/tutorial_warp_affine.html for more details.

Note: the original implementation of cv2.warpAffine uses cv2.INTER_LINEAR.
.)r   r   r   rN   rM   rK   rJ   rO   rL   r   )output_shapeorderaxis)rangeshaper9   vstackr   r   stack)	srcMsizeichannelsM_scipyM_invchannelnew_srcs	            r-   scipy_warp_affinerl      s     &+399R=%9:%9CF%9H: iiI'GLEddddddTPE$KteDk5;dU4[ \dd[cPWTK[cGdhhwR(GN# ; es   B7	B<heatmapsreturnc                    [        U [        R                  5      (       d  [        S5      eU R                  S:w  a  [        S5      eU R                  u  pp4U R                  XS45      n[        R                  " US5      R                  XS45      n[        R                  " US5      R                  XS45      n[        R                  " US5      R                  [        R                  5      nUSS2SS2S	4   U-  USS2SS2S	4'   USS2SS2S4   U-  USS2SS2S4'   [        R                  " [        R                  " US5      S
:  US5      nX4$ )a  Get keypoint predictions from score maps.

Args:
    heatmaps (`np.ndarray` of shape `(batch_size, num_keypoints, height, width)`):
        Model predicted heatmaps.

Returns:
    tuple: A tuple containing aggregated results.

    - coords (`np.ndarray` of shape `(batch_size, num_keypoints, 2)`):
        Predicted keypoint location.
    - scores (`np.ndarray` of shape `(batch_size, num_keypoints, 1)`):
        Scores (confidence) of the keypoints.
zHeatmaps should be np.ndarrayr4   z Heatmaps should be 4-dimensionalrZ   rI   r   )r   r   rI   Nr   g        )
isinstancer9   ndarray	TypeErrorndim
ValueErrorr`   reshapeargmaxamaxtileastyper;   where)	rm   
batch_sizenum_keypoints_r>   heatmaps_reshapedidxscorespredss	            r-   get_keypoint_predictionsr      s$    h

++788}};<<*2..'Jq ((*R)HI
))%q
)
1
1:a2P
QCWW&*22Jq3QRFGGC#**2::6E1a7^e+E!Q'N1a7^u,E!Q'NHHRWWVY/#5ubAE=r,   coordsbatch_heatmapskernelc                 v   UR                   u  p4pVU R                   S   nUS:X  d  X7:X  d  [        S5      e[        US-
  S-  5      n[        R                  " U V	V
s/ s H!  n	U	 V
s/ s H  n
[        U
SX4SS9PM     sn
PM#     sn
n	5      n[        R                  " USS	5      n[        R                  " U5      n[        R                  " US
SS9R                  5       nU S   S-   U S   S-   US-   -  -   nXS-   US-   -  [        R                  " SX4-  5      R                  SU5      -  -  nUR                  [        5      R                  SS5      nX   nXS-      nXU-   S-      nXU-   S-      nXU-
  S-
     nXS-
     nXS-
  U-
     nSUU-
  -  nSUU-
  -  n[        R                  " UU/SS9nUR                  XtSS5      nUSU-  -
  U-   nUSU-  -
  U-   nSUU-
  U-
  U-   U-   U-
  U-
  U-   -  n[        R                  " UUUU/SS9nUR                  XtSS5      n[        R                  R                  U[        R                   " [        R"                  5      R$                  [        R&                  " S5      -  -   5      nU [        R(                  " SUU5      R+                  5       -  n U $ s  sn
f s  sn
n	f )a  DARK post-pocessing. Implemented by unbiased_data_processing.

Paper references:
- Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020).
- Zhang et al. Distribution-Aware Coordinate Representation for Human Pose Estimation (CVPR 2020).

Args:
    coords (`np.ndarray` of shape `(num_persons, num_keypoints, 2)`):
        Initial coordinates of human pose.
    batch_heatmaps (`np.ndarray` of shape `(batch_size, num_keypoints, height, width)`):
        Batched heatmaps as predicted by the model.
        A batch_size of 1 is used for the bottom up paradigm where all persons share the same heatmap.
        A batch_size of `num_persons` is used for the top down paradigm where each person has its own heatmaps.
    kernel (`int`, *optional*, defaults to 3):
        Gaussian kernel size (K) for modulation.

Returns:
    `np.ndarray` of shape `(num_persons, num_keypoints, 2)` ):
        Refined coordinates.
r   r   zQThe batch size of heatmaps should be 1 or equal to the batch size of coordinates.rI   g?rK   )sigmaradiusaxesgMbP?2   )rJ   rJ   rN   rN   edge)mode).r   ).r   rZ   r   r5   r]   zijmn,ijnk->ijmk)r`   rt   intr9   r:   r   cliplogpadflattenarangeru   ry   concatenatelinalgr   finfor;   epseyeeinsumsqueeze)r   r   r   r{   r|   r?   r>   
num_coordsr   rm   heatmapbatch_heatmaps_padindexi_ix1iy1ix1y1ix1_y1_ix1_iy1_dxdy
derivativedxxdyydxyhessians                              r-   "post_dark_unbiased_data_processingr      s   * 0>/C/C,JvaJ!Oz7lmm&1*"#FXX +	
* goofn[b_WC8HvVfno*	
N WW^UB7NVVN+N0PW]^ffh6NQ&.1"4!CCE	aiFQJ'"))Az7Q*R*Z*Z[]_l*mmmELL%%b!,E		"B
QY
'C
U]Q.
/Cu}q01E !23Gai(Dai%/0D	d
	B	d
	BRq1J##Jq!DJ
B,
C
B,
C
s"R'",t3d:WD
ECnnc3S1:GoojA>GiimmGbhhrzz&:&>&>&JJKG
bii)7J?GGIIFM; p	
s   
J5
%J0<J5
0J5
rA   rB   output_sizec                    U R                   S   S;  a  [        S5      e[        U5      S:w  a  [        S5      e[        U5      S:w  a  [        S5      e[        U5      S:w  a  [        S5      eUS-  nUS   US	   S
-
  -  nUS	   US   S
-
  -  n[        R                  " U 5      nU SS2S	4   U-  US	   -   US	   S-  -
  USS2S	4'   U SS2S4   U-  US   -   US   S-  -
  USS2S4'   U$ )a'  Get final keypoint predictions from heatmaps and apply scaling and
translation to map them back to the image.

Note:
    num_keypoints: K

Args:
    coords (`np.ndarray` of shape `(num_keypoints, ndims)`):

        * If ndims=2, corrds are predicted keypoint location.
        * If ndims=4, corrds are composed of (x, y, scores, tags)
        * If ndims=5, corrds are composed of (x, y, scores, tags,
          flipped_tags)

    center (`np.ndarray` of shape `(2,)`):
        Center of the bounding box (x, y).
    scale (`np.ndarray` of shape `(2,)`):
        Scale of the bounding box wrt original image of width and height.
    output_size (`np.ndarray` of shape `(2,)`):
        Size of the destination heatmaps in (height, width) format.

Returns:
    np.ndarray: Predicted coordinates in the images.
r   )rI   r4      z5Coordinates need to have either 2, 4 or 5 dimensions.rI   z9Center needs to have 2 elements, one for x and one for y.z,Scale needs to consist of a width and heightz2Output size needs to consist of a height and width      i@r   r8   Nr5   )r`   rt   lenr9   	ones_like)r   rA   rB   r   rW   rV   target_coordss          r-   transform_predsr     s   2 ||Ai'PQQ
6{aTUU
5zQGHH
;1MNN EME Ah+a.3./GAh+a.3./GLL(M A,06!9<uQx#~MM!Q$ A,06!9<uQx#~MM!Q$r,   bboxesc                 z    U SS2S4   U SS2S4   -   S-
  U SS2S4'   U SS2S4   U SS2S4   -   S-
  U SS2S4'   U $ )a  
Converts bounding boxes from the COCO format to the Pascal VOC format.

In other words, converts from (top_left_x, top_left_y, width, height) format
to (top_left_x, top_left_y, bottom_right_x, bottom_right_y).

Args:
    bboxes (`np.ndarray` of shape `(batch_size, 4)):
        Bounding boxes in COCO format.

Returns:
    `np.ndarray` of shape `(batch_size, 4) in Pascal VOC format.
NrI   r   r   r   r"   )r   s    r-   coco_to_pascal_vocr   <  sW     !Q$<&A,.2F1a4L!Q$<&A,.2F1a4LMr,   c            '         ^  \ rS rSrSr\rS/r\r	\
rSSS.rSrSrSrSrS	\\   4U 4S
 jjr\S\S\\\\         \R0                  -  S	\\   S\4U 4S jj5       r S7S\S\\\\         \R0                  -  S-  S\S\S\\S4   S-  S\4S jjr\ RB                  RD                  SSS\#\   S\#\   S\S\$SS4S j5       r%   S8S\S   S\S\$SSS\S \$S!\S"\S#\S$\\\   -  S-  S%\\\   -  S-  S&\S-  S'\$S-  S(\S-  S)\\&-  S-  S*\S+\S\\R0                  -  S-  S\4&S, jjr' S9S-\R0                  S\R0                  S\R0                  S.\(4S/ jjr)   S:S0S1S\\\\         \R0                  -  S2\(S3\S-  S4\&\\#   -  S-  4
S5 jjr*S6r+U =r,$ );VitPoseImageProcessoriP  z6Torchvision backend for VitPose with affine transform.pixel_values      )r?   r>   Tr   kwargsc                 &   > [         TU ]  " S0 UD6  g )Nr"   )super__init__)selfr   	__class__s     r-   r   VitPoseImageProcessor.__init___  s    "6"r,   imagesboxesrn   c                 &   > [         TU ]  " X40 UD6$ )z
boxes (`list[list[list[float]]]` or `np.ndarray`):
    List or array of bounding boxes for each image. Each box should be a list of 4 floats representing the
    bounding box coordinates in COCO format (top_left_x, top_left_y, width, height).
)r   
preprocess)r   r   r   r   r   s       r-   r    VitPoseImageProcessor.preprocessb  s     w!&:6::r,   Ndo_convert_rgbinput_data_formatdeviceztorch.devicec                 P    U R                  XXES9nX&S'   U R                  " U40 UD6$ )z"Handle extra inputs beyond images.)r   r   r   r   r   )_prepare_image_like_inputs_preprocess)r   r   r   r   r   r   r   s          r-   _preprocess_image_like_inputs3VitPoseImageProcessor._preprocess_image_like_inputsp  s>     00L] 1 
  w1&11r,   imageztorch.TensorrA   rB   rotationre   c                    [        XBS-  [        R                  " UR                  UR                  45      S-
  US-  5      nUR                  SSS5      R                  5       R                  5       n[        XvUR                  UR                  4S9n[        R                  " U5      R                  SSS5      R                  UR                  5      n	U	$ )z7Apply an affine transformation to a torch tensor image.g       @r8   r   r   rI   r   )rc   rd   re   )rX   r9   r:   r>   r?   permutecpunumpyrl   torch
from_numpytor   )
r   r   rA   rB   r   re   transformationimage_nptransformed_nptransformeds
             r-   r   &VitPoseImageProcessor.affine_transform  s     )slBHHdjj$++-F$G#$MuW\}
 ==Aq)--/557*xQUQ\Q\^b^h^hPij&&~6>>q!QGJJ5<<Xr,   	do_resizeresamplez7PILImageResampling | tvF.InterpolationMode | int | Nonedo_center_crop	crop_size
do_rescalerescale_factordo_normalize
image_mean	image_stddo_padpad_sizedisable_groupingreturn_tensorsr    r!   c           
         Ubq  U(       aj  / n[        UU5       HV  u  nnU HJ  n[        UUR                  UR                  US9u  nnU R	                  UUUSUS9nUR                  U5        ML     MX     Un[        XS9u  nn0 nUR                  5        H  u  nnU R                  UXxXU5      nUUU'   M!     [        UU5      n [        SU 0US9$ )z!Custom preprocessing for VitPose.)r0   r1   r!   r   )r   re   )r   r   )datatensor_type)ziprC   r>   r?   r   appendr	   itemsrescale_and_normalizer
   r   )!r   r   r   re   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r   r   transformed_imagesr   image_boxesr/   rA   rB   transformed_imagegrouped_imagesgrouped_images_indexprocessed_images_groupedr`   stacked_imagesprocessed_imagess!                                    r-   r   !VitPoseImageProcessor._preprocess  s   . !4!#&)&%&8"{&C$;$(JJ%)[[)9	%MFE )-(=(=eVU]^ei(=(j%&--.?@ ' '9 (F/DV/o,,#% %3%9%9%;!E>!77
LV_N /=$U+	 &<
 **BDXY.2B!CQ_``r,   rm   r   c                     UR                   u  pVpx[        U5      u  p[        XUS9n[        U5       H  n[	        X   X,   X<   Xx/S9X'   M     X4$ )zRGet final keypoint predictions from heatmaps and transform them back to the image.r   )rA   rB   r   )r`   r   r   r_   r   )r   rm   rA   rB   r   r{   r}   r?   r>   r   r   r   rf   s                r-   keypoints_from_heatmaps-VitPoseImageProcessor.keypoints_from_heatmaps  s]     (0~~$
v1(;26FSz"A&ux	`f_noEH #}r,   outputsr   kernel_size	thresholdtarget_sizesc                 4   [        U S5        UR                  R                  u  pg  nUb  U[        U5      :w  a  [	        S5      e[
        R                  " US4[
        R                  S9n	[
        R                  " US4[
        R                  S9n
[        [        R                  " U6 5      n[        U5       Hr  nUb.  X\   S   X\   S   p[
        R                  " XX/5      nX   U-  X'   U R                  S   U R                  S	   nn[        X   UUS
9u  nnUXSS24'   UXSS24'   Mt     U R                  UR                  R!                  5       R#                  5       XUS9u  nn[
        R                  " US4[
        R                  S9nU	SS2SS24   USS2SS24'   U
SS2SS24   USS2SS24'   [$        R&                  " U5      n[$        R&                  " U5      n[$        R(                  " SU5      n[$        R&                  " [+        U5      5      n/ n[-        UUU5      nU Ho  n/ nU HS  n[/        U5      u  nnn UR1                  5       nUn!Ub  UU:  n"UU"   nUU"   nU!U"   n!UUU!U S.n#UR3                  U#5        MU     UR3                  U5        Mq     U$ )a^  
Transform the heatmaps into keypoint predictions and transform them back to the image.

Args:
    outputs (`VitPoseEstimatorOutput`):
        VitPoseForPoseEstimation model outputs.
    boxes (`list[list[list[float]]]` or `np.ndarray`):
        List or array of bounding boxes for each image. Each box should be a list of 4 floats representing the bounding
        box coordinates in COCO format (top_left_x, top_left_y, width, height).
    kernel_size (`int`, *optional*, defaults to 11):
        Gaussian kernel size (K) for modulation.
    threshold (`float`, *optional*, defaults to None):
        Score threshold to keep object detection predictions.
    target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*):
        Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size
        `(height, width)` of each image in the batch. If unset, predictions will be resize with the default value.
Returns:
    `list[list[Dict]]`: A list of dictionaries, each dictionary containing the keypoints and boxes for an image
    in the batch as predicted by the model.
r   NzTMake sure that you pass in as many target sizes as the batch dimension of the logitsrI   r6   r   r   r>   r?   )r0   r1   r   r4   )	keypointsr   labelsbbox)r   rm   r`   r   rt   r9   rQ   r;   list	itertoolschainr_   r:   re   rC   r   r   r   r   tensorr   r   r   nextr   r   )$r   r   r   r  r  r  r{   r|   r}   centersscalesflattened_boxesrf   r0   r1   scale_factorr>   r?   rA   rB   r   r   	all_boxesposesr  bboxes_xyxyresultspose_bbox_pairsimage_bboxesimage_resultsposescore	bbox_xyxykeypoints_labelskeeppose_results$                                       r-   post_process_pose_estimation2VitPoseImageProcessor.post_process_pose_estimation  s   8 	$(*1*:*:*@*@'
1a#
c,6G(Gstt((J?"**=:q/<y67z"A',8OA,>PQ@R\!xxK(^_%4%7,%F" IIg.		(0C6E3O4FTYhnoMFE"GqDM Fa4L # 44  "((*GK 5 
v HHj!_BJJ?	#AqsFO	!QqS&"1ac6N	!QqS&U#f%a/ll#5i#@A79eV[9!L;=M!)-o)>&eY#) ( 9,D:D!$KE'7'=$,0EM]gpq$$[1 " NN=) " r,   r"   )N)Tr   N)   )r   NN)-r#   r$   r%   r&   r'   r   valid_kwargsmodel_input_namesr   r   r   r   re   r   r   r    r!   r   r   r   r   r  r*   r9   rq   r   r   r(   r   r   strr   r   compilerdisabletupler   r   r   r   r   r   r  r+   __classcell__)r   s   @r-   r   r   P  sO   @.L'(&J$IC(DJL#(C!D # ;; De%&3; 45	;
 
; ;& 5922 De%&3d:2 	2
 ,2 c>)*T12 
2" ^^ e U|	
   
 D %)"'*.'-a^$-a -a 	-a
 L-a -a -a -a -a -a DK'$.-a 4;&--a t-a T/-a +-a  j(4/!-a" "#-a$  %-a& bjj 4''-a* 
+-ah ** 

 zz	
 & "&8<F)F De%&3F 	F
 4<F !4;.5F Fr,   r   )r   g      ?)r   )<r'   r	  rR   typingr   r   r   r9   r   torchvision.transforms.v2r   tvFimage_processing_backendsr   image_processing_utilsr   image_transformsr	   r
   image_utilsr   r   r   r   r   r   processing_utilsr   r   utilsr   r   r   r   r   scipy.linalgr   scipy.ndimager   r   modeling_vitposer   
get_loggerr#   loggerr   r&  r  rq   r   r*   rC   rX   rl   r   r   r   r   r   __all__r"   r,   r-   <module>r7     s   )   '   7 ; 2 E  5   ?8			H	%	#,e 	#" $ )	

	")) ) 	)
 )X"5 "bjj "BJJ "]_]g]g "J4rzz eBJJ

<R6S B9rzz 92:: 9_b 9kmkuku 9x-BJJ -

 -2:: -\^\f\f -kmkuku -`rzz bjj ( F. F FR #
#r,   