"""PyTorch VitPose model."""

from dataclasses import dataclass

import torch
from torch import nn

from ... import initialization as init
from ...backbone_utils import load_backbone
from ...modeling_outputs import BackboneOutput
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, logging
from ...utils.generic import can_return_tuple
from .configuration_vitpose import VitPoseConfig


logger = logging.get_logger(__name__)


@auto_docstring(
    custom_intro="""
    Class for outputs of pose estimation models.
    """
)
@dataclass
class VitPoseEstimatorOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Loss is not supported at this moment. See https://github.com/ViTAE-Transformer/ViTPose/tree/main/mmpose/models/losses for further detail.
    heatmaps (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`):
        Heatmaps as predicted by the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
        (also called feature maps) of the model at the output of each stage.
    """

    loss: torch.FloatTensor | None = None
    heatmaps: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
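

# As with every `ModelOutput` subclass, `VitPoseEstimatorOutput` supports attribute,
# dict-style and tuple-style access; a small sketch:
#
#     outputs = model(pixel_values)
#     outputs.heatmaps      # attribute access
#     outputs["heatmaps"]   # dict-style access
#     outputs.to_tuple()    # tuple of all non-None fields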


@auto_docstring
class VitPosePreTrainedModel(PreTrainedModel):
    config: VitPoseConfig
    base_model_prefix = "vit"
    main_input_name = "pixel_values"
    input_modalities = ("image",)
    supports_gradient_checkpointing = True

    @torch.no_grad()
    def _init_weights(self, module: nn.Linear | nn.Conv2d | nn.LayerNorm):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            init.trunc_normal_(module.weight, mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            init.zeros_(module.bias)
            init.ones_(module.weight)


def flip_back(output_flipped, flip_pairs, target_type="gaussian-heatmap"):
    """Flip the flipped heatmaps back to the original form.

    Args:
        output_flipped (`torch.Tensor` of shape `(batch_size, num_keypoints, height, width)`):
            The output heatmaps obtained from the flipped images.
        flip_pairs (`torch.Tensor` of shape `(num_keypoints, 2)`):
            Pairs of keypoints which are mirrored (for example, left ear -- right ear).
        target_type (`str`, *optional*, defaults to `"gaussian-heatmap"`):
            Target type to use. Can be "gaussian-heatmap" or "combined-target".
            gaussian-heatmap: Classification target with gaussian distribution.
            combined-target: The combination of classification target (response map) and regression target (offset map).
            Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020).

    Returns:
        torch.Tensor: heatmaps that are flipped back to the original image.
    """
    if target_type not in ["gaussian-heatmap", "combined-target"]:
        raise ValueError("target_type should be gaussian-heatmap or combined-target")

    if output_flipped.ndim != 4:
        raise ValueError("output_flipped should be [batch_size, num_keypoints, height, width]")

    batch_size, num_keypoints, height, width = output_flipped.shape
    channels = 1
    if target_type == "combined-target":
        channels = 3
        output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...]
    output_flipped = output_flipped.reshape(batch_size, -1, channels, height, width)
    output_flipped_back = output_flipped.clone()

    # Swap the mirrored left-right keypoint channels
    for left, right in flip_pairs.tolist():
        output_flipped_back[:, left, ...] = output_flipped[:, right, ...]
        output_flipped_back[:, right, ...] = output_flipped[:, left, ...]
    output_flipped_back = output_flipped_back.reshape((batch_size, num_keypoints, height, width))
    # Flip each heatmap horizontally back to the original orientation
    output_flipped_back = output_flipped_back.flip(-1)
    return output_flipped_back
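

# A minimal sketch of how `flip_back` restores keypoint identity after test-time
# horizontal flipping (the pair indices below are illustrative, not a real skeleton):
#
#     heatmaps = torch.zeros(1, 2, 2, 2)
#     heatmaps[0, 0, 0, 0] = 1.0           # keypoint 0 ("left") fires top-left
#     flip_pairs = torch.tensor([[0, 1]])  # keypoints 0 and 1 are mirrored
#     restored = flip_back(heatmaps, flip_pairs)
#     assert restored[0, 1, 0, 1] == 1.0   # keypoint 1 ("right") now fires top-right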


class VitPoseSimpleDecoder(nn.Module):
    """
    Simple decoding head consisting of a ReLU activation, 4x upsampling and a 3x3 convolution, turning the
    feature maps into heatmaps.
    """

    def __init__(self, config: VitPoseConfig) -> None:
        super().__init__()

        self.activation = nn.ReLU()
        self.upsampling = nn.Upsample(scale_factor=config.scale_factor, mode="bilinear", align_corners=False)
        self.conv = nn.Conv2d(
            config.backbone_config.hidden_size, config.num_labels, kernel_size=3, stride=1, padding=1
        )

    def forward(self, hidden_state: torch.Tensor, flip_pairs: torch.Tensor | None = None) -> torch.Tensor:
        hidden_state = self.activation(hidden_state)
        hidden_state = self.upsampling(hidden_state)
        heatmaps = self.conv(hidden_state)

        if flip_pairs is not None:
            heatmaps = flip_back(heatmaps, flip_pairs)

        return heatmaps
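

# Shape sketch for the simple head, assuming the default "vitpose-base-simple"
# configuration (hidden_size=768, image_size=(256, 192), patch_size=(16, 16),
# scale_factor=4, num_labels=17):
#
#     (batch, 768, 16, 12) -> ReLU + 4x upsampling -> (batch, 768, 64, 48) -> 3x3 conv -> (batch, 17, 64, 48)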
U =r$ )VitPoseClassicDecoder   z
Classic decoding head consisting of a 2 deconvolutional blocks, followed by a 1x1 convolution layer,
turning the feature maps into heatmaps.
r*   c           	        > [         TU ]  5         [        R                  " UR                  R
                  SSSSSS9U l        [        R                  " S5      U l        [        R                  " 5       U l
        [        R                  " SSSSSSS9U l        [        R                  " S5      U l        [        R                  " 5       U l        [        R                  " SUR                  SSSS9U l        g )	N   rH      r   F)rf   rg   rh   r9   r   re   )ri   rj   r   ConvTranspose2dro   rp   deconv1BatchNorm2d
batchnorm1rk   relu1deconv2
batchnorm2relu2r4   rq   rr   rs   s     r&   rj   VitPoseClassicDecoder.__init__   s    ))""..1VW^c
 ..-WWY
))#s!UV]bc..-WWY
IIc6#4#4!AWXY	r%   Nrv   rR   c                    U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU R                  U5      nU R                  U5      nUb  [        X25      nU$ ry   )r   r   r   r   r   r   rr   r\   rz   s       r&   r{   VitPoseClassicDecoder.forward   sy    ||L1|4zz,/||L1|4zz,/99\*! 6Hr%   )r   r   rr   r   r   r   r   ry   r}   r   s   @r&   r   r      s;    
Z} ZELL ellT>Q  r%   r   z?
    The VitPose model with a pose estimation head on top.
    """
)
class VitPoseForPoseEstimation(VitPosePreTrainedModel):
    def __init__(self, config: VitPoseConfig) -> None:
        super().__init__(config)

        self.backbone = load_backbone(config)

        # The decoder heads below rely on these backbone config attributes
        if not hasattr(self.backbone.config, "hidden_size"):
            raise ValueError("The backbone should have a hidden_size attribute")
        if not hasattr(self.backbone.config, "image_size"):
            raise ValueError("The backbone should have an image_size attribute")
        if not hasattr(self.backbone.config, "patch_size"):
            raise ValueError("The backbone should have a patch_size attribute")

        self.head = VitPoseSimpleDecoder(config) if config.use_simple_decoder else VitPoseClassicDecoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        dataset_index: torch.Tensor | None = None,
        flip_pairs: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> VitPoseEstimatorOutput:
        r"""
        dataset_index (`torch.Tensor` of shape `(batch_size,)`):
            Index to use in the Mixture-of-Experts (MoE) blocks of the backbone.

            This corresponds to the dataset index used during training, e.g. for a single dataset, index 0 refers to
            that dataset; for multiple datasets, index 0 refers to dataset A (e.g. MPII) and index 1 refers to
            dataset B (e.g. CrowdPose).
        flip_pairs (`torch.Tensor`, *optional*):
            Pairs of keypoints to mirror (for example, left ear -- right ear).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, VitPoseForPoseEstimation
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> processor = AutoImageProcessor.from_pretrained("usyd-community/vitpose-base-simple")
        >>> model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]]
        >>> inputs = processor(image, boxes=boxes, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> heatmaps = outputs.heatmaps
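        >>> # For the two boxes above, the default 17 COCO keypoints, and this checkpoint's
        >>> # 64x48 output resolution, the expected heatmap shape would be:
        >>> heatmaps.shape
        torch.Size([2, 17, 64, 48])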
        ```"""
        loss = None
        if labels is not None:
            raise NotImplementedError("Training is not yet supported")

        outputs = self.backbone.forward_with_filtered_kwargs(
            pixel_values,
            dataset_index=dataset_index,
            **kwargs,
        )

        # Turn the sequence of hidden states into a (batch_size, hidden_size, patch_height, patch_width) feature map
        sequence_output = outputs.feature_maps[-1]
        batch_size = sequence_output.shape[0]
        patch_height = self.config.backbone_config.image_size[0] // self.config.backbone_config.patch_size[0]
        patch_width = self.config.backbone_config.image_size[1] // self.config.backbone_config.patch_size[1]
        sequence_output = (
            sequence_output.permute(0, 2, 1).reshape(batch_size, -1, patch_height, patch_width).contiguous()
        )

        heatmaps = self.head(sequence_output, flip_pairs=flip_pairs)

        return VitPoseEstimatorOutput(
            loss=loss,
            heatmaps=heatmaps,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["VitPoseForPoseEstimation", "VitPosePreTrainedModel"]
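

# Keypoint coordinates are recovered from the heatmaps outside of this module (see
# `VitPoseImageProcessor.post_process_pose_estimation`). A rough sketch of plain argmax
# decoding, without the sub-pixel refinement and box rescaling done there:
#
#     scores, flat_index = heatmaps.flatten(-2).max(dim=-1)  # both (batch, num_keypoints)
#     y = flat_index // heatmaps.shape[-1]
#     x = flat_index % heatmaps.shape[-1]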