
    Z j;                     *   S r SSKJrJrJr  SSKJrJr  SSKJ	r	  \" 5       (       a  SSK
r
SSKJs  Jr    SSSS	SS
\S\S\S\\S4   4S jjr   SS	SSSSSS\\   S-  S\S\S\\\4   S-  S\S\\\\\-  4      4   4S jjr " S S\	5      rS/rg)z"Video processor class for Videomt.   )IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STDPILImageResampling)is_torch_availablerequires_backends)BaseVideoProcessor    Nmask_labelstorch.Tensor
mask_probs	query_idxmask_thresholdoverlap_mask_area_thresholdreturnc                    X:H  nUR                  5       nX   U:  nUR                  5       nXW-  n	U	R                  5       n
US:  =(       a    US:  =(       a    U
S:  nU(       a  Xh-  nUR                  5       U:  d  SnX4$ )aD  
Checks whether a predicted query produces a valid panoptic segment.

Args:
    mask_labels (`torch.Tensor`):
        Tensor of shape `(height, width)` containing the winning query index for each pixel.
    mask_probs (`torch.Tensor`):
        Tensor of shape `(num_queries, height, width)` containing per-query mask probabilities.
    query_idx (`int`):
        Index of the query to validate.
    mask_threshold (`float`, *optional*, defaults to 0.5):
        Threshold used to binarize the query mask probabilities.
    overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8):
        Minimum overlap ratio required between the assigned query area and the original query mask area.

Returns:
    `tuple[bool, torch.Tensor]`: A tuple containing whether the segment is valid and the final boolean mask for
    that segment.
r	   F)sumitem)r
   r   r   r   r   
query_maskquery_mask_areaoriginal_maskoriginal_area
final_maskfinal_mask_areamask_exists
area_ratios                څ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/videomt/video_processing_videomt.pycheck_segment_validityr      s    4 )J nn&O)^;M!%%'M+J nn&O!A%S-!*;SRS@SK$4
 #>>K""    pred_scorespred_labelslabel_ids_to_fusetarget_sizec                 p   Uc  U R                   S   OUS   nUc  U R                   S   OUS   n[        R                  " Xx4[        R                  U R                  S9S-
  n	/ n
U R                  5       n USS2SS4   U -  R                  S5      nSn0 n[        UR                   S   5       H  nX.   R                  5       n[        XXU5      u  nnU(       d  M.  U(       a  X;   a  X;   a	  X   U	U'   MH  XU'   XU'   [        X   R                  5       S5      nU
R                  UUUS.5        US-  nM     X4$ )aq  
Converts per-query mask predictions into a panoptic segmentation map.

Args:
    mask_probs (`torch.Tensor`):
        Tensor of shape `(num_queries, height, width)` containing per-query mask logits.
    pred_scores (`torch.Tensor`):
        Tensor of shape `(num_queries,)` containing the confidence score of each predicted query.
    pred_labels (`torch.Tensor`):
        Tensor of shape `(num_queries,)` containing the predicted class ID of each query.
    label_ids_to_fuse (`set[int]`, *optional*):
        Label IDs that should be fused across disconnected regions.
    mask_threshold (`float`, *optional*, defaults to 0.5):
        Threshold used to binarize the query mask probabilities.
    overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8):
        Minimum overlap ratio required to keep a predicted segment.
    target_size (`tuple[int, int]`, *optional*):
        Final `(height, width)` of the segmentation map. If unset, uses the spatial size of `mask_probs`.

Returns:
    `tuple[torch.Tensor, list[dict[str, int | float]]]`: The panoptic segmentation map and the metadata for each
    predicted segment.
N   r	      )dtypedevice   idlabel_idscore)shapetorchzeroslongr'   sigmoidargmaxranger   r   roundappend)r   r   r    r!   r   r   r"   heightwidthsegmentationsegmentsr
   current_segment_idstuff_memory_listr   
pred_classr   r   segment_scores                      r   compute_segmentsr>   G   s`   @ %0$7Za [^F#.#6JQKNE;;ejjIZIZ[^__LH##%Jq$}-
:BB1EK(*;,,Q/0	 +002
"8Y@[#
Z !@.+<+HZ(0B*-#5Z k499;Q?(&&	
 	a5 16 !!r   c                   6   \ rS rSr\R
                  r\r\	r
SSS.rSrSrSrSrSrSrSrS/rSS	S
\\\\4      S\S	   4S jrS
\\\\4      S\S	   4S jr SS
\\\\4      S\S\\   4S jjr    SS
\\\\4      S\S\S\S\\   S-  S\\   4S jjrSrg)VideomtVideoProcessor   i  )r6   r7   TFgp?pixel_values_videosmasks_queries_logitsr   target_sizesr   c                     / n[        U5       H5  u  pE[        R                  " X   S   USSS9S   nUR                  U5        M7     U$ )z=Interpolates mask logits to each frame's original resolution.)N.bilinearF)sizemodealign_cornersr	   )	enumerateFinterpolater5   )selfrC   rD   resizedidxoriginal_size	upsampleds          r   _resize_mask_logits)VideomtVideoProcessor._resize_mask_logits   s\     "+L"9C$))4"#	
 I NN9% #: r   c                 6   [        U S/5        UR                  nUR                  nUR                  5       R	                  SS9SSS24   nUR                  5       R                  5       n[        R                  " UR                  SS5      UR                  S5      5      nUR                  UR                  S   UR                  S   UR                  S	   UR                  S   5      nU R                  Xr5      nU V	s/ s H  oR                  SS9PM     sn	$ s  sn	f )
a@  
Converts the output of [`VideomtForUniversalSegmentation`] into semantic segmentation predictions.

Args:
    outputs ([`VideomtForUniversalSegmentationOutput`]):
        Raw outputs of the model.
    target_sizes (`list[tuple[int, int]]`):
        List of `(height, width)` tuples corresponding to the requested final size of each prediction.
        Length should match the number of frames in the output.

Returns:
    `list[torch.Tensor]`: A list of tensors, each of shape `(height, width)`, where each value is the
    predicted class index for the corresponding pixel.
r.   dim.Nr$   r%   r	   )r   rC   class_queries_logitsfloatsoftmaxr1   r.   matmul	transposeflattenreshaper-   rR   r2   )
rM   outputsrD   rC   rY   masks_classesmasks_probssegmentation_logitsoutput_logitslogits
             r   "post_process_semantic_segmentation8VideomtVideoProcessor.post_process_semantic_segmentation   s   & 	$	*&;;&;; -224<<<DS#2#XN*002::<#ll=+B+B1a+H+J]J]^_J`a199a -"5"5b"9;;L;LR;PR]RcRcdfRg
 001DS1>?#???s   ;D	thresholdc           
          [        U S/5        UR                  nUR                  nU R                  XR5      nUR                  nUR
                  S   nUR
                  S   n	/ n
[        U5       GH  nXk   nXK   nUR                  5       R                  SS9SSS24   nUR                  S5      u  nnUS:  nUR                  5       R                  5       nUR                  S5      UR                  S5      -  R                  S5      UR                  S5      R                  S5      S	-   -  nUU-  n[        R                  " X+   S[        R                  US
9n/ nSn[        U	5       H{  nUU   R!                  5       n[        R"                  " UU   5      (       d  M6  UU:  d  M>  UUUU   '   UR%                  UUU   R!                  5       ['        US5      S.5        US-  nM}     U
R%                  UUS.5        GM     U
$ )a0  
Converts the output of [`VideomtForUniversalSegmentation`] into instance segmentation predictions.

Args:
    outputs ([`VideomtForUniversalSegmentationOutput`]):
        Raw outputs of the model.
    target_sizes (`list[tuple[int, int]]`):
        List of `(height, width)` tuples corresponding to the requested final size of each prediction.
        Length should match the number of frames in the output.
    threshold (`float`, *optional*, defaults to 0.5):
        Minimum combined score to keep an instance.

Returns:
    `list[dict]`: A list of dicts (one per frame), each containing:
        - `"segmentation"` -- A `torch.Tensor` of shape `(height, width)` with instance IDs (or -1 for background).
        - `"segments_info"` -- A list of dicts with `"id"`, `"label_id"`, and `"score"` for each instance.
r.   r	   rX   rU   rV   .Nr$   gư>
fill_valuer&   r'   r(   r)   r8   segments_info)r   rY   rC   rR   r'   r-   r3   rZ   r[   maxr1   r^   r   r.   fullr0   r   anyr5   r4   )rM   r`   rD   rh   rY   rC   mask_probs_batchr'   
num_framesnum_queriesresults	frame_idx	mask_pred
mask_classclass_probsscorespred_classes
pred_masksr   mask_scoresr   r8   r9   r:   r   r,   s                             r   "post_process_instance_segmentation8VideomtVideoProcessor.post_process_instance_segmentation   s
   . 	$	*&;;&;;334HW%,,)//2
*004z*I(3I-8J$**,444<S#2#XFK#.??2#6 FL"QJ"*224J%--a0:3E3Ea3HHMMaP""1%))!,t3K !;.K ::l&="TYT^T^gmnLH!"";/	#I.33599Z	2338J:LLI!67OO"4(4Y(?(D(D(F%*5!_ '!+& 0 NNL8TUA +B r   Nr   r   r!   c                 b   [        U S/5        UR                  nUR                  nUR                  S   n	UR                  S   S-
  n
U R	                  Xr5      nUR                  5       R                  SS9R                  S5      u  p/ n[        U	5       GH  nX   nX   nX   nUR                  S   UR                  S   s=:X  a  UR                  S   :X  d  O  [        S5      eUR                  U
5      UU:  -  nUU   nUU   nUU   nUR                  S   S::  a`  Ub  X/   OUR                  SS u  nn[        R                  " UU4S[        R                  UR                  S9nUR                  U/ S	.5        M  [!        UUUUUUUb  X/   OSS
9u  nnUR                  UUS	.5        GM     U$ )a  
Converts the output of [`VideomtForUniversalSegmentation`] into panoptic segmentation predictions.

Args:
    outputs ([`VideomtForUniversalSegmentationOutput`]):
        Raw outputs of the model.
    target_sizes (`list[tuple[int, int]]`):
        List of `(height, width)` tuples corresponding to the requested final size of each prediction.
        Length should match the number of frames in the output.
    threshold (`float`, *optional*, defaults to 0.8):
        Minimum score to keep a predicted segment.
    mask_threshold (`float`, *optional*, defaults to 0.5):
        Threshold for binarizing mask probabilities.
    overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8):
        Overlap threshold to merge masks into a single segment.
    label_ids_to_fuse (`set[int]`, *optional*):
        Label IDs that should be fused across disconnected regions.

Returns:
    `list[dict]`: A list of dicts (one per frame), each containing:
        - `"segmentation"` -- A `torch.Tensor` of shape `(height, width)` with segment IDs (or -1 for background).
        - `"segments_info"` -- A list of dicts with `"id"`, `"label_id"`, and `"score"` for each segment.
r.   r	   rU   r$   rV   z1mask, scores and labels must have the same shape!Nrj   rl   )r   r   r    r!   r   r   r"   )r   rC   rY   r-   rR   rZ   r[   rn   r3   
ValueErrorner.   ro   r0   r'   r5   r>   )rM   r`   rD   rh   r   r   r!   rC   rY   rr   
num_labelsrq   pred_scores_batchpred_labels_batchrt   ru   r   r   r    to_keepr6   r7   r8   r9   s                           r   "post_process_panoptic_segmentation8VideomtVideoProcessor.post_process_panoptic_segmentation  s   @ 	$	*&;;&;;)//2
)//3a7
334HW/C/I/I/K/S/SXZ/S/[/_/_`b/c,z*I)4J+6K+6K$$Q';+<+<Q+?W;CTCTUVCWW !TUU!nnZ0K)4KLG#G,J%g.K%g.K"a';G;S 7YcYiYijkjlYm$zzUO%**MaMhMh  rRS%5%''"3-,G7C7OL3UY&"L( NNL8TU? +@ r    )      ?)皙?r   r   N)__name__
__module____qualname____firstlineno__r   BILINEARresampler   
image_meanr   	image_stdrG   	do_resizedo_center_crop
do_rescalerescale_factordo_normalizedo_convert_rgbdo_sample_framesmodel_input_nameslisttupleintrR   rf   rZ   dictr}   setr   __static_attributes__r   r   r   r@   r@      sc   !**H&J$IC(DINJNLN./, 5c?+ 
n		"#@ 5c?+#@ 
n		#@R 	E 5c?+E 	E
 
dEV  #-0-1M 5c?+M 	M
 M &+M s8d?M 
dM Mr   r@   )r   r   )r   r   N)__doc__image_utilsr   r   r   utilsr   r   video_processing_utilsr   r.   torch.nn.functionalnn
functionalrK   r   rZ   r   boolr   r   r   r   strr>   r@   __all__r   r   r   <module>r      s,   ) Z Z : 8 ##  ),*#*#*# *# 	*#
 "'*# 4 *#d  ),*.G"G"G"  G" 3x$	G"
 G" "'G" sCx4'G" >4S#+%5 6778G"TX. Xv #
#r   