
    Z j                     .   S SK rS SKrS SK Jr  S SKJr  S SKrS SKrS SK	J
s  Jr  S SKJrJ
r
  SSKJr  SSKJr  SSKJrJrJr  SS	KJr  SS
KJrJr  SSKJr  SSKJrJ r J!r!  SSK"J#r#  SSK$J%r%  SSK&J'r'  \" 5       (       a  S SK(J)r)  \!" 5       (       a  S SK*J+r+  S SK,J-r-   " S S\
R\                  5      r/ " S S\
R\                  5      r0 " S S\
R\                  5      r1 " S S\
R\                  5      r2 SRS\
R\                  S\R                  S\R                  S\R                  S \R                  S-  S!\3S"\34S# jjr4 " S$ S%\
R\                  5      r5SSS&\R                  S'\3S(\6S)\R                  4S* jjr7 " S+ S,\
R\                  5      r8 " S- S.\
R\                  5      r9 " S/ S0\5      r: " S1 S2\
R\                  5      r;\\ " S3S49 " S5 S6\5      5       5       r< STS7\R                  S8\R                  S)\R                  4S9 jjr=S:\S;\S)\4S< jr>S:\R                  S;\R                  S)\R                  4S= jr? " S> S?\
R\                  5      r@S:\S;\S@\AS)\4SA jrBS:\R                  S;\R                  S@\AS)\R                  4SB jrC " SC SD\
R\                  5      rD\  " SE SF\5      5       rE " SG SH\
R                  5      rG " SI SJ\
R\                  5      rH " SK SL\
R\                  5      rI " SM SN\
R\                  5      rJ\ " SOS49 " SP SQ\E5      5       rKSFSQ/rLg)U    N)Callable)	dataclass)Tensornn   )initialization)ACT2FN)ModelOutputis_scipy_availablerequires_backends)GradientCheckpointingLayer)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringis_accelerate_available)merge_with_config_defaults)capture_outputs   )VideomtConfig)linear_sum_assignment)PartialState)reducec                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )VideomtPatchEmbeddings3   z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
c                   > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pT[        U[        R                  R                  5      (       a  UOX"4n[        U[        R                  R                  5      (       a  UOX34nUS   US   -  US   US   -  -  nX l        X0l        X@l        X`l
        [        R                  " XEX3S9U l        g )Nr   r   kernel_sizestride)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterablenum_patchesr   Conv2d
projection)selfconfigr$   r%   r&   r'   r,   	__class__s          }/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/videomt/modeling_videomt.pyr#   VideomtPatchEmbeddings.__init__:   s    !'!2!2F4E4EJ$*$7$79K9Kk#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!!}
15*Q-:VW=:XY$$(&))L:i    pixel_valuesreturnc                 4   UR                   S   nX R                  :w  a  [        SU R                   SU S35      eUR                  U R                  R
                  R                  S9nU R	                  U5      R                  S5      R                  SS5      nU$ )Nr   zoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .dtype   )	shaper&   
ValueErrortor.   weightr:   flatten	transpose)r/   r5   r&   
embeddingss       r2   forwardVideomtPatchEmbeddings.forwardI   s    #))!,,,,!../yaI 
 $T__-C-C-I-IJ__\2::1=GG1M
r4   )r$   r&   r,   r%   r.   )__name__
__module____qualname____firstlineno____doc__r#   torchr   rC   __static_attributes____classcell__r1   s   @r2   r   r   3   s.    j
ELL 
U\\ 
 
r4   r   c                      ^  \ rS rSrSrS\SS4U 4S jjrSS\R                  S\R                  S-  S\R                  4S	 jjr	S
r
U =r$ )VideomtEmbeddingsV   zE
Construct the CLS token, mask token, position and patch embeddings.
r0   r6   Nc                 6  > [         TU ]  5         Xl        UR                  U l        [        R
                  " [        R                  " SSUR                  5      5      U l	        [        R
                  " [        R                  " SUR                  UR                  5      5      U l        [        U5      U l        U R                  R                  n[        R                   " UR"                  5      U l        SUR                  -   U l        [        R(                  " X!R                  5      U l        U R-                  S[        R.                  " U5      R1                  S5      SS9  [        R
                  " [        R                  " SSUR                  5      5      U l        g )Nr   position_idsr   F)
persistent)r"   r#   r0   r%   r   	ParameterrJ   randnr'   	cls_tokenzerosnum_register_tokensregister_tokensr   patch_embeddingsr,   Dropouthidden_dropout_probdropoutnum_prefix_tokens	Embeddingposition_embeddingsregister_bufferarangeexpand
mask_token)r/   r0   r,   r1   s      r2   r#   VideomtEmbeddings.__init__[   s    ++ekk!Q8J8J&KL!||EKK6;U;UW]WiWi,jk 6v >++77zz&"<"<=!"V%?%?!?#%<<=O=O#P ^U\\+-F-M-Mg-Vchi,,u{{1a9K9K'LMr4   r5   bool_masked_posc                    UR                   S:X  a=  UR                  u  p4pVnUR                  X4-  XVU5      nUb  UR                  X4-  S5      nO2Ub/  UR                   S:  a  UR                  UR                  S   S5      nUR                  S   nU R                  U5      nUbX  UR	                  UR
                  [        R                  S9R                  S5      n	[        R                  " XR                  U5      nU R                  R                  USS5      n
U R                  R                  USS5      nXR                  U R                  5      -   n[        R                   " XU/SS9nU R#                  U5      nU$ )N   rT   r;   r   )devicer:   r   dim)ndimr<   reshaper\   r>   rk   rJ   bool	unsqueezewhererf   rX   re   r[   rb   rR   catr_   )r/   r5   rh   
batch_size
num_framesr&   heightwidthrB   mask
cls_tokensr[   s               r2   rC   VideomtEmbeddings.forwardk   sa   !BNBTBT?JL%'//
0G_deL*"1"9"9*:QSU"V(_-A-AA-E-55o6K6KA6NPRSO!''*
**<8
&"%%Z->->ejj%Q[[\^_DT??JGJ^^**:r2>
..55j"bI":":4;L;L"MM
YY
ZHaP
\\*-
r4   )	rX   r0   r_   rf   r`   r\   r%   rb   r[   NrE   rF   rG   rH   rI   r   r#   rJ   r   rC   rK   rL   rM   s   @r2   rO   rO   V   sR    N} N N ELL 5<<RVCV bgbnbn  r4   rO   c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )
VideomtMLP   r6   c                 z  > [         TU ]  5         UR                  =p#[        UR                  UR                  -  5      n[
        R                  " X$SS9U l        [        UR                  [        5      (       a  [        UR                     U l        OUR                  U l        [
        R                  " XCSS9U l        g )NTbias)r"   r#   r'   int	mlp_ratior   Linearfc1r(   
hidden_actstrr	   
activationfc2r/   r0   in_featuresout_featureshidden_featuresr1   s        r2   r#   VideomtMLP.__init__   s    %+%7%77f0063C3CCD99[Ef''--$V%6%67DO$//DO99_Fr4   hidden_statec                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r{   )r   r   r   r/   r   s     r2   rC   VideomtMLP.forward   s2    xx-|4xx-r4   )r   r   r   r6   N
rE   rF   rG   rH   r#   rJ   r   rC   rK   rL   rM   s   @r2   r~   r~      s)    	GELL U\\  r4   r~   c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )VideomtGatedMLP   r6   c                 $  > [         TU ]  5         UR                  =p#[        UR                  UR                  -  5      n[        US-  S-  5      S-   S-  S-  n[
        R                  " USU-  SS9U l        [
        R                  " XCSS9U l        g Nr;   r         Tr   	r"   r#   r'   r   r   r   r   
weights_inweights_outr   s        r2   r#   VideomtGatedMLP.__init__       %+%7%77f0063C3CCD2Q67!;AAE))K_1D4P99_Nr4   r   c                     U R                  U5      nUR                  SSS9u  p#[        R                  R	                  U5      U-  nU R                  U5      $ Nr;   rT   rl   r   chunkr   
functionalsilur   r/   r   x1x2hiddens        r2   rC   VideomtGatedMLP.forward   Q    |4##A2#.##B'",''r4   r   r   r   r   rM   s   @r2   r   r      )    O(ELL (U\\ ( (r4   r   modulequerykeyvalueattention_maskscalingr_   c                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )NrT   )rm   r:   )ptrainingr   r;   )rJ   matmulrA   r   r   softmaxfloat32r>   r:   r_   r   
contiguous)
r   r   r   r   r   r   r_   kwargsattn_weightsattn_outputs
             r2   eager_attention_forwardr      s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r4   c            
          ^  \ rS rSrSrU 4S jr S
S\R                  S\R                  S-  S\\R                  \R                  S-  4   4S jjr	S	r
U =r$ )VideomtAttention   z=Multi-headed attention from 'Attention Is All You Need' paperc                    > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      F)r"   r#   r0   r'   	embed_dimnum_attention_heads	num_headshead_dimr=   scaleattention_dropoutr_   	is_causalr   r   k_projv_projq_projout_projr/   r0   r1   s     r2   r#   VideomtAttention.__init__   s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar4   Nhidden_statesr   r6   c                    UR                   SS n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n[        R                  " U R                  R                  [        5      n	U	" U UUUUU R                  U R                  U R                  (       d  SOU R                  S9u  pU
R                   " / UQSP76 R#                  5       n
U R%                  U
5      n
X4$ )z#Input shape: Batch x Time x ChannelNrT   r   r;           )r   r   r_   )r<   r   r   viewrA   r   r   r   get_interfacer0   _attn_implementationr   r   r   r   r_   ro   r   r   )r/   r   r   r   input_shapehidden_shapequerieskeysvaluesattention_interfacer   r   s               r2   rC   VideomtAttention.forward   s6    $))#2.88b8$--8++m,11,?II!QO{{=)..|<FFq!L]+00>HHAN(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
! "));;;;FFHmmK0((r4   )r0   r_   r   r   r   r   r   r   r   r   r   r{   )rE   rF   rG   rH   rI   r#   rJ   r   tuplerC   rK   rL   rM   s   @r2   r   r      s[    GB. /3!)||!) t+!)
 
u||U\\D00	1!) !)r4   r   input	drop_probr   r6   c                    US:X  d  U(       d  U $ SU-
  nU R                   S   4SU R                  S-
  -  -   nU[        R                  " X@R                  U R
                  S9-   nUR                  5         U R                  U5      U-  nU$ )z[
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

r   r   r   )r   r:   rk   )r<   rn   rJ   randr:   rk   floor_div)r   r   r   	keep_probr<   random_tensoroutputs          r2   	drop_pathr      s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FMr4   c                      ^  \ rS rSrSrSS\S-  SS4U 4S jjjrS\R                  S\R                  4S jr	S\
4S	 jrS
rU =r$ )VideomtDropPathi
  zXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r6   c                 .   > [         TU ]  5         Xl        g r{   )r"   r#   r   )r/   r   r1   s     r2   r#   VideomtDropPath.__init__  s    "r4   r   c                 B    [        XR                  U R                  5      $ r{   )r   r   r   r/   r   s     r2   rC   VideomtDropPath.forward  s    FFr4   c                      SU R                    3$ )Nzp=r   r/   s    r2   
extra_reprVideomtDropPath.extra_repr  s    DNN#$$r4   r   r{   )rE   rF   rG   rH   rI   floatr#   rJ   r   rC   r   r   rK   rL   rM   s   @r2   r   r   
  sQ    b#%$, #$ # #GU\\ Gell G%C % %r4   r   c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )VideomtSwiGLUFFNi  r6   c                 $  > [         TU ]  5         UR                  =p#[        UR                  UR                  -  5      n[        US-  S-  5      S-   S-  S-  n[
        R                  " USU-  SS9U l        [
        R                  " XCSS9U l        g r   r   r   s        r2   r#   VideomtSwiGLUFFN.__init__  r   r4   r   c                     U R                  U5      nUR                  SSS9u  p#[        R                  R	                  U5      U-  nU R                  U5      $ r   r   r   s        r2   rC   VideomtSwiGLUFFN.forward"  r   r4   r   r   r   rM   s   @r2   r   r     r   r4   r   c                      ^  \ rS rSrSrS\SS4U 4S jjr SS\R                  S\R                  S-  S\R                  4S	 jjr	S
r
U =r$ )VideomtLayeri)  zCThis corresponds to the Block class in the original implementation.r0   r6   Nc                   > [         TU ]  5         [        R                  " UR                  UR
                  S9U l        [        U5      U l        [        U5      U l
        UR                  S:  a  [        UR                  5      O[        R                  " 5       U l        [        R                  " UR                  UR
                  S9U l        UR                   (       a  [#        U5      U l        O['        U5      U l        [        U5      U l        g )Nepsr   )r"   r#   r   	LayerNormr'   layer_norm_epsnorm1r   	attentionVideomtLayerScalelayer_scale1drop_path_rater   Identityr   norm2use_swiglu_ffnr   mlpr~   layer_scale2r   s     r2   r#   VideomtLayer.__init__,  s    \\&"4"4&:O:OP
)&1-f5CICXCX[^C^)>)>?dfdododq\\&"4"4&:O:OP
  '/DH!&)DH-f5r4   r   r   c                 &   U R                  U5      nU R                  X25      u  pEU R                  U5      nU R                  U5      U-   nU R	                  U5      nU R                  U5      nU R                  U5      nU R                  U5      U-   nU$ r{   )r  r  r	  r   r  r  r  )r/   r   r   hidden_states_normself_attention_output_layer_outputs          r2   rC   VideomtLayer.forward<  s    
 "ZZ6#'>>2D#U  $ 1 12G H '<=M zz-0xx-((6 ~~l3mCr4   )r  r   r	  r  r  r  r  r{   r|   rM   s   @r2   r   r   )  sU    M6} 6 6& /3|| t+ 
	 r4   r   c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )r  iS  r6   c                    > [         TU ]  5         [        R                  " UR                  [
        R                  " UR                  5      -  5      U l        g r{   )	r"   r#   r   rV   layerscale_valuerJ   onesr'   lambda1r   s     r2   r#   VideomtLayerScale.__init__T  s8    ||F$;$;ejjI[I[>\$\]r4   r   c                     XR                   -  $ r{   r  r   s     r2   rC   VideomtLayerScale.forwardX  s    ll**r4   r  r   r   rM   s   @r2   r  r  S  s)    ^+ELL +U\\ + +r4   r  a  
    Class for outputs of [`VideomtForUniversalSegmentationOutput`].

    This output can be directly passed to [`~VideomtVideoProcessor.post_process_semantic_segmentation`] or
    [`~VideomtVideoProcessor.post_process_instance_segmentation`] or
    [`~VideomtVideoProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please, see
    [`~VideomtVideoProcessor`] for details regarding usage.
    )custom_introc                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\R                  S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S	'   S
rg)%VideomtForUniversalSegmentationOutputi\  a<  
loss (`torch.Tensor`, *optional*):
    The computed loss, returned when labels are present.
class_queries_logits (`torch.FloatTensor`):
    A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
    query. Note the `+ 1` is needed because we incorporate the null class.
masks_queries_logits (`torch.FloatTensor`):
    A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
    query.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
    Last hidden states (final feature map) of the last layer.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
    shape `(batch_size, sequence_length, hidden_size)`. Hidden-states all layers of the model.
attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`. Self and Cross Attentions weights from transformer decoder.
Nlossclass_queries_logitsmasks_queries_logitslast_hidden_stater   
attentions )rE   rF   rG   rH   rI   r#  rJ   FloatTensor__annotations__r$  r%  r&  r   r   r'  rK   r(  r4   r2   r"  r"  \  s    & &*D%

d
")59%++d2959%++d2926u((4/659M5**+d2926Je''(4/6r4   r"  input_featurespoint_coordinatesc                     UR                  5       S:X  a  SnUR                  S5      n[        R                  R                  R
                  " U SU-  S-
  40 UD6nU(       a  UR                  S5      nU$ )a  
A wrapper around `torch.nn.functional.grid_sample` to support 3D point_coordinates tensors.

Args:
    input_features (`torch.Tensor` of shape (batch_size, channels, height, width)):
        A tensor that contains features map on a height * width grid
    point_coordinates (`torch.Tensor` of shape (batch_size, num_points, 2) or (batch_size, grid_height, grid_width,:
    2)):
        A tensor that contains [0, 1] * [0, 1] normalized point coordinates
    add_dim (`bool`):
        boolean value to keep track of added dimension

Returns:
    point_features (`torch.Tensor` of shape (batch_size, channels, num_points) or (batch_size, channels,
    height_grid, width_grid):
        A tensor that contains features for points in `point_coordinates`.
r   Tr;   g       @      ?)rm   rq   rJ   r   r   grid_samplesqueeze)r+  r,  add_dimr   point_featuress        r2   sample_pointr3    st    ( !#-77: XX((44^SK\E\_bEbmflmN'//2r4   inputslabelsc                    U R                  5       R                  S5      n S[        R                  " XR                  5      -  nU R                  S5      SS2S4   UR                  S5      SSS24   -   nSUS-   US-   -  -
  nU$ )a  
A pair wise version of the dice loss, see `dice_loss` for usage.

Args:
    inputs (`torch.Tensor`):
        A tensor representing a mask
    labels (`torch.Tensor`):
        A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
        (0 for the negative class and 1 for the positive class).

Returns:
    `torch.Tensor`: The computed loss between each pairs.
r   r;   rT   N)sigmoidr@   rJ   r   Tsum)r4  r5  	numeratordenominatorr#  s        r2   pair_wise_dice_lossr<    sz     ^^%%a(FELL22I**R.D)FJJrN47,CCK	A+/22DKr4   c                 Z   U R                   S   n[        R                  " SS9nU" U [        R                  " U 5      5      nU" U [        R
                  " U 5      5      n[        R                  " XB-  UR                  5      n[        R                  " XR-  SU-
  R                  5      nXg-   nU$ )a  
A pair wise version of the cross entropy loss, see `sigmoid_cross_entropy_loss` for usage.

Args:
    inputs (`torch.Tensor`):
        A tensor representing a mask.
    labels (`torch.Tensor`):
        A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
        (0 for the negative class and 1 for the positive class).

Returns:
    loss (`torch.Tensor`): The computed loss between each pairs.
r   none	reduction)r<   r   BCEWithLogitsLossrJ   	ones_like
zeros_liker   r8  )	r4  r5  height_and_width	criterioncross_entropy_loss_poscross_entropy_loss_negloss_posloss_negr#  s	            r2   $pair_wise_sigmoid_cross_entropy_lossrJ    s     ||A$$v6I&vuv/FG&vu/?/?/GH||2EvxxPH||2EF
~~VHDKr4   c                      ^  \ rS rSrSr SS\S\S\S\4U 4S jjjr\R                  " 5       S\R                  S	\R                  S
\R                  S\R                  S\\\
      4
S j5       rSrU =r$ )VideomtHungarianMatcheri  aa  This class computes an assignment between the labels and the predictions of the network.

For efficiency reasons, the labels don't include the no_object. Because of this, in general, there are more
predictions than labels. In this case, we do a 1-to-1 matching of the best predictions, while the others are
un-matched (and thus treated as non-objects).

cost_class	cost_mask	cost_dice
num_pointsc                    > [         TU ]  5         US:X  a  US:X  a  US:X  a  [        S5      eX@l        Xl        X l        X0l        g)a  Creates the matcher

Params:
    cost_class (`float`, *optional*, defaults to 1.0):
        Relative weight of the classification error in the matching cost.
    cost_mask (`float`, *optional*,  defaults to 1.0):
        This is the relative weight of the focal loss of the binary mask in the matching cost.
    cost_dice (`float`, *optional*, defaults to 1.0):
        This is the relative weight of the dice loss of the binary mask in the matching cost.
    num_points (`int`, *optional*, defaults to 12544):
        No. of points to sample on which the mask loss will be calculated. The same set of K points are
        uniformly sampled for all prediction and ground truth masks to construct the cost matrix for bipartite
        matching.
r   zAll costs can't be 0N)r"   r#   r=   rP  rM  rN  rO  )r/   rM  rN  rO  rP  r1   s        r2   r#    VideomtHungarianMatcher.__init__  sC    " 	?yA~)q.344$$""r4   r%  r$  mask_labelsclass_labelsr6   c           
         / nUR                   S   n[        U5       GH  nX'   R                  S5      nX   n	USS2XG   4   * n
X7   R                  U	5      nUSS2S4   nU	SS2S4   n	[        R
                  " SU R                  SU	R                  S9nUR                  UR                   S   SS5      n[        XSS9R                  S5      nUR                  U	R                   S   SS5      n[        XSS9R                  S5      n	[        X5      n[        X5      nU R                  U-  U R                  U
-  -   U R                  U-  -   n[        R                   " U[        R"                  " S	5      5      n[        R$                  " U[        R"                  " S
5      5      n[        R&                  " US5      n[)        UR+                  5       5      nUR-                  U5        GM     U VVs/ s HL  u  nn[        R.                  " U[        R0                  S9[        R.                  " U[        R0                  S94PMN     nnnU$ s  snnf )a  
Params:
    masks_queries_logits (`torch.Tensor`):
        A tensor of dim `batch_size, num_queries, num_labels` with the classification logits.
    class_queries_logits (`torch.Tensor`):
        A tensor of dim `batch_size, num_queries, height, width` with the predicted masks.
    class_labels (`torch.Tensor`):
        A tensor of dim `num_target_boxes` (where num_target_boxes is the number of ground-truth objects in the
        target) containing the class labels.
    mask_labels (`torch.Tensor`):
        A tensor of dim `num_target_boxes, height, width` containing the target masks.

Returns:
    matched_indices (`list[tuple[Tensor]]`): A list of size batch_size, containing tuples of (index_i, index_j)
    where:
        - index_i is the indices of the selected predictions (in order)
        - index_j is the indices of the corresponding selected labels (in order)
    For each batch element, it holds:
        len(index_i) = len(index_j) = min(num_queries, num_target_boxes).
r   rT   Nr   r;   rk   Falign_cornersg    _Bg    _r9   )r<   ranger   r>   rJ   r   rP  rk   repeatr3  r0  rJ  r<  rN  rM  rO  minimumtensormaximum
nan_to_numr   cpuappend	as_tensorint64)r/   r%  r$  rS  rT  indicesrt   i
pred_probs	pred_maskrM  target_maskr,  target_coordinatespred_coordinatesrN  rO  cost_matrixassigned_indicesjmatched_indicess                        r2   rC   VideomtHungarianMatcher.forward  s   8 *, *//2
z"A-088<J,/I %Q%788J%.++I6K%ag.K!!T'*I !&

1dooqIYIY Z!2!9!9+:K:KA:NPQST!U&{V[\ddefgK077	8JAqQ$YPUV^^_`aI =YTI+ICI..94t7SSVZVdVdgpVppK--U\\$5GHK--U\\%5HIK**;:K0EkooFW0XNN+,? #F ho
gn_c_`bcU__Qekk2EOOAU[[4YZgn 	 
 
s   ,AIrM  rO  rN  rP  )r.  r.  r.  i 1  )rE   rF   rG   rH   rI   r   r   r#   rJ   no_gradr   listr   rC   rK   rL   rM   s   @r2   rL  rL    s     jo##27#JO#cf# #4 ]]_D#llD $llD \\	D
 llD 
eFm	D Dr4   rL  	num_masksc                     U R                  5       R                  S5      nSX1-  R                  S5      -  nUR                  S5      UR                  S5      -   nSUS-   US-   -  -
  nUR                  5       U-  nU$ )a  
Compute the DICE loss, similar to generalized IOU for masks as follows:

$$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x \cap y }{x \cup y + 1}} $$

In practice, since `labels` is a binary mask, (only 0s and 1s), dice can be computed as follow

$$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x * y }{x + y + 1}} $$

Args:
    inputs (`torch.Tensor`):
        A tensor representing a mask.
    labels (`torch.Tensor`):
        A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
        (0 for the negative class and 1 for the positive class).
    num_masks (`int`):
        The number of masks present in the current batch, used for normalization.

Returns:
    `torch.Tensor`: The computed loss.
r   r;   rT   )r7  r@   r9  )r4  r5  rr  probsr:  r;  r#  s          r2   	dice_lossru  @  sx    , NN$$Q'EU^((,,I))B-&**R.0K	A+/22D88:	!DKr4   c                     [         R                  " SS9nU" X5      nUR                  S5      R                  5       U-  nU$ )aX  
Args:
    inputs (`torch.Tensor`):
        A float tensor of arbitrary shape.
    labels (`torch.Tensor`):
        A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
        (0 for the negative class and 1 for the positive class).

Returns:
    loss (`torch.Tensor`): The computed loss.
r>  r?  r   )r   rA  meanr9  )r4  r5  rr  rE  cross_entropy_lossr#  s         r2   sigmoid_cross_entropy_lossry  ^  sB     $$v6I"62""1%))+i7DKr4   c                     ^  \ rS rSrS\S\\\4   4U 4S jjrS\	\	\
      S\	\
   4S jrS\	\   S\\\4   4S	 jrS
\S\	\   S\\R                      S\\\4   4S jrS\R                  S\	\R                     S\\R                      S\
S\\\R                  4   4
S jrS rS rS\R                  S\R                  4S jrS\R                  S\
S\
S\S\R                  4
S jr S S\R                  S
\R                  S\	\R                     S\	\R                     S\\\R                  4   S-  S\\\R                  4   4S jjrS\R                  S\R2                  S\R                  4S jrSrU =r$ )!VideomtLossir  r0   weight_dictc                   > [         TU ]  5         [        U S/5        UR                  U l        X l        UR
                  U l        [        R                  " U R                  S-   5      nU R                  US'   U R                  SU5        UR                  U l        UR                  U l        UR                  U l        [        UR                  UR                   UR"                  U R                  S9U l        g)a	  
The Videomt Loss. The loss is computed very similar to DETR. The process happens in two steps: 1) we
compute hungarian assignment between ground truth masks and the outputs of the model 2) we supervise each pair
of matched ground-truth / prediction (supervise class and mask)

Args:
    config (`VideomtConfig`):
        The configuration for Videomt model also containing loss calculation specific parameters.
    weight_dict (`dict[str, float]`):
        A dictionary of weights to be applied to the different losses.
scipyr   rT   empty_weightro  N)r"   r#   r   
num_labelsr|  no_object_weighteos_coefrJ   r  rc   train_num_pointsrP  oversample_ratioimportance_sample_ratiorL  class_weightdice_weightmask_weightmatcher)r/   r0   r|  r  r1   s       r2   r#   VideomtLoss.__init__s  s     	$	* ++& //zz$//A"56==R^\: !11 & 7 7'-'E'E$.**((((	
r4   sizesr6   c                 p    US   nUSS   H'  n[        U5       H  u  pE[        X$   U5      X$'   M     M)     U$ )Nr   r   )	enumeratemax)r/   r  maxessublistindexitems         r2   _max_by_axisVideomtLoss._max_by_axis  sC    aQRyG(1"5<6  2 ! r4   tensorsc                 T   U R                  U Vs/ s H  n[        UR                  5      PM     sn5      n[        U5      /U-   nUu  pVpxUS   R                  n	US   R
                  n
[        R                  " XIU
S9n[        R                  " XWU4[        R                  U
S9n[        XU5       Ho  u  p-nUS UR                  S   2S UR                  S   2S UR                  S   24   R                  U5        SUS UR                  S   2S UR                  S   24'   Mq     X4$ s  snf )Nr   r   r   r;   F)r  rq  r<   lenr:   rk   rJ   rY   r  rp   zipcopy_)r/   r  r\  max_sizebatch_shapert   r  rv   rw   r:   rk   padded_tensorspadding_maskspadded_tensorpadding_masks                  r2   _pad_images_to_max_in_batch'VideomtLoss._pad_images_to_max_in_batch  s'   $$w%OwVd6<<&8w%OP7|nx/'2$
v
  ""[fM

J#>ejjY_`36wP]3^/F<+FLLO+->v||A->@Q&,,q/@QQRXXY_`AFL*6<<?*,=fll1o,==> 4_ ,, &Ps   D%r$  rT  rc  c           	         UnUR                   u  pVn[        R                  " U R                  S9nU R	                  U5      n	[
        R                  " [        X#5       V
VVs/ s H  u  n
u  p{X   PM     snnn
5      n[
        R                  " XV4U R                  [
        R                  UR                  S9nXU	'   UR                  SS5      nU" X5      nSU0nU$ s  snnn
f )a-  Compute the losses related to the labels using cross entropy.

Args:
    class_queries_logits (`torch.Tensor`):
        A tensor of shape `batch_size, num_queries, num_labels`
    class_labels (`list[torch.Tensor]`):
        List of class labels of shape `(labels)`.
    indices (`tuple[np.array])`:
        The indices computed by the Hungarian matcher.

Returns:
    `dict[str, Tensor]`: A dict of `torch.Tensor` containing the following key:
    - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
)r?   )
fill_valuer:   rk   r   r;   loss_cross_entropy)r<   r   CrossEntropyLossr  $_get_predictions_permutation_indicesrJ   rs   r  fullr  rb  rk   rA   )r/   r$  rT  rc  pred_logitsrt   num_queriesr  rE  idxtargetrl  target_classes_otarget_classespred_logits_transposedloss_celossess                    r2   loss_labelsVideomtLoss.loss_labels  s    " +%0%6%6"
''t/@/@A	77@ 99-0-GH-G>66AVY-GH
 %$//]h]o]o
 /s!,!6!6q!!<2C&0 Is    Cr%  rS  rr  c                   ^  T R                  U5      nT R                  U5      nX   nT R                  U5      u  pX   nUSS2S4   nUSS2S4   n[        R                  " 5          T R                  UU 4S jT R                  T R                  T R                  5      n
[        XSS9R                  S5      nSSS5        [        UW
SS9R                  S5      n[        UWU5      [        XU5      S.nAAU$ ! , (       d  f       NF= f)a$  Compute the losses related to the masks using sigmoid_cross_entropy_loss and dice loss.

Args:
    masks_queries_logits (`torch.Tensor`):
        A tensor of shape `(batch_size, num_queries, height, width)`.
    mask_labels (`torch.Tensor`):
        List of mask labels of shape `(labels, height, width)`.
    indices (`tuple[np.array])`:
        The indices computed by the Hungarian matcher.
    num_masks (`int)`:
        The number of masks, used for normalization.

Returns:
    losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing two keys:
    - **loss_mask** -- The loss computed using sigmoid cross entropy loss on the predicted and ground truth.
      masks.
    - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth,
      masks.
Nc                 &   > TR                  U 5      $ r{   )calculate_uncertainty)logitsr/   s    r2   <lambda>(VideomtLoss.loss_masks.<locals>.<lambda>  s    t99&Ar4   FrW  r   )	loss_mask	loss_dice)r   _get_targets_permutation_indicesr  rJ   rp  sample_points_using_uncertaintyrP  r  r  r3  r0  ry  ru  )r/   r%  rS  rc  rr  src_idxtgt_idx
pred_maskstarget_masksr  r,  point_labelspoint_logitsr  s   `             r2   
loss_masksVideomtLoss.loss_masks  s   4 ;;GD77@)2
 ::;G#,  4(
#AtG, ]]_ $ D DA%%,,! (W\]eefghL  $J0AQVW__`ab 4L,PYZ"<yI

 ) _s   &AC77
Dc                    [         R                  " [        U5       VVVs/ s H  u  nu  p4[         R                  " X25      PM      snnn5      n[         R                  " U VVs/ s H  u  p4UPM	     snn5      nXV4$ s  snnnf s  snnf r{   rJ   rs   r  	full_like)r/   rc  rd  srcr  batch_indicespredictions_indicess          r2   r  0VideomtLoss._get_predictions_permutation_indices  sh    		iX_N`"aN`{q(35??3#:N`"ab#iiW(EW#W(EF11 #b(E   %A<#B
c                    [         R                  " [        U5       VVVs/ s H  u  nu  p4[         R                  " XB5      PM      snnn5      n[         R                  " U VVs/ s H  u  p4UPM	     snn5      nXV4$ s  snnnf s  snnf r{   r  )r/   rc  rd  r  tgtr  target_indicess          r2   r  ,VideomtLoss._get_targets_permutation_indices  sg    		iX_N`"aN`{q(15??3#:N`"ab#@HQC#@A,, #b#@r  r  c                 4    [         R                  " U5      * nU$ )a5  
In Videomt paper, uncertainty is estimated as L1 distance between 0.0 and the logit prediction in 'logits'
for the foreground class in `classes`.

Args:
    logits (`torch.Tensor`):
    A tensor of shape (R, 1, ...) for class-specific or class-agnostic, where R is the total number of predicted masks in all images and C is:
    the number of foreground classes. The values are logits.

Returns:
    scores (`torch.Tensor`): A tensor of shape (R, 1, ...) that contains uncertainty scores with the most
    uncertain locations having the highest uncertainty score.
)rJ   abs)r/   r  uncertainty_scoress      r2   r  !VideomtLoss.calculate_uncertainty  s      %yy01!!r4   rP  r  r  c           
      h   UR                   S   n[        X4-  5      n[        R                  " XgSUR                  S9n[        XSS9n	U" U	5      n
[        XS-  5      nX;-
  n[        R                  " U
SS2SSS24   USS9S   nU[        R                  " U[        R                  UR                  S	9-  nXSS2S4   -  nUR                  S
S5      UR                  S
5      SS24   R                  XkS5      nUS:  a5  [        R                  " U[        R                  " XlSUR                  S9/SS9nU$ )a  
This function is meant for sampling points in [0, 1] * [0, 1] coordinate space based on their uncertainty. The
uncertainty is calculated for each point using the passed `uncertainty function` that takes points logit
prediction as input.

Args:
    logits (`float`):
        Logit predictions for P points.
    uncertainty_function:
        A function that takes logit predictions for P points and returns their uncertainties.
    num_points (`int`):
        The number of points P to sample.
    oversample_ratio (`int`):
        Oversampling parameter.
    importance_sample_ratio (`float`):
        Ratio of points that are sampled via importance sampling.

Returns:
    point_coordinates (`torch.Tensor`):
        Coordinates for P sampled points.
r   r;   rV  FrW  Nr   )krm   r   rT   rl   )r<   r   rJ   r   rk   r3  topkrd   longr   rs   )r/   r  uncertainty_functionrP  r  r  	num_boxesnum_points_sampledr,  r  point_uncertaintiesnum_uncertain_pointsnum_random_pointsr  shifts                  r2   r  +VideomtLoss.sample_points_using_uncertainty,  s3   < LLO	 !>? "JJyaPVP]P]^#FUS2<@"#:#GH&=jj,Q1W59MSTUVWX"U\\)5::V\VcVc%ddQW~-222q9#((2,/JOOPYqrsq  %		"EJJyQW]WdWd$ef! ! r4   Nauxiliary_predictionsc                    U R                  XX45      nU R                  XDS   R                  S9n0 U R                  XXg5      EU R	                  X$U5      EnUbk  [        U5       H\  u  pU
S   nU
S   nU R                  XX45      nUR                  5        VVs0 s H  u  pU SU	 3U_M     nnnUR                  U5        M^     U$ s  snnf )a  
This performs the loss computation.

Args:
    masks_queries_logits (`torch.Tensor`):
        A tensor of shape `(batch_size, num_queries, height, width)`.
    class_queries_logits (`torch.Tensor`):
        A tensor of shape `(batch_size, num_queries, num_labels)`.
    mask_labels (`torch.Tensor`):
        List of mask labels of shape `(labels, height, width)`.
    class_labels (`list[torch.Tensor]`):
        List of class labels of shape `(labels)`.
    auxiliary_predictions (`dict[str, torch.Tensor]`, *optional*):
        if `use_auxiliary_loss` was set to `true` in [`VideomtConfig`], then it contains the logits from
        the inner layers of the VideomtMaskedAttentionDecoder.

Returns:
    losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing three keys:
    - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
    - **loss_mask** -- The loss computed using sigmoid cross_entropy loss on the predicted and ground truth
      masks.
    - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth
      masks.
    if `use_auxiliary_loss` was set to `true` in [`VideomtConfig`], the dictionary contains additional
    losses for each auxiliary predictions.
r   rV  r%  r$  r  )	r  get_num_masksrk   r  r  r  rC   itemsupdate)r/   r%  r$  rS  rT  r  rc  rr  r  r  aux_outputs	loss_dictr   r   s                 r2   rC   VideomtLoss.forwardc  s    H ,,3;e&&|O<R<R&S	%
oo2T%
37K%

 !,$-.C$D '23I'J$'23I'J$ LL)=U`o	EN__EVWEVzsuAcU^U2EV	Wi( %E  Xs   Crk   c                 (   [        S U 5       5      n[        R                  " U[        R                  US9nSn[	        5       (       a3  [
        R                  0 :w  a  [        U5      n[        5       R                  n[        R                  " X4-  SS9nU$ )z[
Computes the average number of target masks across the batch, for normalization purposes.
c              3   8   #    U  H  n[        U5      v   M     g 7fr{   )r  ).0classess     r2   	<genexpr>,VideomtLoss.get_num_masks.<locals>.<genexpr>  s     ALGLs   r   r   )min)
r9  rJ   ra  r   r   r   _shared_stater   num_processesclamp)r/   rT  rk   rr  
world_sizes        r2   r  VideomtLoss.get_num_masks  sv     ALAA	OOIU[[P	
"$$))R/"9-	)^99
KK	 6A>	r4   )r  r  r  r  rP  r  r|  r{   )rE   rF   rG   rH   r   dictr   r   r#   rq  r   r  r   r   r  nparrayr  rJ   r  r  r  r  r  rC   rk   r  rK   rL   rM   s   @r2   r{  r{  r  s   !
} !
4U
;K !
F$tCy/ d3i -4< -E&RX.DY -" $* :>v, QVWYW_W_Q` 	c6k	 D<#ll< %,,'< rxx	<
 < 
c5<<	 <|2-"ELL "U\\ ""5!5! 	5!
 5! "'5! 
5!z AE5#ll5 $ll5 %,,'	5
 5<<(5  $C$56=5 
c5<<	 5n%,,  QVQ]Q]  r4   r{  c                       \ rS rSr% Sr\\S'   SrSrSr	Sr
S/rS	r\\S
.r\R"                  " 5       S\R&                  SS4S j5       rSrg)VideomtPreTrainedModeli  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
r0   videomtpixel_values_videos)videoFr   T)r   r'  r   r6   Nc                    U R                   R                  n[        U[        R                  [        R
                  [        R                  45      (       a  [        R                  " UR                  [        R                  " S5      S9  UR                  by  [        R                  R                  R                  UR                  5      u  p4US:  a  S[        R                  " U5      -  OSn[        R                  " UR                  U* U5        GO[        U[        R                   5      (       aB  [        R"                  " UR                  5        [        R$                  " UR                  5        GOC[        U[        R&                  5      (       ax  [        R(                  " UR                  SSS9  UR*                  bI  [-        UR                  SS5      (       d-  [        R$                  " UR                  UR*                     5        GO[        U[.        5      (       aH  [1        US	5      (       a5  [        R2                  " UR4                  U R                   R6                  5        GOO[        U[8        5      (       a  [        R:                  " UR<                  SUS9  [        R$                  " UR>                  5        [        R@                  " URB                  [        RD                  " URB                  RF                  S
   5      RI                  S5      5        O[        U[J        5      (       aT  [        RL                  " URN                  S-   5      nURP                  US
'   [        R@                  " URR                  U5        O5[        U[T        5      (       a   [        R"                  " URV                  5        [        U[8        5      (       a*  [        R                  R%                  URX                  5        g g )Nrj   )ar   r   r   )rw  std_is_hf_initializedFr  rT   rS   )-r0   initializer_ranger(   r   r   r-   ConvTranspose2dinitkaiming_uniform_r?   mathsqrtr   rJ   _calculate_fan_in_and_fan_outuniform_r  ones_zeros_ra   normal_padding_idxgetattrr  hasattr	constant_r  r  rO   trunc_normal_rX   r[   r  rR   rd   r<   re   r{  r  r  r  r  VideomtForUniversalSegmentationattn_mask_probsrf   )r/   r   r  fan_inr  boundr  s          r2   _init_weights$VideomtPreTrainedModel._init_weights  sw   kk++fryy"))R5G5GHII!!&--499Q<@{{&!HHMMGGV	17!DIIf--fkkE659--JJv}}%KK$--LLSa8!!-gfmmMach6i6iFMM&*<*<=> 122vy))v~~t{{/K/KL 122v//csCKK../JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh,, ::f&7&7!&;<L%LJJv**L9 ?@@JJv--.f/00GGNN6,,- 1r4   r(  )rE   rF   rG   rH   rI   r   r*  base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpar   r   _can_record_outputsrJ   rp  r   Moduler  rK   r(  r4   r2   r  r    sn    
 !+O!&+#'(N%&
 ]]_.BII .$ . .r4   r  c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )VideomtLayerNorm2di  c                 "   > [         TU ]  XUS9  g )N)r  elementwise_affine)r"   r#   )r/   r&   r  affiner1   s       r2   r#   VideomtLayerNorm2d.__init__  s    6Jr4   r   r6   c                     UR                  SSSS5      n[        R                  " XR                  U R                  U R
                  U R                  5      nUR                  SSSS5      nU$ )Nr   r;   r   r   )permuteF
layer_normnormalized_shaper?   r   r  r   s     r2   rC   VideomtLayerNorm2d.forward  s`    #++Aq!Q7||L2G2GVZV_V_aeaiaij#++Aq!Q7r4   r(  )gư>Tr   rM   s   @r2   r   r     s)    KELL U\\  r4   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )VideomtScaleLayeri  r0   c           	         > [         TU ]  5         UR                  n[        R                  " X"SSS9U l        [        UR                     U l        [        R                  " UUSSUSS9U l
        [        U5      U l        g )Nr;   r   r   r   F)r    paddinggroupsr   )r"   r#   r'   r   r  conv1r	   r   r   r-   conv2r   layernorm2dr/   r0   r'   r1   s      r2   r#   VideomtScaleLayer.__init__  ss    ((''aXYZ
 !2!23YY

 .k:r4   r   r6   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r{   )r0  r   r1  r2  r   s     r2   rC   VideomtScaleLayer.forward  sB    

=16

=1((7r4   )r   r0  r1  r2  rE   rF   rG   rH   r   r#   rJ   r   rC   rK   rL   rM   s   @r2   r,  r,    s/    ;} ; U\\ ell  r4   r,  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )VideomtScaleBlocki  r0   c                    > [         TU ]  5         UR                  U l        [        R
                  " [        U R                  5       Vs/ s H  n[        U5      PM     sn5      U l        g s  snf r{   )	r"   r#   num_upscale_blocks
num_blocksr   
ModuleListrY  r,  blockr/   r0   r  r1   s      r2   r#   VideomtScaleBlock.__init__  sN     33]]uT__G]#^G]!$5f$=G]#^_
#^s   A*r   r6   c                 <    U R                    H  nU" U5      nM     U$ r{   )r>  )r/   r   r>  s      r2   rC   VideomtScaleBlock.forward	  s     ZZE!-0M  r4   )r>  r<  r7  rM   s   @r2   r9  r9    s1    `} `
U\\ ell  r4   r9  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )VideomtMaskHeadi  r0   c                   > [         TU ]  5         UR                  n[        R                  " X"5      U l        [        R                  " X"5      U l        [        R                  " X"5      U l        [        UR                     U l
        g r{   )r"   r#   r'   r   r   r   r   fc3r	   r   r   r3  s      r2   r#   VideomtMaskHead.__init__  s[    ((99[699[699[6 !2!23r4   r   r6   c                     U R                  U R                  U5      5      nU R                  U R                  U5      5      nU R                  U5      nU$ r{   r   r   r   rF  r   s     r2   rC   VideomtMaskHead.forward  sD    (?@(?@/r4   rI  r7  rM   s   @r2   rD  rD    s/    4} 4U\\ ell  r4   rD  zY
    The Videomt Model with head on top for instance/semantic/panoptic segmentation.
    c                   z  ^  \ rS rSrSrS\4U 4S jjrS\S\S\S\S	\\	\4   S
\\	\4   4S jr
S\\	\4   S
\4S jr\\\    SS\R                  S-  S\\R                     S-  S\\R                     S-  S\\R                     S-  S\\   S
\4S jj5       5       5       rS rS\R                  4S jrSrU =r$ )r  i   r  r0   c                 >  > [         TU ]  U5        Xl        UR                  U l        [	        U5      U l        [        R                  " UR                  UR                  S9U l
        [        R                  " UR                  UR                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[!        U5      PM     sn5      U l        [%        U5      U l        [)        U5      U l        [        R,                  " UR                  UR.                  S-   5      U l        UR2                  UR4                  -  UR2                  UR4                  -  4U l        UR8                  UR:                  UR<                  S.U l        [A        XR>                  S9U l!        U RE                  S[F        RH                  " URJ                  5      5        [        R,                  " UR                  UR                  5      U l&        U RO                  5         g s  snf )Nr  r   )r  r  r  )r0   r|  r  )(r"   r#   r0   num_hidden_layersrO   rB   r   r  r'   r  	layernormra   r  r   r=  rY  r   layersr9  upscale_blockrD  	mask_headr   r  class_predictorr$   r%   	grid_sizer  r  r  r|  r{  rE  rc   rJ   r  r<  query_updater	post_initr?  s      r2   r#   (VideomtForUniversalSegmentation.__init__(  s    !'!9!9+F3f&8&8f>S>ST\\&"4"4f6H6HI
mm5IaIaCb$cCba\&%9Cb$cd.v6(0!yy););V=N=NQR=RS ++v/@/@@&BSBSW]WhWhBhi"("5"5++++.
 %F@P@PQ.

6;L;L0MNYYv'9'96;M;MN' %ds   =Hr%  r$  rS  rT  r  r6   c                     U R                  UUUUUS9nU R                  R                  5        H)  u  pxUR                  5        H  u  pXy;   d  M  X-  n
M     M+     U$ )N)r%  r$  rS  rT  r  )rE  r|  r  )r/   r%  r$  rS  rT  r  r  r   r?   loss_keyr#  s              r2   get_loss_dict-VideomtForUniversalSegmentation.get_loss_dictE  sj     (,~~!5!5#%"7 (6 (
	  ++113KC"+//"3?ND #4 4
 r4   r  c                 4    [        UR                  5       5      $ r{   )r9  r   )r/   r  s     r2   get_loss(VideomtForUniversalSegmentation.get_loss]  s    9##%&&r4   Npatch_offsetsr   c           
      v   SU;   a  [        S5      eUc  [        S5      eUR                  S:w  a  [        S5      eUc  Ub  [        S5      eUR                  u  pgpn
UR                  Xg-  XU
5      nU R	                  U5      nU R
                  U R                  R                  -
  nU R                  SU  H  nU" U5      nM     UR                  XgUR                  S   UR                  S	   5      n/ n/ n/ nSn[        U5       GHn  nUSS2U4   nUcK  U R                  R                  SSS2SS24   R                  US
S
5      R                  UR                  5      nOdU R!                  U5      R                  UR                  5      U R                  R                  SSS2SS24   R                  UR                  5      -   n["        R$                  " UU4SS9nU R                  US  H  nU" U5      nM     U R'                  U5      nU R)                  U5      u  nnUR+                  U5        UR+                  U5        UR+                  U5        USS2SU R                  R,                  2SS24   nGMq     [/        S["        R$                  " USS9["        R$                  " USS9["        R$                  " USS9S9$ )a  
pixel_values_videos (`torch.Tensor`, *optional*):
    Video inputs of shape `(batch_size, num_frames, num_channels, height, width)`.
mask_labels (`list[torch.Tensor]`, *optional*):
    Not supported for 5D video inputs.
class_labels (`list[torch.LongTensor]`, *optional*):
    Not supported for 5D video inputs.
patch_offsets (`list[torch.Tensor]`, *optional*):
    Unused for video inputs and only kept for modular compatibility.
r5   zAUse `pixel_values_videos` with `VideomtForUniversalSegmentation`.Nz'You have to specify pixel_values_videosrj   zyVideomtForUniversalSegmentation only supports 5D video inputs of shape (batch_size, num_frames, channels, height, width).zTraining with 5D video inputs is not supported in `VideomtForUniversalSegmentation`. Flatten frames and use `EomtForUniversalSegmentation` instead.r   r;   rT   rl   r   )r#  r%  r$  r&  )r=   rn   r<   ro   rB   rM  r0   r<  rO  r   rY  r   r?   re   r>   rk   rT  rJ   rs   rN  predictr`  r  r"  )r/   r  rS  rT  r^  r   rt   ru   r&   rv   rw   flat_pixel_valuesr   query_start_idxlayer_moduleall_masks_queries_logitsall_class_queries_logitsall_last_hidden_statespropagated_query	frame_idxframe_hidden_statesquery_tokenssequence_outputr%  r$  s                            r2   rC   'VideomtForUniversalSegmentation.forward`  s   * V#`aa&FGG##q(E 
 "l&>Q 
 ?R>W>W;
e/77
8OQ]glm(9:004;;3I3II KK(89L(7M : &**:=CVCVWXCY[h[n[nop[qr#% #% !#z*I"/9"='#zz00q!<CCJPRTVWZZ[n[u[uv#112BCFFGZGaGabeieoeoevev!QJf"(//0 1 #())\;N,OUV"W $O,< =&23F&G# !> #nn-@AO9=o9V6 "6$++,@A$++,@A"))/:216O8O8O6OQR3RS) +, 5!&+C!K!&+C!K#ii(>AF	
 	
r4   c                 .    U R                   R                  $ r{   )rB   r\   r   s    r2   get_input_embeddings4VideomtForUniversalSegmentation.get_input_embeddings  s    ///r4   r  c                    US S 2S U R                   R                  2S S 24   nU R                  U5      nUS S 2U R                   R                  U R                  R                  -   S 2S S 24   nUR                  SS5      nUR                  " UR                  S   S/U R                  Q76 nU R                  U5      nU R                  U5      n[        R                  " SX$5      nXS4$ )Nr   r;   r   rT   zbqc, bchw -> bqhw)r0   r  rR  rB   r`   rA   ro   r<   rS  rQ  rP  rJ   einsum)r/   r  rj  class_logitsprefix_tokensmask_logitss         r2   r`  'VideomtForUniversalSegmentation.predict  s    a!:4;;#:#:!:A=>++L9q$++"9"9DOO<]<]"]"_abbc%//15%--m.A.A!.DbZ4>>Z~~l3**=9ll#6T((r4   )rR  r0   rE  rB   rS  rN  rO  rQ  rM  r   rT  rP  r|  )NNNN)rE   rF   rG   rH   r  r   r#   r   r  r   rY  r\  r   r   r   rJ   rq  r   r   r"  rC   rn  r`  rK   rL   rM   s   @r2   r  r     sK    ,O} :$ % 	
   $CK0 
c6k	0'$sF{"3 ' '   48152637O
"\\D0O
 %,,'$.O
 5<<(4/	O

 ELL)D0O
 +,O
 
/O
    O
b0)ell ) )r4   r  )r   )r   F)F)Mcollections.abcr)   r  r   dataclassesr   numpyr  rJ   torch.nn.functionalr   r   r'  r    r   r  activationsr	   
file_utilsr
   r   r   modeling_layersr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   configuration_videomtr   scipy.optimizer   
accelerater   accelerate.utilsr   r  r   rO   r~   r   r   r   r   rp   r   r   r   r   r  r"  r3  r<  rJ  rL  r   ru  ry  r{  r  r  r   r,  r9  rD  r  __all__r(  r4   r2   <module>r     s1  *   $ !      & ! L L 9 F & P P 7 5 0 4'' RYY  F,		 ,^ &(bii (0 %II%<<% 
% <<	%
 LL4'% % %.8)ryy 8)vU\\ e T V[VbVb %bii %(ryy ("'- 'T+		 + 	7K 7	 7< LQLL5:\\
\\@  6 , u|| X]XdXd 8gbii gTf f   <u|| U\\ VY ^c^j^j (u")) up	 1._ 1. 1.h 		 2			 	bii " 
`)&< `)
`)F $%F
Gr4   