
from dataclasses import dataclass

import torch
from huggingface_hub.dataclasses import strict
from torch import nn

from ...file_utils import ModelOutput
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring
from ..eomt.configuration_eomt import EomtConfig
from ..eomt.modeling_eomt import (
    EomtEmbeddings,
    EomtForUniversalSegmentation,
    EomtLayer,
    EomtLayerNorm2d,
    EomtLayerScale,
    EomtMLP,
    EomtPatchEmbeddings,
    EomtPreTrainedModel,
    EomtScaleBlock,
    EomtScaleLayer,
    EomtSwiGLUFFN,
)


@auto_docstring(checkpoint="tue-mps/videomt-dinov2-small-ytvis2019")
@strict
class VideomtConfig(EomtConfig):
    model_type = "videomt"


class VideomtPatchEmbeddings(EomtPatchEmbeddings):
    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the"
                f" configuration. Expected {self.num_channels} but got {num_channels}."
            )
        pixel_values = pixel_values.to(dtype=self.projection.weight.dtype)
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings


class VideomtEmbeddings(EomtEmbeddings):
    def __init__(self, config: VideomtConfig):
        super().__init__(config)
        self.patch_embeddings = VideomtPatchEmbeddings(config)
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))

    def forward(self, pixel_values: torch.Tensor, bool_masked_pos: torch.Tensor | None = None) -> torch.Tensor:
        if pixel_values.ndim == 5:
            batch_size, num_frames, num_channels, height, width = pixel_values.shape
            pixel_values = pixel_values.reshape(batch_size * num_frames, num_channels, height, width)
            if bool_masked_pos is not None:
                bool_masked_pos = bool_masked_pos.reshape(batch_size * num_frames, -1)
        elif bool_masked_pos is not None and bool_masked_pos.ndim > 2:
            bool_masked_pos = bool_masked_pos.reshape(bool_masked_pos.shape[0], -1)

        batch_size = pixel_values.shape[0]
        embeddings = self.patch_embeddings(pixel_values)

        if bool_masked_pos is not None:
            mask = bool_masked_pos.to(embeddings.device, dtype=torch.bool).unsqueeze(-1)
            embeddings = torch.where(mask, self.mask_token, embeddings)

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        register_tokens = self.register_tokens.expand(batch_size, -1, -1)

        embeddings = embeddings + self.position_embeddings(self.position_ids)
        embeddings = torch.cat([cls_tokens, register_tokens, embeddings], dim=1)
        embeddings = self.dropout(embeddings)

        return embeddings


class VideomtMLP(EomtMLP):
    pass


class VideomtGatedMLP(EomtSwiGLUFFN):
    pass


class VideomtLayer(EomtLayer):
    pass


class VideomtLayerScale(EomtLayerScale):
    pass


@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`VideomtForUniversalSegmentation`].

    This output can be directly passed to [`~VideomtVideoProcessor.post_process_semantic_segmentation`] or
    [`~VideomtVideoProcessor.post_process_instance_segmentation`] or
    [`~VideomtVideoProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please see
    [`~VideomtVideoProcessor`] for details regarding usage.
    """
)
class VideomtForUniversalSegmentationOutput(ModelOutput):
    r"""
    loss (`torch.Tensor`, *optional*):
        The computed loss, returned when labels are present.
    class_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
        query. Note the `+ 1` is needed because we incorporate the null class.
    masks_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
        query.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Last hidden states (final feature map) of the last layer.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of all layers of the model.
    attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Self and Cross Attentions weights from transformer decoder.
    """

    loss: torch.FloatTensor | None = None
    class_queries_logits: torch.FloatTensor | None = None
    masks_queries_logits: torch.FloatTensor | None = None
    last_hidden_state: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor] | None = None
    attentions: tuple[torch.FloatTensor] | None = None


class VideomtPreTrainedModel(EomtPreTrainedModel):
    main_input_name = "pixel_values_videos"
    input_modalities = ("video",)

    @torch.no_grad()
    def _init_weights(self, module: nn.Module) -> None:
        super()._init_weights(module)
        if isinstance(module, VideomtEmbeddings):
            nn.init.zeros_(module.mask_token)


class VideomtLayerNorm2d(EomtLayerNorm2d):
    pass


class VideomtScaleLayer(EomtScaleLayer):
    pass


class VideomtScaleBlock(EomtScaleBlock):
    pass


class VideomtForUniversalSegmentation(EomtForUniversalSegmentation):
    main_input_name = "pixel_values_videos"

    def __init__(self, config: VideomtConfig):
        super().__init__(config)
        self.query_updater = nn.Linear(config.hidden_size, config.hidden_size)

    def _disable_attention_mask(attn_mask, prob, num_query_tokens, encoder_start_tokens, device):
        raise AttributeError("Not needed for Videomt")

    def forward(
        self,
        pixel_values_videos: torch.Tensor | None = None,
        mask_labels: list[torch.Tensor] | None = None,
        class_labels: list[torch.Tensor] | None = None,
        patch_offsets: list[torch.Tensor] | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> VideomtForUniversalSegmentationOutput:
        r"""
        pixel_values_videos (`torch.Tensor`, *optional*):
            Video inputs of shape `(batch_size, num_frames, num_channels, height, width)`.
        mask_labels (`list[torch.Tensor]`, *optional*):
            Not supported for 5D video inputs.
        class_labels (`list[torch.LongTensor]`, *optional*):
            Not supported for 5D video inputs.
        patch_offsets (`list[torch.Tensor]`, *optional*):
            Unused for video inputs and only kept for modular compatibility.
        """
        if "pixel_values" in kwargs:
            raise ValueError("Use `pixel_values_videos` with `VideomtForUniversalSegmentation`.")
        if pixel_values_videos is None:
            raise ValueError("You have to specify pixel_values_videos")
        if pixel_values_videos.ndim != 5:
            raise ValueError(
                "VideomtForUniversalSegmentation only supports 5D video inputs of shape "
                "(batch_size, num_frames, channels, height, width)."
            )
        if mask_labels is not None or class_labels is not None:
            raise ValueError("Training with 5D video inputs is not supported in `VideomtForUniversalSegmentation`.")

        batch_size, num_frames, num_channels, height, width = pixel_values_videos.shape
        flat_pixel_values = pixel_values_videos.reshape(batch_size * num_frames, num_channels, height, width)

        # All frames share the patch embedding and the first encoder layers.
        hidden_states = self.embeddings(flat_pixel_values)

        query_start_idx = self.num_hidden_layers - self.config.num_blocks
        for layer_module in self.layers[:query_start_idx]:
            hidden_states = layer_module(hidden_states)

        # Fold the tokens back into a per-frame view for the query-augmented blocks.
        hidden_states = hidden_states.view(batch_size, num_frames, hidden_states.shape[1], hidden_states.shape[2])

        all_masks_queries_logits = []
        all_class_queries_logits = []
        all_last_hidden_states = []
        propagated_query = None

        for frame_idx in range(num_frames):
            frame_hidden_states = hidden_states[:, frame_idx]

            if propagated_query is None:
                query_tokens = self.query.weight[None, :, :].expand(batch_size, -1, -1).to(frame_hidden_states.device)
            else:
                # Carry the query tokens of the previous frame over to the current one.
                base_query = self.query.weight[None, :, :].to(frame_hidden_states.device)
                query_tokens = self.query_updater(propagated_query).to(frame_hidden_states.device) + base_query
            frame_hidden_states = torch.cat((query_tokens, frame_hidden_states), dim=1)

            for layer_module in self.layers[query_start_idx:]:
                frame_hidden_states = layer_module(frame_hidden_states)

            sequence_output = self.layernorm(frame_hidden_states)
            masks_queries_logits, class_queries_logits = self.predict(sequence_output)

            all_masks_queries_logits.append(masks_queries_logits)
            all_class_queries_logits.append(class_queries_logits)
            all_last_hidden_states.append(sequence_output)

            propagated_query = sequence_output[:, : self.config.num_queries, :]

        return VideomtForUniversalSegmentationOutput(
            loss=None,
            masks_queries_logits=torch.cat(all_masks_queries_logits, dim=0),
            class_queries_logits=torch.cat(all_class_queries_logits, dim=0),
            last_hidden_state=torch.cat(all_last_hidden_states, dim=0),
        )


__all__ = ["VideomtConfig", "VideomtForUniversalSegmentation", "VideomtPreTrainedModel"]