
    Z jv                       S SK r S SKJr  S SKJr  S SKrS SKJr  S SKJs  J	r
  S SKJr  SSKJr  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJrJr  SSKJr  SSKJ r   SSK!J"r"J#r#J$r$J%r%  SSK&J'r'J(r(  SSK)J*r*J+r+  SSK,J-r-  \#" SS9\ " S S\5      5       5       r.\#" SS9\ " S S\5      5       5       r/\\#" SS9 " S S\5      5       5       r0\" S5       " S S \Rb                  5      5       r2 " S! S"\Rb                  5      r3 " S# S$\Rb                  5      r4 " S% S&\Rb                  5      r5 SwS'\S(\S)\S*\S+\6\7   S,\4S- jjr8 " S. S/\Rb                  5      r9 " S0 S1\Rb                  5      r: " S2 S3\Rb                  5      r; " S4 S5\Rb                  5      r< " S6 S7\Rb                  5      r= " S8 S9\Rb                  5      r>  SxS:\Rb                  S;\R                  S<\R                  S'\R                  S=\R                  S-  S>\?S-  S?\?S@\\"   4SA jjr@ " SB SC\Rb                  5      rA " SD SE\Rb                  5      rB " SF SG\Rb                  5      rC " SH SI\Rb                  5      rD " SJ SK\Rb                  5      rE " SL SM\Rb                  5      rFSN rG " SO SP\Rb                  5      rH " SQ SR\Rb                  5      rI " SS ST\Rb                  5      rJ " SU SV\Rb                  5      rK " SW SX\Rb                  5      rL\# " SY SZ\5      5       rM " S[ S\\M5      rNSyS]\R                  S^\R                  S_\OS,\R                  4S` jjrP " Sa Sb\M5      rQSzSc jrRSd\7Se\R                  Sf\7S,\R                  4Sg jrSSh\R                  Sf\?S,\R                  4Si jrT " Sj Sk\M5      rU   S{Sl jrV\#" SmS9 " Sn So\M5      5       rW\#" SpS9\ " Sq Sr\5      5       5       rX\#" SsS9 " St Su\M5      5       rY/ SvQrZg)|    N)Callable)	dataclass)Tensor   )initialization)ACT2CLS)load_backbone)center_to_corners_formatcorners_to_center_format)use_kernel_forward_from_hub)ModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)#compile_compatible_method_lru_cache)TransformersKwargsauto_docstringtorch_compilable_check	torch_int)can_return_tuplemerge_with_config_defaults)OutputRecordercapture_outputs   )Deimv2Configa&  
    Base class for outputs of the Deimv2Decoder. This class adds two attributes to
    BaseModelOutputWithCrossAttentions, namely:
    - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
    - a stacked tensor of intermediate reference points.
    )custom_introc                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S	'   Sr\\R                     S-  \S
'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Srg)Deimv2DecoderOutput-   a  
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
    Stacked intermediate hidden states (output of each layer of the decoder).
intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, config.num_labels)`):
    Stacked intermediate logits (logits of each layer of the decoder).
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
    Stacked intermediate reference points (reference points of each layer of the decoder).
intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
    Stacked intermediate predicted corners (predicted corners of each layer of the decoder).
initial_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
    Stacked initial reference points (initial reference points of each layer of the decoder).
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
    used to compute the weighted average in the cross-attention heads.
Nlast_hidden_stateintermediate_hidden_statesintermediate_logitsintermediate_reference_pointsintermediate_predicted_cornersinitial_reference_pointshidden_states
attentionscross_attentions )__name__
__module____qualname____firstlineno____doc__r    torchFloatTensor__annotations__r!   r"   r#   r$   r%   r&   tupler'   r(   __static_attributes__r)       {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/deimv2/modeling_deimv2.pyr   r   -   s    " 37u((4/6;? 1 1D 8?48**T18>B!5#4#4t#;B?C"E$5$5$<C9=e//$6=59M5**+d2926Je''(4/68<eE--.5<r4   r   zF
    Base class for outputs of the RT-DETR encoder-decoder model.
    c                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S	'   Sr\\R                     S-  \S
'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\S-  \S'   Srg)Deimv2ModelOutputS   a
  
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
    Sequence of hidden-states at the output of the last layer of the decoder of the model.
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
    Stacked intermediate hidden states (output of each layer of the decoder).
intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, config.num_labels)`):
    Stacked intermediate logits (logits of each layer of the decoder).
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
    Stacked intermediate reference points (reference points of each layer of the decoder).
intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
    Stacked intermediate predicted corners (predicted corners of each layer of the decoder).
initial_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
    Initial reference points used for the first decoder layer.
init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
    Initial reference points sent through the Transformer decoder.
enc_topk_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
    Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
    picked as region proposals in the encoder stage. Output of bounding box binary classification (i.e.
    foreground and background).
enc_topk_bboxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`):
    Logits of predicted bounding boxes coordinates in the encoder stage.
enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
    Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
    picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
    foreground and background).
enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
    Logits of predicted bounding boxes coordinates in the first stage.
denoising_meta_values (`dict`):
    Extra dictionary for the denoising related values.
Nr    r!   r"   r#   r$   r%   decoder_hidden_statesdecoder_attentionsr(   encoder_last_hidden_stateencoder_hidden_statesencoder_attentionsinit_reference_pointsenc_topk_logitsenc_topk_bboxesenc_outputs_classenc_outputs_coord_logitsdenoising_meta_valuesr)   )r*   r+   r,   r-   r.   r    r/   r0   r1   r!   r"   r#   r$   r%   r9   r2   r:   r(   r;   r<   r=   r>   r?   r@   rA   rB   rC   dictr3   r)   r4   r5   r7   r7   S   s   > 37u((4/6;? 1 1D 8?48**T18>B!5#4#4t#;B?C"E$5$5$<C9=e//$6==A5!2!23d:A:>e//047>8<eE--.5<:>u0047>=A5!2!23d:A:>e//047>6:5,,t3:04OU&&-404OU&&-426u((4/69=e//$6=)-4$;-r4   r7   z
    Output type for DEIMv2 encoder modules (HybridEncoder and LiteEncoder).
    Attentions are only available for HybridEncoder variants with AIFI layers.
    c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                  S4   S-  \	S'   Sr\\R                  S4   S-  \	S'   Srg)	Deimv2EncoderOutput   zy
feature_maps (`list[torch.FloatTensor]`):
    List of multi-scale feature maps from the encoder, one per feature level.
Nfeature_maps.r&   r'   r)   )r*   r+   r,   r-   r.   rH   listr/   r0   r1   r&   r2   r'   r3   r)   r4   r5   rF   rF      s\    
 -1L$u(()0:>M5**C/047>7;Je'',-4;r4   rF   RMSNormc                   x   ^  \ rS rSrS
S\SS4U 4S jjjrS\R                  S\R                  4S jrS r	S	r
U =r$ )Deimv2RMSNorm   epsreturnNc                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z,
Deimv2RMSNorm is equivalent to T5LayerNorm
N)super__init__nn	Parameterr/   onesweightvariance_epsilon)selfhidden_sizerN   	__class__s      r5   rR   Deimv2RMSNorm.__init__   s/     	ll5::k#:; #r4   r&   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )N   Tkeepdim)	dtypetor/   float32powmeanrsqrtrW   rV   )rX   r&   input_dtypevariances       r5   forwardDeimv2RMSNorm.forward   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r4   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r2   rV   shaperW   )rX   s    r5   
extra_reprDeimv2RMSNorm.extra_repr   s*    ))*+6$2G2G1HIIr4   )rW   rV   )gư>)r*   r+   r,   r-   floatrR   r/   r   ri   rm   r3   __classcell__rZ   s   @r5   rL   rL      sB    $ $$ $ $;U\\ ;ell ;J Jr4   rL   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )Deimv2SwiGLUFFN   configc                 P  > [         TU ]  5         UR                  S-  n[        R                  " UR
                  USS9U l        [        R                  " UR
                  USS9U l        [        R                  " X!R
                  SS9U l        [        R                  " 5       U l
        g )Nr]   Tbias)rQ   rR   decoder_ffn_dimrS   Lineard_model	gate_projup_proj	down_projSiLUact_fn)rX   ru   hidden_featuresrZ   s      r5   rR   Deimv2SwiGLUFFN.__init__   sq     00A56>>?NyytL?NNNggir4   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ N)r~   r   r|   r}   )rX   xr~   s      r5   ri   Deimv2SwiGLUFFN.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	r4   )r   r~   r|   r}   )	r*   r+   r,   r-   r   rR   ri   r3   rp   rq   s   @r5   rs   rs      s     |   r4   rs   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jrSr	U =r
$ )	
Deimv2Gate   r{   c                    > [         TU ]  5         [        R                  " SU-  SU-  5      U l        [        U5      U l        g )Nr]   )rQ   rR   rS   rz   gaterL   norm)rX   r{   rZ   s     r5   rR   Deimv2Gate.__init__   s4    IIa'k1w;7	!'*	r4   second_residualr&   rO   c                     [         R                  " X/SS9n[         R                  " U R                  U5      5      nUR	                  SSS9u  pVU R                  XQ-  Xb-  -   5      nU$ )Nr^   dimr]   )r/   catsigmoidr   chunkr   )rX   r   r&   
gate_inputgatesgate1gate2s          r5   ri   Deimv2Gate.forward   s_    YY?RH
dii
34{{1"{-		%"9E<Q"QRr4   )r   r   )r*   r+   r,   r-   intrR   r/   r   ri   r3   rp   rq   s   @r5   r   r      s=    + +
u|| ELL UZUaUa  r4   r   c                   ~   ^  \ rS rSrSS\S\S\S\S\4
U 4S jjjrS\R                  S	\R                  4S
 jr	Sr
U =r$ )	Deimv2MLP   	input_dim
hidden_dim
output_dim
num_layersactc                    > [         T	U ]  5         X@l        U/US-
  -  nU/U-   nXc/-   n[        R                  " S [        Xx5       5       5      U l        [        U   " 5       U l        g )Nr   c              3   R   #    U  H  u  p[         R                  " X5      v   M     g 7fr   )rS   rz   ).0in_dimout_dims      r5   	<genexpr>%Deimv2MLP.__init__.<locals>.<genexpr>   s      #sVr?6BIIf$>$>Vrs   %')	rQ   rR   r   rS   
ModuleListziplayersr   r   )
rX   r   r   r   r   r   hidden_dims
input_dimsoutput_dimsrZ   s
            r5   rR   Deimv2MLP.__init__   sc    $!lj1n5[;.
!L0mm#sVYZdVr#ss3<>r4   stat_featuresrO   c                     [        U R                  5       H6  u  p#X R                  S-
  :  a  U R                  U" U5      5      OU" U5      nM8     U$ Nr   )	enumerater   r   r   )rX   r   ilayers       r5   ri   Deimv2MLP.forward   sH    !$++.HA>?//TUBU>UDHHU=%9:[`an[oM /r4   )r   r   r   )relu)r*   r+   r,   r-   r   strrR   r/   r   ri   r3   rp   rq   s   @r5   r   r      sR    "# "3 "C "UX "_b " "U\\ ell  r4   r   valuevalue_spatial_shapessampling_locationsattention_weightsnum_points_listrO   c           	         U R                   u  pgpUR                   u  pzpnU R                  SSSS5      R                  SS5      R                  U VVs/ s H	  u  pX-  PM     snnSS9nUS:X  a	  SU-  S-
  nOUS:X  a  UnWR                  SSSSS	5      R                  SS5      nUR                  US
S9n/ n[	        U5       GH  u  nu  pUU   R                  Xh-  XU5      nUU   nUS:X  a#  [        R                  R                  UUSSSS9nGO>US:X  Ga7  U[        R                  " X//U R                  S9-  S-   R                  [        R                  5      nUS   R                  SUS-
  5      nUS   R                  SUS-
  5      n[        R                  " UU/SS9nUR                  Xh-  XU   -  S5      n[        R                   " UR                   S   U R                  S9R#                  S5      R%                  SUR                   S   5      nUUS S 2US   US   4   nUR                  SSS5      R                  Xh-  XUU   5      nUR'                  W5        GM     UR                  SSSS5      R                  Xh-  SU
[)        U5      5      n[        R*                  " USS9U-  R)                  S5      R-                  XhU	-  U
5      nUR/                  SS5      R1                  5       $ s  snnf )Nr   r]   r   r   r^   r   defaultdiscrete   bilinearzerosF)modepadding_modealign_cornersdevice      ?.r   .r   )rl   permuteflattensplitr   reshaperS   
functionalgrid_sampler/   tensorr   rb   int64clampstackarange	unsqueezerepeatappendsumconcatview	transpose
contiguous)r   r   r   r   r   method
batch_size_	num_headsr   num_queries
num_levels
num_pointsheightwidth
value_listsampling_gridssampling_value_listlevel_idvalue_l_sampling_grid_l_sampling_value_l_sampling_coordsampling_coord_xsampling_coord_ysampling_idxoutputs                              r5   #multi_scale_deformable_attention_v2r      s3    ,1;;(J98J8P8P5AI:aAq!	A	4HI4H=64HIr	R  //!3	:	+#++Aq!Q:BB1aHN#))/r)BN%./C%D!/6
 h'//
0F
\ab *(3Y " 9 9*'af !: ! z!.>OX]XdXd1eehkkooN
  .f5;;AuqyI-f5;;AvzJ #[[*:<L)MSUVN+33J4JKjrZsLsuvwN^11!4U\\J2>//23 
 !)q.:PR`agRh)h i 1 9 9!Q B J J&
QYAZ! 	""#45I &EP *11!Q1=EE;O0D 
)r	25F	F	R	jj0+	> 
 Aq!,,..u Js   K0
c                      ^  \ rS rSrS\4U 4S jjr     SS\R                  S\R                  S-  S\\	   S\
\R                  \R                  4   4S	 jjrS
rU =r$ )#Deimv2MultiscaleDeformableAttentioni*  ru   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l	        UR                  U l        [        U R                  [        5      (       a  U R                  nO.[        U R                  5       Vs/ s H  o0R                  PM     nnX l        U R                   VVs/ s H  n[        U5        H  nSU-  PM
     M     nnnU R!                  S["        R$                  " U["        R&                  S95        U R                  [)        U R                  5      -  U l        [,        R.                  " U R                  U R*                  S-  5      U l        [,        R.                  " U R                  U R*                  5      U l        [4        U l        gs  snf s  snnf )z3
D-Fine version of multiscale deformable attention
r   num_points_scalera   r]   N)rQ   rR   r{   decoder_attention_headsn_headsnum_feature_levelsn_levelsdecoder_offset_scaleoffset_scaledecoder_methoddecoder_n_pointsn_points
isinstancerI   ranger   register_bufferr/   r   rc   r   total_pointsrS   rz   sampling_offsetsr   r   ms_deformable_attn_core)rX   ru   r   r   nr   rZ   s         r5   rR   ,Deimv2MultiscaleDeformableAttention.__init__+  s_    	~~5511"77$33//dmmT**"mmO6;DMM6JK6J}}6JOK.+/+?+?R+?aqAAEE+?R/>NV[VcVc1de LL3t/C/C+DD "		$,,8I8IA8M N!#4<<9J9J!K'J$ L Ss   9G%"GNr&   attention_maskkwargsrO   c                    UR                   u  pn
UR                   u  pn
[        US S 2S4   US S 2S4   -  R                  5       U:H  S5        UR                  XU R                  U R
                  U R                  -  5      nUb  UR                  US   ) [        S5      5      nU R                  U5      nUR                  XU R                  [        U R                  5      S5      nU R                  U5      R                  XU R                  [        U R                  5      5      n[        R                  " USS9nUR                   S   S:X  ak  [        R                  " U5      nUR                  S/5      R                  SSSU R                   SS5      nUR                  XSU R                   SS5      X-  -   nOUR                   S   S:X  am  U R"                  R%                  UR&                  S	9R)                  S5      nUU-  US S 2S S 2S S S 2SS 24   -  U R*                  -  nUS S 2S S 2S S S 2S S24   U-   nO[-        S
UR                   S    S35      eU R/                  UUUUU R                  U R0                  5      nUU4$ )Nr   r   z[Make sure to align the spatial shapes with the sequence length of the encoder hidden states.Nr]   r^   r   r   r   z5Last dim of reference_points must be 2 or 4, but get z	 instead.)rl   r   r   r   r   r{   masked_fillro   r	  r   r   Fsoftmaxr/   r   flipr   r   rb   ra   r   r  
ValueErrorr
  r  )rX   r&   r  reference_pointsr<   spatial_shapesspatial_shapes_listr  r   r   r   sequence_lengthr   r	  r   offset_normalizerr   r   offsetr   s                       r5   ri   +Deimv2MultiscaleDeformableAttention.forwardG  s    &3%8%8"
)>)D)D&
QAqD!N1a4$88==??Ri	
 &--j4<<Y]YeYeimiuiuYuv%%%~i'@&@%(KE)-)>)>})M+33T\\3t7K7K3La
 !22=AIIT\\3t7K7K3L
 II&7R@!!"%* %^ < 1 6 6s ; C CAq!T]]\]_` a ((aXY[\]"67  ##B'1,  $4477m>Q>Q7R\\]_`%(88;KAqRVXY[\[]L];^^aeararrF!1!Qa!2C!Dv!MGHXH^H^_aHbGcclm  --  
 (((r4   )r   r{   r  r
  r   r   r  r   r  r	  r  NNNNN)r*   r+   r,   r-   r   rR   r/   r   r   r   r2   ri   r3   rp   rq   s   @r5   r   r   *  sw    K| K> /3" <)||<) t+<) +,<) 
u||U\\)	*<) <)r4   r   c                   h   ^  \ rS rSr   SS\S\S\S\S\S\S	\S-  S
\S-  4U 4S jjjrS rSr	U =r
$ )Deimv2ConvNormLayeri  Nru   in_channelsout_channelskernel_sizestridegroupspadding
activationc	           
      "  > [         T	U ]  5         [        R                  " UUUUUUc  US-
  S-  OUSS9U l        [        R
                  " X1R                  5      U l        Uc  [        R                  " 5       U l
        g [        U   " 5       U l
        g )Nr   r]   F)r$  r%  rx   )rQ   rR   rS   Conv2dconvBatchNorm2dbatch_norm_epsr   Identityr   r&  )
rX   ru   r   r!  r"  r#  r$  r%  r&  rZ   s
            r5   rR   Deimv2ConvNormLayer.__init__  s|     	II.5o[1_*7
	 NN<1F1FG	+5+="++-7:CVCXr4   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r)  r   r&  )rX   hidden_states     r5   ri   Deimv2ConvNormLayer.forward  s2    yy.yy.|4r4   )r&  r)  r   )r   NN)r*   r+   r,   r-   r   r   r   rR   ri   r3   rp   rq   s   @r5   r  r    s     "!%YY Y 	Y
 Y Y Y tY $JY Y0 r4   r  c                   B   ^  \ rS rSrSrS\S\S\4U 4S jjrS rSr	U =r
$ )	Deimv2RepVggBlocki  zc
RepVGG architecture block introduced by the work "RepVGG: Making VGG-style ConvNets Great Again".
ru   r   r!  c           	         > [         TU ]  5         UR                  nUn[        XUSSSS9U l        [        XUSSSS9U l        Uc  [        R                  " 5       U l	        g [        U   " 5       U l	        g )Nr   r   )r%  r   )
rQ   rR   activation_functionr  conv1conv2rS   r,  r   r&  )rX   ru   r   r!  r&  hidden_channelsrZ   s         r5   rR   Deimv2RepVggBlock.__init__  sl    //
%(,PQST^_`
(,PQST^_`
+5+="++-7:CVCXr4   c                 j    U R                  U5      U R                  U5      -   nU R                  U5      $ r   )r5  r6  r&  )rX   r   ys      r5   ri   Deimv2RepVggBlock.forward  s+    JJqMDJJqM)q!!r4   )r&  r5  r6  )r*   r+   r,   r-   r.   r   r   rR   ri   r3   rp   rq   s   @r5   r2  r2    s0    Y| Y# YS Y" "r4   r2  c                      ^  \ rS rSrSr SS\S\S\S\S\4
U 4S jjjrS	\	R                  S
\	R                  4S jrSrU =r$ )Deimv2CSPRepLayeri  a	  
Cross Stage Partial (CSP) network layer with RepVGG blocks.
Differs from DFineCSPRepLayer: uses a single conv that splits into residual + processing path
(instead of two separate convs), and has an optional trailing conv controlled by `encoder_has_trailing_conv`.
ru   r   r!  
num_blocks	expansionc           
        > [         T	U ]  5         UR                  n[        X5-  5      n[	        XUS-  SSUS9U l        [        R                  " [        U5       Vs/ s H  n[        XU5      PM     sn5      U l
        UR                  (       a  [	        XUSSUS9U l        g [        R                  " 5       U l        g s  snf )Nr]   r   r&  r   )rQ   rR   r4  r   r  r5  rS   r   r  r2  bottlenecksencoder_has_trailing_convr,  r6  )
rX   ru   r   r!  r>  r?  r&  r7  r   rZ   s
            r5   rR   Deimv2CSPRepLayer.__init__  s     	//
l67(oPQ>QSTVWdno
==RWXbRcdRcQvHRcd

 //  q!Xbc 	
  	
 es   B<r&   rO   c                     U R                  U5      R                  SSS9u  p!U R                   H  nU" U5      nM     U R                  X!-   5      $ Nr]   r   r   )r5  r   rB  r6  )rX   r&   residual
bottlenecks       r5   ri   Deimv2CSPRepLayer.forward  sO    "&**]";"A"A!"A"K**J&}5M +zz(233r4   )rB  r5  r6  )      ?)r*   r+   r,   r-   r.   r   r   ro   rR   r/   r   ri   r3   rp   rq   s   @r5   r=  r=    sa     nq
"
14
DG
UX
ej
 
 4U\\ 4ell 4 4r4   r=  c                   v   ^  \ rS rSrSrS
S\S\4U 4S jjjrS\R                  S\R                  4S jr
S	rU =r$ )Deimv2RepNCSPELAN5i  aI  
Rep(VGG) N(etwork) CSP (Cross Stage Partial) ELAN (Efficient Layer Aggregation Network) block.
Similar to DFineRepNCSPELAN4 but without intermediate convolutions between CSP branches,
resulting in a simpler 4-way concatenation (2 split halves + 2 CSP branches) instead of D-FINE's
4-branch design with interleaved convolutions.
ru   numb_blocksc           	      h  > [         TU ]  5         UR                  nUR                  nUR                  nUR                  S-  n[	        UR
                  UR                  -  S-  5      n[        XUSSUS9U l        [        XS-  XrS9U l	        [        XXrS9U l
        [        XSU-  -   USSUS9U l        g )Nr]   r   rA  )r>  )rQ   rR   r4  encoder_hidden_dimroundhidden_expansionr  r5  r=  csp_rep1csp_rep2r6  )	rX   ru   rM  r&  r   r!  split_channelscsp_channelsrZ   s	           r5   rR   Deimv2RepNCSPELAN5.__init__  s    //
//0022Q6V44v7P7PPTUUV(naQR_ij
)&A2E|l)&e(a,&67q!Xb

r4   r&   rO   c                     U R                  U5      R                  SSS9u  p#U R                  U5      nU R                  U5      n[        R
                  " X#XE/SS9nU R                  U5      $ rF  )r5  r   rR  rS  r/   r   r6  )rX   r&   hidden_states_1hidden_states_2hidden_states_3hidden_states_4merged_hidden_statess          r5   ri   Deimv2RepNCSPELAN5.forward  sg    +/::m+D+J+J1RS+J+T(--8--8$yy/O)mstuzz.//r4   )r5  r6  rR  rS  )r   )r*   r+   r,   r-   r.   r   r   rR   r/   r   ri   r3   rp   rq   s   @r5   rL  rL    s@    
| 
# 
 
0U\\ 0ell 0 0r4   rL  c                   r   ^  \ rS rSrS\S\S\4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
Deimv2SCDowni  ru   r"  r#  c                    > [         TU ]  5         [        XR                  UR                  SS5      U l        [        UUR                  UR                  UUUR                  5      U l        g r   )rQ   rR   r  rO  r5  r6  )rX   ru   r"  r#  rZ   s       r5   rR   Deimv2SCDown.__init__  s^    (1J1JFLeLeghjkl
(%%%%%%

r4   input_featuresrO   c                 J    U R                  U5      nU R                  U5      nU$ r   r5  r6  )rX   rb  s     r5   ri   Deimv2SCDown.forward  s$    N3N3r4   rd  )r*   r+   r,   r-   r   r   rR   r/   r   ri   r3   rp   rq   s   @r5   r_  r_    s=    

| 

# 

s 

ell u||  r4   r_  modulequerykeyr  scalingdropoutr  c                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr^         r]   r   r   ptrainingr   )
sizer/   matmulr   rS   r   r  rj  ro  r   )
rf  rg  rh  r   r  ri  rj  r  attn_weightsattn_outputs
             r5   eager_attention_forwardrt    s     **R.D( <<}}Q':;gEL!#4==((2(>L==((6??([L,,|3K''1-88:K$$r4   c                      ^  \ rS rSrSr  SS\S\S\S\S\4
U 4S jjjr	  SS
\
R                  S\
R                  S	-  S\
R                  S	-  S\\   S\\
R                  \
R                  4   4
S jjrSrU =r$ )Deimv2SelfAttentioni$  z
Multi-headed self-attention from 'Attention Is All You Need' paper.

In DEIMV2, position embeddings are added to both queries and keys (but not values) in self-attention.
ru   rY   num_attention_headsrj  rx   c                 R  > [         TU ]  5         Xl        X#-  U l        U R                  S-  U l        X@l        SU l        [        R                  " X"US9U l	        [        R                  " X"US9U l
        [        R                  " X"US9U l        [        R                  " X"US9U l        g )Nrl  Frw   )rQ   rR   ru   head_dimri  attention_dropout	is_causalrS   rz   k_projv_projq_projo_proj)rX   ru   rY   rw  rj  rx   rZ   s         r5   rR   Deimv2SelfAttention.__init__+  s     	#:}}d*!(iitDiitDiitDiitDr4   Nr&   r  position_embeddingsr  rO   c                    UR                   SS n/ UQSPU R                  P7nUb  X-   OUnU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
[        R                  " U R                  R                  [        5      nU" U UU	U
U4U R                  (       d  SOU R                  U R                  S.UD6u  pUR                  " / UQSP76 R!                  5       nU R#                  U5      nX4$ )zJ
Position embeddings are added to both queries and keys (but not values).
Nr^   r   r]           )rj  ri  )rl   ry  r~  r   r   r|  r}  r   get_interfaceru   _attn_implementationrt  ro  rz  ri  r   r   r  )rX   r&   r  r  r  input_shapehidden_shapequery_key_inputquery_states
key_statesvalue_statesattention_interfacers  rr  s                 r5   ri   Deimv2SelfAttention.forward?  sV    $))#2.88b8$--8ATA`-=fs{{?388FPPQRTUV[[166|DNNqRST
{{=166|DNNqRST(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFHkk+.((r4   )	rz  ru   ry  r{  r|  r  r~  ri  r}  )r  T)NN)r*   r+   r,   r-   r.   r   r   ro   boolrR   r/   r   r   r   r2   ri   r3   rp   rq   s   @r5   rv  rv  $  s     EE E !	E
 E E E. /337	$)||$) t+$) #\\D0	$)
 +,$) 
u||U\\)	*$) $)r4   rv  c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S\R                  S-  S\\	   S	\R                  4
S
 jjr
SrU =r$ )Deimv2EncoderLayerif  ru   c                   > [         TU ]  5         UR                  U l        UR                  U l        [        UU R                  UR                  UR                  S9U l        [        R                  " U R                  UR                  S9U l        UR                  U l        [        U R                  UR                  U R                  SUR                  5      U l        [        R                  " U R                  UR                  S9U l        g )Nru   rY   rw  rj  rN   r]   )rQ   rR   normalize_beforerO  rY   rv  rw  rj  	self_attnrS   	LayerNormlayer_norm_epsself_attn_layer_normr   encoder_ffn_dimencoder_activation_functionmlpfinal_layer_normrX   ru   rZ   s     r5   rR   Deimv2EncoderLayer.__init__g  s     & 7 7!44 -(( & : :NN	
 %'LL1A1AvG\G\$]!~~f44d6F6F6KmKm
 !#T-=-=6CXCX Yr4   Nr&   r  spatial_position_embeddingsr  rO   c                    UnU R                   (       a  U R                  U5      nU R                  " SUUUS.UD6u  p[        R                  R                  XR
                  U R                  S9nXQ-   nU R                   (       d  U R                  U5      nU R                   (       a  U R                  U5      nUnU R                  U5      nXQ-   nU R                   (       d  U R                  U5      nU R                  (       al  [        R                  " U5      R                  5       (       dC  [        R                  " UR                  5      R                  S-
  n[        R                  " X* US9nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, hidden_size)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
        values.
    spatial_position_embeddings (`torch.FloatTensor`, *optional*):
        Spatial position embeddings (2D positional encodings of image locations), to be added to both
        the queries and keys in self-attention (but not to values).
r&   r  r  rm  i  minmaxr)   )r  r  r  rS   r   rj  ro  r  r  r/   isfiniteallfinfora   r  r   )rX   r&   r  r  r  rG  r   clamp_values           r5   ri   Deimv2EncoderLayer.forwardz  s6   " !   55mDM>> 
') ;
 	
 --m||VZVcVc-d 0$$ 55mDM   11-@M / 0$$ 11-@M==>>-04466#kk-*=*=>BBTI %M|Q\ ]r4   )rj  r  rY   r  r  r  r  r   )r*   r+   r,   r-   r   rR   r/   r   r   r   ri   r3   rp   rq   s   @r5   r  r  f  sl    Z| Z. <@	0||0 0 &+\\D%8	0
 +,0 
0 0r4   r  c                      ^  \ rS rSrSrSS\S\4U 4S jjjr\" SS9S\S	\S
\R                  \
-  S\R                  S\R                  4
S j5       rSrU =r$ )Deimv2SinePositionEmbeddingi  zB
2D sinusoidal position embedding used in RT-DETR hybrid encoder.
	embed_dimtemperaturec                 :   > [         TU ]  5         Xl        X l        g r   )rQ   rR   r  r  )rX   r  r  rZ   s      r5   rR   $Deimv2SinePositionEmbedding.__init__  s    "&r4       maxsizer   r   r   ra   rO   c                    [         R                  " [        U5      US9R                  U5      n[         R                  " [        U5      US9R                  U5      n[         R                  " XVSS9u  pVU R
                  S-  S:w  a  [        S5      eU R
                  S-  n[         R                  " XsS9R                  U5      U-  nSU R                  U-  -  nUR                  5       S   US	   -  n	UR                  5       S   US	   -  n
[         R                  " U
R                  5       U
R                  5       U	R                  5       U	R                  5       /S
S9S	S	S	2S	S	24   $ )zu
Generate 2D sinusoidal position embeddings.

Returns:
    Position embeddings of shape (1, height*width, embed_dim)
r   xyindexingr   r   zHEmbed dimension must be divisible by 4 for 2D sin-cos position embeddingrJ  r  Nr   r   )r/   r   r   rb   meshgridr  r  r  r   r   sincos)rX   r   r   r   ra   grid_wgrid_hpos_dimomegaout_wout_hs              r5   ri   #Deimv2SinePositionEmbedding.forward  s*    i.v>AA%Hi/?BB5IF>>A"ghh..A%W477>Ht''./ +eDk9 +eDk9||UYY[%))+uyy{EIIKPVWXY]_`bcYcddr4   r  r  )   i'  )r*   r+   r,   r-   r.   r   rR   r   r/   r   r   ra   r   ri   r3   rp   rq   s   @r5   r  r    s    '# '# ' '
 )4ee e s"	e
 {{e 
e 5er4   r  c                   x   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\	\
   S\R                  4S jrS	rU =r$ )
Deimv2AIFILayeri  z^
AIFI (Attention-based Intra-scale Feature Interaction) layer used in RT-DETR hybrid encoder.
ru   c                 R  > [         TU ]  5         Xl        UR                  U l        UR                  U l        [        U R                  UR                  S9U l        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        g s  snf )Nr  )rQ   rR   ru   rO  	eval_sizer  positional_encoding_temperatureposition_embeddingrS   r   r  encoder_layersr  r   rX   ru   r   rZ   s      r5   rR   Deimv2AIFILayer.__init__  s    "(";";))"=-->>#
 mmvOdOdIe$fIeA%7%?Ie$fg$fs   B$r&   r  rO   c                    UR                   S   nUR                   SS u  pEUR                  S5      R                  SSS5      nU R                  (       d  U R                  c'  U R                  UUUR                  UR                  S9nOSnU R                   H  nU" U4SUS.UD6nM     UR                  SSS5      R                  X0R                  XE5      R                  5       nU$ )z
Args:
    hidden_states (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
        Feature map to process.
r   r]   Nr   )r   r   r   ra   )r  r  )rl   r   r   ro  r  r  r   ra   r   r   rO  r   )rX   r&   r  r   r   r   	pos_embedr   s           r5   ri   Deimv2AIFILayer.forward  s     #((+
%++AB/%--a088AqA==DNN2//$++#))	 0 I I[[E!#,5 	M ! !!!Q*22:?V?VX^fqqs 	 r4   )ru   rO  r  r   r  )r*   r+   r,   r-   r.   r   rR   r/   r   r   r   ri   r3   rp   rq   s   @r5   r  r    sJ    
h| 
h%||% +,% 
	% %r4   r  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\\R                  \R                  \R                  4   4S jr	Sr
U =r$ )Deimv2SpatialTuningAdapteri  ru   c           	      ^  > [         TU ]  5         UR                  n[        USUSSSS9U l        [
        R                  " SSSS9U l        [        XSU-  SS5      U l        [        USU-  SU-  SS5      U l	        [        USU-  SU-  SS5      U l
        [
        R                  " 5       U l        g )Nr   r]   gelurA  r   r"  r#  r%  r   )rQ   rR   spatial_tuning_adapter_inplanesr  	stem_convrS   	MaxPool2d	stem_poolr6  conv3conv4GELUr   )rX   ru   inplanesrZ   s      r5   rR   #Deimv2SpatialTuningAdapter.__init__  s    99,VQ!QSYZ!AqI(1x<AN
(Xq8|QPQR
(Xq8|QPQR
ggir4   pixel_valuesrO   c                     U R                  U R                  U5      5      nU R                  U5      nU R                  U R	                  U5      5      nU R                  U R	                  U5      5      nX4U4$ r   )r  r  r6  r  r   r  )rX   r  rX  rY  rZ  r[  s         r5   ri   "Deimv2SpatialTuningAdapter.forward  s`    ..)EF**_5**T[[%AB**T[[%AB@@r4   )r   r6  r  r  r  r  )r*   r+   r,   r-   r   rR   r/   r   r2   ri   r3   rp   rq   s   @r5   r  r    sJ     |  AELL AU5<<W\WcWc;c5d A Ar4   r  c                   >   ^  \ rS rSrSrU 4S jrU 4S jrS rSrU =r	$ )Deimv2FrozenBatchNorm2di   z
BatchNorm2d where the batch statistics and the affine parameters are fixed.

Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than
torchvision.models.resnet[18,34,50,101] produce nans.
c                 R  > [         TU ]  5         U R                  S[        R                  " U5      5        U R                  S[        R
                  " U5      5        U R                  S[        R
                  " U5      5        U R                  S[        R                  " U5      5        g )NrV   rx   running_meanrunning_var)rQ   rR   r  r/   rU   r   )rX   r  rZ   s     r5   rR    Deimv2FrozenBatchNorm2d.__init__(  sn    Xuzz!}5VU[[^4^U[[^<]EJJqM:r4   c           	      B   > US-   nX;   a  X	 [         T	U ]  XX4XVU5        g )Nnum_batches_tracked)rQ   _load_from_state_dict)
rX   
state_dictprefixlocal_metadatastrictmissing_keysunexpected_keys
error_msgsnum_batches_tracked_keyrZ   s
            r5   r  -Deimv2FrozenBatchNorm2d._load_from_state_dict/  s4     #)+@"@"03%Wa	
r4   c                 <   U R                   R                  SSSS5      nU R                  R                  SSSS5      nU R                  R                  SSSS5      nU R                  R                  SSSS5      nSnX$U-   R                  5       -  nX5U-  -
  nX-  U-   $ )Nr   r^   h㈵>)rV   r   rx   r  r  rf   )rX   r   rV   rx   r  r  epsilonscales           r5   ri   Deimv2FrozenBatchNorm2d.forward:  s     $$QAq1yy  B1-&&..q"a;((00B1=/6688U**y4r4   r)   )
r*   r+   r,   r-   r.   rR   r  ri   r3   rp   rq   s   @r5   r  r     s    ;	

  
 r4   r  c                    U R                  5        GH>  u  p[        U[        R                  5      (       a  [	        UR
                  5      nUR                  R                  [        R                  " S5      :w  a  UR                  R                  UR                  5        UR                  R                  UR                  5        UR                  R                  UR                  5        UR                  R                  UR                  5        X0R                  U'   [        [        UR!                  5       5      5      S:  d  GM3  [#        U5        GMA     g)z
Recursively replace all `torch.nn.BatchNorm2d` with `Deimv2FrozenBatchNorm2d`.

Args:
    model (torch.nn.Module):
        input model
metar   N)named_childrenr  rS   r*  r  num_featuresrV   r   r/   copy_rx   r  r  _moduleslenrI   childrenreplace_batch_norm)modelnamerf  
new_modules       r5   r  r  G  s     ,,.fbnn--01D1DEJ}}##u||F';;!!''6%%fkk2''--f.A.AB&&,,V-?-?@#-NN4 tFOO%&'!+v& /r4   c                   v   ^  \ rS rSrSrU 4S jrS\R                  S\\	   S\
\R                     4S jrSrU =r$ )	Deimv2ConvEncoderi_  z
Convolutional backbone using the modeling_deimv2_resnet.py.

nn.BatchNorm2d layers are replaced by Deimv2FrozenBatchNorm2d as defined above.
https://github.com/lyuwenyu/RT-DETR/blob/main/Deimv2_pytorch/src/nn/backbone/presnet.py#L142
c                   > [         TU ]  5         [        U5      nUR                  (       a)  [        R
                  " 5          [        U5        S S S 5        X l        U R                  R                  U l	        [        R                  " U R                   Vs/ s H@  nUR                  S:w  a  [        XUR                  SS5      O[        R                  " 5       PMB     sn5      U l        g ! , (       d  f       N= fs  snf )Nliter   )rQ   rR   r	   freeze_backbone_batch_normsr/   no_gradr  r   channelsintermediate_channel_sizesrS   r   encoder_typer  rO  r,  encoder_input_proj)rX   ru   backbone
in_channelrZ   s       r5   rR   Deimv2ConvEncoder.__init__g  s     (--"8, !
*.***=*='"$--
 #'"A"A	 #BJ &&&0 $F8Q8QSTVWX[[]# #B	#
	 !
s   C(AC9(
C6r  r  rO   c                     U R                   " U40 UD6R                  n[        U R                  U5       VVs/ s H  u  pEU" U5      PM     snn$ s  snnf r   )r   rH   r   r  )rX   r  r  featuresprojfeats         r5   ri   Deimv2ConvEncoder.forward{  sH    ::l5f5BB-01H1H(-ST-SztT
-STTTs   A)r  r
  r   )r*   r+   r,   r-   r.   rR   r/   r   r   r   rI   ri   r3   rp   rq   s   @r5   r  r  _  sH    
(UELL UFCU<V U[_`e`l`l[m U Ur4   r  c                   z   ^  \ rS rSrS\4U 4S jjrS\R                  S\\	   S\
\R                     4S jrSrU =r$ )	Deimv2DINOv3ConvEncoderi  ru   c                 f  > [         TU ]  5         [        U5      U l        [	        U5      U l        UR                  R                  nUR                  nUR                  n[        R                  " [        XUS-  -   USS5      [        XUS-  -   USS5      [        XUS-  -   USS5      /5      U l        g )Nr]   r   r   )rQ   rR   r	   r  r  spatial_tuning_adapterbackbone_configrY   rO  r  rS   r   r  fusion_proj)rX   ru   r  r   spatial_tuning_adapter_channelsrZ   s        r5   rR    Deimv2DINOv3ConvEncoder.__init__  s    %f-&@&H#**66	..
*0*P*P'==#F8WZ[8[,[]gijlmn#F8WZ[8[,[]gijlmn#F8WZ[8[,[]gijlmn
r4   r  r  rO   c                    U R                   " U40 UD6nUR                  nU R                   R                  R                  nUR                  S   U-  nUR                  S   U-  n/ n[        U5      n	[        U5       H\  u  p[        USU	S-
  U
-
  -  -  5      n[        USU	S-
  U
-
  -  -  5      n[        R                  " XU/SSS9nUR                  U5        M^     U R                  U5      n/ n[        [        X5      5       HD  u  n
u  nn[        R                  " UU/SS9nUR                  U R                  U
   " U5      5        MF     U$ )Nr]   r   r   F)rp  r   r   r   r   )r  rH   ru   
patch_sizerl   r  r   r   r  interpolater   r  r   r/   r   r  )rX   r  r  backbone_outputrH   r  height_patcheswidth_patchessemantic_features
num_scalesr   r  resize_heightresize_widthspatialdetail_featuresoutputssemantic_featuredetail_featurefuseds                       r5   ri   Deimv2DINOv3ConvEncoder.forward  sR   --??&33]]))44
%++A.*<$**1-;&
 .GAzA~7I1J JKM}qZ!^a5G/HHILmmD|/LS]mrsG$$W-	 / 55lC5>sCT?f5g1A1 .II/@aHENN4++A.u56 6h r4   )r  r  r  )r*   r+   r,   r-   r   rR   r/   r   r   r   rI   ri   r3   rp   rq   s   @r5   r  r    sE    
| 
"ELL FCU<V [_`e`l`l[m  r4   r  c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr	S	r
U =r$ )
Deimv2Integrali  a  
A static layer that calculates integral results from a distribution.

This layer computes the target location using the formula: `sum{Pr(n) * W(n)}`,
where Pr(n) is the softmax probability vector representing the discrete
distribution, and W(n) is the non-uniform Weighting Function.

Args:
    max_num_bins (int): Max number of the discrete bins. Default is 32.
                   It can be adjusted based on the dataset or task requirements.
ru   c                 D   > [         TU ]  5         UR                  U l        g r   )rQ   rR   max_num_binsr  s     r5   rR   Deimv2Integral.__init__  s    "//r4   pred_cornersprojectrO   c                 *   UR                   u  p4n[        R                  " UR                  SU R                  S-   5      SS9n[        R
                  " XR                  UR                  5      5      R                  SS5      nUR                  X4S5      nU$ )Nr^   r   r   r   )rl   r  r  r   r1  linearrb   r   )rX   r3  r4  r   r   r   s         r5   ri   Deimv2Integral.forward  s    %1%7%7"
yy!5!5b$:K:Ka:O!PVWXxxjj9L9L.MNVVWY[\]#++JRHr4   )r1  )r*   r+   r,   r-   r.   r   rR   r/   r   ri   r3   rp   rq   s   @r5   r/  r/    s?    
0| 0ELL 5<< ELL  r4   r/  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jrSr	U =r
$ )		Deimv2LQEi  ru   c                    > [         TU ]  5         UR                  U l        UR                  U l        [	        SU R                  S-   -  UR
                  SUR                  5      U l        g )Nr   r   )rQ   rR   top_prob_valuesr1  r   lqe_hidden_dim
lqe_layersreg_confr  s     r5   rR   Deimv2LQE.__init__  sZ    %55"//!!t';';a'?"@&BWBWYZ\b\m\mnr4   scoresr3  rO   c           	      \   UR                  5       u  p4n[        R                  " UR                  X4SU R                  S-   5      SS9nUR                  U R                  SS9u  pu[        R                  " XwR                  SSS9/SS9nU R                  UR                  X4S5      5      n	X-   nU$ )Nr   r   r^   r   T)r   r`   )rp  r  r  r   r1  topkr;  r/   r   re   r>  )
rX   r@  r3  r   lengthr   prob	prob_topkstatquality_scores
             r5   ri   Deimv2LQE.forward  s     , 1 1 3
Ayy--j!TEVEVYZEZ[acdyy!5!52y>	yy)^^D^%IJPRSdll:r&JK'r4   )r1  r>  r;  )r*   r+   r,   r-   r   rR   r/   r   ri   r3   rp   rq   s   @r5   r9  r9    s<    o| oell %,, 5<<  r4   r9  c                   4  ^  \ rS rSrS\4U 4S jjr      SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S	\\	\
\
4      S-  S
\R                  S-  S\R                  S-  S\\   S\R                  4S jjrSrU =r$ )Deimv2DecoderLayeri  ru   c                 J  > [         TU ]  5         UR                  U l        [	        UU R                  UR
                  UR                  S9U l        UR                  U l        [        UR                  5      U l
        [        US9U l        [        U5      U l        [        UR                  5      U l        UR                   (       a  [#        UR                  5      OS U l        UR                   U l        UR                   (       a  S U l        g [        UR                  5      U l        g )Nr  ru   )rQ   rR   r{   rY   rv  r   rz  r  rj  rL   r  r   encoder_attnrs   r  r  use_gatewayr   gatewayencoder_attn_layer_normr  s     r5   rR   Deimv2DecoderLayer.__init__  s    !>> -(( & > >,,	
 ~~$1&..$A!?vN"6* -fnn =5;5G5Gz&..1T!--/5/A/At$}U[UcUcGd$r4   Nr&   r  r  r  r  r<   encoder_attention_maskr  rO   c                 $   Un	U R                   " SUUUS.UD6u  p[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nUn	Uc  UOX-   nU R                  UUUUUS9u  p[        R                  R                  XR                  U R                  S9nU R                  b  U R                  X5      nOX-   nU R                  U5      nUn	U R                  U5      nX-   nU R                  U5      nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`):
        Input to the layer of shape `(batch, seq_len, hidden_size)`.
    object_queries_position_embeddings (`torch.FloatTensor`, *optional*):
        Position embeddings for the object query slots. These are added to both queries and keys
        in the self-attention layer (not values).
    reference_points (`torch.FloatTensor`, *optional*):
        Reference points.
    spatial_shapes (`torch.LongTensor`, *optional*):
        Spatial shapes.
    level_start_index (`torch.LongTensor`, *optional*):
        Level start index.
    encoder_hidden_states (`torch.FloatTensor`):
        cross attention input to the layer of shape `(batch, seq_len, hidden_size)`
    encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
        `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
        values.
r  rm  )r&   r<   r  r  r  r)   )r  rS   r   rj  ro  r  rM  rO  rP  r  r  )rX   r&   r  r  r  r  r<   rR  r  rG  r   s              r5   ri   Deimv2DecoderLayer.forward  s8   < !  >> 
'1 3
 	
 --m||VZVcVc-d 011-@  *=)D-Jm,,'"7-) 3 - 
 --m||VZVcVc-d<<# LLAM$4M 88GM !/ 0--m<r4   )
rj  rM  rP  r  rO  rY   r  r  r  rN  )NNNNNN)r*   r+   r,   r-   r   rR   r/   r   rI   r2   r   r   r   ri   r3   rp   rq   s   @r5   rJ  rJ    s    e| e. 4804.2<@596:E||E #\\D0E  ,,-	E
 t+E "%S/2T9E  %||d2E !&t 3E +,E 
E Er4   rJ  c                      ^  \ rS rSr% \\S'   SrSrSr/ SQr	Sr
SrSrSr\R                  " 5       U 4S j5       rS	rU =r$ )
Deimv2PreTrainedModeli3  ru   deimv2r  )image)Deimv2HybridEncoderDeimv2LiteEncoderrJ  Tc           
        > [         TU ]  U5        [        U[        [        45      (       Ga  UR
                  b  UR
                   H  nU R                  R                  =(       d    SU R                  R                  S-   -  n[        [        R                  " SU-
  U-  5      * 5      n[        R                  " UR                  5        [        R                  " UR                   U5        M     UR"                  bo  UR"                   H_  n[        R                  " UR$                  S   R                  S5        [        R                  " UR$                  S   R                   S5        Ma     ['        US5      (       a5  [        R                  " UR(                  U R                  R(                  5        ['        US5      (       a5  [        R                  " UR*                  U R                  R*                  5        [        U[,        5      (       Ga  [        R                  " UR.                  R                  S5        [0        R2                  " 5       n[0        R4                  " UR6                  [0        R8                  S9R;                  U5      S	[        R<                  -  UR6                  -  -  n[0        R>                  " URA                  5       URC                  5       /S5      nXwRE                  5       RG                  SS
S9RH                  -  nURK                  UR6                  SS5      RM                  S[O        URP                  5      S/5      n[0        RR                  " URP                   Vs/ s H  n[0        R4                  " SUS-   5      PM     sn5      RK                  SSS5      n	Xy-  n[        RT                  " UR.                  R                   URW                  5       5        [        R                  " URX                  R                  S5        [        R                  " URX                  R                   S5        URP                   VV
s/ s H  n[[        U5        H  n
SU-  PM
     M     nnn
[        RT                  " UR\                  [0        R^                  " U[0        R`                  S95        [        U[b        5      (       a  U R                  R                  =(       d    SU R                  R                  S-   -  n[        [        R                  " SU-
  U-  5      * 5      n[        R                  " URd                  R                  5        [        R                  " URd                  R                   U5        [        U[f        5      (       av  [        [        R                  " S5      * 5      n[        R                  " URh                  R                   U5        [        R                  " URh                  R                  S5        [        U[j        5      (       ap  [        R                  " URl                  R$                  S   R                   S5        [        R                  " URl                  R$                  S   R                  S5        ['        US5      (       aE  U R                  Rn                  (       a*  [        R                  " URp                  R                  5        ['        US5      (       aD  U R                  Rr                  S:  a*  [        R                  " URt                  R                  5        [        U[v        5      (       Ga   [        R                  " URx                  R                  5        [        R                  " URx                  R                   S5        [        R                  " URz                  R                  5        [        R                  " URz                  R                   S5        [        R                  " UR|                  R                  5        [        R                  " UR|                  R                   S5        ggs  snf s  sn
nf )zInitialize the weightsNr   r^   r   	reg_scaleupr  r          @Tr_   r]   rJ  weight_embeddingdenoising_class_embed)?rQ   _init_weightsr  Deimv2ForObjectDetectionDeimv2Decoderclass_embedru   initializer_bias_prior_prob
num_labelsro   mathloginitxavier_uniform_rV   	constant_rx   
bbox_embedr   hasattrr\  r]  r   r	  r/   get_default_dtyper   r   r   rb   pir   r  r  absr  valuesr   tiler   r   r   r  r   r   r  r   r   rc   Deimv2Modelenc_score_headr   r   r9  r>  learn_initial_queryr_  num_denoisingr`  rs   r|   r}   r~   )rX   rf  r   
prior_probrx   default_dtypethetas	grid_initr  ri  r   r   rZ   s               r5   ra  #Deimv2PreTrainedModel._init_weights?  s    	f%f7GHH!!-#//E!%!H!H!lAQUQ\Q\QgQgjkQkLlJ $((A
Nj+H"I!IJD((6NN5::t4	 0   ,#..ENN5<<#3#:#:A>NN5<<#3#8#8!< / v{++v//1F1FGvt$$vyy$++..9fABBNN622993?!335M\\&..DGGVdgg.F VZZ\6::<$@"EI!MMO$7$7D$7$I$P$PPI!))&..!Q?DDaVMcMcIdfgEhiIllFDZDZ#[DZqELLAE$:DZ#[\ddefhjlmnG IJJv..33Y5F5F5HINN633::C@NN63388#>/5/E/EX/E!uUVx!Ax/EXJJv..=MUZUbUb0cdfk**@@dAI_I_bcIcDdJ$((A
Nj#@AABD  !6!6!=!=>NN60055t<fj))$((?334DNN6;;++T2NN6;;--q1fi((NN6??11"5::A>NN6??11"5<<a@6-..4;;3R3R  !8!8!?!?@62338Q8QTU8U  !=!=!D!DEfo..  !1!1!8!89NN6++00!4  !6!67NN6>>..2  !1!1!8!89NN6++00!4 /= $\  Ys   '$a"ar)   )r*   r+   r,   r-   r   r1   base_model_prefixmain_input_nameinput_modalities_no_split_modules_supports_sdpa_supports_flash_attn_supports_attention_backend_supports_flex_attnr/   r  ra  r3   rp   rq   s   @r5   rV  rV  3  sQ     $O!]N"&
]]_D5 D5r4   rV  c                      ^  \ rS rSrS\" \SS9\" \SS9/0rS\4U 4S jjr\	\
S\\R                     S	\\   S
\4S j5       5       rSrU =r$ )rZ  i  r&   
input_proj)
layer_namebi_fusion_convru   c                 B  > [         TU ]  U5        UR                  nUR                  n[        R
                  " UR                   Vs/ s H  n[        XUSS5      PM     sn5      U l        [        R                  " SSSS9U l
        [        XUSSUS9U l        [        R                  " SSSS9U l        [        XUSSUS9U l        [        XUSSUS9U l        [        SUR                   -  5      n[#        XS9U l        [#        XS9U l        U R)                  5         g s  snf )Nr   r   r]   r  rA  rM  )rQ   rR   rO  r4  rS   r   encoder_in_channelsr  r  	AvgPool2d
down_pool1
down_conv1
down_pool2
down_conv2r  rP  
depth_multrL  	fpn_block	pan_block	post_init)rX   ru   r   r&  r  r>  rZ   s         r5   rR   Deimv2LiteEncoder.__init__  s    ..
//
--Y_YsYstYs: ZAFYst
 ,,1QJ-f*aQR_ij,,1QJ-f*aQR_ij1&jRSUVcmn1v0001
+FK+FK us   Dinputs_embedsr  rO   c                 `   [        U5       VVs/ s H  u  p4U R                  U   " U5      PM     nnnUR                  U R                  U R	                  US   5      5      5        U R                  US   [        R                  " US   S5      -   5      US'   / nUS   [        R                  " US   SSS9-   nUR                  U R                  U5      5        US   U R                  U R                  US   5      5      -   nUR                  U R                  U5      5        [        US9$ s  snnf )Nr^   r   r   r^  nearestscale_factorr   rH   )r   r  r   r  r  r  r  adaptive_avg_pool2dr  r  r  r  r  rF   )rX   r  r  r   featureprojected_featuresr)  fused_features           r5   ri   Deimv2LiteEncoder.forward  s     MVVcLdeLdjadooa09Lde!!$//$//BTUWBX2Y"Z[!%!4!4r"Q%:%:;Mb;QST%UU"
2 *1->PQR>Sbelu0vvt~~m45*1-PWXZP[@\0]]t~~m45"88 fs   !D*)r  r  r  r  r  r  r  r  )r*   r+   r,   r-   r   r  _can_record_outputsr   rR   r   r   rI   r/   r   r   r   rF   ri   r3   rp   rq   s   @r5   rZ  rZ    sy     	.<H.;KL
| ,  9T%,,%7 96J\C] 9bu 9   9r4   rZ  feature_map_1feature_map_2fuse_opc                 B    US:X  a  X-   $ [         R                  " X/SS9$ )zJFuses two feature maps via element-wise sum or channel-wise concatenation.r   r   r   )r/   r   )r  r  r  s      r5   fuse_feature_mapsr    s'    %,,99m3;;r4   c            	          ^  \ rS rSrSr\\S.rS\4U 4S jjr	\
\" SS9 SS	\\R                     S-  S
\\   S\4S jj5       5       rSrU =r$ )rY  i  a5  
DEIMv2 variant of DFineHybridEncoder. Uses element-wise sum fusion (`fuse_feature_maps`) instead of
D-FINE's channel concatenation, Deimv2RepNCSPELAN5 (simplified 4-way concat) instead of DFineRepNCSPELAN4,
and returns Deimv2EncoderOutput with feature_maps instead of BaseModelOutput with last_hidden_state.
)r&   r'   ru   c           
        > [         TU ]  U5        Xl        UR                  U l        [        U R                  5      S-
  U l        UR                  U l        UR                  U l        UR                  U l	        UR                  U l
        UR                  U l        U R                   Vs/ s H  o R                  PM     snU l        U R                  U l        UR                  U l        [         R"                  " [%        [        U R                  5      5       Vs/ s H  n['        U5      PM     sn5      U l        [         R"                  " 5       U l        [         R"                  " 5       U l        [%        [        U R                  5      S-
  SS5       Hx  nU R*                  R/                  [1        XR                  U R                  SS5      5        [3        SUR4                  -  5      nU R,                  R/                  [7        XS95        Mz     [         R"                  " 5       U l        [         R"                  " 5       U l        [%        [        U R                  5      S-
  5       Hc  nU R8                  R/                  [=        USS5      5        [3        SUR4                  -  5      nU R:                  R/                  [7        XS95        Me     U R?                  5         g s  snf s  snf )Nr   r   r^   r   r  r]   ) rQ   rR   ru   r  r   r  num_fpn_stagesfeat_stridesrO  encode_proj_layersr  r  r!  out_stridesencoder_fuse_opr  rS   r   r  r  aifilateral_convs
fpn_blocksr   r  rP  r  rL  downsample_convs
pan_blocksr_  r  )rX   ru   r   r>  rZ   s       r5   rR   Deimv2HybridEncoder.__init__  s/    !55!$"2"23a7"//"(";";"(";";/5/U/U,))>B>N>NO>N44>NO,,--MME#dNeNeJfDg"hDgq?6#:Dg"hi	]]_--/s4++,q0!R8A%%#F,C,CTE\E\^_abc q6#4#445JOO""#5f#UV 9 !#--/s4++,q01A!!((fa)CDq6#4#445JOO""#5f#UV 2
 	- P #is   (KKF)tie_last_hidden_statesNr  r  rO   c                 &   UnU R                   R                  S:  a8  [        U R                  5       H  u  pEU R                  U   " X5   40 UD6X5'   M!     US   /n[        [        U R                  U R                  5      5       Hr  u  nu  pX0R                  U-
  S-
     n
US   nU" U5      nXS'   [        R                  " USSS9n[        XU R                  5      nU	" U5      nUR                  U5        Mt     UR                  5         US   /n[        [        U R                  U R                   5      5       HM  u  nu  nnUS   nXgS-      nU" U5      n[        UUU R                  5      nU" U5      nUR                  U5        MO     [#        US9$ )z
Args:
    inputs_embeds (`list[torch.FloatTensor]`):
        Multi-scale feature maps from the backbone (one tensor per feature level) passed to the encoder.
r   r^   r   r^  r  r  r  )ru   r  r   r  r  r   r  r  r  r  r  r  r  r   reverser  r  rF   )rX   r  r  rH   r   enc_indfpn_feature_mapsidxlateral_convr  backbone_feature_maptop_fpn_feature_mapfused_feature_mapnew_fpn_feature_mappan_feature_mapsdownsample_convr  top_pan_feature_mapfpn_feature_mapdownsampled_feature_mapnew_pan_feature_maps                        r5   ri   Deimv2HybridEncoder.forward  s    %;;%%)'(?(?@
(,		!\5J(Uf(U% A ),-.7D<N<NPTP_P_8`.a*C*,#/0C0Cc0IA0M#N "22"6"./B"C#6R "#--0CRU\e"f 12E]a]i]i j"+,=">##$78 /b 	  " -Q/01:3t?T?TVZVeVe;f1g-C-/9"22"6.Qw7O&56I&J# 12I?\`\h\h i"+,=">##$78 2h #0@AAr4   )r  ru   r  r  rO  r  r  r  r  r   r  r  r!  r  r  r  r   )r*   r+   r,   r-   r.   r  rv  r  r   rR   r   r   rI   r/   r   r   r   rF   ri   r3   rp   rq   s   @r5   rY  rY    s     ))
 |  D  E2 48(BELL)D0(B +,(B 
	(B 3  (Br4   rY  c                     U R                  SSS9n U R                  US9nSU -
  R                  US9n[        R                  " X#-  5      $ )Nr   r   r  )r  )r   r/   rh  )r   rN   x1x2s       r5   inverse_sigmoidr    sI    	A1A	
S	B
a%3	B99RWr4   r1  r]  r\  c                    [        US   5      [        U5      -  n[        US   5      [        U5      -  S-  nUS-   SU S-
  -  -  n[        U S-  S-
  SS5       Vs/ s H  oeU-  * S-   PM     nn[        SU S-  5       Vs/ s H
  oeU-  S-
  PM     nnU* /U-   [        R                  " US   S   5      /-   U-   U/-   n	[        R                  " U	S5      n	U	$ s  snf s  snf )u  
Generates the non-uniform Weighting Function W(n) for bounding box regression.

Args:
    max_num_bins (int): Max number of the discrete bins.
    up (Tensor): Controls upper bounds of the sequence,
                 where maximum offset is ±up * H / W.
    reg_scale (float): Controls the curvature of the Weighting Function.
                       Larger values result in flatter weights near the central axis W(max_num_bins/2)=0
                       and steeper weights at both ends.
Returns:
    Tensor: Sequence of Weighting Function.
r   r]   r   r^   N)rp  r  r/   
zeros_liker   )
r1  r]  r\  upper_bound1upper_bound2stepr   left_valuesright_valuesrq  s
             r5   weighting_functionr  #  s     r!u:I.Lr!u:I.2L1!|a'7"89D/4\Q5F5JAr/RS/R!q[>A%/RKS-21la6G-HI-HaK!O-HLIm_{*e.>.>r!uT{.K-LL|[_k^llFYYvq!FM	 TIs   C Cdistancec                 0   [        U5      nU S   SU-  US   -   U S   U-  -  -
  nU S   SU-  US   -   U S   U-  -  -
  nU S   SU-  US   -   U S   U-  -  -   nU S   SU-  US   -   U S   U-  -  -   n[        R                  " X4XV/S5      n[        U5      $ )a\  
Decodes edge-distances into bounding box coordinates.

Args:
    points (`torch.Tensor`):
        (batch_size, num_boxes, 4) or (num_boxes, 4) format, representing [x_center, y_center, width, height]
    distance (`torch.Tensor`):
        (batch_size, num_boxes, 4) or (num_boxes, 4), representing distances from the point to the left, top, right, and bottom boundaries.
    reg_scale (`float`):
        Controls the curvature of the Weighting Function.
Returns:
    `torch.Tensor`: Bounding boxes in (batch_size, num_boxes, 4) or (num_boxes, 4) format, representing [x_center, y_center, width, height]
r   r   ).r]   r   ).r   r^   )rp  r/   r   r   )pointsr  r\  
top_left_x
top_left_ybottom_right_xbottom_right_ybboxess           r5   distance2bboxr  ;  s     II3?Xf5E#E&QW.[dJd"eeJ3?Xf5E#E&QW.[dJd"eeJF^sY&9I'IfU[n_hNh&iiNF^sY&9I'IfU[n_hNh&iiN[[*.QSUVF#F++r4   c                      ^  \ rS rSrSr\\\S.rS\	4U 4S jjr
\\    SS\R                  S\R                  S\R                  S	\\   S
\4
S jj5       5       rSrU =r$ )rc  iT  aC  
D-FINE Decoder implementing Fine-grained Distribution Refinement (FDR).

This decoder refines object detection predictions through iterative updates across multiple layers,
utilizing attention mechanisms, location quality estimators, and distribution refinement techniques
to improve bounding box accuracy and robustness.
)r&   r'   r(   ru   c           	        > [         TU ]  U5        UR                  S:  a  UR                  OUR                  UR                  -   U l        UR                  U l        [
        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn[        UR                  U R                  -
  S-
  5       Vs/ s H  n[        U5      PM     sn-   5      U l	        [        SUR                  UR                  SUR                  5      U l        S U l        S U l        [
        R                   " ["        R$                  " UR&                  /5      SS9U l        UR(                  U l        UR                  U l        UR*                  U l        [        UR,                  UR,                  SS5      U l        [1        U5      U l        UR4                  U l        [
        R                   " ["        R$                  " UR8                  /5      SS9U l        [
        R                  " [        UR                  5       Vs/ s H  n[;        U5      PM     sn5      U l        U R?                  5         g s  snf s  snf s  snf )Nr   r   r   r   F)requires_grad) rQ   rR   eval_idxdecoder_layersrj  rS   r   r  rJ  r   r   r{   decoder_activation_functionquery_pos_headrl  rd  rT   r/   r   r\  r1  layer_scalerY   pre_bbox_headr/  integralr   num_headr]  r9  r=  r  r  s      r5   rR   Deimv2Decoder.__init__c  s    +1??a+?VEZEZ]c]l]lEl~~mm16v7L7L1MN1MA'1MN389N9NQUQ^Q^9^ab9b3cd3ca!&)3cde
 (6>>6>>1fNpNpq ellF4D4D3E&FV[\"//~~!--&v'9'96;M;MqRST&v.66,,u||VYYK8N--E&J_J_D`(aD`q6):D`(ab 	' Od )bs   I*I/
8I4r<   r  r  r  rO   c	                    Ub  Un
SnSnSnSnSnS=nn[        U R                  U R                  U R                  5      n[        R
                  " U5      n[        U R                  5       GH  u  nnUR                  S5      nU R                  U5      R                  SSS9nU" W
4UUUUUUS.U	D6n
US:X  aA  [        R
                  " U R                  U
5      [        U5      -   5      nUR                  5       nU R                  bT  U R                  U   " U
U-   5      U-   n[        WU R!                  UU5      U R                  5      nUnUR                  5       nU
R                  5       nX4-  nU R"                  c  GM  U R$                  (       d  UU R&                  :X  d  GM8  U R"                  U   " U
5      nUS:X  a  UU4-  nUW4-  nU R(                  U   " UW5      nUU4-  nUW4-  nUW4-  nUU4-  nGM     [*        R,                  " U5      nU R"                  ba  U R                  bT  [*        R,                  " USS	9n[*        R,                  " USS	9n[*        R,                  " USS	9n[*        R,                  " USS	9n[/        W
UUUUUS
9$ )a#  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
        The query embeddings that are passed into the decoder.
    encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
        of the decoder.
    encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
        in `[0, 1]`:
        - 1 for pixels that are real (i.e. **not masked**),
        - 0 for pixels that are padding (i.e. **masked**).
    reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*):
        Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area.
    spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`):
        Spatial shapes of the feature maps.
    level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
        Indexes for the start of each feature level. In range `[0, sequence_length]`.
r)   r   r]   i
   r  )r  r  r  r  r<   rR  r   r   )r    r!   r"   r#   r$   r%   )r  r1  r]  r\  r  r   r   r   r   r  r   r  r  detachrl  r  r  rd  ro  r  r=  r/   r   r   )rX   r<   r  r  r  level_start_indexr  rR  memory_maskr  r&   intermediater#   r"   r$   r%   output_detachpred_corners_undetachr4  ref_points_detachr   decoder_layerref_points_inputquery_pos_embednew_reference_pointsref_points_initialr3  inter_ref_bboxr@  s                                r5   ri   Deimv2Decoder.forward~  s   B $)M (*% )+&#% 011-$T%6%6PII&67 )$++ 6A}0::1="112CDJJsXZJ[O)	$3!1-$7&;'=	 	M Av'(yy&&}5HY8ZZ($ &:%@%@%B" *#q1--2OPShh!.&lG(Ldnn" )5%$2$9$9$;!)002M,,L+!t}}BT))!,];6'F94'16J5LL1+FLA#y0#-.1BB-(-?,AA(.</A.] !7b {{<0'DOO,G"'++.Aq"I-2[[9W]^-_*',{{3KQR'S$,1KK8U[\,])"+'3 3*G+I%=
 	
r4   )rl  rd  r{   rj  r  r  r  r   r=  r1  r  r  r  r\  r]  NNNN)r*   r+   r,   r-   r.   rJ  rv  r   r  r   rR   r   r   r/   r   r   r   r   ri   r3   rp   rq   s   @r5   rc  rc  T  s     ,)?| 6    #m
$||m
  ,,m
 ||	m
 +,m
 
m
   m
r4   rc  c                    US::  a  gU  Vs/ s H  n[        US   5      PM     nnU S   S   R                  n	[        U5      n
U
S:X  a  gXJ-  nUS:X  a  SOUn[        U5      n[        R                  " X/U[        R
                  U	S9n[        R                  " XS/U	S9n[        R                  " X/[        R                  U	S9n[        U5       H7  nUU   nUS:  d  M  U U   S   UUSU24'   U U   S	   UUSU24'   SUUSU24'   M9     UR                  SS
U-  /5      nUR                  SS
U-  S/5      nUR                  SS
U-  /5      n[        R                  " XS
-  S/U	S9nSUSS2U
S24'   UR                  SUS/5      nSU-
  nUR                  S5      U-  n[        R                  " U5      SS2S4   n[        R                  " UU Vs/ s H  nUU-  PM
     sn5      n[        U
S
-  U-  5      nUS:  ad  [        R                  " U[        R                  S9US-  :  n[        R                   " USXR"                  S9n[        R$                  " UU-  UU5      nUS:  a  ['        U5      n[        R                  " USS
S24   S-  / SQ5      U-  n[        R                   " USS
5      S-  S-
  n[        R                  " U5      nUS-   U-  USU-
  -  -   nUU-  nUUU-  -  nUR)                  SSS9  [+        U5      n[-        U5      nU" U5      nXB-   n[        R                  " UU/S[        R                  U	S9n[        R.                  * UUS2SU24'   [        U5       HJ  nU
S
-  U-  nU
S
-  US-   -  n[        R.                  * UUU2SU24'   [        R.                  * UUU2UU24'   ML     UUXB/S.n XUU 4$ s  snf s  snf )a  
Creates a contrastive denoising training group using ground-truth samples. It adds noise to labels and boxes.

Args:
    targets (`list[dict]`):
        The target objects, each containing 'class_labels' and 'boxes' for objects in an image.
    num_classes (`int`):
        Total number of classes in the dataset.
    num_queries (`int`):
        Number of query slots in the transformer.
    class_embed (`callable`):
        A function or a model layer to embed class labels.
    num_denoising_queries (`int`, *optional*, defaults to 100):
        Number of denoising queries.
    label_noise_ratio (`float`, *optional*, defaults to 0.5):
        Ratio of noise applied to labels.
    box_noise_scale (`float`, *optional*, defaults to 1.0):
        Scale of noise applied to bounding boxes.
Returns:
    `tuple` comprising various elements:
    - **input_query_class** (`torch.FloatTensor`) --
      Class queries with applied label noise.
    - **input_query_bbox** (`torch.FloatTensor`) --
      Bounding box queries with applied box noise.
    - **attn_mask** (`torch.FloatTensor`) --
       Attention mask for separating denoising and reconstruction queries.
    - **denoising_meta_values** (`dict`) --
      Metadata including denoising positive indices, number of groups, and split sizes.
r   r  class_labelsr   ra   r   r   r   Nboxesr]   r^   r   r   .)r   r   r]   r^  rJ  r  r  )dn_positive_idxdn_num_groupdn_num_split)r  r   r  r/   fullint32r   r  r  rr  squeezenonzeror   r   	rand_likero   randint_likera   wherer
   clip_r   r  inf)!targetsnum_classesr   rd  num_denoising_querieslabel_noise_ratiobox_noise_scaletnum_ground_truthsr   
max_gt_numnum_groups_denoising_queriesr   input_query_classinput_query_bboxpad_gt_maskr   num_gtnegative_gt_maskpositive_gt_maskdenoise_positive_idxr  mask	new_label
known_bboxdiff	rand_sign	rand_parttarget_size	attn_maskidx_block_startidx_block_endrC   s!                                    r5   (get_contrastive_denoising_training_groupr!    sp   N !%9@AAQ~./AQZ'..F&'JQ%#8#F (D(I1Ok &'J

J#;[PUP[P[djk{{JA#>vN++z6ejjQWXK:"1%A:,3AJ~,Fa&j)+21:g+>QZ(&'K7F7
#  *..17S3S/TU',,a5Q1QST-UV""Aq+G'G#HIK{{JQ#B6R'(Q
^$',,a1Mq-QR++'//3kA ==)9:1a4@ ;;IZ[IZAq#??IZ[ &j1n7S&ST10DHY\_H_`&&tQCZCZ[	!KK{(:IGXY-.>?
zz*373c99EW&&'7A>DsJ	OO$45	_(889L\H\;]]	Y	i$&&
Sc*3J?*+;<#$56'5K

K5qTZ[IAF
I#$&<'<&<<= /0$q.1,"Q!a%0FKiiZ	/-/1A/1AABY^YbYbXb	/-/?T1TTU	 1 04.< 	;PPP] BF \s   OO	
z|
    RT-DETR Model (consisting of a backbone and encoder-decoder) outputting raw hidden states without any head on top.
    c                   N  ^  \ rS rSrS\4U 4S jjrS rS r\" SS9SS	S
\	R                  4S j5       r\\    SS\	R                  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\\   S-  S\\   S\\	R                     \-  4S jj5       5       rSrU =r$ )rs  ik  ru   c           
        > [         TU ]  U5        [        UR                  SS 5      S:H  nU(       a  [	        U5      O
[        U5      U l        UR                  S:X  a  [        U5      O[        US9U l
        UR                  S:  a<  [        R                  " UR                  S-   UR                  UR                  S9U l        UR"                  (       a0  [        R                  " UR$                  UR                  5      U l        [        R(                  " [        R*                  " UR                  UR                  5      [        R,                  " UR                  UR.                  S95      U l        [        R*                  " UR                  UR                  5      U l        [5        UR                  UR                  S	S
5      U l        UR8                  (       a&  U R;                  U R<                  S9u  U l        U l         [C        URD                  5      n/ nURD                  S   n[G        U5       H\  nURI                  URJ                  URD                  S   :X  a  [        RL                  " 5       O[O        XUR                  SS5      5        M^     [G        URP                  U-
  5       H\  nURI                  URJ                  URD                  S   :X  a  [        RL                  " 5       O[O        XUR                  S
S5      5        M^     [        RR                  " U5      U l*        [W        U5      U l,        U R[                  5         g )N
model_type
dinov3_vitr  rL  r   r   )padding_idxr  r   r   r   r^   r]   ).rQ   rR   getattrr  r  r  conv_encoderr  rZ  rY  encoderrv  rS   	Embeddingrf  r{   r`  ru  r   r_  
Sequentialrz   r  r  
enc_outputrt  r   enc_bbox_headanchor_image_sizegenerate_anchorsra   anchors
valid_maskr  decoder_in_channelsr  r   rY   r,  r  r   r   decoder_input_projrc  decoderr  )rX   ru   	is_dinov3num_backbone_outsr3  r   r   rZ   s          r5   rR   Deimv2Model.__init__q  sw    F22L$G<W	?H3F;N_`fNg)/)<)<)Ff%L_gmLn 	 !#)+!!A%v~~6CTCT*D& %%$&LL1C1CV^^$TD!--IIfnnfnn5LLV-B-BC
 !ii8I8IJ&v~~v~~q!L##,0,A,A

,A,S)DL$/ : :;004()A%%%%)C)CB)GG (fnnaQRS * v003DDEA%%%%)C)CB)GG (fnnaQRS F #%--0B"C$V,r4   c                 h    U R                   R                  5        H  nUR                  S5        M     g )NFr  
parametersrequires_grad_rX   params     r5   freeze_backboneDeimv2Model.freeze_backbone  s'    ]]--/E  ' 0r4   c                 h    U R                   R                  5        H  nUR                  S5        M     g )NTr9  r<  s     r5   unfreeze_backboneDeimv2Model.unfreeze_backbone  s'    ]]--/E  & 0r4   r  r  Ng?cpuc           
      L   Ucn  U R                   R                   Vs/ s HM  n[        U R                   R                  S   U-  5      [        U R                   R                  S   U-  5      /PMO     nn/ n[	        U5       GH  u  nu  p[
        R                  " [
        R                  " XS9R                  U5      [
        R                  " XS9R                  U5      SS9u  p[
        R                  " X/S5      nUR                  S5      S-   nUS==   U	-  ss'   US	==   U-  ss'   [
        R                  " U5      U-  S
U-  -  nUR                  [
        R                  " X/S5      R                  SX-  S5      5        GM     Sn[
        R                  " US5      nXn:  USU-
  :  -  R                  SSS9n[
        R                   " USU-
  -  5      n[
        R"                  " X[
        R$                  " [
        R&                  " U5      R(                  XCS95      nXo4$ s  snf )Nr   r   )endr   ijr  r^   r   r   r   r^  r   g{Gz?Tr_   r  )ru   r  r   r.  r   r/   r  r   rb   r   r   	ones_liker   r   r   r  rh  r  r   r  r  )rX   r  	grid_sizer   ra   sr0  levelr   r   grid_ygrid_xgrid_xywhrN   r1  s                   r5   r/  Deimv2Model.generate_anchors  s   ! 111A T[[22159:C@]@]^_@`cd@d<ef1   &/&?"E?F"^^7::5A699%@NF
 kk6"2B7G''*S0GFOu$OFOv%O)I5eDBNN5<<r:BB2v~WXYZ '@ ,,w*}1s7):;@@T@R
))Gq7{34++j5<<E@R@V@V^c3st""1s   AH!r  
pixel_maskencoder_outputsr  labelsr  rO   c                    Uc  Uc  [        S5      eUcG  UR                  u  pxpUR                  nUc  [        R                  " XyU
4US9nU R                  U5      nOUR                  S   nUR                  nUnU R                  " U40 UD6n/ n[        UR                  5       H)  u  pUR                  U R                  U   " U5      5        M+     U R                  R                  [        U5      :  a  UR                  U R                  [        U5         " UR                  S   5      5        [        [        U5      U R                  R                  5       H4  nUR                  U R                  U   " UR                  S   5      5        M6     / n/ n[        R                  " [        U5      S4U[        R                   S9n[        U5       Hh  u  pUR                  SS u  pU	UUS4'   U
UUS	4'   UR                  X45        UR#                  S5      R%                  S	S5      nUR                  U5        Mj     [        R&                  " US	5      n[        R&                  " UR)                  S
5      UR+                  S	5      R-                  S5      SS 45      nU R.                  (       a  U R                  R0                  S:  a  Ub  [3        UU R                  R4                  U R                  R6                  U R8                  U R                  R0                  U R                  R:                  U R                  R<                  S9u  nnnnOSu  nnnn[        U5      nUR                  nUR>                  nU R.                  (       d  U R                  R@                  c   [C        U5      nU RE                  UUUS9u  nnO<U RF                  U RH                  nnURK                  UU5      URK                  UU5      nnURK                  UR>                  5      U-  nU RM                  U5      nU RO                  U5      nU RQ                  U5      U-   n [        RR                  " URU                  S5      RV                  U R                  R6                  S	S9u  n!n"U RY                  S	U"R[                  S5      R]                  S	S	U R                  S   5      S9n#[^        R`                  " U#5      n$Ub  [        Rb                  " UU#/S	5      n#URY                  S	U"R[                  S5      R]                  S	S	UR                  S   5      S9n%U R                  Rd                  (       a  U Rf                  Ri                  US	S	/5      n&OMURY                  S	U"R[                  S5      R]                  S	S	UR                  S   5      S9n&U&Rk                  5       n&Ub  [        Rb                  " UU&/S	5      n&U#Rk                  5       n'U Rl                  " S"U&UUU'UUUS.UD6n([o        S"0 SU(Rp                  _SU(Rr                  _SU(Rt                  _SU(Rv                  _SU(Rx                  _SU(Rz                  _SU(R|                  _SU(R~                  _SU(R                  _SUR                  _SUR|                  _SUR~                  _SU'_SU%_SU$_SU_S U _S!U_6$ )#am  
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
    can choose to directly pass a flattened representation of an image.
labels (`list[Dict]` of len `(batch_size,)`, *optional*):
    Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
    following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
    respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
    in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.

Examples:

```python
>>> from transformers import AutoImageProcessor, Deimv2Model
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("PekingU/Deimv2_r50vd")
>>> model = Deimv2Model.from_pretrained("PekingU/Deimv2_r50vd")

>>> inputs = image_processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)

>>> last_hidden_states = outputs.last_hidden_state
>>> list(last_hidden_states.shape)
[1, 300, 256]
```Nz8You have to specify either pixel_values or inputs_embedsr   r   r^   r]   )r   ra   r   r   )r   )r  r  r   rd  r	  r
  r  r  r   )r   index)r  r<   rR  r  r  r  r  r    r!   r"   r#   r$   r%   r9   r:   r(   r;   r<   r=   r>   r?   r@   rA   rB   rC   r)   )Ar  rl   r   r/   rU   r(  r)  r   rH   r   r3  ru   r   r  r  emptylongr   r   r   	new_zerosprodcumsumro  rv  r!  rf  r   r`  r
  r  ra   r.  r2   r/  r0  r1  rb   r,  rt  r-  rB  r  rq  gatherr   r   r  r   r   ru  r_  rr  r  r4  r7   r    r!   r"   r#   r$   r%   r&   r'   r(   ))rX   r  rP  rQ  r  rR  r  r   num_channelsr   r   r   
proj_featssourcesrJ  sourcer   source_flattenr  r  r  denoising_classdenoising_bbox_unactr  rC   ra   spatial_shapes_tupler0  r1  memoryoutput_memoryrA   rB   r   topk_indreference_points_unactr@   r?   targetr>   decoder_outputss)                                            r5   ri   Deimv2Model.forward  s   X M$9WXX  6B6H6H3Jf!((F!"ZZ*e)DfU
 **<8J&,,Q/J"))F&J,,

 &'C'CDMENN42259&AB E ;;))CL8NN4223w<@A]A]^`Aabc3w<)G)GHt66q9/:V:VWY:Z[\ I  c'lA%6vUZZX&w/ME"LL-MF'-N5!8$',N5!8$&&7^^A&00A6F!!&) 0 >15!II~'?'?'E~GZGZ[\G]GdGdefGghkikGl&mn ==T[[66:v?Q 9 KK22 KK33 66&*kk&?&?"&++"?"? $ ; ;$% \rXO1>CX(
&&$$ ==DKK99A $))<#= "&"7"78LU[ch"7"iGZ"&,,ZG")**VU";Z]]6SX=YZG ~334~E/ //>#'#5#5m#Dw#N jj!2!6!6r!:!A!A4;;CZCZ`ab8!9!@!@++B/66q!=U=[=[\^=_` "A "
 ))$:;+%*\\3GI_2`bc%d"+22++B/66q!=N=T=TUW=XY 3 

 ;;****//Q0BCF"))ax7I7I"7M7T7TUVXY[h[n[noq[r7s)tF]]_F&\\?F";Q?F 6 = = ? ,, 	
 "0#12) 3/	
 	
 ! 
-??
'6'Q'Q
 !0 C C
 +:*W*W	

 ,;+Y+Y
 &5%M%M
 #2"?"?
  /99
 -==
 '6&B&B
 #2"?"?
  /99
 #8
 ,
 ,
  0!
" &>#
$ #8%
 	
r4   )r0  r(  r4  r3  r`  r-  r,  rt  r)  r1  r_  r  )r*   r+   r,   r-   r   rR   r>  rA  r   r/   rc   r/  r   r   r0   
LongTensorrI   rD   r   r   r2   r7   ri   r3   rp   rq   s   @r5   rs  rs  k  s   -| -^(' )4.2d5X]XeXe # 5#8  /34826$(|
''|
 $$t+|
 **T1	|

 ((4/|
 T
T!|
 +,|
 
u  	!$5	5|
  |
r4   rs  z6
    Output type of [`Deimv2ForObjectDetection`].
    c                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\
S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\\
   S-  \S'   Sr\R                  S-  \S	'   Sr\R                  S-  \S
'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr \R                  S-  \S'   Sr!\
S-  \S'   Sr"g)Deimv2ObjectDetectionOutputi  a`  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
    Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
    bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
    scale-invariant IoU loss.
loss_dict (`Dict`, *optional*):
    A dictionary containing the individual losses. Useful for logging.
logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
    Classification logits (including no-object) for all queries.
pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
    Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
    values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
    possible padding). You can use [`~Deimv2ImageProcessor.post_process_object_detection`] to retrieve the
    unnormalized (absolute) bounding boxes.
auxiliary_outputs (`list[Dict]`, *optional*):
    Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
    and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
    `pred_boxes`) for each decoder layer.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
    Sequence of hidden-states at the output of the last layer of the decoder of the model.
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
    Stacked intermediate hidden states (output of each layer of the decoder).
intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, config.num_labels)`):
    Stacked intermediate logits (logits of each layer of the decoder).
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
    Stacked intermediate reference points (reference points of each layer of the decoder).
intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
    Stacked intermediate predicted corners (predicted corners of each layer of the decoder).
initial_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
    Stacked initial reference points (initial reference points of each layer of the decoder).
init_reference_points (`torch.FloatTensor` of shape  `(batch_size, num_queries, 4)`):
    Initial reference points sent through the Transformer decoder.
enc_topk_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
    Logits of predicted bounding boxes coordinates in the encoder.
enc_topk_bboxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
    Logits of predicted bounding boxes coordinates in the encoder.
enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
    Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
    picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
    foreground and background).
enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
    Logits of predicted bounding boxes coordinates in the first stage.
denoising_meta_values (`dict`):
    Extra dictionary for the denoising related values
Nloss	loss_dictlogits
pred_boxesauxiliary_outputsr    r!   r"   r#   r$   r%   r9   r:   r(   r;   r<   r=   r>   r?   r@   rA   rB   rC   r)   )#r*   r+   r,   r-   r.   rm  r/   r0   r1   rn  rD   ro  rp  rq  rI   r    r!   r"   r#   r$   r%   r9   r2   r:   r(   r;   r<   r=   r>   r?   r@   rA   rB   rC   r3   r)   r4   r5   rl  rl    s   ,\ &*D%

d
")!Itd{!'+FE$++/J!!D(/+/tDzD(/26u((4/6;? 1 1D 8?48**T18>B!5#4#4t#;B?C"E$5$5$<C9=e//$6==A5!2!23d:A:>e//047>8<eE--.5<:>u0047>=A5!2!23d:A:>e//047>=A5!2!23d:A04OU&&-404OU&&-426u((4/69=e//$6=)-4$;-r4   rl  z
    RT-DETR Model (consisting of a backbone and encoder-decoder) outputting bounding boxes and logits to be further
    decoded into scores and classes.
    c                   .  ^  \ rS rSrSSSSS.rS\4U 4S jjrS	 r\\	    SS\
R                  S\
R                  S
-  S\
R                  S
-  S\
R                  S
-  S\\   S
-  S\\   S\\
R                     \-  4S jj5       5       r\S 5       rSrU =r$ )rb  i  bbox_embed.0^class_embed.0model.decoder.class_embedmodel.decoder.bbox_embed)bbox_embed.(?![0])\d+class_embed.(?![0])\d+rd  rl  ru   c                   > [         TU ]  U5        UR                  S:  a  UR                  OUR                  UR                  -   U l        [	        U5      U l        [        UR                  UR                  -  5      nUR                  n[        R                  " [        U5       Vs/ s H.  n[        R                  " UR                  UR                  5      PM0     sn5      U l        UR                   (       aR  [#        UR                  UR                  SUR$                  S-   -  S5      n[        R                  " U/U-  5      U l        O[        R                  " [        U R                  S-   5       Vs/ s H5  n[#        UR                  UR                  SUR$                  S-   -  S5      PM7     sn[        UR                  U R                  -
  S-
  5       Vs/ s H   n[#        X"SUR$                  S-   -  S5      PM"     sn-   5      U l        U R                  U R
                  R(                  l        U R&                  U R
                  R(                  l        U R+                  5         g s  snf s  snf s  snf )Nr   r   r   r   )rQ   rR   r  r  rs  r   rP  r  rY   rS   r   r  rz   r{   rf  rd  share_bbox_headr   r1  rl  r4  r  )rX   ru   
scaled_dimnum_predr   shared_bboxrZ   s         r5   rR   !Deimv2ForObjectDetection.__init__  s    +1??a+?VEZEZ]c]l]lEl (
6--0B0BBC
((==`efn`o)p`o[\"))FNNFDUDU*V`o)pq!!#F$6$68J8JAQWQdQdghQhLiklmK mm[MH,DEDO mm #4==1#455 f00&2D2Da6K^K^abKbFcefg5 #6#8#84==#H1#LMM ja6;N;NQR;R6SUVWM		DO *.)9)9

&(,

%% *qs   &5I)4<I.'I3
c                 R    [        X5       VVs/ s H	  u  p4X4S.PM     snn$ s  snnf )N)ro  rp  )r   )rX   outputs_classoutputs_coordabs        r5   _set_aux_loss&Deimv2ForObjectDetection._set_aux_loss  s&    ;>};\];\411.;\]]]s   #Nr  rP  rQ  r  rR  r  rO   c           	         U R                   " U4UUUUS.UD6nU R                  (       a  UR                  OSnUR                  n	UR                  n
UR
                  nUR                  nU	SS2S4   nU
SS2S4   nSu  nnnnnUbO  UR                  nUR                  nU R                  " UUU R                  UU R                  U	U
4UUUUUS.UD6u  nnn[        S0 SU_SU_SU_S	U_S
U_SUR                  _SUR                  _SUR                  _SUR                  _SUR
                  _SUR                  _SUR                  _SUR                   _SUR"                  _SUR$                  _SUR&                  _SUR(                  _SUR*                  _SUR                  _SUR                  _SUR,                  _SUR.                  _SUR                  _6$ )a_  
Example:

```python
>>> import torch
>>> from transformers.image_utils import load_image
>>> from transformers import AutoImageProcessor, Deimv2ForObjectDetection

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> image_processor = AutoImageProcessor.from_pretrained("harshaljanjani/DEIMv2_HGNetv2_N_COCO_Transformers")
>>> model = Deimv2ForObjectDetection.from_pretrained("harshaljanjani/DEIMv2_HGNetv2_N_COCO_Transformers")

>>> # prepare image for the model
>>> inputs = image_processor(images=image, return_tensors="pt")

>>> # forward pass
>>> outputs = model(**inputs)

>>> logits = outputs.logits
>>> list(logits.shape)
[1, 300, 80]

>>> boxes = outputs.pred_boxes
>>> list(boxes.shape)
[1, 300, 4]

>>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> target_sizes = torch.tensor([image.size[::-1]])
>>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)
>>> result = results[0]  # first image in batch

>>> for score, label, box in zip(result["scores"], result["labels"], result["boxes"]):
...     box = [round(i, 2) for i in box.tolist()]
...     print(
...         f"Detected {model.config.id2label[label.item()]} with confidence "
...         f"{round(score.item(), 3)} at location {box}"
...     )
```
)rP  rQ  r  rR  Nr^   r  )r?   r@   rC   predicted_cornersr%   rm  rn  ro  rp  rq  r    r!   r"   r#   r$   r%   r9   r:   r(   r;   r<   r=   r>   r?   r@   rA   rB   rC   r)   )r   ro  rC   r"   r#   r$   r%   r?   r@   loss_functionr   ru   rl  r    r!   r9   r:   r(   r;   r<   r=   r>   rA   rB   )rX   r  rP  rQ  r  rR  r  r)  rC   r  r  r  r%   ro  rp  rm  rn  rq  r?   r@   s                       r5   ri    Deimv2ForObjectDetection.forward   ss   h **
!+'
 
 BF = =TX33==#BB#*#C#C q"u%"1b5)
OkLi*O_%55O%55O151C1C2 !0 /&;"3)A2 2.D).  + 


 
 "	

 0
 &77
 (/'I'I
 !( ; ;
 +2*O*O
 ,3+Q+Q
 &-%E%E
 #*"?"?
  '99
 %55
 '.&G&G
  #*"?"?!
"  '99#
$ #*"?"?%
& $33'
( $33)
* &77+
, &-%E%E-
. #*"?"?/
 	
r4   c                 \    SSSS.nU R                   R                  (       a
  SUS'   SUS'   U$ )	Nrt  ru  rv  )rx  rd  rl  zmodel.decoder.bbox_embed.0z&model\.decoder\.bbox_embed\.(?![0])\d+rs  rw  )ru   rz  )rX   keyss     r5   _tied_weights_keys+Deimv2ForObjectDetection._tied_weights_keysu  s>     (964

 ;;&&>[D:;-<D)*r4   )rl  rd  r  r   r  )r*   r+   r,   r-   r  r   rR   r  r   r   r/   r0   rj  rI   rD   r   r   r2   rl  ri   propertyr3   rp   rq   s   @r5   rb  rb    s     #2#420	| 6^  /34826$(q
''q
 $$t+q
 **T1	q

 ((4/q
 T
T!q
 +,q
 
u  	!$?	?q
  q
f 	 	r4   rb  )rs  rV  rb  )r   )Nr  )r   )r  )d   r   rJ  )[rg  collections.abcr   dataclassesr   r/   torch.nnrS   torch.nn.functionalr   r  r    r   ri  activationsr   backbone_utilsr	   image_transformsr
   r   integrationsr   modeling_outputsr   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   r   configuration_deimv2r   r   r7   rF   ModulerL   rs   r   r   rI   r   r   r   r  r2  r=  rL  r_  ro   rt  rv  r  r  r  r  r  r  r  r  r/  r9  rJ  rV  rZ  r   r  rY  r  r  r  rc  r!  rs  rl  rb  __all__r)   r4   r5   <module>r     s  *  $ !      & " + R 7 + F & @ Z Z I E .  =+ = =: 
 1. 1. 1.h <+ < < Y'JBII J (J(bii  		 , G/G/ G/ G/ 	G/
 #YG/ G/TY)")) Y)x")) @"		 "&4		 4<0 0<299 2 !%II%<<% 
% <<	%
 LL4'% T\% % '(%8?)")) ?)DD DN$e")) $eN6bii 6rA A&$ bii $ N'0U		 UB)bii )XRYY 2		 "Z Zz P5O P5 P5f09- 09f<U\\ <%,, <Y\ <iniuiu <XB/ XBvS ell s u|| 0,ELL ,U ,u|| ,2Y
) Y
B xQv 
S
' S

S
l 
 E.+ E. E.P e4 eeP Or4   