
    Z j              	          S SK Jr  S SKrS SKJr  S SKJs  Jr  S SKJ	r	  SSK
Jr  SSKJr  SSKJr  SSKJr  SS	KJrJrJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSK J!r!J"r"J#r#J$r$J%r%J&r&J'r'J(r(J)r)J*r*J+r+J,r,J-r-J.r.J/r/J0r0J1r1J2r2J3r3J4r4  SSK5J6r6J7r7  \Rp                  " \95      r:\" SS9\	 " S S\5      5       5       r; " S S\&5      r< " S S\/5      r=\\" SS9 " S S\5      5       5       r> " S S\75      r? " S S \65      r@ " S! S"\)5      rA " S# S$\-5      rB " S% S&\05      rC " S' S(\#5      rD " S) S*\25      rE " S+ S,\R                  5      rG " S- S.\R                  5      rH " S/ S0\35      rI " S1 S2\'5      rJ " S3 S4\!5      rK " S5 S6\R                  5      rLSSS7\R                  S8\R                  S9\NS:\R                  4S; jjrO " S< S=\"5      rP " S> S?\R                  5      rQ " S@ SA\+5      rR " SB SC\,5      rS " SD SE\%5      rT " SF SG\15      rU " SH SI\U5      rV " SJ SK\*5      rW " SL SM\$5      rX " SN SO\.5      rY " SP SQ\(5      rZ/ SRQr[g)T    )	dataclassN)strict   )initialization)load_backbone)ModelOutput)Unpack)TransformersKwargsauto_docstringlogging)merge_with_config_defaults)OutputRecordercapture_outputs   )
AutoConfig)DFineConfig)DFineAIFILayerDFineConvEncoderDFineConvNormLayerDFineDecoderDFineDecoderLayerDFineDecoderOutputDFineEncoderLayerDFineForObjectDetection	DFineGateDFineHybridEncoderDFineIntegralDFineLQEDFineMLP
DFineModelDFineModelOutput"DFineMultiscaleDeformableAttentionDFinePreTrainedModelDFineRepVggBlockDFineSCDown(get_contrastive_denoising_training_group)LlamaMLPLlamaRMSNormz"Intellindust/DEIMv2_HGNetv2_N_COCO)
checkpointc                       \ rS rSr% SrSrS\0rSr\	\
   \\
\
4   -  S-  \S'   Sr\\S'   S	r\\S
'   Sr\S-  \S'   Sr\\S'   Sr\
\S'   Sr\\S'   S	r\\S'   Sr\\S'   S	r\\S'   Srg)Deimv2Config;   aS  
initializer_bias_prior_prob (`float`, *optional*):
    The prior probability used by the bias initializer to initialize biases for `enc_score_head` and `class_embed`.
    If `None`, `prior_prob` computed as `prior_prob = 1 / (num_labels + 1)` while initializing model weights.
freeze_backbone_batch_norms (`bool`, *optional*, defaults to `True`):
    Whether to freeze the batch normalization layers in the backbone.
encoder_in_channels (`list`, *optional*, defaults to `[512, 1024, 2048]`):
    Multi level features input for encoder.
feat_strides (`list[int]`, *optional*, defaults to `[8, 16, 32]`):
    Strides used in each feature map.
encode_proj_layers (`list[int]`, *optional*, defaults to `[2]`):
    Indexes of the projected layers to be used in the encoder.
positional_encoding_temperature (`int`, *optional*, defaults to 10000):
    The temperature parameter used to create the positional encodings.
encoder_activation_function (`str`, *optional*, defaults to `"gelu"`):
    The non-linear activation function (function or string) in the encoder and pooler.
eval_size (`list[int]` or `tuple[int, int]`, *optional*):
    Height and width used to computes the effective height and width of the position embeddings after taking
    into account the stride.
normalize_before (`bool`, *optional*, defaults to `False`):
    Determine whether to apply layer normalization in the transformer encoder layer before self-attention and
    feed-forward modules.
hidden_expansion (`float`, *optional*, defaults to 1.0):
    Expansion ratio to enlarge the dimension size of RepVGGBlock and CSPRepLayer.
num_queries (`int`, *optional*, defaults to 300):
    Number of object queries.
decoder_in_channels (`list`, *optional*, defaults to `[256, 256, 256]`):
    Multi level features dimension for decoder.
num_feature_levels (`int`, *optional*, defaults to 3):
    The number of input feature levels.
decoder_n_points (`int`, *optional*, defaults to 4):
    The number of sampled keys in each feature level for each attention head in the decoder.
decoder_activation_function (`str`, *optional*, defaults to `"relu"`):
    The non-linear activation function (function or string) in the decoder.
num_denoising (`int`, *optional*, defaults to 100):
    The total number of denoising tasks or queries to be used for contrastive denoising.
label_noise_ratio (`float`, *optional*, defaults to 0.5):
    The fraction of denoising labels to which random noise should be added.
box_noise_scale (`float`, *optional*, defaults to 1.0):
    Scale or magnitude of noise to be added to the bounding boxes.
learn_initial_query (`bool`, *optional*, defaults to `False`):
    Indicates whether the initial query embeddings for the decoder should be learned during training.
anchor_image_size (`tuple[int, int]`, *optional*):
    Height and width of the input image used during evaluation to generate the bounding box anchors.
with_box_refine (`bool`, *optional*, defaults to `True`):
    Whether to apply iterative bounding box refinement.
matcher_alpha (`float`, *optional*, defaults to 0.25):
    Parameter alpha used by the Hungarian Matcher.
matcher_gamma (`float`, *optional*, defaults to 2.0):
    Parameter gamma used by the Hungarian Matcher.
matcher_class_cost (`float`, *optional*, defaults to 2.0):
    The relative weight of the class loss used by the Hungarian Matcher.
matcher_bbox_cost (`float`, *optional*, defaults to 5.0):
    The relative weight of the bounding box loss used by the Hungarian Matcher.
matcher_giou_cost (`float`, *optional*, defaults to 2.0):
    The relative weight of the giou loss of used by the Hungarian Matcher.
use_focal_loss (`bool`, *optional*, defaults to `True`):
    Parameter informing if focal loss should be used.
focal_loss_alpha (`float`, *optional*, defaults to 0.75):
    Parameter alpha used to compute the focal loss.
focal_loss_gamma (`float`, *optional*, defaults to 2.0):
    Parameter gamma used to compute the focal loss.
weight_loss_vfl (`float`, *optional*, defaults to 1.0):
    Relative weight of the varifocal loss in the object detection loss.
weight_loss_bbox (`float`, *optional*, defaults to 5.0):
    Relative weight of the L1 bounding box loss in the object detection loss.
weight_loss_giou (`float`, *optional*, defaults to 2.0):
    Relative weight of the generalized IoU loss in the object detection loss.
weight_loss_fgl (`float`, *optional*, defaults to 0.15):
    Relative weight of the fine-grained localization loss in the object detection loss.
weight_loss_ddf (`float`, *optional*, defaults to 1.5):
    Relative weight of the decoupled distillation focal loss in the object detection loss.
eval_idx (`int`, *optional*, defaults to -1):
    Index of the decoder layer to use for evaluation.
layer_scale (`float`, *optional*, defaults to `1.0`):
    Scaling factor for the hidden dimension in later decoder layers.
max_num_bins (`int`, *optional*, defaults to 32):
    Maximum number of bins for the distribution-guided bounding box refinement.
reg_scale (`float`, *optional*, defaults to 4.0):
    Scale factor for the regression distribution.
depth_mult (`float`, *optional*, defaults to 1.0):
    Multiplier for the number of blocks in RepNCSPELAN5 layers.
top_prob_values (`int`, *optional*, defaults to 4):
    Number of top probability values to consider from each corner's distribution.
lqe_hidden_dim (`int`, *optional*, defaults to 64):
    Hidden dimension size for the Location Quality Estimator (LQE) network.
lqe_layers (`int`, *optional*, defaults to 2):
    Number of layers in the Location Quality Estimator MLP.
decoder_offset_scale (`float`, *optional*, defaults to 0.5):
    Offset scale used in deformable attention.
decoder_method (`str`, *optional*, defaults to `"default"`):
    The method to use for the decoder: `"default"` or `"discrete"`.
up (`float`, *optional*, defaults to 0.5):
    Controls the upper bounds of the Weighting Function.
weight_loss_mal (`float`, *optional*, defaults to 1.0):
    Relative weight of the matching auxiliary loss in the object detection loss.
use_dense_one_to_one (`bool`, *optional*, defaults to `True`):
    Whether to use dense one-to-one matching across decoder layers.
mal_alpha (`float`, *optional*):
    Alpha parameter for the Matching Auxiliary Loss (MAL). If `None`, uses `focal_loss_alpha`.
encoder_fuse_op (`str`, *optional*, defaults to `"sum"`):
    Fusion operation used in the encoder FPN. DEIMv2 uses `"sum"` instead of D-FINE's `"cat"`.
spatial_tuning_adapter_inplanes (`int`, *optional*, defaults to 16):
    Number of input planes for the STA convolutional stem.
encoder_type (`str`, *optional*, defaults to `"hybrid"`):
    Type of encoder to use. `"hybrid"` uses the full HybridEncoder with AIFI, FPN, and PAN.
    `"lite"` uses the lightweight LiteEncoder with GAP fusion for smaller variants (Atto, Femto, Pico).
use_gateway (`bool`, *optional*, defaults to `True`):
    Whether to use the gateway mechanism (cross-attention gating) in decoder layers. When `False`,
    uses RMSNorm on the encoder attention output instead.
share_bbox_head (`bool`, *optional*, defaults to `False`):
    Whether to share the bounding box prediction head across all decoder layers.
encoder_has_trailing_conv (`bool`, *optional*, defaults to `True`):
    Whether the encoder's CSP blocks include a trailing 3x3 convolution after the bottleneck path.
    `True` for RepNCSPELAN4 (used by HGNetV2 N and LiteEncoder variants).
    `False` for RepNCSPELAN5 (used by DINOv3 variants).
deimv2backbone_configN	eval_size      ?weight_loss_malTuse_dense_one_to_one	mal_alphasumencoder_fuse_op   spatial_tuning_adapter_inplaneshybridencoder_typeuse_gatewayFshare_bbox_headencoder_has_trailing_conv )__name__
__module____qualname____firstlineno____doc__
model_typer   sub_configsr/   listinttuple__annotations__r1   floatr2   boolr3   r5   strr7   r9   r:   r;   r<   __static_attributes__r=       z/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/deimv2/modular_deimv2.pyr+   r+   ;   s    tl J$j1K48ItCy5c?*T18 OU !%$%"Iut|" OS +-#S- L# K!OT!&*t*rM   r+   c                       \ rS rSrSrg)Deimv2DecoderOutput   r=   Nr>   r?   r@   rA   rL   r=   rM   rN   rP   rP          rM   rP   c                       \ rS rSrSrg)Deimv2ModelOutput   r=   NrR   r=   rM   rN   rU   rU      rS   rM   rU   z
    Output type for DEIMv2 encoder modules (HybridEncoder and LiteEncoder).
    Attentions are only available for HybridEncoder variants with AIFI layers.
    )custom_introc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                  S4   S-  \	S'   Sr\\R                  S4   S-  \	S'   Srg)	Deimv2EncoderOutput   zy
feature_maps (`list[torch.FloatTensor]`):
    List of multi-scale feature maps from the encoder, one per feature level.
Nfeature_maps.hidden_states
attentionsr=   )r>   r?   r@   rA   rB   r[   rE   torchFloatTensorrH   r\   rG   r]   rL   r=   rM   rN   rY   rY      s\    
 -1L$u(()0:>M5**C/047>7;Je'',-4;rM   rY   c                       \ rS rSrSrg)Deimv2RMSNorm   r=   NrR   r=   rM   rN   ra   ra      rS   rM   ra   c                   "    \ rS rSrS\4S jrSrg)Deimv2SwiGLUFFN   configc                 p   [         R                  R                  U 5        UR                  S-  n[         R                  " UR
                  USS9U l        [         R                  " UR
                  USS9U l        [         R                  " X!R
                  SS9U l        [         R                  " 5       U l
        g )Nr   T)bias)nnModule__init__decoder_ffn_dimLineard_model	gate_projup_proj	down_projSiLUact_fn)selfrf   hidden_featuress      rN   rk   Deimv2SwiGLUFFN.__init__   sw    
		4  00A56>>?NyytL?NNNggirM   )rs   rq   ro   rp   N)r>   r?   r@   rA   r+   rk   rL   r=   rM   rN   rd   rd      s     |  rM   rd   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )
Deimv2Gate   rn   c                 D   > [         TU ]  U5        [        U5      U l        g N)superrk   ra   norm)rt   rn   	__class__s     rN   rk   Deimv2Gate.__init__   s    !!'*	rM   )r}   )r>   r?   r@   rA   rF   rk   rL   __classcell__r~   s   @rN   rx   rx      s    + + +rM   rx   c                       \ rS rSrSrg)	Deimv2MLP   r=   NrR   r=   rM   rN   r   r      rS   rM   r   c                       \ rS rSrSrg)#Deimv2MultiscaleDeformableAttention   r=   NrR   r=   rM   rN   r   r      rS   rM   r   c                       \ rS rSrSrg)Deimv2ConvNormLayer   r=   NrR   r=   rM   rN   r   r      rS   rM   r   c                       \ rS rSrSrg)Deimv2RepVggBlock   r=   NrR   r=   rM   rN   r   r      rS   rM   r   c                      ^  \ rS rSrSr SS\S\S\S\S\4
U 4S jjjrS	\	R                  S
\	R                  4S jrSrU =r$ )Deimv2CSPRepLayeri  a	  
Cross Stage Partial (CSP) network layer with RepVGG blocks.
Differs from DFineCSPRepLayer: uses a single conv that splits into residual + processing path
(instead of two separate convs), and has an optional trailing conv controlled by `encoder_has_trailing_conv`.
rf   in_channelsout_channels
num_blocks	expansionc           
        > [         T	U ]  5         UR                  n[        X5-  5      n[	        XUS-  SSUS9U l        [        R                  " [        U5       Vs/ s H  n[        XU5      PM     sn5      U l
        UR                  (       a  [	        XUSSUS9U l        g [        R                  " 5       U l        g s  snf )Nr      
activationr   )r|   rk   activation_functionrF   r   conv1ri   
ModuleListranger   bottlenecksr<   Identityconv2)
rt   rf   r   r   r   r   r   hidden_channels_r~   s
            rN   rk   Deimv2CSPRepLayer.__init__  s     	//
l67(oPQ>QSTVWdno
==RWXbRcdRcQvHRcd

 //  q!Xbc 	
  	
 es   B<r\   returnc                     U R                  U5      R                  SSS9u  p!U R                   H  nU" U5      nM     U R                  X!-   5      $ Nr   r   dim)r   chunkr   r   )rt   r\   residual
bottlenecks       rN   forwardDeimv2CSPRepLayer.forward  sO    "&**]";"A"A!"A"K**J&}5M +zz(233rM   )r   r   r   )r0   )r>   r?   r@   rA   rB   r+   rF   rI   rk   r^   Tensorr   rL   r   r   s   @rN   r   r     sa     nq
"
14
DG
UX
ej
 
 4U\\ 4ell 4 4rM   r   c                   v   ^  \ rS rSrSrS
S\S\4U 4S jjjrS\R                  S\R                  4S jr
S	rU =r$ )Deimv2RepNCSPELAN5i  aI  
Rep(VGG) N(etwork) CSP (Cross Stage Partial) ELAN (Efficient Layer Aggregation Network) block.
Similar to DFineRepNCSPELAN4 but without intermediate convolutions between CSP branches,
resulting in a simpler 4-way concatenation (2 split halves + 2 CSP branches) instead of D-FINE's
4-branch design with interleaved convolutions.
rf   numb_blocksc           	      h  > [         TU ]  5         UR                  nUR                  nUR                  nUR                  S-  n[	        UR
                  UR                  -  S-  5      n[        XUSSUS9U l        [        XS-  XrS9U l	        [        XXrS9U l
        [        XSU-  -   USSUS9U l        g )Nr   r   r   )r   )r|   rk   r   encoder_hidden_dimroundhidden_expansionr   r   r   csp_rep1csp_rep2r   )	rt   rf   r   r   r   r   split_channelscsp_channelsr~   s	           rN   rk   Deimv2RepNCSPELAN5.__init__'  s    //
//0022Q6V44v7P7PPTUUV(naQR_ij
)&A2E|l)&e(a,&67q!Xb

rM   r\   r   c                     U R                  U5      R                  SSS9u  p#U R                  U5      nU R                  U5      n[        R
                  " X#XE/SS9nU R                  U5      $ r   )r   r   r   r   r^   catr   )rt   r\   hidden_states_1hidden_states_2hidden_states_3hidden_states_4merged_hidden_statess          rN   r   Deimv2RepNCSPELAN5.forward5  sg    +/::m+D+J+J1RS+J+T(--8--8$yy/O)mstuzz.//rM   )r   r   r   r   )r   )r>   r?   r@   rA   rB   r+   rF   rk   r^   r   r   rL   r   r   s   @rN   r   r     s@    
| 
# 
 
0U\\ 0ell 0 0rM   r   c                       \ rS rSrSrg)Deimv2SCDowni=  r=   NrR   r=   rM   rN   r   r   =  rS   rM   r   c                       \ rS rSrSrg)Deimv2EncoderLayeriA  r=   NrR   r=   rM   rN   r   r   A  rS   rM   r   c                       \ rS rSrSrg)Deimv2AIFILayeriE  r=   NrR   r=   rM   rN   r   r   E  rS   rM   r   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\\R                  \R                  \R                  4   4S jr	Sr
U =r$ )Deimv2SpatialTuningAdapteriI  rf   c           	      ^  > [         TU ]  5         UR                  n[        USUSSSS9U l        [
        R                  " SSSS9U l        [        XSU-  SS5      U l        [        USU-  SU-  SS5      U l	        [        USU-  SU-  SS5      U l
        [
        R                  " 5       U l        g )Nr   r   gelur   r   kernel_sizestridepadding   )r|   rk   r7   r   	stem_convri   	MaxPool2d	stem_poolr   conv3conv4GELUrs   )rt   rf   inplanesr~   s      rN   rk   #Deimv2SpatialTuningAdapter.__init__J  s    99,VQ!QSYZ!AqI(1x<AN
(Xq8|QPQR
(Xq8|QPQR
ggirM   pixel_valuesr   c                     U R                  U R                  U5      5      nU R                  U5      nU R                  U R	                  U5      5      nU R                  U R	                  U5      5      nX4U4$ r{   )r   r   r   r   rs   r   )rt   r   r   r   r   r   s         rN   r   "Deimv2SpatialTuningAdapter.forwardT  s`    ..)EF**_5**T[[%AB**T[[%AB@@rM   )rs   r   r   r   r   r   )r>   r?   r@   rA   r+   rk   r^   r   rG   r   rL   r   r   s   @rN   r   r   I  sJ     |  AELL AU5<<W\WcWc;c5d A ArM   r   feature_map_1feature_map_2fuse_opr   c                 B    US:X  a  X-   $ [         R                  " X/SS9$ )zJFuses two feature maps via element-wise sum or channel-wise concatenation.r4   r   r   )r^   r   )r   r   r   s      rN   fuse_feature_mapsr   \  s'    %,,99m3;;rM   c                   r   ^  \ rS rSrU 4S jrS\R                  S\\   S\	\R                     4S jr
SrU =r$ )Deimv2ConvEncoderic  c                   > [         TU ]  U5        [        R                  " U R                   Vs/ s H@  nUR
                  S:w  a  [        XUR                  SS5      O[        R                  " 5       PMB     sn5      U l	        g s  snf )Nliter   )
r|   rk   ri   r   intermediate_channel_sizesr9   r   r   r   encoder_input_proj)rt   rf   
in_channelr~   s      rN   rk   Deimv2ConvEncoder.__init__d  s     "$--
 #'"A"A	 #BJ &&&0 $F8Q8QSTVWX[[]# #B	#
s   ABr   kwargsr   c                     U R                   " U40 UD6R                  n[        U R                  U5       VVs/ s H  u  pEU" U5      PM     snn$ s  snnf r{   )modelr[   zipr   )rt   r   r   featuresprojfeats         rN   r   Deimv2ConvEncoder.forwardo  sH    ::l5f5BB-01H1H(-ST-SztT
-STTTs   A)r   )r>   r?   r@   rA   rk   r^   r   r	   r
   rE   r   rL   r   r   s   @rN   r   r   c  sC    	
UELL UFCU<V U[_`e`l`l[m U UrM   r   c                   z   ^  \ rS rSrS\4U 4S jjrS\R                  S\\	   S\
\R                     4S jrSrU =r$ )	Deimv2DINOv3ConvEncoderit  rf   c                 f  > [         TU ]  5         [        U5      U l        [	        U5      U l        UR                  R                  nUR                  nUR                  n[        R                  " [        XUS-  -   USS5      [        XUS-  -   USS5      [        XUS-  -   USS5      /5      U l        g )Nr   r   r   )r|   rk   r   backboner   spatial_tuning_adapterr.   hidden_sizer   r7   ri   r   r   fusion_proj)rt   rf   	embed_dim
hidden_dimspatial_tuning_adapter_channelsr~   s        rN   rk    Deimv2DINOv3ConvEncoder.__init__u  s    %f-&@&H#**66	..
*0*P*P'==#F8WZ[8[,[]gijlmn#F8WZ[8[,[]gijlmn#F8WZ[8[,[]gijlmn
rM   r   r   r   c                    U R                   " U40 UD6nUR                  nU R                   R                  R                  nUR                  S   U-  nUR                  S   U-  n/ n[        U5      n	[        U5       H\  u  p[        USU	S-
  U
-
  -  -  5      n[        USU	S-
  U
-
  -  -  5      n[        R                  " XU/SSS9nUR                  U5        M^     U R                  U5      n/ n[        [        X5      5       HD  u  n
u  nn[        R                  " UU/SS9nUR                  U R                  U
   " U5      5        MF     U$ )Nr   r   bilinearF)sizemodealign_cornersr   r   )r   r[   rf   
patch_sizeshapelen	enumeraterF   Finterpolateappendr   r   r^   r   r   )rt   r   r   backbone_outputr[   r   height_patcheswidth_patchessemantic_features
num_scalesir   resize_heightresize_widthspatialdetail_featuresoutputssemantic_featuredetail_featurefuseds                       rN   r   Deimv2DINOv3ConvEncoder.forward  sR   --??&33]]))44
%++A.*<$**1-;&
 .GAzA~7I1J JKM}qZ!^a5G/HHILmmD|/LS]mrsG$$W-	 / 55lC5>sCT?f5g1A1 .II/@aHENN4++A.u56 6h rM   )r   r   r   )r>   r?   r@   rA   r+   rk   r^   r   r	   r
   rE   r   rL   r   r   s   @rN   r   r   t  sE    
| 
"ELL FCU<V [_`e`l`l[m  rM   r   c                       \ rS rSrSrg)Deimv2Integrali  r=   NrR   r=   rM   rN   r  r    rS   rM   r  c                       \ rS rSrSrg)	Deimv2LQEi  r=   NrR   r=   rM   rN   r  r    rS   rM   r  c                   4  ^  \ rS rSrS\4U 4S jjr      SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S	\\	\
\
4      S-  S
\R                  S-  S\R                  S-  S\\   S\R                  4S jjrSrU =r$ )Deimv2DecoderLayeri  rf   c                   > [         TU ]  U5        [        US9U l        [	        UR
                  5      U l        [	        UR
                  5      U l        [        U5      U l	        UR                  U l
        UR                  (       a  [        UR
                  5      OS U l        UR                  (       a  S U l        g [	        UR
                  5      U l        g )Nrf   )r|   rk   r   encoder_attnra   rn   self_attn_layer_normfinal_layer_normrd   mlpr:   rx   gatewayencoder_attn_layer_normrt   rf   r~   s     rN   rk   Deimv2DecoderLayer.__init__  s     ?vN$1&..$A! -fnn ="6*!--5;5G5Gz&..1T/5/A/At$}U[UcUcGd$rM   Nr\   position_embeddingsreference_pointsspatial_shapesspatial_shapes_listencoder_hidden_statesencoder_attention_maskr   r   c                 $   Un	U R                   " SUUUS.UD6u  p[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nUn	Uc  UOX-   nU R                  UUUUUS9u  p[        R                  R                  XR                  U R                  S9nU R                  b  U R                  X5      nOX-   nU R                  U5      nUn	U R                  U5      nX-   nU R                  U5      nU$ )N)r\   attention_maskr&  )ptraining)r\   r*  r'  r(  r)  r=   )	self_attnri   
functionaldropoutr/  r  r  r"  r#  r!  r   )rt   r\   r&  r'  r(  r)  r*  r+  r   r   r   s              rN   r   Deimv2DecoderLayer.forward  s8    !  >> 
'1 3
 	
 --m||VZVcVc-d 011-@  *=)D-Jm,,'"7-) 3 - 
 --m||VZVcVc-d<<# LLAM$4M 88GM !/ 0--m<rM   )r  r#  r   r"  r!  r  r:   )NNNNNN)r>   r?   r@   rA   r+   rk   r^   r   rE   rG   rF   r	   r
   r   rL   r   r   s   @rN   r  r    s    e| e 4804.2<@596:2||2 #\\D02  ,,-	2
 t+2 "%S/2T92  %||d22 !&t 32 +,2 
2 2rM   r  c                   X   ^  \ rS rSr/ SQr\R                  " 5       U 4S j5       rSrU =r	$ )Deimv2PreTrainedModeli  )Deimv2HybridEncoderDeimv2LiteEncoderr  c                 P  > [         TU ]  U5        [        U[        5      (       Ga   [        R
                  " UR                  R                  5        [        R                  " UR                  R                  S5        [        R
                  " UR                  R                  5        [        R                  " UR                  R                  S5        [        R
                  " UR                  R                  5        [        R                  " UR                  R                  S5        g g )Nr   )r|   _init_weights
isinstancerd   initxavier_uniform_ro   weight	constant_rh   rp   rq   )rt   moduler~   s     rN   r9  #Deimv2PreTrainedModel._init_weights  s    f%fo..  !1!1!8!89NN6++00!4  !6!67NN6>>..2  !1!1!8!89NN6++00!4 /rM   r=   )
r>   r?   r@   rA   _no_split_modulesr^   no_gradr9  rL   r   r   s   @rN   r5  r5    s!    ]
]]_	5 	5rM   r5  c                      ^  \ rS rSrS\" \SS9\" \SS9/0rS\4U 4S jjr\	\
S\\R                     S	\\   S
\4S j5       5       rSrU =r$ )r7  i  r\   
input_proj)
layer_namebi_fusion_convrf   c                 B  > [         TU ]  U5        UR                  nUR                  n[        R
                  " UR                   Vs/ s H  n[        XUSS5      PM     sn5      U l        [        R                  " SSSS9U l
        [        XUSSUS9U l        [        R                  " SSSS9U l        [        XUSSUS9U l        [        XUSSUS9U l        [        SUR                   -  5      n[#        XS9U l        [#        XS9U l        U R)                  5         g s  snf )Nr   r   r   r   r   r   )r|   rk   r   r   ri   r   encoder_in_channelsr   rD  	AvgPool2d
down_pool1
down_conv1
down_pool2
down_conv2rF  r   
depth_multr   	fpn_block	pan_block	post_init)rt   rf   r   r   r   r   r~   s         rN   rk   Deimv2LiteEncoder.__init__  s    ..
//
--Y_YsYstYs: ZAFYst
 ,,1QJ-f*aQR_ij,,1QJ-f*aQR_ij1&jRSUVcmn1v0001
+FK+FK us   Dinputs_embedsr   r   c                 `   [        U5       VVs/ s H  u  p4U R                  U   " U5      PM     nnnUR                  U R                  U R	                  US   5      5      5        U R                  US   [        R                  " US   S5      -   5      US'   / nUS   [        R                  " US   SSS9-   nUR                  U R                  U5      5        US   U R                  U R                  US   5      5      -   nUR                  U R                  U5      5        [        US9$ s  snnf )Nr   r          @nearestscale_factorr   r[   )r  rD  r  rL  rK  rF  r  adaptive_avg_pool2dr  rP  rN  rM  rQ  rY   )rt   rT  r   r  featureprojected_featuresr  fused_features           rN   r   Deimv2LiteEncoder.forward  s     MVVcLdeLdjadooa09Lde!!$//$//BTUWBX2Y"Z[!%!4!4r"Q%:%:;Mb;QST%UU"
2 *1->PQR>Sbelu0vvt~~m45*1-PWXZP[@\0]]t~~m45"88 fs   !D*)rF  rL  rN  rK  rM  rP  rD  rQ  )r>   r?   r@   rA   r   r   _can_record_outputsr+   rk   r   r   rE   r^   r   r	   r
   rY   r   rL   r   r   s   @rN   r7  r7    sy     	.<H.;KL
| ,  9T%,,%7 96J\C] 9bu 9   9rM   r7  c                   h    \ rS rSrSrS\4S jr SS\\R                     S-  S\
\   S\4S	 jjrS
rg)r6  i+  a5  
DEIMv2 variant of DFineHybridEncoder. Uses element-wise sum fusion (`fuse_feature_maps`) instead of
D-FINE's channel concatenation, Deimv2RepNCSPELAN5 (simplified 4-way concat) instead of DFineRepNCSPELAN4,
and returns Deimv2EncoderOutput with feature_maps instead of BaseModelOutput with last_hidden_state.
rf   c           
         [         R                  X5        Xl        UR                  U l        [        U R                  5      S-
  U l        UR                  U l        UR                  U l        UR                  U l	        UR                  U l
        UR                  U l        U R                   Vs/ s H  o R                  PM     snU l        U R                  U l        UR                  U l        [         R"                  " [%        [        U R                  5      5       Vs/ s H  n['        U5      PM     sn5      U l        [         R"                  " 5       U l        [         R"                  " 5       U l        [%        [        U R                  5      S-
  SS5       Hx  nU R*                  R/                  [1        XR                  U R                  SS5      5        [3        SUR4                  -  5      nU R,                  R/                  [7        XS95        Mz     [         R"                  " 5       U l        [         R"                  " 5       U l        [%        [        U R                  5      S-
  5       Hc  nU R8                  R/                  [=        USS5      5        [3        SUR4                  -  5      nU R:                  R/                  [7        XS95        Me     U R?                  5         g s  snf s  snf )Nr   r   rV  r   rH  r   ) r5  rk   rf   rI  r   r  num_fpn_stagesfeat_stridesr   encode_proj_layerspositional_encoding_temperaturer/   r   out_stridesr5   r   ri   r   r   r   aifilateral_convs
fpn_blocksr  r   r   rO  r   downsample_convs
pan_blocksr   rR  )rt   rf   r   r   s       rN   rk   Deimv2HybridEncoder.__init__2  s/   &&t4!55!$"2"23a7"//"(";";"(";";/5/U/U,))>B>N>NO>N44>NO,,--MME#dNeNeJfDg"hDgq?6#:Dg"hi	]]_--/s4++,q0!R8A%%#F,C,CTE\E\^_abc q6#4#445JOO""#5f#UV 9 !#--/s4++,q01A!!((fa)CDq6#4#445JOO""#5f#UV 2
 	- P #is   -KK"NrT  r   r   c                 &   UnU R                   R                  S:  a8  [        U R                  5       H  u  pEU R                  U   " X5   40 UD6X5'   M!     US   /n[        [        U R                  U R                  5      5       Hr  u  nu  pX0R                  U-
  S-
     n
US   nU" U5      nXS'   [        R                  " USSS9n[        XU R                  5      nU	" U5      nUR                  U5        Mt     UR                  5         US   /n[        [        U R                  U R                   5      5       HM  u  nu  nnUS   nXgS-      nU" U5      n[        UUU R                  5      nU" U5      nUR                  U5        MO     [#        US9$ )z
Args:
    inputs_embeds (`list[torch.FloatTensor]`):
        Multi-scale feature maps from the backbone (one tensor per feature level) passed to the encoder.
r   rV  r   rW  rX  rY  r[  )rf   encoder_layersr  rf  ri  r   rj  rk  rd  r  r  r   r   r  reverserl  rm  rY   )rt   rT  r   r[   r  enc_indfpn_feature_mapsidxlateral_convrP  backbone_feature_maptop_fpn_feature_mapfused_feature_mapnew_fpn_feature_mappan_feature_mapsdownsample_convrQ  top_pan_feature_mapfpn_feature_mapdownsampled_feature_mapnew_pan_feature_maps                        rN   r   Deimv2HybridEncoder.forwardT  s    %;;%%)'(?(?@
(,		!\5J(Uf(U% A ),-.7D<N<NPTP_P_8`.a*C*,#/0C0Cc0IA0M#N "22"6"./B"C#6R "#--0CRU\e"f 12E]a]i]i j"+,=">##$78 /b 	  " -Q/01:3t?T?TVZVeVe;f1g-C-/9"22"6.Qw7O&56I&J# 12I?\`\h\h i"+,=">##$78 2h #0@AArM   )ri  rf   rl  rf  r   r/   re  rk  r   r   rj  rd  r   rh  rm  rg  r{   )r>   r?   r@   rA   rB   r+   rk   rE   r^   r   r	   r
   rY   r   rL   r=   rM   rN   r6  r6  +  sX     |  H 48(BELL)D0(B +,(B 
	(B (BrM   r6  c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )Deimv2Decoderi  rf   c                    > [         TU ]  US9  [        SUR                  UR                  SUR                  5      U l        g )Nr  r   r   )r|   rk   r   rn   decoder_activation_functionquery_pos_headr$  s     rN   rk   Deimv2Decoder.__init__  s6    ''6>>6>>1fNpNpqrM   )r  )r>   r?   r@   rA   r+   rk   rL   r   r   s   @rN   r  r    s    r| r rrM   r  c                       \ rS rSrS\4S jr    SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S	\	\
   S-  S
\\   4S jjrSrg)Deimv2Modeli  rf   c           
         [         R                  X5        [        UR                  SS 5      S:H  nU(       a  [	        U5      O
[        U5      U l        UR                  S:X  a  [        U5      O[        US9U l
        UR                  S:  a<  [        R                  " UR                  S-   UR                  UR                  S9U l        UR"                  (       a0  [        R                  " UR$                  UR                  5      U l        [        R(                  " [        R*                  " UR                  UR                  5      [        R,                  " UR                  UR.                  S95      U l        [        R*                  " UR                  UR                  5      U l        [5        UR                  UR                  S	S
5      U l        UR8                  (       a&  U R;                  U R<                  S9u  U l        U l         [C        URD                  5      n/ nURD                  S   n[G        U5       H\  nURI                  URJ                  URD                  S   :X  a  [        RL                  " 5       O[O        XUR                  SS5      5        M^     [G        URP                  U-
  5       H\  nURI                  URJ                  URD                  S   :X  a  [        RL                  " 5       O[O        XUR                  S
S5      5        M^     [        RR                  " U5      U l*        [W        U5      U l,        U R[                  5         g )NrC   
dinov3_vitr   r  r   r   )padding_idx)epsr   r   )dtyperV  r   ).r5  rk   getattrr.   r   r   conv_encoderr9   r7  r6  encodernum_denoisingri   	Embedding
num_labelsrn   denoising_class_embedlearn_initial_querynum_queriesweight_embedding
Sequentialrm   	LayerNormlayer_norm_eps
enc_outputenc_score_headr   enc_bbox_headanchor_image_sizegenerate_anchorsr  anchors
valid_maskr  decoder_in_channelsr   r  r   r   r   num_feature_levelsr   decoder_input_projr  decoderrR  )rt   rf   	is_dinov3num_backbone_outsr  r   r   s          rN   rk   Deimv2Model.__init__  sw   &&t4F22L$G<W	?H3F;N_`fNg)/)<)<)Ff%L_gmLn 	 !#)+!!A%v~~6CTCT*D& %%$&LL1C1CV^^$TD!--IIfnnfnn5LLV-B-BC
 !ii8I8IJ&v~~v~~q!L##,0,A,A

,A,S)DL$/ : :;004()A%%%%)C)CB)GG (fnnaQRS * v003DDEA%%%%)C)CB)GG (fnnaQRS F #%--0B"C$V,rM   Nr   
pixel_maskencoder_outputsrT  labelsr   c                    Uc  Uc  [        S5      eUcG  UR                  u  pxpUR                  nUc  [        R                  " XyU
4US9nU R                  U5      nOUR                  S   nUR                  nUnU R                  " U40 UD6n/ n[        UR                  5       H)  u  pUR                  U R                  U   " U5      5        M+     U R                  R                  [        U5      :  a  UR                  U R                  [        U5         " UR                  S   5      5        [        [        U5      U R                  R                  5       H4  nUR                  U R                  U   " UR                  S   5      5        M6     / n/ n[        R                  " [        U5      S4U[        R                   S9n[        U5       Hh  u  pUR                  SS  u  pU	UUS4'   U
UUS4'   UR                  X45        UR#                  S5      R%                  SS5      nUR                  U5        Mj     [        R&                  " US5      n[        R&                  " UR)                  S	5      UR+                  S5      R-                  S5      S S 45      nU R.                  (       a  U R                  R0                  S:  a  Ub  [3        UU R                  R4                  U R                  R6                  U R8                  U R                  R0                  U R                  R:                  U R                  R<                  S
9u  nnnnOSu  nnnn[        U5      nUR                  nUR>                  nU R.                  (       d  U R                  R@                  c   [C        U5      nU RE                  UUUS9u  nnO<U RF                  U RH                  nnURK                  UU5      URK                  UU5      nnURK                  UR>                  5      U-  nU RM                  U5      nU RO                  U5      nU RQ                  U5      U-   n [        RR                  " URU                  S5      RV                  U R                  R6                  SS9u  n!n"U RY                  SU"R[                  S5      R]                  SSU R                  S   5      S9n#[^        R`                  " U#5      n$Ub  [        Rb                  " UU#/S5      n#URY                  SU"R[                  S5      R]                  SSUR                  S   5      S9n%U R                  Rd                  (       a  U Rf                  Ri                  USS/5      n&OMURY                  SU"R[                  S5      R]                  SSUR                  S   5      S9n&U&Rk                  5       n&Ub  [        Rb                  " UU&/S5      n&U#Rk                  5       n'U Rl                  " S!U&UUU'UUUS.UD6n([o        S!0 SU(Rp                  _SU(Rr                  _SU(Rt                  _SU(Rv                  _SU(Rx                  _SU(Rz                  _SU(R|                  _SU(R~                  _SU(R                  _SUR                  _SUR|                  _SUR~                  _SU'_SU%_SU$_SU_SU _S U_6$ )"Nz8You have to specify either pixel_values or inputs_embeds)devicer   rV  r   )r  r  r   )r   )targetsnum_classesr  class_embednum_denoising_querieslabel_noise_ratiobox_noise_scaleNNNNr   )r   index)rT  r*  r+  r'  r(  r)  level_start_indexlast_hidden_stateintermediate_hidden_statesintermediate_logitsintermediate_reference_pointsintermediate_predicted_cornersinitial_reference_pointsdecoder_hidden_statesdecoder_attentionscross_attentionsencoder_last_hidden_stater*  encoder_attentionsinit_reference_pointsenc_topk_logitsenc_topk_bboxesenc_outputs_classenc_outputs_coord_logitsdenoising_meta_valuesr=   )A
ValueErrorr  r  r^   onesr  r  r  r[   r  r  rf   r  r  r   emptylongflatten	transposer   	new_zerosprodcumsumr/  r  r&   r  r  r  r  r  r  r  rG   r  r  r  tor  r  r  topkmaxvaluesgather	unsqueezerepeatr  sigmoidconcatr  r  tiledetachr  rU   r  r  r  r  r  r  r\   r]   r  ))rt   r   r  r  rT  r  r   
batch_sizenum_channelsheightwidthr  
proj_featssourceslevelsourcer  source_flattenr)  r(  r  denoising_classdenoising_bbox_unactr-  r  r  spatial_shapes_tupler  r  memoryoutput_memoryr  r  r   topk_indreference_points_unactr  r  targetr  decoder_outputss)                                            rN   r   Deimv2Model.forward  s    M$9WXX  6B6H6H3Jf!((F!"ZZ*e)DfU
 **<8J&,,Q/J"))F&J,,

 &'C'CDMENN42259&AB E ;;))CL8NN4223w<@A]A]^`Aabc3w<)G)GHt66q9/:V:VWY:Z[\ I  c'lA%6vUZZX&w/ME"LL-MF'-N5!8$',N5!8$&&7^^A&00A6F!!&) 0 >15!II~'?'?'E~GZGZ[\G]GdGdefGghkikGl&mn ==T[[66:v?Q 9 KK22 KK33 66&*kk&?&?"&++"?"? $ ; ;$% \rXO1>CX(
&&$$ ==DKK99A $))<#= "&"7"78LU[ch"7"iGZ"&,,ZG")**VU";Z]]6SX=YZG ~334~E/ //>#'#5#5m#Dw#N jj!2!6!6r!:!A!A4;;CZCZ`ab8!9!@!@++B/66q!=U=[=[\^=_` "A "
 ))$:;+%*\\3GI_2`bc%d"+22++B/66q!=N=T=TUW=XY 3 

 ;;****//Q0BCF"))ax7I7I"7M7T7TUVXY[h[n[noq[r7s)tF]]_F&\\?F";Q?F 6 = = ? ,, 	
 "0#12) 3/	
 	
 ! 
-??
'6'Q'Q
 !0 C C
 +:*W*W	

 ,;+Y+Y
 &5%M%M
 #2"?"?
  /99
 -==
 '6&B&B
 #2"?"?
  /99
 #8
 ,
 ,
  0!
" &>#
$ #8%
 	
rM   )r  r  r  r  r  r  r  r  r  r  r  r  )r>   r?   r@   rA   r+   rk   r^   r_   
LongTensorrE   dictr	   r
   r   rL   r=   rM   rN   r  r    s    -| -d /34826$(]
'']
 $$t+]
 **T1	]

 ((4/]
 T
T!]
 +,]
 ]
rM   r  c                   T   ^  \ rS rSr\" 5       r\S 5       rS\4S jr	U 4S jr
SrU =r$ )Deimv2ForObjectDetectioniU  c                 \    SSSS.nU R                   R                  (       a
  SUS'   SUS'   U$ )	Nz^class_embed.0zmodel.decoder.class_embedzmodel.decoder.bbox_embed)zclass_embed.(?![0])\d+r  
bbox_embedzmodel.decoder.bbox_embed.0z&model\.decoder\.bbox_embed\.(?![0])\d+zbbox_embed.0zbbox_embed.(?![0])\d+)rf   r;   )rt   keyss     rN   _tied_weights_keys+Deimv2ForObjectDetection._tied_weights_keysX  s>     (964

 ;;&&>[D:;-<D)*rM   rf   c                    [         R                  X5        UR                  S:  a  UR                  OUR                  UR                  -   U l        [	        U5      U l        [        UR                  UR                  -  5      nUR                  n[        R                  " [        U5       Vs/ s H.  n[        R                  " UR                  UR                  5      PM0     sn5      U l        UR                   (       aR  [#        UR                  UR                  SUR$                  S-   -  S5      n[        R                  " U/U-  5      U l        O[        R                  " [        U R                  S-   5       Vs/ s H5  n[#        UR                  UR                  SUR$                  S-   -  S5      PM7     sn[        UR                  U R                  -
  S-
  5       Vs/ s H   n[#        X"SUR$                  S-   -  S5      PM"     sn-   5      U l        U R                  U R
                  R(                  l        U R&                  U R
                  R(                  l        U R+                  5         g s  snf s  snf s  snf )Nr   r   r   r   )r5  rk   eval_idxdecoder_layersr  r   r   layer_scaler   ri   r   r   rm   rn   r  r  r;   r   max_num_binsr  r  rR  )rt   rf   
scaled_dimnum_predr   shared_bboxs         rN   rk   !Deimv2ForObjectDetection.__init__d  s   &&t4+1??a+?VEZEZ]c]l]lEl (
6--0B0BBC
((==`efn`o)p`o[\"))FNNFDUDU*V`o)pq!!#F$6$68J8JAQWQdQdghQhLiklmK mm[MH,DEDO mm #4==1#455 f00&2D2Da6K^K^abKbFcefg5 #6#8#84==#H1#LMM ja6;N;NQR;R6SUVWM		DO *.)9)9

&(,

%% *qs   +5I.9<I3'I8
c                  :   > [        5       R                  " S0 U D6  g)a_  
Example:

```python
>>> import torch
>>> from transformers.image_utils import load_image
>>> from transformers import AutoImageProcessor, Deimv2ForObjectDetection

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> image_processor = AutoImageProcessor.from_pretrained("harshaljanjani/DEIMv2_HGNetv2_N_COCO_Transformers")
>>> model = Deimv2ForObjectDetection.from_pretrained("harshaljanjani/DEIMv2_HGNetv2_N_COCO_Transformers")

>>> # prepare image for the model
>>> inputs = image_processor(images=image, return_tensors="pt")

>>> # forward pass
>>> outputs = model(**inputs)

>>> logits = outputs.logits
>>> list(logits.shape)
[1, 300, 80]

>>> boxes = outputs.pred_boxes
>>> list(boxes.shape)
[1, 300, 4]

>>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> target_sizes = torch.tensor([image.size[::-1]])
>>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)
>>> result = results[0]  # first image in batch

>>> for score, label, box in zip(result["scores"], result["labels"], result["boxes"]):
...     box = [round(i, 2) for i in box.tolist()]
...     print(
...         f"Detected {model.config.id2label[label.item()]} with confidence "
...         f"{round(score.item(), 3)} at location {box}"
...     )
```
Nr=   )r|   r   )super_kwargsr~   s    rN   r    Deimv2ForObjectDetection.forward  s    T 	','rM   )r  r  r  r   )r>   r?   r@   rA   AttributeErrorrA  propertyr  r+   rk   r   rL   r   r   s   @rN   r  r  U  s4    &(	 	| 6*( *(rM   r  )r+   r  r5  r  )r4   )\dataclassesr   r^   torch.nnri   torch.nn.functionalr1  r  huggingface_hub.dataclassesr    r   r;  backbone_utilsr   modeling_outputsr   processing_utilsr	   utilsr
   r   r   utils.genericr   utils.output_capturingr   r   autor   d_fine.configuration_d_finer   d_fine.modeling_d_finer   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   llama.modeling_llamar'   r(   
get_loggerr>   loggerr+   rP   rU   rY   ra   rd   rx   r   r   r   r   rj   r   r   r   r   r   r   r   rK   r   r   r   r  r  r  r5  r7  r6  r  r  r  __all__r=   rM   rN   <module>r     sg   "     . & + + & @ @ 7 E  5     , : 
		H	% ?@C+; C+  AC+L	, 		( 	 <+ < <	L 	 h  + +	 		*L 		, 		( 	4		 4<0 0<	; 		* 		n 	A A&<U\\ <%,, <Y\ <iniuiu <U( U")bii )X	] 		 	=* =@50 5 09- 09fQB, QBhrL rM
* M
`T(6 T(nrM   