
    Z j                        S SK rS SKrS SK Jr  S SKJr  S SKrS SKrS SK	J
s  Jr  S SKJrJ
r
  SSKJr  SSKJr  SSKJrJrJr  SS	KJr  SS
KJrJr  SSKJr  SSKJrJ r J!r!  SSK"J#r#  SSK$J%r%  SSK&J'r'  \" 5       (       a  S SK(J)r)  \!" 5       (       a  S SK*J+r+  S SK,J-r-  \\ " SS9 " S S\5      5       5       r. SPS\R                  S\R                  S\R                  4S jjr/S\S\S\4S jr0S\R                  S\R                  S\R                  4S jr1 " S  S!\
Rd                  5      r3S\S\S"\4S\4S# jr5S\R                  S\R                  S"\4S\R                  4S$ jr6 " S% S&\
Rd                  5      r7 " S' S(\
Rd                  5      r8 " S) S*\
Rd                  5      r9 SQS+\
Rd                  S,\R                  S-\R                  S.\R                  S/\R                  S-  S0\:S1\:4S2 jjr; " S3 S4\
Rd                  5      r< " S5 S6\
Rd                  5      r=SRS7\R                  S8\:S9\>S\R                  4S: jjr? " S; S<\
Rd                  5      r@ " S= S>\
Rd                  5      rA " S? S@\
Rd                  5      rB " SA SB\5      rC " SC SD\
R                  5      rE " SE SF\
Rd                  5      rF " SG SH\
Rd                  5      rG " SI SJ\
Rd                  5      rH\  " SK SL\5      5       rI\ " SMS9 " SN SO\I5      5       rJSLSO/rKg)S    N)Callable)	dataclass)Tensornn   )initialization)ACT2FN)ModelOutputis_scipy_availablerequires_backends)GradientCheckpointingLayer)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringis_accelerate_available)merge_with_config_defaults)capture_outputs   )
EomtConfig)linear_sum_assignment)PartialState)reducea  
    Class for outputs of [`EomtForUniversalSegmentationOutput`].

    This output can be directly passed to [`~EomtImageProcessor.post_process_semantic_segmentation`] or
    [`~EomtImageProcessor.post_process_instance_segmentation`] or
    [`~EomtImageProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please, see
    [`~EomtImageProcessor] for details regarding usage.
    )custom_introc                   D   \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\R                  S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S	'   Sr\\R"                     S-  \S
'   Srg)"EomtForUniversalSegmentationOutput3   a  
loss (`torch.Tensor`, *optional*):
    The computed loss, returned when labels are present.
class_queries_logits (`torch.FloatTensor`):
    A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
    query. Note the `+ 1` is needed because we incorporate the null class.
masks_queries_logits (`torch.FloatTensor`):
    A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
    query.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
    Last hidden states (final feature map) of the last layer.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
    shape `(batch_size, sequence_length, hidden_size)`. Hidden-states all layers of the model.
attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`. Self and Cross Attentions weights from transformer decoder.
patch_offsets (`list[torch.Tensor]`, *optional*):
    list of tuples indicating the image index and start and end positions of patches for semantic segmentation.
Nlossclass_queries_logitsmasks_queries_logitslast_hidden_statehidden_states
attentionspatch_offsets )__name__
__module____qualname____firstlineno____doc__r   torchFloatTensor__annotations__r    r!   r"   r#   tupler$   r%   listr   __static_attributes__r&       w/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/eomt/modeling_eomt.pyr   r   3   s    * &*D%

d
")59%++d2959%++d2926u((4/659M5**+d2926Je''(4/6/3M4%,3r2   r   input_featurespoint_coordinatesreturnc                     UR                  5       S:X  a  SnUR                  S5      n[        R                  R                  R
                  " U SU-  S-
  40 UD6nU(       a  UR                  S5      nU$ )a  
A wrapper around `torch.nn.functional.grid_sample` to support 3D point_coordinates tensors.

Args:
    input_features (`torch.Tensor` of shape (batch_size, channels, height, width)):
        A tensor that contains features map on a height * width grid
    point_coordinates (`torch.Tensor` of shape (batch_size, num_points, 2) or (batch_size, grid_height, grid_width,:
    2)):
        A tensor that contains [0, 1] * [0, 1] normalized point coordinates
    add_dim (`bool`):
        boolean value to keep track of added dimension

Returns:
    point_features (`torch.Tensor` of shape (batch_size, channels, num_points) or (batch_size, channels,
    height_grid, width_grid):
        A tensor that contains features for points in `point_coordinates`.
r   T   g       @      ?)dim	unsqueezer,   r   
functionalgrid_samplesqueeze)r4   r5   add_dimkwargspoint_featuress        r3   sample_pointrB   ^   st    ( !#-77: XX((44^SK\E\_bEbmflmN'//2r2   inputslabelsc                    U R                  5       R                  S5      n S[        R                  " XR                  5      -  nU R                  S5      SS2S4   UR                  S5      SSS24   -   nSUS-   US-   -  -
  nU$ )a  
A pair wise version of the dice loss, see `dice_loss` for usage.

Args:
    inputs (`torch.Tensor`):
        A tensor representing a mask
    labels (`torch.Tensor`):
        A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
        (0 for the negative class and 1 for the positive class).

Returns:
    `torch.Tensor`: The computed loss between each pairs.
r   r8   N)sigmoidflattenr,   matmulTsum)rC   rD   	numeratordenominatorr   s        r3   pair_wise_dice_lossrN   ~   sz     ^^%%a(FELL22I**R.D)FJJrN47,CCK	A+/22DKr2   c                 Z   U R                   S   n[        R                  " SS9nU" U [        R                  " U 5      5      nU" U [        R
                  " U 5      5      n[        R                  " XB-  UR                  5      n[        R                  " XR-  SU-
  R                  5      nXg-   nU$ )a  
A pair wise version of the cross entropy loss, see `sigmoid_cross_entropy_loss` for usage.

Args:
    inputs (`torch.Tensor`):
        A tensor representing a mask.
    labels (`torch.Tensor`):
        A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
        (0 for the negative class and 1 for the positive class).

Returns:
    loss (`torch.Tensor`): The computed loss between each pairs.
r   none	reduction)shaper   BCEWithLogitsLossr,   	ones_like
zeros_likerI   rJ   )	rC   rD   height_and_width	criterioncross_entropy_loss_poscross_entropy_loss_negloss_posloss_negr   s	            r3   $pair_wise_sigmoid_cross_entropy_lossr]      s     ||A$$v6I&vuv/FG&vu/?/?/GH||2EvxxPH||2EF
~~VHDKr2   c                      ^  \ rS rSrSr SS\S\S\S\4U 4S jjjr\R                  " 5       S\R                  S	\R                  S
\R                  S\R                  S\\\
      4
S j5       rSrU =r$ )EomtHungarianMatcher   aa  This class computes an assignment between the labels and the predictions of the network.

For efficiency reasons, the labels don't include the no_object. Because of this, in general, there are more
predictions than labels. In this case, we do a 1-to-1 matching of the best predictions, while the others are
un-matched (and thus treated as non-objects).

cost_class	cost_mask	cost_dice
num_pointsc                    > [         TU ]  5         US:X  a  US:X  a  US:X  a  [        S5      eX@l        Xl        X l        X0l        g)a  Creates the matcher

Params:
    cost_class (`float`, *optional*, defaults to 1.0):
        Relative weight of the classification error in the matching cost.
    cost_mask (`float`, *optional*,  defaults to 1.0):
        This is the relative weight of the focal loss of the binary mask in the matching cost.
    cost_dice (`float`, *optional*, defaults to 1.0):
        This is the relative weight of the dice loss of the binary mask in the matching cost.
    num_points (`int`, *optional*, defaults to 12544):
        No. of points to sample on which the mask loss will be calculated. The same set of K points are
        uniformly sampled for all prediction and ground truth masks to construct the cost matrix for bipartite
        matching.
r   zAll costs can't be 0N)super__init__
ValueErrorrd   ra   rb   rc   )selfra   rb   rc   rd   	__class__s        r3   rg   EomtHungarianMatcher.__init__   sC    " 	?yA~)q.344$$""r2   r!   r    mask_labelsclass_labelsr6   c           
         / nUR                   S   n[        U5       GH  nX'   R                  S5      nX   n	USS2XG   4   * n
X7   R                  U	5      nUSS2S4   nU	SS2S4   n	[        R
                  " SU R                  SU	R                  S9nUR                  UR                   S   SS5      n[        XSS9R                  S5      nUR                  U	R                   S   SS5      n[        XSS9R                  S5      n	[        X5      n[        X5      nU R                  U-  U R                  U
-  -   U R                  U-  -   n[        R                   " U[        R"                  " S	5      5      n[        R$                  " U[        R"                  " S
5      5      n[        R&                  " US5      n[)        UR+                  5       5      nUR-                  U5        GM     U VVs/ s HL  u  nn[        R.                  " U[        R0                  S9[        R.                  " U[        R0                  S94PMN     nnnU$ s  snnf )a  
Params:
    masks_queries_logits (`torch.Tensor`):
        A tensor of dim `batch_size, num_queries, num_labels` with the classification logits.
    class_queries_logits (`torch.Tensor`):
        A tensor of dim `batch_size, num_queries, height, width` with the predicted masks.
    class_labels (`torch.Tensor`):
        A tensor of dim `num_target_boxes` (where num_target_boxes is the number of ground-truth objects in the
        target) containing the class labels.
    mask_labels (`torch.Tensor`):
        A tensor of dim `num_target_boxes, height, width` containing the target masks.

Returns:
    matched_indices (`list[tuple[Tensor]]`): A list of size batch_size, containing tuples of (index_i, index_j)
    where:
        - index_i is the indices of the selected predictions (in order)
        - index_j is the indices of the corresponding selected labels (in order)
    For each batch element, it holds:
        len(index_i) = len(index_j) = min(num_queries, num_target_boxes).
r   rF   Nr   r8   deviceFalign_cornersg    _Bg    _©dtype)rS   rangesoftmaxtor,   randrd   rp   repeatrB   r>   r]   rN   rb   ra   rc   minimumtensormaximum
nan_to_numr   cpuappend	as_tensorint64)ri   r!   r    rl   rm   indices
batch_sizei
pred_probs	pred_maskra   target_maskr5   target_coordinatespred_coordinatesrb   rc   cost_matrixassigned_indicesjmatched_indicess                        r3   forwardEomtHungarianMatcher.forward   s   8 *, *//2
z"A-088<J,/I %Q%788J%.++I6K%ag.K!!T'*I !&

1dooqIYIY Z!2!9!9+:K:KA:NPQST!U&{V[\ddefgK077	8JAqQ$YPUV^^_`aI =YTI+ICI..94t7SSVZVdVdgpVppK--U\\$5GHK--U\\%5HIK**;:K0EkooFW0XNN+,? #F ho
gn_c_`bcU__Qekk2EOOAU[[4YZgn 	 
 
s   ,AIra   rc   rb   rd   )r9   r9   r9   i 1  )r'   r(   r)   r*   r+   floatintrg   r,   no_gradr   r0   r/   r   r1   __classcell__rj   s   @r3   r_   r_      s     jo##27#JO#cf# #4 ]]_D#llD $llD \\	D
 llD 
eFm	D Dr2   r_   	num_masksc                     U R                  5       R                  S5      nSX1-  R                  S5      -  nUR                  S5      UR                  S5      -   nSUS-   US-   -  -
  nUR                  5       U-  nU$ )a  
Compute the DICE loss, similar to generalized IOU for masks as follows:

$$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x \cap y }{x \cup y + 1}} $$

In practice, since `labels` is a binary mask, (only 0s and 1s), dice can be computed as follow

$$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x * y }{x + y + 1}} $$

Args:
    inputs (`torch.Tensor`):
        A tensor representing a mask.
    labels (`torch.Tensor`):
        A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
        (0 for the negative class and 1 for the positive class).
    num_masks (`int`):
        The number of masks present in the current batch, used for normalization.

Returns:
    `torch.Tensor`: The computed loss.
r   r8   rF   )rG   rH   rK   )rC   rD   r   probsrL   rM   r   s          r3   	dice_lossr     sx    , NN$$Q'EU^((,,I))B-&**R.0K	A+/22D88:	!DKr2   c                     [         R                  " SS9nU" X5      nUR                  S5      R                  5       U-  nU$ )aX  
Args:
    inputs (`torch.Tensor`):
        A float tensor of arbitrary shape.
    labels (`torch.Tensor`):
        A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
        (0 for the negative class and 1 for the positive class).

Returns:
    loss (`torch.Tensor`): The computed loss.
rP   rQ   r   )r   rT   meanrK   )rC   rD   r   rX   cross_entropy_lossr   s         r3   sigmoid_cross_entropy_lossr   8  sB     $$v6I"62""1%))+i7DKr2   c                     ^  \ rS rSrS\S\\\4   4U 4S jjrS\	\	\
      S\	\
   4S jrS\	\   S\\\4   4S	 jrS
\S\	\   S\\R                      S\\\4   4S jrS\R                  S\	\R                     S\\R                      S\
S\\\R                  4   4
S jrS rS rS\R                  S\R                  4S jrS\R                  S\
S\
S\S\R                  4
S jr S S\R                  S
\R                  S\	\R                     S\	\R                     S\\\R                  4   S-  S\\\R                  4   4S jjrS\R                  S\R2                  S\R                  4S jrSrU =r$ )!EomtLossiL  configweight_dictc                   > [         TU ]  5         [        U S/5        UR                  U l        X l        UR
                  U l        [        R                  " U R                  S-   5      nU R                  US'   U R                  SU5        UR                  U l        UR                  U l        UR                  U l        [        UR                  UR                   UR"                  U R                  S9U l        g)a   
The Eomt Loss. The loss is computed very similar to DETR. The process happens in two steps: 1) we
compute hungarian assignment between ground truth masks and the outputs of the model 2) we supervise each pair
of matched ground-truth / prediction (supervise class and mask)

Args:
    config (`EomtConfig`):
        The configuration for Eomt model also containing loss calculation specific parameters.
    weight_dict (`dict[str, float]`):
        A dictionary of weights to be applied to the different losses.
scipyr   rF   empty_weightr   N)rf   rg   r   
num_labelsr   no_object_weighteos_coefr,   onesregister_buffertrain_num_pointsrd   oversample_ratioimportance_sample_ratior_   class_weightdice_weightmask_weightmatcher)ri   r   r   r   rj   s       r3   rg   EomtLoss.__init__M  s     	$	* ++& //zz$//A"56==R^\: !11 & 7 7'-'E'E$+**((((	
r2   sizesr6   c                 p    US   nUSS   H'  n[        U5       H  u  pE[        X$   U5      X$'   M     M)     U$ )Nr   r   )	enumeratemax)ri   r   maxessublistindexitems         r3   _max_by_axisEomtLoss._max_by_axisp  sC    aQRyG(1"5<6  2 ! r2   tensorsc                 T   U R                  U Vs/ s H  n[        UR                  5      PM     sn5      n[        U5      /U-   nUu  pVpxUS   R                  n	US   R
                  n
[        R                  " XIU
S9n[        R                  " XWU4[        R                  U
S9n[        XU5       Ho  u  p-nUS UR                  S   2S UR                  S   2S UR                  S   24   R                  U5        SUS UR                  S   2S UR                  S   24'   Mq     X4$ s  snf )Nr   rt   rp   r   r8   F)r   r0   rS   lenrt   rp   r,   zerosr   boolzipcopy_)ri   r   r{   max_sizebatch_shaper   _heightwidthrt   rp   padded_tensorspadding_maskspadded_tensorpadding_masks                  r3   _pad_images_to_max_in_batch$EomtLoss._pad_images_to_max_in_batchx  s'   $$w%OwVd6<<&8w%OP7|nx/'2$
v
  ""[fM

J#>ejjY_`36wP]3^/F<+FLLO+->v||A->@Q&,,q/@QQRXXY_`AFL*6<<?*,=fll1o,==> 4_ ,, &Ps   D%r    rm   r   c           	         UnUR                   u  pVn[        R                  " U R                  S9nU R	                  U5      n	[
        R                  " [        X#5       V
VVs/ s H  u  n
u  p{X   PM     snnn
5      n[
        R                  " XV4U R                  [
        R                  UR                  S9nXU	'   UR                  SS5      nU" X5      nSU0nU$ s  snnn
f )a-  Compute the losses related to the labels using cross entropy.

Args:
    class_queries_logits (`torch.Tensor`):
        A tensor of shape `batch_size, num_queries, num_labels`
    class_labels (`list[torch.Tensor]`):
        List of class labels of shape `(labels)`.
    indices (`tuple[np.array])`:
        The indices computed by the Hungarian matcher.

Returns:
    `dict[str, Tensor]`: A dict of `torch.Tensor` containing the following key:
    - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
)weight)
fill_valuert   rp   r   r8   loss_cross_entropy)rS   r   CrossEntropyLossr   $_get_predictions_permutation_indicesr,   catr   fullr   r   rp   	transpose)ri   r    rm   r   pred_logitsr   num_queriesr   rX   idxtargetr   target_classes_otarget_classespred_logits_transposedloss_celossess                    r3   loss_labelsEomtLoss.loss_labels  s    " +%0%6%6"
''t/@/@A	77@ 99-0-GH-G>66AVY-GH
 %$//]h]o]o
 /s!,!6!6q!!<2C&0 Is    Cr!   rl   r   c                   ^  T R                  U5      nT R                  U5      nX   nT R                  U5      u  pX   nUSS2S4   nUSS2S4   n[        R                  " 5          T R                  UU 4S jT R                  T R                  T R                  5      n
[        XSS9R                  S5      nSSS5        [        UW
SS9R                  S5      n[        UWU5      [        XU5      S.nAAU$ ! , (       d  f       NF= f)a$  Compute the losses related to the masks using sigmoid_cross_entropy_loss and dice loss.

Args:
    masks_queries_logits (`torch.Tensor`):
        A tensor of shape `(batch_size, num_queries, height, width)`.
    mask_labels (`torch.Tensor`):
        List of mask labels of shape `(labels, height, width)`.
    indices (`tuple[np.array])`:
        The indices computed by the Hungarian matcher.
    num_masks (`int)`:
        The number of masks, used for normalization.

Returns:
    losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing two keys:
    - **loss_mask** -- The loss computed using sigmoid cross entropy loss on the predicted and ground truth.
      masks.
    - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth,
      masks.
Nc                 &   > TR                  U 5      $ N)calculate_uncertainty)logitsri   s    r3   <lambda>%EomtLoss.loss_masks.<locals>.<lambda>  s    t99&Ar2   Frq   r   )	loss_mask	loss_dice)r    _get_targets_permutation_indicesr   r,   r   sample_points_using_uncertaintyrd   r   r   rB   r>   r   r   )ri   r!   rl   r   r   src_idxtgt_idx
pred_maskstarget_masksr   r5   point_labelspoint_logitsr   s   `             r3   
loss_masksEomtLoss.loss_masks  s   4 ;;GD77@)2
 ::;G#,  4(
#AtG, ]]_ $ D DA%%,,! (W\]eefghL  $J0AQVW__`ab 4L,PYZ"<yI

 ) _s   &AC77
Dc                    [         R                  " [        U5       VVVs/ s H  u  nu  p4[         R                  " X25      PM      snnn5      n[         R                  " U VVs/ s H  u  p4UPM	     snn5      nXV4$ s  snnnf s  snnf r   r,   r   r   	full_like)ri   r   r   srcr   batch_indicespredictions_indicess          r3   r   -EomtLoss._get_predictions_permutation_indices  sh    		iX_N`"aN`{q(35??3#:N`"ab#iiW(EW#W(EF11 #b(E   %A<#B
c                    [         R                  " [        U5       VVVs/ s H  u  nu  p4[         R                  " XB5      PM      snnn5      n[         R                  " U VVs/ s H  u  p4UPM	     snn5      nXV4$ s  snnnf s  snnf r   r   )ri   r   r   r   tgtr   target_indicess          r3   r   )EomtLoss._get_targets_permutation_indices  sg    		iX_N`"aN`{q(15??3#:N`"ab#@HQC#@A,, #b#@r   r   c                 4    [         R                  " U5      * nU$ )a2  
In Eomt paper, uncertainty is estimated as L1 distance between 0.0 and the logit prediction in 'logits'
for the foreground class in `classes`.

Args:
    logits (`torch.Tensor`):
    A tensor of shape (R, 1, ...) for class-specific or class-agnostic, where R is the total number of predicted masks in all images and C is:
    the number of foreground classes. The values are logits.

Returns:
    scores (`torch.Tensor`): A tensor of shape (R, 1, ...) that contains uncertainty scores with the most
    uncertain locations having the highest uncertainty score.
)r,   abs)ri   r   uncertainty_scoress      r3   r   EomtLoss.calculate_uncertainty  s      %yy01!!r2   rd   r   r   c           
      h   UR                   S   n[        X4-  5      n[        R                  " XgSUR                  S9n[        XSS9n	U" U	5      n
[        XS-  5      nX;-
  n[        R                  " U
SS2SSS24   USS9S   nU[        R                  " U[        R                  UR                  S	9-  nXSS2S4   -  nUR                  S
S5      UR                  S
5      SS24   R                  XkS5      nUS:  a5  [        R                  " U[        R                  " XlSUR                  S9/SS9nU$ )a  
This function is meant for sampling points in [0, 1] * [0, 1] coordinate space based on their uncertainty. The
uncertainty is calculated for each point using the passed `uncertainty function` that takes points logit
prediction as input.

Args:
    logits (`float`):
        Logit predictions for P points.
    uncertainty_function:
        A function that takes logit predictions for P points and returns their uncertainties.
    num_points (`int`):
        The number of points P to sample.
    oversample_ratio (`int`):
        Oversampling parameter.
    importance_sample_ratio (`float`):
        Ratio of points that are sampled via importance sampling.

Returns:
    point_coordinates (`torch.Tensor`):
        Coordinates for P sampled points.
r   r8   ro   Frq   Nr   )kr:   r   rF   r:   )rS   r   r,   rx   rp   rB   topkarangelongviewr   )ri   r   uncertainty_functionrd   r   r   	num_boxesnum_points_sampledr5   r   point_uncertaintiesnum_uncertain_pointsnum_random_pointsr   shifts                  r3   r   (EomtLoss.sample_points_using_uncertainty  s3   < LLO	 !>? "JJyaPVP]P]^#FUS2<@"#:#GH&=jj,Q1W59MSTUVWX"U\\)5::V\VcVc%ddQW~-222q9#((2,/JOOPYqrsq  %		"EJJyQW]WdWd$ef! ! r2   Nauxiliary_predictionsc                    U R                  XX45      nU R                  XDS   R                  S9n0 U R                  XXg5      EU R	                  X$U5      EnUbk  [        U5       H\  u  pU
S   nU
S   nU R                  XX45      nUR                  5        VVs0 s H  u  pU SU	 3U_M     nnnUR                  U5        M^     U$ s  snnf )a  
This performs the loss computation.

Args:
    masks_queries_logits (`torch.Tensor`):
        A tensor of shape `(batch_size, num_queries, height, width)`.
    class_queries_logits (`torch.Tensor`):
        A tensor of shape `(batch_size, num_queries, num_labels)`.
    mask_labels (`torch.Tensor`):
        List of mask labels of shape `(labels, height, width)`.
    class_labels (`list[torch.Tensor]`):
        List of class labels of shape `(labels)`.
    auxiliary_predictions (`dict[str, torch.Tensor]`, *optional*):
        if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], then it contains the logits from
        the inner layers of the EomtMaskedAttentionDecoder.

Returns:
    losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing three keys:
    - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
    - **loss_mask** -- The loss computed using sigmoid cross_entropy loss on the predicted and ground truth
      masks.
    - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth
      masks.
    if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], the dictionary contains additional
    losses for each auxiliary predictions.
r   ro   r!   r    r   )	r   get_num_masksrp   r   r   r   r   itemsupdate)ri   r!   r    rl   rm   r  r   r   r   r   aux_outputs	loss_dictkeyvalues                 r3   r   EomtLoss.forward=  s    H ,,3;e&&|O<R<R&S	%
oo2T%
37K%

 !,$-.C$D '23I'J$'23I'J$ LL)=U`o	EN__EVWEVzsuAcU^U2EV	Wi( %E  Xs   Crp   c                 (   [        S U 5       5      n[        R                  " U[        R                  US9nSn[	        5       (       a3  [
        R                  0 :w  a  [        U5      n[        5       R                  n[        R                  " X4-  SS9nU$ )z[
Computes the average number of target masks across the batch, for normalization purposes.
c              3   8   #    U  H  n[        U5      v   M     g 7fr   )r   ).0classess     r3   	<genexpr>)EomtLoss.get_num_masks.<locals>.<genexpr>x  s     ALGLs   r   r   )min)
rK   r,   r   r   r   r   _shared_stater   num_processesclamp)ri   rm   rp   r   
world_sizes        r3   r  EomtLoss.get_num_maskst  sv     ALAA	OOIU[[P	
"$$))R/"9-	)^99
KK	 6A>	r2   )r   r   r   r   rd   r   r   r   )r'   r(   r)   r*   r   dictstrr   rg   r0   r   r   r   r/   r   nparrayr   r,   r   r   r   r   r   r   rp   r  r1   r   r   s   @r3   r   r   L  s   !
z !
S%Z8H !
F$tCy/ d3i -4< -E&RX.DY -" $* :>v, QVWYW_W_Q` 	c6k	 D<#ll< %,,'< rxx	<
 < 
c5<<	 <|2-"ELL "U\\ ""5!5! 	5!
 5! "'5! 
5!z AE5#ll5 $ll5 %,,'	5
 5<<(5  $C$56=5 
c5<<	 5n%,,  QVQ]Q]  r2   r   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )EomtPatchEmbeddingsi  z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
c                   > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pT[        U[        R                  R                  5      (       a  UOX"4n[        U[        R                  R                  5      (       a  UOX34nUS   US   -  US   US   -  -  nX l        X0l        X@l        X`l
        [        R                  " XEX3S9U l        g )Nr   r   kernel_sizestride)rf   rg   
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterablenum_patchesr   Conv2d
projection)ri   r   r5  r6  r7  r8  r=  rj   s          r3   rg   EomtPatchEmbeddings.__init__  s    !'!2!2F4E4EJ$*$7$79K9Kk#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!!}
15*Q-:VW=:XY$$(&))L:ir2   pixel_valuesr6   c                     UR                   S   nX R                  :w  a  [        SU R                   SU S35      eU R                  U5      R	                  S5      R                  SS5      nU$ )Nr   zoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r8   )rS   r7  rh   r?  rH   r   )ri   rA  r7  
embeddingss       r3   r   EomtPatchEmbeddings.forward  sx    #))!,,,,!../yaI  __\2::1=GG1M
r2   )r5  r7  r=  r6  r?  )r'   r(   r)   r*   r+   rg   r,   r   r   r1   r   r   s   @r3   r0  r0    s.    jELL U\\  r2   r0  c                   r   ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
EomtEmbeddingsi  zE
Construct the CLS token, mask token, position and patch embeddings.
r   r6   Nc                   > [         TU ]  5         Xl        UR                  U l        [        R
                  " [        R                  " SSUR                  5      5      U l	        [        R
                  " [        R                  " SUR                  UR                  5      5      U l        [        U5      U l        U R                  R                  n[        R                   " UR"                  5      U l        SUR                  -   U l        [        R(                  " X!R                  5      U l        U R-                  S[        R.                  " U5      R1                  S5      SS9  g )Nr   position_idsr   rF   F)
persistent)rf   rg   r   r6  r   	Parameterr,   randnr8  	cls_tokenr   num_register_tokensregister_tokensr0  patch_embeddingsr=  Dropouthidden_dropout_probdropoutnum_prefix_tokens	Embeddingposition_embeddingsr   r
  expand)ri   r   r=  rj   s      r3   rg   EomtEmbeddings.__init__  s     ++ekk!Q8J8J&KL!||EKK6;U;UW]WiWi,jk 3F ;++77zz&"<"<=!"V%?%?!?#%<<=O=O#P ^U\\+-F-M-Mg-Vchir2   rA  c                    UR                   u  n    nU R                  R                  R                  R                  nU R                  UR                  US95      nU R                  R                  USS5      nU R                  R                  USS5      nXPR                  U R                  5      -   n[        R                  " XgU/SS9nU R                  U5      nU$ )Nrs   rF   r   r  )rS   rQ  r?  r   rt   rw   rN  rX  rP  rW  rI  r,   r   rT  )ri   rA  r   r   target_dtyperD  
cls_tokensrP  s           r3   r   EomtEmbeddings.forward  s    *00
Aq!,,77>>DD**<???+NO
^^**:r2>
..55j"bI":":4;L;L"MM
YY
ZHaP
\\*-
r2   )rN  r   rT  rU  rQ  r6  rW  rP  r'   r(   r)   r*   r+   r   rg   r,   r   r   r1   r   r   s   @r3   rG  rG    s>    jz jd j ELL U\\  r2   rG  modulequeryr  r  attention_maskscalingrT  c                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )NrF   )r:   rt   )ptrainingr   r8   )r,   rI   r   r   r<   rv   float32rw   rt   rT  rf  
contiguous)
r_  r`  r  r  ra  rb  rT  r@   attn_weightsattn_outputs
             r3   eager_attention_forwardrk    s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r2   c            
          ^  \ rS rSrSrU 4S jr S
S\R                  S\R                  S-  S\\R                  \R                  S-  4   4S jjr	S	r
U =r$ )EomtAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paperc                    > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      F)rf   rg   r   r8  	embed_dimnum_attention_heads	num_headshead_dimrh   scaleattention_dropoutrT  	is_causalr   Lineark_projv_projq_projout_projri   r   rj   s     r3   rg   EomtAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar2   Nr#   ra  r6   c                    UR                   SS n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n[        R                  " U R                  R                  [        5      n	U	" U UUUUU R                  U R                  U R                  (       d  SOU R                  S9u  pU
R                   " / UQSP76 R#                  5       n
U R%                  U
5      n
X4$ )z#Input shape: Batch x Time x ChannelNrF   r   r8           )ru  rb  rT  )rS   rr  ry  r  r   rw  rx  r   get_interfacer   _attn_implementationrk  ru  rs  rf  rT  reshaperh  rz  )ri   r#   ra  r@   input_shapehidden_shapequerieskeysvaluesattention_interfacerj  ri  s               r3   r   EomtAttention.forward  s6    $))#2.88b8$--8++m,11,?II!QO{{=)..|<FFq!L]+00>HHAN(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
! "));;;;FFHmmK0((r2   )r   rT  ro  rr  ru  rw  rq  rz  ry  rs  rx  r   )r'   r(   r)   r*   r+   rg   r,   r   r/   r   r1   r   r   s   @r3   rm  rm    s[    GB. /3!)||!) t+!)
 
u||U\\D00	1!) !)r2   rm  c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )EomtLayerScalei  r6   c                    > [         TU ]  5         [        R                  " UR                  [
        R                  " UR                  5      -  5      U l        g r   )	rf   rg   r   rL  layerscale_valuer,   r   r8  lambda1r{  s     r3   rg   EomtLayerScale.__init__  s8    ||F$;$;ejjI[I[>\$\]r2   hidden_statec                     XR                   -  $ r   r  ri   r  s     r3   r   EomtLayerScale.forward!  s    ll**r2   r  r6   N
r'   r(   r)   r*   rg   r,   r   r   r1   r   r   s   @r3   r  r    s)    ^+ELL +U\\ + +r2   r  input	drop_probrf  c                    US:X  d  U(       d  U $ SU-
  nU R                   S   4SU R                  S-
  -  -   nU[        R                  " X@R                  U R
                  S9-   nUR                  5         U R                  U5      U-  nU$ )z[
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

r~  r   r   )r   r   )rS   ndimr,   rx   rt   rp   floor_div)r  r  rf  	keep_probrS   random_tensoroutputs          r3   	drop_pathr  %  s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FMr2   c                      ^  \ rS rSrSrSS\S-  SS4U 4S jjjrS\R                  S\R                  4S jr	S\
4S	 jrS
rU =r$ )EomtDropPathi4  zXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr  r6   c                 .   > [         TU ]  5         Xl        g r   )rf   rg   r  )ri   r  rj   s     r3   rg   EomtDropPath.__init__7  s    "r2   r#   c                 B    [        XR                  U R                  5      $ r   )r  r  rf  ri   r#   s     r3   r   EomtDropPath.forward;  s    FFr2   c                      SU R                    3$ )Nzp=r  ri   s    r3   
extra_reprEomtDropPath.extra_repr>  s    DNN#$$r2   r  r   )r'   r(   r)   r*   r+   r   rg   r,   r   r   r,  r  r1   r   r   s   @r3   r  r  4  sQ    b#%$, #$ # #GU\\ Gell G%C % %r2   r  c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )EomtMLPiB  r6   c                 z  > [         TU ]  5         UR                  =p#[        UR                  UR                  -  5      n[
        R                  " X$SS9U l        [        UR                  [        5      (       a  [        UR                     U l        OUR                  U l        [
        R                  " XCSS9U l        g )NTbias)rf   rg   r8  r   	mlp_ratior   rv  fc1r9  
hidden_actr,  r	   
activationfc2ri   r   in_featuresout_featureshidden_featuresrj   s        r3   rg   EomtMLP.__init__C  s    %+%7%77f0063C3CCD99[Ef''--$V%6%67DO$//DO99_Fr2   r  c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r  r  r  r  s     r3   r   EomtMLP.forwardN  s2    xx-|4xx-r2   )r  r  r  r  r  r   s   @r3   r  r  B  s)    	GELL U\\  r2   r  c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )EomtSwiGLUFFNiU  r6   c                 $  > [         TU ]  5         UR                  =p#[        UR                  UR                  -  5      n[        US-  S-  5      S-   S-  S-  n[
        R                  " USU-  SS9U l        [
        R                  " XCSS9U l        g )Nr8   r         Tr  )	rf   rg   r8  r   r  r   rv  
weights_inweights_outr  s        r3   rg   EomtSwiGLUFFN.__init__V  s    %+%7%77f0063C3CCD2Q67!;AAE))K_1D4P99_Nr2   r  c                     U R                  U5      nUR                  SSS9u  p#[        R                  R	                  U5      U-  nU R                  U5      $ )Nr8   rF   r  )r  chunkr   r<   silur  )ri   r  x1x2hiddens        r3   r   EomtSwiGLUFFN.forward_  sQ    |4##A2#.##B'",''r2   )r  r  r  r  r   s   @r3   r  r  U  s)    O(ELL (U\\ ( (r2   r  c                      ^  \ rS rSrSrS\SS4U 4S jjr SS\R                  S\R                  S-  S\R                  4S	 jjr	S
r
U =r$ )	EomtLayerif  zCThis corresponds to the Block class in the original implementation.r   r6   Nc                   > [         TU ]  5         [        R                  " UR                  UR
                  S9U l        [        U5      U l        [        U5      U l
        UR                  S:  a  [        UR                  5      O[        R                  " 5       U l        [        R                  " UR                  UR
                  S9U l        UR                   (       a  [#        U5      U l        O['        U5      U l        [        U5      U l        g )Nepsr~  )rf   rg   r   	LayerNormr8  layer_norm_epsnorm1rm  	attentionr  layer_scale1drop_path_rater  Identityr  norm2use_swiglu_ffnr  mlpr  layer_scale2r{  s     r3   rg   EomtLayer.__init__i  s    \\&"4"4&:O:OP
&v.*62@F@U@UX[@[f&;&;<acalalan\\&"4"4&:O:OP
  $V,DHvDH*62r2   r#   ra  c                 &   U R                  U5      nU R                  X25      u  pEU R                  U5      nU R                  U5      U-   nU R	                  U5      nU R                  U5      nU R                  U5      nU R                  U5      U-   nU$ r   )r  r  r  r  r  r  r  )ri   r#   ra  hidden_states_normself_attention_outputr   layer_outputs          r3   r   EomtLayer.forwardy  s    
 "ZZ6#'>>2D#U  $ 1 12G H '<=M zz-0xx-((6 ~~l3mCr2   )r  r  r  r  r  r  r  r   r^  r   s   @r3   r  r  f  sU    M3z 3d 3& /3|| t+ 
	 r2   r  c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )EomtLayerNorm2di  c                 "   > [         TU ]  XUS9  g )N)r  elementwise_affine)rf   rg   )ri   r7  r  affinerj   s       r3   rg   EomtLayerNorm2d.__init__  s    6Jr2   r  r6   c                     UR                  SSSS5      n[        R                  " XR                  U R                  U R
                  U R                  5      nUR                  SSSS5      nU$ )Nr   r8   r   r   )permuteF
layer_normnormalized_shaper   r  r  r  s     r3   r   EomtLayerNorm2d.forward  s`    #++Aq!Q7||L2G2GVZV_V_aeaiaij#++Aq!Q7r2   r&   )gư>Tr  r   s   @r3   r  r    s)    KELL U\\  r2   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )EomtScaleLayeri  r   c           	         > [         TU ]  5         UR                  n[        R                  " X"SSS9U l        [        UR                     U l        [        R                  " UUSSUSS9U l
        [        U5      U l        g )Nr8   r2  r   r   F)r3  paddinggroupsr  )rf   rg   r8  r   ConvTranspose2dconv1r	   r  r  r>  conv2r  layernorm2dri   r   r8  rj   s      r3   rg   EomtScaleLayer.__init__  ss    ((''aXYZ
 !2!23YY

 +;7r2   r#   r6   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r  r  r  r  r  s     r3   r   EomtScaleLayer.forward  sB    

=16

=1((7r2   )r  r  r  r  r'   r(   r)   r*   r   rg   r,   r   r   r1   r   r   s   @r3   r  r    s/    8z 8 U\\ ell  r2   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )EomtScaleBlocki  r   c                    > [         TU ]  5         UR                  U l        [        R
                  " [        U R                  5       Vs/ s H  n[        U5      PM     sn5      U l        g s  snf r   )	rf   rg   num_upscale_blocks
num_blocksr   
ModuleListru   r  blockri   r   r   rj   s      r3   rg   EomtScaleBlock.__init__  sM     33]]E$//DZ#[DZqN6$:DZ#[\
#[s   A*r#   r6   c                 <    U R                    H  nU" U5      nM     U$ r   )r  )ri   r#   r  s      r3   r   EomtScaleBlock.forward  s     ZZE!-0M  r2   )r  r   r  r   s   @r3   r  r    s1    ]z ]
U\\ ell  r2   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )EomtMaskHeadi  r   c                   > [         TU ]  5         UR                  n[        R                  " X"5      U l        [        R                  " X"5      U l        [        R                  " X"5      U l        [        UR                     U l
        g r   )rf   rg   r8  r   rv  r  r  fc3r	   r  r  r  s      r3   rg   EomtMaskHead.__init__  s[    ((99[699[699[6 !2!23r2   r#   r6   c                     U R                  U R                  U5      5      nU R                  U R                  U5      5      nU R                  U5      nU$ r   r  r  r  r
  r  s     r3   r   EomtMaskHead.forward  sD    (?@(?@/r2   r  r  r   s   @r3   r  r    s/    4z 4U\\ ell  r2   r  c                       \ rS rSr% Sr\\S'   SrSrSr	Sr
S/rS	r\\S
.r\R"                  " 5       S\R&                  SS4S j5       rSrg)EomtPreTrainedModeli  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
r   eomtrA  )imageFr  T)r#   r$   r_  r6   Nc                 l   U R                   R                  n[        U[        R                  [        R
                  [        R                  45      (       a  [        R                  " UR                  [        R                  " S5      S9  UR                  bz  [        R                  R                  R                  UR                  5      u  p4US:  a  S[        R                  " U5      -  OSn[        R                  " UR                  U* U5        g g [        U[        R                   5      (       aA  [        R"                  " UR                  5        [        R$                  " UR                  5        g [        U[        R&                  5      (       ay  [        R(                  " UR                  SSS9  UR*                  bK  [-        UR                  SS5      (       d.  [        R$                  " UR                  UR*                     5        g g g [        U[.        5      (       aH  [1        US	5      (       a6  [        R2                  " UR4                  U R                   R6                  5        g g [        U[8        5      (       a  [        R:                  " UR<                  SUS9  [        R$                  " UR>                  5        [        R@                  " URB                  [        RD                  " URB                  RF                  S
   5      RI                  S5      5        g [        U[J        5      (       aT  [        RL                  " URN                  S-   5      nURP                  US
'   [        R@                  " URR                  U5        g [        U[T        5      (       a!  [        R"                  " URV                  5        g g )N   )ar   r   r~  )r   std_is_hf_initializedFr  rF   rJ  ),r   initializer_ranger9  r   rv  r>  r  initkaiming_uniform_r   mathsqrtr  r,   _calculate_fan_in_and_fan_outuniform_r  ones_zeros_rV  normal_padding_idxgetattrr  hasattr	constant_r  r  rG  trunc_normal_rN  rP  r   rI  r
  rS   rX  r   r   r   r   r   EomtForUniversalSegmentationattn_mask_probs)ri   r_  r  fan_inr   boundr   s          r3   _init_weights!EomtPreTrainedModel._init_weights  sc   kk++fryy"))R5G5GHII!!&--499Q<@{{&!HHMMGGV	17!DIIf--fkkE659 ' --JJv}}%KK$--LLSa8!!-gfmmMach6i6iFMM&*<*<=> 7j-//vy))v~~t{{/K/KL *//v//csCKK../JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh)) ::f&7&7!&;<L%LJJv**L9 <==JJv--. >r2   r&   )r'   r(   r)   r*   r+   r   r.   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpar  rm  _can_record_outputsr,   r   r   Moduler+  r1   r&   r2   r3   r  r    sm    
 $O!&+#$N"#
 ]]_/BII /$ / /r2   r  zV
    The EoMT Model with head on top for instance/semantic/panoptic segmentation.
    c                   2  ^  \ rS rSrSrS\4U 4S jjrS\S\S\S\S	\\	\4   S
\\	\4   4S jr
S\\	\4   S
\4S jr\\\   SS\S\\   S-  S\\   S-  S\\   S-  S\\   S
\4S jj5       5       5       rS rS\R                  4S jr\S 5       rSrU =r$ )r'  i  rA  r   c                   > [         TU ]  U5        Xl        UR                  U l        [	        U5      U l        [        R                  " UR                  UR                  S9U l
        [        R                  " UR                  UR                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[!        U5      PM     sn5      U l        [%        U5      U l        [)        U5      U l        [        R,                  " UR                  UR.                  S-   5      U l        UR2                  UR4                  -  UR2                  UR4                  -  4U l        UR8                  UR:                  UR<                  S.U l        [A        XR>                  S9U l!        U RE                  S[F        RH                  " URJ                  5      5        U RM                  5         g s  snf )Nr  r   )r   r   r   )r   r   r(  )'rf   rg   r   num_hidden_layersrG  rD  r   r  r8  r  	layernormrV  r   r`  r  ru   r  layersr  upscale_blockr  	mask_headrv  r   class_predictorr5  r6  	grid_sizer   r   r   r   r   rX   r   r,   r   r   	post_initr  s      r3   rg   %EomtForUniversalSegmentation.__init__  sr    !'!9!9(0f&8&8f>S>ST\\&"4"4f6H6HI
mmfF^F^@_$`@_1Yv%6@_$`a+F3%f-!yy););V=N=NQR=RS ++v/@/@@&BSBSW]WhWhBhi"("5"5++++.
 "=M=MN.

6;L;L0MN% %as   =G*r!   r    rl   rm   r  r6   c                     U R                  UUUUUS9nU R                  R                  5        H)  u  pxUR                  5        H  u  pXy;   d  M  X-  n
M     M+     U$ )Nr!   r    rl   rm   r  )rX   r   r  )ri   r!   r    rl   rm   r  r  r  r   loss_keyr   s              r3   get_loss_dict*EomtForUniversalSegmentation.get_loss_dict(  sj     (,~~!5!5#%"7 (6 (
	  ++113KC"+//"3?ND #4 4
 r2   r  c                 4    [        UR                  5       5      $ r   )rK   r  )ri   r  s     r3   get_loss%EomtForUniversalSegmentation.get_loss@  s    9##%&&r2   Nr%   r@   c           	         Su  pgSnUc  [        S5      eU R                  U5      n	[        U R                  5       GH  u  pXR                  U R
                  R                  -
  :X  am  U R                  R                  SSS2SS24   R                  U	R                  S   SS5      R                  U	R                  5      n[        R                  " X4SS9n	XR                  U R
                  R                  -
  :  Ga  U R                  (       d7  U R                   XR                  -
  U R
                  R                  -      S:  Ga  U R#                  U	5      nU R%                  U5      u  pXn4-  nX4-  n[        R&                  " U	R                  S   U	R                  S   U	R                  S   U	R                  [        R(                  S9n[*        R,                  " XR.                  S	S
9nUR1                  UR3                  S5      UR3                  S5      S5      nU R
                  R4                  nUU R                  R6                  -   nUS:  USS2SU2US24'   U R9                  UU R                   XR                  -
  U R
                  R                  -      UUUR                  S9nUSS2SS4   R                  SU R
                  R:                  SS5      nUR=                  5       R?                  U) S5      nU" X5      n	GM     U R#                  U	5      nU R%                  U5      u  pXn4-  nX4-  nSnUb@  Ub=  Sn[A        Xg5       H,  u  pU RC                  UUUUSS9nUU RE                  U5      -  nM.     [G        UUUUUS9$ )a'  
mask_labels (`list[torch.Tensor]`, *optional*):
    list of mask labels of shape `(num_labels, height, width)` to be fed to a model
class_labels (`list[torch.LongTensor]`, *optional*):
    list of target class labels of shape `(num_labels, height, width)` to be fed to a model. They identify the
    labels of `mask_labels`, e.g. the label of `mask_labels[i][j]` if `class_labels[i][j]`.
patch_offsets (`list[torch.Tensor]`, *optional*):
    list of tuples indicating the image index and start and end positions of patches for semantic segmentation.
)r&   r&   Nz You have to specify pixel_valuesr   rF   r   r  )rp   rt   bilinear)sizemode)probnum_query_tokensencoder_start_tokensrp   .g    er~  rA  )r   r!   r    r"   r%   )$rh   rD  r   r9  r7  r   r   r`  r   rX  rS   rw   rp   r,   r   rf  r(  r8  predictr   r   r  interpolater=  r  rJ  r   rU  _disable_attention_maskrp  r   masked_fillr   rC  rF  r   )ri   rA  rl   rm   r%   r@   masks_queries_logits_per_layerclass_queries_logits_per_layerra  r#   r   layer_moduler`  norm_hidden_statesr!   r    interpolated_logitsrM  rN  sequence_outputr   r  s                         r3   r   $EomtForUniversalSegmentation.forwardC  s   * JPF&?@@5!*4;;!7C,,t{{/E/EEE

))$1*5<<]=P=PQR=SUWY[\__`m`t`tu %		5*@a H,,t{{/E/EEE!5!5c<R<R6RUYU`U`UkUk6k!lop!p%)^^M%B"=A\\J\=]:$.2II..2II.!&!''*!''*!''*(//**" '(mm4H~~dn&o#&9&>&>',,Q/1D1I1I!1Lb'# $(;;#:#: '7$//:[:['[$ ObdeNeq"3#3"35I5JJK "&!=!="--c4J4J.JT[[McMc.cd%5)=)00 "> " "04!=!D!DRIhIhjlnp!q!/!5!5!7!C!C^OUY!Z(GM] "8` ..759\\/5R2&*AA&&*AA&"|'?D>A.?:$ !..)=)= +!-*. / 	 i00? 2!5!5-'
 	
r2   c                 .    U R                   R                  $ r   )rD  rQ  r  s    r3   get_input_embeddings1EomtForUniversalSegmentation.get_input_embeddings  s    ///r2   r   c                    US S 2S U R                   R                  2S S 24   nU R                  U5      nUS S 2U R                   R                  U R                  R                  -   S 2S S 24   nUR                  SS5      nUR                  " UR                  S   S/U R                  Q76 nU R                  U5      nU R                  U5      n[        R                  " SX$5      nXS4$ )Nr   r8   r   rF   zbqc, bchw -> bqhw)r   r   r<  rD  rU  r   r  rS   r=  r;  r:  r,   einsum)ri   r   query_tokensclass_logitsprefix_tokensmask_logitss         r3   rO  $EomtForUniversalSegmentation.predict  s    a!:4;;#:#:!:A=>++L9q$++"9"9DOO<]<]"]"_abbc%//15%--m.A.A!.DbZ4>>Z~~l3**=9ll#6T((r2   c                 ~    US:  a6  [         R                  " U R                  S   X$S9U:  nSU S S 2S U2US 24   U'   U $ )Nr   r   ro   )r,   rx   rS   )	attn_maskrL  rM  rN  rp   random_queriess         r3   rQ  4EomtForUniversalSegmentation._disable_attention_mask  sT    !8"ZZ	(:<L\_ccN VWIa***,@,AAB>Rr2   )r<  r   rX   rD  r=  r8  r9  r;  r7  r`  r:  r   )NNN)r'   r(   r)   r*   r.  r   rg   r   r+  r,  rC  rF  r   r   r   r0   r   r   r   r   r[  r,   rO  staticmethodrQ  r1   r   r   s   @r3   r'  r'    s?    %Oz 8$ % 	
   $CK0 
c6k	0'$sF{"3 ' '   ,0,0-1e
e
 &\D(e
 6lT)	e

 F|d*e
 +,e
 
,e
    e
N0)ell )   r2   r'  )F)r~  )r~  F)Lcollections.abcr:  r  r   dataclassesr   numpyr-  r,   torch.nn.functionalr   r<   r  r    r   r  activationsr	   
file_utilsr
   r   r   modeling_layersr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   configuration_eomtr   scipy.optimizer   
accelerater   accelerate.utilsr   r   rB   rN   r]   r4  r_   r   r   r   r   r0  rG  r   rk  rm  r  r   r  r  r  r  r  r  r  r  r  r  r  r'  __all__r&   r2   r3   <module>r{     s"  *   $ !      & ! L L 9 F & P P 7 5 * 4'' 	4 4	 4B LQLL5:\\
\\@  6 , u|| X]XdXd 8g299 gTf f   <u|| U\\ VY ^c^j^j (uryy up	")) B"RYY "X %II%<<% 
% <<	%
 LL4'% % %.8)BII 8)v+RYY +U\\ e T V[VbVb %299 %bii &(BII ("'* 'Tbll RYY 2	RYY 	299 " /// // //d 
@#6 @
@F !"@
Ar2   