
    Z j{*                        S r SSKJr  SSKrSSKJr  SSKJr  SSKJ	r	  SSK
Jr  SS	KJrJr  S
SKJr  \" SS9\ " S S\5      5       5       r\ " S S\5      5       r " S S\R&                  5      r " S S\R&                  5      r " S S\R&                  5      r " S S\R&                  5      r " S S\R&                  5      r\" SS9 " S S\5      5       rSS/rg)zPyTorch ViTMatte model.    )	dataclassN)nn   )initialization)load_backbone)PreTrainedModel)ModelOutputauto_docstring   )VitMatteConfigz4
    Class for outputs of image matting models.
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                     S-  \S'   Sr\\R                     S-  \S'   Srg)	ImageMattingOutput   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Loss.
alphas (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
    Estimated alpha values.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
    one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
    (also called feature maps) of the model at the output of each stage.
Nlossalphashidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r   torchFloatTensor__annotations__r   r   tupler   __static_attributes__r       /root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/vitmatte/modeling_vitmatte.pyr   r      sg    	 &*D%

d
")'+FE$+59M5**+d2926Je''(4/6r    r   c                   z    \ rS rSr% \\S'   SrSrSr/ r	\
R                  " 5       S\R                  4S j5       rSrg	)
VitMattePreTrainedModel4   configpixel_values)imageTmodulec                    [        U[        R                  [        R                  45      (       a  [        R
                  " UR                  SU R                  R                  S9  UR                  b   [        R                  " UR                  5        [        USS 5      ba  [        R                  " UR                  5        [        R                  " UR                  5        [        R                  " UR                  5        g g g )Ng        )meanstdrunning_mean)
isinstancer   Conv2dBatchNorm2dinitnormal_weightr%   initializer_rangebiaszeros_getattrr,   ones_running_varnum_batches_tracked)selfr(   s     r!   _init_weights%VitMattePreTrainedModel._init_weights<   s    fryy"..9::LLSdkk6S6ST{{&FKK(v~t4@F//0

6--.F667 A	 ;r    r   N)r   r   r   r   r   r   main_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modulesr   no_gradr   Moduler;   r   r   r    r!   r#   r#   4   sB    $O!&*#
]]_8BII 8 8r    r#   c                   6   ^  \ rS rSrSrSU 4S jjrS rSrU =r$ )VitMatteBasicConv3x3H   zH
Basic convolution layers including: Conv3x3, BatchNorm2d, ReLU layers.
c           	         > [         TU ]  5         [        R                  " UUSUUSS9U l        [        R
                  " X1R                  S9U l        [        R                  " 5       U l	        g )Nr   F)in_channelsout_channelskernel_sizestridepaddingr4   )eps)
super__init__r   r.   convr/   batch_norm_eps
batch_normReLUrelu)r:   r%   rG   rH   rJ   rK   	__class__s         r!   rN   VitMatteBasicConv3x3.__init__M   sU    II#%
	 ..;P;PQGGI	r    c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ N)rO   rQ   rS   r:   hidden_states     r!   forwardVitMatteBasicConv3x3.forwardZ   s2    yy.|4yy.r    )rQ   rO   rS   )   r   	r   r   r   r   r   rN   rZ   r   __classcell__rT   s   @r!   rD   rD   H   s     r    rD   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )VitMatteConvStreamb   z[
Simple ConvStream containing a series of basic conv3x3 layers to extract detail features.
c                   > [         TU ]  5         SnUR                  b  UR                  R                  n[	        UR
                  5      n[        R                  " 5       U l        U/U-   U l	        [        [        U R                  5      S-
  5       HI  nU R                  U   nU R                  US-      nU R                  R                  [        XU5      5        MK     g )N   r   )rM   rN   backbone_confignum_channelslistconvstream_hidden_sizesr   
ModuleListconvs
conv_chansrangelenappendrD   )r:   r%   rG   rH   iin_chan_	out_chan_rT   s          r!   rN   VitMatteConvStream.__init__g   s     !!- 00==KF::;]]_
&-,6s4??+a/0Aq)HA.IJJ26YOP 1r    c                     SU0nUn[        [        U R                  5      5       H-  nU R                  U   " U5      nS[        US-   5      -   nX2U'   M/     U$ )Ndetailed_feature_map_0detailed_feature_map_r   )rl   rm   rj   str)r:   r&   out_dict
embeddingsro   name_s         r!   rZ   VitMatteConvStream.forwardz   sZ    ,l;!
s4::'AAz2J+c!a%j8E(UO (
 r    )rk   rj   r]   r_   s   @r!   ra   ra   b   s    Q& r    ra   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )VitMatteFusionBlock   zT
Simple fusion block to fuse features from ConvStream and Plain Vision Transformer.
c                 D   > [         TU ]  5         [        XUSSS9U l        g )Nr   )rJ   rK   )rM   rN   rD   rO   )r:   r%   rG   rH   rT   s       r!   rN   VitMatteFusionBlock.__init__   s"    (lST^_`	r    c                     [         R                  R                  USSSS9n[        R                  " X#/SS9nU R                  U5      nU$ )Nr\   bilinearF)scale_factormodealign_cornersr   )dim)r   
functionalinterpolater   catrO   )r:   featuresdetailed_feature_mapupscaled_featuresouts        r!   rZ   VitMatteFusionBlock.forward   sH    MM55hQU_ot5uii-AqIiin
r    )rO   r]   r_   s   @r!   r|   r|      s    a r    r|   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )VitMatteHead   zB
Simple Matting Head, containing only conv3x3 and conv1x1 layers.
c                 &  > [         TU ]  5         UR                  S   nSn[        R                  " [        R
                  " X#SSSS9[        R                  " U5      [        R                  " S5      [        R
                  " USSSSS95      U l        g )N   r   r   )rI   rJ   rK   Tr   )	rM   rN   fusion_hidden_sizesr   
Sequentialr.   r/   rR   matting_convs)r:   r%   rG   mid_channelsrT   s       r!   rN   VitMatteHead.__init__   sr    004]]IIkQqRSTNN<(GGDMIIlA1QJ	
r    c                 (    U R                  U5      nU$ rW   r   rX   s     r!   rZ   VitMatteHead.forward   s    )),7r    r   r]   r_   s   @r!   r   r      s    
 r    r   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )VitMatteDetailCaptureModule   z?
Simple and lightweight Detail Capture Module for ViT Matting.
c           
        > [         TU ]  5         [        UR                  5      [        UR                  5      S-   :w  a  [        S5      eXl        [        U5      U l        U R                  R                  U l	        [        R                  " 5       U l        UR                  /UR                  -   U l        [        [        U R                  5      S-
  5       HX  nU R                  R!                  [#        UU R                  U   U R                  US-   *    -   U R                  US-      S95        MZ     [%        U5      U l        g )Nr   z_The length of fusion_hidden_sizes should be equal to the length of convstream_hidden_sizes + 1.)r%   rG   rH   )rM   rN   rm   r   rh   
ValueErrorr%   ra   
convstreamrk   r   ri   fusion_blockshidden_sizefusion_channelsrl   rn   r|   r   matting_head)r:   r%   ro   rT   s      r!   rN   $VitMatteDetailCaptureModule.__init__   s   v))*c&2P2P.QTU.UUq  ,V4//44]]_ & 2 23f6P6PPs4//0145A%%#! $ 4 4Q 7$//APQE(:S S!%!5!5a!e!< 6 )0r    c                 :   U R                  U5      n[        [        U R                  5      5       HB  nS[	        [        U R                  5      U-
  S-
  5      -   nU R                  U   " XU   5      nMD     [
        R                  " U R                  U5      5      nU$ )Nru   r   )r   rl   rm   r   rv   r   sigmoidr   )r:   r   r&   detail_featuresro   detailed_feature_map_namer   s          r!   rZ   #VitMatteDetailCaptureModule.forward   s    //,7s4--./A(?#c$J\J\F]`aFadeFeBf(f%))!,XG`7abH 0 t00:;r    )r%   rk   r   r   r   r   r]   r_   s   @r!   r   r      s    12 r    r   zX
    ViTMatte framework leveraging any vision backbone e.g. for ADE20k, CityScapes.
    c                      ^  \ rS rSrU 4S jr\     SS\R                  S-  S\S-  S\S-  S\R                  S-  S\S-  4
S	 jj5       r	S
r
U =r$ )VitMatteForImageMatting   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g rW   )rM   rN   r%   r   backboner   decoder	post_init)r:   r%   rT   s     r!   rN    VitMatteForImageMatting.__init__   s9     %f-26: 	r    Nr&   output_attentionsoutput_hidden_stateslabelsreturn_dictc                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nSnUb  [	        S5      eU R
                  R                  XUS9nUR                  S   n	U R                  X5      n
U(       d  U
4USS -   nUb  U4U-   $ U$ [        UU
UR                  UR                  S9$ )ap  
labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
    Ground truth image matting for computing the loss.

Examples:

```python
>>> from transformers import VitMatteImageProcessor, VitMatteForImageMatting
>>> import torch
>>> from PIL import Image
>>> from huggingface_hub import hf_hub_download

>>> processor = VitMatteImageProcessor.from_pretrained("hustvl/vitmatte-small-composition-1k")
>>> model = VitMatteForImageMatting.from_pretrained("hustvl/vitmatte-small-composition-1k")

>>> filepath = hf_hub_download(
...     repo_id="hf-internal-testing/image-matting-fixtures", filename="image.png", repo_type="dataset"
... )
>>> image = Image.open(filepath).convert("RGB")
>>> filepath = hf_hub_download(
...     repo_id="hf-internal-testing/image-matting-fixtures", filename="trimap.png", repo_type="dataset"
... )
>>> trimap = Image.open(filepath).convert("L")

>>> # prepare image + trimap for the model
>>> inputs = processor(images=image, trimaps=trimap, return_tensors="pt")

>>> with torch.no_grad():
...     alphas = model(**inputs).alphas
>>> print(alphas.shape)
torch.Size([1, 1, 640, 960])
```NzTraining is not yet supported)r   r   r   r   )r   r   r   r   )r%   r   r   r   NotImplementedErrorr   forward_with_filtered_kwargsfeature_mapsr   r   r   r   )r:   r&   r   r   r   r   kwargsr   outputsr   r   outputs               r!   rZ   VitMatteForImageMatting.forward   s    T &1%<k$++BYBY$8$D $++JjJj 	 2C1N-TXT_T_TqTq%&EFF--<<Wh = 
 ''+h5Y,F)-)9TGf$EvE!!//))	
 	
r    )r   r%   r   )NNNNN)r   r   r   r   rN   r
   r   TensorboolrZ   r   r^   r_   s   @r!   r   r      s      -1)-,0&*#'C
llT)C
  $;C
 #Tk	C

 t#C
 D[C
 C
r    r   )r   dataclassesr   r   r    r   r0   backbone_utilsr   modeling_utilsr   utilsr	   r
   configuration_vitmatter   r   r#   rB   rD   ra   r|   r   r   r   __all__r   r    r!   <module>r      s     !   & + - 0 2 
 7 7 7$ 8o 8 8&299 4   F")) "299 0&")) &R 
O
5 O

O
d %&?
@r    