
    Z j              	          S r SSKrSSKrSSKJr  SSKrSSKJr  SSKJ	r
  SSKJr  SSKJr  SS	KJr  SS
KJrJrJrJr  SSKJr  \R0                  " \5      r\\" SS9 " S S\5      5       5       r\\" SS9 " S S\5      5       5       r\\" SS9 " S S\5      5       5       rS rS r " S S\R@                  5      r! " S S\R@                  5      r" " S S\R@                  5      r#S>S\RH                  S \%S!\&S"\RH                  4S# jjr' " S$ S%\R@                  5      r( " S& S'\R@                  5      r) " S( S)\R@                  5      r* " S* S+\R@                  5      r+ " S, S-\R@                  5      r, " S. S/\R@                  5      r- " S0 S1\R@                  5      r. " S2 S3\5      r/ " S4 S5\R@                  5      r0\ " S6 S7\5      5       r1\ " S8 S9\15      5       r2\" S:S9 " S; S<\15      5       r3/ S=Qr4g)?zPyTorch Donut Swin Transformer model.

This implementation is identical to a regular Swin Transformer, without final layer norm on top of the final hidden
states.    N)	dataclass)nn   )initialization)ACT2FN)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringlogging	torch_int   )DonutSwinConfigzS
    DonutSwin encoder's outputs, with potential hidden states and attentions.
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\
\R                  S4   S-  \S'   Sr\
\R                  S4   S-  \S'   Sr\
\R                  S4   S-  \S'   S	rg)
DonutSwinEncoderOutput%   a  
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
    shape `(batch_size, hidden_size, height, width)`.

    Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
    include the spatial dimensions.
Nlast_hidden_state.hidden_states
attentionsreshaped_hidden_states )__name__
__module____qualname____firstlineno____doc__r   torchFloatTensor__annotations__r   tupler   r   __static_attributes__r       ~/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/donut/modeling_donut_swin.pyr   r   %   s}     37u((4/6:>M5**C/047>7;Je'',-4;CGE%"3"3S"89D@Gr#   r   z[
    DonutSwin model's outputs that also contains a pooling of the last hidden states.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S	'   S
rg)DonutSwinModelOutput<   a  
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
    Average pooling of the last layer hidden-state.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
    shape `(batch_size, hidden_size, height, width)`.

    Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
    include the spatial dimensions.
Nr   pooler_output.r   r   r   r   )r   r   r   r   r   r   r   r   r    r(   r   r!   r   r   r"   r   r#   r$   r&   r&   <   s    	 37u((4/6.2M5$$t+2:>M5**C/047>7;Je'',-4;CGE%"3"3S"89D@Gr#   r&   z5
    DonutSwin outputs for image classification.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S	'   S
rg)DonutSwinImageClassifierOutputV   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Classification (or regression if config.num_labels==1) loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
    Classification (or regression if config.num_labels==1) scores (before SoftMax).
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
    shape `(batch_size, hidden_size, height, width)`.

    Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
    include the spatial dimensions.
Nlosslogits.r   r   r   r   )r   r   r   r   r   r,   r   r   r    r-   r   r!   r   r   r"   r   r#   r$   r*   r*   V   s     &*D%

d
")'+FE$+:>M5**C/047>7;Je'',-4;CGE%"3"3S"89D@Gr#   r*   c                     U R                   u  p#pEU R                  X#U-  XU-  X5      n U R                  SSSSSS5      R                  5       R                  SXU5      nU$ )z*
Partitions the given input into windows.
r   r   r            shapeviewpermute
contiguous)input_featurewindow_size
batch_sizeheightwidthnum_channelswindowss          r$   window_partitionr?   s   so     /<.A.A+J!&&k);8LkM ##Aq!Q15@@BGGKfrsGNr#   c                     U R                   S   nU R                  SX!-  X1-  XU5      n U R                  SSSSSS5      R                  5       R                  SX#U5      n U $ )z7
Merges windows to produce higher resolution features.
r2   r   r   r   r/   r0   r1   r3   )r>   r9   r;   r<   r=   s        r$   window_reverserA      se     ==$Lll2v4e6JKfrsGooaAq!Q/::<AA"fUabGNr#   c            
          ^  \ rS rSrSrSU 4S jjrS\R                  S\S\S\R                  4S jr	  SS
\R                  S	-  S\R                  S	-  S\S\\R                     4S jjrSrU =r$ )DonutSwinEmbeddings   zO
Construct the patch and position embeddings. Optionally, also the mask token.
c                   > [         TU ]  5         [        U5      U l        U R                  R                  nU R                  R
                  U l        U(       a6  [        R                  " [        R                  " SSUR                  5      5      OS U l        UR                  (       a?  [        R                  " [        R                  " SUS-   UR                  5      5      U l        OS U l        [        R                  " UR                  5      U l        [        R"                  " UR$                  5      U l        UR(                  U l        Xl        g )Nr   )super__init__DonutSwinPatchEmbeddingspatch_embeddingsnum_patches	grid_size
patch_gridr   	Parameterr   zeros	embed_dim
mask_tokenuse_absolute_embeddingsposition_embeddings	LayerNormnormDropouthidden_dropout_probdropout
patch_sizeconfig)selfrY   use_mask_tokenrJ   	__class__s       r$   rG   DonutSwinEmbeddings.__init__   s     8 @++77//99O]",,u{{1a9I9I'JKcg))')||EKK;QR?TZTdTd4e'fD$'+D$LL!1!12	zz&"<"<= ++r#   
embeddingsr;   r<   returnc                    UR                   S   S-
  nU R                  R                   S   S-
  n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                   S   nX R
                  -  n	X0R
                  -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU5      n[        R                  " Xg4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   Nr2   g      ?r   r   r/   bicubicF)sizemodealign_cornersdim)r4   rR   r   jit
is_tracingrX   r   reshaper6   r   
functionalinterpolater5   cat)rZ   r^   r;   r<   rJ   num_positionsclass_pos_embedpatch_pos_embedrf   
new_height	new_widthsqrt_num_positionss               r$   interpolate_pos_encoding,DonutSwinEmbeddings.interpolate_pos_encoding   sS    !&&q)A-0066q9A= yy##%%+*F6?+++221bqb59221ab59r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr#   Npixel_valuesbool_masked_posrs   c                    UR                   u  pEpgU R                  U5      u  pU R                  U5      nUR                  5       u  pnUbI  U R                  R                  XS5      nUR                  S5      R                  U5      nUSU-
  -  X-  -   nU R                  b*  U(       a  XR                  XU5      -   nOXR                  -   nU R                  U5      nX4$ )Nr2   g      ?)r4   rI   rT   rb   rP   expand	unsqueezetype_asrR   rs   rW   )rZ   ru   rv   rs   _r=   r;   r<   r^   output_dimensionsr:   seq_lenmask_tokensmasks                 r$   forwardDonutSwinEmbeddings.forward   s     *6););&(,(=(=l(K%
YYz*
!+!2
Q&//00bIK",,R088ED#sTz2[5GGJ##/''*G*G
\a*bb
'*B*BB
\\*-
,,r#   )rY   rW   rP   rT   rI   rL   rX   rR   )FNF)r   r   r   r   r   rG   r   Tensorintrs   r   
BoolTensorboolr!   r   r"   __classcell__r\   s   @r$   rC   rC      s    &&D5<< &D &DUX &D]b]i]i &DV 48).	-''$.- ))D0- #'	-
 
u||	- -r#   rC   c                      ^  \ rS rSrSrU 4S jrS rS\R                  S-  S\	\R                  \	\   4   4S jrS	rU =r$ )
rH      z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
c                   > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pT[        U[        R                  R                  5      (       a  UOX"4n[        U[        R                  R                  5      (       a  UOX34nUS   US   -  US   US   -  -  nX l        X0l        X@l        X`l
        US   US   -  US   US   -  4U l        [        R                  " XEX3S9U l        g )Nr   r   )kernel_sizestride)rF   rG   
image_sizerX   r=   rO   
isinstancecollectionsabcIterablerJ   rK   r   Conv2d
projection)rZ   rY   r   rX   r=   hidden_sizerJ   r\   s          r$   rG   !DonutSwinPatchEmbeddings.__init__   s    !'!2!2F4E4EJ$*$7$79I9Ik#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!!}
15*Q-:VW=:XY$$(&$Q-:a=8*Q-:VW=:XY))L:ir#   c                 f   X0R                   S   -  S:w  aB  SU R                   S   X0R                   S   -  -
  4n[        R                  R                  X5      nX R                   S   -  S:w  aD  SSSU R                   S   X R                   S   -  -
  4n[        R                  R                  X5      nU$ )Nr   r   )rX   r   rj   pad)rZ   ru   r;   r<   
pad_valuess        r$   	maybe_pad"DonutSwinPatchEmbeddings.maybe_pad   s    ??1%%*T__Q/%//!:L2LLMJ==,,\FLOOA&&!+Q4??1#5QRAS8S#STJ==,,\FLr#   ru   Nr_   c                     UR                   u  p#pEU R                  XU5      nU R                  U5      nUR                   u    p$nXE4nUR                  S5      R	                  SS5      nXg4$ )Nr/   r   )r4   r   r   flatten	transpose)rZ   ru   r{   r=   r;   r<   r^   r|   s           r$   r    DonutSwinPatchEmbeddings.forward  sp    )5););&~~lEB__\2
(..1e#O''*44Q:
,,r#   )rK   r   r=   rJ   rX   r   )r   r   r   r   r   rG   r   r   r   r!   r   r   r   r"   r   r   s   @r$   rH   rH      sK    j	-E$5$5$< 	-u||UZ[^U_G_A` 	- 	-r#   rH   c            	          ^  \ rS rSrSr\R                  4S\\   S\S\R                  SS4U 4S jjjr
S	 rS
\R                  S\\\4   S\R                  4S jrSrU =r$ )DonutSwinPatchMergingi  a  
Patch Merging Layer.

Args:
    input_resolution (`tuple[int]`):
        Resolution of input feature.
    dim (`int`):
        Number of input channels.
    norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
        Normalization layer class.
input_resolutionrf   
norm_layerr_   Nc                    > [         TU ]  5         Xl        X l        [        R
                  " SU-  SU-  SS9U l        U" SU-  5      U l        g )Nr0   r/   Fbias)rF   rG   r   rf   r   Linear	reductionrT   )rZ   r   rf   r   r\   s       r$   rG   DonutSwinPatchMerging.__init__!  sE     01s7AG%@q3w'	r#   c                     US-  S:H  =(       d    US-  S:H  nU(       a-  SSSUS-  SUS-  4n[         R                  R                  X5      nU$ )Nr/   r   r   )r   rj   r   )rZ   r8   r;   r<   
should_padr   s         r$   r   DonutSwinPatchMerging.maybe_pad(  sS    qjAo:519>
Q519a!<JMM--mHMr#   r8   input_dimensionsc                    Uu  p4UR                   u  pVnUR                  XSXG5      nU R                  XU5      nUS S 2SS S2SS S2S S 24   nUS S 2SS S2SS S2S S 24   n	US S 2SS S2SS S2S S 24   n
US S 2SS S2SS S2S S 24   n[        R                  " XX/S5      nUR                  USSU-  5      nU R                  U5      nU R                  U5      nU$ )Nr   r/   r   r2   r0   )r4   r5   r   r   rl   rT   r   )rZ   r8   r   r;   r<   r:   rf   r=   input_feature_0input_feature_1input_feature_2input_feature_3s               r$   r   DonutSwinPatchMerging.forward0  s   ((5(;(;%
%**:uS}eD'14a4Aq(89'14a4Aq(89'14a4Aq(89'14a4Aq(89		?_"fhjk%**:r1|;KL		-0}5r#   )rf   r   rT   r   )r   r   r   r   r   r   rS   r!   r   ModulerG   r   r   r   r   r"   r   r   s   @r$   r   r     s|    
 XZWcWc (s (# (299 (hl ( (U\\ U3PS8_ Y^YeYe  r#   r   input	drop_probtrainingr_   c                    US:X  d  U(       d  U $ SU-
  nU R                   S   4SU R                  S-
  -  -   nU[        R                  " X@R                  U R
                  S9-   nUR                  5         U R                  U5      U-  nU$ )z[
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

        r   r   )r   dtypedevice)r4   ndimr   randr   r   floor_div)r   r   r   	keep_probr4   random_tensoroutputs          r$   	drop_pathr   K  s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FMr#   c                      ^  \ rS rSrSrSS\S-  SS4U 4S jjjrS\R                  S\R                  4S jr	S\
4S	 jrS
rU =r$ )DonutSwinDropPathi[  zXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r_   c                 .   > [         TU ]  5         Xl        g N)rF   rG   r   )rZ   r   r\   s     r$   rG   DonutSwinDropPath.__init__^  s    "r#   r   c                 B    [        XR                  U R                  5      $ r   )r   r   r   rZ   r   s     r$   r   DonutSwinDropPath.forwardb  s    FFr#   c                      SU R                    3$ )Nzp=r   rZ   s    r$   
extra_reprDonutSwinDropPath.extra_repre  s    DNN#$$r#   r   r   )r   r   r   r   r   floatrG   r   r   r   strr   r"   r   r   s   @r$   r   r   [  sQ    b#%$, #$ # #GU\\ Gell G%C % %r#   r   c            
          ^  \ rS rSrU 4S jr  SS\R                  S\R                  S-  S\S-  S\	\R                     4S jjr
S	 rS
rU =r$ )DonutSwinSelfAttentionij  c                   > [         TU ]  5         X#-  S:w  a  [        SU SU S35      eX0l        [	        X#-  5      U l        U R                  U R
                  -  U l        [        U[        R                  R                  5      (       a  UOXD4U l        [        R                  " [        R                  " SU R                  S   -  S-
  SU R                  S   -  S-
  -  U5      5      U l        U R#                  SU R%                  5       5        [        R&                  " U R                  U R                  UR(                  S9U l        [        R&                  " U R                  U R                  UR(                  S9U l        [        R&                  " U R                  U R                  UR(                  S9U l        [        R0                  " UR2                  5      U l        g )	Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r/   r   relative_position_indexr   )rF   rG   
ValueErrornum_attention_headsr   attention_head_sizeall_head_sizer   r   r   r   r9   r   rM   r   rN   relative_position_bias_tableregister_buffercreate_relative_position_indexr   qkv_biasquerykeyvaluerU   attention_probs_dropout_probrW   rZ   rY   rf   	num_headsr9   r\   s        r$   rG   DonutSwinSelfAttention.__init__k  s   ?a#C5(^_h^iijk  $- #&s#7 !558P8PP%k;??3K3KLLKS^Rl 	 -/LLKKT--a0014T=M=Ma=P9PST9TUW`a-
) 	68[8[8]^YYt1143E3EFOO\
99T//1C1C&//ZYYt1143E3EFOO\
zz&"E"EFr#   Nr   attention_maskoutput_attentionsr_   c                 v   UR                   u  pEnXESU R                  4nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
[        R                  " XR	                  SS5      5      nU[        R                  " U R                  5      -  nU R                  U R                  R                  S5         nUR                  U R                  S   U R                  S   -  U R                  S   U R                  S   -  S5      nUR                  SSS5      R                  5       nXR!                  S5      -   nUbm  UR                   S   nUR                  XM-  XR"                  XU5      nXR!                  S5      R!                  S5      -   nUR                  SU R"                  XU5      n[$        R&                  R)                  USS9nU R+                  U5      n[        R                  " X5      nUR                  SSSS5      R                  5       nUR-                  5       S S U R.                  4-   nUR                  U5      nU(       a  X4nU$ U4nU$ )Nr2   r   r/   r   re   r   )r4   r   r   r5   r   r   r   r   matmulmathsqrtr   r   r9   r6   r7   ry   r   r   rj   softmaxrW   rb   r   )rZ   r   r   r   r:   rf   r=   hidden_shapequery_layer	key_layervalue_layerattention_scoresrelative_position_bias
mask_shapeattention_probscontext_layernew_context_layer_shapeoutputss                     r$   r   DonutSwinSelfAttention.forward  s    )6(;(;%
"T-E-EFjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR !<<5H5HR5PQ+dii8P8P.QQ!%!B!B4C_C_CdCdegCh!i!7!<!<Q$"2"21"55t7G7G7JTM]M]^_M`7`bd"
 "8!?!?1a!H!S!S!U+.N.Nq.QQ%'--a0J/44(*6N6NPS   02J2J12M2W2WXY2ZZ/44R9Q9QSV\ --//0@b/I ,,7_B%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=2 O\M]r#   c                    [         R                  " U R                  S   5      n[         R                  " U R                  S   5      n[         R                  " [         R                  " X/SS95      n[         R
                  " US5      nUS S 2S S 2S 4   US S 2S S S 24   -
  nUR                  SSS5      R                  5       nUS S 2S S 2S4==   U R                  S   S-
  -  ss'   US S 2S S 2S4==   U R                  S   S-
  -  ss'   US S 2S S 2S4==   SU R                  S   -  S-
  -  ss'   UR                  S5      nU$ )Nr   r   ij)indexingr/   r2   )	r   aranger9   stackmeshgridr   r6   r7   sum)rZ   coords_hcoords_wcoordscoords_flattenrelative_coordsr   s          r$   r   5DonutSwinSelfAttention.create_relative_position_index  s+   << 0 0 34<< 0 0 34U^^X,@4PQvq1(At4~aqj7QQ)11!Q:EEG1a D$4$4Q$7!$;; 1a D$4$4Q$7!$;; 1a A(8(8(;$;a$?? "1"5"5b"9&&r#   )	r   r   rW   r   r   r   r   r   r9   r   )r   r   r   r   rG   r   r   r   r   r!   r   r   r"   r   r   s   @r$   r   r   j  sc    G: 48).	1||1 ))D01  $;	1
 
u||	1f' 'r#   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )DonutSwinSelfOutputi  c                    > [         TU ]  5         [        R                  " X"5      U l        [        R
                  " UR                  5      U l        g r   )rF   rG   r   r   denserU   r   rW   rZ   rY   rf   r\   s      r$   rG   DonutSwinSelfOutput.__init__  s4    YYs(
zz&"E"EFr#   r   input_tensorr_   c                 J    U R                  U5      nU R                  U5      nU$ r   r  rW   )rZ   r   r  s      r$   r   DonutSwinSelfOutput.forward  s$    

=1]3r#   r  
r   r   r   r   rG   r   r   r   r"   r   r   s   @r$   r
  r
    s7    G
U\\  RWR^R^  r#   r
  c            
          ^  \ rS rSrU 4S jr  S
S\R                  S\R                  S-  S\S-  S\	\R                     4S jjr
S	rU =r$ )DonutSwinAttentioni  c                 d   > [         TU ]  5         [        XX45      U l        [	        X5      U l        g r   )rF   rG   r   rZ   r
  r   r   s        r$   rG   DonutSwinAttention.__init__  s(    *6	O	)&6r#   Nr   r   r   r_   c                 f    U R                  XU5      nU R                  US   U5      nU4USS  -   nU$ )Nr   r   )rZ   r   )rZ   r   r   r   self_outputsattention_outputr   s          r$   r   DonutSwinAttention.forward  sC     yy@QR;;|AF#%QR(88r#   )r   rZ   r   )r   r   r   r   rG   r   r   r   r   r!   r   r"   r   r   s   @r$   r  r    s\    7 48).		||	 ))D0	  $;		
 
u||		 	r#   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )DonutSwinIntermediatei  c                   > [         TU ]  5         [        R                  " U[	        UR
                  U-  5      5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )rF   rG   r   r   r   	mlp_ratior  r   
hidden_actr   r   intermediate_act_fnr  s      r$   rG   DonutSwinIntermediate.__init__  sd    YYsC(8(83(>$?@
f''--'-f.?.?'@D$'-'8'8D$r#   r   r_   c                 J    U R                  U5      nU R                  U5      nU$ r   r  r!  r   s     r$   r   DonutSwinIntermediate.forward  s&    

=100?r#   r$  r  r   s   @r$   r  r    s(    9U\\ ell  r#   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )DonutSwinOutputi  c                    > [         TU ]  5         [        R                  " [	        UR
                  U-  5      U5      U l        [        R                  " UR                  5      U l	        g r   )
rF   rG   r   r   r   r  r  rU   rV   rW   r  s      r$   rG   DonutSwinOutput.__init__  sF    YYs6#3#3c#9:C@
zz&"<"<=r#   r   r_   c                 J    U R                  U5      nU R                  U5      nU$ r   r  r   s     r$   r   DonutSwinOutput.forward  s$    

=1]3r#   r  r  r   s   @r$   r'  r'    s(    >
U\\ ell  r#   r'  c                      ^  \ rS rSrSU 4S jjrS rS rS r  SS\R                  S\
\\4   S\S	-  S
\S	-  S\
\R                  \R                  4   4
S jjrSrU =r$ )DonutSwinLayeri  c                   > [         TU ]  5         UR                  U l        X`l        UR                  U l        X0l        [        R                  " X!R                  S9U l	        [        XX@R                  S9U l        US:  a  [        U5      O[        R                  " 5       U l        [        R                  " X!R                  S9U l        [!        X5      U l        [%        X5      U l        g )N)eps)r9   r   )rF   rG   chunk_size_feed_forward
shift_sizer9   r   r   rS   layer_norm_epslayernorm_beforer  	attentionr   Identityr   layernorm_afterr  intermediater'  r   )rZ   rY   rf   r   r   drop_path_rater1  r\   s          r$   rG   DonutSwinLayer.__init__  s    '-'E'E$$!-- 0 "S6K6K L+FP`P`a>Ls>R*>:XZXcXcXe!||C5J5JK1&>%f2r#   c                    [        U5      U R                  ::  an  [        S5      U l        [        R
                  R                  5       (       a*  [        R                   " [        R                  " U5      5      O
[        U5      U l        g g Nr   )minr9   r   r1  r   rg   rh   tensor)rZ   r   s     r$   set_shift_and_window_size(DonutSwinLayer.set_shift_and_window_size  s_     D$4$44'lDO=BYY=Q=Q=S=S		%,,'789Y\]mYn  5r#   c           	         U R                   S:  Gae  [        R                  " SXS4X4S9n[        SU R                  * 5      [        U R                  * U R                   * 5      [        U R                   * S 5      4n[        SU R                  * 5      [        U R                  * U R                   * 5      [        U R                   * S 5      4nSnU H  n	U H  n
XS S 2XS S 24'   US-  nM     M     [        XPR                  5      nUR                  SU R                  U R                  -  5      nUR                  S5      UR                  S5      -
  nUR                  US:g  S5      R                  US:H  S5      nU$ S nU$ )Nr   r   r   r2   r/   g      Yr   )	r1  r   rN   slicer9   r?   r5   ry   masked_fill)rZ   r;   r<   r   r   img_maskheight_sliceswidth_slicescountheight_slicewidth_slicemask_windows	attn_masks                r$   get_attn_maskDonutSwinLayer.get_attn_mask  sy   ??Q{{Ava#8UHa$***+t'''$//)9:t&-M a$***+t'''$//)9:t&-L
 E -#/K@EQ1<=QJE $0 !.
 ,H6F6FGL',,R1A1ADDTDT1TUL$..q1L4J4J14MMI!--i1nfEQQR[_`R`befI  Ir#   c                     U R                   X0R                   -  -
  U R                   -  nU R                   X R                   -  -
  U R                   -  nSSSUSU4n[        R                  R                  X5      nX4$ r;  )r9   r   rj   r   )rZ   r   r;   r<   	pad_right
pad_bottomr   s          r$   r   DonutSwinLayer.maybe_pad8  sy    %%0@0@(@@DDTDTT	&&2B2B)BBdFVFVV
Ay!Z8
))-D((r#   r   r   r   Nalways_partitionr_   c                    U(       d  U R                  U5        O Uu  pVUR                  5       u  pxn	Un
U R                  U5      nUR                  XuXi5      nU R	                  XU5      u  pUR
                  u  ppU R                  S:  a.  [        R                  " XR                  * U R                  * 4SS9nOUn[        XR                  5      nUR                  SU R                  U R                  -  U	5      nU R                  XUR                  UR                  S9nU R                  UUUS9nUS   nUR                  SU R                  U R                  U	5      n[        UU R                  X5      nU R                  S:  a-  [        R                  " UU R                  U R                  4SS9nOUnUS   S:  =(       d    US   S:  nU(       a  US S 2S U2S U2S S 24   R!                  5       nUR                  XuU-  U	5      nXR#                  U5      -   nU R%                  U5      nU R'                  U5      nXR)                  U5      -   nU(       a	  UUS	   4nU$ U4nU$ )
Nr   )r   r/   )shiftsdimsr2   r   )r   r   r1   r   )r>  rb   r3  r5   r   r4   r1  r   rollr?   r9   rK  r   r   r4  rA   r7   r   r6  r7  r   )rZ   r   r   r   rQ  r;   r<   r:   r{   channelsshortcutr   
height_pad	width_padshifted_hidden_stateshidden_states_windowsrJ  attention_outputsr  attention_windowsshifted_windows
was_paddedlayer_outputlayer_outputss                           r$   r   DonutSwinLayer.forward?  s{     **+;<("/"4"4"6
x --m<%**:uO %)NN=%$P!&3&9&9#y??Q$)JJ}FVY]YhYhXhEipv$w!$1! !11FHXHX Y 5 : :2t?O?ORVRbRb?bdl m&&)<)<EZEaEa ' 
	 !NN+@)_pNq,Q/,11"d6F6FHXHXZbc():D<L<Ljd ??Q %

?DOOUYUdUdCelr s /]Q&;*Q-!*;
 1!WfWfufa2G H S S U-22:~xX >>2C#DD++M:((6${{<'@@@Q'8';< YeWfr#   )
r4  r0  r   r   r7  r6  r3  r   r1  r9   )r   r   FF)r   r   r   r   rG   r>  rK  r   r   r   r!   r   r   r   r"   r   r   s   @r$   r-  r-    s    38) */(->||>  S/>  $;	>
 +> 
u||U\\)	*> >r#   r-  c                      ^  \ rS rSrU 4S jr  SS\R                  S\\\4   S\	S-  S\	S-  S\\R                     4
S	 jjr
S
rU =r$ )DonutSwinStagei  c                 R  > [         T	U ]  5         Xl        X l        [        R
                  " [        U5       Vs/ s H+  n[        UUUUXh   US-  S:X  a  SOUR                  S-  S9PM-     sn5      U l	        Ub  U" X2[        R                  S9U l        OS U l        SU l        g s  snf )Nr/   r   )rY   rf   r   r   r8  r1  )rf   r   F)rF   rG   rY   rf   r   
ModuleListranger-  r9   blocksrS   
downsamplepointing)
rZ   rY   rf   r   depthr   r   rj  ir\   s
            r$   rG   DonutSwinStage.__init__  s    mm u
 &A !%5'#,<%&UaZqf6H6HA6M &

 !()9r||\DO"DO'
s   2B$r   r   r   NrQ  r_   c                     Uu  pV[        U R                  5       H  u  pxU" XX45      n	U	S   nM     Un
U R                  b%  US-   S-  US-   S-  pXVX4nU R                  X5      nOXVXV4nXU4nU(       a  UW	SS  -  nU$ )Nr   r   r/   )	enumerateri  rj  )rZ   r   r   r   rQ  r;   r<   rm  layer_modulera  !hidden_states_before_downsamplingheight_downsampledwidth_downsampledr|   stage_outputss                  r$   r   DonutSwinStage.forward  s     )(5OA(J[nM)!,M  6
 -:)??&5;aZA4EPQ	VWGW 1!'0B V OO,M`M!' >&K\]]12..Mr#   )ri  rY   rf   rj  rk  rc  )r   r   r   r   rG   r   r   r!   r   r   r   r"   r   r   s   @r$   re  re    sg    < */(-||  S/  $;	
 + 
u||	 r#   re  c                      ^  \ rS rSrU 4S jr     SS\R                  S\\\4   S\	S-  S\	S-  S\	S-  S	\	S-  S
\	S-  S\\
-  4S jjrSrU =r$ )DonutSwinEncoderi  c                   > [         TU ]  5         [        UR                  5      U l        Xl        [        R                  " SUR                  [        UR                  5      SS9 Vs/ s H  o3R                  5       PM     nn[        R                  " [        U R                  5       Vs/ s H  n[        U[        UR                   SU-  -  5      US   SU-  -  US   SU-  -  4UR                  U   UR"                  U   U[        UR                  S U 5      [        UR                  S US-    5       XPR                  S-
  :  a  [$        OS S9PM     sn5      U l        SU l        g s  snf s  snf )Nr   cpu)r   r/   r   )rY   rf   r   rl  r   r   rj  F)rF   rG   lendepths
num_layersrY   r   linspacer8  r  itemr   rg  rh  re  r   rO   r   r   layersgradient_checkpointing)rZ   rY   rK   xdpri_layerr\   s         r$   rG   DonutSwinEncoder.__init__  sQ   fmm,!&63H3H#fmmJ\ej!kl!kAvvx!klmm  %T__5  6G !F,,q'z9:&/lq'z&BIaLUVX_U_D`%a --0$..w7!#fmmHW&=">V]]S`U\_`U`EaAbc9@??UVCV9V4]a  6
 ',#! ms   &E&(B*E+r   r   r   Noutput_hidden_states(output_hidden_states_before_downsamplingrQ  return_dictr_   c                    U(       a  SOS nU(       a  SOS n	U(       a  SOS n
U(       aB  UR                   u  pnUR                  " U/UQUP76 nUR                  SSSS5      nX4-  nX4-  n	[        U R                  5       H  u  nnU" XX65      nUS   nUS   nUS   nUS   US   4nU(       aS  U(       aL  UR                   u  pnUR                  " U/US   US   4QUP76 nUR                  SSSS5      nUU4-  nX4-  n	OPU(       aI  U(       dB  UR                   u  pnUR                  " U/UQUP76 nUR                  SSSS5      nX4-  nX4-  n	U(       d  M  U
USS  -  n
M     U(       d  [        S XU
4 5       5      $ [        UUU
U	S	9$ )
Nr   r   r   r   r/   r   r2   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r   ).0vs     r$   	<genexpr>+DonutSwinEncoder.forward.<locals>.<genexpr>  s     m$[q$[s   	)r   r   r   r   )r4   r5   r6   rp  r  r!   r   )rZ   r   r   r   r  r  rQ  r  all_hidden_statesall_reshaped_hidden_statesall_self_attentionsr:   r{   r   reshaped_hidden_staterm  rq  ra  rr  r|   s                       r$   r   DonutSwinEncoder.forward  s    #7BD+?RT"$5b4)6)<)<&J;$1$6$6z$bDT$bVa$b!$9$A$A!Q1$M!!11&*BB&(5OA|(J[nM)!,M0=a0@- -a 0 1" 57H7LM#(P-N-T-T*
{ )J(N(N)"3A"68I!8L!M)OZ)% )>(E(EaAq(Q%!&G%II!*.FF*%.V-:-@-@*
{(5(:(::(fHX(fZe(f%(=(E(EaAq(Q%!%55!*.FF*  #}QR'88#9  6< m]GZ$[mmm%++*#=	
 	
r#   )rY   r  r  r}  )FFFFT)r   r   r   r   rG   r   r   r!   r   r   r   r   r"   r   r   s   @r$   rx  rx    s    ,4 */,1@E(-#'<
||<
  S/<
  $;	<

 #Tk<
 37+<
 +<
 D[<
 
'	'<
 <
r#   rx  c                   r   ^  \ rS rSr% \\S'   SrSrSrSr	S/r
\R                  " 5       U 4S j5       rS	rU =r$ )
DonutSwinPreTrainedModeli  rY   donutru   )imageTre  c                   > [         TU ]  U5        [        U[        5      (       a\  UR                  b   [
        R                  " UR                  5        UR                  b!  [
        R                  " UR                  5        gg[        U[        5      (       aP  [
        R                  " UR                  5        [
        R                  " UR                  UR                  5       5        gg)zInitialize the weightsN)rF   _init_weightsr   rC   rP   initzeros_rR   r   r   copy_r   r   )rZ   moduler\   s     r$   r  &DonutSwinPreTrainedModel._init_weights  s     	f%f122  ,F--.))5F667 6 677KK;;<JJv55v7\7\7^_ 8r#   r   )r   r   r   r   r   r    base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modulesr   no_gradr  r"   r   r   s   @r$   r  r    sE     $O!&*#)*
]]_
` 
`r#   r  c                      ^  \ rS rSrSU 4S jjrS r\      SS\R                  S-  S\R                  S-  S\
S-  S\
S-  S	\
S
\
S-  S\\-  4S jj5       rSrU =r$ )DonutSwinModeli'  c                   > [         TU ]  U5        Xl        [        UR                  5      U l        [        UR                  SU R
                  S-
  -  -  5      U l        [        XS9U l
        [        XR                  R                  5      U l        U(       a  [        R                  " S5      OSU l        U R#                  5         g)z
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
use_mask_token (`bool`, *optional*, defaults to `False`):
    Whether to use a mask token for masked image modeling.
r/   r   )r[   N)rF   rG   rY   r{  r|  r}  r   rO   num_featuresrC   r^   rx  rL   encoderr   AdaptiveAvgPool1dpooler	post_init)rZ   rY   add_pooling_layerr[   r\   s       r$   rG   DonutSwinModel.__init__)  s     	 fmm, 0 0119L3M MN-fT'0J0JK1Bb**1- 	r#   c                 .    U R                   R                  $ r   )r^   rI   r   s    r$   get_input_embeddings#DonutSwinModel.get_input_embeddings=  s    ///r#   Nru   rv   r   r  rs   r  r_   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  XUS9u  pU R                  UU	UUUS9n
U
S   nSnU R                  b8  U R                  UR                  SS5      5      n[        R                  " US5      nU(       d  X4U
SS -   nU$ [        UUU
R                  U
R                  U
R                  S9$ )	z
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
Nz You have to specify pixel_values)rv   rs   )r   r  r  r   r   r/   )r   r(   r   r   r   )rY   r   r  r  r   r^   r  r  r   r   r   r&   r   r   r   )rZ   ru   rv   r   r  rs   r  kwargsembedding_outputr   encoder_outputssequence_outputpooled_outputr   s                 r$   r   DonutSwinModel.forward@  s5    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY?@@-1__Tl .= .
* ,,/!5# ' 
 *!,;;" KK(A(A!Q(GHM!MM-;M%58KKFM#-')77&11#2#I#I
 	
r#   )rY   r^   r  r  r}  r  )TFNNNNFN)r   r   r   r   rG   r  r   r   r   r   r   r!   r&   r   r"   r   r   s   @r$   r  r  '  s    (0  2637)-,0).#'5
''$.5
 ))D05
  $;	5

 #Tk5
 #'5
 D[5
 
%	%5
 5
r#   r  a  
    DonutSwin Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune DonutSwin on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    c                      ^  \ rS rSrU 4S jr\      SS\R                  S-  S\R                  S-  S\	S-  S\	S-  S\	S	\	S-  S
\
\-  4S jj5       rSrU =r$ )DonutSwinForImageClassificationiy  c                 D  > [         TU ]  U5        UR                  U l        [        U5      U l        UR                  S:  a5  [
        R                  " U R                  R                  UR                  5      O[
        R                  " 5       U l	        U R                  5         g r;  )rF   rG   
num_labelsr  r  r   r   r  r5  
classifierr  )rZ   rY   r\   s     r$   rG   (DonutSwinForImageClassification.__init__  sx      ++#F+
 FLEVEVYZEZBIIdjj--v/@/@A`b`k`k`m 	
 	r#   Nru   labelsr   r  rs   r  r_   c                 X   Ub  UOU R                   R                  nU R                  UUUUUS9nUS   n	U R                  U	5      n
SnUb  U R	                  X*U R                   5      nU(       d  U
4USS -   nUb  U4U-   $ U$ [        UU
UR                  UR                  UR                  S9$ )ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
N)r   r  rs   r  r   r/   )r,   r-   r   r   r   )	rY   r  r  r  loss_functionr*   r   r   r   )rZ   ru   r  r   r  rs   r  r  r   r  r-   r,   r   s                r$   r   'DonutSwinForImageClassification.forward  s    " &1%<k$++BYBY**/!5%=#  
  
/%%fdkkBDY,F)-)9TGf$EvE-!//))#*#A#A
 	
r#   )r  r  r  r  )r   r   r   r   rG   r   r   r   
LongTensorr   r!   r*   r   r"   r   r   s   @r$   r  r  y  s       26*.)-,0).#',
''$.,
   4',
  $;	,

 #Tk,
 #',
 D[,
 
/	/,
 ,
r#   r  )r  r  r  )r   F)5r   collections.abcr   r   dataclassesr   r   r    r   r  activationsr   modeling_layersr   modeling_utilsr	   utilsr
   r   r   r   configuration_donut_swinr   
get_loggerr   loggerr   r&   r*   r?   rA   r   rC   rH   r   r   r   r   r   r   r   r
  r  r  r'  r-  re  rx  r  r  r  __all__r   r#   r$   <module>r     sA  
   !   & ! 9 - D D 5 
		H	% H[ H H  H; H H& H[ H H,	Y-")) Y-z(-ryy (-X3BII 3nU\\ e T V[VbVb  %		 %Z'RYY Z'|
")) 
 &BII  	bii 	wRYY wv4/ 4pS
ryy S
l ` ` `, N
- N
 N
b <
&> <
<
~ \r#   