
    Z jՅ                     "   S r SSKrSSKrSSKJr  SSKJr  SSKJr  SSKJ	r	J
r
Jr  SSKJr  SS	KJrJr  SS
KJr  SSKJrJr  SSKJr  SSKJrJr  SSKJr  \R:                  " \5      r " S S\R@                  5      r! " S S\R@                  5      r" " S S\RF                  5      r$ " S S\RF                  5      r% " S S\5      r&\ " S S\5      5       r' " S S\'5      r(\" SS9 " S  S!\'5      5       r)\" S"S9 " S# S$\'\5      5       r*S$S/r+g)%z/PyTorch TrOCR decoder model (based on RoBERTa).    N)nn)CrossEntropyLoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions)PreTrainedModel)auto_docstringlogging   )TrOCRConfigc                      ^  \ rS rSrSrS\S\4U 4S jjr SS\R                  S\S	\R                  S-  4U 4S
 jjjr	Sr
U =r$ )TrOCRLearnedPositionalEmbedding%   zF
This module learns positional embeddings up to a fixed maximum size.
num_embeddingsembedding_dimc                 L   > SU l         [        TU ]	  XR                   -   U5        g N   )offsetsuper__init__)selfr   r   	__class__s      y/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/trocr/modeling_trocr.pyr   (TrOCRLearnedPositionalEmbedding.__init__*   s"     ++5}E    N	input_idspast_key_values_lengthposition_idsc                   > Uc]  UR                   SS u  pE[        R                  " X"U-   [        R                  U R                  R
                  S9R                  US5      nOUR                  S5      n[        TU ]%  X0R                  -   5      $ )z3`input_ids' shape is expected to be [bsz x seqlen].Nr   )dtypedevicer   )shapetorcharangelongweightr*   expand	unsqueezer   forwardr   )r    r%   r&   r'   bszseq_lenr!   s         r"   r3   'TrOCRLearnedPositionalEmbedding.forward0   s    
 $??2A.LC <<&(HPUPZPZcgcncncucufS"o  (11!4Lw|kk9::r$   )r   )r   N)__name__
__module____qualname____firstlineno____doc__intr   r-   Tensorr3   __static_attributes____classcell__r!   s   @r"   r   r   %   sW    Fs F3 F mq;;?B;V[VbVbeiVi; ;r$   r   c            
       r   ^  \ rS rSrSrSS\S\S\S\S-  4U 4S jjjrS	\R                  4U 4S
 jjr
SrU =r$ )TrOCRScaledWordEmbeddingA   zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
r   r   padding_idxembed_scaleNc                 2   > [         TU ]  XU5        X@l        g N)r   r   rE   )r    r   r   rD   rE   r!   s        r"   r   !TrOCRScaledWordEmbedding.__init__F   s    D&r$   r%   c                 <   > [         TU ]  U5      U R                  -  $ rG   )r   r3   rE   )r    r%   r!   s     r"   r3    TrOCRScaledWordEmbedding.forwardJ   s    wy)D,<,<<<r$   rE   )      ?)r7   r8   r9   r:   r;   r<   floatr   r-   r=   r3   r>   r?   r@   s   @r"   rB   rB   A   sJ    's '3 'S '_dgk_k ' '= = =r$   rB   c            	          ^  \ rS rSrSrSS\S\S\S-  4U 4S jjjr\SS\S\S\S-  4S	 jj5       r\	R                  " 5       SS
\	R                  S\4S jj5       r SS
\	R                  S\S\S-  4S jjrSrU =r$ )"TrOCRSinusoidalPositionalEmbeddingN   zDThis module produces sinusoidal positional embeddings of any length.Nnum_positionsr   rD   c                 v   > [         TU ]  5         SU l        X l        X0l        U R                  XU5      U l        g r   )r   r   r   r   rD   get_embeddingweights)r    rQ   r   rD   r!   s       r"   r   +TrOCRSinusoidalPositionalEmbedding.__init__Q   s5    *&))-Tr$   r   c                    US-  n[         R                  " S5      US-
  -  n[        R                  " [        R                  " U[        R
                  S9R                  5       U* -  5      n[        R                  " U [        R
                  S9R                  5       R                  S5      UR                  S5      -  n[        R                  " [        R                  " U5      [        R                  " U5      /SS9R                  U S5      nUS-  S:X  a,  [        R                  " U[        R                  " U S5      /SS9nUb  SXBSS24'   UR                  [        R                  " 5       5      $ )	z
Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
description in Section 3.5 of "Attention Is All You Need".
r   i'  r   )r)   r   dimr+   N)mathlogr-   expr.   int64rM   r2   catsincosviewzerostoget_default_dtype)r   r   rD   half_dimembs        r"   rS   0TrOCRSinusoidalPositionalEmbedding.get_embeddingX   s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQvve--/00r$   r%   r&   c                    UR                  5       u  p4U R                  XR                  U5      R                  UR                  5      nU R                  S-   U-   nU R
                  b  X`R
                  R                  S5      :  a+  U R                  X`R                  U R                  5      U l        U R
                  R                  SUR                  S5      5      R                  X4S5      R                  5       nU$ )Nr   r   r+   )size"create_position_ids_from_input_idsrD   rb   r*   rT   rS   r   index_selectr`   detach)r    r%   r&   r4   r5   r'   max_posxs           r"   r3   *TrOCRSinusoidalPositionalEmbedding.forwardk   s     ~~'>>yJZJZ\rsvv

 ""Q&0<<7\\->->q-A#A--g7I7I4K[K[\DLLL%%a):):2)>?DDSSUV]]_r$   c                     UR                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`.
r   rW   )ner<   r-   cumsumtype_asr/   )r    r%   rD   r&   maskincremental_indicess         r"   ri   ETrOCRSinusoidalPositionalEmbedding.create_position_ids_from_input_ids}   sW     ||K(,,.$||Da8@@FI__cgg"'')K77r$   )r   r   rD   rT   rG   )r   )r7   r8   r9   r:   r;   r<   r   staticmethodrS   r-   no_gradr=   r3   ri   r>   r?   r@   s   @r"   rO   rO   N   s    NUc U# UCRVJ U U 1c 1# 1CRVJ 1 1$ ]]_ s  $ _`
8
847
8QTW[Q[
8 
8r$   rO   c                   l  ^  \ rS rSrSr       SS\S\S\S-  S\S-  S\S-  S	\S-  S
\S-  S\S-  S\S-  4U 4S jjjr    SS\	R                  S\	R                  S-  S\S-  S\	R                  S-  S\S-  S\\	R                  \	R                  S-  \\	R                     S-  4   4S jjrSrU =r$ )TrOCRAttention   z>Multi-headed attention from 'Attention Is All You Need' paper.N	embed_dim	num_headskdimvdimdropout
is_decoderbiasis_cross_attention	layer_idxc                 2  > [         TU ]  5         X l        Ub  UOUU l        Ub  UOUU l        X0l        X`l        X#-  U l        U R                  U-  U R                  :X  d  [        SU R                   SU S35      eU R                  S-  U l	        Xpl
        Xl        [        R                  " U R                  X(S9U l        [        R                  " U R                  X(S9U l        [        R                  " X"US9U l        [        R                  " X"US9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩r   )r   r   r{   r}   r~   r|   r   head_dim
ValueErrorscalingr   r   r   Lineark_projv_projq_projout_proj)r    configr{   r|   r}   r~   r   r   r   r   r   r!   s              r"   r   TrOCRAttention.__init__   s     	" ,D)	 ,D)	"!.	)T^^;MdnnM] ^;b"  }}d*$"ii		9@ii		9@ii	4@		)TBr$   hidden_stateskey_value_statespast_key_valuesattention_maskoutput_attentionsreturnc                 	   USLnUR                  5       u  pn
U R                  U5      U R                  -  nSnUb]  [        U[        5      (       aF  UR
                  R                  U R                  5      nU(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       aQ  UbN  U(       aG  WR                  U R                     R                  nUR                  U R                     R                  nOU R                  U5      nU R                  U5      nUR                  USU R                   U R"                  5      R%                  SS5      nUR                  USU R                   U R"                  5      R%                  SS5      nUbU  WR'                  UUU R                  5      u  nnU(       a.  [        U[        5      (       a  SUR
                  U R                  '   XR                   -  SU R"                  4nUR                  XU R                   U R"                  5      R%                  SS5      nUR(                  " U6 nUR(                  " U6 nUR(                  " U6 nUR                  S5      n[*        R,                  " XR%                  SS5      5      nUR                  5       XR                   -  U	U4:w  a.  [/        SXR                   -  U	U4 SUR                  5        35      eUbz  UR                  5       USU	U4:w  a#  [/        S	USU	U4 SUR                  5        35      eUR                  XR                   U	U5      U-   nUR                  XR                   -  U	U5      n[0        R2                  R5                  USS
9nU(       a=  UR                  XR                   U	U5      nUR                  XR                   -  U	U5      nOSn[0        R2                  R7                  UU R6                  U R8                  S9n[*        R,                  " UU5      nUR                  5       XR                   -  XR"                  4:w  a5  [/        SXR                   XR"                  4 SUR                  5        35      eUR                  XR                   XR"                  5      nUR%                  SS5      nUR)                  XU
5      nU R;                  U5      nUU4$ )z#Input shape: Batch x Time x ChannelNFr+   r   r   Tz$Attention weights should be of size z	, but is z!Attention mask should be of size rW   ptrainingz `attn_output` should be of size )rh   r   r   
isinstancer	   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   r`   r|   r   	transposeupdatereshaper-   bmmr   r   
functionalsoftmaxr   r   r   )r    r   r   r   r   r   kwargsr   r4   tgt_lenr{   query_statesr   curr_past_key_valuescurrent_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                          r"   r3   TrOCRAttention.forward   sg    .T9"/"4"4"6i {{=1DLL@
&/+>??,77;;DNNK
%+:+P+P(+:+O+O('6$-?)]/"=*-44T^^DIIJ/66t~~FMML^4J;;~6L#b$..$--PZZ[\^_`J',,S"dnndmmT^^_`bcdL*+?+F+FzS_aeaoao+p(
L%*_FY*Z*ZAEO..t~~>NN*B>
#((t~~t}}U__`acde#++Z8''4
#++Z8//!$yy/C/CAq/IJ3#7'"JJ6nn8LgW^7_6` a %%'(* 
 %""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S..'7SVddL',,S>>-A7GTL}},,\r,B
 %1$5$5c>>7T[$\!055cNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CR_R_3`2a b$$&') 
 "&&sNNG]]S!++Aq1!))#	BmmK0111r$   )r   r{   r   r   r   r}   r   r|   r   r   r   r   r~   )NNg        FTFN)NNNF)r7   r8   r9   r:   r;   r<   rM   boolr   r-   r=   r   tupler3   r>   r?   r@   s   @r"   ry   ry      sH   H   #"' */!%!C !C 	!C
 Dj!C Dj!C !C 4K!C Tk!C !4K!C $;!C !CL 15(,.2).d2||d2  ,,-d2 	d2
 t+d2  $;d2 
u||U\\D0%2E2LL	Md2 d2r$   ry   c                      ^  \ rS rSrSS\4U 4S jjjr      SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S	\S-  S
\	S-  S\	S-  4S jjr
SrU =r$ )TrOCRDecoderLayeri  Nr   c                 j  > [         TU ]  5         UR                  U l        [	        UU R                  UR
                  UR                  SUS9U l        UR                  U l        [        UR                     U l        UR                  U l        [        R                  " U R                  5      U l        UR                   (       am  [	        UU R                  UR
                  UR"                  UR"                  UR                  SSUS9	U l        [        R                  " U R                  5      U l        [        R(                  " U R                  UR*                  5      U l        [        R(                  " UR*                  U R                  5      U l        [        R                  " U R                  5      U l        g )NT)r{   r|   r   r   r   )r{   r|   r}   r~   r   r   r   r   )r   r   hidden_sizer{   ry   decoder_attention_headsattention_dropout	self_attnr   r   activation_functionactivation_fnactivation_dropoutr   	LayerNormself_attn_layer_normr   cross_attention_hidden_sizeencoder_attnencoder_attn_layer_normr   decoder_ffn_dimfc1fc2final_layer_norm)r    r   r   r!   s      r"   r   TrOCRDecoderLayer.__init__  s=   ++'nn44,,
 ~~#F$>$>?"(";";$&LL$@! ... 88777700#'#
!D ,.<<+GD(99T^^V-C-CD99V33T^^D "T^^ <r$   r   r   encoder_hidden_statesencoder_attention_maskr   r   	use_cachec                    Un	U R                  UUUUS9u  p[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nSnUb^  Un	U R                  UUUUUS9u  p[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nUn	U R                  U R                  U5      5      n[        R                  R                  XR                  U R                  S9nU R                  U5      n[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nU4nU(       a  XU4-  nU$ )a^  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    encoder_hidden_states (`torch.FloatTensor`):
        cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
    encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    past_key_values (`Cache`): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r   r   r   r   r   N)r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   )r    r   r   r   r   r   r   r   r   residualself_attn_weightscross_attn_weightsoutputss                r"   r3   TrOCRDecoderLayer.forward<  s   2 ! ,0>>'+)/	 ,: ,
( --m||VZVcVc-d 011-@ " ,$H040A0A+!65 /"3 1B 1-M MM11-<<Z^ZgZg1hM$4M 88GM !**488M+BC--m?V?Vaeanan-o/--m||VZVcVc-d 0--m< "+=>>Gr$   )r   r   r   r{   r   r   r   r   r   r   r   rG   )NNNNFT)r7   r8   r9   r:   r   r   r-   r=   r   r   r3   r>   r?   r@   s   @r"   r   r     s    "={ "= "=N /3596:(,).!%G||G t+G  %||d2	G
 !&t 3G G  $;G $;G Gr$   r   c                   .    \ rS rSr% \\S'   SrSrS/rSr	g)TrOCRPreTrainedModeli  r   modelTr    N)
r7   r8   r9   r:   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modulesr>   r   r$   r"   r   r     s    &*#,-r$   r   c                   R   ^  \ rS rSrSrS\4U 4S jjr          SS jrSrU =r	$ )TrOCRDecoderi  z
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TrOCRDecoderLayer`]

Args:
    config: TrOCRConfig
r   c           
      |  > [         TU ]  U5        UR                  U l        UR                  U l        UR
                  U l        UR                  (       a   [        R                  " UR                  5      OSn[        UR                  UR                  U R                  US9U l        UR                  (       a&  [        UR                   UR                  5      U l        O@[%        UR                   U R                  -   S-   UR                  U R                  5      U l        UR&                  (       a&  [(        R*                  " UR                  5      U l        OS U l        [(        R,                  " [/        UR0                  5       Vs/ s H  n[3        XS9PM     sn5      U l        SU l        U R9                  5         g s  snf )NrL   rK   r   )r   F)r   r   r   decoder_layerdrop	layerdroppad_token_idrD   scale_embeddingrY   sqrtr   rB   
vocab_sizeembed_tokensuse_learned_position_embeddingsr   max_position_embeddingsembed_positionsrO   layernorm_embeddingr   r   
ModuleListrangedecoder_layersr   r   gradient_checkpointing	post_init)r    r   rE   ir!   s       r"   r   TrOCRDecoder.__init__  sT    ~~11!..7=7M7Mdii 2 23SV4v1143C3CQ\
 11#B6CaCacicucu#vD #E..1A1AAAE""  $D  %%')||F4F4F'GD$'+D$mmUZ[a[p[pUq$rUqPQ%6v%KUq$rs&+#	 %ss   F9c                    Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  UOU R                   R                  nU
b  U
OU R                   R                  n
Ub  Ub  [        S5      eUb"  UnUR                  SUR                  S   5      nOUb  USS2SS2S4   nO[        S5      eU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnU(       ab  Uc_  Uc  U R                   R                  (       a.  [        [        U R                   S9[        U R                   S95      O[        U R                   S9nUb  UR                  5       OSnUc  U R!                  U5      nU R                   R"                  (       a  U R%                  XS	9nOU R%                  XS	9nXn-   nU R&                  b  U R'                  U5      n[(        R*                  R-                  XR,                  U R                  S
9n[/        U R                   UUUS9nUb  Ub  [1        U R                   UUUS9nU	(       a  SOSnU(       a  SOSnU(       a  Ub  SOSn[3        U R4                  5       H}  u  nnU	(       a  UU4-  nU R                  (       a(  [6        R8                  " / 5      nUU R:                  :  a  ML  U" UUUUUUUS9nUS   nU(       d  Mf  UUS   4-  nUc  Mt  UUS   4-  nM     U	(       a  UU4-  nU
(       d  [=        S XUUU4 5       5      $ [?        UUUUUS9$ )a  
Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
        provide it.

        Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
        [`PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
        of the decoder.
    encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
        Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
        selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
        cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

        If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
        that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
        all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer+   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsz^`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...F)r   r   )r&   r   )r   inputs_embedsr   r   )r   r   r   r   r   )r   r   r   r   r   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7frG   r   ).0vs     r"   	<genexpr>'TrOCRDecoder.forward.<locals>.<genexpr>Z  s      rA rs   	)last_hidden_stater   r   
attentionscross_attentions) r   r   output_hidden_statesr   return_dictr   r`   r,   r   r   loggerwarning_onceis_encoder_decoderr	   r   get_seq_lengthr   r   r   r   r   r   r   r   r   	enumerater   r-   randr   r   r   )r    r%   r   r   r   r   r   r   r   r   r   r   inputr&   	embed_posr   all_hidden_statesall_self_attnsall_cross_attentionsidxdecoder_layerdropout_probabilitylayer_outputss                          r"   r3   TrOCRDecoder.forward  sV   @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++BYBY  ]%>stt"E!r5;;r?;I&!!Q(+Edee&&4==##t "	0 )48V8V $L$DlZ^ZeZeFfg!5  FUE`!?!?!Afg  --i8M;;66,,U,bI,,Y,fI%1##/ 44]CM--m||VZVcVc-d+;;')+	
 !,1G1S%>{{+5&;	&" #7BD0d&7<Q<]rdh"+DKK"8C#!m%55!}}&+jjn#&7)%'= /"3#M *!,M  =#3"55(4(]1-=,??(1 #96  -!11 ':K^]qr  
 9+++%1
 	
r$   )r   r   r   r   r   r   r   rD   )
NNNNNNNNNN)
r7   r8   r9   r:   r;   r   r   r3   r>   r?   r@   s   @r"   r   r     sA    { B "#!p
 p
r$   r   a  
    The TrOCR Model with a language modeling head. Can be used for summarization.
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    )custom_introc                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )TrOCRDecoderWrapperih  c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g rG   )r   r   r   decoderr   r    r   r!   s     r"   r   TrOCRDecoderWrapper.__init__p  s&     #F+r$   c                 &    U R                   " U0 UD6$ rG   r  )r    argsr   s      r"   r3   TrOCRDecoderWrapper.forwardu  s    ||T,V,,r$   r  )r7   r8   r9   r:   r   r3   r>   r?   r@   s   @r"   r  r  h  s    
- -r$   r  zy
    The TrOCR Decoder with a language modeling head. Can be used as the decoder part of [`EncoderDecoderModel`] and
    c                   f  ^  \ rS rSrSS0rU 4S jrS rS rS rS r	\
           SS
\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\S	-  S\R                  S	-  S\R                  S	-  S\S	-  S\S	-  S\S	-  S\S	-  S\\-  4S jj5       rSrU =r$ )TrOCRForCausalLMiy  zoutput_projection.weightz!model.decoder.embed_tokens.weightc                    > SUl         SUl        [        TU ]  U5        [	        U5      U l        [        R                  " UR                  UR                  SS9U l
        U R                  5         g )NTFr   )r   r   r   r   r  r   r   r   r   r   output_projectionr   r  s     r"   r   TrOCRForCausalLM.__init__  sZ     $)! (0
!#6+=+=v?P?PW\!] 	r$   c                 B    U R                   R                  R                  $ rG   r   r  r   r    s    r"   get_input_embeddings%TrOCRForCausalLM.get_input_embeddings  s    zz!!...r$   c                 8    XR                   R                  l        g rG   r  )r    values     r"   set_input_embeddings%TrOCRForCausalLM.set_input_embeddings  s    */

'r$   c                     U R                   $ rG   r  r   s    r"   get_output_embeddings&TrOCRForCausalLM.get_output_embeddings  s    %%%r$   c                     Xl         g rG   r(  )r    new_embeddingss     r"   set_output_embeddings&TrOCRForCausalLM.set_output_embeddings  s    !/r$   Nr%   r   r   r   r   r   labelsr   r   r   r   r   c                 H   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU R                  R                  UUUUUUUU	U
US9
nU R                  US   5      nSnUbF  [        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import (
...     TrOCRConfig,
...     TrOCRProcessor,
...     TrOCRForCausalLM,
...     ViTConfig,
...     ViTModel,
...     VisionEncoderDecoderModel,
... )
>>> import httpx
>>> from io import BytesIO
>>> from PIL import Image

>>> # TrOCR is a decoder model and should be used within a VisionEncoderDecoderModel
>>> # init vision2text model with random weights
>>> encoder = ViTModel(ViTConfig())
>>> decoder = TrOCRForCausalLM(TrOCRConfig())
>>> model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)

>>> # If you want to start from the pretrained model, load the checkpoint with `VisionEncoderDecoderModel`
>>> processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
>>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

>>> # load image from the IAM dataset
>>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read())).convert("RGB")
>>> pixel_values = processor(image, return_tensors="pt").pixel_values
>>> text = "industry, ' Mr. Brown commented icily. ' Let us have a"

>>> # training
>>> model.config.decoder_start_token_id = processor.tokenizer.eos_token_id
>>> model.config.pad_token_id = processor.tokenizer.pad_token_id
>>> model.config.vocab_size = model.config.decoder.vocab_size

>>> labels = processor.tokenizer(text, return_tensors="pt").input_ids
>>> outputs = model(pixel_values, labels=labels)
>>> loss = outputs.loss
>>> round(loss.item(), 2)
5.30

>>> # inference
>>> generated_ids = model.generate(pixel_values)
>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> generated_text
'industry, " Mr. Brown commented icily. " Let us have a'
```N)
r%   r   r   r   r   r   r   r   r   r   r   r+   r   )losslogitsr   r   r   r   )r   r   r   r   r   r  r  r   r`   r   r   r   r   r   r   )r    r%   r   r   r   r   r   r/  r   r   r   r   r   r   r2  r1  loss_fctoutputs                     r"   r3   TrOCRForCausalLM.forward  sB   P 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY **$$)"7#9+'/!5# % 
 ''
3')HFKKDKK,B,BCV[[QS_UDY,F'+'7D7V#CVC0#33!//))$55
 	
r$   )r   r  )NNNNNNNNNNN)r7   r8   r9   r:   _tied_weights_keysr   r!  r%  r)  r-  r   r-   
LongTensorr=   FloatTensorr   r   r   r   r3   r>   r?   r@   s   @r"   r  r  y  s?    56YZ	/0&0  .2.2:>:>(,26*.!%)-,0#'m
##d*m
 t+m
  %0047	m

 !& 0 04 7m
 m
 ((4/m
   4'm
 $;m
  $;m
 #Tkm
 D[m
 
2	2m
 m
r$   r  ),r;   rY   r-   r   torch.nnr   activationsr   cache_utilsr   r   r	   
generationr
   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   modeling_utilsr   utilsr   r   configuration_trocrr   
get_loggerr7   r   	Embeddingr   rB   ModulerO   ry   r   r   r   r  r  __all__r   r$   r"   <module>rG     s$   6    % ! C C ) J 9 l - , , 
		H	%;bll ;8
=r|| 
=98 98xJ2RYY J2Zl2 l^ .? . .W
' W
t -. -- 
H
+_ H

H
V 5
6r$   