
    Z jb                     b   S r SSKrSSKJr  SSKrSSKrSSKJr  SSKJ	r	  SSK
Jr  SSKJr  SS	KJrJrJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSKJrJrJrJ r J!r!  SSK"J#r#J$r$  SSK%J&r&  SSK'J(r(J)r)J*r*J+r+J,r,  SSK-J.r.  SSK/J0r0J1r1  SSK2J3r3  \,Rh                  " \55      r6S\Rn                  S\8S\84S jr9 " S S\Rt                  5      r;  S<S\Rx                  S\Rn                  S\Rn                  S\Rn                  S \Rn                  S-  S!\=S-  S"\=S#\&\(   4S$ jjr> " S% S&\Rx                  5      r? " S' S(\5      r@ " S) S*\5      rA\) " S+ S,\$5      5       rB " S- S.\B5      rC " S/ S0\B5      rD\) " S1 S2\B5      5       rE\)" S3S49 " S5 S6\B\5      5       rF " S7 S8\B5      rG " S9 S:\B\5      rH/ S;QrIg)=z=PyTorch MarianMTModel model, ported from the Marian C++ repo.    N)Callable)nn)CrossEntropyLoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compilinglogging)merge_with_config_defaults)OutputRecordercapture_outputs   )MarianConfig	input_idspad_token_iddecoder_start_token_idc                     U R                  U R                  5      nU SS2SS24   R                  5       USS2SS24'   X#SS2S4'   Uc  [        S5      eUR	                  US:H  U5        U$ )z)
Shift input ids one token to the right.
Nr!   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r#   r$   r%   shifted_input_idss       {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/marian/modeling_marian.pyshift_tokens_rightr/   8   sz     "++IOO<(CRC0668ae4adLMM""#4#<lK    c            
          ^  \ rS rSrSrSS\S\S\S-  SS4U 4S jjjrS	 r\R                  " 5        SS
\R                  S\S\R                  S-  S\R                  4U 4S jjj5       rSrU =r$ )#MarianSinusoidalPositionalEmbeddingH   zDThis module produces sinusoidal positional embeddings of any length.Nnum_positionsembedding_dimpadding_idxreturnc                 "   > [         TU ]  XSS9  g )NT)_freeze)super__init__)selfr4   r5   r6   	__class__s       r.   r;   ,MarianSinusoidalPositionalEmbedding.__init__K   s    tDr0   c                    U R                   R                  u  p[        R                  " [	        U5       VVs/ s H?  n[	        U5       Vs/ s H%  oC[        R
                  " SSUS-  -  U-  5      -  PM'     snPMA     snn5      n[        R                  " XU R                   R                  SS9nUS-  S:X  a  US-  OUS-  S-   n[        R                  " [        R                  " USS2SSS24   5      5      USS2SU24'   [        R                  " [        R                  " USS2SSS24   5      5      USS2US24'   U$ s  snf s  snnf )z
Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in
the 2nd half of the vector. [dim // 2:]
i'     F)dtyperequires_gradr   r!   N)weightr)   nparrayrangepowertorchemptyrA   FloatTensorsincos)r<   n_posdimposjposition_encoutsentinels           r.   create_weight1MarianSinusoidalPositionalEmbedding.create_weightN   s   
 [[&&
xxX]^cXdeXdQTsLABHHUAaL3$677LXde
 kk%DKK,=,=US"Qw!|3!8#(a"..rvvl1add76K/LMAqzM!--bff\!QTT'5J.KLAxyL
 Mes   E

,E6E
E
input_ids_shapepast_key_values_lengthposition_idsc                    > UcB  USS u  pE[         R                  " X"U-   [         R                  U R                  R                  S9n[
        TU ]  U5      $ )z3`input_ids_shape` is expected to be [bsz x seqlen].Nr@   )rA   device)rH   arangelongrC   rZ   r:   forward)r<   rV   rW   rX   bszseq_lenr=   s         r.   r]   +MarianSinusoidalPositionalEmbedding.forward]   sX    
 *2A.LC <<&(HPUPZPZcgcncncucuL w|,,r0    N)r   N)__name__
__module____qualname____firstlineno____doc__intr;   rT   rH   no_gradSizeTensorr]   __static_attributes____classcell__r=   s   @r.   r2   r2   H   s    NEc E# ECRVJ Ebf E E ]]_pt	-$zz	-CF	-Z_ZfZfimZm	-		- 	-r0   r2   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr'         r@   r   rN   ptrainingr!   )
sizerH   matmul	transposer   
functionalsoftmaxru   r|   
contiguous)
ro   rp   rq   rr   rs   rt   ru   rv   attn_weightsattn_outputs
             r.   eager_attention_forwardr   k   s     **R.D( <<}}Q':;gEL!#4==((2(>L==((6??([L,,|3K''1-88:K$$r0   c                   $  ^  \ rS rSrSr      SS\S\S\S\S\S	\S
\S-  S\S-  4U 4S jjjr	   SS\
R                  S\
R                  S-  S\S-  S\
R                  S-  S\\   S\\
R                  \
R                  S-  4   4S jjrSrU =r$ )MarianAttention   z=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsru   
is_decoderbias	is_causalconfig	layer_idxc	                 t  > [         T	U ]  5         Xl        X l        X0l        X-  U l        Xpl        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l	        X`l
        Xl        Uc>  U R                  (       a-  [        R                  SU R                  R                   S35        [         R"                  " XUS9U l        [         R"                  " XUS9U l        [         R"                  " XUS9U l        [         R"                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).rx   zInstantiating a decoder z without passing `layer_idx` is not recommended and will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.r   )r:   r;   r   r   ru   head_dimr   r+   rt   r   r   r   loggerwarning_oncer=   rc   r   Lineark_projv_projq_projout_proj)
r<   r   r   ru   r   r   r   r   r   r=   s
            r.   r;   MarianAttention.__init__   s    	""!.MMI%$..8MdnnM]$YKr3  }}d*$""*4>>+B+B*C D, , ii	4@ii	4@ii	4@		)TBr0   hidden_stateskey_value_statespast_key_valuesrs   rv   r7   c                 ,   USLnUR                   SS n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	Sn
Ub]  [        U[        5      (       aF  UR                  R                  U R                  5      n
U(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       aQ  UbN  U
(       aG  WR                  U R                     R                  nUR                  U R                     R                  nOU R                  U5      nU R!                  U5      n/ UR                   SS QSPU R                  P7nUR                  U5      R	                  SS5      nUR                  U5      R	                  SS5      nUbS  WR#                  XU R                  5      u  pU(       a.  [        U[        5      (       a  SUR                  U R                  '   [$        R&                  " U R(                  R*                  [,        5      nU" U U	UUU4U R.                  (       d  SOU R0                  U R2                  S.UD6u  nnUR4                  " / UQSP76 R7                  5       nU R9                  U5      nUU4$ )	z#Input shape: Batch x Time x ChannelNr'   r!   r@   FT        )ru   rt   )r)   r   r   viewr   
isinstancer   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   updater   get_interfacer   _attn_implementationr   r|   ru   rt   reshaper   r   )r<   r   r   r   rs   rv   is_cross_attentioninput_shapehidden_shapequery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_stateskv_shapeattention_interfacer   r   s                      r.   r]   MarianAttention.forward   sc    .T9 $))#2.88b8$--8 {{=166|DNNqRST
&/+>??,77;;DNNK
%+:+P+P(+:+O+O('6$-?)]/"=*-44T^^DIIJ/66t~~FMML^4J;;~6LF--cr2FBFFH#2<<QBJ',,X6@@AFL*+?+F+Fzaeaoao+p(
%*_FY*Z*ZAEO..t~~>(?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
!\ "));;;;FFHmmK0L((r0   )r   ru   r   r   r   r   r   r   r   r   r   rt   r   )r   FTFNNNNN)rc   rd   re   rf   rg   rh   floatboolr"   r;   rH   rk   r	   r   r   tupler]   rl   rm   rn   s   @r.   r   r      s   G  &* $%C%C %C 	%C
 %C %C %C t#%C :%C %CT 15(,.2H)||H)  ,,-H) 	H)
 t+H) -.H) 
u||U\\D00	1H) H)r0   r   c                      ^  \ rS rSrSS\S\S-  4U 4S jjjrS\R                  S\R                  S\	\
   S	\R                  4S
 jrSrU =r$ )MarianEncoderLayer   Nr   r   c                 l  > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  UUS9U l        [        R                  " U R                  5      U l
        UR                  U l        [        UR                     U l        UR                  U l        [        R                   " U R                  UR"                  5      U l        [        R                   " UR"                  U R                  5      U l        [        R                  " U R                  5      U l        g )N)r   r   ru   r   r   )r:   r;   d_modelr   r   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normru   r   activation_functionactivation_fnactivation_dropoutr   encoder_ffn_dimfc1fc2final_layer_normr<   r   r   r=   s      r.   r;   MarianEncoderLayer.__init__   s    (nn44,,
 %'LL$@!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D "T^^ <r0   r   rs   rv   r7   c                 2   UnU R                   " U4SU0UD6u  p[        R                  R                  XR                  U R                  S9nXA-   nU R                  U5      nUnU R                  U R                  U5      5      n[        R                  R                  XR                  U R                  S9nU R                  U5      n[        R                  R                  XR                  U R                  S9nXA-   nU R                  U5      nUR                  [        R                  :X  al  [        R                  " U5      R                  5       (       dC  [        R                   " UR                  5      R"                  S-
  n[        R$                  " X* US9nU$ )Nrs   rz   i  )minmax)r   r   r   ru   r|   r   r   r   r   r   r   rA   rH   float16isfiniteallfinfor   clamp)r<   r   rs   rv   residual_clamp_values          r.   r]   MarianEncoderLayer.forward  s[    !>>
)
 

 --m||VZVcVc-d 011-@ **488M+BC--m?V?Vaeanan-o/--m||VZVcVc-d 0--m<%--/}8U8Y8Y8[8[++m&9&9:>>EK!KK<[YMr0   )	r   r   ru   r   r   r   r   r   r   rb   )rc   rd   re   rf   r"   rh   r;   rH   rJ   r   r   rk   r]   rl   rm   rn   s   @r.   r   r      se    =| =d
 = =&(( )) +,	
 
 r0   r   c                      ^  \ rS rSrSS\S\S-  4U 4S jjjr     SS\R                  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	S-  S\
S-  S\\   S\R                  4S jjrSrU =r$ )MarianDecoderLayeri2  Nr   r   c           
        > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  SSUUS9U l        UR                  U l        [        UR                     U l        UR                  U l        [        R                  " U R                  5      U l        [	        U R                  UR
                  UR                  SUUS9U l        [        R                  " U R                  5      U l        [        R$                  " U R                  UR&                  5      U l        [        R$                  " UR&                  U R                  5      U l        [        R                  " U R                  5      U l        g )NT)r   r   ru   r   r   r   r   )ru   r   r   r   )r:   r;   r   r   r   decoder_attention_headsr   r   ru   r   r   r   r   r   r   r   encoder_attnencoder_attn_layer_normr   decoder_ffn_dimr   r   r   r   s      r.   r;   MarianDecoderLayer.__init__3  s    (nn44,,
 ~~#F$>$>?"(";";$&LL$@!+NN**,,
 (*||DNN'C$99T^^V-C-CD99V33T^^D "T^^ <r0   r   rs   encoder_hidden_statesencoder_attention_maskr   	use_cacherv   r7   c                    UnU R                   " U4UUS.UD6u  p[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nUbb  UnU R                  " U4UUUS.UD6u  p[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nUnU R                  U R                  U5      5      n[        R                  R                  XR                  U R                  S9nU R                  U5      n[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nU$ )N)r   rs   rz   )r   rs   r   )r   r   r   ru   r|   r   r   r   r   r   r   r   r   )
r<   r   rs   r   r   r   r   rv   r   r   s
             r.   r]   MarianDecoderLayer.forwardR  s    !  >>
+)
 	
 --m||VZVcVc-d 011-@ !,$H#00 !65 /	 
  M MM11-<<Z^ZgZg1hM$4M 88GM !**488M+BC--m?V?Vaeanan-o/--m||VZVcVc-d 0--m<r0   )r   r   ru   r   r   r   r   r   r   r   r   rb   )NNNNT)rc   rd   re   rf   r"   rh   r;   rH   rk   r	   r   r   r   r]   rl   rm   rn   s   @r.   r   r   2  s    =| =d
 = =D /3596:(,!%/||/ t+/  %||d2	/
 !&t 3/ / $;/ +,/ 
/ /r0   r   c                      ^  \ rS rSr% \\S'   SrSrSrSr	Sr
Sr\R                  " 5       U 4S j5       r\S 5       rSrU =r$ )MarianPreTrainedModeli  r   modelTc                   > [         TU ]  U5        [        U[        5      (       a0  [        R
                  " UR                  UR                  5       5        g [        U[        5      (       a!  [        R                  " UR                  5        g g rb   )r:   _init_weightsr   r2   initcopy_rC   rT   MarianMTModelzeros_final_logits_bias)r<   ro   r=   s     r.   r   #MarianPreTrainedModel._init_weights  s]    f%fABBJJv}}f&:&:&<=..KK001 /r0   c                     U R                   R                  n[        R                  " / SQSSSSU//U R                  S9nUR                  U5      UUS.nU$ )N)r      
      r@   r         r@   rZ   )rs   r#   decoder_input_ids)r   r$   rH   tensorrZ   ne)r<   	pad_tokenr#   dummy_inputss       r.   r   "MarianPreTrainedModel.dummy_inputs  sZ    KK,,	LL"2Q2q)4L!MVZVaVab	'll95"!*

 r0   ra   )rc   rd   re   rf   r"   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphrH   ri   r   propertyr   rl   rm   rn   s   @r.   r   r     sX    &*#N!
]]_2 2  r0   r   c                      ^  \ rS rSrSr\\S.rS\4U 4S jjr	\
\\   SS\R                  S-  S\R                  S-  S	\R                  S-  S
\\   S\4
S jj5       5       5       rSrU =r$ )MarianEncoderi  z
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`MarianEncoderLayer`].

Args:
    config: MarianConfig
    embed_tokens (nn.Embedding): output embedding
)r   
attentionsr   c                   > [         TU ]  U5        UR                  U l        UR                  U l        UR
                  nUR                  U l        UR                  U l	        UR                  (       a  [        R                  " U5      OSU l        [        R                  " UR                   X R                  5      U l        [%        UR                  X R                  5      U l        [        R(                  " [+        UR,                  5       Vs/ s H  n[/        U5      PM     sn5      U l        SU l        U R5                  5         g s  snf )N      ?F)r:   r;   ru   encoder_layerdrop	layerdropr   r$   r6   max_position_embeddingsmax_source_positionsscale_embeddingmathsqrtembed_scaler   	Embedding
vocab_sizeembed_tokensr2   embed_positions
ModuleListrF   encoder_layersr   r   gradient_checkpointing	post_init)r<   r   r   r   r=   s       r.   r;   MarianEncoder.__init__  s     ~~11NN	!..$*$B$B!393I3I499Y/sLL):):IGWGWXB**I7G7G 
 mmvOdOdIe$fIeA%7%?Ie$fg&+#	 %gs   ENr#   rs   inputs_embedsrv   r7   c                 0   US L US L-  (       a  [        S5      eUc  U R                  U5      U R                  -  nU R                  UR                  S S 5      nX5-   n[
        R                  R                  X`R                  U R                  S9n[        U R                  UUS9n[        U R                  5       HR  u  pxSn	U R                  (       a'  [        R                  " / 5      n
XR                  :  a  Sn	U	(       a  MH  U" UU40 UD6nMT     [!        US9$ )Nz:You must specify exactly one of input_ids or inputs_embedsr'   rz   )r   r  rs   FT)last_hidden_state)r+   r  r  r  r)   r   r   ru   r|   r   r   	enumerater   rH   randr  r   )r<   r#   rs   r  rv   	embed_posr   idxencoder_layerto_dropdropout_probabilitys              r.   r]   MarianEncoder.forward  s    -t";<YZZ  --i84;K;KKM(()<)<Sb)AB	%1--m||VZVcVc-d2;;')
 #,DKK"8CG}}&+jjn#&7"G7 -!"! ! #9 +
 	
r0   )	ru   r  r  r  r  r  r   r  r6   r   )rc   rd   re   rf   rg   r   r   _can_record_outputsr"   r;   r   r    r   rH   
LongTensorrJ   r   r   r   r]   rl   rm   rn   s   @r.   r	  r	    s     ,%
| ,   .22626	(
##d*(
 ((4/(
 ((4/	(

 +,(
 
(
    (
r0   r	  c                   J  ^  \ rS rSrSr\\" \SSS9\" \SSS9S.rS\	4U 4S	 jjr
\\\       SS\R                  S
-  S\R                   S
-  S\R"                  S
-  S\R                  S
-  S\S
-  S\R"                  S
-  S\S
-  S\\   S\4S jj5       5       5       rSrU =r$ )MarianDecoderi  z
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MarianDecoderLayer`]

Args:
    config: MarianConfig
    embed_tokens (nn.Embedding): output embedding
r!   r   )index
layer_namer   )r   r
  cross_attentionsr   c           
        > [         TU ]  U5        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  (       a   [        R                  " UR                  5      OSU l        [        R                  " UR                   UR                  U R                  5      U l        [%        UR                  UR                  U R                  5      U l        [        R(                  " [+        UR,                  5       Vs/ s H  n[/        XS9PM     sn5      U l        SU l        U R5                  5         g s  snf )Nr  )r   F)r:   r;   ru   decoder_layerdropr  r$   r6   r  max_target_positionsr  r  r  r   r  r   r  decoder_vocab_sizer  r2   r  r  rF   decoder_layersr   r   r  r  )r<   r   ir=   s      r.   r;   MarianDecoder.__init__  s     ~~11!..$*$B$B!8>8N8N499V^^4TWLL)B)BFNNTXTdTdeB**FNND<L<L 
 mmV[\b\q\qVr$sVrQR%7%LVr$st&+#	 %ts   ENr#   rs   r   r   r   r  r   rv   r7   c                 Z   US L US L-  (       a  [        S5      eUc  U R                  U5      nX`R                  -  nU(       ab  Uc_  Uc  U R                  R                  (       a.  [        [        U R                  S9[        U R                  S95      O[        U R                  S9nUR                  5       S S u  pUb  UR                  5       OSn[        R                  " XR                  S9U-   nUc2  [        5       (       d#  X-   n[        R                  " XUR                  S9n[        U[
        5      (       a  UR                  OUn[!        U R                  UUUS9n[#        U R                  UUUS9nU R%                  X4XS9nXl-   n[&        R(                  R+                  UU R*                  U R,                  S	9n[/        U R0                  5       HN  u  nnU R,                  (       a(  [        R2                  " / 5      nUU R4                  :  a  M?  U" UUU4UUUS
.UD6nMP     [7        UUS9$ )NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time)r   r'   r   r   )r   r  rs   r   )r   r  rs   r   )rX   rz   )r   r   r   )r   r   )r+   r  r  r   is_encoder_decoderr   r
   r}   get_seq_lengthrH   r[   rZ   r   onesr   r   r   r   r  r   r   ru   r|   r!  r   r"  r  r   )r<   r#   rs   r   r   r   r  r   rv   
batch_size
seq_lengthrW   rX   mask_seq_lengthself_attn_cachecausal_maskr   r$  decoder_layerr'  s                       r.   r]   MarianDecoder.forward  s:    -t";<stt  --i8M &(8(88 0 )48V8V $L$DlZ^ZeZeFfg!5  "/!3!3!5cr!:
ETE`!?!?!Afg||J7K7KLOee!*B*D*D4AO"ZZ
ML`L`aN /+>?? 00  	 );;')+	
 ";;;'1"7	"
 ++$&< , 
 &4--mt||VZVcVc-d"+DKK"8C}}&+jjn#&7)% (> /# M #9  9++
 	
r0   )	ru   r  r  r  r  r  r   r2  r6   )NNNNNNN)rc   rd   re   rf   rg   r   r   r   r)  r"   r;   r   r    r   rH   r*  rk   rJ   r	   r   r   r   r   r]   rl   rm   rn   s   @r.   r,  r,    s    ,$_A+V*?!P^_| &   .2.2:>:>(,26!%R
##d*R
 t+R
  %0047	R

 !& 0 04 7R
 R
 ((4/R
 $;R
 +,R
 
3R
    R
r0   r,  c                     ^  \ rS rSrSS/rS\4U 4S jjrS rS rS r	S	 r
S
\S\R                  4S jr\\         SS\R$                  S-  S\R&                  S-  S\R$                  S-  S\R&                  S-  S\\R&                     \-  S-  S\S-  S\R.                  S-  S\R.                  S-  S\S-  S\\   S\4S jj5       5       rSrU =r$ )MarianModelip  $model.encoder.embed_positions.weight$model.decoder.embed_positions.weightr   c                 X  > [         TU ]  U5        UR                  UR                  p2U R                  R
                  (       a1  [        R                  " X1R                  U5      U l	        SSS.U l
        OS U l
        [        U5      U l        [        U5      U l        U R                  5         g )Nzshared.weight)zdecoder.embed_tokens.weightzencoder.embed_tokens.weight)r:   r;   r$   r  r    share_encoder_decoder_embeddingsr   r  r   shared_tied_weights_keysr	  encoderr,  decoderr  )r<   r   r6   r  r=   s       r.   r;   MarianModel.__init__w  s     "("5"5v7H7HZ ;;77,,z>>;ODK/>/>'D#
 '+D#$V,$V, 	r0   c                 >    U R                  5       R                  5       $ rb   )get_encoderget_input_embeddingsr<   s    r.   rO   MarianModel.get_input_embeddings  s    !6688r0   c                     U R                   R                  (       a=  Xl        U R                  U R                  l        U R                  U R
                  l        g XR                  l        g rb   )r   rG  rH  rJ  r  rK  r<   rr   s     r.   set_input_embeddings MarianModel.set_input_embeddings  s>    ;;77K(,DLL%(,DLL%(-LL%r0   c                     U R                   R                  (       a  [        S5      eU R                  5       R	                  5       $ )Nz`get_decoder_input_embeddings` should not be called if `config.share_encoder_decoder_embeddings` is `True`. Please use `get_input_embeddings` instead.)r   rG  r+   get_decoderrO  rP  s    r.   get_decoder_input_embeddings(MarianModel.get_decoder_input_embeddings  s<    ;;77H  !6688r0   c                 p    U R                   R                  (       a  [        S5      eXR                  l        g )Na   `config.share_encoder_decoder_embeddings` is set to `True` meaning the decoder input embeddings are shared with the encoder. In order to set the decoder input embeddings, you should simply set the encoder input embeddings by calling `set_input_embeddings` with the appropriate embeddings.)r   rG  r+   rK  r  rS  s     r.   set_decoder_input_embeddings(MarianModel.set_decoder_input_embeddings  s.    ;;77r 
 %*!r0   new_num_tokensr7   c                     U R                   R                  (       a  [        S5      eU R                  5       nU R	                  X!5      nU R                  U5        U R                  5       nUc  U$ XR                   l        U R                  5         U$ Nz`resize_decoder_token_embeddings` should not be called if `config.share_encoder_decoder_embeddings` is `True`. Please use `resize_token_embeddings` instead.)r   rG  r+   rX  _get_resized_embeddingsr[  r3  tie_weights)r<   r]  old_embeddingsnew_embeddingsmodel_embedss        r.   resize_decoder_token_embeddings+MarianModel.resize_decoder_token_embeddings  s    ;;77K 
 ::<55nU)).988:! *8& 	r0   Nr#   rs   r   decoder_attention_maskencoder_outputsr   r  decoder_inputs_embedsr   rv   c
                    Uc  U R                   " S	UUUS.U
D6nOK[        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nU R                  " S	UUUS   UUUU	S.U
D6n[        UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  S9$ )
a  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Marian uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

Example:

```python
>>> from transformers import AutoTokenizer, MarianModel

>>> tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
>>> model = MarianModel.from_pretrained("Helsinki-NLP/opus-mt-en-de")

>>> inputs = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt")
>>> decoder_inputs = tokenizer(
...     "<pad> Studien haben gezeigt dass es hilfreich ist einen Hund zu besitzen",
...     return_tensors="pt",
...     add_special_tokens=False,
... )
>>> outputs = model(input_ids=inputs.input_ids, decoder_input_ids=decoder_inputs.input_ids)

>>> last_hidden_states = outputs.last_hidden_state
>>> list(last_hidden_states.shape)
[1, 26, 512]
```N)r#   rs   r  r   r!   r@   )r   r   r
  r#   rs   r   r   r   r  r   )r   r   decoder_hidden_statesdecoder_attentionsr/  encoder_last_hidden_stater   encoder_attentionsra   )rJ  r   r   lenrK  r   r   r   r   r
  r/  )r<   r#   rs   r   rg  rh  r   r  ri  r   rv   decoder_outputss               r.   r]   MarianModel.forward  s   h ""ll #-+ 	O O_==-"1!"4474H14Loa0RV14_1E1I?1-tO ,, 	
'1"1!"4#1+/	
 	
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r0   )rI  rK  rJ  rH  )	NNNNNNNNN)rc   rd   re   rf   _keys_to_ignore_on_load_missingr"   r;   rO  rT  rX  r[  rh   r   r  re  r   r   rH   r*  rk   r   r   r	   rJ   r   r   r   r   r]   rl   rm   rn   s   @r.   rC  rC  p  s[    	/.'#
| *9.9*c bll 0  .2.2596:HL(,26:>!%V
##d*V
 t+V
 !++d2	V

 !&t 3V
 u||,>EV
 V
 ((4/V
  %0047V
 $;V
 +,V
 
V
  V
r0   rC  zX
    The Marian Model with a language modeling head. Can be used for summarization.
    )custom_introc                   f  ^  \ rS rSrSr/ SQrSS/rSS0rS\4U 4S	 jjr	 S#S\
S\
S
-  S\S\R                  4U 4S jjjrS$S\
S\R                  4S jjrS rS\
SS
4S jrS\R                  4S jr\\          S%S\R,                  S
-  S\R.                  S
-  S\R,                  S
-  S\R.                  S
-  S\\R.                     \-  S
-  S\S
-  S\R6                  S
-  S\R6                  S
-  S\R,                  S
-  S\S
-  S\\   S\4S  jj5       5       rS\R.                  4S! jr S"r!U =r"$ )&r   i  r   )r   rD  rE  rD  rE  lm_head.weight!model.decoder.embed_tokens.weightr   c                   > [         TU ]  U5        [        U5      U l        U R                  R
                  (       a  SSSS.U l        UR
                  (       a  UR                  OUR                  nU R                  S[        R                  " SU45      5        [        R                  " UR                  USS9U l        U R!                  5         g )Nzmodel.shared.weight)rv  rw  z!model.encoder.embed_tokens.weightr   r!   Fr   )r:   r;   rC  r   r   rG  rI  r  r3  register_bufferrH   zerosr   r   r   lm_headr  )r<   r   target_vocab_sizer=   s      r.   r;   MarianMTModel.__init__+  s      (
;;77"75J5J'D# 281X1XF--^d^w^w0%++qBS>T2UVyy1BO 	r0   Nr]  pad_to_multiple_ofmean_resizingr7   c                    > [         TU ]  XU5      nU R                  R                  (       a  U R	                  U5        U$ rb   )r:   resize_token_embeddingsr   rG  _resize_final_logits_bias)r<   r]  r~  r  rc  r=   s        r.   r  %MarianMTModel.resize_token_embeddings<  s8     8]jk;;77**>:r0   c                    U R                  5       nU R                  XAU5      nU R                  U5        UR                  R                  S   nU R
                  R                  (       a  XR
                  l        U R
                  R                  (       a^  U R                  5       bM  U R
                  R                  (       d2  U R                  5       nU R                  Xa5      nU R                  U5        U R                  5       $ )Nr   )rO  r`  rT  rC   r)   r   rG  r3  get_output_embeddingstie_word_embeddings_get_resized_lm_headset_output_embeddings)r<   r]  r~  argsrb  rc  old_lm_headnew_lm_heads           r.   _resize_token_embeddings&MarianMTModel._resize_token_embeddingsE  s    22455nVhi!!.1'..44Q7;;77-;KK* KK88**,8KK33446K33KPK&&{3((**r0   c                 :   U R                   R                  (       a  [        S5      eU R                  R	                  5       nU R                  X!5      nU R                  R                  U5        U R                  5       bM  U R                   R                  (       d2  U R                  5       nU R                  XA5      nU R                  U5        U R                  R	                  5       nUc  U$ XR                   l        U R                  5         U R                  U5        U$ r_  )r   rG  r+   r   rX  r`  r[  r  r  r  r  r3  ra  r  )r<   r]  rb  rc  r  r  rd  s          r.   re  -MarianMTModel.resize_decoder_token_embeddings[  s    ;;77K 
 @@B55nU

//? %%'3DKK<[<[446K33KPK&&{3zz>>@! *8& 	&&~6r0   c                 ,   U R                   R                  S   nX::  a  U R                   S S 2S U24   nON[        R                  " SX-
  4U R                   R                  S9n[        R
                  " U R                   U/SS9nU R                  SU5        g )Nr'   r!   r   ry   r   )r   r)   rH   rz  rZ   catry  )r<   r]  old_num_tokensnew_bias
extra_biass        r.   r  'MarianMTModel._resize_final_logits_bias{  s    //55b9+--a..@AHa)H%IRVRhRhRoRopJyy$"8"8*!E1MH0(;r0   rc  c                     Xl         g rb   )r{  )r<   rc  s     r.   r  #MarianMTModel.set_output_embeddings  s    %r0   r#   rs   r   rg  rh  r   r  ri  labelsr   rv   c                 v   U	bX  U
(       a  [         R                  S5        Sn
Uc7  Uc4  [        XR                  R                  U R                  R
                  5      nU R                  " U4UUUUUUUU
S.UD6nU R                  US   5      U R                  -   nSnU	bF  [        5       nU" UR                  SU R                  R                  5      U	R                  S5      5      n[        UUUR                  UR                  UR                  UR                   UR"                  UR$                  UR&                  S9	$ )u  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Marian uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, MarianMTModel

>>> src = "fr"  # source language
>>> trg = "en"  # target language

>>> model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"
>>> model = MarianMTModel.from_pretrained(model_name)
>>> tokenizer = AutoTokenizer.from_pretrained(model_name)

>>> sample_text = "où est l'arrêt de bus ?"
>>> batch = tokenizer([sample_text], return_tensors="pt")

>>> generated_ids = model.generate(**batch)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
"Where's the bus stop?"
```
NzJThe `use_cache` argument is changed to `False` since `labels` is provided.F)rs   r   rh  rg  r   r  ri  r   r   r'   )	losslogitsr   rl  rm  r/  rn  r   ro  )r   warningr/   r   r$   r%   r   r{  r   r   r   r3  r   r   rl  rm  r/  rn  r   ro  )r<   r#   rs   r   rg  rh  r   r  ri  r  r   rv   outputs	lm_logitsmasked_lm_lossloss_fcts                   r.   r]   MarianMTModel.forward  s<   p klI (-B-J$6KK44dkk6X6X%! '+jj'
)/+#9+'"7'
 '
 LL,t/E/EE	')H%innR9W9W&XZ`ZeZefhZijN#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r0   c                 j    [        XR                  R                  U R                  R                  5      $ rb   )r/   r   r$   r%   )r<   r  s     r.   %prepare_decoder_input_ids_from_labels3MarianMTModel.prepare_decoder_input_ids_from_labels  s#    !&++*B*BDKKDfDfggr0   )rI  r{  r   )NTrb   )
NNNNNNNNNN)#rc   rd   re   rf   r  rs  _keys_to_ignore_on_saverI  r"   r;   rh   r   r   r  r  r  re  r  r  r   r   rH   r*  rk   r   r   r	   rJ   r   r   r   r]   r  rl   rm   rn   s   @r.   r   r     s     '#
  FGmn*,OP| $ ae!7:TzY]	 +s +_a_k_k +,@< < <&BLL &  .2.2596:HL(,26:>*.!%]
##d*]
 t+]
 !++d2	]

 !&t 3]
 u||,>E]
 ]
 ((4/]
  %0047]
   4']
 $;]
 +,]
 
]
  ]
~hELL h hr0   r   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )MarianDecoderWrapperi  z
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
used in combination with the [`EncoderDecoderModel`] framework.
c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g rb   )r:   r;   r,  rK  r  r<   r   r=   s     r.   r;   MarianDecoderWrapper.__init__  s&     $V,r0   c                 &    U R                   " U0 UD6$ rb   rK  )r<   r  rv   s      r.   r]   MarianDecoderWrapper.forward  s    ||T,V,,r0   r  )	rc   rd   re   rf   rg   r;   r]   rl   rm   rn   s   @r.   r  r    s    

- -r0   r  c                   j  ^  \ rS rSrSS0rU 4S jrS rS r\\	         SS\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\S-  S\
R                  S-  S\
R                  S-  S\S-  S\\
R                  -  S\\   S\\-  4S jj5       5       rSrU =r$ )MarianForCausalLMi  rv  rw  c                    > SUl         SUl        [        TU ]  U5        [	        U5      U l        [        R                  " UR                  UR                  SS9U l
        U R                  5         g )NTFr   )r   r8  r:   r;   r  r   r   r   hidden_sizer  r{  r  r  s     r.   r;   MarianForCausalLM.__init__  sX     $)! )&1
yy!3!3V5F5FUS 	r0   c                 B    U R                   R                  R                  $ rb   r   rK  r  rP  s    r.   rO  &MarianForCausalLM.get_input_embeddings  s    zz!!...r0   c                 8    XR                   R                  l        g rb   r  rS  s     r.   rT  &MarianForCausalLM.set_input_embeddings  s    */

'r0   Nr#   rs   r   r   r   r  r  r   logits_to_keeprv   r7   c
                 
   U R                   R                  " SUUUUUUUS.U
D6nUS   n[        U	[        5      (       a  [	        U	* S5      OU	nU R                  USS2USS24   5      nSnUba  UR                  UR                  5      n[        5       nU" UR                  SU R                  R                  5      UR                  S5      5      n[        UUUR                  UR                  UR                  UR                   S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, MarianForCausalLM

>>> tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en")
>>> model = MarianForCausalLM.from_pretrained("Helsinki-NLP/opus-mt-fr-en")
>>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> logits = outputs.logits
>>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
>>> list(logits.shape) == expected_shape
True
```rk  r   Nr'   )r  r  r   r   r
  r/  ra   )r   rK  r   rh   slicer{  torZ   r   r   r   r  r   r   r   r
  r/  )r<   r#   rs   r   r   r   r  r  r   r  rv   r  r   slice_indicesr  r  r  s                    r.   r]   MarianForCausalLM.forward  s   L >BZZ=O=O 	>
)"7#9+'	>
 	>
  
8B>SV8W8W~ot4]kmA}a,?@AYYv}}-F')HFKKDKK,B,BCV[[QS_UD0#33!//))$55
 	
r0   )r{  r   )	NNNNNNNNr   )rc   rd   re   rf   rI  r;   rO  rT  r   r   rH   r*  rk   rJ   r	   r   rh   r   r   r   r   r]   rl   rm   rn   s   @r.   r  r    s/   =	/0  .2.2:>;?(,26*.!%-.A
##d*A
 t+A
  %0047	A

 !& 1 1D 8A
 A
 ((4/A
   4'A
 $;A
 ell*A
 +,A
 
2	2A
  A
r0   r  )r  rC  r   r   )Nr   )Jrg   r  collections.abcr   numpyrD   rH   r   torch.nnr    r   r   activationsr   cache_utilsr	   r
   r   
generationr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   utils.output_capturingr   r    configuration_marianr"   
get_loggerrc   r   rk   rh   r/   r  r2   Moduler   r   r   r   r   r   r	  r,  rC  r   r  r  __all__ra   r0   r.   <module>r     s   D  $    % & ! C C ) J B 9  G &  8 E . 
		H	%%,, c [^  -",, -R !%II%<<% 
% <<	%
 LL4'% T\% % '(%:r)bii r)l03 0hO3 Od O  <P
) P
fw
) w
t h
' h
 h
V 
Hh)? Hh
HhX-0 - Y
- Y
x Yr0   