
    Z j                     P   S r SSKrSSKrSSKJr  SSKrSSKJr  SSKJrJ	r	J
r
  SSKJr  SSKJr  SS	KJrJrJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSKJrJrJ r J!r!J"r"J#r#J$r$  SSK%J&r&J'r'  SSK(J)r)  SSK*J+r+J,r,J-r-J.r.J/r/J0r0  SSK1J2r2  SSK3J4r4J5r5  SSK6J7r7  \/Rp                  " \95      r:S\Rv                  S\<S\<4S jr= " S S\R|                  5      r? " S S\R|                  5      r@  SJS\R                  S\Rv                  S \Rv                  S!\Rv                  S"\Rv                  S-  S#\BS-  S$\BS%\)\+   4S& jjrC " S' S(\R                  5      rD " S) S*\5      rE " S+ S,\5      rF " S- S.\R                  5      rG\, " S/ S0\'5      5       rH " S1 S2\H5      rI " S3 S4\H5      rJ " S5 S6\H5      rK " S7 S8\H5      rL\, " S9 S:\H5      5       rM\," S;S<9 " S= S>\H\5      5       rN\," S?S<9 " S@ SA\H5      5       rO\, " SB SC\H5      5       rP " SD SE\H5      rQ\," SFS<9 " SG SH\H\5      5       rR/ SIQrSg)KzPyTorch BART model.    N)Callable)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compilingloggingtorch_compilable_check)merge_with_config_defaults)OutputRecordercapture_outputs   )
BartConfig	input_idspad_token_iddecoder_start_token_idc                     U R                  U R                  5      nU SS2SS24   R                  5       USS2SS24'   X#SS2S4'   Uc  [        S5      eUR	                  US:H  U5        U$ )z)
Shift input ids one token to the right.
Nr&   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r(   r)   r*   shifted_input_idss       w/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/bart/modeling_bart.pyshift_tokens_rightr4   :   sz     "++IOO<(CRC0668ae4adLMM""#4#<lK    c                      ^  \ rS rSrSrS\S\4U 4S jjr SS\R                  S\S	\R                  S-  4U 4S
 jjjr	Sr
U =r$ )BartLearnedPositionalEmbeddingJ   zF


class BartLearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2
        # and adjust num_embeddings appropriately. Other models don't have this hack
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(
        self,
        input_ids: torch.Tensor,
        past_key_values_length: int = 0,
        position_ids: torch.Tensor | None = None,
    ):
        """`input_ids' shape is expected to be [bsz x seqlen]."""
        if position_ids is None:
            bsz, seq_len = input_ids.shape[:2]
            position_ids = torch.arange(
                past_key_values_length,
                past_key_values_length + seq_len,
                dtype=torch.long,
                device=self.weight.device,
            ).expand(bsz, -1)
        else:
            position_ids = position_ids.unsqueeze(0)

        return super().forward(position_ids + self.offset)


class BartScaledWordEmbedding(nn.Embedding):
    """
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: float | None = 1.0):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.embed_scale = embed_scale

    def forward(self, input_ids: torch.Tensor):
        return super().forward(input_ids) * self.embed_scale


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float | None = None,
    dropout: float = 0.0,
    **kwargs,
):
    if scaling is None:
        scaling = query.size(-1) ** -0.5

    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class BartAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        config: BartConfig | None = None,
        layer_idx: int | None = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal
        self.layer_idx = layer_idx
        if layer_idx is None and self.is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
                "will lead to errors during the forward call, if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        is_updated = False
        if past_key_values is not None:
            if isinstance(past_key_values, EncoderDecoderCache):
                is_updated = past_key_values.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value_states from the cache
                    curr_past_key_values = past_key_values.cross_attention_cache
                else:
                    curr_past_key_values = past_key_values.self_attention_cache
            else:
                curr_past_key_values = past_key_values

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_values is not None and is_updated:
            # reuse k, v from the cache filled on the first forward pass
            key_states = curr_past_key_values.layers[self.layer_idx].keys
            value_states = curr_past_key_values.layers[self.layer_idx].values
        else:
            key_states = self.k_proj(current_states)
            value_states = self.v_proj(current_states)
            kv_shape = (*current_states.shape[:-1], -1, self.head_dim)
            key_states = key_states.view(kv_shape).transpose(1, 2)
            value_states = value_states.view(kv_shape).transpose(1, 2)

            if past_key_values is not None:
                # save all key/value_states to the cache to be re-used for fast auto-regressive generation
                key_states, value_states = curr_past_key_values.update(key_states, value_states, self.layer_idx)
                # set a flag so the cached cross-attention states are re-used on subsequent calls
                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
                    past_key_values.is_updated[self.layer_idx] = True

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.out_proj(attn_output)
        return attn_output, attn_weights


class BartEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: BartConfig, layer_idx: int | None = None):
        super().__init__()
        self.embed_dim = config.d_model

        self.self_attn = BartAttention(
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            config=config,
            layer_idx=layer_idx,
        )
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: torch.FloatTensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        residual = hidden_states
        hidden_states, _ = self.self_attn(hidden_states, attention_mask=attention_mask, **kwargs)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        if hidden_states.dtype == torch.float16 and not torch.isfinite(hidden_states).all():
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        return hidden_states


class BartDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: BartConfig, layer_idx: int | None = None):
        super().__init__()
        self.embed_dim = config.d_model

        self.self_attn = BartAttention(
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            is_causal=True,
            config=config,
            layer_idx=layer_idx,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.encoder_attn = BartAttention(
            self.embed_dim,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            config=config,
            layer_idx=layer_idx,
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        encoder_hidden_states: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool | None = True,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        residual = hidden_states

        # Self Attention
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            **kwargs,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Cross-Attention Block
        if encoder_hidden_states is not None:
            residual = hidden_states

            hidden_states, _ = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                past_key_values=past_key_values,
                **kwargs,
            )
            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
            hidden_states = residual + hidden_states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)

        # Fully Connected
        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        return hidden_states


class BartClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(
        self,
        input_dim: int,
        inner_dim: int,
        num_classes: int,
        pooler_dropout: float,
    ):
        super().__init__()
        self.dense = nn.Linear(input_dim, inner_dim)
        self.dropout = nn.Dropout(p=pooler_dropout)
        self.out_proj = nn.Linear(inner_dim, num_classes)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.dense(hidden_states)
        hidden_states = torch.tanh(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.out_proj(hidden_states)
        return hidden_states


@auto_docstring
class BartPreTrainedModel(PreTrainedModel):
    config: BartConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _keys_to_ignore_on_load_unexpected = ["encoder.version", "decoder.version"]
    _no_split_modules = [r"BartEncoderLayer", r"BartDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True

    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, BartForConditionalGeneration):
            init.zeros_(module.final_logits_bias)

    @property
    def dummy_inputs(self):
        pad_token = self.config.pad_token_id
        input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
        dummy_inputs = {
            "attention_mask": input_ids.ne(pad_token),
            "input_ids": input_ids,
        }
        return dummy_inputs


class PretrainedBartModel(BartPreTrainedModel):
    def __init_subclass__(self):
        warnings.warn(
            "The class `PretrainedBartModel` has been depreciated, please use `BartPreTrainedModel` instead.",
            FutureWarning,
        )


class BartPretrainedModel(BartPreTrainedModel):
    def __init_subclass__(self):
        warnings.warn(
            "The class `PretrainedBartModel` has been depreciated, please use `BartPreTrainedModel` instead.",
            FutureWarning,
        )
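

# --- Illustrative sketch (not part of the original transformers source) --------------------------
# The snippet below only demonstrates the tensor shapes handled by `BartClassificationHead`
# defined above: a pooled hidden state of size `input_dim` is mapped to `num_classes` logits.
# The helper name `_example_classification_head_shapes` is hypothetical and is never called by
# the library code; it is included purely as a hedged usage illustration.
def _example_classification_head_shapes():
    head = BartClassificationHead(input_dim=16, inner_dim=16, num_classes=3, pooler_dropout=0.0)
    pooled = torch.randn(2, 16)  # (batch_size, hidden_size) sentence representations
    logits = head(pooled)
    assert logits.shape == (2, 3)  # one score per class for each example
    return logits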


class BartEncoder(BartPreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`BartEncoderLayer`].

    Args:
        config: BartConfig
        embed_tokens (nn.Embedding): output embedding
    """

    _can_record_outputs = {
        "hidden_states": BartEncoderLayer,
        "attentions": BartAttention,
    }

    def __init__(self, config: BartConfig):
        super().__init__(config)

        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop

        embed_dim = config.d_model
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_position_embeddings
        embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

        self.embed_tokens = BartScaledWordEmbedding(
            config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
        )
        self.embed_positions = BartLearnedPositionalEmbedding(
            config.max_position_embeddings,
            embed_dim,
        )
        self.layers = nn.ModuleList([BartEncoderLayer(config, layer_idx=i) for i in range(config.encoder_layers)])
        self.layernorm_embedding = nn.LayerNorm(embed_dim)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    @merge_with_config_defaults
    @capture_outputs
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        embed_pos = self.embed_positions(inputs_embeds[:, :, -1])
        embed_pos = embed_pos.to(inputs_embeds.device)

        hidden_states = inputs_embeds + embed_pos
        hidden_states = self.layernorm_embedding(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        attention_mask = create_bidirectional_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
        )

        for idx, encoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
            to_drop = False
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:  # skip the layer
                    to_drop = True

            if not to_drop:
                hidden_states = encoder_layer(
                    hidden_states,
                    attention_mask,
                    **kwargs,
                )

        return BaseModelOutput(last_hidden_state=hidden_states)
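

# --- Illustrative sketch (not part of the original transformers source) --------------------------
# Both the encoder and decoder layers route their attention through an implementation with the
# same calling convention as `eager_attention_forward` defined near the top of this file. The
# sketch below only shows the expected tensor shapes; `_example_eager_attention_shapes` is a
# hypothetical helper, and `types.SimpleNamespace` merely stands in for the calling module since
# the function only reads its `training` attribute.
def _example_eager_attention_shapes():
    import types

    bsz, num_heads, tgt_len, src_len, head_dim = 2, 4, 5, 7, 8
    module = types.SimpleNamespace(training=False)
    query = torch.randn(bsz, num_heads, tgt_len, head_dim)
    key = torch.randn(bsz, num_heads, src_len, head_dim)
    value = torch.randn(bsz, num_heads, src_len, head_dim)
    attn_output, attn_weights = eager_attention_forward(
        module, query, key, value, attention_mask=None, scaling=head_dim**-0.5
    )
    assert attn_output.shape == (bsz, tgt_len, num_heads, head_dim)  # transposed back to batch-first
    assert attn_weights.shape == (bsz, num_heads, tgt_len, src_len)
    return attn_output, attn_weights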


class BartDecoder(BartPreTrainedModel):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`BartDecoderLayer`]

    Args:
        config: BartConfig
        embed_tokens (nn.Embedding): output embedding
    """

    _can_record_outputs = {
        "hidden_states": BartDecoderLayer,
        "attentions": OutputRecorder(BartAttention, index=1, layer_name="self_attn"),
        "cross_attentions": OutputRecorder(BartAttention, index=1, layer_name="encoder_attn"),
    }

    def __init__(self, config: BartConfig):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0

        self.embed_tokens = BartScaledWordEmbedding(
            config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
        )
        self.embed_positions = BartLearnedPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
        )
        self.layers = nn.ModuleList([BartDecoderLayer(config, layer_idx=i) for i in range(config.decoder_layers)])
        self.layernorm_embedding = nn.LayerNorm(config.d_model)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    @merge_with_config_defaults
    @capture_outputs
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        encoder_hidden_states: torch.FloatTensor | None = None,
        encoder_attention_mask: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPastAndCrossAttentions:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of decoder_input_ids or decoder_inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        # initialize `past_key_values`
        if use_cache and past_key_values is None:
            past_key_values = (
                EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
                if self.config.is_encoder_decoder
                else DynamicCache(config=self.config)
            )

        batch_size, seq_length = inputs_embeds.size()[:-1]
        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        position_ids = torch.arange(seq_length, device=inputs_embeds.device) + past_key_values_length

        if attention_mask is None and not is_torchdynamo_compiling():
            # required mask seq length can be calculated via length of past cache
            mask_seq_length = past_key_values_length + seq_length
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        self_attn_cache = (
            past_key_values.self_attention_cache
            if isinstance(past_key_values, EncoderDecoderCache)
            else past_key_values
        )

        attention_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            past_key_values=self_attn_cache,
            position_ids=position_ids,
        )
        encoder_attention_mask = create_bidirectional_mask(
            config=self.config,
            input_embeds=encoder_hidden_states,
            attention_mask=encoder_attention_mask,
        )

        # embed positions
        positions = self.embed_positions(inputs_embeds[:, :, -1], past_key_values_length, position_ids=position_ids)
        positions = positions.to(inputs_embeds.device)

        hidden_states = inputs_embeds + positions
        hidden_states = self.layernorm_embedding(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            hidden_states = decoder_layer(
                hidden_states,
                attention_mask,
                encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                past_key_values=past_key_values,
                use_cache=use_cache,
                **kwargs,
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )


@auto_docstring
class BartModel(BartPreTrainedModel):
    _tied_weights_keys = {
        "decoder.embed_tokens.weight": "shared.weight",
        "encoder.embed_tokens.weight": "shared.weight",
    }

    def __init__(self, config: BartConfig):
        super().__init__(config)

        padding_idx, vocab_size = config.pad_token_id, config.vocab_size
        embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
        self.shared = BartScaledWordEmbedding(vocab_size, config.d_model, padding_idx, embed_scale=embed_scale)

        self.encoder = BartEncoder(config)
        self.decoder = BartDecoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, value):
        self.shared = value
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        decoder_input_ids: torch.LongTensor | None = None,
        decoder_attention_mask: torch.LongTensor | None = None,
        encoder_outputs: list[torch.FloatTensor] | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        decoder_inputs_embeds: torch.FloatTensor | None = None,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Seq2SeqModelOutput | tuple:
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        """
        # different to other models, Bart automatically creates decoder_input_ids from
        # input_ids if no decoder_input_ids are provided
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            if input_ids is None:
                raise ValueError(
                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are "
                    "passed, `input_ids` cannot be `None`. Please pass either "
                    "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
                )

            decoder_input_ids = shift_tokens_right(
                input_ids, self.config.pad_token_id, self.config.decoder_start_token_id
            )

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                **kwargs,
            )
        # If the user passed a tuple for encoder_outputs, wrap it in a BaseModelOutput
        elif not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )
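

# --- Illustrative sketch (not part of the original transformers source) --------------------------
# `BartForConditionalGeneration` below derives `decoder_input_ids` from `labels` with the
# module-level `shift_tokens_right` helper: the decoder start token is prepended, every label is
# shifted one position to the right, and `-100` placeholders are replaced by the pad token id.
# `_example_shift_tokens_right` is a hypothetical helper used only to illustrate that behaviour.
def _example_shift_tokens_right():
    labels = torch.tensor([[5, -100, 2]])
    decoder_input_ids = shift_tokens_right(labels, pad_token_id=1, decoder_start_token_id=2)
    # start token 2 is prepended, labels shift right, and the -100 is replaced by pad id 1
    assert decoder_input_ids.tolist() == [[2, 5, 1]]
    return decoder_input_ids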


@auto_docstring(
    custom_intro="""
    The BART Model with a language modeling head. Can be used for summarization.
    """
)
class BartForConditionalGeneration(BartPreTrainedModel, GenerationMixin):
    base_model_prefix = "model"
    _tied_weights_keys = {"lm_head.weight": "model.shared.weight"}
    _keys_to_ignore_on_load_missing = ["final_logits_bias"]

    def __init__(self, config: BartConfig):
        super().__init__(config)
        self.model = BartModel(config)
        self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
        self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def resize_token_embeddings(
        self, new_num_tokens: int, pad_to_multiple_of: int | None = None, mean_resizing: bool = True
    ) -> nn.Embedding:
        new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
        self._resize_final_logits_bias(new_embeddings.weight.shape[0])
        return new_embeddings

    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
        old_num_tokens = self.final_logits_bias.shape[-1]
        if new_num_tokens <= old_num_tokens:
            new_bias = self.final_logits_bias[:, :new_num_tokens]
        else:
            extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
        self.register_buffer("final_logits_bias", new_bias)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        decoder_input_ids: torch.LongTensor | None = None,
        decoder_attention_mask: torch.LongTensor | None = None,
        encoder_outputs: list[torch.FloatTensor] | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        decoder_inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Seq2SeqLMOutput | tuple:
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example summarization:

        ```python
        >>> from transformers import AutoTokenizer, BartForConditionalGeneration

        >>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

        >>> ARTICLE_TO_SUMMARIZE = (
        ...     "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
        ...     "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
        ...     "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
        ... )
        >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="pt")

        >>> # Generate Summary
        >>> summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=0, max_length=20)
        >>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        'PG&E scheduled the blackouts in response to forecasts for high winds amid dry conditions'
        ```

        Mask filling example:

        ```python
        >>> from transformers import AutoTokenizer, BartForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
        >>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

        >>> TXT = "My friends are <mask> but they eat too many carbs."
        >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
        >>> logits = model(input_ids).logits

        >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
        >>> probs = logits[0, masked_index].softmax(dim=0)
        >>> values, predictions = probs.topk(5)

        >>> tokenizer.decode(predictions).split()
        ['not', 'good', 'healthy', 'great', 'very']
        ```
        """
        if labels is not None:
            if use_cache:
                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
            use_cache = False
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )

        lm_logits = self.lm_head(outputs[0])
        lm_logits = lm_logits + self.final_logits_bias.to(lm_logits.device)

        masked_lm_loss = None
        if labels is not None:
            labels = labels.to(lm_logits.device)
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        return Seq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)


@auto_docstring(
    custom_intro="""
    Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    """
)
class BartForSequenceClassification(BartPreTrainedModel):
    def __init__(self, config: BartConfig, **kwargs):
        super().__init__(config, **kwargs)
        self.model = BartModel(config)
        self.classification_head = BartClassificationHead(
            config.d_model,
            config.d_model,
            config.num_labels,
            config.classifier_dropout,
        )

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        decoder_input_ids: torch.LongTensor | None = None,
        decoder_attention_mask: torch.LongTensor | None = None,
        encoder_outputs: list[torch.FloatTensor] | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        decoder_inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Seq2SeqSequenceClassifierOutput | tuple:
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if labels is not None:
            use_cache = False

        if input_ids is None and inputs_embeds is not None:
            raise NotImplementedError(
                f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
            )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )
        hidden_states = outputs[0]  # last hidden state

        eos_mask = input_ids.eq(self.config.eos_token_id).to(hidden_states.device)

        torch_compilable_check(
            torch.unique_consecutive(eos_mask.sum(1)).numel() == 1,
            "All examples must have the same number of <eos> tokens.",
        )
        sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[
            :, -1, :
        ]
        logits = self.classification_head(sentence_representation)

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.config.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.config.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        return Seq2SeqSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )


@auto_docstring
class BartForQuestionAnswering(BartPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        config.num_labels = 2
        self.num_labels = config.num_labels

        self.model = BartModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        decoder_input_ids: torch.LongTensor | None = None,
        decoder_attention_mask: torch.LongTensor | None = None,
        encoder_outputs: list[torch.FloatTensor] | None = None,
        start_positions: torch.LongTensor | None = None,
        end_positions: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        decoder_inputs_embeds: torch.FloatTensor | None = None,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Seq2SeqQuestionAnsweringModelOutput | tuple:
        if start_positions is not None and end_positions is not None:
            use_cache = False

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        return Seq2SeqQuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )


class BartDecoderWrapper(BartPreTrainedModel):
    """
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    """

    def __init__(self, config):
        super().__init__(config)
        self.decoder = BartDecoder(config)

    def forward(self, *args, **kwargs):
        return self.decoder(*args, **kwargs)


@auto_docstring(
    custom_intro="""
    BART decoder with a language modeling head on top (linear layer with weights tied to the input embeddings).
    """
)
class BartForCausalLM(BartPreTrainedModel, GenerationMixin):
    _tied_weights_keys = {"lm_head.weight": "model.decoder.embed_tokens.weight"}

    def __init__(self, config):
        config.is_decoder = True
        config.is_encoder_decoder = False
        super().__init__(config)
        self.model = BartDecoderWrapper(config)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.model.decoder.embed_tokens = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        encoder_hidden_states: torch.FloatTensor | None = None,
        encoder_attention_mask: torch.FloatTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithCrossAttentions | tuple:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BartForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
        >>> model = BartForCausalLM.from_pretrained("facebook/bart-base")
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
        >>> list(logits.shape) == expected_shape
        True
        ```"""
        outputs: BaseModelOutputWithPastAndCrossAttentions = self.model.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )

        hidden_states = outputs[0]
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


__all__ = [
    "BartForCausalLM",
    "BartForConditionalGeneration",
    "BartForQuestionAnswering",
    "BartForSequenceClassification",
    "BartModel",
    "BartPreTrainedModel",
    "BartPretrainedModel",
    "PretrainedBartModel",
]