
    Z j                        S r SSKrSSKJr  SSKrSSKJr  SSKJr  SSKJ	r
  SSKJr  SS	KJrJrJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSKJrJrJrJrJr  SSK J!r!J"r"  SSK#J$r$  SSK%J&r&J'r'J(r(J)r)J*r*  SSK+J,r,  SSK-J.r.J/r/  SSK0J1r1  \*Rd                  " \35      r4S\Rj                  S\6S\64S jr7 " S S\Rp                  5      r9 " S S\Rp                  5      r:  S>S\Rv                  S\Rj                  S \Rj                  S!\Rj                  S"\Rj                  S-  S#\<S-  S$\<S%\$\&   4S& jjr= " S' S(\Rv                  5      r> " S) S*\5      r? " S+ S,\5      r@\' " S- S.\"5      5       rA " S/ S0\A5      rB " S1 S2\A5      rC\' " S3 S4\A5      5       rD\'" S5S69 " S7 S8\A\5      5       rE " S9 S:\A5      rF " S; S<\A\5      rG/ S=QrHg)?zPyTorch Blenderbot model.    N)Callable)nn)CrossEntropyLoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compilinglogging)merge_with_config_defaults)OutputRecordercapture_outputs   )BlenderbotConfig	input_idspad_token_iddecoder_start_token_idc                     U R                  U R                  5      nU SS2SS24   R                  5       USS2SS24'   X#SS2S4'   Uc  [        S5      eUR	                  US:H  U5        U$ )z)
Shift input ids one token to the right.
Nr!   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r#   r$   r%   shifted_input_idss       ڃ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/blenderbot/modeling_blenderbot.pyshift_tokens_rightr/   1   sz     "++IOO<(CRC0668ae4adLMM""#4#<lK    c                      ^  \ rS rSrSrS\S\4U 4S jjr SS\R                  S\S	\R                  S-  4U 4S
 jjjr
SrU =r$ )$BlenderbotLearnedPositionalEmbeddingA   zF
This module learns positional embeddings up to a fixed maximum size.
num_embeddingsembedding_dimc                 $   > [         TU ]  X5        g N)super__init__)selfr4   r5   	__class__s      r.   r9   -BlenderbotLearnedPositionalEmbedding.__init__F   s    7r0   Ninput_ids_shapepast_key_values_lengthposition_idsc                    > UcB  USS u  pE[         R                  " X"U-   [         R                  U R                  R                  S9n[
        TU ]  U5      $ )z3`input_ids_shape` is expected to be [bsz x seqlen].N   )dtypedevice)torcharangelongweightrC   r8   forward)r:   r=   r>   r?   bszseq_lenr;   s         r.   rH   ,BlenderbotLearnedPositionalEmbedding.forwardI   sX     *2A.LC <<&(HPUPZPZcgcncncucuL w|,,r0    )r   N)__name__
__module____qualname____firstlineno____doc__intr9   rD   SizeTensorrH   __static_attributes____classcell__r;   s   @r.   r2   r2   A   sT    8s 83 8 qu	-$zz	-CF	-Z_ZfZfimZm	- 	-r0   r2   c            
       r   ^  \ rS rSrSrSS\S\S\S\S-  4U 4S jjjrS	\R                  4U 4S
 jjr
SrU =r$ )BlenderbotScaledWordEmbeddingV   zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
r4   r5   padding_idxembed_scaleNc                 2   > [         TU ]  XU5        X@l        g r7   )r8   r9   r\   )r:   r4   r5   r[   r\   r;   s        r.   r9   &BlenderbotScaledWordEmbedding.__init__[   s    D&r0   r#   c                 <   > [         TU ]  U5      U R                  -  $ r7   )r8   rH   r\   )r:   r#   r;   s     r.   rH   %BlenderbotScaledWordEmbedding.forward_   s    wy)D,<,<<<r0   r\   )      ?)rM   rN   rO   rP   rQ   rR   floatr9   rD   rT   rH   rU   rV   rW   s   @r.   rY   rY   V   sJ    's '3 'S '_dgk_k ' '= = =r0   rY   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr'         rA   r   dimptrainingr!   )
sizerD   matmul	transposer   
functionalsoftmaxrj   rr   
contiguous)
rd   re   rf   rg   rh   ri   rj   rk   attn_weightsattn_outputs
             r.   eager_attention_forwardr{   d   s     **R.D( <<}}Q':;gEL!#4==((2(>L==((6??([L,,|3K''1-88:K$$r0   c                   $  ^  \ rS rSrSr      SS\S\S\S\S\S	\S
\S-  S\S-  4U 4S jjjr	   SS\
R                  S\
R                  S-  S\S-  S\
R                  S-  S\\   S\\
R                  \
R                  S-  4   4S jjrSrU =r$ )BlenderbotAttention   z=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsrj   
is_decoderbias	is_causalconfig	layer_idxc	                 t  > [         T	U ]  5         Xl        X l        X0l        X-  U l        Xpl        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l	        X`l
        Xl        Uc>  U R                  (       a-  [        R                  SU R                  R                   S35        [         R"                  " XUS9U l        [         R"                  " XUS9U l        [         R"                  " XUS9U l        [         R"                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).rm   zInstantiating a decoder z without passing `layer_idx` is not recommended and will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.r   )r8   r9   r   r   rj   head_dimr   r+   ri   r   r   r   loggerwarning_oncer;   rM   r   Lineark_projv_projq_projout_proj)
r:   r   r   rj   r   r   r   r   r   r;   s
            r.   r9   BlenderbotAttention.__init__   s    	""!.MMI%$..8MdnnM]$YKr3  }}d*$""*4>>+B+B*C D, , ii	4@ii	4@ii	4@		)TBr0   hidden_stateskey_value_statespast_key_valuesrh   rk   returnc                 ,   USLnUR                   SS n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	Sn
Ub]  [        U[        5      (       aF  UR                  R                  U R                  5      n
U(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       aQ  UbN  U
(       aG  WR                  U R                     R                  nUR                  U R                     R                  nOU R                  U5      nU R!                  U5      n/ UR                   SS QSPU R                  P7nUR                  U5      R	                  SS5      nUR                  U5      R	                  SS5      nUbS  WR#                  XU R                  5      u  pU(       a.  [        U[        5      (       a  SUR                  U R                  '   [$        R&                  " U R(                  R*                  [,        5      nU" U U	UUU4U R.                  (       d  SOU R0                  U R2                  S.UD6u  nnUR4                  " / UQSP76 R7                  5       nU R9                  U5      nUU4$ )	z#Input shape: Batch x Time x ChannelNr'   r!   rA   FT        )rj   ri   )r)   r   r   viewru   
isinstancer   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   updater   get_interfacer   _attn_implementationr{   rr   rj   ri   reshaperx   r   )r:   r   r   r   rh   rk   is_cross_attentioninput_shapehidden_shapequery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_stateskv_shapeattention_interfacerz   ry   s                      r.   rH   BlenderbotAttention.forward   sc    .T9 $))#2.88b8$--8 {{=166|DNNqRST
&/+>??,77;;DNNK
%+:+P+P(+:+O+O('6$-?)]/"=*-44T^^DIIJ/66t~~FMML^4J;;~6LF--cr2FBFFH#2<<QBJ',,X6@@AFL*+?+F+Fzaeaoao+p(
%*_FY*Z*ZAEO..t~~>(?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
!\ "));;;;FFHmmK0L((r0   )r   rj   r   r   r   r   r   r   r   r   r   ri   r   )r   FTFNNNNN)rM   rN   rO   rP   rQ   rR   rc   boolr"   r9   rD   rT   r	   r   r   tuplerH   rU   rV   rW   s   @r.   r}   r}      s   G  *. $%C%C %C 	%C
 %C %C %C !4'%C :%C %CT 15(,.2H)||H)  ,,-H) 	H)
 t+H) -.H) 
u||U\\D00	1H) H)r0   r}   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\\	   S\R                  4S jr
S	rU =r$ )
BlenderbotEncoderLayer   r   c                 j  > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  US9U l        [        R                  " U R                  5      U l
        UR                  U l        [        UR                     U l        UR                  U l        [        R                   " U R                  UR"                  5      U l        [        R                   " UR"                  U R                  5      U l        [        R                  " U R                  5      U l        g )N)r   r   rj   r   )r8   r9   d_modelr   r}   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normrj   r   activation_functionactivation_fnactivation_dropoutr   encoder_ffn_dimfc1fc2final_layer_normr:   r   r;   s     r.   r9   BlenderbotEncoderLayer.__init__   s    ,nn44,,	
 %'LL$@!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D "T^^ <r0   r   rh   rk   r   c                    UnU R                  U5      nU R                  " SUUS.UD6u  p[        R                  R	                  XR                  U R
                  S9nXA-   nUnU R                  U5      nU R                  U R                  U5      5      n[        R                  R	                  XR                  U R
                  S9nU R                  U5      n[        R                  R	                  XR                  U R
                  S9nXA-   nUR                  [        R                  :X  aC  [        R                  " UR                  5      R                  S-
  n[        R                   " X* US9nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
)r   rh   rp   i  )minmaxrL   )r   r   r   rv   rj   rr   r   r   r   r   r   rB   rD   float16finfor   clamp)r:   r   rh   rk   residual_clamp_values          r.   rH   BlenderbotEncoderLayer.forward
  sD    !11-@>> 
')
 

 --m||VZVcVc-d 0 --m<**488M+BC--m?V?Vaeanan-o/--m||VZVcVc-d 0%--/++m&9&9:>>EK!KK<[YMr0   )	r   r   rj   r   r   r   r   r   r   )rM   rN   rO   rP   r"   r9   rD   rT   r   r   rH   rU   rV   rW   s   @r.   r   r      sR    =/ =$"||" " +,	"
 
" "r0   r   c                      ^  \ rS rSrSS\S\S-  4U 4S jjjr     SS\R                  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	S-  S\
S-  S\\   S\R                  4S jjrSrU =r$ )BlenderbotDecoderLayeri0  Nr   r   c           
        > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  SSUUS9U l        UR                  U l        [        UR                     U l        UR                  U l        [        R                  " U R                  5      U l        [	        U R                  UR
                  UR                  SUUS9U l        [        R                  " U R                  5      U l        [        R$                  " U R                  UR&                  5      U l        [        R$                  " UR&                  U R                  5      U l        [        R                  " U R                  5      U l        g )NT)r   r   rj   r   r   r   r   )rj   r   r   r   )r8   r9   r   r   r}   decoder_attention_headsr   r   rj   r   r   r   r   r   r   r   encoder_attnencoder_attn_layer_normr   decoder_ffn_dimr   r   r   )r:   r   r   r;   s      r.   r9   BlenderbotDecoderLayer.__init__1  s    ,nn44,,
 ~~#F$>$>?"(";";$&LL$@!/NN**,,
 (*||DNN'C$99T^^V-C-CD99V33T^^D "T^^ <r0   r   rh   encoder_hidden_statesencoder_attention_maskr   	use_cacherk   r   c                    UnU R                  U5      nU R                  " SUUUS.UD6u  p[        R                  R	                  XR                  U R
                  S9nX-   nUbb  UnU R                  U5      nU R                  " SUUUUS.UD6u  p[        R                  R	                  XR                  U R
                  S9nX-   nUnU R                  U5      nU R                  U R                  U5      5      n[        R                  R	                  XR                  U R
                  S9nU R                  U5      n[        R                  R	                  XR                  U R
                  S9nX-   nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    encoder_hidden_states (`torch.FloatTensor`):
        cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
    encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    past_key_values (`Cache`): cached past key and value projection states
)r   r   rh   rp   )r   r   rh   r   rL   )r   r   r   rv   rj   rr   r   r   r   r   r   r   r   )
r:   r   rh   r   r   r   r   rk   r   r   s
             r.   rH   BlenderbotDecoderLayer.forwardP  s   * !11-@  >> 
'+)
 	
 --m||VZVcVc-d 0 !,$H 88GM#00  +!65 /	 
  M MM11-<<Z^ZgZg1hM$4M !--m<**488M+BC--m?V?Vaeanan-o/--m||VZVcVc-d 0r0   )r   r   rj   r   r   r   r   r   r   r   r   r7   )NNNNT)rM   rN   rO   rP   r"   rR   r9   rD   rT   r	   r   r   r   rH   rU   rV   rW   s   @r.   r   r   0  s    =/ =C$J = =D /3596:(,!%:||: t+:  %||d2	:
 !&t 3: : $;: +,: 
: :r0   r   c                   \   ^  \ rS rSr% \\S'   SrSrSrSr	Sr
SrU 4S jr\S 5       rSrU =r$ )BlenderbotPreTrainedModeli  r   modelTc                    > [         TU ]  U5        [        U[        5      (       a!  [        R
                  " UR                  5        g g r7   )r8   _init_weightsr   "BlenderbotForConditionalGenerationinitzeros_final_logits_bias)r:   rd   r;   s     r.   r   'BlenderbotPreTrainedModel._init_weights  s6    f%f@AAKK001 Br0   c                     U R                   R                  n[        R                  " / SQSSSSU//U R                  S9nUR                  U5      UUS.nU$ )N)r      
      rA   r         rA   rC   )rh   r#   decoder_input_ids)r   r$   rD   tensorrC   ne)r:   	pad_tokenr#   dummy_inputss       r.   r   &BlenderbotPreTrainedModel.dummy_inputs  sZ    KK,,	LL"2Q2q)4L!MVZVaVab	'll95"!*

 r0   rL   )rM   rN   rO   rP   r"   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphr   propertyr   rU   rV   rW   s   @r.   r   r     sE    &*#N!2
  r0   r   c                      ^  \ rS rSrSr\\S.rS\4U 4S jjr	\
\\   SS\R                  S-  S\R                  S-  S	\R                   S-  S
\\   S\4
S jj5       5       5       rSrU =r$ )BlenderbotEncoderi  z
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`BlenderbotEncoderLayer`].

Args:
    config: BlenderbotConfig
    embed_tokens (nn.Embedding): output embedding
)r   
attentionsr   c                   > [         TU ]  U5        UR                  U l        UR                  U l        UR
                  nUR                  U l        UR                  U l	        UR                  (       a  [        R                  " U5      OSn[        UR                  X R                  US9U l        [!        UR                  U5      U l        [$        R&                  " [)        UR*                  5       Vs/ s H  n[-        U5      PM     sn5      U l        [$        R0                  " UR
                  5      U l        SU l        U R7                  5         g s  snf )Nrb   ra   F)r8   r9   rj   encoder_layerdrop	layerdropr   r$   r[   max_position_embeddingsmax_source_positionsscale_embeddingmathsqrtrY   
vocab_sizeembed_tokensr2   embed_positionsr   
ModuleListrangeencoder_layersr   r   r   
layer_normgradient_checkpointing	post_init)r:   r   r   r\   r   r;   s        r.   r9   BlenderbotEncoder.__init__  s    ~~11NN	!..$*$B$B!.4.D.Ddii	*#9y*:*:
  D** 
 mmUSYShShMi$jMi%;F%CMi$jk,,v~~6&+# %ks   0ENr#   rh   inputs_embedsrk   r   c                 D   US L US L-  (       a  [        S5      eUc  U R                  U5      nUR                  5       S S nU R                  U5      nX6-   n[        R
                  R                  XpR                  U R                  S9n[        U R                  UUS9n[        U R                  5       HR  u  pSn
U R                  (       a'  [        R                  " / 5      nXR                  :  a  Sn
U
(       a  MH  U	" UU40 UD6nMT     U R                  U5      n[!        US9$ )Nz:You must specify exactly one of input_ids or inputs_embedsr'   rp   )r   r  rh   FT)last_hidden_state)r+   r  rs   r  r   rv   rj   rr   r   r   	enumerater   rD   randr   r  r   )r:   r#   rh   r  rk   r   	embed_posr   idxencoder_layerto_dropdropout_probabilitys               r.   rH   BlenderbotEncoder.forward  s     -t";<YZZ  --i8M#((*3B/((5	%1--m||VZVcVc-d2;;')

 #,DKK"8CG}}&+jjn#&7"G7 -!"! ! #9  6+
 	
r0   )	rj   r  r  r  r  r   r   r  r[   r   )rM   rN   rO   rP   rQ   r   r}   _can_record_outputsr"   r9   r   r    r   rD   
LongTensorrT   FloatTensorr   r   r   rH   rU   rV   rW   s   @r.   r   r     s     0)
/ 4   .2.226	,
##d*,
 t+,
 ((4/	,

 +,,
 
,
    ,
r0   r   c                   J  ^  \ rS rSrSr\\" \SSS9\" \SSS9S.rS\	4U 4S	 jjr
\\\       SS\R                  S
-  S\R                   S
-  S\R"                  S
-  S\R                  S
-  S\S
-  S\R"                  S
-  S\S
-  S\\   S\4S jj5       5       5       rSrU =r$ )BlenderbotDecoderi  z
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`BlenderbotDecoderLayer`]

Args:
    config: BlenderbotConfig
    embed_tokens (nn.Embedding): output embedding
r!   r   )index
layer_namer   )r   r   cross_attentionsr   c           
        > [         TU ]  U5        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  (       a   [        R                  " UR                  5      OSn[        UR                  UR                  U R                  US9U l        [!        UR                  UR                  5      U l        [$        R&                  " [)        UR*                  5       Vs/ s H  n[-        XS9PM     sn5      U l        [$        R0                  " UR                  5      U l        SU l        U R7                  5         g s  snf )Nrb   ra   )r   F)r8   r9   rj   decoder_layerdropr   r$   r[   r  max_target_positionsr  r  r  r   rY   r  r  r2   r  r   r	  r
  decoder_layersr   r   r   r  r  r  )r:   r   r\   ir;   s       r.   r9   BlenderbotDecoder.__init__  s    ~~11!..$*$B$B!393I3Idii/s9v~~t/?/?[
  D**NN 
 mmBGH]H]B^_B^Q#F8B^_
 ,,v~~6&+# `s   ENr#   rh   r   r   r   r  r   rk   r   c                    US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       ab  Uc_  Uc  U R                  R                  (       a.  [	        [        U R                  S9[        U R                  S95      O[        U R                  S9nUR                  5       S S u  pUb  UR                  5       OSn[        R                  " XR                  S9U-   nUc2  [        5       (       d#  X-   n[        R                  " XUR                  S9n[        U[        5      (       a  UR                  OUn[        U R                  UUUS9n[!        U R                  UUUS9nU R#                  X4XS9nXl-   n[$        R&                  R)                  UU R(                  U R*                  S	9n[-        U R.                  5       Hj  u  nnU R*                  (       a(  [        R0                  " / 5      nUU R2                  :  a  M?  U" UUU4UUUS
.UD6n[        U[4        5      (       a  US   OUnMl     U R7                  U5      n[9        UUS9$ )NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time)r   r'   r   r   )r   r  rh   r   )r   r  rh   r   )r?   rp   )r   r   r   )r  r   )r+   r  r   is_encoder_decoderr   r
   rs   get_seq_lengthrD   rE   rC   r   onesr   r   r   r   r  r   rv   rj   rr   r  r   r  r   r   r  r   )r:   r#   rh   r   r   r   r  r   rk   
batch_size
seq_lengthr>   r?   mask_seq_lengthself_attn_cachecausal_maskr   r  decoder_layerr  layer_outputss                        r.   rH   BlenderbotDecoder.forward+  sS    -t";<stt  --i8M 0 )48V8V $L$DlZ^ZeZeFfg!5  "/!3!3!5cr!:
ETE`!?!?!Afg||J7K7KLOee!*B*D*D4AO"ZZ
ML`L`aN /+>?? 00  	 );;')+	
 ";;;'1"7	"
 ++$&< , 
 &4--mt||VZVcVc-d"+DKK"8C}}&+jjn#&7)% (> /# M 1;=%0P0PM!,VcM! #9& 68++
 	
r0   )	rj   r  r  r  r  r   r   r%  r[   )NNNNNNN)rM   rN   rO   rP   rQ   r   r   r}   r  r"   r9   r   r    r   rD   r  rT   r  r	   r   r   r   r   rH   rU   rV   rW   s   @r.   r  r    s    0$%8kZ*+>aTbc/ 2   .2.2:>:>(,26!%U
##d*U
 t+U
  %0047	U

 !& 0 04 7U
 U
 ((4/U
 $;U
 +,U
 
3U
    U
r0   r  c                   Z  ^  \ rS rSrSSS.rS\4U 4S jjrS rS r\	\
         SS	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\R                   S-  S\R                   S-  S\S-  S\\   S\4S jj5       5       rSrU =r$ )BlenderbotModeli  zshared.weight)zencoder.embed_tokens.weightzdecoder.embed_tokens.weightr   c                 J  > [         TU ]  U5        UR                  UR                  p2UR                  (       a   [
        R                  " UR                  5      OSn[        X1R                  X$S9U l	        [        U5      U l        [        U5      U l        U R                  5         g )Nrb   ra   )r8   r9   r$   r  r  r  r  r   rY   sharedr   encoderr  decoderr  )r:   r   r[   r  r\   r;   s        r.   r9   BlenderbotModel.__init__  sv     "("5"5v7H7HZ393I3Idii/s3JP[u(0(0 	r0   c                     U R                   $ r7   )r8  r:   s    r.   get_input_embeddings$BlenderbotModel.get_input_embeddings  s    {{r0   c                 |    Xl         U R                   U R                  l        U R                   U R                  l        g r7   )r8  r9  r  r:  r:   rg   s     r.   set_input_embeddings$BlenderbotModel.set_input_embeddings  s'    $(KK!$(KK!r0   Nr#   rh   r   decoder_attention_maskencoder_outputsr   r  decoder_inputs_embedsr   rk   r   c
                    Uc  U R                   " S	UUUS.U
D6nOK[        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nU R                  " S	UUUS   UUUU	S.U
D6n[        UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  S9$ )
a  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Blenderbot uses the `bos_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

Example:

```python
>>> from transformers import AutoTokenizer, BlenderbotModel

>>> model = BlenderbotModel.from_pretrained("facebook/blenderbot-400M-distill")
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")

>>> inputs = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt")
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
>>> outputs = model(input_ids=inputs.input_ids, decoder_input_ids=decoder_input_ids)

>>> last_hidden_states = outputs.last_hidden_state
>>> list(last_hidden_states.shape)
[1, 6, 1280]
```N)r#   rh   r  r   r!   rA   )r  r   r   r#   rh   r   r   r   r  r   )r  r   decoder_hidden_statesdecoder_attentionsr"  encoder_last_hidden_stater   encoder_attentionsrL   )r9  r   r   lenr:  r   r  r   r   r   r"  )r:   r#   rh   r   rD  rE  r   r  rF  r   rk   decoder_outputss               r.   rH   BlenderbotModel.forward  s   ` "/3|| 0#-+0 	0O O_==-"1!"4474H14Loa0RV14_1E1I?1-tO FJ\\ 	F
'1"1!"4#1+/	F
 	F
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r0   )r:  r9  r8  )	NNNNNNNNN)rM   rN   rO   rP   _tied_weights_keysr"   r9   r>  rB  r   r   rD   r  rT   r   r	   r  r   r   r   r   rH   rU   rV   rW   s   @r.   r6  r6    s/    (7'6

/ 
0
  .2.259:>26(,26:>!%P
##d*P
 t+P
 !++d2	P

 !& 0 04 7P
 )4/P
 P
 ((4/P
  %0047P
 $;P
 +,P
 
P
  P
r0   r6  z\
    The Blenderbot Model with a language modeling head. Can be used for summarization.
    )custom_introc                     ^  \ rS rSrSrS/rSS0rS\4U 4S jjr SS	\	S
\	S-  S\
S\R                  4U 4S jjjrS	\	SS4S jr\\          SS\R$                  S-  S\R&                  S-  S\R$                  S-  S\R$                  S-  S\S-  S\S-  S\R,                  S-  S\R,                  S-  S\R$                  S-  S\
S-  S\\   S\4S jj5       5       rSrU =r$ )r   i  r   r   lm_head.weightzmodel.shared.weightr   c                 v  > [         TU ]  U5        [        U5      U l        U R	                  S[
        R                  " SU R                  R                  R                  45      5        [        R                  " UR                  U R                  R                  R                  SS9U l        U R                  5         g )Nr   r!   Fr   )r8   r9   r6  r   register_bufferrD   zerosr8  r4   r   r   r   lm_headr  r   s     r.   r9   +BlenderbotForConditionalGeneration.__init__  s     $V,
0%++q$**BSBSBbBb>c2deyy1B1B1Q1QX]^ 	r0   Nnew_num_tokenspad_to_multiple_ofmean_resizingr   c                 x   > [         TU ]  XU5      nU R                  UR                  R                  S   5        U$ )Nr   )r8   resize_token_embeddings_resize_final_logits_biasrG   r)   )r:   rY  rZ  r[  new_embeddingsr;   s        r.   r]  :BlenderbotForConditionalGeneration.resize_token_embeddings  s<     8]jk&&~'<'<'B'B1'EFr0   c                 ,   U R                   R                  S   nX::  a  U R                   S S 2S U24   nON[        R                  " SX-
  4U R                   R                  S9n[        R
                  " U R                   U/SS9nU R                  SU5        g )Nr'   r!   r   rn   r   )r   r)   rD   rV  rC   catrU  )r:   rY  old_num_tokensnew_bias
extra_biass        r.   r^  <BlenderbotForConditionalGeneration._resize_final_logits_bias  s    //55b9+--a..@AHa)H%IRVRhRhRoRopJyy$"8"8*!E1MH0(;r0   r#   rh   r   rD  rE  r   r  rF  labelsr   rk   c                    U	bX  U
(       a  [         R                  S5        Sn
Uc7  Uc4  [        XR                  R                  U R                  R
                  5      nU R                  " U4UUUUUUUU
S.UD6nU R                  US   5      nXR                  R                  UR                  5      -   nSnU	ba  U	R                  UR                  5      n	[        5       nU" UR                  SU R                  R                  5      U	R                  S5      5      n[        UUUR                  UR                   UR"                  UR$                  UR&                  UR(                  UR*                  S9	$ )a	  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Blenderbot uses the `bos_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example conversation:

```python
>>> from transformers import AutoTokenizer, BlenderbotForConditionalGeneration

>>> mname = "facebook/blenderbot-400M-distill"
>>> model = BlenderbotForConditionalGeneration.from_pretrained(mname)
>>> tokenizer = AutoTokenizer.from_pretrained(mname)
>>> UTTERANCE = "My friends are cool but they eat too many carbs."
>>> print("Human: ", UTTERANCE)
Human:  My friends are cool but they eat too many carbs.

>>> inputs = tokenizer([UTTERANCE], return_tensors="pt")
>>> reply_ids = model.generate(**inputs)
>>> print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0])
Bot: That's unfortunate. Are they trying to lose weight or are they just trying to be healthier?

>>> REPLY = "I'm not sure"
>>> print("Human: ", REPLY)
Human: I'm not sure

>>> NEXT_UTTERANCE = (
...     "My friends are cool but they eat too many carbs.</s> <s>That's unfortunate. "
...     "Are they trying to lose weight or are they just trying to be healthier?</s> "
...     "<s> I'm not sure."
... )
>>> inputs = tokenizer([NEXT_UTTERANCE], return_tensors="pt")
>>> next_reply_ids = model.generate(**inputs)
>>> print("Bot: ", tokenizer.batch_decode(next_reply_ids, skip_special_tokens=True)[0])
Bot:   I see. Well, it's good that they're trying to change their eating habits.
```
NzJThe `use_cache` argument is changed to `False` since `labels` is provided.F)rh   r   rE  rD  r   r  rF  r   r   r'   )	losslogitsr   rI  rJ  r"  rK  r   rL  )r   warningr/   r   r$   r%   r   rW  r   torC   r   r   r  r   r   rI  rJ  r"  rK  r   rL  )r:   r#   rh   r   rD  rE  r   r  rF  rg  r   rk   outputs	lm_logitsmasked_lm_lossloss_fcts                   r.   rH   *BlenderbotForConditionalGeneration.forward  sd   H klI (-B-J$6KK44dkk6X6X%! '+jj'
)/+#9+'"7'
 '
 LL,	 6 6 9 9):J:J KK	YYy//0F')H%innR9O9O&PRXR]R]^`RabN#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r0   rW  r   )NT)
NNNNNNNNNN)rM   rN   rO   rP   r   _keys_to_ignore_on_load_missingrP  r"   r9   rR   r   r   	Embeddingr]  r^  r   r   rD   r  rT   r   r	   r  r   r   r   rH   rU   rV   rW   s   @r.   r   r     s     ':&;#//  ae!7:TzY]	 < < <  .2.259:>26(,26:>*.!%k
##d*k
 t+k
 !++d2	k

 !& 0 04 7k
 )4/k
 k
 ((4/k
  %0047k
   4'k
 $;k
 +,k
 
k
  k
r0   r   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )BlenderbotDecoderWrapperi  z
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
used in combination with the [`EncoderDecoderModel`] framework.
c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r7   )r8   r9   r  r:  r  r   s     r.   r9   !BlenderbotDecoderWrapper.__init__  s&     (0r0   c                 &    U R                   " U0 UD6$ r7   r:  )r:   argsrk   s      r.   rH    BlenderbotDecoderWrapper.forward  s    ||T,V,,r0   rz  )	rM   rN   rO   rP   rQ   r9   rH   rU   rV   rW   s   @r.   rv  rv    s    

- -r0   rv  c                   j  ^  \ rS rSrSS0rU 4S jrS rS r\\	         SS\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\S-  S\
R                  S-  S\
R                  S-  S\S-  S\\
R                  -  S\\   S\\-  4S jj5       5       rSrU =r$ )BlenderbotForCausalLMi  rS  z!model.decoder.embed_tokens.weightc                    > SUl         SUl        [        TU ]  U5        [	        U5      U l        [        R                  " UR                  UR                  SS9U l
        U R                  5         g )NTFr   )r   r*  r8   r9   rv  r   r   r   hidden_sizer  rW  r  r   s     r.   r9   BlenderbotForCausalLM.__init__  sX     $)! -f5
yy!3!3V5F5FUS 	r0   c                 B    U R                   R                  R                  $ r7   r   r:  r  r=  s    r.   r>  *BlenderbotForCausalLM.get_input_embeddings  s    zz!!...r0   c                 8    XR                   R                  l        g r7   r  rA  s     r.   rB  *BlenderbotForCausalLM.set_input_embeddings  s    */

'r0   Nr#   rh   r   r   r   r  rg  r   logits_to_keeprk   r   c
                 
   U R                   R                  " SUUUUUUUS.U
D6nUS   n[        U	[        5      (       a  [	        U	* S5      OU	nU R                  USS2USS24   5      nSnUba  UR                  UR                  5      n[        5       nU" UR                  SU R                  R                  5      UR                  S5      5      n[        UUUR                  UR                  UR                  UR                   S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, BlenderbotForCausalLM

>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
>>> model = BlenderbotForCausalLM.from_pretrained("facebook/blenderbot-400M-distill")
>>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> logits = outputs.logits
>>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
>>> list(logits.shape) == expected_shape
True
```rH  r   Nr'   )ri  rj  r   r   r   r"  rL   )r   r:  r   rR   slicerW  rl  rC   r   r   r   r  r   r   r   r   r"  )r:   r#   rh   r   r   r   r  rg  r   r  rk   rm  r   slice_indicesrj  ri  rp  s                    r.   rH   BlenderbotForCausalLM.forward  s   L >BZZ=O=O 	>
)"7#9+'	>
 	>
  
8B>SV8W8W~ot4]kmA}a,?@AYYv}}-F')HFKKDKK,B,BCV[[QS_UD0#33!//))$55
 	
r0   rr  )	NNNNNNNNr   )rM   rN   rO   rP   rP  r9   r>  rB  r   r   rD   r  rT   r  r	   r   rR   r   r   r   r   rH   rU   rV   rW   s   @r.   r~  r~    s/   =	/0  .2.2:>;?(,26*.!%-.A
##d*A
 t+A
  %0047	A

 !& 1 1D 8A
 A
 ((4/A
   4'A
 $;A
 ell*A
 +,A
 
2	2A
  A
r0   r~  )r~  r   r6  r   )Nr   )IrQ   r  collections.abcr   rD   r   torch.nnr    r   r   activationsr   cache_utilsr	   r
   r   
generationr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   utils.output_capturingr   r    configuration_blenderbotr"   
get_loggerrM   r   rT   rR   r/   rt  r2   rY   Modulerc   r{   r}   r   r   r   r   r  r6  r   rv  r~  __all__rL   r0   r.   <module>r     s      $   % & ! C C ) J B 9  G & l l 7 E 6 
		H	%%,, c [^  -2<< -*
=BLL 
=( !%II%<<% 
% <<	%
 LL4'% T\% % '(%:r)")) r)l57 5rZ7 Zz   4X
1 X
v@
1 @
F l
/ l
 l
^ 
M
)BO M

M
b-8 - Y
5 Y
xr0   