
    Z jpu                        S r SSKJr  SSKrSSKJr  SSKJrJrJr  SSK	J
r
  SSKJrJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJrJrJrJr  SSKJrJr  SSKJr  SSK J!r!J"r"J#r#J$r$  SSK%J&r&  SSK'J(r(  SSK)J*r*  \$RV                  " \,5      r- " S S\R\                  5      r/ S2S\R`                  S\Rb                  S\Rb                  S\Rb                  S\Rb                  S-  S\2S\24S jjr3 " S S \R`                  5      r4 " S! S"\5      r5\" " S# S$\5      5       r6 " S% S&\65      r7\" " S' S(\65      5       r8 " S) S*\6\5      r9\"" S+S,9 " S- S.\65      5       r:\" " S/ S0\65      5       r;/ S1Qr<g)3zPyTorch OPT model.    )CallableN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPastQuestionAnsweringModelOutput SequenceClassifierOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)capture_outputs   )	OPTConfigc                      ^  \ rS rSrSrS\S\4U 4S jjr  SS\R                  S\S	\R                  S-  4U 4S
 jjjr	Sr
U =r$ )OPTLearnedPositionalEmbedding-   zF
This module learns positional embeddings up to a fixed maximum size.
num_embeddingsembedding_dimc                 L   > SU l         [        TU ]	  XR                   -   U5        g N   )offsetsuper__init__)selfr"   r#   	__class__s      u/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/opt/modeling_opt.pyr)   &OPTLearnedPositionalEmbedding.__init__2   s"     ++5}E    Nattention_maskpast_key_values_lengthposition_idsc                    > Uc5  [         R                  " USS9nX1-  S-
  R                  5       nUSS2US24   n[        TU ]  X0R
                  -   5      $ )z3`input_ids_shape` is expected to be [bsz x seqlen].Nr   dim)torchcumsumlongr(   forwardr'   )r*   r/   r0   r1   r+   s       r,   r8   %OPTLearnedPositionalEmbedding.forward8   sZ      <<A>L(9A=CCEL'+A+B(BCLw|kk9::r.   )r'   )r   N)__name__
__module____qualname____firstlineno____doc__intr)   r5   
LongTensorr8   __static_attributes____classcell__r+   s   @r,   r    r    -   s]    Fs F3 F '(04	;((; !$; &&-	; ;r.   r    modulequerykeyvaluer/   scalingdropoutc                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )N)r4   dtypeptrainingr   r&   )r5   matmul	transposer   
functionalsoftmaxfloat32torM   rI   rP   
contiguous)
rD   rE   rF   rG   r/   rH   rI   kwargsattn_weightsattn_outputs
             r,   eager_attention_forwardr[   J   s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r.   c                      ^  \ rS rSrSr SS\S\S-  4U 4S jjjr   SS\R                  S\
S-  S	\R                  S-  S
\S\\R                  \R                  S-  \
S-  4   4
S jjrSrU =r$ )OPTAttentiona   z=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        UR                  U l	        X l
        Uc-  [        R                  SU R                  R                   S35        U R                  U R                  -  U l        SU l        U R                  U R                  -  U R                  :w  a&  [#        SU R                   SU R                   S35      eU R                  S-  U l        [&        R(                  " U R                  U R                  U R                  S9U l        [&        R(                  " U R                  U R                  U R                  S9U l        [&        R(                  " U R                  U R                  U R                  S9U l        [&        R(                  " U R                  U R                  U R                  S9U l        g )	NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Tz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩bias)r(   r)   r_   hidden_size	embed_dimnum_attention_heads	num_headsattention_dropoutrI   enable_biasr`   loggerwarning_oncer+   r:   head_dim	is_causal
ValueErrorrH   r   Lineark_projv_projq_projout_proj)r*   r_   r`   rX   r+   s       r,   r)   OPTAttention.__init__d   s{    	++33//!--" !8!8 9 :, , $..8MMDNN*t~~=MdnnM]$T^^$4B8  }}d*iiTEUEUViiTEUEUViiTEUEUV		$..$..tGWGWXr.   hidden_statespast_key_valuesr/   output_attentionsreturnc                 ^   UR                  5       u  pgnU R                  U5      U R                  -  n	U	R                  USU R                  U R
                  5      R                  SS5      n	U R                  U5      n
U R                  U5      nU
R                  USU R                  U R
                  5      R                  SS5      n
UR                  USU R                  U R
                  5      R                  SS5      nUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [        5      nU" U U	U
UU4U R                   (       d  SOU R"                  SS.UD6u  pUR%                  XgS5      R'                  5       nU R)                  U5      nX4$ )z#Input shape: Batch x Time x ChannelrK   r   r&           g      ?)rI   rH   )sizerr   rH   viewrg   rl   rR   rp   rq   updater`   r   get_interfacer_   _attn_implementationr[   rP   rI   reshaperW   rs   )r*   ru   rv   r/   rw   rX   bsztgt_len_query_states
key_statesvalue_statesattention_interfacerZ   rY   s                  r,   r8   OPTAttention.forward   s    (,,.a {{=1DLL@#((b$..$--PZZ[\^_`[[/
{{=1__S"dnndmmLVVWXZ[\
#((b$..$--PZZ[\^_`&'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$,,	%
 	%
! "))#;FFHmmK0((r.   )r_   rI   re   ri   rl   rm   rp   r`   rg   rs   rr   rH   rq   N)NNF)r:   r;   r<   r=   r>   r   r?   r)   r5   Tensorr
   booltupler8   rA   rB   rC   s   @r,   r]   r]   a   s    G
 !%!Y!Y :!Y !YL )-.2"'.)||.) .) t+	.)
  .) 
u||U\\D0%$,>	?.) .)r.   r]   c                      ^  \ rS rSrSS\S\S-  4U 4S jjjr    SS\R                  S\R                  S-  S\	S-  S	\
S-  S
\R                  S-  S\\   S\R                  4S jjrSrU =r$ )OPTDecoderLayer   Nr_   r`   c                 p  > [         TU ]  5         UR                  U l        [	        XS9U l        UR                  U l        UR                  U l        [        UR                     U l
        [        R                  " U R                  UR                  S9U l        [        R                  " U R                  UR                   UR"                  S9U l        [        R                  " UR                   U R                  UR"                  S9U l        [        R                  " U R                  UR                  S9U l        g )N)r_   r`   elementwise_affinerb   )r(   r)   rd   re   r]   	self_attndo_layer_norm_beforerI   r	   activation_functionactivation_fnr   	LayerNormlayer_norm_elementwise_affineself_attn_layer_normro   ffn_dimri   fc1fc2final_layer_norm)r*   r_   r`   r+   s      r,   r)   OPTDecoderLayer.__init__   s    ++%VI$*$?$?!~~#F$>$>?$&LLNNv/S/S%
! 99T^^V^^&BTBTU99V^^T^^&BTBTU "T^^PVPtPt ur.   ru   r/   rv   	use_cacher1   rX   rx   c                    UnU R                   (       a  U R                  U5      nU R                  " SUUUUS.UD6u  p[        R                  R                  XR
                  U R                  S9nXq-   nU R                   (       d  U R                  U5      nUR                  n	UR                  SUR                  S5      5      nUnU R                   (       a  U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      n[        R                  R                  XR
                  U R                  S9nXq-   R                  U	5      nU R                   (       d  U R                  U5      nU$ )N)ru   rv   r1   r/   rN   rK    )r   r   r   r   rS   rI   rP   shaper   r{   r   r   r   r   r|   )
r*   ru   r/   rv   r   r1   rX   residualr   hidden_states_shapes
             r,   r8   OPTDecoderLayer.forward   sk    ! $$ 55mDM  >> 
'+%)	

 
 --m||VZVcVc-d 0 (( 55mDM ,11%--b-2D2DR2HI  $$ 11-@M/**=9/--m||VZVcVc-d!1778KL (( 11-@Mr.   )	r   r   rI   re   r   r   r   r   r   r   )NNFN)r:   r;   r<   r=   r   r?   r)   r5   r   r
   r   r@   r   r   r8   rA   rB   rC   s   @r,   r   r      s    vy vS4Z v v( /3(,!&043||3 t+3 	3
 $;3 &&-3 -.3 
3 3r.   r   c                   L    \ rS rSr% \\S'   SrSrS/rSr	Sr
SrSrSr\\S.rSrg)	OPTPreTrainedModel   r_   modelTr   )ru   
attentionsr   N)r:   r;   r<   r=   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphr   r]   _can_record_outputsrA   r   r.   r,   r   r      sH    &*#*+"&N!("r.   r   c                     ^  \ rS rSrSrS\4U 4S jjr\\\	      SS\
R                  S-  S\
R                  S-  S\S-  S	\
R                  S-  S
\S-  S\
R                  S-  S\\   S\4S jj5       5       5       rSrU =r$ )
OPTDecoderi  z
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OPTDecoderLayer`]

Args:
    config: OPTConfig
r_   c           
      F  > [         TU ]  U5        UR                  U l        UR                  U l        UR                  U l        UR                  U l        UR                  U l        [        R                  " UR                  UR                  U R
                  5      U l        [        UR                  UR                  5      U l        UR                  UR                  :w  a0  [        R                   " UR                  UR                  SS9U l        OS U l        UR                  UR                  :w  a0  [        R                   " UR                  UR                  SS9U l        OS U l        UR&                  (       a@  UR(                  (       d/  [        R*                  " UR                  UR,                  S9U l        OS U l        [        R0                  " [3        UR4                  5       Vs/ s H  n[7        XS9PM     sn5      U l        SU l        U R=                  5         g s  snf )NFrb   r   )r`   )r(   r)   rI   	layerdroppad_token_idpadding_idxmax_position_embeddingsmax_target_positions
vocab_sizer   	Embeddingword_embed_proj_dimembed_tokensr    rd   embed_positionsro   project_out
project_inr   _remove_final_layer_normr   r   r   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointing	post_init)r*   r_   ir+   s      r,   r)   OPTDecoder.__init__  s    ~~))!..$*$B$B! ++LL):):F<V<VX\XhXhi<V=[=[]c]o]op%%););;!yy););V=W=W^cdD#D%%););; ii(B(BFDVDV]bcDO"DO
 &&v/N/N$&LL""v7[7[%D! %)D!mmSXY_YqYqSr$sSra_V%ISr$st&+#	 %ts   'HN	input_idsr/   rv   inputs_embedsr   r1   rX   rx   c           	      
   US L US L-  (       a  [        S5      eUb  UR                  SUR                  S   5      nUc  U R                  U5      nU(       a  Uc  [	        U R
                  S9nUb  UR                  5       OSnUc=  XR                  S   -   n	[        R                  " UR                  S   XR                  S9nUc5  [        R                  " USS9nXb-  S-
  R                  5       nUS S 2US 24   n[        U R
                  UUUS9n
U R                  X(US	9nU R                  b  U R                  U5      nXKR                  UR                  5      -   n[!        U R"                  5       HK  u  pU R$                  (       a'  [        R&                  " / 5      nXR(                  :  a  M=  U" U4U
UUUS
.UD6nMM     U R*                  b  U R+                  U5      nU R,                  b  U R-                  U5      n[/        UUS9$ )Nz:You must specify exactly one of input_ids or inputs_embedsrK   )r_   r   r   devicer3   )r_   r   r/   rv   )r1   )r/   r1   rv   r   )last_hidden_staterv   )rn   r|   r   r   r   r_   get_seq_lengthr5   onesr   r6   r7   r   r   r   rV   	enumerater   rP   randr   r   r   r   )r*   r   r/   rv   r   r   r1   rX   past_seen_tokens
seq_lengthcausal_mask
pos_embedsru   idxdecoder_layerdropout_probabilitys                   r,   r8   OPTDecoder.forward>  s    -t";<YZZ !r9??2+>?I  --i8M0*$++>O?N?Z?99;`a!),?,?,BBJ"ZZ(;(;A(>
SgSghN  <<A>L(9A=CCEL'+;+<(<=L(;;')+	
 )).Ye)f
??& OOM:M%m6J6J(KK #,DKK"8C}}&+jjn#&7)*) /# M #9    , 11-@M' ,,];M&++
 	
r.   )rI   r   r   r   r   r   r   r   r   r   r   r   NNNNNN)r:   r;   r<   r=   r>   r   r)   r   r   r   r5   r@   r   r
   FloatTensorr   r   r   r   r8   rA   rB   rC   s   @r,   r   r     s    #y #J   .2.2(,26!%04K
##d*K
 t+K
 	K

 ((4/K
 $;K
 &&-K
 +,K
 
!K
    K
r.   r   c                     ^  \ rS rSrS\4U 4S jjrS rS r\\	      SS\
R                  S-  S\
R                  S-  S	\S-  S
\
R                  S-  S\S-  S\
R                  S-  S\\   S\4S jj5       5       rSrU =r$ )OPTModeli  r_   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r   )r(   r)   r   decoderr   r*   r_   r+   s     r,   r)   OPTModel.__init__  s&     !&)r.   c                 .    U R                   R                  $ r   r   r   r*   s    r,   get_input_embeddingsOPTModel.get_input_embeddings  s    ||(((r.   c                 $    XR                   l        g r   r   r*   rG   s     r,   set_input_embeddingsOPTModel.set_input_embeddings  s    $)!r.   Nr   r/   rv   r   r   r1   rX   rx   c           
          U R                   " SUUUUUUS.UD6n[        UR                  UR                  UR                  UR
                  S9$ )Nr   r/   r1   rv   r   r   )r   rv   ru   r   r   )r   r   r   rv   ru   r   )	r*   r   r/   rv   r   r   r1   rX   decoder_outputss	            r,   r8   OPTModel.forward  sf     48<< 4
)%+'4
 4
 '-??+;;)77&11	
 	
r.   )r   r   )r:   r;   r<   r=   r   r)   r   r   r   r   r5   r@   r   r
   r   r   r   r   r   r8   rA   rB   rC   s   @r,   r   r     s    y )*  .2.2(,26!%04
##d*
 t+
 	

 ((4/
 $;
 &&-
 +,
 
!
  
r.   r   c                   J  ^  \ rS rSrSS0rU 4S jrS rS r\\	        SS\
R                  S-  S	\
R                  S-  S
\S-  S\
R                  S-  S\
R                  S-  S\S-  S\
R                  S-  S\\
R                  -  S\\   S\\-  4S jj5       5       rSrU =r$ )OPTForCausalLMi  zlm_head.weightz!model.decoder.embed_tokens.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g NFrb   )
r(   r)   r   r   r   ro   r   r   lm_headr   r   s     r,   r)   OPTForCausalLM.__init__  sK     f%
 yy!;!;V=N=NUZ[ 	r.   c                 B    U R                   R                  R                  $ r   r   r   r   r   s    r,   r   #OPTForCausalLM.get_input_embeddings      zz!!...r.   c                 8    XR                   R                  l        g r   r   r   s     r,   r   #OPTForCausalLM.set_input_embeddings      */

'r.   Nr   r/   rv   r   labelsr   r1   logits_to_keeprX   rx   c	           
         U R                   R                  " SUUUUUUS.U	D6n
U
R                  n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      R                  5       nSnUb)  U R                  " SXU R                  R                  S.U	D6n[        UUU
R                  U
R                  U
R                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, OPTForCausalLM

>>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious. I'm just a little bit of a weirdo."
```r   N)logitsr  r   lossr  rv   ru   r   r   )r   r   r   
isinstancer?   slicer   rW   loss_functionr_   r   r   rv   ru   r   )r*   r   r/   rv   r   r  r   r1   r  rX   outputsru   slice_indicesr  r  s                  r,   r8   OPTForCausalLM.forward  s    J ,0::+=+= ,
)%+',
 ,
  118B>SV8W8W~ot4]kmA}a,?@ALLN%%pVt{{OeOepiopD%#33!//))
 	
r.   )r   r   )NNNNNNNr   )r:   r;   r<   r=   _tied_weights_keysr)   r   r   r   r   r5   r@   r   r
   r   r   r?   r   r   r   r   r8   rA   rB   rC   s   @r,   r   r     s   *,OP/0  .2.2(,26*.!%04-.<
##d*<
 t+<
 	<

 ((4/<
   4'<
 $;<
 &&-<
 ell*<
 +,<
 
'	'<
  <
r.   r   a  
    The OPT Model transformer with a sequence classification head on top (linear layer).

    [`OPTForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                   *  ^  \ rS rSrS\4U 4S jjr\\       SS\R                  S-  S\R                  S-  S\S-  S\R                  S-  S	\R                  S-  S
\S-  S\R                  S-  S\\   S\\-  4S jj5       5       rS rS rSrU =r$ )OPTForSequenceClassificationi  r_   c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  U R                  SS9U l        U R                  5         g r   )
r(   r)   
num_labelsr   r   r   ro   r   scorer   r   s     r,   r)   %OPTForSequenceClassification.__init__  sT      ++f%
YYv994??QVW
 	r.   Nr   r/   rv   r   r  r   r1   rX   rx   c           	         U R                   " U4UUUUUS.UD6n	U	R                  n
U R                  U
5      nUb  UR                  SS u  pOUR                  SS u  pU R                  R
                  c  US:w  a  [        S5      eU R                  R
                  c  SnOUb  XR                  R
                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                  R                    S35        U[        R                  " XR                  S	9U4   nSnUGb  U R                  R"                  c  U R$                  S:X  a  S
U R                  l        OoU R$                  S:  aN  UR&                  [        R(                  :X  d  UR&                  [        R*                  :X  a  SU R                  l        OSU R                  l        U R                  R"                  S
:X  aJ  [-        5       nU R$                  S:X  a&  U" UR/                  5       UR/                  5       5      nOU" UU5      nOU R                  R"                  S:X  a=  [1        5       nU" UR3                  SU R$                  5      UR3                  S5      5      nO-U R                  R"                  S:X  a  [5        5       nU" UU5      n[7        UUU	R8                  U	R:                  U	R<                  S9$ )ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
rv   r/   r1   r   r   Nr&   r   z=Cannot handle batch sizes > 1 if no padding token is defined.rK   )r   rM   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classificationr  )r   r   r  r   r_   r   rn   rV   r   r5   int32arangeargmaxrj   rk   r+   r:   problem_typer  rM   r7   r?   r   squeezer   r|   r   r   rv   ru   r   )r*   r   r/   rv   r   r  r   r1   rX   transformer_outputsru   r  
batch_sizesequence_lengthlast_non_pad_tokennon_pad_masktoken_indicespooled_logitsr  loss_fcts                       r,   r8   $OPTForSequenceClassification.forward'  s   & 8<zz8
+)%'8
 8
 ,==M* *3//"1*='J*7*=*=bq*A'J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||J}}MOaab{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6/ /??-;;*55
 	
r.   c                 B    U R                   R                  R                  $ r   r   r   s    r,   r   1OPTForSequenceClassification.get_input_embeddings|  r   r.   c                 8    XR                   R                  l        g r   r   r   s     r,   r   1OPTForSequenceClassification.set_input_embeddings  r   r.   )r   r  r  )NNNNNNN)r:   r;   r<   r=   r   r)   r   r   r5   r@   r   r
   r   r   r   r   r   r8   r   r   rA   rB   rC   s   @r,   r  r    s    y   .237(,26*.!%04Q
##d*Q
 ))D0Q
 	Q

 ((4/Q
   4'Q
 $;Q
 &&-Q
 +,Q
 
1	1Q
  Q
f/0 0r.   r  c                   J  ^  \ rS rSrS\4U 4S jjr\\        SS\R                  S-  S\R                  S-  S\S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\S-  S\R                  S-  S\\   S\\-  4S jj5       5       rS rS rSrU =r$ )OPTForQuestionAnsweringi  r_   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  S5      U l        U R                  5         g r%   )	r(   r)   r   r   r   ro   r   
qa_outputsr   r   s     r,   r)    OPTForQuestionAnswering.__init__  s@     f%
))F$>$>B 	r.   Nr   r/   rv   r   start_positionsend_positionsr   r1   rX   rx   c	           	      (   U R                   " U4UUUUUS.U	D6n
U
R                  nU R                  U5      nUR                  SSS9u  pUR	                  S5      R                  5       nUR	                  S5      R                  5       nSnUb  Ub  [        UR                  5       5      S:  a  UR	                  S5      n[        UR                  5       5      S:  a  UR	                  S5      nUR                  S5      nUR                  SU5      R                  UR                  5      nUR                  SU5      R                  UR                  5      n[        US9nU" X5      nU" X5      nUU-   S-  n[        UUUU
R                  U
R                  S	9$ )
a  
Example:

```python
>>> from transformers import AutoTokenizer, OPTForQuestionAnswering
>>> import torch

>>> torch.manual_seed(4)  # doctest: +IGNORE_RESULT
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

>>> # note: we are loading a OPTForQuestionAnswering from the hub here,
>>> # so the head will be randomly initialized, hence the predictions will be random
>>> model = OPTForQuestionAnswering.from_pretrained("facebook/opt-350m")

>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

>>> inputs = tokenizer(question, text, return_tensors="pt")
>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> answer_start_index = outputs.start_logits.argmax()
>>> answer_end_index = outputs.end_logits.argmax()

>>> answer_offset = len(tokenizer(question)[0])

>>> predict_answer_tokens = inputs.input_ids[
...     0, answer_offset + answer_start_index : answer_offset + answer_end_index + 1
... ]
>>> predicted = tokenizer.decode(predict_answer_tokens)
>>> predicted
' a nice puppet'
```r  r   rK   r3   Nr   )ignore_indexr&   )r  start_logits
end_logitsru   r   )r   r   r/  splitr  rW   lenr{   clamprV   r   r   r   ru   r   )r*   r   r/   rv   r   r1  r2  r   r1   rX   r  ru   r  r5  r6  
total_lossignored_indexr&  
start_lossend_losss                       r,   r8   OPTForQuestionAnswering.forward  s   \ 8<zz8
+)%'8
 8
 ,==/#)<<r<#: #++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EHHWO)//=ADDV]]SM']CH!,@J
:H$x/14J+%!-;;*55
 	
r.   c                 B    U R                   R                  R                  $ r   r   r   s    r,   r   ,OPTForQuestionAnswering.get_input_embeddings  r   r.   c                 8    XR                   R                  l        g r   r   r   s     r,   r   ,OPTForQuestionAnswering.set_input_embeddings  r   r.   )r   r/  )NNNNNNNN)r:   r;   r<   r=   r   r)   r   r   r5   r@   r   r
   r   r   r   r   r   r8   r   r   rA   rB   rC   s   @r,   r-  r-    s   y   .237(,263715!%04Q
##d*Q
 ))D0Q
 	Q

 ((4/Q
 ))D0Q
 ''$.Q
 $;Q
 &&-Q
 +,Q
 
-	-Q
  Q
f/0 0r.   r-  )r   r   r   r  r-  )rz   )=r>   collections.abcr   r5   r   torch.nnr   r   r   activationsr	   cache_utilsr
   r   
generationr   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   configuration_optr   
get_loggerr:   rj   r   r    Moduler   floatr[   r]   r   r   r   r   r   r  r-  __all__r   r.   r,   <module>rV     s    $   A A ! . ) / B 9  G & R R 7 5 ( 
		H	%;BLL ;H %II%<<% 
% <<	%
 LL4'% % %.T)299 T)nE0 EP    {
# {
| (
! (
 (
VQ
' Q
h c0#5 c0c0L b00 b0 b0Jr.   