
    Z jY                       S r SSKJr  SSKJr  SSKrSSKJr  SSKJrJ	r	  SSK
Jr  SS	KJrJrJrJrJr  SS
KJr  SSKJr  SSKJr  SSKJrJr  SSKJr  SSKJr  \R>                  " \ 5      r!\" SS9\ " S S\5      5       5       r"\\ " S S\5      5       5       r#\" SS9\ " S S\5      5       5       r$\ " S S\$5      5       r%\" SS9 " S S\$5      5       r&\" SS9 " S  S!\$\5      5       r'/ S"Qr(g)#zRAG model implementation.    )Callable)	dataclassN)nn   )CacheEncoderDecoderCache)PreTrainedConfig)GenerationConfigGenerationMixinGenerationModeLogitsProcessorListStoppingCriteriaList)GENERATION_MODES_MAPPING)ModelOutput)PreTrainedModel)auto_docstringlogging   )	RagConfig)RagRetrieverzI
    Base class for retriever augmented marginalized models outputs.
    )custom_introc                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S	'   Sr\R                  S-  \S
'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\R                  S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Srg)RetrievAugLMMarginOutput$   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
    each vocabulary token.
doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
    Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
    `question_encoder_last_hidden_state`.
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains precomputed hidden-states (key and values in the attention blocks) of the decoder that can be used
    (see `past_key_values` input) to speed up sequential decoding.
retrieved_doc_embeds (`torch.FloatTensor` of shape `(batch_size, config.n_docs, hidden_size)`, *optional*, returned when *output_retrieved=True*):
    Embedded documents retrieved by the retriever. Is used with `question_encoder_last_hidden_state` to compute
    the `doc_scores`.
retrieved_doc_ids (`torch.LongTensor` of shape `(batch_size, config.n_docs)`, *optional*, returned when *output_retrieved=True*):
    The indexes of the embedded documents retrieved by the retriever.
context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
    Input ids post-processed from the retrieved documents and the question encoder input_ids by the retriever.
context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
    Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
    retriever.
question_encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
    Sequence of hidden states at the output of the last layer of the question encoder pooled output of the
    model.
question_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
    shape `(batch_size, sequence_length, hidden_size)`.

    Hidden states of the question encoder at the output of each layer plus the initial embedding outputs.
question_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.

    Attentions weights of the question encoder, after the attention softmax, used to compute the weighted
    average in the self-attention heads.
generator_enc_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of the last layer of the generator encoder of the model.
generator_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
    shape `(batch_size, sequence_length, hidden_size)`.

    Hidden states of the generator encoder at the output of each layer plus the initial embedding outputs.
generator_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.

    Attentions weights of the generator encoder, after the attention softmax, used to compute the weighted
    average in the self-attention heads.
generator_dec_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
    shape `(batch_size, sequence_length, hidden_size)`.

    Hidden states of the generator decoder at the output of each layer plus the initial embedding outputs.
generator_dec_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.

    Attentions weights of the generator decoder, after the attention softmax, used to compute the weighted
    average in the self-attention heads.
generator_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.

    Cross-attentions weights of the generator decoder, after the attention softmax, used to compute the
    weighted average in the cross-attention heads.
Nlosslogits
doc_scorespast_key_valuesretrieved_doc_embedsretrieved_doc_idscontext_input_idscontext_attention_mask"question_encoder_last_hidden_state.question_enc_hidden_statesquestion_enc_attentionsgenerator_enc_last_hidden_stategenerator_enc_hidden_statesgenerator_enc_attentionsgenerator_dec_hidden_statesgenerator_dec_attentionsgenerator_cross_attentions )__name__
__module____qualname____firstlineno____doc__r   torchFloatTensor__annotations__r   r   r   r   r   r    
LongTensorr!   r"   r#   r$   tupler%   r&   r'   r(   r)   r*   r+   __static_attributes__r,       u/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/rag/modeling_rag.pyr   r   $   s   DL &*D%

d
")'+FE$++/J!!D(/$(OUT\(59%++d2915u''$.515u''$.56:E,,t3:CG&(9(9D(@GGKe&7&7&< = DKDHU5#4#4c#9:TAH@D#U%6%6%=DHLu'8'8#'=!>!ELEIeE$5$5s$:;dBIHLu'8'8#'=!>!ELEIeE$5$5s$:;dBIGKe&7&7&< = DKr8   r   c                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S	'   Sr\R                  S-  \S
'   Sr\R                  S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\R                  S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Srg)RetrievAugLMOutput   a:  
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
    each vocabulary token.
doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
    Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
    `question_encoder_last_hidden_state`.
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains precomputed hidden-states (key and values in the attention blocks) of the decoder that can be used
    (see `past_key_values` input) to speed up sequential decoding.
retrieved_doc_embeds (`torch.FloatTensor` of shape `(batch_size, config.n_docs, hidden_size)`, *optional*, returned when *output_retrieved=True*):
    Embedded documents retrieved by the retriever. Is used with `question_encoder_last_hidden_state` to compute
    the `doc_scores`.
retrieved_doc_ids (`torch.LongTensor` of shape `(batch_size, config.n_docs)`, *optional*, returned when *output_retrieved=True*):
    The indexes of the embedded documents retrieved by the retriever.
context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
    Input ids post-processed from the retrieved documents and the question encoder input_ids by the retriever.
context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
    Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
    retriever.
question_encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
    Sequence of hidden states at the output of the last layer of the question encoder pooled output of the
    model.
question_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
    shape `(batch_size, sequence_length, hidden_size)`.

    Hidden states of the question encoder at the output of each layer plus the initial embedding outputs.
question_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.

    Attentions weights of the question encoder, after the attention softmax, used to compute the weighted
    average in the self-attention heads.
generator_enc_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of the last layer of the generator encoder of the model.
generator_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
    shape `(batch_size, sequence_length, hidden_size)`.

    Hidden states of the generator encoder at the output of each layer plus the initial embedding outputs.
generator_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.

    Attentions weights of the generator encoder, after the attention softmax, used to compute the weighted
    average in the self-attention heads.
generator_dec_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
    shape `(batch_size, sequence_length, hidden_size)`.

    Hidden states of the generator decoder at the output of each layer plus the initial embedding outputs.
generator_dec_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.

    Attentions weights of the generator decoder, after the attention softmax, used to compute the weighted
    average in the self-attention heads.
generator_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.

    Cross-attentions weights of the generator decoder, after the attention softmax, used to compute the
    weighted average in the cross-attention heads.
Nr   r   r   r   r    r!   r"   r#   .r$   r%   r&   r'   r(   r)   r*   r+   r,   )r-   r.   r/   r0   r1   r   r2   r3   r4   r   r   r   r   r    r5   r!   r"   r#   r$   r6   r%   r&   r'   r(   r)   r*   r+   r7   r,   r8   r9   r;   r;      s   BH (,FE$++/J!!D(/$(OUT\(59%++d2915u''$.515u''$.56:E,,t3:CG&(9(9D(@GGKe&7&7&< = DKDHU5#4#4c#9:TAH@D#U%6%6%=DHLu'8'8#'=!>!ELEIeE$5$5s$:;dBIHLu'8'8#'=!>!ELEIeE$5$5s$:;dBIGKe&7&7&< = DKr8   r;   a  
    RAG models were released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP
    Tasks](https://huggingface.co/papers/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus et al.

    RAG is a retriever augmented model and encapsulate three components: a question encoder, a dataset retriever and a
    generator, the encoder and generator are trainable while the retriever is just an indexed dataset.
    c            
       l    \ rS rSr% \\S'   SrSrSr\	   SS\
S-  S\
S-  S\S-  S	\4S
 jj5       rSrg)RagPreTrainedModel   configragTN.question_encoder_pretrained_model_name_or_path'generator_pretrained_model_name_or_path	retrieverreturnc                    UR                  5        VVs0 s H,  u  pVUR                  S5      (       d  M  U[        S5      S U_M.     nnnUR                  5        VVs0 s H,  u  pVUR                  S5      (       d  M  U[        S5      S U_M.     nnnU H	  n	USU	-   	 M     U H	  n	USU	-   	 M     UR                  SS5      n
U
cL  Uc   S5       eSSKJn  SU;  a#  SS	KJn  UR                  " U40 UDS
S0D6u  pXS'   UR                  " U40 UD6n
UR                  SS5      nUcN  Uc   S5       eSSKJ	n  SU;  a%  SS	KJn  UR                  " U40 UDS
S0D6u  nnUUS'   UR                  " U40 UD6nUR                  S5      nUc,  [        R                  " U
R                  UR                  40 UD6nU " XUUS9$ s  snnf s  snnf )a	  
Instantiates an question encoder and a generator from one or two base classes of the library from pretrained
model checkpoints.

The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
the model, you need to first set it back in training mode with `model.train()`.

Params:
    question_encoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
        Information necessary to initiate the question encoder. Can be either:

            - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
            - A path to a *directory* containing model weights saved using
              [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.

    generator_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
        Information necessary to initiate the generator. Can be either:

            - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
            - A path to a *directory* containing model weights saved using
              [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.

    model_args (remaining positional arguments, *optional*):
        All remaining positional arguments will be passed to the underlying model's `__init__` method.
    retriever ([`RagRetriever`], *optional*):
        The retriever to use.
    kwwargs (remaining dictionary of keyword arguments, *optional*):
        Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
        `output_attentions=True`).

        - To update the question_encoder configuration, use the prefix *question_encoder_* for each
          configuration parameter.
        - To update the generator configuration, use the prefix *generator_* for each configuration parameter.
        - To update the parent model configuration, do not use a prefix for each configuration parameter.

        Behaves differently depending on whether a `config` is provided or automatically loaded.

Example:

```python
>>> from transformers import RagModel

>>> # initialize a RAG from two pretrained models.
>>> model = RagModel.from_pretrained_question_encoder_generator(
...     "facebook/dpr-question_encoder-single-nq-base", "google-t5/t5-small"
... )
>>> # saving model after fine-tuning
>>> model.save_pretrained("./rag")
>>> # load fine-tuned model
>>> model = RagModel.from_pretrained("./rag")
```question_encoder_N
generator_modelznIf `model` is not defined as an argument, a `question_encoder_pretrained_model_name_or_path` has to be defined   	AutoModelr@   )
AutoConfigreturn_unused_kwargsTzqIf `generator_model` is not defined as an argument, a `generator_pretrained_model_name_or_path` has to be definedAutoModelForSeq2SeqLM)question_encoder	generatorr@   rD   )items
startswithlenpopauto.modeling_autorL   auto.configuration_autorM   from_pretrainedrP   getr   'from_question_encoder_generator_configsr@   )clsrB   rC   rD   kwargsargumentvaluekwargs_question_encoderkwargs_generatorkeyrQ   rL   rM   question_encoder_configrR   rP   generator_configr@   s                     r9   *from_pretrained_question_encoder_generator=RagPreTrainedModel.from_pretrained_question_encoder_generator   sX   | $*<<>#
#1""#67 8HS,-/0%7#1 	  #
 $*<<>
#1""<0 1HS&()50#1 	 
 +C*S01 +#C|c)* $ 366wE#AM M 766@CMC]C]BD-D *.D@'
 5L1(88> BY  %(($7	:F !F C//@5?5O5O;6?O6fj62 "2 .> *-==7;KI
 H%>FF '')9)9=CF $4RXdmnnO#

s   F=F=G=Gr,   )NNN)r-   r.   r/   r0   r   r4   base_model_prefix_supports_flash_attn_supports_sdpaclassmethodstrr   r   re   r7   r,   r8   r9   r>   r>      sy     N FJ>B)-	Bo8;d
Bo 25tBo  $&	Bo 
Bo Bor8   r>   c            !         ^  \ rS rSr    SS\S-  S\S-  S\S-  S\S-  4U 4S jjjr\              SS\	R                  S-  S	\	R                  S-  S
\\\	R                        S-  S\	R                  S-  S\	R                  S-  S\S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S\S-  S\S-  S\S-  S\S-  S\\	R                     \-  4S jj5       rSrU =r$ )RagModelis  Nr@   rQ   rR   rD   c                   > Uc  Ub  Uc   S5       eUc-  [         R                  " UR                  UR                  40 UD6nO1[        XR                  5      (       d   SU SU R                   35       e[
        TU ]  U5        Uc!  SSKJn  UR                  UR                  5      nUc!  SSKJn  UR                  UR                  5      nX@l        U R                  b9  [        U[        5      (       d   S[        U R                  5       S	35       eX@l        X l
        X0l        SU l        S
U l        U R%                  5         g)  
question_encoder (`PreTrainedModel`, *optional*):
    The model responsible for encoding the question into hidden states for retrieval.
generator (`PreTrainedModel`, *optional*):
    The model responsible for generating text based on retrieved documents.
retriever (`RagRetriever`, *optional*):
    The component responsible for retrieving documents from a knowledge base given the encoded question.
NzQEither a configuration or an question_encoder and a generator has to be provided.zconfig: z has to be of type rJ   rK   rO   z`self.retriever` is of type z&, but should be of type `RagRetriever`F)r   r[   r@   
isinstanceconfig_classsuper__init__rW   rL   from_configrQ   rP   rR   rD   r   typectx_encodercontext_encoder_training	post_init)	selfr@   rQ   rR   rD   r]   rL   rP   	__class__s	           r9   rs   RagModel.__init__u  sB     !&6&ByG\ 	
_	
] >FF '')9)9=CF f&7&788sHVHL_`d`q`q_r:ss8 #6(44V5L5LMB-99&:J:JKI">>%i66 .tDNN/C.DDjk6 'N 0"(-%r8   	input_idsattention_maskencoder_outputsdecoder_input_idsdecoder_attention_maskr   r   r!   r"   	use_cacheoutput_attentionsoutput_hidden_statesoutput_retrievedn_docsrE   c                 d   Ub  UOU R                   R                  nU
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R
                  nU R                  SL=(       a%    USL =(       d    U	SL =(       d    USL =(       a    USL nUGc+  U(       Ga  U R                  XSS9nUS   nU R                  UUR                  5       R                  S[        R                  S9R                  5       [        U R                  R                   SS5      USS	9nU R                  (       a  US
   US   US   US   US   US   4u  nn	nnnnUR                  U5      nU	R                  U5      n	UR                  U5      nUR                  U5      nU R!                  UUSS9R"                  nUR%                  SUUR&                  S   5      n[        R(                  " UR+                  S5      UR-                  SS5      5      R/                  S5      nOUS
   US   US   US   4u  pnnUR                  U5      nUR                  U5      nU	R                  U5      n	[        R(                  " UR+                  S5      UR-                  SS5      5      R/                  S5      nOUc   S5       eU	c   S5       eUc   S5       eUc   S5       eUR&                  S   U-  S:X  d   SU SUR&                  S    S35       eUb  UR1                  USS9nUb  UR1                  USS9nU R                  UU	UUUUU
USS9	nU(       d  SnSnSnSnSnOWR2                  nUR4                  nU(       a  U(       d  SnSn	SnSn[7        S*0 SUR8                  _SU_SUR:                  _S
U_SU	_SW_S W_S!W_S"U_S#U_S$UR<                  _S%UR>                  _S&UR@                  _S'URB                  _S(URD                  _S)URF                  _6$ )+a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. [`RagConfig`], used to initialize the model, specifies
    which generator to use, it also specifies a compatible generator tokenizer. Use that tokenizer class to
    obtain the indices.

    [What are input IDs?](../glossary#input-ids)
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*)
    Tuple consists of (`generator_enc_last_hidden_state`, *optional*: `generator_enc_hidden_states`,
    *optional*: `generator_enc_attentions`). `generator_enc_last_hidden_state` of shape `(batch_size, n_docs *
    sequence_length, hidden_size)` is a sequence of hidden-states at the output of the last layer of the
    generator's encoder.

    Used by the ([`RagModel`]) model during decoding.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Provide for generation tasks. `None` by default, construct as per instructions for the generator model
    you're using with your RAG instance.
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,  target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
    Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
    `question_encoder_last_hidden_state`. If the model has is not initialized with a `retriever` `doc_scores`
    has to be provided to the forward pass. `doc_scores` can be computed via
    `question_encoder_last_hidden_state` and `retrieved_doc_embeds`, see examples for more information.
context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
    Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
    retriever. If the model was not initialized with a `retriever` ``context_input_ids` has to be provided to
    the forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`,*optional*, returned when *output_retrieved=True*):
    Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
    retriever. If the model has is not initialized with a `retriever` `context_attention_mask` has to be
    provided to the forward pass. `context_attention_mask` are returned by [`~RagRetriever.__call__`].
output_retrieved (`bool`, *optional*):
    Whether or not to return the `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
    `context_attention_mask`. See returned tensors for more detail.
n_docs (`int`, *optional*):
    The number of documents to retrieve.

Example:

```python
>>> from transformers import AutoTokenizer, RagRetriever, RagModel
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-base")
>>> retriever = RagRetriever.from_pretrained(
...     "facebook/rag-token-base", index_name="exact", use_dummy_dataset=True
... )
>>> # initialize with RagRetriever to do everything in one forward call
>>> model = RagModel.from_pretrained("facebook/rag-token-base", retriever=retriever)

>>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
>>> outputs = model(input_ids=inputs["input_ids"])
```NT)r}   return_dictr   cpudevicedtypeprefixptr   r   return_tensorsr!   r"   r   tokenized_doc_idstokenized_doc_attention_maskdoc_idsr   rJ   zMake sure that `context_input_ids` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.zMake sure that `context_attention_mask` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.zMake sure that `doc_scores` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.z^Make sure that `doc_scores` are passed when passing `encoder_outputs` to the forward function.M The first dimension of `context_input_ids` should be a multiple of `n_docs`=	, but is .dim)	r|   r}   r~   r   r   r   r   r   r   Nr   r   r   r    r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   )$r@   r   r   r   r   r   rD   rQ   detachtor2   float32numpygetattrrR   rw   rv   pooler_outputviewshapebmm	unsqueeze	transposesqueezerepeat_interleavehidden_states
attentionsr;   r   r   encoder_last_hidden_stateencoder_hidden_statesencoder_attentionsdecoder_hidden_statesdecoder_attentionscross_attentions)ry   r|   r}   r~   r   r   r   r   r!   r"   r   r   r   r   r   r]   has_to_retrievequestion_enc_outputsr#   retriever_outputsr   retrieved_doc_input_idsretrieved_doc_attention_maskr    gen_outputsr$   r%   s                              r9   forwardRagModel.forward  s   T "-4;;3E3E!*!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 0@/K+QUQ\Q\QmQm NN$& ("d*b.D.LbPZ^bPb(4' 	 "'+'<'<$ (= ($ 6J!5L2$(NN6==?BB%W\WdWdBekkm"4>>#8#8(DI!#' %3 %! 00 **=>)*BC)*@A)*=>)*HI))4).,/4) ):(<(<Y(G%-C-F-Fy-Q*.E.H.H.S+3O3R3RS\3]0+/+;+;/@\jn ,< ,#m ) ,@+D+DF$F$L$LQ$O,(
 "':DDQGI]IgIghiklIm"gaj  **=>)*BC)*@A))4	jf%?SUf ,@+B+BCe+f((9(<(<Y(G%-C-F-Fy-Q* "':DDQGI]IgIghiklIm"gaj  )4 P4 .9 T9 "- J-
 % 	
l	
%   #f,2 	
[\b[c d!''*+1.	
2 ( 1 C CFPQ C R!-%;%M%MfZ[%M%\"nn'1+/#9+/ % 

 15.)-&&*##'  $)=)K)K&&:&E&E#&6 '%)"#'  $! 
%%
!
 (77
 0	

 $:
 "6
 0
 0R
 (B
 %<
 -8,Q,Q
 )4(I(I
 &1%C%C
 )4(I(I
 &1%C%C
  (3'C'C!
 	
r8   )rw   rv   rR   rQ   rD   NNNN)NNNNNNNNNNNNNN)r-   r.   r/   r0   r	   r   r   rs   r   r2   r5   Tensorr6   r3   
BoolTensorr   boolintr;   r   r7   __classcell__rz   s   @r9   rm   rm   s  s    +/37,0)-2 4'2 *D02 #T)	2
  $&2 2h  .2.2BF59:>(,/359:>!%)-,0(,!e
##d*e
 t+e
 uU%6%6784?	e

 !++d2e
 !& 0 04 7e
 e
 %%,e
 !++d2e
 !& 0 04 7e
 $;e
  $;e
 #Tke
 +e
 d
e
" 
u||	1	1#e
 e
r8   rm   zu
    A RAG-sequence model implementation. It performs RAG-sequence specific marginalization in the forward pass.
    c            &         ^  \ rS rSr    S(S\S-  S\S-  S\S-  S\S-  4U 4S jjjrS\4S jrS	\4S
 jr	\
                 S)S\R                  S-  S\R                  S-  S\\\R                        S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\R"                  S-  S\S-  S\S-  S\S-  S\S-  S\S-  S\S-  S\R                  S-  S\S-  S\4$S jj5       r\S 5       r\S 5       r\S  5       r\R4                  " 5                S*S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R"                  S-  S!\S-  S"\S-  S#\S-  S\S-  S\R                  4S$ jj5       r S+S% jr\S& 5       rS'rU =r $ ),RagSequenceForGenerationi  Nr@   rQ   rR   rD   c                    > Uc  Ub  Uc   S5       eUc,  [         R                  " UR                  UR                  40 UD6n[        TU ]  U5        [        XX4S9U l        U R                  5         gro   NzHEither a configuration or an encoder and a generator has to be provided.)r@   rQ   rR   rD   r   r[   r@   rr   rs   rm   rA   rx   ry   r@   rQ   rR   rD   r]   rz   s         r9   rs   !RagSequenceForGeneration.__init__  s      !&6&ByG\ 	
V	
] >FF '')9)9=CF 	  6Xawr8   c                 $    XR                   l        g r   rA   rD   ry   rD   s     r9   set_retriever&RagSequenceForGeneration.set_retriever      &r8   rv   c                 F    SU R                   l        XR                   l        g NTrA   rw   rv   ry   rv   s     r9    set_context_encoder_for_training9RagSequenceForGeneration.set_context_encoder_for_training      ,0)*r8   r|   r}   r~   r   r   r   r!   r"   r   r   r   r   r   exclude_bos_scorereduce_losslabelsr   rE   c                 ,   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  Uc  UnSn
U R	                  UUUUUUUU	UU
UUUUS9nSnUb=  U R                  UR                  UR                  UUU R                   R                  UUS9n[        S0 SU_SUR                  _SUR                  _SUR                  _S	UR                  _S
UR                  _SUR                  _SUR                  _SUR                  _SUR                   _SUR"                  _SUR$                  _SUR&                  _SUR(                  _SUR*                  _SUR,                  _SUR.                  _6$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. [`RagConfig`], used to initialize the model, specifies
    which generator to use, it also specifies a compatible generator tokenizer. Use that tokenizer class to
    obtain the indices.

    [What are input IDs?](../glossary#input-ids)
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*)
    Tuple consists of (`generator_enc_last_hidden_state`, *optional*: `generator_enc_hidden_states`,
    *optional*: `generator_enc_attentions`). `generator_enc_last_hidden_state` of shape `(batch_size, n_docs *
    sequence_length, hidden_size)` is a sequence of hidden-states at the output of the last layer of the
    generator's encoder.

    Used by the ([`RagModel`]) model during decoding.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Provide for generation tasks. `None` by default, construct as per instructions for the generator model
    you're using with your RAG instance.
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,  target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
    Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
    retriever. If the model was not initialized with a `retriever` ``context_input_ids` has to be provided to
    the forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`,*optional*, returned when *output_retrieved=True*):
    Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
    retriever. If the model has is not initialized with a `retriever` `context_attention_mask` has to be
    provided to the forward pass. `context_attention_mask` are returned by [`~RagRetriever.__call__`].
doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
    Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
    `question_encoder_last_hidden_state`. If the model has is not initialized with a `retriever` `doc_scores`
    has to be provided to the forward pass. `doc_scores` can be computed via
    `question_encoder_last_hidden_state` and `retrieved_doc_embeds`, see examples for more information.
output_retrieved (`bool`, *optional*):
    Whether or not to return the `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
    `context_attention_mask`. See returned tensors for more detail.
exclude_bos_score (`bool`, *optional*):
    Only relevant if `labels` is passed. If `True`, the score of the BOS token is disregarded when computing
    the loss.
reduce_loss (`bool`, *optional*):
    Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
    operation.
n_docs (`int`, *optional*):
    The number of documents to retrieve.

Example:

```python
>>> from transformers import AutoTokenizer, RagRetriever, RagSequenceForGeneration
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("facebook/rag-sequence-nq")
>>> retriever = RagRetriever.from_pretrained(
...     "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True
... )
>>> # initialize with RagRetriever to do everything in one forward call
>>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

>>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
>>> targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt")
>>> input_ids = inputs["input_ids"]
>>> labels = targets["input_ids"]
>>> outputs = model(input_ids=input_ids, labels=labels)

>>> # or use retriever separately
>>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", use_dummy_dataset=True)
>>> # 1. Encode
>>> question_hidden_states = model.question_encoder(input_ids)[0]
>>> # 2. Retrieve
>>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
>>> doc_scores = torch.bmm(
...     question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)
... ).squeeze(1)
>>> # 3. Forward to generator
>>> outputs = model(
...     context_input_ids=docs_dict["context_input_ids"],
...     context_attention_mask=docs_dict["context_attention_mask"],
...     doc_scores=doc_scores,
...     decoder_input_ids=labels,
... )
```NFr|   r}   r~   r   r   r!   r"   r   r   r   r   r   r   r   )r   epsilonr   r   r   r   r   r   r!   r"   r   r    r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   )r@   r   r   r   rA   get_nllr   r   label_smoothingr   r   r!   r"   r   r    r#   r$   r%   r&   r'   r(   r)   r*   r+   )ry   r|   r}   r~   r   r   r   r!   r"   r   r   r   r   r   r   r   r   r   r]   outputsr   s                        r9   r    RagSequenceForGeneration.forward  s   N "-4;;3E3E1B1N-TXT_T_TqTq%0%<k$++BYBY ($*!I(()+/#9/#9!+/!5-  
" <<""!'33"3   D ( 

>>
 ))
 $33	

 &77
 $+#A#A
 ")!=!=
 &77
 07/Y/Y
 (/'I'I
 %,$C$C
 -4,S,S
 )0(K(K
 &-%E%E
 )0(K(K
  &-%E%E!
" (/'I'I#
 	
r8   c                 .    U R                   R                  $ r   r   ry   s    r9   rD   "RagSequenceForGeneration.retriever_      xx!!!r8   c                 .    U R                   R                  $ r   rA   rR   r   s    r9   rR   "RagSequenceForGeneration.generatorc  r   r8   c                 .    U R                   R                  $ r   rA   rQ   r   s    r9   rQ   )RagSequenceForGeneration.question_encoderg      xx(((r8   do_deduplicationnum_return_sequences	num_beamsc
                    U	b  U	OU R                   R                  n	Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc
  Uc   S5       eU R
                  b  Uc  U R                  XS9S   nU R                  UUR                  5       R                  S[        R                  S9R                  5       [        U R                  R                   SS5      U	SS	9S
   nUR                  U5      n/ nXS'   XS'   SU
S'   Ub  UR                  S   OUR                  S   U	-  n[        U5       GHW  nX?U	-  US-   U	-   nU R                  R                   " U40 U
D6nU(       aV  [        R"                  " [%        U Vs0 s H  n['        UR)                  5       5      U_M     snR+                  5       5      5      nUR                  S   nUb   XUS-    R-                  US5      nU " UUSS9nOnUc   S5       eUc   S5       eUR-                  US5      nXOU	-  US-   U	-   nUR-                  US5      nX_US-   2SS24   nUR-                  US5      nU " UUUUSS9nUS   * R/                  U5      S   nUR1                  UU   5        GMZ     U R3                  XR                   R                  R4                  S9$ s  snf )a  
Implements RAG sequence "thorough" decoding. Read the [`~generation.GenerationMixin.generate`]` documentation
for more information on how to set other generate input parameters.

Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        The sequence used as a prompt for the generation. If `input_ids` is not passed, then
        `context_input_ids` has to be provided.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Input IDs post-processed from the retrieved documents and the question encoder input_ids by the
        retriever.
    context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
        retriever.

        If the model is not initialized with a `retriever` or `input_ids` is not given, `context_input_ids` and
        `context_attention_mask` have to be provided to the forward pass. They are returned by
        [`~RagRetriever.__call__`].
    doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
        Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
        `question_encoder_last_hidden_state`.

        If the model is not initialized with a `retriever` or `input_ids` is not given, `doc_scores` has to be
        provided to the forward pass. `doc_scores` are returned by [`~RagRetriever.__call__`].
    do_deduplication (`bool`, *optional*):
        Whether or not to deduplicate the generations from different context documents for a given input. Has
        to be set to `False` if used while training with distributed backend.
    num_return_sequences(`int`, *optional*, defaults to 1):
        The number of independently computed returned sequences for each element in the batch. Note that this
        is not the value we pass to the `generator`'s `[`~generation.GenerationMixin.generate`]` function,
        where we set `num_return_sequences` to `num_beams`.
    num_beams (`int`, *optional*, defaults to 1):
        Number of beams for beam search. 1 means no beam search.
    n_docs (`int`, *optional*, defaults to `config.n_docs`)
        Number of documents to retrieve and/or number of documents for which to generate an answer.
    kwargs (`dict[str, Any]`, *optional*):
        Additional kwargs will be passed to [`~generation.GenerationMixin.generate`].

Return:
    `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
    sequences. The second dimension (sequence length) is either equal to `max_length` or shorter if all batches
    finished early due to the `eos_token_id`.
Nz= At least one of input_ids or context_input_ids must be givenr}   r   r   r   r   r   r   r!   r   r   r}   r   T)r   r   zMake sure that `context_attention_mask` are passed, if no `input_ids` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.zMake sure that `doc_scores` are passed, if no `input_ids` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.)r!   r"   r   r   r   r   )pad_token_id)r@   r   r   r   r   rD   rQ   r   r   r2   r   r   r   rR   r   rangegeneratestacklistrk   tolistvaluesrepeattopkappend_cat_and_padr   )ry   r|   r}   r!   r"   r   r   r   r   r   model_kwargsnum_doc_return_sequencesquestion_hidden_stateshypos
batch_sizeindexgenerator_input_idsoutput_sequencesknum_candidatesnew_input_idsr   individual_input_idsindividual_attention_maskindividual_doc_scorestop_cand_indss                             r9   r   !RagSequenceForGeneration.generatek  si   B "-4;;3E3E/?/K+QUQ\Q\QmQm$8$D $++JjJj 	! "+!6IDKK<Q<Q	$(9(E 	
K	
E >>%*;*C%)%:%:9%:%def%g" $&--/22%u}}2U[[]t~~44hE# !/ ! "!# !2 4 4Y ?$-[!/8+,)-%&+4+@Y__Q'FWF]F]^_F`djFj
:&E"3FNeaiSYEY"Z#~~66#    #(;;tQa4bQaAS_a5GQa4b4i4i4k/l#m -33N
 $ )%!) < C CNTU V}5EY]^-9 T9 "- J-
 (;'A'A"A($ -C6>UZ]^U^bhTh,i),E,L,L^]^,_)(2EAI3F3I(J%(=(D(D^UV(W%&:+D4+&* &fo-334LMaPM LL)-89g 'j   [[5J5J5W5W XXW 5cs   &$K!c                 2  ^ ^ [         R                  " TS S 2SS 24   TR                  TR                  S   S5      R	                  T R
                  R                  R                  5      /S5      mUb  UOT R
                  R                  nT R
                  R                  =(       d     T R
                  R                  R                  nUS L=(       a&    TS S 2S4   R                  U5      R                  5       n	U U4S jn
[        R                  R                  USS9R                  UR                  S   U-  USUR!                  S5      5      n[        R                  R                  USS9R#                  S5      R#                  S5      nUS S 2S S 2S S2S S 24   nUS S 2S S 2SS2S S 24   nUS S 2S S 2SS 2S S 24   n[         R                  " XU-   U/SS9nTR#                  S5      R#                  S5      R%                  SUSS5      mTR'                  5       UR'                  5       :X  d   eUR)                  STS9nUR+                  SSS	9nU
" UU5      u  nnU(       a$  U	(       a  US S 2S S 2SS 24   R+                  S5      OUR+                  S5      nUR+                  S5      nUR-                  S5      nUR-                  S5      nU* nU* nU(       a   UR+                  5       nUR+                  5       nUUR!                  S5      -  nS
U-
  U-  UU-  -   nU$ )Nr   r   c                   > TR                  TR                  R                  R                  5      nUR	                  5       (       a$  U R                  US5        UR                  US5        U R                  S5      UR                  S5      4$ N        r   eqr@   rR   r   anymasked_fill_r   ll
smooth_objpad_maskry   targets      r9   
_mask_pads4RagSequenceForGeneration.get_nll.<locals>._mask_pads  h    yy!6!6!C!CDH||~~#.''#6::b>:#5#5b#999r8   r   r   rJ   r   r   Tr   keepdim      ?)r2   catnewr   fill_r@   rR   r   r   bos_token_idr  allr   
functionallog_softmaxr   sizer   r   r   gathersum	logsumexp)ry   
seq_logitsr   r  r   r   r   r   r  use_bosr  seq_logprobsdoc_logprobsfirst_token_scoressecond_token_scores	remainderrag_logprobsr  r  nll_losssmooth_losseps_ir   s   `  `                   r9   r    RagSequenceForGeneration.get_nll  s    AqrE]FJJv||A:@@AVAVAcAcdegh
 "-4;;3E3E {{//U4;;3H3H3U3Ud*Rvad||/L/P/P/R	: }}000DIIQ6)62zr7J
 }}000CMMbQ[[\^_ *!QA+6*1a1a<8 Aqr1-	yy"4L6XZc!djkl !!!$..r299!VQJzz||//1111  Rv 6!%%"d%;
#B
3J %6'R1ab\a rvvay^^A&
\\!_))!,
3!k||~H%//+K,++B//g)EK,??r8   c                    U S   R                  [        S U  5       5      [        S U  5       5      5      R                  U5      nSnU  H:  nXBX3UR                  S   -   2S UR                  S   24'   X4R                  S   -  nM<     U$ )Nr   c              3   >   #    U  H  oR                   S    v   M     g7f)r   Nr   .0ts     r9   	<genexpr>8RagSequenceForGeneration._cat_and_pad.<locals>.<genexpr>?  s     #@1GGAJ   c              3   >   #    U  H  oR                   S    v   M     g7f)r   Nr4  r5  s     r9   r8  r9  ?  s     EbZaUVggajZar:  r   )r  r$  maxr  r   )tensorsr   outputindr7  s        r9   r   %RagSequenceForGeneration._cat_and_pad=  s    #@#@ @#EbZaEbBbciijvwA;<3qwwqz))<QWWQZ<78771:C  r8   rA   r   NNNNNNNNNNNNNNNNN)	NNNNNNNNN)Fr
  FN)!r-   r.   r/   r0   r	   r   r   rs   r   r   r   r2   r5   r   r6   r   r   r3   r   r   r   r   propertyrD   rR   rQ   no_gradr   r   staticmethodr   r7   r   r   s   @r9   r   r     sb    +/37,0)- 4' *D0 #T)	
  $& >'| '+O +  .2.2=A59:>(,59:>/3!%)-,0(,)-#'*.!%^
##d*^
 t+^
 uU\\23d:	^

 !++d2^
 !& 0 04 7^
 ^
 !++d2^
 !& 0 04 7^
 %%,^
 $;^
  $;^
 #Tk^
 +^
  $;^
  D[!^
"   4'#^
$ d
%^
( 
")^
 ^
@ " " " " ) ) ]]_ .22659:>/3(,+/ $!TY##d*TY ((4/TY !++d2	TY
 !& 0 04 7TY %%,TY +TY "DjTY :TY d
TY 
		TY TYn os9v  r8   r   zo
    A RAG-token model implementation. It performs RAG-token specific marginalization in the forward pass.
    c            &         ^  \ rS rSr    S0S\S-  S\S-  S\S-  S\S-  4U 4S jjjrS\4S jrS	\4S
 jr	      S1S jr
\S 5       r\S 5       r\S 5       r\S 5       rS2S jr\                 S3S\R(                  S-  S\R*                  S-  S\\\R.                        S-  S\R(                  S-  S\R0                  S-  S\S-  S\R(                  S-  S\R(                  S-  S\R*                  S-  S\S-  S\S-  S\S-  S\S-  S\S-  S\S-  S \R(                  S-  S!\S-  S"\4$S# jj5       r\R<                  " 5       SSSSSSSS\" 5       \ " 5       4
S\R(                  S-  S\R(                  S-  S\R(                  S-  S\R(                  S-  S\R*                  S-  S!\S-  S$\!S-  S%\"\\R.                  /\#\   4   S-  S&\S-  S'\ S-  S"\R(                  4S( jj5       r$S) r%S* r&S+ r'S, r(S2S- jr)S4S. jr*S/r+U =r,$ )5RagTokenForGenerationiG  Nr@   rQ   rR   rD   c                    > Uc  Ub  Uc   S5       eUc,  [         R                  " UR                  UR                  40 UD6n[        TU ]  U5        [        XX4S9U l        U R                  5         gr   r   r   s         r9   rs   RagTokenForGeneration.__init__M  s      !&6&ByG\ 	
V	
] >FF '')9)9=CF 	  6Xawr8   c                 $    XR                   l        g r   r   r   s     r9   r   #RagTokenForGeneration.set_retrieverm  r   r8   rv   c                 F    SU R                   l        XR                   l        g r   r   r   s     r9   r   6RagTokenForGeneration.set_context_encoder_for_trainingp  r   r8   c           
      6    Ub  US S 2SS 24   nS UUUUUUSUS.	$ )Nr   T)	r|   r~   r   r"   r   r   r   do_marginalizer   r,   )	ry   r   r   r}   r   r~   r   r   r]   s	            r9   prepare_inputs_for_generation3RagTokenForGeneration.prepare_inputs_for_generationt  sB     & 1!RS& 9 .$&4!2.""

 
	
r8   c                 .    U R                   R                  $ r   r   r   s    r9   rD   RagTokenForGeneration.retriever  r   r8   c                 .    U R                   R                  $ r   r   r   s    r9   rR   RagTokenForGeneration.generator  r   r8   c                 .    U R                   R                  $ r   r   r   s    r9   rQ   &RagTokenForGeneration.question_encoder  r   r8   c                 Z  ^^	 S m	Sn[        [        U 5      5       H  n[        U [        5      (       a  U	U4S jU R                  R
                  U   R                  U R                  R
                  U   R                  U R                  R
                  U   R                  U R                  R
                  U   R                  4 5       u  pEpgXEXg4nOBU	U4S jU R
                  U   R                  U R
                  U   R                  4 5       u  pEXE4nX(4-  nM     [        U 5      " U5      $ )zeReorders cache for generation. BART-inspired but we need to take care of the extra dimension for docsc                     U R                   S   UR                   S   -  nU R                  " SU/U R                   SS  Q76 n U R                  SU5      n U R                  " S/U R                   SS  Q76 nU$ )Nr   r   r   rJ   )r   r   index_select)r   	new_orderr   results       r9   _reorder_stacked>RagTokenForGeneration._reorder_cache.<locals>._reorder_stacked  s    "((+yq/AAF)..r6TM<O<OPQPR<STM)66q)DM"''E]-@-@-DEFMr8   r,   c              3   h   >#    U  H'  nT" UTR                  UR                  5      5      v   M)     g 7fr   r   r   r6  xr]  beam_idxs     r9   r8  7RagTokenForGeneration._reorder_cache.<locals>.<genexpr>  s4      \ %QAHH(=>>   /2c              3   h   >#    U  H'  nT" UTR                  UR                  5      5      v   M)     g 7fr   r`  ra  s     r9   r8  rd    s/      6c %QAHH(=>>cre  )
r   rU   rp   r   self_attention_cachelayerskeysr   cross_attention_cacheru   )
r   rc  reordered_pastidxself_attention_kself_attention_vcross_attention_kcross_attention_v	new_tupler]  s
    `       @r9   _reorder_cache$RagTokenForGeneration._reorder_cache  s#   	 _-.C/+>??\ (<<CCCHMM'<<CCCHOO'==DDSINN'==DDSIPP	\X 4E .ARf	6-44S9>>@V@VWZ@[@b@bc62  .@	l*N% /& O$^44r8   c                 n   Ub  UOU R                   R                  n[        R                  R	                  USS9R                  UR                  S   U-  USUR                  S5      5      n[        R                  " USS9nXER                  S5      R                  S5      -   n[        R                  " USS9$ )Nr   r   r   r   )r@   r   r   r   r!  r   r   r"  r2   r   r%  )ry   r&  r   r   r(  r)  log_prob_sums          r9   marginalize!RagTokenForGeneration.marginalize  s    !-4;;3E3E }}000DIIQ6)62zr7J
 ((;#&<&<R&@&J&J2&NN|33r8   r|   r}   r~   r   r   r   r!   r"   r   r   r   r   r   rO  r   r   r   rE   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  Uc  UnSn
U R	                  UUUUUUUU	UU
UUUUS9nSnUR
                  nUbA  Uc   eU R                  UR
                  UR                  UUU R                   R                  US9nU(       a  U R                  UUR                  U5      n[        S0 SU_SU_SUR                  _SUR                  _S	UR                  _S
UR                  _SUR                  _SUR                  _SUR                   _SUR"                  _SUR$                  _SUR&                  _SUR(                  _SUR*                  _SUR,                  _SUR.                  _SUR0                  _6$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. [`RagConfig`], used to initialize the model, specifies
    which generator to use, it also specifies a compatible generator tokenizer. Use that tokenizer class to
    obtain the indices.

    [What are input IDs?](../glossary#input-ids)
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*)
    Tuple consists of (`generator_enc_last_hidden_state`, *optional*: `generator_enc_hidden_states`,
    *optional*: `generator_enc_attentions`). `generator_enc_last_hidden_state` of shape `(batch_size, n_docs *
    sequence_length, hidden_size)` is a sequence of hidden-states at the output of the last layer of the
    generator's encoder.

    Used by the ([`RagModel`]) model during decoding.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Provide for generation tasks. `None` by default, construct as per instructions for the generator model
    you're using with your RAG instance.
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,  target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
    Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
    retriever. If the model was not initialized with a `retriever` ``context_input_ids` has to be provided to
    the forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`,*optional*, returned when *output_retrieved=True*):
    Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
    retriever. If the model has is not initialized with a `retriever` `context_attention_mask` has to be
    provided to the forward pass. `context_attention_mask` are returned by [`~RagRetriever.__call__`].
doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
    Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
    `question_encoder_last_hidden_state`. If the model has is not initialized with a `retriever` `doc_scores`
    has to be provided to the forward pass. `doc_scores` can be computed via
    `question_encoder_last_hidden_state` and `retrieved_doc_embeds`, see examples for more information.
output_retrieved (`bool`, *optional*):
    Whether or not to return the `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
    `context_attention_mask`. See returned tensors for more detail.
do_marginalize (`bool`, *optional*):
    If `True`, the logits are marginalized over all documents by making use of
    `torch.nn.functional.log_softmax`.
reduce_loss (`bool`, *optional*):
    Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
    operation.
n_docs (`int`, *optional*):
    The number of documents to retrieve.

Example:

```python
>>> from transformers import AutoTokenizer, RagRetriever, RagTokenForGeneration
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-nq")
>>> retriever = RagRetriever.from_pretrained(
...     "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True
... )
>>> # initialize with RagRetriever to do everything in one forward call
>>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

>>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
>>> targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt")
>>> input_ids = inputs["input_ids"]
>>> labels = targets["input_ids"]
>>> outputs = model(input_ids=input_ids, labels=labels)

>>> # or use retriever separately
>>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", use_dummy_dataset=True)
>>> # 1. Encode
>>> question_hidden_states = model.question_encoder(input_ids)[0]
>>> # 2. Retrieve
>>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
>>> doc_scores = torch.bmm(
...     question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)
... ).squeeze(1)
>>> # 3. Forward to generator
>>> outputs = model(
...     context_input_ids=docs_dict["context_input_ids"],
...     context_attention_mask=docs_dict["context_attention_mask"],
...     doc_scores=doc_scores,
...     decoder_input_ids=labels,
... )

>>> # or directly generate
>>> generated = model.generate(
...     context_input_ids=docs_dict["context_input_ids"],
...     context_attention_mask=docs_dict["context_attention_mask"],
...     doc_scores=doc_scores,
... )
>>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
```NFr   )r   r   r   r   r   r   r   r!   r"   r   r    r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   )r@   r   rO  r   rA   r   r   r   r   rv  r   r   r!   r"   r   r    r#   r$   r%   r&   r'   r(   r)   r*   r+   )ry   r|   r}   r~   r   r   r   r!   r"   r   r   r   r   r   rO  r   r   r   r]   r   r   r   s                         r9   r   RagTokenForGeneration.forward  s"   ^ "-4;;3E3E+9+E4;;KeKe%0%<k$++BYBY ($*!I(()+/#9/#9!+/!5-  
" $000<<""'33   D %%fg.@.@&IF' 


 ))
 $33	

 &77
 $+#A#A
 ")!=!=
 &77
 07/Y/Y
 (/'I'I
 %,$C$C
 -4,S,S
 )0(K(K
 &-%E%E
 )0(K(K
  &-%E%E!
" (/'I'I#
 	
r8   generation_configprefix_allowed_tokens_fnlogits_processorstopping_criteriac           	        ^^ U R                  SUSSS5      nU R                  " U40 UD6u  p}UR                  5       nU[        R                  [        R
                  [        R                  [        R                  4;  a  [        SU S35      e[        [        U 5      [        U   5      nU R                  UR                  5       5        U R                  XU5        UR                  SS5      SLnU R!                  UU5        Tb  TOU R"                  R$                  mU R&                  Gb  UGc  U R)                  XS9S   nU R'                  UUR+                  5       R-                  S[.        R0                  S	9R3                  5       [        U R4                  R"                  S
S5      TSS9nUS   US   US   npCUR-                  U5      nUR-                  U5      nUR-                  U5      n[.        R6                  " UR9                  S5      UR;                  SS5      5      R=                  S5      nUR>                  S   T-  S:X  d   ST SUR>                  S    S35       eUR>                  S   T-  mU R@                  R4                  RC                  5       nU" X4SS9n[.        RD                  " TURF                  -  S4URH                  [.        RJ                  [M        U RO                  5       5      RP                  S9nUR>                  S   nUS   nS%UU4S jjnU" XGRF                  S9nU" UURF                  S9US'   URS                  URF                  SS9nX]S'   UUS'   XMS'   TUS'   URT                  US '   U RW                  UUUUU	URP                  S!9nU RY                  XzS"9nU R[                  UUSUR>                  S   UR\                  S-
  S#9  U" U U4UUUS$.UDUD6$ )&a  
Implements RAG token decoding.

Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        The sequence used as a prompt for the generation. If `input_ids` is not passed, then
        `context_input_ids` has to be provided.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
        retriever.

        If the model has is not initialized with a `retriever`, `context_input_ids` has to be provided to the
        forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
    context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
        retriever.

        If the model has is not initialized with a `retriever`, `context_input_ids` has to be provided to the
        forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
    doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
        Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
        `question_encoder_last_hidden_state`.

        If the model has is not initialized with a `retriever`, `context_input_ids` has to be provided to the
        forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
    n_docs (`int`, *optional*, defaults to `config.n_docs`)
        Number of documents to retrieve and/or number of documents for which to generate an answer.
    generation_config (`~generation.GenerationConfig`, *optional*):
        The generation configuration to be used as base parametrization for the generation call. `**kwargs`
        passed to generate matching the attributes of `generation_config` will override them. If
        `generation_config` is not provided, the default will be used, which has the following loading
        priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
        configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
        default values, whose documentation should be checked to parameterize generation.
    prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*):
        If provided, this function constraints the beam search to allowed tokens only at each step. If not
        provided no constraint is applied. This function takes 2 arguments `inputs_ids` and the batch ID
        `batch_id`. It has to return a list with the allowed tokens for the next generation step conditioned on
        the previously generated tokens `inputs_ids` and the batch ID `batch_id`. This argument is useful for
        constrained generation conditioned on the prefix, as described in [Autoregressive Entity
        Retrieval](https://huggingface.co/papers/2010.00904).
    logits_processor (`LogitsProcessorList`, *optional*):
        Custom logits processors that complement the default logits processors built from arguments and a
        model's config. If a logit processor is passed that is already created with the arguments or a model's
        config an error is thrown.
    stopping_criteria (`StoppingCriteriaList`, *optional*):
        Custom stopping criteria that complement the default stopping criteria built from arguments and a
        model's config. If a stopping criteria is passed that is already created with the arguments or a
        model's config an error is thrown.
    kwargs (`dict[str, Any]`, *optional*):
        Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
        forwarded to the `forward` function of the model.

Return:
    `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
    sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches
    finished early due to the `eos_token_id`.
NFz!RAG model is not compatible with z5 generation. Please check your generation parameters.r}   r   r   r   r   r   r   r   r!   r"   r   r   rJ   r   r   r   T)r|   r}   r   )r   r   r   last_hidden_statec                    > U S S S S 24   R                  TST4U R                  SS  -   5      n U R                  TUT4U R                  SS  -   5      n U R                  TU-  T-  4U R                  SS  -   5      $ )Nr   r   )reshaper   expand)tensorr   r   r   s     r9   extend_enc_output9RagTokenForGeneration.generate.<locals>.extend_enc_output  s    D$M*22J63JV\\Z[Z\M]3]^F]]J	6#BV\\RSRTEU#UVF>>:	#9F#B"Dv||TUTVGW"WXXr8   )r   r   r   r~   r   r   )rz  input_ids_seq_lengthencoder_input_idsr{  r|  r   )rz  r}  )generation_moder   max_cache_length)r|  r}  rz  r   )/_extract_generation_mode_kwargs_prepare_generation_configget_generation_moder   SAMPLEGREEDY_SEARCHBEAM_SEARCHBEAM_SAMPLE
ValueErrorr   ru   r   _validate_model_kwargscopy_validate_generation_moderZ   _prepare_special_tokensr@   r   rD   rQ   r   r   r2   r   r   rR   r   r   r   r   r   rA   get_encoderfullr   decoder_start_token_idlongnext
parametersr   r   r   _get_logits_processor_get_stopping_criteria_prepare_cache_for_generation
max_length)ry   r|   r}   r!   r"   r   r   rz  r{  r|  r}  r]   generation_mode_kwargsr   r  decoding_methodkwargs_has_attention_maskr   outr   encoderr~   r  r  r  prepared_logits_processorprepared_stopping_criteriar   s         `                    @r9   r   RagTokenForGeneration.generatev  sy   b "&!E!EdFTY[_ae!f*.*I*IJ[*f_e*f'+??A!!((&&&&	#
 
 3O3DDyz  "$t*.F.WX##L$5$5$78&&Kab$0$4$45Et$LTX$X!$$%68QR "-4;;3E3E >>%*;*C%)%:%:9%:%def%g"..&--/22%u}}2U[[]t~~44hE# ! C '(,-*+ 8L5 $8#:#:;Q#R  1 4 4Y ?%;%>%>y%I" #9#C#CA#FH\HfHfghjkHlmuuJ "''*V39 	
[\b[c d!''*+1.	
9 ',,Q/69
(($$002!,=rvwJJ+555q944**)*11	
	  )r2+,?@	Y 	Y "33IUpUp!q/@):)D)D0
+,  112C2M2MST1U
 &0\"*9&')?%&!'X$5$?$?[!$($>$>/!5/%=-## %? %
! &*%@%@/ &A &
" 	**  q).99A= 	+ 	
 
 78/
 %
 
 	
r8   c                 (    U R                  X5      nU$ r   )rr  )ry   r   rc  s      r9   _temporary_reorder_cache.RagTokenForGeneration._temporary_reorder_cacheC  s     --oHr8   c                 J    U R                   R                  R                  5       $ r   )rA   rR   get_input_embeddingsr   s    r9   r  *RagTokenForGeneration.get_input_embeddingsJ  s    xx!!6688r8   c                 J    U R                   R                  R                  5       $ r   )rA   rR   get_output_embeddingsr   s    r9   r  +RagTokenForGeneration.get_output_embeddingsM  s    xx!!7799r8   c                 L    U R                   R                  R                  U5      $ r   )rA   rR   set_output_embeddings)ry   new_embeddingss     r9   r  +RagTokenForGeneration.set_output_embeddingsP  s    xx!!77GGr8   c                     Uc  U R                   R                  nUR                  UR                  5      nUSS2SS24   R	                  5       USS2SS24'   X#SS2S4'   U$ )zCShift input ids one token to the right, and pad with start_token_idNr   r   r   )r@   r  	new_zerosr   clone)ry   r|   start_token_idshifted_input_idss       r9   shift_tokens_right(RagTokenForGeneration.shift_tokens_rightS  se    !![[??N%//	@#,QV#4#:#:#<!QR% "0!Q$  r8   c                   ^ ^ Ub  UOT R                   R                  n[        R                  " TS S 2SS 24   TR	                  TR
                  S   S5      R                  T R                   R                  R                  5      /S5      mU U4S jnT R                  XU5      nTR                  S5      mTR                  5       UR                  5       :X  d   eUR                  STS9n	UR                  SSS9n
U" X5      u  pU	R                  S5      n	U
R                  S5      n
U	* nU
* nU(       a   UR                  5       nUR                  5       nXXR                  S5      -  nSU-
  U-  X-  -   nU$ )	Nr   r   c                   > TR                  TR                  R                  R                  5      nUR	                  5       (       a$  U R                  US5        UR                  US5        U R                  S5      UR                  S5      4$ r	  r  r  s      r9   r  1RagTokenForGeneration.get_nll.<locals>._mask_padsc  r  r8   r   r  Tr  r  )r@   r   r2   r  r  r   r  rR   r   rv  r   r   r#  r$  r"  )ry   r&  r   r  r   r   r   r  r-  r  r  r.  r/  r0  r   s   `  `           r9   r   RagTokenForGeneration.get_nll\  sW   !-4;;3E3EAqrE]FJJv||A:@@AVAVAcAcdegh
	: ''
G!!"%zz||//1111  Rv 6!%%"d%;
#B3VVAY^^A&
3!k||~H%//+K++B//g)E,??r8   rA  r   )NNNNNNr   rB  )Fr
  N)-r-   r.   r/   r0   r	   r   r   rs   r   r   rP  rC  rD   rR   rQ   rE  rr  rv  r   r2   r5   r3   r6   r   r   r   r   r   r   r   rD  r   r   r
   r   r   r   r  r  r  r  r  r   r7   r   r   s   @r9   rG  rG  G  s    +/37,0)- 4' *D0 #T)	
  $& @'| '+O + 
: " " " " ) ) 5 5@	4  .237=A59:>(,59:>/3!%)-,0(,&*#'*.!%j
##d*j
 ))D0j
 uU\\23d:	j

 !++d2j
 !& 0 04 7j
 j
 !++d2j
 !& 0 04 7j
 %%,j
 $;j
  $;j
 #Tkj
 +j
 tj
  D[!j
"   4'#j
$ d
%j
( 
")j
 j
X ]]_ .22659:>/3!59TX7J7L9M9OI
##d*I
 ((4/I
 !++d2	I

 !& 0 04 7I
 %%,I
 d
I
 ,d2I
 #+C+>S	+I"JT"QI
 .4I
 0$6I
 
		I
 I
X9:H!" "r8   rG  )rm   r>   r   rG  ))r1   collections.abcr   dataclassesr   r2   r   cache_utilsr   r   configuration_utilsr	   
generationr
   r   r   r   r   generation.utilsr   modeling_outputsr   modeling_utilsr   utilsr   r   configuration_ragr   retrieval_ragr   
get_loggerr-   loggerr   r;   r>   rm   r   rG  __all__r,   r8   r9   <module>r     sZ     $ !   5 3 v v 8 + - , ( ' 
		H	% 
 WL{ WL WLt TL TL  TLn  Io Io IoX [
! [
 [
| 
m1 m
m` 
r. r
rj br8   