
    Z ji                        S r SSKrSSKrSSKJr  SSKJrJrJr  SSK	J
r  SSKJrJr  SSKJr  SS	KJrJrJr  SS
KJr  SSKJrJr  SSKJr  \R8                  " \5      rS rS r S"S jr! " S S\RD                  5      r#S r$ " S S\RD                  5      r%\ " S S\5      5       r&\ " S S\&5      5       r'\" SS9 " S S\&\5      5       r(\" SS9 " S S \&5      5       r)/ S!Qr*g)#zPyTorch CTRL model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)CacheDynamicCache)GenerationMixin)BaseModelOutputWithPastCausalLMOutputWithPastSequenceClassifierOutput)PreTrainedModel)auto_docstringlogging   )
CTRLConfigc                 P    S[         R                  " SSUS-  -  U-  5      -  nX-  $ )Nr   i'     )torchpow)posid_model_sizeangle_ratess       w/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/ctrl/modeling_ctrl.py
angle_defnr   %   s-    eiiQ!V'DEEK    c                    [        [        R                  " U [        R                  S9R	                  U5      R                  S5      [        R                  " U[        R                  S9R	                  U5      R                  S5      U5      n[        R                  " US S 2SS S24   5      n[        R                  " US S 2SS S24   5      n[        R                  " XE/SS9nU$ )Ndtyper   r   r   dim)	r   r   arangeint64to	unsqueezesincoscat)positionr   r!   
angle_radssinescosinespos_encodings          r   positional_encodingr1   *   s    XU[[144U;EEaH\588?II!LJ IIjADqD)*Eii
1add7+,G99e-26Lr   c           	      x   [         R                  " XR                  SSSS5      5      nUR                  S   nU[        R
                  " U5      -  nUb3  UR                  S5      UR                  S5      pXsX-
  U	2S U	24   S-  -  nUb  Xt-   n[         R                  " USS9n
[         R                  " X5      nX4$ )	Nr   r   r   r   r"   g     r#   )r   matmulpermuteshapenpsqrtsizesoftmax)qkvmaskattention_mask	matmul_qkdkscaled_attention_logitsndnsattention_weightsoutputs               r   scaled_dot_product_attentionrG   9   s    Q		!Q1 56I	
B'"''"+5(--b13J3O3OPR3SB"crc(9#:T#AA!"9"J&=2F\\+/F$$r   c                   D   ^  \ rS rSrSU 4S jjrS r    SS jrSrU =r$ )MultiHeadAttentionO   c                 V  > [         TU ]  5         X l        Xl        X0l        [        XR                  -  5      U l        [        R                  " X5      U l	        [        R                  " X5      U l
        [        R                  " X5      U l        [        R                  " X5      U l        g N)super__init__	num_headsr   	layer_idxintdepthr   LinearWqWkWvdense)selfr   rO   rP   	__class__s       r   rN   MultiHeadAttention.__init__P   sq    "("67
))L7))L7))L7YY|:
r   c                 x    UR                  USU R                  U R                  5      nUR                  / SQ5      $ )Nr"   r   r   r   r   )reshaperO   rR   r5   )rX   x
batch_sizes      r   split_into_heads#MultiHeadAttention.split_into_heads^   s-    IIj"dnndjjAyy&&r   c	                    UR                   S   n
U R                  U5      nU R                  U5      nU R                  U5      nU R	                  X:5      nU R	                  X*5      nU R	                  X5      nUb  UR                  X!U R                  5      u  p![        X2XU5      nUS   R                  / SQ5      nUS   nUR                  U
SU R                  5      nU R                  U5      nX4$ )Nr   r\   r   r"   )r6   rT   rU   rV   r`   updaterP   rG   r5   r]   r   rW   )rX   r=   r<   r;   r>   
layer_pastr?   	use_cacheoutput_attentionskwargsr_   rF   scaled_attentionattnoriginal_size_attentions                  r   forwardMultiHeadAttention.forwardb   s     WWQZ
GGAJGGAJGGAJ!!!0!!!0!!!0!$$Q4>>:DA-aA^L!!9,,\:ay"2":"::r4K\K\"]34|r   )rU   rT   rV   r   rW   rR   rP   rO   rL   NNFF)	__name__
__module____qualname____firstlineno__rN   r`   rk   __static_attributes____classcell__rY   s   @r   rI   rI   O   s%    ;'  r   rI   c                     [         R                  " [         R                  " X5      [         R                  " 5       [         R                  " X5      5      $ rL   )r   
SequentialrS   ReLU)r   dffs     r   point_wise_feed_forward_networkry      s-    ==<5rwwy"))CB^__r   c                   >   ^  \ rS rSrSU 4S jjr    SS jrSrU =r$ )EncoderLayer   c                 4  > [         TU ]  5         [        XUS9U l        [	        X5      U l        [        R                  " USS9U l        [        R                  " USS9U l	        [        R                  " U5      U l        [        R                  " U5      U l        g )NrP   gư>eps)rM   rN   rI   multi_head_attentionry   ffnr   	LayerNorm
layernorm1
layernorm2Dropoutdropout1dropout2)rX   r   rO   rx   raterP   rY   s         r   rN   EncoderLayer.__init__   sn    $6|Zc$d!2<E,,|>,,|>

4(

4(r   c                    U R                  U5      nU R                  UUUUUUUUS9n	U	S   n
U R                  U
5      n
X-   nU R                  U5      nU R	                  U5      nU R                  U5      nX-   nU4U	SS  -   nU$ )Nrd   r?   re   rf   r   r   )r   r   r   r   r   r   )rX   r^   r>   rd   r?   re   rf   rg   normedattn_outputsattn_outputout1out2
ffn_outputoutputss                  r   rk   EncoderLayer.forward   s     #00!)/ 1 	
 #1ommK0t$XXd^
]]:.
 'L,,r   )r   r   r   r   r   r   )g?Nrm   )rn   ro   rp   rq   rN   rk   rr   rs   rt   s   @r   r{   r{      s     
)   r   r{   c                   8   ^  \ rS rSr% \\S'   SrU 4S jrSrU =r	$ )CTRLPreTrainedModel   configtransformerc                    > [         TU ]  U5        [        U[        5      (       aY  [        R
                  " UR                  [        UR                  R                  UR                  [        R                  5      5        g g rL   )rM   _init_weights
isinstance	CTRLModelinitcopy_r0   r1   r   n_positionsr   r   float)rX   modulerY   s     r   r   !CTRLPreTrainedModel._init_weights   s[    f%fi((JJ##%89R9RTZTgTginitit%u )r    )
rn   ro   rp   rq   r   __annotations__base_model_prefixr   rr   rs   rt   s   @r   r   r      s    % r   r   c                   L  ^  \ rS rSrU 4S jrS rS r\          SS\R                  S-  S\
S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\S-  S\S-  S\S-  S\S-  S\\R                     \-  4S jj5       rSrU =r$ )r      c                   > [         TU ]  U5        UR                  U l        UR                  U l        [        R                  " UR                  UR                  5      U l	        [        R                  " UR                  5      U l        [        R                  " [        UR                  5       Vs/ s H8  n[        UR                  UR                   UR"                  UR$                  US9PM:     sn5      U l        [        R(                  " UR                  UR*                  S9U l        U R/                  S[1        UR2                  U R                  [4        R6                  5      SS9  U R9                  5         g s  snf )Nr~   r   r0   F)
persistent)rM   rN   n_embdr   n_layer
num_layersr   	Embedding
vocab_sizewr   
embd_pdropdropout
ModuleListranger{   n_headrx   resid_pdrophr   layer_norm_epsilon	layernormregister_bufferr1   r   r   r   	post_init)rX   r   r   rY   s      r   rN   CTRLModel.__init__   s    "MM ..f//?zz&"3"34 v~~..A V]]FMM6::vGYGYefg.
 fmm9R9RS/0B0BDDUDUW\WbWbcpu 	 	

 	s   /?E7c                     U R                   $ rL   r   )rX   s    r   get_input_embeddingsCTRLModel.get_input_embeddings   s    vvr   c                     Xl         g rL   r   )rX   new_embeddingss     r   set_input_embeddingsCTRLModel.set_input_embeddings   s    r   N	input_idspast_key_valuesr?   token_type_idsposition_idsinputs_embedsre   rf   output_hidden_statesreturn_dictreturnc           
         Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  Ub  [        S5      eUbF  U R                  X5        UR                  5       nUR                  SUS   5      nUR                  S   nO1Ub#  UR                  5       SS nUR                  S   nO[        S5      eUb  UR                  OUR                  nU(       a  Uc  [        U R                   S9nUb  UR                  5       OSnUc;  [        R                  " XS   U-   [        R                  US9nUR!                  S5      nUb  US::  a  [        S5      eUR                  US5      nUR!                  S	5      R!                  S
5      nUR#                  U R$                  S9nSU-
  [        R&                  " U R$                  5      R(                  -  nUbJ  UR                  SUS   5      nU R+                  U5      nU[,        R.                  " U R0                  5      -  nOSnUc  U R+                  U5      nUS   n[        R2                  " [        R4                  " UU-   UU-   5      S	5      R#                  U5      nU[,        R.                  " U R0                  5      -  nU R6                  R#                  U5      U l        U R6                  USS24   nUU-   U-   nU R9                  U5      nU	(       a  SOSnU(       a  SOSn[;        U R<                  5       H5  u  nnU	(       a  UU4-   nU" UUUUUUS9nUS   nU(       d  M,  UUS	   4-  nM7     U R?                  U5      nU	(       a  UU4-   nU
(       d  [A        S UUUU4 5       5      $ [C        UUUUS9$ )a1  
Example:

```python
>>> from transformers import AutoTokenizer, CTRLModel
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
>>> model = CTRLModel.from_pretrained("Salesforce/ctrl")

>>> # CTRL was trained with control codes as the first token
>>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
>>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

>>> outputs = model(**inputs)

>>> last_hidden_states = outputs.last_hidden_state
>>> list(last_hidden_states.shape)
[1, 5, 1280]
```NzDYou cannot specify both input_ids and inputs_embeds at the same timer"   r   z5You have to specify either input_ids or inputs_embeds)r   )r!   devicez$batch_size has to be defined and > 0r   r   r    g      ?r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7frL   r   ).0r=   s     r   	<genexpr>$CTRLModel.forward.<locals>.<genexpr>e  s      ^a^s   	)last_hidden_stater   hidden_states
attentions)"r   rf   re   r   r   
ValueError%warn_if_padding_and_no_attention_maskr9   viewr6   r   r
   get_seq_lengthr   r%   longr(   r'   r!   finfominr   r7   r8   r   triuonesr0   r   	enumerater   r   tupler   )rX   r   r   r?   r   r   r   re   rf   r   r   rg   input_shaper_   r   past_lengthtoken_type_embedsseq_lenr>   
pos_embedsr   all_hidden_statesall_attentionsr   r   r   s                             r   rk   CTRLModel.forward   s   F 2C1N-TXT_T_TqTq!*!6IDKK<Q<Q	$8$D $++JjJj 	 &1%<k$++BYBY ]%>cdd"66yQ#..*K!r;r?;I"+J&',,.s3K&,,Q/JTUU%.%:!!@T@T0*$++>O:I:Uo446[\ <<_{5RZ_ZdZdmstL'11!4L %Q !GHH+00R@N ,55a8BB1EN ,..TZZ.@N!N2ekk$**6M6Q6QQN%+00[_EN $~ 6):):!;; !  FF9-Mb/zz%**W{%:Gk<QRTUVYYZ`a!2!233 !--008&&|Q7
%
25FF]3"6BD0ddff%DAq#$58H$H!*-#"3G $AJM  71:-/ & }5 1]4D D )?<M~^   '+++%	
 	
r   )r   r   r   r   r   r0   r   )
NNNNNNNNNN)rn   ro   rp   rq   rN   r   r   r   r   
LongTensorr	   FloatTensorboolr   Tensorr   rk   rr   rs   rt   s   @r   r   r      s   0   .2(,37260426!%)-,0#'J
##d*J
 J
 ))D0	J

 ((4/J
 &&-J
 ((4/J
 $;J
  $;J
 #TkJ
 D[J
 
u||	6	6J
 J
r   r   z
    The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                     ^  \ rS rSrSS0rU 4S jr\            SS\R                  S-  S\	S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\S-  S\S-  S\\R                  -  S\\R                     \-  4S jj5       r SU 4S jjrSrU =r$ )CTRLLMHeadModeliq  zlm_head.weightztransformer.w.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g )NTbias)
rM   rN   r   r   r   rS   r   r   lm_headr   rX   r   rY   s     r   rN   CTRLLMHeadModel.__init__z  sG     $V,yy0A0AM 	r   Nr   r   r?   r   r   r   labelsre   rf   r   r   logits_to_keepr   c                    Ub  UOU R                   R                  nU R                  UUUUUUUU	U
US9
nUS   n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnUb*  U R                  " UU4SU R                   R                  0UD6nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`

Example:

```python
>>> import torch
>>> from transformers import AutoTokenizer, CTRLLMHeadModel

>>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
>>> model = CTRLLMHeadModel.from_pretrained("Salesforce/ctrl")

>>> # CTRL was trained with control codes as the first token
>>> inputs = tokenizer("Wikipedia The llama is", return_tensors="pt")
>>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

>>> sequence_ids = model.generate(inputs["input_ids"])
>>> sequences = tokenizer.batch_decode(sequence_ids)
>>> sequences
['Wikipedia The llama is a member of the family Bovidae. It is native to the Andes of Peru,']

>>> outputs = model(**inputs, labels=inputs["input_ids"])
>>> round(outputs.loss.item(), 2)
9.21

>>> list(outputs.logits.shape)
[1, 5, 246534]
```N	r   r?   r   r   r   re   rf   r   r   r   r   r   )losslogitsr   r   r   )r   r   r   r   rQ   slicer   loss_functionr   r   r   r   r   )rX   r   r   r?   r   r   r   r   re   rf   r   r   r   rg   transformer_outputsr   slice_indicesr   r   rF   s                       r   rk   CTRLLMHeadModel.forward  s0   ` &1%<k$++BYBY"..+))%'/!5# / 
 ,A.8B>SV8W8W~ot4]kmA}a,?@A%%  ;;11 	D Y!4QR!88F)-)9TGf$EvE%/??-;;*55
 	
r   c                 V   > [         TU ]  " U4UUUS.UD6nUR                  SS 5        U$ )N)r   re   is_first_iterationr   )rM   prepare_inputs_for_generationpop)rX   r   r   re   r  rg   model_inputsrY   s          r   r  -CTRLLMHeadModel.prepare_inputs_for_generation  sH    
 w<
+1	

 
 	)40r   )r   r   )NNNNNNNNNNNr   )NNF)rn   ro   rp   rq   _tied_weights_keysrN   r   r   r   r	   r   r   rQ   r   r   r   rk   r  rr   rs   rt   s   @r   r   r   q  s_    +,BC  .2(,37260426*.!%)-,0#'-.V
##d*V
 V
 ))D0	V

 ((4/V
 &&-V
 ((4/V
   4'V
 $;V
  $;V
 #TkV
 D[V
 ell*V
 
u||	5	5V
 V
r SX r   r   a  
    The CTRL Model transformer with a sequence classification head on top (linear layer).
    [`CTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do. Since it does classification on the last token, it requires to know the position of the last
    token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in
    each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
    guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last
    value in each row of the batch).
    c                   `  ^  \ rS rSrU 4S jr\           SS\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\
S-  S\
S-  S\
S-  S\
S-  S\\R                     \-  4S jj5       rSrU =r$ )CTRLForSequenceClassificationi  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  U R                  SS9U l        U R                  5         g )NFr   )
rM   rN   
num_labelsr   r   r   rS   r   
classifierr   r   s     r   rN   &CTRLForSequenceClassification.__init__  sR      ++$V,))FMM4??O 	r   Nr   r   r?   r   r   r   r   re   rf   r   r   r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUU	U
US9
nUS   nU R                  U5      nUb  UR                  SS u  nnOUR                  SS u  nnU R                   R
                  c  US:w  a  [        S5      eU R                   R
                  c  SnOUb  XR                   R
                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                  R                    S	35        U[        R                  " UUR                  S
9U4   nSnUGb  U R                   R"                  c  U R$                  S:X  a  SU R                   l        OoU R$                  S:  aN  UR&                  [        R(                  :X  d  UR&                  [        R*                  :X  a  SU R                   l        OSU R                   l        U R                   R"                  S:X  aJ  [-        5       nU R$                  S:X  a&  U" UR/                  5       UR/                  5       5      nOU" UU5      nOU R                   R"                  S:X  a=  [1        5       nU" UR3                  SU R$                  5      UR3                  S5      5      nO-U R                   R"                  S:X  a  [5        5       nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ [7        UUUR8                  UR:                  S9$ )a
  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Example of single-label classification:

```python
>>> import torch
>>> from transformers import AutoTokenizer, CTRLForSequenceClassification

>>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
>>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl")

>>> # CTRL was trained with control codes as the first token
>>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
>>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> predicted_class_id = logits.argmax().item()
>>> model.config.id2label[predicted_class_id]
'LABEL_0'
```

```python
>>> import torch

>>> torch.manual_seed(42)  # doctest: +IGNORE_RESULT
>>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
>>> num_labels = len(model.config.id2label)
>>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)

>>> labels = torch.tensor(1)
>>> loss = model(**inputs, labels=labels).loss
>>> round(loss.item(), 2)
0.93
```

Example of multi-label classification:

```python
>>> import torch
>>> from transformers import AutoTokenizer, CTRLForSequenceClassification

>>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
>>> model = CTRLForSequenceClassification.from_pretrained(
...     "Salesforce/ctrl", problem_type="multi_label_classification"
... )

>>> # CTRL was trained with control codes as the first token
>>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
>>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> predicted_class_id = logits.argmax().item()
>>> model.config.id2label[predicted_class_id]
'LABEL_0'
```

```python
>>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
>>> num_labels = len(model.config.id2label)
>>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)

>>> num_labels = len(model.config.id2label)
>>> labels = torch.nn.functional.one_hot(torch.tensor([predicted_class_id]), num_classes=num_labels).to(
...     torch.float
... )
>>> loss = model(**inputs, labels=labels).loss
>>> loss.backward()  # doctest: +IGNORE_RESULT
```Nr   r   r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r"   )r   r!   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`)r   
regressionsingle_label_classificationmulti_label_classification)r   r   r   r   )r   r   r   r  r6   pad_token_idr   r'   r   r   int32r%   argmaxloggerwarning_oncerY   rn   problem_typer  r!   r   rQ   r   squeezer   r   r   r   r   r   )rX   r   r   r?   r   r   r   r   re   rf   r   r   rg   r   r   r   r_   sequence_lengthlast_non_pad_tokennon_pad_masktoken_indicespooled_logitsr   loss_fctrF   s                            r   rk   %CTRLForSequenceClassification.forward  s   z &1%<k$++BYBY"..+))%'/!5# / 
 ,A./ *3//"1*='J*7*=*=bq*A'J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaab{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6#%(;AB(??F)-)9TGf$EvE' -;;*55	
 	
r   )r  r  r   )NNNNNNNNNNN)rn   ro   rp   rq   rN   r   r   r   r	   r   r   r   r   r   rk   rr   rs   rt   s   @r   r	  r	    s(     .2(,37260426*.!%)-,0#'d
##d*d
 d
 ))D0	d

 ((4/d
 &&-d
 ((4/d
   4'd
 $;d
  $;d
 #Tkd
 D[d
 
u||	7	7d
 d
r   r	  )r	  r   r   r   rL   )+__doc__numpyr7   r   r   torch.nnr   r   r    r   r   cache_utilsr	   r
   
generationr   modeling_outputsr   r   r   modeling_utilsr   utilsr   r   configuration_ctrlr   
get_loggerrn   r  r   r1   rG   ModulerI   ry   r{   r   r   r   r	  __all__r   r   r   <module>r-     s       A A & . ) i i - + 
		H	%
%,1 1h`,299 ,^ 	/ 	 	 j
# j
 j
Z t)? ttn 
o
$7 o

o
d cr   