
    Z jM                     
   S r SSKrSSKrSSKJr  SSKJrJrJr  SSKJ	r	  SSK
JrJr  SSKJr  SSKJr  SS	KJrJrJrJr  SS
KJr  SSKJr  SSKJrJrJrJr  SSKJ r   SSK!J"r"  SSK#J$r$J%r%J&r&  SSK'J(r(  SSK)J*r*   " S S\(5      r+ " S S\&5      r, " S S\$5      r- " S S\%5      r.\ " S S\5      5       r/\ " S S\/5      5       r0\" S S!9 " S" S#\/\5      5       r1\ " S$ S%\/5      5       r2\" S&S!9 " S' S(\/5      5       r3/ S)Qr4g)*zPyTorch BioGPT model.    N)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions SequenceClassifierOutputWithPastTokenClassifierOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogger)merge_with_config_defaults)capture_outputs   )BartAttentionBartDecoderLayerBartScaledWordEmbedding)OPTLearnedPositionalEmbedding   )BioGptConfigc                   n   ^  \ rS rSr  SS\R
                  S\S\R
                  S-  4U 4S jjjrSrU =r	$ )	 BioGptLearnedPositionalEmbedding3   Nattention_maskpast_key_values_lengthposition_idsc                 $   > [         TU ]  XU5      $ )z3`input_ids_shape` is expected to be [bsz x seqlen].)superforward)selfr"   r#   r$   	__class__s       z/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/biogpt/modular_biogpt.pyr'   (BioGptLearnedPositionalEmbedding.forward4   s     w~|TT     )r   N)
__name__
__module____qualname____firstlineno__torch
LongTensorintr'   __static_attributes____classcell__r)   s   @r*   r    r    3   sG     '(04	U((U !$U &&-	U Ur,   r    c                       \ rS rSrSrg)BioGptScaledWordEmbedding>   r-   Nr.   r/   r0   r1   r5   r-   r,   r*   r9   r9   >       r,   r9   c                       \ rS rSrSrg)BioGptAttentionB   r-   Nr;   r-   r,   r*   r>   r>   B   r<   r,   r>   c                      ^  \ rS rSrSS\S\S-  4U 4S jjjr    SS\R                  S\R                  S-  S\	S-  S	\
S-  S
\R                  S-  S\\   S\R                  4S jjrSrU =r$ )BioGptDecoderLayerF   Nconfig	layer_idxc           
        > [         TU ]  U5        UR                  U l        [	        U R                  UR
                  UR                  SSUUS9U l        UR                  U l	        [        UR                     U l        [        R                  " U R                  UR                  5      U l        [        R                  " UR                  U R                  5      U l        U ?U ?g )NT)	embed_dim	num_headsdropout
is_decoder	is_causalrC   rD   )r&   __init__hidden_sizerF   r>   num_attention_headsattention_probs_dropout_prob	self_attnhidden_dropout_probrH   r   
hidden_actactivation_fnnnLinearintermediate_sizefc1fc2encoder_attnencoder_attn_layer_norm)r(   rC   rD   r)   s      r*   rK   BioGptDecoderLayer.__init__G   s     ++(nn0077
 11#F$5$5699T^^V-E-EF99V55t~~F(r,   hidden_statesr"   past_key_values	use_cacher$   kwargsreturnc                 &   UnU R                  U5      nU R                  " SUUUUS.UD6u  p[        R                  R	                  XR                  U R
                  S9nXq-   nUnU R                  U5      nU R                  U5      nU R                  U5      n[        R                  R	                  XR                  U R
                  S9nU R                  U5      n[        R                  R	                  XR                  U R
                  S9nXq-   nU$ )aa  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    past_key_values (`Cache`): cached past key and value projection states
)r[   r\   r"   r$   ptrainingr-   )self_attn_layer_normrO   rS   
functionalrH   rc   final_layer_normrV   rR   activation_dropoutrW   )	r(   r[   r"   r\   r]   r$   r^   residual_s	            r*   r'   BioGptDecoderLayer.forward]   s     !11-@  >> 
'+)%	

 
 --m||VZVcVc-d 0 !--m</**=9--m?V?Vaeanan-o/--m||VZVcVc-d 0r,   )rR   rH   rF   rV   rW   rO   N)NNTN)r.   r/   r0   r1   r   r4   rK   r2   Tensorr   boolr3   r   r   r'   r5   r6   r7   s   @r*   rA   rA   F   s    )| )d
 ) )2 /3(,!%04)||) t+) 	)
 $;) &&-) +,) 
) )r,   rA   c                   B    \ rS rSr% \\S'   SrSrSrSr	Sr
Sr\\S.rSrg)BioGptPreTrainedModel   rC   biogptT)r[   
attentionsr-   N)r.   r/   r0   r1   r   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphrA   r>   _can_record_outputsr5   r-   r,   r*   ro   ro      s9     &*#N!+%r,   ro   c                     ^  \ rS rSrS\4U 4S jjr\\\      SS\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S	\S-  S
\	R                  S-  S\\   S\\-  4S jj5       5       5       rSrU =r$ )BioGptModel   rC   c           
        > [         TU ]  U5        Xl        UR                  U l        UR                  U l        UR                  U l        UR                  U l	        UR                  (       a   [        R                  " UR                  5      OSn[        UR                  U R                  U R                  US9U l        [!        UR"                  U R                  5      U l        [&        R(                  " [+        UR,                  5       Vs/ s H  n[/        XS9PM     sn5      U l        [&        R2                  " U R                  5      U l        SU l        U R9                  5         g s  snf )Ng      ?)embed_scale)rD   F)r&   rK   rC   	layerdroprP   rH   rL   rF   pad_token_idpadding_idxscale_embeddingmathsqrtr9   
vocab_sizeembed_tokensr    max_position_embeddingsembed_positionsrS   
ModuleListrangenum_hidden_layersrA   layers	LayerNorm
layer_normgradient_checkpointing	post_init)r(   rC   r   ir)   s       r*   rK   BioGptModel.__init__   s    ))11++!..7=7M7Mdii 2 23SV5t~~t/?/?[
  @@^@^`d`n`nommV[\b\t\tVu$vVuQR%7%LVu$vw,,t~~6&+# %ws   	E%N	input_idsr"   inputs_embedsr\   r]   r$   r^   r_   c           	      ^   US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  [        U R                  S9nUR	                  5       S S u  pUb  UR                  5       OSn
Uc#  X-   n[        R                  " XUR                  S9nUn[        U R                  UUUS9nUc2  [        R                  " XR                  S9U
-   nUR                  S5      nU R                  X*US9nX>-   n[        R                  R                  XR                  U R                   S9n[#        U R$                  5       HM  u  nnU R                   (       a(  [        R&                  " / 5      nUU R(                  :  a  M?  U" U4UUUUS	.UD6nMO     U R+                  U5      n[-        UUS
9$ )NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time)rC   r   device)rC   input_embedsr"   r\   )r$   ra   )r"   r\   r]   r$   )last_hidden_stater\   )
ValueErrorr   r	   rC   sizeget_seq_lengthr2   onesr   r   arange	unsqueezer   rS   re   rH   rc   	enumerater   randr   r   r   )r(   r   r"   r   r\   r]   r$   r^   
batch_size
seq_lengthr#   mask_seq_lengthself_attn_cachecausal_mask	positionsr[   idxdecoder_layerdropout_probabilitys                      r*   r'   BioGptModel.forward   s    -t";<stt  --i8M 0*$++>O!.!3!3!5cr!:
ETE`!?!?!Afg!4AO"ZZ
ML`L`aN)(;;&)+	
  <<
;O;OPSiiL'11!4L((^j(k	%1--m||VZVcVc-d"+DKK"8C}}&+jjn#&7)* /#) M #9 68++
 	
r,   )
rC   rH   rF   r   r   r   r   r   r   r   )NNNNNN)r.   r/   r0   r1   r   rK   r   r   r   r2   r3   FloatTensorr   rm   r   r   tupler   r'   r5   r6   r7   s   @r*   r|   r|      s    | *   .23726(,!%04B
##d*B
 ))D0B
 ((4/	B

 B
 $;B
 &&-B
 +,B
 
:	:B
    B
r,   r|   zR
    BioGPT Model with a `language modeling` head on top for CLM fine-tuning.
    )custom_introc                   J  ^  \ rS rSrSS0rU 4S jrS rS r\\	        SS\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\S-  S\
R                  S-  S\S-  S\
R                  S-  S\\
R                   -  S\\   S\\-  4S jj5       5       rSrU =r$ )BioGptForCausalLM   zoutput_projection.weightzbiogpt.embed_tokens.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g NF)bias)
r&   rK   r|   rq   rS   rT   rL   r   output_projectionr   r(   rC   r)   s     r*   rK   BioGptForCausalLM.__init__   sJ     !&)!#6+=+=v?P?PW\!] 	r,   c                     U R                   $ rk   r   r(   s    r*   get_output_embeddings'BioGptForCausalLM.get_output_embeddings  s    %%%r,   c                     Xl         g rk   r   )r(   new_embeddingss     r*   set_output_embeddings'BioGptForCausalLM.set_output_embeddings  s    !/r,   Nr   r"   r   r\   labelsr]   r$   logits_to_keepr^   r_   c	           	         U R                   " U4UUUUUS.U	D6n
U
S   n[        U[        5      (       a  [        U* S5      OUnU R	                  USS2USS24   5      nSnUb)  U R
                  " SXU R                  R                  S.U	D6n[        UUU
R                  U
R                  U
R                  U
R                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
)r"   r   r\   r]   r$   r   N)logitsr   r   )lossr   r\   r[   rr   cross_attentionsr-   )rq   
isinstancer4   slicer   loss_functionrC   r   r   r\   r[   rr   r   )r(   r   r"   r   r\   r   r]   r$   r   r^   outputsr[   slice_indicesr   r   s                  r*   r'   BioGptForCausalLM.forward  s    ( ++
)'+%
 
  
8B>SV8W8W~ot4]k''a6I(JK%%pVt{{OeOepiopD0#33!//))$55
 	
r,   )rq   r   NNNNNNNr   )r.   r/   r0   r1   _tied_weights_keysrK   r   r   r   r   r2   r3   r   r   rm   r4   rl   r   r   r   r   r'   r5   r6   r7   s   @r*   r   r      s    56RS&0  .23726(,*.!%04-.+
##d*+
 ))D0+
 ((4/	+

 +
   4'+
 $;+
 &&-+
 ell*+
 +,+
 
2	2+
  +
r,   r   c                   ,  ^  \ rS rSrU 4S jr\\        SS\R                  S-  S\R                  S-  S\R                  S-  S\
S-  S\R                  S-  S	\R                  S-  S
\S-  S\R                  S-  S\\-  4S jj5       5       rSrU =r$ )BioGptForTokenClassificationi>  c                   > [         TU ]  U5        UR                  U l        [        U5      U l        [        US5      (       a  UR                  b  UR                  nOUR                  n[        R                  " U5      U l
        [        R                  " UR                  UR                  5      U l        U R                  5         g )Nclassifier_dropout)r&   rK   
num_labelsr|   rq   hasattrr   rP   rS   DropoutrH   rT   rL   
classifierr   )r(   rC   r   r)   s      r*   rK   %BioGptForTokenClassification.__init__@  s      ++!&)6/00V5N5N5Z!'!:!:!'!;!;zz"45))F$6$68I8IJr,   Nr   token_type_idsr"   r\   r   r   r]   r$   r_   c	           	      d   U R                   " U4UUUUUS.U	D6n
U
S   nU R                  U5      nU R                  U5      nSnUb  [        5       nUb  UR	                  S5      S:H  nUR	                  SU R
                  5      n[        R                  " XR	                  S5      [        R                  " UR                  5      R                  U5      5      nU" UU5      nO2U" UR	                  SU R
                  5      UR	                  S5      5      n[        UUU
R                  U
R                  S9$ )e  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
r\   r"   r   r]   r$   r   Nr   r   )r   r   r[   rr   )rq   rH   r   r   viewr   r2   wheretensorignore_indextype_asr   r[   rr   )r(   r   r   r"   r\   r   r   r]   r$   r^   transformer_outputsr[   r   r   loss_fctactive_lossactive_logitsactive_labelss                     r*   r'   $BioGptForTokenClassification.forwardN  s*   ( #kk
+)'%
 
 ,A.]3/')H),11"5: &B @ %R%,,x?T?T2U2]2]^d2e!  }=B @&++b/R$-;;*55	
 	
r,   )rq   r   rH   r   )NNNNNNNN)r.   r/   r0   r1   rK   r   r   r2   r3   r   r   rm   r   r   r'   r5   r6   r7   s   @r*   r   r   >  s      .22637(,26*.!%042
##d*2
 ((4/2
 ))D0	2

 2
 ((4/2
   4'2
 $;2
 &&-2
 
&	&2
  2
r,   r   a  
    The BioGpt Model transformer with a sequence classification head on top (linear layer).

    [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it is required to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                   @  ^  \ rS rSrS\4U 4S jjr\\        SS\R                  S-  S\R                  S-  S\S-  S\R                  S-  S	\R                  S-  S
\S-  S\R                  S-  S\\R                  -  S\\-  4S jj5       5       rS rS rSrU =r$ )BioGptForSequenceClassificationi  rC   c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  U R                  SS9U l        U R                  5         g r   )
r&   rK   r   r|   rq   rS   rT   rL   scorer   r   s     r*   rK   (BioGptForSequenceClassification.__init__  sS      ++!&)YYv114??O
 	r,   Nr   r"   r\   r   r   r]   r$   r   r_   c	           	         U R                   " U4UUUUUS.U	D6n
U
S   n[        U[        5      (       a  [        U* S5      OUnU R	                  USS2USS24   5      nUb  UR
                  SS u  pOUR
                  SS u  pU R                  R                  c  SnOUbV  [        R                  " XR                  R                  5      R                  S5      S-
  R                  UR                  5      nO/Sn[        R                  " U R                  R                    S35        U[        R"                  " XR                  S9U4   nSnUGb  U R                  R$                  c  U R&                  S:X  a  S	U R                  l        OoU R&                  S:  aN  UR(                  [        R*                  :X  d  UR(                  [        R                  :X  a  S
U R                  l        OSU R                  l        U R                  R$                  S	:X  aJ  [-        5       nU R&                  S:X  a&  U" UR/                  5       UR/                  5       5      nOU" UU5      nOU R                  R$                  S
:X  a=  [1        5       nU" UR3                  SU R&                  5      UR3                  S5      5      nO-U R                  R$                  S:X  a  [5        5       nU" UU5      n[7        UUU
R8                  U
R:                  U
R<                  S9$ )r   r   r   Nr   r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classification)r   r   r\   r[   rr   )rq   r   r4   r   r   shaperC   r   r2   nesumtor   r   warning_oncer)   r.   r   problem_typer   dtypelongr   squeezer   r   r   r   r\   r[   rr   )r(   r   r"   r\   r   r   r]   r$   r   r^   r   r[   r   r   r   sequence_lengthpooled_logitsr   r   s                      r*   r'   'BioGptForSequenceClassification.forward  s   ( #kk
+)'%
 
 ,A.8B>SV8W8W~ot4]kM!]A*=>? *3//"1*='J*7*=*=bq*A'J;;##+ O$#(88I{{7O7O#P#T#TUW#X[\#\"`"`aganan"o"$##~~../ 0^ ^
 u||J}}M^_{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6/ /??-;;*55
 	
r,   c                 .    U R                   R                  $ rk   rq   r   r   s    r*   get_input_embeddings4BioGptForSequenceClassification.get_input_embeddings  s    {{'''r,   c                 $    XR                   l        g rk   r   )r(   values     r*   set_input_embeddings4BioGptForSequenceClassification.set_input_embeddings  s    #( r,   )rq   r   r   r   )r.   r/   r0   r1   r   rK   r   r   r2   r3   r   r   rm   r4   rl   r   r   r'   r   r  r5   r6   r7   s   @r*   r   r     s   |   .237(,26*.!%04-.O
##d*O
 ))D0O
 	O

 ((4/O
   4'O
 $;O
 &&-O
 ell*O
 
1	1O
  O
b() )r,   r   )r   r   r   r|   ro   )5__doc__r   r2   torch.nnrS   r   r   r   activationsr   cache_utilsr   r	   
generationr
   masking_utilsr   modeling_outputsr   r   r   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   bart.modeling_bartr   r   r   opt.modeling_optr   configuration_biogptr   r    r9   r>   rA   ro   r|   r   r   r   __all__r-   r,   r*   <module>r     sH       A A ! . ) /  . &  8 5 
 = .U'D U	 7 		m 	@) @F O   [
' [
 [
| 
?
- ?

?
D C
#8 C
 C
L a)&; a)a)Hr,   