
    Z jl                        S SK r S SKJr  S SKrS SKJr  S SKJrJrJr  SSK	J
r
  SSKJrJrJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJrJr  SSKJrJr  SSKJ r   SSK!J"r"J#r#J$r$J%r%  SSK&J'r'  SSK(J)r)  SSK*J+r+  \%RX                  " \-5      r. " S S\R^                  5      r0 " S S\R^                  5      r1  S2S\Rd                  S\Rf                  S\Rf                  S\Rf                  S\Rf                  S-  S\4S-  S\4S\ \"   4S jjr5 " S  S!\Rd                  5      r6 " S" S#\5      r7\# " S$ S%\5      5       r8\# " S& S'\85      5       r9\#" S(S)9 " S* S+\8\5      5       r:\# " S, S-\85      5       r;\#" S.S)9 " S/ S0\85      5       r</ S1Qr=g)3    N)Callable)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions SequenceClassifierOutputWithPastTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)capture_outputs   )BioGptConfigc                      ^  \ rS rSrSrS\S\4U 4S jjr  SS\R                  S\S	\R                  S-  4U 4S
 jjjr	Sr
U =r$ ) BioGptLearnedPositionalEmbedding3   zF
This module learns positional embeddings up to a fixed maximum size.
num_embeddingsembedding_dimc                 L   > SU l         [        TU ]	  XR                   -   U5        g )N   )offsetsuper__init__)selfr"   r#   	__class__s      {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/biogpt/modeling_biogpt.pyr(   )BioGptLearnedPositionalEmbedding.__init__8   s"     ++5}E    Nattention_maskpast_key_values_lengthposition_idsc                    > Uc5  [         R                  " USS9nX1-  S-
  R                  5       nUSS2US24   n[        TU ]  X0R
                  -   5      $ )z3`input_ids_shape` is expected to be [bsz x seqlen].Nr   dim)torchcumsumlongr'   forwardr&   )r)   r.   r/   r0   r*   s       r+   r7   (BioGptLearnedPositionalEmbedding.forward>   sZ      <<A>L(9A=CCEL'+A+B(BCLw|kk9::r-   )r&   )r   N)__name__
__module____qualname____firstlineno____doc__intr(   r4   
LongTensorr7   __static_attributes____classcell__r*   s   @r+   r    r    3   s]    Fs F3 F '(04	;((; !$; &&-	; ;r-   r    c            
       r   ^  \ rS rSrSrSS\S\S\S\S-  4U 4S jjjrS	\R                  4U 4S
 jjr
SrU =r$ )BioGptScaledWordEmbeddingO   zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
r"   r#   padding_idxembed_scaleNc                 2   > [         TU ]  XU5        X@l        g N)r'   r(   rG   )r)   r"   r#   rF   rG   r*   s        r+   r(   "BioGptScaledWordEmbedding.__init__T   s    D&r-   	input_idsc                 <   > [         TU ]  U5      U R                  -  $ rI   )r'   r7   rG   )r)   rK   r*   s     r+   r7   !BioGptScaledWordEmbedding.forwardX   s    wy)D,<,<<<r-   rG   )      ?)r9   r:   r;   r<   r=   r>   floatr(   r4   Tensorr7   r@   rA   rB   s   @r+   rD   rD   O   sJ    's '3 'S '_dgk_k ' '= = =r-   rD   modulequerykeyvaluer.   scalingdropoutkwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )N      r%   r   r2   ptrainingr   )
sizer4   matmul	transposenn
functionalsoftmaxrW   r^   
contiguous)
rR   rS   rT   rU   r.   rV   rW   rX   attn_weightsattn_outputs
             r+   eager_attention_forwardrh   \   s     **R.D( <<}}Q':;gEL!#4==((2(>L==((6??([L,,|3K''1-88:K$$r-   c                   $  ^  \ rS rSrSr      SS\S\S\S\S\S	\S
\S-  S\S-  4U 4S jjjr	   SS\
R                  S\
R                  S-  S\S-  S\
R                  S-  S\\   S\\
R                  \
R                  S-  4   4S jjrSrU =r$ )BioGptAttentionx   z=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsrW   
is_decoderbias	is_causalconfig	layer_idxc	                 t  > [         T	U ]  5         Xl        X l        X0l        X-  U l        Xpl        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l	        X`l
        Xl        Uc>  U R                  (       a-  [        R                  SU R                  R                   S35        [         R"                  " XUS9U l        [         R"                  " XUS9U l        [         R"                  " XUS9U l        [         R"                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r[   zInstantiating a decoder z without passing `layer_idx` is not recommended and will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.ro   )r'   r(   rl   rm   rW   head_dimrq   
ValueErrorrV   rn   rp   rr   loggerwarning_oncer*   r9   rb   Lineark_projv_projq_projout_proj)
r)   rl   rm   rW   rn   ro   rp   rq   rr   r*   s
            r+   r(   BioGptAttention.__init__{   s    	""!.MMI%$..8MdnnM]$YKr3  }}d*$""*4>>+B+B*C D, , ii	4@ii	4@ii	4@		)TBr-   hidden_stateskey_value_statespast_key_valuesr.   rX   returnc                 ,   USLnUR                   SS n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	Sn
Ub]  [        U[        5      (       aF  UR                  R                  U R                  5      n
U(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       aQ  UbN  U
(       aG  WR                  U R                     R                  nUR                  U R                     R                  nOU R                  U5      nU R!                  U5      n/ UR                   SS QSPU R                  P7nUR                  U5      R	                  SS5      nUR                  U5      R	                  SS5      nUbS  WR#                  XU R                  5      u  pU(       a.  [        U[        5      (       a  SUR                  U R                  '   [$        R&                  " U R(                  R*                  [,        5      nU" U U	UUU4U R.                  (       d  SOU R0                  U R2                  S.UD6u  nnUR4                  " / UQSP76 R7                  5       nU R9                  U5      nUU4$ )	z#Input shape: Batch x Time x ChannelNrZ   r   r%   FT        )rW   rV   )shaperu   r|   viewra   
isinstancer   
is_updatedgetrr   cross_attention_cacheself_attention_cachelayerskeysvaluesrz   r{   updater   get_interfacerq   _attn_implementationrh   r^   rW   rV   reshapere   r}   )r)   r   r   r   r.   rX   is_cross_attentioninput_shapehidden_shapequery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_stateskv_shapeattention_interfacerg   rf   s                      r+   r7   BioGptAttention.forward   sc    .T9 $))#2.88b8$--8 {{=166|DNNqRST
&/+>??,77;;DNNK
%+:+P+P(+:+O+O('6$-?)]/"=*-44T^^DIIJ/66t~~FMML^4J;;~6LF--cr2FBFFH#2<<QBJ',,X6@@AFL*+?+F+Fzaeaoao+p(
%*_FY*Z*ZAEO..t~~>(?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
!\ "));;;;FFHmmK0L((r-   )rq   rW   rl   ru   rp   rn   rz   rr   rm   r}   r|   rV   r{   )r   FTFNN)NNN)r9   r:   r;   r<   r=   r>   rP   boolr   r(   r4   rQ   r	   r   r   tupler7   r@   rA   rB   s   @r+   rj   rj   x   s   G  &* $%C%C %C 	%C
 %C %C %C t#%C :%C %CT 15(,.2H)||H)  ,,-H) 	H)
 t+H) -.H) 
u||U\\D00	1H) H)r-   rj   c                      ^  \ rS rSrSS\S\S-  4U 4S jjjr    SS\R                  S\R                  S-  S\	S-  S	\
S-  S
\R                  S-  S\\   S\R                  4S jjrSrU =r$ )BioGptDecoderLayer   Nrq   rr   c           
      p  > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  SSUUS9U l        UR                  U l	        [        UR                     U l        UR                  U l        [        R                  " U R                  5      U l        [        R"                  " U R                  UR$                  5      U l        [        R"                  " UR$                  U R                  5      U l        [        R                  " U R                  5      U l        g )NT)rl   rm   rW   rn   rp   rq   rr   )r'   r(   hidden_sizerl   rj   num_attention_headsattention_probs_dropout_prob	self_attnhidden_dropout_probrW   r   
hidden_actactivation_fnactivation_dropoutrb   	LayerNormself_attn_layer_normry   intermediate_sizefc1fc2final_layer_norm)r)   rq   rr   r*   s      r+   r(   BioGptDecoderLayer.__init__   s    ++(nn0077
 11#F$5$56"(";";$&LL$@!99T^^V-E-EF99V55t~~F "T^^ <r-   r   r.   r   	use_cacher0   rX   r   c                 &   UnU R                  U5      nU R                  " SUUUUS.UD6u  p[        R                  R	                  XR                  U R
                  S9nXq-   nUnU R                  U5      nU R                  U5      nU R                  U5      n[        R                  R	                  XR                  U R
                  S9nU R                  U5      n[        R                  R	                  XR                  U R
                  S9nXq-   nU$ )aa  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    past_key_values (`Cache`): cached past key and value projection states
)r   r   r.   r0   r\    )r   r   rb   rc   rW   r^   r   r   r   r   r   )	r)   r   r.   r   r   r0   rX   residual_s	            r+   r7   BioGptDecoderLayer.forward  s     !11-@  >> 
'+)%	

 
 --m||VZVcVc-d 0 !--m</**=9--m?V?Vaeanan-o/--m||VZVcVc-d 0r-   )	r   r   rW   rl   r   r   r   r   r   rI   )NNTN)r9   r:   r;   r<   r   r>   r(   r4   rQ   r	   r   r?   r   r   r7   r@   rA   rB   s   @r+   r   r      s    =| =d
 = =4 /3(,!%04)||) t+) 	)
 $;) &&-) +,) 
) )r-   r   c                   B    \ rS rSr% \\S'   SrSrSrSr	Sr
Sr\\S.rSrg)BioGptPreTrainedModeli1  rq   biogptT)r   
attentionsr   N)r9   r:   r;   r<   r   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphr   rj   _can_record_outputsr@   r   r-   r+   r   r   1  s9     &*#N!+%r-   r   c                     ^  \ rS rSrS\4U 4S jjr\\\      SS\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S	\S-  S
\	R                  S-  S\\   S\\-  4S jj5       5       5       rSrU =r$ )BioGptModeli@  rq   c           
        > [         TU ]  U5        Xl        UR                  U l        UR                  U l        UR                  U l        UR                  U l	        UR                  (       a   [        R                  " UR                  5      OSn[        UR                  U R                  U R                  US9U l        [!        UR"                  U R                  5      U l        [&        R(                  " [+        UR,                  5       Vs/ s H  n[/        XS9PM     sn5      U l        [&        R2                  " U R                  5      U l        SU l        U R9                  5         g s  snf )NrO   rN   )rr   F)r'   r(   rq   	layerdropr   rW   r   rl   pad_token_idrF   scale_embeddingmathsqrtrD   
vocab_sizeembed_tokensr    max_position_embeddingsembed_positionsrb   
ModuleListrangenum_hidden_layersr   r   r   
layer_normgradient_checkpointing	post_init)r)   rq   rG   ir*   s       r+   r(   BioGptModel.__init__B  s    ))11++!..7=7M7Mdii 2 23SV5t~~t/?/?[
  @@^@^`d`n`nommV[\b\t\tVu$vVuQR%7%LVu$vw,,t~~6&+# %ws   	E%NrK   r.   inputs_embedsr   r   r0   rX   r   c           	      ^   US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  [        U R                  S9nUR	                  5       S S u  pUb  UR                  5       OSn
Uc#  X-   n[        R                  " XUR                  S9nUn[        U R                  UUUS9nUc2  [        R                  " XR                  S9U
-   nUR                  S5      nU R                  X*US9nX>-   n[        R                  R                  XR                  U R                   S9n[#        U R$                  5       HM  u  nnU R                   (       a(  [        R&                  " / 5      nUU R(                  :  a  M?  U" U4UUUUS	.UD6nMO     U R+                  U5      n[-        UUS
9$ )NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time)rq   rZ   r   device)rq   input_embedsr.   r   )r0   r\   )r.   r   r   r0   )last_hidden_stater   )rv   r   r
   rq   r_   get_seq_lengthr4   onesr   r   arange	unsqueezer   rb   rc   rW   r^   	enumerater   randr   r   r   )r)   rK   r.   r   r   r   r0   rX   
batch_size
seq_lengthr/   mask_seq_lengthself_attn_cachecausal_mask	positionsr   idxdecoder_layerdropout_probabilitys                      r+   r7   BioGptModel.forwardW  s    -t";<stt  --i8M 0*$++>O!.!3!3!5cr!:
ETE`!?!?!Afg!4AO"ZZ
ML`L`aN)(;;&)+	
  <<
;O;OPSiiL'11!4L((^j(k	%1--m||VZVcVc-d"+DKK"8C}}&+jjn#&7)* /#) M #9 68++
 	
r-   )
rq   rW   rl   r   r   r   r   r   r   rF   )NNNNNN)r9   r:   r;   r<   r   r(   r   r   r   r4   r?   FloatTensorr	   r   r   r   r   r   r7   r@   rA   rB   s   @r+   r   r   @  s    | *   .23726(,!%04B
##d*B
 ))D0B
 ((4/	B

 B
 $;B
 &&-B
 +,B
 
:	:B
    B
r-   r   zR
    BioGPT Model with a `language modeling` head on top for CLM fine-tuning.
    )custom_introc                   J  ^  \ rS rSrSS0rU 4S jrS rS r\\	        SS\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\S-  S\
R                  S-  S\S-  S\
R                  S-  S\\
R                   -  S\\   S\\-  4S jj5       5       rSrU =r$ )BioGptForCausalLMi  zoutput_projection.weightzbiogpt.embed_tokens.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g NFrt   )
r'   r(   r   r   rb   ry   r   r   output_projectionr   r)   rq   r*   s     r+   r(   BioGptForCausalLM.__init__  sJ     !&)!#6+=+=v?P?PW\!] 	r-   c                     U R                   $ rI   r   r)   s    r+   get_output_embeddings'BioGptForCausalLM.get_output_embeddings  s    %%%r-   c                     Xl         g rI   r   )r)   new_embeddingss     r+   set_output_embeddings'BioGptForCausalLM.set_output_embeddings  s    !/r-   NrK   r.   r   r   labelsr   r0   logits_to_keeprX   r   c	           	         U R                   " U4UUUUUS.U	D6n
U
S   n[        U[        5      (       a  [        U* S5      OUnU R	                  USS2USS24   5      nSnUb)  U R
                  " SXU R                  R                  S.U	D6n[        UUU
R                  U
R                  U
R                  U
R                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
)r.   r   r   r   r0   r   N)logitsr  r   )lossr  r   r   r   cross_attentionsr   )r   r   r>   slicer   loss_functionrq   r   r   r   r   r   r  )r)   rK   r.   r   r   r  r   r0   r  rX   outputsr   slice_indicesr  r  s                  r+   r7   BioGptForCausalLM.forward  s    ( ++
)'+%
 
  
8B>SV8W8W~ot4]k''a6I(JK%%pVt{{OeOepiopD0#33!//))$55
 	
r-   )r   r   NNNNNNNr   )r9   r:   r;   r<   _tied_weights_keysr(   r   r   r   r   r4   r?   r   r	   r   r>   rQ   r   r   r   r   r7   r@   rA   rB   s   @r+   r   r     s    56RS&0  .23726(,*.!%04-.+
##d*+
 ))D0+
 ((4/	+

 +
   4'+
 $;+
 &&-+
 ell*+
 +,+
 
2	2+
  +
r-   r   c                   ,  ^  \ rS rSrU 4S jr\\        SS\R                  S-  S\R                  S-  S\R                  S-  S\
S-  S\R                  S-  S	\R                  S-  S
\S-  S\R                  S-  S\\-  4S jj5       5       rSrU =r$ )BioGptForTokenClassificationi  c                   > [         TU ]  U5        UR                  U l        [        U5      U l        [        US5      (       a  UR                  b  UR                  nOUR                  n[        R                  " U5      U l
        [        R                  " UR                  UR                  5      U l        U R                  5         g )Nclassifier_dropout)r'   r(   
num_labelsr   r   hasattrr  r   rb   DropoutrW   ry   r   
classifierr   )r)   rq   r  r*   s      r+   r(   %BioGptForTokenClassification.__init__  s      ++!&)6/00V5N5N5Z!'!:!:!'!;!;zz"45))F$6$68I8IJr-   NrK   token_type_idsr.   r   r   r  r   r0   r   c	           	      d   U R                   " U4UUUUUS.U	D6n
U
S   nU R                  U5      nU R                  U5      nSnUb  [        5       nUb  UR	                  S5      S:H  nUR	                  SU R
                  5      n[        R                  " XR	                  S5      [        R                  " UR                  5      R                  U5      5      nU" UU5      nO2U" UR	                  SU R
                  5      UR	                  S5      5      n[        UUU
R                  U
R                  S9$ )e  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
r   r.   r   r   r0   r   NrZ   r   )r  r  r   r   )r   rW   r  r   r   r  r4   wheretensorignore_indextype_asr   r   r   )r)   rK   r  r.   r   r   r  r   r0   rX   transformer_outputsr   r  r  loss_fctactive_lossactive_logitsactive_labelss                     r+   r7   $BioGptForTokenClassification.forward  s*   ( #kk
+)'%
 
 ,A.]3/')H),11"5: &B @ %R%,,x?T?T2U2]2]^d2e!  }=B @&++b/R$-;;*55	
 	
r-   )r   r  rW   r  )NNNNNNNN)r9   r:   r;   r<   r(   r   r   r4   r?   r   r	   r   r   r   r7   r@   rA   rB   s   @r+   r  r    s      .22637(,26*.!%042
##d*2
 ((4/2
 ))D0	2

 2
 ((4/2
   4'2
 $;2
 &&-2
 
&	&2
  2
r-   r  a  
    The BioGpt Model transformer with a sequence classification head on top (linear layer).

    [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it is required to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                   @  ^  \ rS rSrS\4U 4S jjr\\        SS\R                  S-  S\R                  S-  S\S-  S\R                  S-  S	\R                  S-  S
\S-  S\R                  S-  S\\R                  -  S\\-  4S jj5       5       rS rS rSrU =r$ )BioGptForSequenceClassificationi-  rq   c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  U R                  SS9U l        U R                  5         g r   )
r'   r(   r  r   r   rb   ry   r   scorer   r   s     r+   r(   (BioGptForSequenceClassification.__init__<  sS      ++!&)YYv114??O
 	r-   NrK   r.   r   r   r  r   r0   r  r   c	           	         U R                   " U4UUUUUS.U	D6n
U
S   n[        U[        5      (       a  [        U* S5      OUnU R	                  USS2USS24   5      nUb  UR
                  SS u  pOUR
                  SS u  pU R                  R                  c  SnOUbV  [        R                  " XR                  R                  5      R                  S5      S-
  R                  UR                  5      nO.Sn[        R                  U R                  R                    S35        U[        R"                  " XR                  S9U4   nSnUGb  U R                  R$                  c  U R&                  S:X  a  S	U R                  l        OoU R&                  S:  aN  UR(                  [        R*                  :X  d  UR(                  [        R                  :X  a  S
U R                  l        OSU R                  l        U R                  R$                  S	:X  aJ  [-        5       nU R&                  S:X  a&  U" UR/                  5       UR/                  5       5      nOU" UU5      nOU R                  R$                  S
:X  a=  [1        5       nU" UR3                  SU R&                  5      UR3                  S5      5      nO-U R                  R$                  S:X  a  [5        5       nU" UU5      n[7        UUU
R8                  U
R:                  U
R<                  S9$ )r  r  r   Nr%   rZ   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classification)r  r  r   r   r   )r   r   r>   r  r(  r   rq   r   r4   nesumtor   rw   rx   r*   r9   r   problem_typer  dtyper6   r   squeezer   r   r   r   r   r   r   )r)   rK   r.   r   r   r  r   r0   r  rX   r  r   r
  r  r   sequence_lengthpooled_logitsr  r   s                      r+   r7   'BioGptForSequenceClassification.forwardE  s   ( #kk
+)'%
 
 ,A.8B>SV8W8W~ot4]kM!]A*=>? *3//"1*='J*7*=*=bq*A'J;;##+ O$#(88I{{7O7O#P#T#TUW#X[\#\"`"`aganan"o"$##~~../ 0^ ^
 u||J}}M^_{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6/ /??-;;*55
 	
r-   c                 .    U R                   R                  $ rI   r   r   r   s    r+   get_input_embeddings4BioGptForSequenceClassification.get_input_embeddings  s    {{'''r-   c                 $    XR                   l        g rI   r8  )r)   rU   s     r+   set_input_embeddings4BioGptForSequenceClassification.set_input_embeddings  s    #( r-   )r   r  r(  r  )r9   r:   r;   r<   r   r(   r   r   r4   r?   r   r	   r   r>   rQ   r   r   r7   r9  r<  r@   rA   rB   s   @r+   r&  r&  -  s   |   .237(,26*.!%04-.O
##d*O
 ))D0O
 	O

 ((4/O
   4'O
 $;O
 &&-O
 ell*O
 
1	1O
  O
b() )r-   r&  )r   r  r&  r   r   )Nr   )>r   collections.abcr   r4   torch.nnrb   r   r   r   activationsr   cache_utilsr	   r
   r   
generationr   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   configuration_biogptr   
get_loggerr9   rw   	Embeddingr    rD   ModulerQ   rP   rh   rj   r   r   r   r   r  r&  __all__r   r-   r+   <module>rQ     s  *  $   A A ! C C ) / B 9  G & R R 7 5 . 
		H	%;r|| ;8
= 
=& !%II%<<% 
% <<	%
 LL4'% T\% % '(%8r)bii r)jA3 AH O   [
' [
 [
| 
?
- ?

?
D C
#8 C
 C
L a)&; a)a)Hr-   