
    Z jhj                     Z   S r SSKrSSKJr  SSKJr  SSKrSSKJs  J	r
  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJr  SSKJrJr  SSKJr  SSKJ r J!r!  SSK"J#r#  SSK$J%r%J&r&J'r'J(r(  SSK)J*r*  SSK+J,r,J-r-J.r.J/r/J0r0J1r1J2r2J3r3  SSK4J5r5J6r6  \(Rn                  " \85      r9\'" 5       (       a  SSK:J;r;  \&" SS9\ " S S\5      5       5       r< " S S\/5      r= " S S\05      r>  SAS \R~                  S!\R                  S"\R                  S#\R                  S$\\R                  S%4   S&\AS-  S'\AS-  S(\B\R                  \R                  4   4S) jjrC\ " 5       rD\C\DS*'    " S+ S,\R~                  5      rE " S- S.\-5      rF " S/ S0\R~                  5      rG " S1 S2\5      rH " S3 S4\.5      rI " S5 S6\65      rJ    SBS7\R                  \B\R                     -  S-  S8\KS-  S9\KS-  S:\KS$\R                  S-  S(\R                  \K-  4S; jjrL " S< S=\55      rM " S> S?\,5      rN/ S@QrOg)CzPyTorch Doge model.    N)Callable)Union)strict)nn   )initialization)ACT2FN)Cache)PreTrainedConfig)compile_friendly_flex_attention)GradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)RopeParameters)AttentionInterfacePreTrainedModel)Unpack)TransformersKwargsauto_docstringis_torch_flex_attn_availablelogging)OutputRecorder   )LlamaForSequenceClassificationLlamaMLPLlamaPreTrainedModelLlamaRMSNormLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward	repeat_kv)MixtralForCausalLMMixtralModel)	BlockMaskzSmallDoge/Doge-320M)
checkpointc                   D  ^  \ rS rSr% SrSrS/rSSSSSSSSSSSS	.rS
/S/4SS/S/4S/S/4S.rSr	\
\S'   Sr\
\S'   Sr\
\S'   Sr\
\S'   Sr\\
-  \S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S '   S!r\\S"'   Sr\
\S#'   S$r\\-  S$-  \S%'   S&r\
\S''   S$r\
S$-  \S('   S!r\\S)'   Sr\S$-  \S*'   S!r \\S+'   S$r!\
S$-  \S,'   Sr"\
\S-'   S!r#\\S.'   S/r$\
\S0'   S1r%\
\S2'   S!r&\\S3'   S!r'\\S4'   S5r(\\S6'   S$r)\
S$-  \S7'   S$r*\
S$-  \S8'   S$r+\
\,\
   -  S$-  \S9'   U 4S: jr-S;r.U =r/$ )<
DogeConfig:   a  
keep_window_size (`int`, *optional*, defaults to 2048):
    The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
is_moe (`bool`, *optional*, defaults to `False`):
    Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize.

```python
>>> from transformers import DogeConfig, DogeModel

>>> # Initializing a Doge-320M style configuration
>>> configuration = DogeConfig()

>>> # Initializing a model from the Doge-320M style configuration
>>> model = DogeModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```dogepast_key_valuescolwiserowwisecolwise_gather_outputrowwise_split_input)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.dt_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projzlayers.*.mlp.router_gatezlayers.*.mlp.down_embedzlayers.*.mlp.up_embed	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormi   
vocab_sizei   hidden_size   intermediate_size    num_hidden_layers        hidden_dropoutsilu
hidden_actg{Gz?initializer_rangegư>rms_norm_epsT	use_cacheFtie_word_embeddingsmax_position_embeddingsNrope_parameters   num_attention_headsnum_key_value_headsattention_biasattention_dropoutmlp_biassliding_windowkeep_window_sizeis_moei @  num_experts@   num_experts_per_toknorm_topk_proboutput_router_logitsgMbP?router_aux_loss_coefpad_token_idbos_token_ideos_token_idc                 b   > U R                   c  U R                  U l         [        TU ]  " S0 UD6  g )N )rH   rG   super__post_init__)selfkwargs	__class__s     v/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/doge/modular_doge.pyr[   DogeConfig.__post_init__   s-    ##+'+'?'?D$''    )rH   )0__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planr6   int__annotations__r7   r9   r;   r=   floatr?   strr@   rA   rB   boolrC   rD   rE   r   dictrG   rH   rI   rJ   rK   rL   rM   rN   rO   rQ   rR   rS   rT   rU   rV   rW   listr[   __static_attributes____classcell__r^   s   @r_   r'   r'   :   s   & J#4"5 &/%.%.&/%."+ )"+$;#8!6 &(9:#%568IJ!"_$56 JK!s!s"%NECK%J#u#L%It %%#'S'48O^d*T18  &*t* ND &)ut|)Hd!%NC$J% c FDK!! ND !&$&"'%'#L#*##L#*#+/L#S	/D(/( (ra   r'   c                       \ rS rSrSrg)DogeRMSNorm   rY   Nrb   rc   rd   re   rr   rY   ra   r_   rv   rv          ra   rv   c                       \ rS rSrSrg)DogeRotaryEmbedding   rY   Nrx   rY   ra   r_   r{   r{      ry   ra   r{   modulequerykeyvaluer2   r$   scalingsoftcapreturnc                 2  ^^ S nS m[        U[        5      (       a  UnOUmTb  TS S 2S S 2S S 2S UR                  S   24   mUU4S jn	[        UUUU	USUSS9u  pUR	                  UR
                  5      nU
R                  SS5      R                  5       n
X4$ )Nc                 n   > Tb  T[         R                  " U T-  5      -  n Tb  U TU   U   U   U   -   n U $ N)torchtanh)score	batch_idxhead_idxq_idxkv_idxcausal_maskr   s        r_   	score_mod)flex_attention_forward.<locals>.score_mod   sI    ejj99E"K	28<UCFKKEra   T)r   
block_mask
enable_gqascale
return_lse   r   )
isinstancer$   shaper   todtype	transpose
contiguous)r}   r~   r   r   r2   r   r   r]   r   r   attn_outputattention_weightsr   s         `     @r_   flex_attention_forwardr      s     JK.),,#
$!!Q?SYYr]?":; &E &"K *,,U[[9''1-88:K))ra   doge_flex_attentionc                     ^  \ rS rSrSS\S\S-  4U 4S jjjr  SS\R                  S\	\R                  \R                  4   S\R                  S-  S	\
S-  S
\	\R                  \R                  S-  \	\R                     S-  4   4
S jjr  SS\R                  S\R                  S\S\R                  S-  4S jjrSrU =r$ )DogeAttention   Nconfig	layer_idxc                   > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        U R                  S-  U l
        UR                  U l        UR                  U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R&                  " [(        R*                  " UR                  5      5      U l        [        R                  " UR                  U R                  -  UR                  UR                  S9U l        [        R                  " UR                  U R                  -  UR
                  UR                  S9U l        [3        U R                  UR4                  S9U l        [3        U R                  UR4                  S9U l        g )Nhead_dimg      ࿩biaseps)rZ   __init__r   r   getattrr7   rG   r   rH   num_key_value_groupsr   rJ   rM   r   LinearrI   q_projk_projv_proj	Parameterr   zerosAdt_projo_projrv   rA   q_normk_normr\   r   r   r^   s      r_   r   DogeAttention.__init__   s   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9 & 7 7ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ekk&*D*DEFyy&&68R8RY_YnYn
 ii&&68J8JQWQfQf
 "$--V5H5HI!$--V5H5HIra   r1   position_embeddingsr2   r*   r   c                    UR                   S S n/ UQSPU R                  P7nU R                  U R                  U5      R	                  U5      5      R                  SS5      nU R                  U R                  U5      R	                  U5      5      R                  SS5      n	U R                  U5      R	                  U5      R                  SS5      n
Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  pU R                  U
R                  SS5      R                  U
R                   S   U
R                   S   S5      5      n[        R                  " U R                   ["        R$                  " U5      -  5      R                  SS5      nU R'                  UUU R(                  US9n[+        XR,                  5      n[.        R1                  U R2                  R4                  [6        5      nU" U UU	U
4UU R8                  (       d  SOU R:                  U R<                  S.UD6u  nnUR                  " / UQSP76 R?                  5       nU RA                  U5      nUU4$ )	Nr   r   r   r   )r1   	dt_statesrM   r2   r<   )r2   dropoutr   )!r   r   r   r   viewr   r   r   r   r   updater   r   reshaper   expr   Fsoftplusprepare_dynamic_maskrM   r!   r   ALL_ATTENTION_FUNCTIONSget_interfacer   _attn_implementationr    trainingrJ   r   r   r   )r\   r1   r   r2   r*   r]   input_shapehidden_shapequery_states
key_statesvalue_statescossinr   	attn_maskattention_interfacer   attn_weightss                     r_   forwardDogeAttention.forward   sE    $))#2.88b8$--8{{4;;}#=#B#B<#PQ[[\]_`a[[]!;!@!@!NOYYZ[]^_
{{=166|DNNqRST&#7RU#[ &'6'='=jX\XfXf'g$J LL""1a(001C1CA1FHZHZ[]H^`bc
	 IIdffqzz)'<<=GGBO	--'!22)	 . 
	 i)B)BC	(?(M(MKK,,.E)
 %8		%

 %#}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((ra   r   rM   c           	         [         R                  " UR                  5      R                  nUR                  nUSS2SS2SSS24   R	                  SSUR
                  S   S5      nUb  [        U[        5      (       d  UR                  [         R                  :X  aB  UR                  n[         R                  " U[         R                  " SUR                  US9U5      nUR                  USS2SS2SS2SUR
                  S   24   S:g  U5      nUR
                  S   U:  ah  [         R                  " XvUR                  S9n[         R                  " XsSSS	S
9R                  n	UR!                  SU	S5      nUR                  US:H  U5      nU$ )a  
The core idea of DMA is to calculate the dynamic attention mask to mask the tokens that should be masked, so as to form sparse attention.

Combine `dt_states` with `attention_mask` to generate the final `attn_mask`.

Args:
    hidden_states (`torch.Tensor`): The input hidden_states, used to determine the minimum value of the current input precision.
    dt_states (`torch.Tensor`): dt_states of shape `(batch_size, num_heads, key_sequence_length)`.
    keep_window_size (`int`): The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
    attention_mask (`torch.Tensor`, *optional*): attention mask of shape `(batch_size, 1, query_sequence_length, key_sequence_length)`.
Nr   r   r<   )devicer   r   r   r   TF)dimlargestsortedg      ?)r   finfor   minexpandr   r   r$   ro   wheretensorr   masked_fill
zeros_liketopkindicesscatter)
r\   r1   r   rM   r2   	min_dtyper   r   active_masktopk_indicess
             r_   r   "DogeAttention.prepare_dynamic_mask  se   $ KK 3 3488	##aD!m,33M''*B
	 %j.S.S##uzz1%++!&"ELL^=R=RZ_$`bk" "--nQ1F[	XZH[F[=[.\`a.aclmI??2!11**9)JZJZ[K ::irSW`efnnL%--b,DK!--kS.@)LIra   )r   rJ   r   r   r   r   r   rM   r   r   r   r   r   r   r   r   NN)r8   N)rb   rc   rd   re   r'   rk   r   r   Tensortupler
   r   r   rr   rs   rt   s   @r_   r   r      s    Jz JcDj J JD /3(,3)||3) #5<<#=>3) t+	3)
 3) 
u||U\\D0%2E2LL	M3)r !%.2#||# <<# 	#
 t+# #ra   r   c                       \ rS rSrSrg)DogeMLPi?  rY   Nrx   rY   ra   r_   r   r   ?  ry   ra   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	DogeCDMoEiC  r   c                   > [         TU ]  5         UR                  U l        UR                  U l        [        UR
                     U l        UR                  U l        [        R                  " [        R                  " U R                  5      5      U l        UR                  U l        UR                  U l        [        R                   " U R                  U R                  UR"                  S9U l        [        R                   " U R                  U R                  UR"                  S9U l        [        R                   " U R                  U R                  UR"                  S9U l        [        R                   " U R                  U R                  S-  SS9U l        [        R,                  " U R                  U R                  5      U l        [        R,                  " U R                  U R                  5      U l        g )Nr   r   F)rZ   r   r7   r9   r	   r?   act_fnrO   mathfloorsqrtnum_keysrQ   top_krR   r   r   rK   	gate_projup_proj	down_projrouter_gate	Embedding
down_embedup_embedr\   r   r^   s     r_   r   DogeCDMoE.__init__D  s_   !--!'!9!9V../!--

499T-=-=#>?//
$33 4#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRab 99T%5%5t}}q7HuU ,,t'7'79I9IJT%5%5t7G7GHra   r1   r   c                    UR                   u  p4nU R                  U5      R                  SX4-  S5      nUR                  U R                  SS9u  u  pxu  pUR                  S5      UR                  S5      -   nU	R                  S5      U R                  -  U
R                  S5      -   nUR                  " / UR                   S S QSP76 nUR                  " / UR                   S S QSP76 nUR                  U R                  SS9u  pUR                  SU5      n[        R                  " USS9nU R                  (       a  UUR                  SSS9-  nU R                  U5      nU R                  U5      n[        R                  " UUR                  X4-  SS5      5      R                  X4-  S5      nU R!                  U5      U-  n[        R                  " UR                  X4-  SS5      U5      R                  X4S5      nU R#                  U R!                  U R%                  U5      5      U R'                  U5      -  5      nUU-   nX4$ )Nr   r   r   r   T)r   keepdimr   )r   r   r   r   r   	unsqueezer   gatherr   softmaxrR   sumr   r   r   matmulr   r   r   r   )r\   r1   r]   bszseq_len_router_logitsscores_xscores_y	indices_x	indices_y
all_scoresall_indicesscoresposition_indicesr   routing_weightsr   r   experts_weightsexperts_statess                        r_   r   DogeCDMoE.forward[  s+   
 (--a ((7<<QrR 8E7I7I$--]_7I7`44y''+h.@.@.DD
))"-=	@S@STV@WW__@j&6&6s&;@R@
!&&C(9(9#2(>CC#-??4::2?#F $$R)9:))F322r42HHO __W-
==),,z=3E3EcmUWYZ3[\aabeboqst++o6Ho&:&:3=!R&PRZ[``adoqrt{{4>>-3P'QTXT`T`anTo'op%6++ra   )r   r   r   r   r7   r9   rR   rO   r   r   r   r   r   )rb   rc   rd   re   r'   r   r   r   r   rr   rs   rt   s   @r_   r   r   C  s5    Iz I.,||, 
	, ,ra   r   c                   ^  ^  \ rS rSrSS\S\S-  4U 4S jjjr     SS\R                  S\	\R                  \R                  4   S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\\   S\	\R                  \	\R                  \R                  4   S-  4   4S jjrSrU =r$ )DogeDecoderLayeri|  Nr   r   c                 (  > [         TU ]  5         UR                  U l        [        UR                  UR
                  S9U l        [        XS9U l        [        R                  " [        R                  " UR                  5      5      U l        [        UR                  UR
                  S9U l        UR                  (       d  [!        U5      O
[#        U5      U l        [        R                  " [        R                  " UR                  5      5      U l        g )Nr   )r   r   )rZ   r   r=   rv   r7   rA   input_layernormr   	self_attnr   r   r   onesinput_residualpost_attention_layernormrN   r   r   mlppost_attention_residualr   s      r_   r   DogeDecoderLayer.__init__}  s    $33*6+=+=6CVCVW&fJ ll5::f6H6H+IJ(3F4F4FFL_L_(`%*0--76?Yv=N')||EJJv?Q?Q4R'S$ra   r1   r   r2   position_idsr*   rB   r]   r   c           
         UnU R                  U5      nU R                  " SUUUUUUS.UD6u  p[        R                  " XR                  U R
                  S9nU R                  U-  U-   nUnU R                  U5      nU R                  U5      n[        R                  " XR                  U R
                  S9nU R                  U-  U-   nU$ )N)r1   r   r2   r&  r*   rB   )pr   rY   )
r  r  r   r   r=   r   r!  r"  r#  r$  )
r\   r1   r   r2   r&  r*   rB   r]   residualself_attn_weightss
             r_   r   DogeDecoderLayer.forward  s     !,,];+/>> ,
' 3)%+,
 ,
( 		-3F3FQUQ^Q^_++h6F !55mD/		-3F3FQUQ^Q^_44x?-Ora   )r=   r  r!  r#  r"  r$  r  r   )NNNNF)rb   rc   rd   re   r'   rk   r   r   r   r   
LongTensorr
   ro   r   r   FloatTensorr   rr   rs   rt   s   @r_   r  r  |  s    
Tz 
TcDj 
T 
T IM.204(,!& ||  #5<<#=>E  t+	 
 &&-    $;  +,  
u  %(9(95;L;L(L"MPT"TT	U   ra   r  c                   `    \ rS rSrSrSr\" \SS9\\	S.r
\R                  " 5       S 5       rSrg)	DogePreTrainedModeli  Fr   )index)r  r1   
attentionsc                    [         R                  " X5        [        U[        5      (       a3  [	        US5      (       a!  [
        R                  " UR                  5        gg[        U[        5      (       ad  [	        US5      (       a   [
        R                  " UR                  5        [	        US5      (       a!  [
        R                  " UR                  5        ggg)zInitialize the weightsr   r!  r$  N)r   _init_weightsr   r   hasattrinitzeros_r   r  ones_r!  r$  )r\   r}   s     r_   r3  !DogePreTrainedModel._init_weights  s     	%%d3fm,,vs##FHH% $ 011v/00

6001v899

699: : 2ra   rY   N)rb   rc   rd   re   _supports_flash_attn_can_compile_fullgraphr   r   r  r   _can_record_outputsr   no_gradr3  rr   rY   ra   r_   r/  r/    s@     "'	;)# ]]_
; 
;ra   r/  c                       \ rS rSrSrg)	DogeModeli  rY   Nrx   rY   ra   r_   r>  r>    ry   ra   r>  gate_logitsrO   r   r   c                    U b  [        U [        5      (       d  gU S   R                  nU S   R                  n/ n/ nU  GH  n	U	R	                  U5      n	U	R                  USS9u  u  pu  pU
R                  S5      UR                  S5      -   nUR                  S5      U-  UR                  S5      -   nUR                  " / UR                  SS QSP76 nUR                  " / UR                  SS QSP76 nUR                  USS9u  nnUR                  SU5      n[        R                  " USS9nUR                  U5        UR                  U5        GM     [        R                  " USS9n[        R                  " USS9nUcu  UR                  S5      n[        R                  " XUS9n[        R                   " XuUS9nUR#                  SUU5      UR                  S   -  n[        R$                  " USS9nGO;UR                  u  nn['        U 5      nUSSS2SS2S4   R)                  UUUU45      R+                  S5      R	                  U5      nUR                  S5      UR-                  5          n[        R                  " XUS9n[        R                   " XuUS9nUR#                  SUU5      [        R.                  " U5      -  nUSSS2SS2S4   R)                  UUUU45      R+                  SU5      R	                  U5      n[        R.                  " UU-  SS9[        R.                  " USS9-  n[        R.                  " UU-  5      nUU-  $ )a  
Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
experts is too unbalanced.

Args:
    gate_logits:
        Logits from the `router_gate`, should be a tuple of model.config.num_hidden_layers tensors of
        shape [2, batch_size * sequence_length, num_keys].
    num_experts:
        Number of experts
    num_keys:
        Number of keys
    top_k:
        The number of experts to route per-token, can be also interpreted as the `top-k` routing
        parameter.
    attention_mask (`torch.Tensor`, *optional*):
        The attention_mask used in forward function
        shape [batch_size X sequence_length] if not None.

Returns:
    The auxiliary loss.
Nr   r   r  r   r   )r   r   r   r   r   r   r  r   r   r  r   r  appendr   catr   	ones_likescatter_add_meanlenr   r   ro   r	  )r?  rO   r   r   r2   compute_dtypecompute_deviceall_expert_indicesall_routing_weightslayer_gate_logitsr  r  r  r  r  r  r  r  expert_indicesr  tokens_per_expertpadrouter_prob_per_expert
batch_sizesequence_lengthr;   expert_attention_mask router_per_expert_attention_maskoverall_losss                                r_   load_balancing_loss_funcrU    si   @ *[%"@"@N((M ^**N(-00@7H7M7Mh\^7M7_44y''+h.@.@.DD
))"-89;N;Nr;RR__@j&6&6s&;@R@
!&&C(9(9#2(>CC(ooeo<$++B0@A))JB7!!.1""?3! )" #51=))$7Q?/44R8!KKQ_`oo0n]-::1>PRUVYkYqYqrsYtt "',?Q!G&4&:&:#
O, 4At+,V&
OUKLWR[R	 	 044R89N9S9S9UV "KKQ_`oo0n]-::1>PRUVY^YbYb!Z
 
 4At+,V&
O[QRWR%R	 	) "'+>Aa+agh!ilqlulu,!m
 "
 99.1GGHL+%%ra   c                   (  ^  \ rS rSrU 4S jr         SS\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S	\R                  S-  S
\
S-  S\\R                  -  S\
S-  S\\   S\4S jjrSrU =r$ )DogeForCausalLMi1  c                 f   > [         TU ]  U5        [        U5      U l        UR                  U l        g r   )rZ   r   r>  modelrO   r  s     r_   r   DogeForCausalLM.__init__2  s*     v&
!--ra   Nr/   r2   r&  r*   r0   labelsrB   logits_to_keeprS   r]   r   c
           
         U	b  U	OU R                   R                  n	U R                  " SUUUUUUS.U
D6nUR                  n[	        U[
        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnUb  U R                  " XU R                  40 U
D6nSnU	(       a  [        UR                  U R                  [        R                  " [        R                  " U R                  5      5      U R                   U5      nUb*  XR"                  UR%                  UR&                  5      -  -  n[)        UUUUR*                  UR,                  UR.                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, DogeForCausalLM

>>> model = DogeForCausalLM.from_pretrained("SmallDoge/Doge-320M")
>>> tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```N)r/   r2   r&  r*   r0   rB   )lossaux_losslogitsr*   r1   r1  r  rY   )r   rS   rY  last_hidden_stater   rk   slicelm_headloss_functionr6   rU  r  rO   r   r   r   rQ   rT   r   r   r   r*   r1   r1  )r\   r/   r2   r&  r*   r0   r[  rB   r\  rS   r]   outputsr1   slice_indicesr`  r^  r_  s                    r_   r   DogeForCausalLM.forward7  sh   H %9$D $++JjJj 	
 +/** +
)%+'+
 +
  118B>SV8W8W~ot4]kmA}a,?@A%%fdooPPD/%%  

499T%5%567((H !11HKK4LLL(#33!//))!//
 	
ra   )rY  rO   )	NNNNNNNr   N)rb   rc   rd   re   r   r   r,  r   r
   r-  ro   rk   r   r   r   r   rr   rs   rt   s   @r_   rW  rW  1  s    . .2.204(,26*.!%-.,0O
##d*O
 t+O
 &&-	O

 O
 ((4/O
   4'O
 $;O
 ell*O
 #TkO
 +,O
 
#O
 O
ra   rW  c                       \ rS rSrSrg)DogeForSequenceClassificationi  rY   Nrx   rY   ra   r_   ri  ri    ry   ra   ri  )r'   rW  r>  r/  ri  r   )NNr   N)Prf   r   collections.abcr   typingr   r   torch.nn.functionalr   
functionalr   huggingface_hub.dataclassesr    r   r5  activationsr	   cache_utilsr
   configuration_utilsr   integrations.flex_attentionr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.output_capturingr   llama.modeling_llamar   r   r   r   r   r   r    r!   mixtral.modeling_mixtralr"   r#   
get_loggerrb   logger!torch.nn.attention.flex_attentionr$   r'   rv   r{   Moduler   rm   r   r   r   r   r   r   r  r/  r>  rk   rU  rW  ri  __all__rY   ra   r_   <module>r     sk      $     .  & !   3 J 9 Q 1 A & ^ ^ 4	 	 	 H 
		H	%!!; 01L(! L(  2L(^	, 		. 	 ! +*II+*<<+* 
+* <<	+*
 %,,34+* T\+* T\+* 5<<%&+*\ -. 1G - .wBII wt	h 	6,		 6,r-1 -`;. ;.	 	 #*.g&ell 33d:g&tg& Djg& 	g&
 LL4'g& \\Cg&TU
( U
p	$B 	ra   