
    Z j{                     P   S r SSKrSSKJr  SSKJr  SSKrSSKJs  J	r
  SSKJrJrJr  SSKJr  SSKJr  SS	KJrJrJr  SS
KJr  SSKJr  SSKJrJr  SSKJrJrJ r J!r!  SSK"J#r#J$r$  SSK%J&r&J'r'  SSK(J)r)  SSK*J+r+  SSK,J-r-J.r.J/r/J0r0  SSK1J2r2J3r3  SSK4J5r5  SSK6J7r7  \0Rp                  " \95      r:S r; " S S\Rx                  5      r= " S S\R|                  5      r?S r@S:S jrA " S S\R|                  5      rBS \R                  S!\CS"\R                  4S# jrD " S$ S%\R|                  5      rE " S& S'\E5      rF " S( S)\E5      rG\E\F\GS*.rH " S+ S,\!5      rI\. " S- S.\)5      5       rJ\. " S/ S0\J5      5       rK " S1 S2\J\5      rL " S3 S4\\J5      rM " S5 S6\\J5      rN " S7 S8\ \J5      rO/ S9QrPg);zPyTorch Nemotron model.    N)Callable)Optional)SizeTensornn   )initialization)ACT2FN)CacheDynamicCacheStaticCache)GenerationMixin)create_causal_mask)_flash_attention_forward!flash_attn_supports_top_left_mask)GenericForQuestionAnswering GenericForSequenceClassificationGenericForTokenClassificationGradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)maybe_autocastmerge_with_config_defaults)capture_outputs   )NemotronConfigc                     [         R                  " 5       (       d  U$ [         R                  " U 5      n[         R                  R                  R                  XU5      $ N)torchis_autocast_enabledget_autocast_dtypeampautocast_mode_cast)device_typeargstarget_dtypes      /root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/nemotron/modeling_nemotron.py_cast_if_autocast_enabledr1   8   sC    $$&&//<yy&&,,TMM    c            	       n   ^  \ rS rSr     SS\\\   -  \-  S\S\S\4U 4S jjjr	S\
S\
4S	 jrS
rU =r$ )NemotronLayerNorm1P@   normalized_shapeepselementwise_affinebiasc                 (   > [         TU ]  XX4XV5        g r&   )super__init__)selfr6   r7   r8   r9   devicedtype	__class__s          r0   r<   NemotronLayerNorm1P.__init__A   s     	)0B&Xr2   inputreturnc                 r   UR                   R                  S:w  a  UR                   R                  OSn[        X!U R                  U R                  S-   U R
                  U R                  5      n[        UR                   R                  SS9   [        R                  " U6 sS S S 5        $ ! , (       d  f       g = f)Nmpscpur#   Fr-   enabled)
r>   typer1   r6   weightr9   r7   r    F
layer_norm)r=   rB   r-   r.   s       r0   forwardNemotronLayerNorm1P.forwardL   s    +0<<+<+<+Eell''5( 5 5t{{Q		SWS[S[
 (9(95I<<& JIIs   B((
B6 )gh㈵>TTNN)__name__
__module____qualname____firstlineno__intlistr   floatboolr<   r   rM   __static_attributes____classcell__r@   s   @r0   r4   r4   @   so     #'	YS	/D0	Y 	Y !		Y
 	Y 	Y'V ' ' 'r2   r4   c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\	   SS\S-  S\
S   S\S-  S	\S
\4   4S jj5       r\R                  " 5       \S 5       5       rSrU =r$ )NemotronRotaryEmbeddingV   inv_freqNconfigc                   > [         TU ]  5         UR                  U l        UR                  U l        Xl        U R
                  R                  S   U l        U R                  nU R                  S:w  a  [        U R                     nU" U R
                  U5      u  o@l
        U R                  SUSS9  U R                  SUR                  5       SS9  g )N	rope_typedefaultr^   F)
persistentoriginal_inv_freq)r;   r<   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr_   rope_parametersra   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r=   r_   r>   rope_init_fnr^   r@   s        r0   r<    NemotronRotaryEmbedding.__init__Y   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L((ZeD0(..2BuUr2   r>   ztorch.deviceseq_lenrC   ztorch.Tensorc           	      j   U R                   S   nU R                   R                  SS5      n[        U SS5      =(       d    U R                  U R                  -  n[        XT-  5      nSnSU[        R                  " SUS[        R                  S9R                  U[        R                  S	9U-  -  -  nX4$ )
aH  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        """
        base = config.rope_parameters["rope_theta"]
        partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0)
        head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
        dim = int(head_dim * partial_rotary_factor)

        attention_factor = 1.0  # Unused in this type of RoPE

        # Compute the inverse frequencies
        inv_freq = 1.0 / (
            base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
        )
        return inv_freq, attention_factor

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with maybe_autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
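

# --- Editor's illustrative sketch (not part of the original file) -------------------------
# Minimal driver for the rotary embedding above. The config values and tensor shapes are
# assumptions for demonstration only; it relies solely on `forward(x, position_ids)`
# returning the cos/sin tables that `apply_rotary_pos_emb` below consumes, and it assumes
# the default `NemotronConfig` populates `rope_parameters`.
def _example_rotary_embedding_usage():
    config = NemotronConfig(hidden_size=64, num_attention_heads=4, num_key_value_heads=4)
    rope = NemotronRotaryEmbedding(config=config)
    hidden_states = torch.zeros(2, 10, config.hidden_size)  # (batch, seq_len, hidden_size)
    position_ids = torch.arange(10)[None, :].expand(2, -1)  # (batch, seq_len)
    cos, sin = rope(hidden_states, position_ids)
    # cos/sin have shape (batch, seq_len, rotary_dim); with the `unsqueeze_dim=1` default of
    # apply_rotary_pos_emb they broadcast against (batch, heads, seq_len, head_dim) q/k tensors.
    return cos, sin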


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    rot_dim = cos.shape[-1]
    q, q_pass = q[..., :rot_dim], q[..., rot_dim:]
    k, k_pass = k[..., :rot_dim], k[..., rot_dim:]

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return torch.cat((q_embed, q_pass), dim=-1), torch.cat((k_embed, k_pass), dim=-1)


class NemotronMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        return self.down_proj(self.act_fn(self.up_proj(x)))
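

# --- Editor's illustrative sketch (not part of the original file) -------------------------
# Nemotron applies *partial* rotary embeddings: apply_rotary_pos_emb above only rotates the
# first `cos.shape[-1]` channels of q/k and passes the rest through. With cos=1 and sin=0
# the rotation is the identity, which makes the pass-through easy to check. All shapes
# below are arbitrary assumptions.
def _example_partial_rotary():
    batch, heads, seq_len, head_dim, rot_dim = 1, 2, 5, 8, 4
    q = torch.randn(batch, heads, seq_len, head_dim)
    k = torch.randn(batch, heads, seq_len, head_dim)
    cos = torch.ones(batch, seq_len, rot_dim)  # stand-ins for the real rotary tables
    sin = torch.zeros(batch, seq_len, rot_dim)
    q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin)
    assert q_embed.shape == q.shape and torch.equal(q_embed, q)  # identity rotation
    assert k_embed.shape == k.shape and torch.equal(k_embed, k)
    return q_embed, k_embed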


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


class NemotronAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: NemotronConfig, layer_idx: int | None = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = config.head_dim
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.partial_rotary_factor = config.rope_parameters["partial_rotary_factor"]
        self.is_causal = True

        self.rotary_emb = NemotronRotaryEmbedding(config=config)

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attention_mask is not None:
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, -1)
        attn_output = self.o_proj(attn_output)

        return attn_output, attn_weights
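

# --- Editor's illustrative sketch (not part of the original file) -------------------------
# repeat_kv above is what lets grouped-query attention reuse a small KV cache: each of the
# `num_key_value_heads` heads is duplicated `num_key_value_groups` times so the eager/SDPA
# matmuls see matching head counts. Shapes below are assumptions.
def _example_repeat_kv():
    key_states = torch.randn(1, 2, 5, 8)  # (batch, num_key_value_heads=2, seq_len, head_dim)
    expanded = repeat_kv(key_states, n_rep=2)  # -> (batch, num_attention_heads=4, seq_len, head_dim)
    assert expanded.shape == (1, 4, 5, 8)
    # Each head is duplicated in place, matching torch.repeat_interleave on dim=1:
    assert torch.equal(expanded, key_states.repeat_interleave(2, dim=1))
    return expanded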


class NemotronFlashAttention2(NemotronAttention):
    """
    Nemotron flash attention module. This module inherits from `NemotronAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
        if isinstance(past_key_values, StaticCache):
            raise ValueError(
                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
                "make sure to use `sdpa` in the mean time, and open an issue at "
                "https://github.com/huggingface/transformers"
            )

        output_attentions = False

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)

        # Flash attention expects the layout (batch_size, seq_len, num_heads, head_dim)
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.attention_dropout if self.training else 0.0

        # If the hidden states were silently upcast to float32 (e.g. by layer norms kept in
        # fp32 for training stability), cast them back to the expected compute dtype.
        input_dtype = query_states.dtype
        device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_dtype(device_type)
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to the fact"
                f" you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        attn_output = _flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            sliding_window=getattr(self, "sliding_window", None),
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
            is_causal=self.is_causal,
        )

        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output)

        return attn_output, None


class NemotronSdpaAttention(NemotronAttention):
    """
    Nemotron attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `NemotronAttention` as the weights of the module stays untouched. The only changes are on the forward pass to
    adapt to SDPA API.
    """

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
        if output_attentions:
            logger.warning_once(
                f"{self.__class__.__name__} does not support `output_attentions=True`. The returned attention "
                "weights will be `None`. If you want to get attention weights, please set "
                "`attn_implementation='eager'` when loading the model."
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        causal_mask = attention_mask
        if attention_mask is not None:
            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]

        is_causal = causal_mask is None and q_len > 1

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, -1)
        attn_output = self.o_proj(attn_output)

        return attn_output, None


NEMOTRON_ATTENTION_CLASSES = {
    "eager": NemotronAttention,
    "flash_attention_2": NemotronFlashAttention2,
    "sdpa": NemotronSdpaAttention,
}


class NemotronDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: NemotronConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = NEMOTRON_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
        self.mlp = NemotronMLP(config)
        self.input_layernorm = NemotronLayerNorm1P(config.hidden_size, eps=config.norm_eps)
        self.post_attention_layernorm = NemotronLayerNorm1P(config.hidden_size, eps=config.norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool | None = False,
        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
        **kwargs,
    ) -> torch.Tensor:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


@auto_docstring
class NemotronPreTrainedModel(PreTrainedModel):
    config: NemotronConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["NemotronDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _can_compile_fullgraph = True
    _can_record_outputs = {
        "hidden_states": NemotronDecoderLayer,
        "attentions": NemotronAttention,
    }

    @torch.no_grad()
    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, NemotronLayerNorm1P):
            init.ones_(module.weight)
            init.zeros_(module.bias)


@auto_docstring
class NemotronModel(NemotronPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`NemotronDecoderLayer`]

    Args:
        config: NemotronConfig
    """

    def __init__(self, config: NemotronConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [NemotronDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = NemotronLayerNorm1P(config.hidden_size, eps=config.norm_eps)
        self.rotary_emb = NemotronRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    @merge_with_config_defaults
    @capture_outputs
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if position_ids is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens
            position_ids = position_ids.unsqueeze(0)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers:
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                position_embeddings=position_embeddings,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )


class NemotronForCausalLM(NemotronPreTrainedModel, GenerationMixin):
    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}

    def __init__(self, config):
        super().__init__(config)
        self.model = NemotronModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, NemotronForCausalLM

        >>> model = NemotronForCausalLM.from_pretrained("thhaus/nemotron3-8b")
        >>> tokenizer = AutoTokenizer.from_pretrained("thhaus/nemotron3-8b")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```
        """
        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class NemotronForSequenceClassification(GenericForSequenceClassification, NemotronPreTrainedModel):
    pass


class NemotronForQuestionAnswering(GenericForQuestionAnswering, NemotronPreTrainedModel):
    base_model_prefix = "transformer"


class NemotronForTokenClassification(GenericForTokenClassification, NemotronPreTrainedModel):
    pass


__all__ = [
    "NemotronForQuestionAnswering",
    "NemotronForCausalLM",
    "NemotronModel",
    "NemotronPreTrainedModel",
    "NemotronForSequenceClassification",
    "NemotronForTokenClassification",
]