
    Z j-c                        S r SSKrSSKrSSKJr  SSKJr  SSKJr  SSK	J
r
JrJr  SSKJr  SS	KJrJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSKJrJrJr  SSKJ r   SSK!J"r"J#r#  SSK$J%r%  \RL                  " \'5      r( " S S\RR                  5      r* " S S\RV                  5      r, " S S\RV                  5      r- " S S\5      r.\ " S S\5      5       r/\ " S S\/5      5       r0\" SS 9 " S! S"\/\5      5       r1/ S#Qr2g)$zPyTorch XGLM model.    N)nn   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions)PreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)merge_with_config_defaults)OutputRecordercapture_outputs   )
XGLMConfigc            
       r   ^  \ rS rSrSrSS\S\S\S\S-  4U 4S jjjrS	\R                  4U 4S
 jjr
SrU =r$ )XGLMScaledWordEmbedding(   zT
class XGLMScaledWordEmbedding(nn.Embedding):
    """
    This module overrides nn.Embedding's forward by multiplying with the embeddings scale.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: float | None = 1.0):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.embed_scale = embed_scale

    def forward(self, input_ids: torch.Tensor):
        # Scale token embeddings by a constant factor (sqrt(d_model) when `scale_embedding` is set).
        return super().forward(input_ids) * self.embed_scale
class XGLMSinusoidalPositionalEmbedding(nn.Module):
    """This module produces sinusoidal positional embeddings of any length."""

    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: int | None = None):
        super().__init__()
        self.offset = 2
        self.num_positions = num_positions
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)

    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: int | None = None):
        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
        if hasattr(self, "weights"):
            # in forward, put the weights on the correct dtype and device of the existing buffer
            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)

        self.register_buffer("weights", emb_weights, persistent=False)

    @staticmethod
    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: int | None = None):
        """
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad the last dimension if embedding_dim is odd
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0

        return emb.to(torch.get_default_dtype())

    @torch.no_grad()
    def forward(self, position_ids: torch.Tensor | None = None, past_key_values_length: int = 0):
        bsz, seq_len = position_ids.size()
        position_ids += self.offset

        # Expand the embedding table if the requested positions exceed it.
        max_pos = 2 + seq_len + past_key_values_length
        if max_pos > self.weights.size(0):
            self.make_weights(max_pos, self.embedding_dim, self.padding_idx)

        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()


class XGLMAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float | None = 0.0,
        is_decoder: bool | None = False,
        bias: bool | None = True,
        layer_idx: int | None = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads

        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.layer_idx = layer_idx

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()
        src_len = key_value_states.shape[1] if is_cross_attention else tgt_len

        # get query proj and apply the pre-softmax scaling here
        query_states = self.q_proj(hidden_states) * self.scaling

        is_updated = False
        if past_key_values is not None:
            if isinstance(past_key_values, EncoderDecoderCache):
                is_updated = past_key_values.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    curr_past_key_values = past_key_values.cross_attention_cache
                else:
                    curr_past_key_values = past_key_values.self_attention_cache
            else:
                curr_past_key_values = past_key_values

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_values is not None and is_updated:
            # reuse k, v from the cross-attention cache
            key_states = curr_past_key_values.layers[self.layer_idx].keys
            value_states = curr_past_key_values.layers[self.layer_idx].values
        else:
            key_states = self.k_proj(current_states)
            value_states = self.v_proj(current_states)
            key_states = key_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
            value_states = value_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

            if past_key_values is not None:
                # save all key/value states to the cache for fast auto-regressive generation
                key_states, value_states = curr_past_key_values.update(key_states, value_states, self.layer_idx)
                # mark the cross-attention cache as filled so it can be reused on subsequent calls
                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
                    past_key_values.is_updated[self.layer_idx] = True

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = query_states.view(bsz, tgt_len, self.num_heads, self.head_dim).transpose(1, 2)
        query_states = query_states.reshape(*proj_shape)
        key_states = key_states.reshape(*proj_shape)
        value_states = value_states.reshape(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = torch.max(
                attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device)
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        # upcast to fp32 if the weights are in fp16 to avoid overflow in the softmax
        if attn_weights.dtype == torch.float16:
            attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(torch.float16)
        else:
            attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
        attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped
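# The helper below is an illustrative sketch added to this annotated copy; it is not part of the
# upstream transformers module, and the function name is hypothetical. It demonstrates the shape
# trick `proj_shape` relies on in `XGLMAttention.forward`: folding the head axis into the batch
# axis lets a single `torch.bmm` compute every per-(batch, head) score matrix at once.
def _attention_shape_walkthrough():  # pragma: no cover - never called by the model code
    bsz, num_heads, tgt_len, head_dim = 2, 4, 5, 16
    q = torch.randn(bsz, tgt_len, num_heads, head_dim).transpose(1, 2).reshape(bsz * num_heads, tgt_len, head_dim)
    k = torch.randn(bsz, tgt_len, num_heads, head_dim).transpose(1, 2).reshape(bsz * num_heads, tgt_len, head_dim)
    scores = torch.bmm(q, k.transpose(1, 2))  # one (tgt_len, src_len) score matrix per (batch, head) pair
    assert scores.shape == (bsz * num_heads, tgt_len, tgt_len)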
class XGLMDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: XGLMConfig, layer_idx: int | None = None):
        super().__init__()
        self.embed_dim = config.d_model
        self.self_attn = XGLMAttention(
            embed_dim=self.embed_dim,
            num_heads=config.attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            layer_idx=layer_idx,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        if config.add_cross_attention:
            self.encoder_attn = XGLMAttention(
                embed_dim=self.embed_dim,
                num_heads=config.attention_heads,
                dropout=config.attention_dropout,
                is_decoder=True,
                layer_idx=layer_idx,
            )
            self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim)
        self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        encoder_hidden_states: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool | None = True,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            past_key_values (`Cache`): cached past key and value projection states
        """
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Self Attention
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            **kwargs,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Cross-Attention Block
        if encoder_hidden_states is not None:
            residual = hidden_states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)
            hidden_states, _ = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                past_key_values=past_key_values,
                **kwargs,
            )
            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
            hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        return hidden_states
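# Another illustrative sketch (hypothetical helper, not upstream code): the pre-LayerNorm residual
# block above is shape-preserving, so stacking decoder layers never changes the
# (batch, seq_len, d_model) layout. Tiny config values are used purely for the check.
def _decoder_layer_shape_check():  # pragma: no cover - never called by the model code
    cfg = XGLMConfig(vocab_size=128, d_model=64, ffn_dim=128, num_layers=2, attention_heads=4)
    layer = XGLMDecoderLayer(cfg, layer_idx=0)
    out = layer(torch.randn(1, 5, 64))  # no mask or cache: plain self-attention over 5 positions
    assert out.shape == (1, 5, 64)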
class XGLMPreTrainedModel(PreTrainedModel):
    config: XGLMConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["XGLMDecoderLayer"]

    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, XGLMSinusoidalPositionalEmbedding):
            emb_weights = module.get_embedding(
                module.num_positions + module.offset, module.embedding_dim, module.padding_idx
            )
            init.copy_(module.weights, emb_weights)


@auto_docstring
class XGLMModel(XGLMPreTrainedModel):
    _can_record_outputs = {
        "attentions": OutputRecorder(XGLMAttention, index=1, layer_name="self_attn"),
        "cross_attentions": OutputRecorder(XGLMAttention, index=1, layer_name="encoder_attn"),
    }

    def __init__(self, config: XGLMConfig):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0

        self.embed_tokens = XGLMScaledWordEmbedding(
            config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
        )
        self.embed_positions = XGLMSinusoidalPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
            config.pad_token_id,
        )
        self.layers = nn.ModuleList([XGLMDecoderLayer(config, layer_idx=i) for i in range(config.num_layers)])
        self.layer_norm = nn.LayerNorm(config.d_model)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    @merge_with_config_defaults
    @capture_outputs
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        encoder_hidden_states: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.Tensor | None = None,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor] | BaseModelOutputWithPastAndCrossAttentions:
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        """
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = (
                EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
                if self.config.is_encoder_decoder
                else DynamicCache(config=self.config)
            )

        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        attention_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
        )

        if position_ids is None:
            position_ids = torch.arange(
                past_key_values_length,
                inputs_embeds.shape[1] + past_key_values_length,
                dtype=torch.long,
                device=input_ids.device if input_ids is not None else inputs_embeds.device,
            )
            position_ids = position_ids.unsqueeze(0)

        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            encoder_attention_mask = create_bidirectional_mask(
                config=self.config,
                input_embeds=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
            )

        hidden_states = inputs_embeds + self.embed_positions(position_ids, past_key_values_length).to(
            inputs_embeds.device
        )
        hidden_states = nn.functional.dropout(hidden_states, p=float(self.dropout), training=self.training)

        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            hidden_states = decoder_layer(
                hidden_states,
                attention_mask,
                encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                past_key_values=past_key_values,
                use_cache=use_cache,
                **kwargs,
            )

        hidden_states = self.layer_norm(hidden_states)

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )
@auto_docstring(
    custom_intro="""
    The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class XGLMForCausalLM(XGLMPreTrainedModel, GenerationMixin):
    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}

    def __init__(self, config):
        super().__init__(config)
        self.model = XGLMModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @merge_with_config_defaults
    @capture_outputs
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        encoder_hidden_states: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        use_cache: bool | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute logits for the requested positions (the last `logits_to_keep` tokens during generation)
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


__all__ = ["XGLMForCausalLM", "XGLMModel", "XGLMPreTrainedModel"]
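# ---------------------------------------------------------------------------------------------
# Illustrative smoke test added to this annotated copy; it is not part of the upstream
# transformers file. It assumes access to the public `facebook/xglm-564M` checkpoint. Because
# this module uses relative imports, run it via the package, e.g.:
#     python -m transformers.models.xglm.modeling_xglm
# A short greedy generation confirms the forward pass and KV cache work end to end.
# ---------------------------------------------------------------------------------------------
if __name__ == "__main__":
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("facebook/xglm-564M")
    model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")

    inputs = tokenizer("I wanted to conserve energy.", return_tensors="pt")
    output_ids = model.generate(**inputs, max_new_tokens=10, do_sample=False)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))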