
    Z j              
          S r SSKrSSKJr  SSKJr  SSKrSSKJr  SSKJ	r	J
r
JrJr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSKJrJr  SSKJr  SSK J!r!J"r"J#r#J$r$J%r%  SSK&J'r'J(r(  SSK)J*r*  SSK+J,r,J-r-  SSK.J/r/  SSK0J1r1  \" 5       (       a  SSKJ2r2  \-Rf                  " \45      r5 " S S\Rl                  5      r7S r8SAS jr9 " S S\Rt                  5      r;S\Rx                  S\=S \R|                  S!\Rx                  4S" jr?S#\Rx                  S$\Rx                  S%\@S&\AS!\Rx                  4
S' jrB " S( S)\Rt                  5      rC " S* S+\C5      rD " S, S-\Rt                  5      rE\C\C\DS..rF " S/ S0\5      rG\, " S1 S2\*5      5       rH\, " S3 S4\H5      5       rI\," S5S69 " S7 S8\H\5      5       rJ\," S9S69 " S: S;\H5      5       rK\, " S< S=\H5      5       rL\, " S> S?\H5      5       rM/ S@QrNg)BzPyTorch Falcon model.    N)Callable)Optional)nn)BCEWithLogitsLossCrossEntropyLoss	LayerNormMSELoss)
functional   )initialization)get_activation)CacheDynamicCache)GenerationMixin)create_causal_mask)!flash_attn_supports_top_left_maskis_flash_attn_available)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsQuestionAnsweringModelOutput SequenceClassifierOutputWithPastTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)PreTrainedModel)auto_docstringlogging)maybe_autocast   )FalconConfig)_flash_attention_forwardc                   N    \ rS rSrS\R
                  S\R
                  4S jrSrg)FalconLinear<   inputreturnc                 l    XR                   R                  -  nU R                  c  U$ X R                  -   $ N)weightTbias)selfr&   hidden_statess      {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/falcon/modeling_falcon.pyforwardFalconLinear.forward=   s/    -99  yy((     N)__name__
__module____qualname____firstlineno__torchTensorr0   __static_attributes__r3   r2   r/   r$   r$   <   s    )U\\ )ell )r2   r$   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..N   dim)shaper8   cat)xx1x2s      r/   rotate_halfrE   E   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r2   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXV4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezerE   )qkcossinunsqueeze_dimq_embedk_embeds          r/   apply_rotary_pos_embrO   M   sS    $ --
&C
--
&Cw;q>C/0Gw;q>C/0Gr2   c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\	   SS\S-  S\
S   S\S-  S	\S
\4   4S jj5       r\R                  " 5       \S 5       5       rSrU =r$ )FalconRotaryEmbeddingg   inv_freqNconfigc                   > [         TU ]  5         UR                  U l        UR                  U l        Xl        U R
                  R                  S   U l        U R                  nU R                  S:w  a  [        U R                     nU" U R
                  U5      u  o@l
        U R                  SUSS9  U R                  SUR                  5       SS9  g )N	rope_typedefaultrS   F)
persistentoriginal_inv_freq)super__init__max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrT   rope_parametersrV   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r-   rT   devicerope_init_fnrS   	__class__s        r/   r[   FalconRotaryEmbedding.__init__j   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L((ZeD0(..2BuUr2   rd   ztorch.deviceseq_lenr'   ztorch.Tensorc           	         U R                   S   n[        U SS5      =(       d    U R                  U R                  -  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXe4$ )	aH  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetahead_dimN      ?r   r=   dtyperd   rn   )	r_   getattrhidden_sizenum_attention_headsr8   arangeint64tofloat)rT   rd   rh   baser?   attention_factorrS   s          r/   r`   5FalconRotaryEmbedding.compute_default_rope_parametersz   s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 ))r2   c                 L   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r<   r    mpscpuF)device_typeenabledr=   r>   rm   )rS   rv   expandr@   ru   rd   
isinstancetypestrr   	transposer8   rA   rJ   ra   rK   rn   )
r-   rB   position_idsinv_freq_expandedposition_ids_expandedr}   freqsembrJ   rK   s
             r/   r0   FalconRotaryEmbedding.forward   sN    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   BF
F#)ra   rT   r]   r^   rV   r)   )NNN)r4   r5   r6   r7   r8   r9   __annotations__r!   r[   staticmethodr   inttuplerv   r`   no_gradr   r0   r:   __classcell__rf   s   @r/   rQ   rQ   g   s    llV| V V  &*+/"*t#*(* t* 
~u$	%	* *: ]]_<  <r2   rQ   attention_mask	num_headsrn   r'   c                    U R                   u  p4S[        R                  " [        R                  " U5      5      -  n[        R
                  " SS[        R                  " U5      S-
  * -  * -  U R                  [        R                  S9n[        R                  " SSU-   U R                  [        R                  S9n[        R                  " Xg5      nXQ:w  a  [        R
                  " SS[        R                  " SU-  5      S-
  * -  * -  U R                  [        R                  S9n	[        XQU-
  5      n
[        R                  " SSSU
-  -   SU R                  [        R                  S9n[        R                  " U[        R                  " X5      /SS9nU R                  SS9S-
  U -  S S 2S S S 24   nUS   R                  5       U-  nUR                  X1-  SU5      R!                  U5      $ )	Nr=   r   ro   r    r   r>   r<   ).N)r@   mathfloorlog2r8   tensorrd   float32rs   int32powminrA   cumsumbfloat16reshaperu   )r   r   rn   
batch_size
seq_lengthclosest_power_of_2rw   powersslopes
extra_basenum_remaining_headsextra_powersarange_tensoralibis                 r/   build_alibi_tensorr      s   +11Jdjj9)=>><<	tyy!34q899:;NDYDYafananD \\!Q!33N<Q<QY^YdYdeFYYt$F&\\A499Q);%;<q@AABCNLaLainiviv

 ""4BT6TU||Aq1/B+B'BAnNcNckpkvkvwFEIIj$GHaP %+++3a7>I1dTU:VM9&&(=8E==/J?BB5IIr2   rB   residualprobtrainingc                 8    [         R                  " XUS9nX-   nU$ )z
Dropout add function

Args:
    x (`torch.tensor`):
        input tensor
    residual (`torch.tensor`):
        residual tensor
    prob (`float`):
        dropout probability
    training (`bool`):
        training mode
)pr   )Fdropout)rB   r   r   r   outs        r/   dropout_addr      s      ))A
1C
.CJr2   c                     ^  \ rS rSrSS\4U 4S jjjrS\R                  S\\R                  \R                  \R                  4   4S jr	S\R                  S\R                  4S	 jr
     SS
\R                  S\R                  S-  S\R                  S\R                  S-  S\S-  S\S\S\\R                  \R                  4   S-  4S jjrSrU =r$ )FalconAttention   NrT   c                 0  > [         TU ]  5         Xl        UR                  U l        UR                  U l        U R                  U R
                  -  U l        U R                  U l        UR                  U l        UR                  U l	        SU l
        X l        Uc-  [        R                  SU R                  R                   S35        U R                  U R
                  -  U R                  :w  a&  [!        SU R                   SU R
                   S35      eS["        R$                  " U R                  5      -  U l        U R&                  U l        UR*                  (       a*  UR,                  S-  UR                  -   U R                  -  nO=UR.                  (       a  U R                  SU R                  -  -   nOS	U R                  -  n[1        U R                  X1R2                  S
9U l        UR*                  U l        UR.                  U l        [1        U R                  U R                  UR2                  S
9U l        [8        R:                  " UR<                  5      U l        U R*                  (       d  U R.                  (       d  UR,                  U l        g SU l        g )NTzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.zA`hidden_size` must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).rl   r=   r   r,   r    )rZ   r[   rT   rq   rr   r   rk   
split_sizehidden_dropoutr\   	is_causal	layer_idxloggerwarning_oncerf   r4   
ValueErrorr   sqrtinv_norm_factorbetanew_decoder_architecturenum_kv_headsmulti_queryr$   r,   query_key_valuedenser   Dropoutattention_dropout)r-   rT   r   qkv_out_dimrf   s       r/   r[   FalconAttention.__init__   s   !--33((DNN:**$33'-'E'E$" !8!8 9 :, , ==4>>)T-=-==STXTdTdSe fNN#2'   #TYYt}}%==((	**!..2V5O5OOSWS`S``K**Q->>Kd...K+D,<,<kP[P[\(.(G(G%!--!$"2"2D4D4D6;;W
!#F,D,D!E484Q4QY]YiYiF//pqr2   	fused_qkvr'   c                 r   U R                   (       a  UR                  u  p#nUR                  X#SU R                  U R                  -  S-   U R
                  5      nUSS2SS2SS2SS24   nUSS2SS2SS2S/4   nUSS2SS2SS2S/4   n[        R                  " XvR                  5      n[        R                  " XR                  5      nXgU4 V	s/ s H  oR                  SS5      PM     sn	u  pgnXgU4$ U R                  (       dT  UR                  u  pnUR                  XU R                  SU R
                  5      nUSSSS24   USSSS24   USSSS24   4$ UR                  u  pnUR                  XU R                  S-   U R
                  5      nUSSS2SS24   USS/SS24   USS/SS24   4$ s  sn	f )	ap  
Split the last dimension into (num_heads, head_dim), results share same memory storage as `fused_qkv`

Args:
    fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]

Returns:
    query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
    value: [batch_size, seq_length, num_heads, head_dim]
r<   r=   Nr   .r   r    )
r   r@   viewr   r   rk   r8   broadcast_toflattenr   )r-   r   batchrh   _qkvquerykeyvaluerB   r   r   three_times_hidden_sizes                r/   _split_headsFalconAttention._split_heads  s    (( )EA..T^^tGXGX5X[\5\^b^k^klC1a"%EaAtm$C1a"&E$$S++6C&&ukk:E;@u:M N:MQ1a:M NEu$$!!>Goo;J$;!zt~~qRVR_R_`IS!QY'319)=yaQR?SSS>Goo;J$;!zt~~PQ?QSWS`S`aIS#2#q[)9S2$\+BIcTVSWYZlD[[[ !Os   F4rB   c                    UR                   u  p#nX R                  -  nUR                  XPR                  X0R                  5      nUR	                  SSSS5      nUR                  XSU R                  U R                  -  5      $ )z
Merge heads together over the last dimension

Args:
    x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim]

Returns:
    torch.tensor: [batch_size, seq_length, num_heads * head_dim]
r   r=   r    r   )r@   r   r   rk   permuter   )r-   rB   batch_size_and_num_headsr   r   r   s         r/   _merge_headsFalconAttention._merge_heads#  so     34''/ a-?
 FF:~~z==I IIaAq! yy$--1OPPr2   r.   r   r   r   
layer_past	use_cacheoutput_attentionsposition_embeddingsc	           	      b	   U R                  U5      n
U R                  (       a  U R                  OU R                  nU R	                  U
5      u  pnUR
                  u  nn  nUR                  SS5      R                  XR                  UU R                  5      nUR                  SS5      R                  XUU R                  5      nUR                  SS5      R                  XUU R                  5      nUc  Uu  nn[        XUU5      u  pUb  UR                  XU R                  5      u  pUR
                  S   nUGcL  U R                  R                  S:X  aY  U(       dR  U R                  =(       a    US L =(       a    US:  n[        R                   R"                  R%                  UUUUSUS9nS nO_XR                  SS5      -  nU[&        R(                  " U R                  5      -  n[*        R,                  " UU-   SUR.                  S9nUU-  nUR1                  XR                  UU R                  5      nUR3                  S	SSS
5      nUR                  UUU R                  U R                  -  5      nU R5                  U5      nUU4$ U R                  R                  S:X  a  U(       d  U R                  =(       a    US L =(       a    US:  n[        R                   R"                  R%                  UUUUU R6                  (       a  U R8                  R:                  OSUS9nS nUR                  SS5      nUR                  UUU R                  U R                  -  5      nU R5                  U5      nUU4$ XR                  SS5      -  nUR1                  XR                  UU5      nUR.                  nU[        R<                  :X  d  U[        R>                  :X  a  URA                  [        RB                  5      nUUR1                  XR                  SS5      -   nUU RD                  -  n[*        R,                  " UU-   SUR.                  S9nU R9                  U5      nUR1                  XR                  UU5      nUU-  RG                  S	S5      nU RI                  U5      nU R5                  U5      nUU4$ )Nr    r=   r   sdpa        )	attn_mask	dropout_pr   r<   )r?   rn   r   r   )%r   r   r   r   r   r@   r   r   rk   rO   updater   rT   _attn_implementationr   r8   r   r
   scaled_dot_product_attentionr   r   r   softmaxrn   r   r   r   r   r   r   float16r   ru   r   r   r   r   )r-   r.   r   r   r   r   r   r   r   kwargsr   r   query_layer	key_layervalue_layerr   query_lengthr   rJ   rK   	kv_lengthr   attn_outputattention_scoresattention_probsmatmul_resultinput_dtypeattention_logitsattention_probs_reshapeds                                r/   r0   FalconAttention.forward<  sf    ((7	)-)F)Ft~~DL]L]040A0A)0L-)4):):&
L!Q!++Aq199*nnVbdhdqdqr''1-55jP\^b^k^kl	!++Aq199*T`bfbobop=*HC%9+RUWZ%["K!%/%6%6yt~~%^"IOOB'	={{//69BS
 !NNZ~/EZ,YZJZ	#hh11NN,!' O  $( #.1D1DR1L#L  DIIdmm$<< #$99-=-NTV^k^q^q#r .<%**:~~|UYUbUbcK%--aAq9K%--j,Y]YfYfHfgK**[1K 000 {{//69BS !NNZ~/EZ,YZJZ	#hh11NN,:>--d4466S' O  #')33Aq9)11*lDNN]a]j]jLjk"jj5< //9 !,.A.A"b.I I $1#5#5j..R^`i#j  /44%--/;%..3P'7':':5=='I$#3ejj^^]^`b6c#c  D$8$88 "#)),<~,MSU]j]p]p"q"&"8"8"I ,;+?+?
NN\hjs+t(  8+ENNqRST #//<"jj5//r2   )r   r   rT   r   rk   r   rq   r   r   r   r\   r   r   r   r   r   r   r)   NNFFN)r4   r5   r6   r7   r!   r[   r8   r9   r   r   r   
LongTensorr   boolr0   r:   r   r   s   @r/   r   r      s   (r| (r (rT\ell \uU\\5<<Y^YeYe=e7f \@Qell Qu|| Q< 15#'"'HLo0||o0 ||d"o0 	o0
 &&-o0 DLo0 o0  o0 #5<<#=>Eo0 o0r2   r   c                      ^  \ rS rSrSrU 4S jr     SS\R                  S\R                  S-  S\R                  S\R                  S-  S	\	S-  S
\
S\
S\\R                  \R                  4   S-  4S jjrSrU =r$ )FalconFlashAttention2i  a8  
Falcon flash attention module. This module inherits from `FalconAttention` as the weights of the module stays
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
c                 D   > [         TU ]  " U0 UD6  [        5       U l        g r)   )rZ   r[   r   _flash_attn_uses_top_left_mask)r-   argsr   rf   s      r/   r[   FalconFlashAttention2.__init__  s#    $)&)
 /P.Q+r2   Nr.   r   r   r   r   r   r   r   c	                 J   U R                  U5      n
U R                  (       a  U R                  OU R                  nU R	                  U
5      u  pnUR
                  u  nn  nUR                  SS5      R                  XR                  UU R                  5      nUR                  SS5      R                  XUU R                  5      nUR                  SS5      R                  XUU R                  5      nUc  Uu  nn[        XUU5      u  pUb  UR                  XU R                  5      u  pUR                  SS5      nUR                  SS5      nUR                  SS5      nUb  [        S5      eU R                  (       a  U R                  R                  OSnUR                   nUR"                  R$                  S:w  a  UR"                  R$                  OSnU[&        R(                  :X  a  [&        R*                  " U5      (       a  [&        R,                  " U5      nOR[/        U R                  S5      (       a  U R                  R                   nO U R                   R0                  R                   n[2        R5                  SU S	35        UR7                  U5      nUR7                  U5      nUR7                  U5      n[9        UUUUUUUU R:                  U R<                  S
9	nUR                  UUU R                  U R                  -  5      nU R?                  U5      nU(       d  S nUU4$ )Nr    r=   z6`alibi` is not supported when `use_flash_attn` is Truer   r{   r|   _is_quantizedzThe input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in .)r   r   r   use_top_left_mask) r   r   r   r   r   r@   r   r   rk   rO   r   r   r   r   rT   r   rn   rd   r   r8   r   is_autocast_enabledget_autocast_dtypehasattrr*   r   r   ru   r"   r   r  r   )r-   r.   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rJ   rK   attn_dropoutr   r}   target_dtyper   attn_weightss                             r/   r0   FalconFlashAttention2.forward  s    ((7	)-)F)Ft~~DL]L]040A0A)0L-)4):):&
L!Q!++Aq199*nnVbdhdqdqr''1-55jP\^b^k^kl	!++Aq199*T`bfbobop=*HC%9+RUWZ%["K!%/%6%6yt~~%^"I "++Aq1''1-	!++Aq1UVV8<t{{443
 "''1<1C1C1H1HE1Qk((--W\%--'((55$77Do66#{{00#33::@@ >$ &..6K!\2I%..6K.% nn"AA

 #**:|T^^VZVcVcEcdjj. LL((r2   )r  r   )r4   r5   r6   r7   __doc__r[   r8   r9   r   r   r   r   r0   r:   r   r   s   @r/   r   r     s    R 15#'"'HLS)||S) ||d"S) 	S)
 &&-S) DLS) S)  S) #5<<#=>ES) S)r2   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	FalconMLPi  rT   c                   > [         TU ]  5         UR                  n[        X!R                  UR
                  S9U l        [        UR                  5      U l	        [        UR                  X!R
                  S9U l
        UR                  U l        g )Nr   )rZ   r[   rq   r$   ffn_hidden_sizer,   dense_h_to_4hr   
activationactdense_4h_to_hr   )r-   rT   rq   rf   s      r/   r[   FalconMLP.__init__  sp    (()+7M7MTZT_T_`!&"3"34)&*@*@+T_T_`$33r2   rB   r'   c                 h    U R                  U R                  U5      5      nU R                  U5      nU$ r)   )r  r  r  )r-   rB   s     r/   r0   FalconMLP.forward  s0    HHT''*+q!r2   )r  r  r  r   )r4   r5   r6   r7   r!   r[   r8   r9   r0   r:   r   r   s   @r/   r  r    s/    4| 4 %,,  r2   r  )eagerr   flash_attention_2c                   >  ^  \ rS rSrSS\4U 4S jjjr     SS\R                  S\R                  S-  S\R                  S\R                  S-  S	\	\
\R                  \R                  4   -  S-  S
\S\S\
\R                  \R                  4   S-  4S jjrSrU =r$ )FalconDecoderLayeri*  NrT   c                 f  > [         TU ]  5         UR                  nUR                  U l        [
        UR                     " X5      U l        [        U5      U l	        UR                  U l
        Xl        UR                  c  UR                  (       a  SUl        UR                  (       d1  [        X1R                   S9U l        [        X1R                   S9U l        g UR                  S:X  a1  [        X1R                   S9U l        [        X1R                   S9U l        g [        X1R                   S9U l        g )Nr=   eps)rZ   r[   rq   rr   r   FALCON_ATTENTION_CLASSESr   self_attentionr  mlpr   rT   num_ln_in_parallel_attnr   parallel_attnr   layer_norm_epsilonpost_attention_layernorminput_layernormln_attnln_mlp)r-   rT   r   rq   rf   s       r/   r[   FalconDecoderLayer.__init__+  s    ((336v7R7RSTZfV$$33))1f6U6U-.F*##,5kG`G`,aD)#,[>W>W#XD --2(:S:ST'9R9RS'0B[B['\$r2   r.   r   r   r   r   r   r   r   c	                    Un
U R                   R                  (       a=  U R                   R                  S:X  a#  U R                  U5      nU R	                  U5      nOU R                  U5      nU R                  UUUUUUUUS9u  pU R                   R                  (       dX  U R                   R                  (       a  UnO:[        XU R                   R                  U R                  S9n
U R                  U
5      nU R                   R                  (       a7  U R                   R                  (       a  U R                   R                  S:X  a  UnU R                  W5      nU R                   R                  (       d  U R                   R                  (       a  X-  n[        XU R                   R                  U R                  S9nUU4$ )Nr=   )r   r   r   r   r   r   r   )r   r    )rT   r   r%  r*  r+  r)  r#  r&  r   r   r   r(  r$  r   )r-   r.   r   r   r   r   r   r   r   r   r   attention_layernorm_outmlp_layernorm_outattention_outputr  
mlp_outputoutputs                    r/   r0   FalconDecoderLayer.forwardD  sq    !;;//DKK4W4W[\4\&*ll=&A# $M :&*&:&:=&I# *.)<)<#!)%/ 3 *= 	*
& {{33{{(($;!&$0M0MX\XeXe %)$A$A($K! KK00))33q8 7 XX/0
;;//4;;3L3L*JZ4;;3M3MX\XeXef|##r2   )	rT   r   r)  r*  r+  r$  r   r(  r#  r)   r   )r4   r5   r6   r7   r!   r[   r8   r9   r   r   r   r   r0   r:   r   r   s   @r/   r  r  *  s    ]| ] ]< 15GK"'HL8$||8$ ||d"8$ 	8$
 &&-8$ E%,,"<==D8$ 8$  8$ #5<<#=>E8$ 8$r2   r  c                      ^  \ rS rSr% \\S'   SrSrS/rSr	Sr
Sr\R                  " 5       S\R                  4U 4S jj5       r\SS\4S	 jj5       rS
rU =r$ )FalconPreTrainedModeli  rT   transformerTr  modulec                   > [         TU ]  U5        [        U[        5      (       ac  [        R
                  " UR                  SU R                  R                  S9  UR                  b!  [        R                  " UR                  5        ggg)zInitialize the weights.r   )meanstdN)rZ   _init_weightsr   r$   initnormal_r*   rT   initializer_ranger,   zeros_)r-   r7  rf   s     r/   r;  #FalconPreTrainedModel._init_weights  sa     	f%fl++LLSdkk6S6ST{{&FKK( ' ,r2   hard_check_onlyc                 N    [        U SS5      nU(       a  U$ U(       d  SUl        U$ )Nuse_bettertransformerFr   )rp   r   )clsrT   rA  _is_bettertransformers       r/   _check_and_enable_sdpa,FalconPreTrainedModel._check_and_enable_sdpa  s*     '-De L M*0F'r2   r3   )F)r4   r5   r6   r7   r!   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_can_compile_fullgraphr8   r   r   Moduler;  classmethodr   rF  r:   r   r   s   @r/   r5  r5    sm    %&*#-.N!
]]_)BII ) ) T  r2   r5  c                   T  ^  \ rS rSrS\4U 4S jjrS rS\R                  4S jr	\
         SS\R                  S-  S	\S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\S-  S\S-  S\\R                  S4   \-  4S jj5       rSrU =r$ )FalconModeli  rT   c           
        > [         TU ]  U5        UR                  U l        UR                  U l        UR                  U l        [        R                  " UR                  U R                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        [#        U R                  UR$                  S9U l        SU l        [+        US9U l        U R/                  5         g s  snf )N)r   r   FrT   )rZ   r[   rq   	embed_dimrr   r   r   	use_alibir   	Embedding
vocab_sizeword_embeddings
ModuleListrangenum_hidden_layersr  hr   r'  ln_fgradient_checkpointingrQ   
rotary_emb	post_init)r-   rT   irf   s      r/   r[   FalconModel.__init__  s     ++33  "||F,=,=t~~N QVW]WoWoQpqQpA 26 GQpqr dnn&2K2KL	&+#/v> 	  rs   Dc                     U R                   $ r)   rX  )r-   s    r/   get_input_embeddings FalconModel.get_input_embeddings  s    ###r2   new_embeddingsc                     Xl         g r)   rd  r-   rg  s     r/   set_input_embeddings FalconModel.set_input_embeddings  s    -r2   N	input_idspast_key_valuesr   r   inputs_embedsr   r   output_hidden_statesreturn_dictr'   .c
                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	USL USL-  (       a  [        S5      eU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nU(       a  Uc  [        U R                   S9nSnUb  UR                  5       OSnUR                  u  pnU R                  (       aV  Uc2  [        R                   " XU-   4UR"                  [        R$                  S9OUn['        UU R(                  UR*                  S9nUcU  Ub  UR                  5       OSn[        R,                  " UR                  S	   UR"                  S
9U-   nUR/                  S5      n[1        U R                   UUUS S9nUGb  UGb  UR2                  S:X  a  [        R4                  " UR*                  5      R6                  nUR*                  [        R8                  :X  a@  [        R:                  " U[        R<                  " SUR"                  UR*                  S9U5      nUR>                  " US/UR                  S	S Q76 n[        R@                  " U[B        RD                  " U R                   RF                  U R(                  -  5      -  US:  U5      nUnU RI                  UUS9nU(       a  SOSnU(       a  SOSn[K        U RL                  5       H7  u  nnU(       a  UU4-   nU" UUUUUUUUS9nUS   nU(       d  M.  UUS	   4-   nM9     U RO                  U5      nU(       a  UU4-   nU	(       d  [Q        S UUUU4 5       5      $ [S        UUUUS9$ )j  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
Nz:You must specify exactly one of input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FrS  r   ro   rm   r    rd   c                  H    [         R                  " S[         R                  S9$ )NTrm   )r8   r   r   )r  s    r/   <lambda>%FalconModel.forward.<locals>.<lambda>  s    ELLUZZ,Pr2   )rT   rn  r   rm  and_mask_function   r   r<   )r   r3   )r   r   r   r   r   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr)   r3   ).0vs     r/   	<genexpr>&FalconModel.forward.<locals>.<genexpr>9  s      cacs   	)last_hidden_staterm  r.   
attentions)*rT   r   ro  r   rp  r   r^  r   r   r   rX  r   get_seq_lengthr@   rU  r8   onesrd   longr   r   rn   rs   rG   r   ndimfinfor   r   wherer   r   masked_fillr   r   rq   r_  	enumerater\  r]  r   r   )r-   rl  rm  r   r   rn  r   r   ro  rp  r   r   past_key_values_lengthr   r   r   maskpast_seen_tokenscausal_mask	min_dtyper.   r   all_self_attentionsall_hidden_statesra  blockoutputss                              r/   r0   FalconModel.forward  s   6 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++BYBY-t";<YZZ&&4==##p "	  00;M0*$++>O ETE`!?!?!Afg$1$7$7!
>>
 ") 

.D!DEmNbNbjojtjt $  'tT^^=CVCVWECRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+P
 !8[=M=MQR=RM$7$78<<I   EJJ.#kkc+:L:LTaTgTg!hjs
 MM*bC5;;qr?CE++		$++"9"9T^^"KLLb K &"oom,oW$5b4"6BD!$&&)HAu#$58H$H!**)#"3$7	G $AJM  &9WQZM&I## *( 		-0 1]4D D )?<MObc   9+++*	
 	
r2   )rT  r^  r\  r]  r   r_  rU  rX  	NNNNNNNNN)r4   r5   r6   r7   r!   r[   re  r8   r9   rj  r   r   r   r   r   r   r0   r:   r   r   s   @r/   rQ  rQ    s   | *$.5<< .  .2(,.20415!%)-,0#'F
##d*F
 F
 t+	F

 &&-F
 ''$.F
 $;F
  $;F
 #TkF
 D[F
 
u||S 	!$M	MF
 F
r2   rQ  z
    The Falcon Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings).
    )custom_introc                     ^  \ rS rSrSS0rS\4U 4S jjrS\R                  4S jr	\
           SS	\R                  S-  S
\S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\S-  S\S-  S\\R                  -  S\\R                     \-  4S jj5       rSrU =r$ )FalconForCausalLMiE  zlm_head.weightz"transformer.word_embeddings.weightrT   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g NFr   )
rZ   r[   rQ  r6  r   Linearrq   rW  lm_headr`  r-   rT   rf   s     r/   r[   FalconForCausalLM.__init__M  sI     &v.yy!3!3V5F5FUS 	r2   rg  c                     Xl         g r)   )r  ri  s     r/   set_output_embeddings'FalconForCausalLM.set_output_embeddingsU  s    %r2   Nrl  rm  r   r   rn  labelsr   r   ro  rp  logits_to_keepr'   c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnUb*  U R                  " UU4SU R                   R                  0UD6nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
N)rm  r   r   rn  r   r   ro  rp  r   rW  r    losslogitsrm  r.   r  )rT   rp  r6  r   r   slicer  loss_functionrW  r   rm  r.   r  )r-   rl  rm  r   r   rn  r  r   r   ro  rp  r  r   transformer_outputsr.   slice_indices	lm_logitsr  r2  s                      r/   r0   FalconForCausalLM.forwardX  s-   D &1%<k$++BYBY"..+)%'/!5# / 

 ,A.8B>SV8W8W~ot4]kLLq-/B!CD	%%  ;;11 	D \$7$;;F)-)9TGf$EvE0/??-;;*55
 	
r2   )r  r6  )NNNNNNNNNNr   )r4   r5   r6   r7   _tied_weights_keysr!   r[   r8   r9   r  r   r   r   r   r   r   r   r0   r:   r   r   s   @r/   r  r  E  sE    +,PQ| &ELL &  .2(,.204-1&*!%)-,0#'-.F
##d*F
 F
 t+	F

 &&-F
 ||d*F
 t#F
 $;F
  $;F
 #TkF
 D[F
 ell*F
 
u||	@	@F
 F
r2   r  a  
    The Falcon Model transformer with a sequence classification head on top (linear layer).

    [`FalconForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                   (  ^  \ rS rSrS\4U 4S jjr\         SS\R                  S-  S\	S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\S-  S\S-  S\\R                     \-  4S jj5       rSrU =r$ )FalconForSequenceClassificationi  rT   c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g r  )
rZ   r[   
num_labelsrQ  r6  r   r  rq   scorer`  r  s     r/   r[   (FalconForSequenceClassification.__init__  sV      ++&v.YYv1163D3D5Q
 	r2   Nrl  rm  r   rn  r  r   r   ro  rp  r'   c
                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                   R
                  c  US:w  a  [        S5      eU R                   R
                  c  SnOUb  XR                   R
                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                  R                    S35        U[        R                  " XR                  S	9U4   nSnUGbg  U R                   R"                  c  U R$                  S:X  a  S
U R                   l        OoU R$                  S:  aN  UR&                  [        R(                  :X  d  UR&                  [        R*                  :X  a  SU R                   l        OSU R                   l        U R                   R"                  S
:X  aJ  [-        5       nU R$                  S:X  a&  U" UR/                  5       UR/                  5       5      nOeU" UU5      nO[U R                   R"                  S:X  a  [1        5       nU" UU5      nO-U R                   R"                  S:X  a  [3        5       nU" UU5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [5        UUUR6                  UR8                  UR:                  S9$ )  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nrm  r   rn  r   r   ro  rp  r   r    z=Cannot handle batch sizes > 1 if no padding token is defined.r<   ro   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rs  
regressionsingle_label_classificationmulti_label_classificationr  )rT   rp  r6  r  r@   pad_token_idr   ru   rd   r8   r   rs   argmaxr   r   rf   r4   problem_typer  rn   r  r   r	   squeezer   r   r   rm  r.   r  )r-   rl  rm  r   rn  r  r   r   ro  rp  r   r  r.   r  r   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr  loss_fctr2  s                         r/   r0   'FalconForSequenceClassification.forward  s   @ &1%<k$++BYBY"..+)'/!5# / 	
 ,A.M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||J}}MOaab{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+-v6))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r2   )r  r  r6  r  )r4   r5   r6   r7   r!   r[   r   r8   r   r   r9   r   r   r   r0   r:   r   r   s   @r/   r  r    s    |   .2(,.2-1&*!%)-,0#'f
##d*f
 f
 t+	f

 ||d*f
 t#f
 $;f
  $;f
 #Tkf
 D[f
 
u||	?	?f
 f
r2   r  c                   (  ^  \ rS rSrS\4U 4S jjr\         SS\R                  S-  S\	S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\S-  S\S-  S\\R                     \-  4S jj5       rSrU =r$ )FalconForTokenClassificationi$  rT   c                   > [         TU ]  U5        UR                  U l        [        U5      U l        [        USS 5      b  UR                  nO[        USS 5      b  UR                  nOSn[        R                  " U5      U l
        [        R                  " UR                  UR                  5      U l        U R                  5         g )Nclassifier_dropoutr   g?)rZ   r[   r  rQ  r6  rp   r  r   r   r   r   r  rq   
classifierr`  )r-   rT   r  rf   s      r/   r[   %FalconForTokenClassification.__init__&  s      ++&v.6/6B!'!:!:V-t4@!'!6!6!$zz"45))F$6$68I8IJ 	r2   Nrl  rm  r   rn  r  r   r   ro  rp  r'   c
                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nU R	                  U5      nSnUbQ  UR
                  u  nn[        5       nU" UR                  UU-  U R                  5      UR                  UU-  5      5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )r  Nr  r   r=   )r  r  r.   r  )rT   rp  r6  r   r  r@   r   r   r  r   r.   r  )r-   rl  rm  r   rn  r  r   r   ro  rp  r   r  r.   r  r  r   r   r  r2  s                      r/   r0   $FalconForTokenClassification.forward7  s   @ &1%<k$++BYBY"..+)'/!5# / 	
 ,A.]3/%+\\"J
')HJ3T__Ev{{S]`jSjGkD Y!4QR!88F)-)9TGf$EvE$-;;*55	
 	
r2   )r  r   r  r6  r  )r4   r5   r6   r7   r!   r[   r   r8   r   r   r9   r   r   r   r0   r:   r   r   s   @r/   r  r  $  s    | "  .2(,.2-1&*!%)-,0#'A
##d*A
 A
 t+	A

 ||d*A
 t#A
 $;A
  $;A
 #TkA
 D[A
 
u||	4	4A
 A
r2   r  c                     ^  \ rS rSrU 4S jr\        SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\	S-  S
\	S-  S\	S-  S\
\-  4S jj5       rSrU =r$ )FalconForQuestionAnsweringi|  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  S5      U l        U R                  5         g )Nr=   )	rZ   r[   rQ  r6  r   r  rq   
qa_outputsr`  r  s     r/   r[   #FalconForQuestionAnswering.__init__~  sA     &v.))F$6$6: 	r2   Nrl  r   rn  start_positionsend_positionsr   ro  rp  r'   c	           	         Ub  UOU R                   R                  nU R                  UUUUUUS9n
U
S   nU R                  U5      nUR	                  SSS9u  pUR                  S5      R                  5       nUR                  S5      R                  5       nSnUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" X5      nUU-   S-  nU(       d  X4U
SS -   nUb  U4U-   $ U$ [        UUUU
R                  U
R                  S	9$ )
rr  N)r   rn  r   ro  rp  r   r    r<   r>   )ignore_indexr=   )r  start_logits
end_logitsr.   r  )rT   rp  r6  r  splitr  
contiguouslensizeclampr   r   r.   r  )r-   rl  r   rn  r  r  r   ro  rp  r   r  sequence_outputr  r  r  
total_lossignored_indexr  
start_lossend_lossr2  s                        r/   r0   "FalconForQuestionAnswering.forward  s   4 &1%<k$++BYBY"")'/!5# # 
 "!*1#)<<r<#: #++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
:H$x/14J"/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r2   )r  r6  )NNNNNNNN)r4   r5   r6   r7   r[   r   r8   r   FloatTensorr   r   r   r0   r:   r   r   s   @r/   r  r  |  s      .237263715)-,0#'F
##d*F
 ))D0F
 ((4/	F

 ))D0F
 ''$.F
  $;F
 #TkF
 D[F
 
-	-F
 F
r2   r  )r  rQ  r5  r  r  r  )r    )Or  r   collections.abcr   typingr   r8   r   torch.nnr   r   r   r	   r
   r    r   r<  activationsr   cache_utilsr   r   
generationr   masking_utilsr   modeling_flash_attention_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   utilsr   r   utils.genericr   configuration_falconr!   r"   
get_loggerr4   r   r  r$   rE   rO   rN  rQ   r9   r   rn   r   rv   r   r   r   r   r  r"  r  r5  rQ  r  r  r  r  __all__r3   r2   r/   <module>r     s;     $    L L $ & ) . ) / h 9  . , . J			H	%
)299 )(4><BII ><BJu|| J JEKK J\a\h\h J:5<< 5<< u PT Y^YeYe &S0bii S0lb)O b)J		 " . R$3 R$j O  < c
' c
 c
L 
U
- U

U
p q
&; q
q
h T
#8 T
 T
n P
!6 P
 P
fr2   