
    Z jD              
          S r SSKrSSKrSSKJr  SSKJrJrJrJr  SSKJ	r
  SSKJrJrJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJrJrJr  SSKJr  SSKJrJr  SSK J!r!  \RD                  " \#5      r$S\RJ                  S\&S\RN                  S\RJ                  4S jr(S\RJ                  S\RJ                  S\)S\*S\RJ                  4
S jr+S\RJ                  S\RJ                  4S jr,S\RJ                  S\RJ                  S\RJ                  4S jr- " S S\R\                  R^                  5      r0 " S S \Rb                  5      r2 " S! S"\Rb                  5      r3 " S# S$\Rb                  5      r4 " S% S&\5      r5\ " S' S(\5      5       r6\ " S) S*\65      5       r7\" S+S,9 " S- S.\6\5      5       r8\" S/S,9 " S0 S1\65      5       r9\ " S2 S3\65      5       r:\ " S4 S5\65      5       r;/ S6Qr<g)7zPyTorch BLOOM model.    N)nn)BCEWithLogitsLossCrossEntropyLoss	LayerNormMSELoss)
functional   )CacheDynamicCacheStaticCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsQuestionAnsweringModelOutput SequenceClassifierOutputWithPastTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )BloomConfigattention_mask	num_headsdtypereturnc                    U R                   u  p4S[        R                  " [        R                  " U5      5      -  n[        R
                  " SS[        R                  " U5      S-
  * -  * -  U R                  [        R                  S9n[        R                  " SSU-   U R                  [        R                  S9n[        R                  " Xg5      nXQ:w  a  [        R
                  " SS[        R                  " SU-  5      S-
  * -  * -  U R                  [        R                  S9n	[        XQU-
  5      n
[        R                  " SSSU
-  -   SU R                  [        R                  S9n[        R                  " U[        R                  " X5      /SS9nU R                  SS9S-
  U -  SS2SSS24   nUS	   U-  nUR                  X1-  SU5      R                  U5      $ )
aV  
Link to paper: https://huggingface.co/papers/2108.12409 Alibi tensor is not causal as the original paper mentions, it
relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value
`softmax(l+a) = softmax(l)`. Based on
https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742
TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly.

Args:
Returns tensor shaped (batch_size * num_heads, 1, max_seq_len)
    attention_mask (`torch.Tensor`):
        Token-wise attention mask, this should be of shape (batch_size, max_seq_len).
    num_heads (`int`):
        number of heads
    dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`):
        dtype of the output tensor
   r	   devicer   r   r   dimN).N)shapemathfloorlog2torchtensorr!   float32arangeint32powmincatcumsumreshapeto)r   r   r   
batch_size
seq_lengthclosest_power_of_2basepowersslopes
extra_basenum_remaining_headsextra_powersarange_tensoralibis                 y/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/bloom/modeling_bloom.pybuild_alibi_tensorr@   -   s   " ,11Jdjj9)=>><<	tyy!34q899:;NDYDYafananD \\!Q!33N<Q<QY^YdYdeFYYt$F&\\A499Q);%;<q@AABCNLaLainiviv

 ""4BT6TU||Aq1/B+B'BAnNcNckpkvkvwFEIIj$GHaP %+++3a7>I1dTU:VM9-E==/J?BB5II    xresidualprobtrainingc                 8    [         R                  " XUS9nX-   nU$ )z
Dropout add function

Args:
    x (`torch.tensor`):
        input tensor
    residual (`torch.tensor`):
        residual tensor
    prob (`float`):
        dropout probability
    training (`bool`):
        training mode
)prE   )Fdropout)rB   rC   rD   rE   outs        r?   dropout_addrK   Y   s      ))A
1C
.CJrA   c                 ^    U S-  S[         R                  " SU -  SSU -  U -  -   -  5      -   -  $ )z
Custom bias GELU function. Adapted from Megatron-DeepSpeed code. Here we use a simple implementation (inference) to
make the model jitable.

Args:
    x (`torch.tensor`):
        input hidden states
      ?      ? e3E?r   Hm?r)   tanh)rB   s    r?   bloom_gelu_forwardrS   l   s8     s7cEJJzA~X\A=M9M'NOOPPrA   gc                     US   n[         R                  " SU-  SSU-  U-  -   -  5      nSU-  SX"-  -
  SSU-  U-  -   -  -  SSU-   -  -   nX0-  $ )a   
gradient of tanh approximation of gelu gradient of actual gelu is: 0.5 * (1. + torch.erf(x * 0.70710678)) +
0.3989423 * x * torch.exp(-0.5 * x * x)

Args:
    g (`torch.tensor`):
        gradient output tensor
    x (`torch.tensor`):
        input tensor
r   rO   r   rP   rM   g6vf?rQ   )rT   rB   tanh_outffs       r?   bloom_gelu_backrX   x   sv     	
!Azz*q.A1q0@,@ABH	qQ,,lQ>NQR>R1RS	TWZ^_bj^jWk	kB6MrA   c                       \ rS rSr\S\R                  S\R                  4S j5       r\S\R                  S\R                  4S j5       rSr	g)	GeLUFunction   inputr   c                 :    U R                  U5        [        U5      $ N)save_for_backwardrS   )ctxr\   s     r?   forwardGeLUFunction.forward   s    e$!%((rA   grad_outputc                 4    U R                   n[        X5      nU$ r^   )saved_tensorsrX   )r`   rc   r\   tmps       r?   backwardGeLUFunction.backward   s    !!k1
rA    N)
__name__
__module____qualname____firstlineno__staticmethodr)   Tensorra   rg   __static_attributes__ri   rA   r?   rZ   rZ      sT    )ELL )U\\ ) ) 5<< ELL  rA   rZ   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )	BloomGelu   zF
Partly copied from Megatron-DeepSpeed code and adapted for our needs
c                 "   > [         TU ]  5         g r^   )super__init__)self	__class__s    r?   rv   BloomGelu.__init__   s    rA   rB   r   c                 ,    [         R                  U5      $ r^   )rZ   apply)rw   rB   s     r?   ra   BloomGelu.forward   s    !!!$$rA   ri   )rj   rk   rl   rm   __doc__rv   r)   ro   ra   rp   __classcell__rx   s   @r?   rr   rr      s-    % %%,, % %rA   rr   c                   l  ^  \ rS rSrSS\S\S-  4U 4S jjjrS\R                  S\	\R                  \R                  \R                  4   4S jr
S	\R                  S\R                  4S
 jr   SS\R                  S\R                  S\R                  S\R                  S\S-  S\S\4S jjrSrU =r$ )BloomAttention   Nconfig	layer_idxc                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U l        UR                  U l	        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eS[        R                  " U R                  5      -  U l        SU l        X l        Uc-  [         R#                  SU R$                  R&                   S35        [(        R*                  " U R                  SU R                  -  SS	9U l        [(        R*                  " U R                  U R                  5      U l        [(        R0                  " UR2                  5      U l        g )
NzA`hidden_size` must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).rN   zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.r	   Tbias)ru   rv   pretraining_tpslow_but_exacthidden_sizen_headr   head_dim
split_sizehidden_dropout
ValueErrorr&   sqrtinv_norm_factorbetar   loggerwarning_oncerx   rj   r   Linearquery_key_valuedenseDropoutattention_dropout)rw   r   r   rx   s      r?   rv   BloomAttention.__init__   sx   $33$33!--((DNN:**$33==4>>)T-=-==STXTdTdSe fNN#2'   #TYYt}}%==	" !8!8 9 :, ,  "yy)9)91t?O?O;OVZ[YYt//1A1AB
!#F,D,D!ErA   	fused_qkvr   c                    UR                   u  p#nUR                  X#U R                  SU R                  5      nUSSSS24   R	                  SS5      nUSSSS24   R	                  SS5      nUSSSS24   R	                  SS5      nXVU4$ )a  
Split the last dimension into (num_heads, head_dim) and reshapes to (bs, heads, len, dim) shape
without making any copies, results share same memory storage as `fused_qkv`

Args:
    fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]

Returns:
    query: [batch_size, num_heads, seq_length, head_dim]
    key: [batch_size, num_heads, seq_length, head_dim]
    value: [batch_size, num_heads, seq_length, head_dim]
r	   .r   Nr   r   )r%   viewr   r   	transpose)rw   r   r4   r5   three_times_hidden_sizequery_layer	key_layervalue_layers           r?   _reshapeBloomAttention._reshape   s     ;D//7
 7NN:4>>1dmm\	Q	*44Q:c1ai(221a8	Q	*44Q:{22rA   rB   c                    UR                   u  p#nX R                  -  nUR                  XPR                  X0R                  5      nUR	                  SSSS5      nUR                  XSU R                  U R                  -  5      $ )z
Merge heads together over the last dimension

Args:
    x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim]

Returns:
    torch.tensor: [batch_size, seq_length, num_heads * head_dim]
r   r   r   r	   )r%   r   r   r   permuter2   )rw   rB   batch_size_and_num_headsr5   _r4   s         r?   _merge_headsBloomAttention._merge_heads   so     34''/ a-?
 FF:~~z==I IIaAq! yy$--1OPPrA   hidden_statesrC   r>   r   
layer_past	use_cacheoutput_attentionsc                    UR                   u  pnU R                  U5      nU R                  U5      u  pnUb  UR                  XU R                  5      u  pUR                  XR                  -  SU R                  5      nUR                  XR                  -  SU R                  5      R                  SS5      nUR                  XR                  -  SU R                  5      nUR                  UUU R                  U R                  S9nUR                  XR                  U
S5      nUb  UU-   n[        R                  " US[        R                   S9R#                  UR$                  5      nU R'                  U5      nUR                  XR                  -  U
S5      n[        R(                  " UU5      nU R+                  U5      nU R,                  S:  a  U R.                  (       a  U R0                  U R,                  -  n[        R2                  " U5      n[5        U R,                  5       H|  nU[        R6                  " US S 2S S 2[9        UU-  5      [9        US-   U-  5      24   U R:                  R<                  S S 2[9        UU-  5      [9        US-   U-  5      24   5      -   nM~     OU R;                  U5      n[?        UX R@                  U RB                  5      nUU4$ )Nr$   )batch1batch2r   alpha)r#   r   r   )"r%   r   r   updater   r2   r   r   r   baddbmmr   r   r   rH   softmaxr)   r+   r3   r   r   bmmr   r   r   r   
zeros_likerangelinearintr   weightrK   r   rE   )rw   r   rC   r>   r   r   r   r   kwargsr4   q_lengthr   r   r   r   r   attention_scoresattn_weightsattention_probsattention_probs_reshapedcontext_layerslicesoutput_tensoris                           r?   ra   BloomAttention.forward   s    #0"5"5
a((7	.2mmI.F+!%/%6%6yt~~%^"I "))*~~*Er4==Y%%j>>&A2t}}U__`bdfg	!))*~~*Er4==Y !==&&	 ) 
 (,,ZSUV%'.8L ))LbNQQR]RcRcd 00A $3#7#7
^^8SU]_a#b  		":KH ))-8 "t':':%%(;(;;F!,,];M4../ -!!QAJ#q1u>N:O(O"OPJJ%%aQZ3A?O;P)P&PQ1 ! 0 !JJ}5M#M8=P=PRVR_R_`o--rA   )r   r   r   r   r   r   r   r   r   r   r   r   r   r^   NFF)rj   rk   rl   rm   r   r   rv   r)   ro   tupler   r   r
   boolra   rp   r~   r   s   @r?   r   r      s    F{ FsTz F FB3%,, 35u||UZUaUa9a3b 3(Qell Qu|| Q> $("'A.||A. ,,A. ||	A.
 A. DLA. A.  A. A.rA   r   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jrSr	U =r
$ )	BloomMLPi6  r   c                 :  > [         TU ]  5         UR                  nUR                  U l        UR                  U l        [
        R                  " USU-  5      U l        [        5       U l	        [
        R                  " SU-  U5      U l
        UR                  U l        g )N   )ru   rv   r   r   r   r   r   dense_h_to_4hrr   	gelu_impldense_4h_to_hr   )rw   r   r   rx   s      r?   rv   BloomMLP.__init__7  sz    (($33$33YY{AOD"YYq;D$33rA   r   rC   r   c                    U R                  U R                  U5      5      nU R                  S:  a  U R                  (       a  [        R
                  " U5      nU R                  R                  R                  S   U R                  -  n[        U R                  5       Hz  nU[        R                  " US S 2S S 2[        XT-  5      [        US-   U-  5      24   U R                  R                  S S 2[        XT-  5      [        US-   U-  5      24   5      -   nM|     OU R                  U5      n[        X2U R                  U R                  5      nU$ )Nr   r$   )r   r   r   r   r)   r   r   r   r%   r   rH   r   r   rK   r   rE   )rw   r   rC   intermediate_outputr   r   outputs          r?   ra   BloomMLP.forwardB  s#   t'9'9-'HI"t':':"'"2"28"<''..44R84;N;NNF4../&9AHH!!QAJ#q1u>N:O(O"OP&&--aQZ3AQWGWCX1X.XY= '# 0 #'"4"4]"C0D<O<OQUQ^Q^_rA   )r   r   r   r   r   r   )rj   rk   rl   rm   r   rv   r)   ro   ra   rp   r~   r   s   @r?   r   r   6  s:    	4{ 	4U\\ U\\ ell  rA   r   c                      ^  \ rS rSrSS\S\S-  4U 4S jjjr   SS\R                  S\R                  S\R                  S	\	S-  S
\
S\
4S jjrSrU =r$ )
BloomBlockiU  Nr   r   c                 @  > [         TU ]  5         UR                  n[        X1R                  S9U l        UR                  U l        [        X5      U l	        [        X1R                  S9U l
        [        U5      U l        UR                  U l        UR                  U l        g )Neps)ru   rv   r   r   layer_norm_epsiloninput_layernormr   r   r   self_attentionpost_attention_layernormr   mlp(apply_residual_connection_post_layernormr   )rw   r   r   r   rx   s       r?   rv   BloomBlock.__init__V  s    (((:S:ST,V?(1+C\C\(]%F#8>8g8g5$33rA   r   r>   r   r   r   r   c           
          U R                  U5      nU R                  (       a  Un	OUn	U R                  UU	UUUUUS9u  pU R                  U
5      nU R                  (       a  Un	OU
n	U R	                  X5      nX4$ )N)r   r   r>   r   r   )r   r   r   r   r   )rw   r   r>   r   r   r   r   r   layernorm_outputrC   attention_outputr   r   s                r?   ra   BloomBlock.forwardd  s      //> 88'H$H *.)<)<!)/ *= *
&  889IJ 88'H'H *5##rA   )r   r   r   r   r   r   r   r^   r   )rj   rk   rl   rm   r   r   rv   r)   ro   r
   r   ra   rp   r~   r   s   @r?   r   r   U  s~    4{ 4sTz 4 4& $("'+$||+$ ||+$ 	+$
 DL+$ +$  +$ +$rA   r   c                   6    \ rS rSr% \\S'   SrSrS/rSr	Sr
Srg)	BloomPreTrainedModeli  r   transformerTr   past_key_valuesri   N)rj   rk   rl   rm   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_can_compile_fullgraphrp   ri   rA   r?   r   r     s(    %&*#%"3!rA   r   c                     ^  \ rS rSrS\4U 4S jjrS\R                  S\S\R                  S\R                  4S jr
S	 rS
\R                  4S jr\        SS\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\S-  S\S-  S\\R                  S4   \-  4S jj5       rSrU =r$ )
BloomModeli  r   c           
        > [         TU ]  U5        UR                  U l        UR                  U l        [        R                  " UR                  U R                  5      U l	        [        U R                  UR                  S9U l        [        R                  " [        UR                  5       Vs/ s H  n[!        XS9PM     sn5      U l        [        U R                  UR                  S9U l        SU l        U R)                  5         g s  snf )Nr   )r   F)ru   rv   r   	embed_dimr   r   r   	Embedding
vocab_sizeword_embeddingsr   r   word_embeddings_layernorm
ModuleListr   num_hidden_layersr   hln_fgradient_checkpointing	post_init)rw   r   r   rx   s      r?   rv   BloomModel.__init__  s     ++  "||F,=,=t~~N)24>>vG`G`)a& vOgOgIhiIhA
6 ?Ihij dnn&2K2KL	&+# 	  js   -Dr   r   r   r   c                     [        XU5      $ r^   )r@   )rw   r   r   r   s       r?   r@   BloomModel.build_alibi_tensor  s    !.UCCrA   c                     U R                   $ r^   r   )rw   s    r?   get_input_embeddingsBloomModel.get_input_embeddings  s    ###rA   new_embeddingsc                     Xl         g r^   r  rw   r  s     r?   set_input_embeddingsBloomModel.set_input_embeddings  s    -rA   N	input_idsr   inputs_embedsr   r   output_hidden_statesreturn_dict.c	           
         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUSL USL-  (       a  [        S5      eU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nU(       a  Uc  [        U R                   S9nUR                  u  pnUb  UR                  5       OSnX-   nU R                  U5      nU(       a  SOSnU(       a  SOSnUc!  [        R                   " X4UR"                  S9nOUR%                  UR"                  5      nU R'                  X0R(                  UR*                  S	9n[-        U R                   UUUS
9n[/        U R0                  5       H5  u  nnU(       a  UU4-   nU" UUUUUUS9nUS   nU(       d  M,  UUS   4-   nM7     U R3                  U5      nU(       a  UU4-   nU(       d  [5        S XUU4 5       5      $ [7        UUUUS9$ )j  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
Nz:You must specify exactly one of input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r   r   ri   r!   )r   )r   r  r   r   )r   r   r   r   r>   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr^   ri   ).0vs     r?   	<genexpr>%BloomModel.forward.<locals>.<genexpr>  s      cacs   	)last_hidden_stater   r   
attentions)r   r   r  r   r  r   r   rE   r   r   r   r   r%   get_seq_lengthr   r)   onesr!   r3   r@   r   r   r   	enumerater   r   r   r   )rw   r  r   r   r  r   r   r  r  r   r4   r5   r   past_lengthseq_length_with_pastr   all_self_attentionsall_hidden_statesr>   causal_maskr   blockoutputss                          r?   ra   BloomModel.forward  si   4 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++BYBY-t";<YZZ&&4==Yl I  00;M0*$++>O$1$7$7!
:I:Uo446[\)766}E$5b4"6BD !"ZZ(JS`SgSghN+..}/C/CDN''mNaNa'b(;;')+	
 "$&&)HAu#$58H$H!**#"3G $AJM  &9WQZM&I# *$ 		-0 1]4D D )<MObc   9+++*	
 	
rA   )r   r   r   r   r   r   r   NNNNNNNN)rj   rk   rl   rm   r   rv   r)   ro   r   r   r@   r  r	  r   
LongTensorr
   r   r   r   ra   rp   r~   r   s   @r?   r   r     s4   { *D D# DV[VaVa Dfkfrfr D$.5<< .  .2(,.215!%)-,0#'g
##d*g
 g
 t+	g

 ''$.g
 $;g
  $;g
 #Tkg
 D[g
 
u||S 	!$M	Mg
 g
rA   r   z
    The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                     ^  \ rS rSrSS0rS\4U 4S jjrS\R                  4S jr	     SU 4S	 jjr
\          SS
\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\S-  S\S-  S\\R                  -  S\\R                     \-  4S jj5       rSrU =r$ )BloomForCausalLMi'  zlm_head.weightz"transformer.word_embeddings.weightr   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g NFr   )
ru   rv   r   r   r   r   r   r   lm_headr   rw   r   rx   s     r?   rv   BloomForCausalLM.__init__0  sI     %f-yy!3!3V5F5FUS 	rA   r  c                     Xl         g r^   )r+  r  s     r?   set_output_embeddings&BloomForCausalLM.set_output_embeddings8  s    %rA   Nc           	      2  > [         TU ]  " U4UUUUUS.UD6n[        U[        5      (       ai  Ubf  UR	                  5       n	UR
                  u  pX-
  n[        R                  " XUR                  UR                  S9n[        R                  " X=/SS9nX8S'   U$ )N)r   r   r  r   is_first_iterationr    r$   r"   r   )ru   prepare_inputs_for_generation
isinstancer   get_max_cache_shaper%   r)   zerosr!   r   r0   )rw   r  r   r   r  r   r2  r   model_inputstarget_lengthr4   r5   diffnew_attn_maskrx   s                 r?   r3  .BloomForCausalLM.prepare_inputs_for_generation;  s     w<
+)'1
 
 o{338R+??AM%3%9%9"J -D!KK
AVAV^l^r^rsM"YY'FBON-;)*rA   r  r   r   r  labelsr   r   r  r  logits_to_keepr   c                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   n[        U
[        5      (       a  [        U
* S5      OU
nU R                  USS2USS24   5      nSnUb5  U R                  UUU R                   R                  UR                  S5      S9nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
Nr   r   r  r   r   r  r  r   num_items_in_batch)r   r@  r   losslogitsr   r   r  )r   r  r   r4  r   slicer+  loss_functionr   getr   r   r   r  )rw   r  r   r   r  r<  r   r   r  r  r=  r   transformer_outputsr   slice_indicesrC  rB  r   s                     r?   ra   BloomForCausalLM.forward^  s+   @ &1%<k$++BYBY"..+)'/!5# / 	
 ,A.8B>SV8W8W~ot4]kmA}a,?@A%%;;11#)::.B#C	 & D Y!4QR!88F)-)9TGf$EvE0/??-;;*55
 	
rA   )r+  r   )NNNTF)
NNNNNNNNNr   )rj   rk   rl   rm   _tied_weights_keysr   rv   r)   ro   r/  r3  r   r%  r
   r   r   r   r   ra   rp   r~   r   s   @r?   r(  r(  '  sC    +,PQ{ &ELL &  !F  .2(,.2-1&*!%)-,0#'-.D
##d*D
 D
 t+	D

 ||d*D
 t#D
 $;D
  $;D
 #TkD
 D[D
 ell*D
 
u||	@	@D
 D
rA   r(  a  
    The Bloom Model transformer with a sequence classification head on top (linear layer).

    [`BloomForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                   (  ^  \ rS rSrS\4U 4S jjr\         SS\R                  S-  S\	S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\S-  S\S-  S\\R                     \-  4S jj5       rSrU =r$ )BloomForSequenceClassificationi  r   c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g r*  )
ru   rv   
num_labelsr   r   r   r   r   scorer   r,  s     r?   rv   'BloomForSequenceClassification.__init__  sV      ++%f-YYv1163D3D5Q
 	rA   Nr  r   r   r  r<  r   r   r  r  r   c
                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                   R
                  c  US:w  a  [        S5      eU R                   R
                  c  SnOUb  XR                   R
                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                  R                    S35        U[        R                  " XR                  S	9U4   nSnUGbg  U R                   R"                  c  U R$                  S:X  a  S
U R                   l        OoU R$                  S:  aN  UR&                  [        R(                  :X  d  UR&                  [        R*                  :X  a  SU R                   l        OSU R                   l        U R                   R"                  S
:X  aJ  [-        5       nU R$                  S:X  a&  U" UR/                  5       UR/                  5       5      nOeU" UU5      nO[U R                   R"                  S:X  a  [1        5       nU" UU5      nO-U R                   R"                  S:X  a  [3        5       nU" UU5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [5        UUUR6                  UR8                  UR:                  S9$ )  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr?  r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r$   r    z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r  
regressionsingle_label_classificationmulti_label_classificationrA  )r   r  r   rO  r%   pad_token_idr   r3   r!   r)   r-   r,   argmaxr   r   rx   rj   problem_typerN  r   longr   r   squeezer   r   r   r   r   r  )rw   r  r   r   r  r<  r   r   r  r  r   rG  r   rC  r4   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsrB  loss_fctr   s                         r?   ra   &BloomForSequenceClassification.forward  s   > &1%<k$++BYBY"..+)'/!5# / 	
 ,A.M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||J}}MOaab{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+-v6))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
rA   )rN  rO  r   	NNNNNNNNN)rj   rk   rl   rm   r   rv   r   r)   r%  r
   ro   r   r   r   ra   rp   r~   r   s   @r?   rL  rL    s    {   .2(,.2-1&*!%)-,0#'e
##d*e
 e
 t+	e

 ||d*e
 t#e
 $;e
  $;e
 #Tke
 D[e
 
u||	?	?e
 e
rA   rL  c                   (  ^  \ rS rSrS\4U 4S jjr\         SS\R                  S-  S\	S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\S-  S\S-  S\\R                     \-  4S jj5       rSrU =r$ )BloomForTokenClassificationi'  r   c                   > [         TU ]  U5        UR                  U l        [        U5      U l        [        US5      (       a  UR                  b  UR                  nO-[        US5      (       a  UR                  b  UR                  nOSn[        R                  " U5      U l
        [        R                  " UR                  UR                  5      U l        U R                  5         g )Nclassifier_dropoutr   g?)ru   rv   rN  r   r   hasattrre  r   r   r   rI   r   r   
classifierr   )rw   r   re  rx   s      r?   rv   $BloomForTokenClassification.__init__)  s      ++%f-6/00V5N5N5Z!'!:!:V-..63H3H3T!'!6!6!$zz"45))F$6$68I8IJ 	rA   Nr  r   r   r  r<  r   r   r  r  r   c
                 
   U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nU R	                  U5      nSnUbl  UR                  UR                  5      nUR                  u  nn[        5       nU" UR                  UU-  U R                  5      UR                  UU-  5      5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )rR  Nr?  r   r   )rB  rC  r   r  )r   r  r   rI   rg  r3   r!   r%   r   r   rN  r   r   r  )rw   r  r   r   r  r<  r   r   r  r  r   rG  r   rC  rB  r4   r5   r_  r   s                      r?   ra   #BloomForTokenClassification.forward:  s+   > &1%<k$++BYBY"..+)'/!5# / 	
 ,A.]3/YYv}}-F%+\\"J
')HJ3T__Ev{{S]`jSjGkD Y!4QR!88F)-)9TGf$EvE$-;;*55	
 	
rA   )rg  rI   rN  r   ra  )rj   rk   rl   rm   r   rv   r   r)   r%  r
   ro   r   r   r   ra   rp   r~   r   s   @r?   rc  rc  '  s    { "  .2(,.2-1&*!%)-,0#'B
##d*B
 B
 t+	B

 ||d*B
 t#B
 $;B
  $;B
 #TkB
 D[B
 
u||	4	4B
 B
rA   rc  c                     ^  \ rS rSrU 4S jr\        SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\	S-  S
\	S-  S\	S-  S\
\-  4S jj5       rSrU =r$ )BloomForQuestionAnsweringi  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  S5      U l        U R                  5         g )Nr   )	ru   rv   r   r   r   r   r   
qa_outputsr   r,  s     r?   rv   "BloomForQuestionAnswering.__init__  sA     %f-))F$6$6: 	rA   Nr  r   r  start_positionsend_positionsr   r  r  r   c	           	         Ub  UOU R                   R                  nU R                  UUUUUUS9n
U
S   nU R                  U5      nUR	                  SSS9u  pUR                  S5      R                  5       nUR                  S5      R                  5       nSnUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" X5      nUU-   S-  nU(       d  X4U
SS -   nUb  U4U-   $ U$ [        UUUU
R                  U
R                  S	9$ )
r  N)r   r  r   r  r  r   r   r$   r"   )ignore_indexr   )rB  start_logits
end_logitsr   r  )r   r  r   rn  splitrZ  
contiguouslensizeclampr   r   r   r  )rw   r  r   r  rp  rq  r   r  r  r   r"  sequence_outputrC  rt  ru  
total_lossignored_indexr_  
start_lossend_lossr   s                        r?   ra   !BloomForQuestionAnswering.forward  s   4 &1%<k$++BYBY"")'/!5# # 
 "!*1#)<<r<#: #++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
:H$x/14J"/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
rA   )rn  r   r$  )rj   rk   rl   rm   rv   r   r)   r%  FloatTensorr   r   r   ra   rp   r~   r   s   @r?   rl  rl    s      .237263715)-,0#'F
##d*F
 ))D0F
 ((4/	F

 ))D0F
 ''$.F
  $;F
 #TkF
 D[F
 
-	-F
 F
rA   rl  )r(  r   r   rL  rc  rl  )=r}   r&   r)   r   torch.nnr   r   r   r   r   rH   cache_utilsr
   r   r   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   configuration_bloomr   
get_loggerrj   r   ro   r   r   r@   floatr   rK   rS   rX   autogradFunctionrZ   Modulerr   r   r   r   r   r   r(  rL  rc  rl  __all__ri   rA   r?   <module>r     s7       L L $ ; ; ) / 9  . - 
		H	%)Ju|| )J )JEKK )J\a\h\h )JX5<< 5<< u PT Y^YeYe &	Q%,, 	Q5<< 	Qu||   $
5>>** 
	%		 	%P.RYY P.fryy >:$+ :$z "? " " G
% G
 G
T v
+_ v
v
r p
%9 p
p
f U
"6 U
 U
p P
 4 P
 P
frA   