
    Z j_G              
       "   S r SSKrSSKJs  Jr  SSKJr  SSKJr  SSK	J
r
  SSKJrJr  SSKJr  SS	KJrJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSKJrJr  SSKJ r   SSK!J"r"J#r#J$r$J%r%J&r&J'r'J(r(J)r)J*r*  SSK+J,r,  \RZ                  " \.5      r/   S4S\R`                  \1\R`                     -  S-  S\2S-  S\R`                  S-  S\R`                  \2-  4S jjr3\ " S S\Rh                  5      5       r5 " S S\Rh                  5      r6 " S S\Rh                  5      r7 " S S \)5      r8 " S! S"\*5      r9 " S# S$\"5      r: " S% S&\5      r; " S' S(\(5      r< " S) S*\'5      r= " S+ S,\#5      r> " S- S.\%5      r? " S/ S0\&5      r@ " S1 S2\$5      rA/ S3QrBg)5zPyTorch Mixtral model.    N)nn   )initialization)ACT2FN)CacheDynamicCache)use_experts_implementation)create_causal_mask!create_sliding_window_causal_mask)GradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)PreTrainedModel)Unpack)TransformersKwargslogging)OutputRecorder   )	MistralAttentionMistralForCausalLMMistralForQuestionAnswering MistralForSequenceClassificationMistralForTokenClassificationMistralModelMistralPreTrainedModelMistralRMSNormMistralRotaryEmbedding   )MixtralConfiggate_logitsnum_expertsattention_maskreturnc                    U b  [        U [        5      (       d  g[        U [        5      (       aC  U S   R                  n[        R                  " U  Vs/ s H  oUR                  U5      PM     snSS9n[        R                  R                  R                  WSS9n[        R                  " XrSS9u  p[        R                  R                  R                  X5      n
Uc:  [        R                  " U
R                  5       SS9n[        R                  " USS9nGOUR                  u  pUR                  S   X-  -  nUSSS2SS2SS4   R                  XXU45      R                  SX!5      R                  W5      n[        R                   " U
R                  5       U-  SS9[        R                   " USS9-  nUSSS2SS2S4   R                  XX45      R                  SU5      R                  U5      n[        R                   " UU-  SS9[        R                   " USS9-  n[        R                   " XR#                  S5      -  5      nUU-  $ s  snf )ax  
Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
experts is too unbalanced.

Args:
    gate_logits:
        Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
        shape [batch_size X sequence_length, num_experts].
    num_experts:
        Number of experts
    top_k:
        The number of experts to route per-token, can be also interpreted as the `top-k` routing
        parameter.
    attention_mask (`torch.Tensor`, *optional*):
        The attention_mask used in forward function
        shape [batch_size X sequence_length] if not None.

Returns:
    The auxiliary loss.
Nr   dim)
isinstancetupledevicetorchcattor   
functionalsoftmaxtopkone_hotmeanfloatshapeexpandreshapesum	unsqueeze)r    r!   top_kr"   compute_device
layer_gateconcatenated_gate_logitsrouting_weights_selected_expertsexpert_masktokens_per_expertrouter_prob_per_expert
batch_sizesequence_lengthnum_hidden_layersexpert_attention_mask router_per_expert_attention_maskoverall_losss                      |/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/mixtral/modular_mixtral.pyload_balancing_loss_funcrJ   5   s+   : *[%"@"@+u%%$Q..#(99^i-j^iPZmmN.K^i-jpq#r hh))112JPR1SO**_DA((%%--.>LK!JJ{'8'8':B "'O!C&4&:&:#
4::1=*B^_ 4AtT12V&OKXYWR,R	 	 "IIk&7&7&9<Q&QWXY\a\e\e!q]
 
 4At+,V&OQRWR%R	 	) "'?=]+]cd!ehmhqhq,!i
 "
 99.1Q1QRS1TTUL+%%[ .ks   Ic                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\R                  S\R                  4S	 jr	S
r
U =r$ )MixtralExperts   z2Collection of expert weights stored as 3D tensors.configc                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        [        R                  " [        R                  " U R                  SU R                  -  U R
                  5      5      U l        [        R                  " [        R                  " U R                  U R
                  U R                  5      5      U l        [        UR                     U l        g )Nr   )super__init__num_local_expertsr!   hidden_size
hidden_dimintermediate_sizeintermediate_dimr   	Parameterr+   emptygate_up_proj	down_projr   
hidden_actact_fnselfrN   	__class__s     rI   rQ   MixtralExperts.__init__   s    !33 ,, & 8 8LLT5E5Eq4K`K`G`bfbqbq)rsekk$2B2BDOOUYUjUj&klV../    hidden_statestop_k_indextop_k_weightsr#   c                 X   [         R                  " U5      n[         R                  " 5          [         R                  R                  R                  X R                  S9nUR                  SSS5      n[         R                  " UR                  SS9S5      R                  5       nS S S 5        W H  nUS   nXpR                  :X  a  M  [         R                  " WU   5      u  pX   n
[        R                  R                  XR                  U   5      R                  SSS9u  pU R                  U5      U-  n[        R                  R                  XR                   U   5      nXXS 4   -  nUR#                  SXR%                  UR&                  5      5        M     U$ ! , (       d  f       N= f)N)num_classesr   r   r   )r'   r%   r'   )r+   
zeros_likeno_gradr   r.   r1   r!   permutegreaterr7   nonzerowherelinearrY   chunkr\   rZ   
index_add_r-   dtype)r^   rb   rc   rd   final_hidden_statesr@   
expert_hit
expert_idx	top_k_pos	token_idxcurrent_stategateupcurrent_hidden_statess                 rI   forwardMixtralExperts.forward   so    $..}=]]_((--55kO_O_5`K%--aA6K{8'DaHPPRJ 
 %J#AJ---#(;;{:/F#G I)4M}}++M;L;LZ;XY__`agi_jHD$(KK$5$:!$&MM$8$89NP^P^_iPj$k!$9)`dJd<e$e!**1i9Q9QReRkRk9lm % #"# _s   A7F
F))r\   rZ   rY   rT   rV   r!   )__name__
__module____qualname____firstlineno____doc__r   rQ   r+   Tensorr{   __static_attributes____classcell__r_   s   @rI   rL   rL      sR    <0} 0#||# \\# ||	#
 
# #ra   rL   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )MixtralTopKRouter   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        [        R                  " [        R                  " U R
                  U R                  5      5      U l        g N)rP   rQ   num_experts_per_tokr9   rR   r!   rS   rT   r   rW   r+   rX   weightr]   s     rI   rQ   MixtralTopKRouter.__init__   s[    //
!33 ,,ll5;;t/?/?#QRra   c                 X   UR                  SU R                  5      n[        R                  " XR                  5      n[
        R                  R                  R                  UR                  5       SS9n[
        R                  " X0R                  SS9u  pEXDR                  SSS9-  nUnX&U4$ )Nr'   r%   T)r&   keepdim)r6   rT   Frn   r   r+   r   r.   r/   r3   r0   r9   r7   )r^   rb   router_logitsrouter_probsrouter_top_valuerouter_indicesrouter_scoress          rI   r{   MixtralTopKRouter.forward   s    %--b$//B<xx**22=3F3F3Hb2Q+0::lJJTV+W(00R0FF(^;;ra   )rT   r!   r9   r   )r}   r~   r   r   rQ   r{   r   r   r   s   @rI   r   r      s    S< <ra   r   c                      ^  \ rS rSrU 4S jrS\R                  S\\R                  \R                  4   4S jrSr	U =r
$ )MixtralSparseMoeBlock   c                    > [         TU ]  5         UR                  U l        UR                  U l        [        U5      U l        [        U5      U l	        g r   )
rP   rQ   r   r9   router_jitter_noisejitter_noiser   rx   rL   expertsr]   s     rI   rQ   MixtralSparseMoeBlock.__init__   sA    //
"66%f-	%f-ra   rb   r#   c                    UR                   u  p#nU R                  (       aS  U R                  S:  aC  U[        R                  " U5      R                  SU R                  -
  SU R                  -   5      -  nUR                  SUR                   S   5      nU R                  U5      u  pVnU R                  XU5      nUR                  X#U5      nU$ )Nr   g      ?r'   )
r4   trainingr   r+   
empty_likeuniform_viewrx   r   r6   )r^   rb   rC   rD   rT   r>   rd   rc   s           rI   r{   MixtralSparseMoeBlock.forward   s    2?2E2E/
Z==T..2U--m<EEcDL]L]F]_beievev_vwwM%**2}/B/B2/FG(,		-(@%+]O%--j:Vra   )r   rx   r   r9   )r}   r~   r   r   rQ   r+   r   r)   r{   r   r   r   s   @rI   r   r      s6    .U\\ eELL%,,<V6W  ra   r   c                       \ rS rSrSrg)MixtralRMSNorm    Nr}   r~   r   r   r   r   ra   rI   r   r          ra   r   c                       \ rS rSrSrg)MixtralRotaryEmbedding   r   Nr   r   ra   rI   r   r      r   ra   r   c                       \ rS rSrSrg)MixtralAttention   r   Nr   r   ra   rI   r   r      r   ra   r   c                     ^  \ rS rSrS\S\4U 4S jjr    SS\R                  S\	\R                  \R                  4   S-  S\R                  S-  S	\R                  S-  S
\S-  S\\   S\R                  4S jjrSrU =r$ )MixtralDecoderLayer   rN   	layer_idxc                   > [         TU ]  5         UR                  U l        [        X5      U l        [        U5      U l        [        UR                  UR                  S9U l	        [        UR                  UR                  S9U l
        g )N)eps)rP   rQ   rS   r   	self_attnr   mlpr   rms_norm_epsinput_layernormpost_attention_layernorm)r^   rN   r   r_   s      rI   rQ   MixtralDecoderLayer.__init__   sj    !--)&<(0-f.@.@fFYFYZ(6v7I7IvObOb(c%ra   Nrb   position_embeddingsr"   position_idspast_key_valueskwargsr#   c           	          UnU R                  U5      nU R                  " SUUUUUS.UD6u  pXq-   nUnU R                  U5      nU R                  U5      nXq-   nU$ )N)rb   r   r"   r   r   r   )r   r   r   r   )	r^   rb   r   r"   r   r   r   residualr>   s	            rI   r{   MixtralDecoderLayer.forward   s     !,,];>> 
' 3)%+
 
 !0 55mD/ 0ra   )rS   r   r   r   r   )NNNN)r}   r~   r   r   r   intrQ   r+   r   r)   
LongTensorr   r   r   r{   r   r   r   s   @rI   r   r      s    d} d d IM.204(,|| #5<<#=>E t+	
 &&-  +, 
 ra   r   c                   X    \ rS rSr\" \SS9\\S.r\	R                  " 5       S 5       rSrg)MixtralPreTrainedModeli  r   )index)r   rb   
attentionsc                 t   [         R                  " X5        U R                  R                  n[	        U[
        5      (       aA  [        R                  " UR                  SUS9  [        R                  " UR                  SUS9  g [	        U[        5      (       a!  [        R                  " UR                  SUS9  g g )Ng        )r2   std)r   _init_weightsrN   initializer_ranger(   rL   initnormal_rY   rZ   r   r   )r^   moduler   s      rI   r   $MixtralPreTrainedModel._init_weights  s    %%d3kk++fn--LL,,3C@LL))= 122LLSc: 3ra   r   N)r}   r~   r   r   r   r   r   r   _can_record_outputsr+   ri   r   r   r   ra   rI   r   r     s5    '(9C,& ]]_; ;ra   r   c                       \ rS rSr      SS\R
                  S-  S\R                  S-  S\R
                  S-  S\S-  S\R                  S-  S\	S-  S	\
\   S
\4S jjrSrg)MixtralModeli  N	input_idsr"   r   r   inputs_embeds	use_cacher   r#   c           
      ~   US L US L-  (       a  [        S5      eU(       a  Uc  [        U R                  S9nUc  U R                  U5      nUcU  Ub  UR	                  5       OSn[
        R                  " UR                  S   UR                  S9U-   nUR                  S5      nU R                  R                  c  [        O[        n	U	" U R                  UUUUS9n
UnU R                  XS9nU R                  S U R                  R                    H  nU" U4U
UUUUS.UD6nM     U R!                  U5      n[#        UUS	9$ )
Nz:You must specify exactly one of input_ids or inputs_embeds)rN   r   r   )r*   )rN   r   r"   r   r   )r   )r"   r   r   r   r   )last_hidden_stater   )
ValueErrorr   rN   embed_tokensget_seq_lengthr+   aranger4   r*   r8   sliding_windowr
   r   
rotary_emblayersrE   normr   )r^   r   r"   r   r   r   r   r   past_seen_tokensmask_functioncausal_maskrb   r   decoder_layers                 rI   r{   MixtralModel.forward  s^    -t";<YZZ0*$++>O  --i8MCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L.2kk.H.H.P*Vw#;;')+%
 &"oomoW![[)H4;;+H+HIM)*) /#$7 M J 		-0%++
 	
ra   r   )NNNNNN)r}   r~   r   r   r+   r   r   r   FloatTensorboolr   r   r   r{   r   r   ra   rI   r   r     s     .2.204(,26!%4
##d*4
 t+4
 &&-	4

 4
 ((4/4
 $;4
 +,4
 
 4
 4
ra   r   c                   0  ^  \ rS rSrSS0rU 4S jr         SS\R                  S-  S\R                  S-  S\R                  S-  S	\	S-  S
\R                  S-  S\R                  S-  S\S-  S\S-  S\\R                  -  S\\   S\4S jjrSrU =r$ )MixtralForCausalLMiP  zlm_head.weightzmodel.embed_tokens.weightc                    > [         TU ]  U5        [        U5      U l        UR                  U l        UR
                  U l        UR                  U l        g r   )rP   rQ   r   modelrouter_aux_loss_coefrR   r!   r   r]   s     rI   rQ   MixtralForCausalLM.__init__S  sF     !&)
$*$?$?!!33#)#=#= ra   Nr   r"   r   r   r   labelsr   output_router_logitslogits_to_keepr   r#   c
                 z   Ub  UOU R                   R                  nU R                  " SUUUUUUUS.U
D6nUR                  n[	        U	[
        5      (       a  [        U	* S5      OU	nU R                  USS2USS24   5      nSnUb  U R                  " XU R                  40 U
D6nSnU(       aY  [        UR                  U R                  U R                  U5      nUb*  XR                  UR                  UR                   5      -  -  n[#        UUUUR$                  UR&                  UR(                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, MixtralForCausalLM

>>> model = MixtralForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-v0.1")
>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-v0.1")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```N)r   r"   r   r   r   r   r   )lossaux_losslogitsr   rb   r   r   r   )rN   r   r   r   r(   r   slicelm_headloss_function
vocab_sizerJ   r   r!   r   r   r-   r*   r   r   rb   r   )r^   r   r"   r   r   r   r   r   r   r   r   outputsrb   slice_indicesr   r   r   s                    rI   r{   MixtralForCausalLM.forwardZ  sP   J %9$D $++JjJj 	
 +/** 	+
)%+'!5	+
 	+
  118B>SV8W8W~ot4]kmA}a,?@A%%fdooPPD/%%  ((	H !11HKK4LLL(#33!//))!//
 	
ra   )r   r!   r   r   )	NNNNNNNNr   )r}   r~   r   r   _tied_weights_keysrQ   r+   r   r   r   r   r   r   r   r   r   r{   r   r   r   s   @rI   r   r   P  s   *,GH> .2.204(,26*.!%,0-.P
##d*P
 t+P
 &&-	P

 P
 ((4/P
   4'P
 $;P
 #TkP
 ell*P
 +,P
 
#P
 P
ra   r   c                       \ rS rSrSrg) MixtralForSequenceClassificationi  r   Nr   r   ra   rI   r   r     r   ra   r   c                       \ rS rSrSrg)MixtralForTokenClassificationi  r   Nr   r   ra   rI   r  r    r   ra   r  c                       \ rS rSrSrg)MixtralForQuestionAnsweringi  r   Nr   r   ra   rI   r  r    r   ra   r  )r   r  r   r   r   r  )Nr   N)Cr   r+   torch.nn.functionalr   r.   r    r   r   activationsr   cache_utilsr   r   integrationsr	   masking_utilsr
   r   modeling_layersr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   utils.output_capturingr   mistral.modeling_mistralr   r   r   r   r   r   r   r   r   configuration_mixtralr   
get_loggerr}   loggerr   r)   r   rJ   ModulerL   r   r   r   r   r   r   r   r   r   r   r  r  __all__r   ra   rI   <module>r     s  &      & ! . 6 R 9 Q - & 0 4
 
 
 1 
		H	%
 #
*.	O&ell 33d:O&tO& LL4'	O&
 \\CO&d $#RYY $# $#N<		 <$BII &	^ 		3 		' 	#4 #L;3 ;$5
< 5
pZ
+ Z
z	'G 		$A 		"= 	ra   