
    Z j2                        S SK Jr  S SKrS SKJr  S SKJr  SSKJrJ	r	  SSK
Jr  SSKJrJr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJrJrJr  SSKJr  SSKJrJ r J!r!J"r"J#r#J$r$J%r%J&r&  SSK'J(r(  \RR                  " \*5      r+\" SS9\ " S S\5      5       5       r, " S S\$5      r- " S S\"5      r. " S S\5      r/ " S S\ 5      r0 " S S\#5      r1 " S S \(5      r2 " S! S"\!5      r3/ S#Qr4g)$    )CallableN)strict   )CacheDynamicCache)PreTrainedConfig)create_causal_mask!create_sliding_window_causal_mask)BaseModelOutputWithPast)RopeParametersdynamic_rope_update)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringlogging)maybe_autocast   )CohereAttentionCohereDecoderLayerCohereForCausalLMCohereLayerNormCoherePreTrainedModelCohereRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)Gemma2ModelzCohereForAI/c4ai-command-r-v01)
checkpointc                     ^  \ rS rSr% SrSrS/rSSSSSSSS.rS/S	/4S
S/S
/4S
/S
/4S.rSr	\
\S'   Sr\
\S'   Sr\
\S'   Sr\\S'   Sr\
\S'   Sr\
\S'   Sr\
S-  \S'   Sr\\S'   Sr\
\S'   Sr\\S'   S r\\S!'   S"r\\S#'   S$r\
S-  \S%'   S&r\
S-  \S''   S(r\
\\
   -  S-  \S)'   S"r\\S*'   Sr\ \!-  S-  \S+'   S,r"\\S-'   S.r#\\
-  \S/'   S0r$\
S-  \S1'   Sr%\\   S-  \S2'   U 4S3 jr&S4r'U =r($ )5Cohere2Config1   a  
logit_scale (`float`, *optional*, defaults to 0.0625):
    The scaling factor for the output logits.

```python
>>> from transformers import Cohere2Model, Cohere2Config

>>> # Initializing a Cohere Nextmodel configuration
>>> configuration = Cohere2Config()

>>> # Initializing a model from the Cohere2 configuration
>>> model = Cohere2Model(configuration) # doctest: +SKIP

>>> # Accessing the model configuration
>>> configuration = model.config # doctest: +SKIP
```
cohere2past_key_valuescolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormi  
vocab_sizei    hidden_sizei X  intermediate_sizeg      ?logit_scale(   num_hidden_layers@   num_attention_headsNnum_key_value_headssilu
hidden_actmax_position_embeddingsg{Gz?initializer_rangegh㈵>layer_norm_epsT	use_cacher   pad_token_id   bos_token_idi eos_token_idtie_word_embeddingsrope_parametersFattention_bias        attention_dropouti   sliding_windowlayer_typesc                 l  > U R                   c  U R                  U l         U R                  U R                  -  U l        U R                  cU  UR                  SS5      n[        U R                  5       Vs/ s H  n[        US-   U-  5      (       a  SOSPM     snU l        [        TU ](  " S0 UD6  g s  snf )Nsliding_window_pattern      sliding_attentionfull_attention )r5   r4   r.   head_dimrF   popranger2   boolsuper__post_init__)selfkwargs_sliding_window_patterni	__class__s       |/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/cohere2/modular_cohere2.pyrS   Cohere2Config.__post_init__m   s    ##+'+'?'?D$ ((D,D,DD #&,jj1I1&M# t556 6A (,QU6M,M'N'N#Tdd6 D
 	'' s   4$B1)rN   rF   r5   ))__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planr-   int__annotations__r.   r/   r0   floatr2   r4   r5   r7   strr8   r9   r:   r;   rQ   r<   r>   r?   listr@   rA   r   dictrB   rD   rE   rF   rS   __static_attributes____classcell__rX   s   @rY   r    r    1   s   $ J#4"5%.%.%.%."+ )"+ &(9:#%568IJ!"_$56 JK"s"Ks!!&*t*J#'S'#u# NE It L#*  L#* +1L#S	/D(1 $$48O^d*T18 ND %(us{(!%NC$J%$(KcT!(( (    r    c                   L    \ rS rSr\R
                  " 5       \S 5       5       rSrg)Cohere2RotaryEmbedding   c                    U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      nUS S 2S S S 24   R                  5       n[	        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " USSS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR                  UR                   S
9W	R                  UR                   S
94$ ! , (       d  f       N@= f)Nr   rJ   mpscpuF)device_typeenabledr   )dim)dtype)inv_freqrf   expandshape
isinstancedevicetyperg   r   	transposetorchrepeat_interleavecosattention_scalingsintorx   )
rT   xposition_idsinv_freq_expandedposition_ids_expandedru   freqsembr   r   s
             rY   forwardCohere2RotaryEmbedding.forward   s>    !MM$4-8>>@GGHZHZ[\H]_acde ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))%;C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs    BE<<
F
rM   N)	r[   r\   r]   r^   r   no_gradr   r   rj   rM   rm   rY   ro   ro      s"    
]]_<  <rm   ro   c                       \ rS rSrSrg)Cohere2LayerNorm   rM   Nr[   r\   r]   r^   rj   rM   rm   rY   r   r          rm   r   c                   "   \ rS rSrSrSS\S\S-  4S jjr SS\R                  S\
\R                  \R                  4   S	\R                  S-  S
\S-  S\\   S\
\R                  \R                  S-  \
\R                     S-  4   4S jjrSrg)Cohere2Attention   z=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                    [         R                  R                  U 5        Xl        X l        [        USUR                  UR                  -  5      U l        UR                  UR                  -  U l
        U R                  S-  U l        UR                  U l        SU l        [        US5      (       a  UR                  U   OS nUS:X  a  UR                   OS U l        [         R"                  " UR                  UR                  U R                  -  UR$                  S9U l        [         R"                  " UR                  UR                  U R                  -  UR$                  S9U l        [         R"                  " UR                  UR                  U R                  -  UR$                  S9U l        [         R"                  " UR                  U R                  -  UR                  UR$                  S9U l        g )NrN   g      TrF   rK   )bias)nnModule__init__r   r   getattrr.   r4   rN   r5   num_key_value_groupsscalingrD   	is_causalhasattrrF   rE   LinearrB   q_projk_projv_projo_proj)rT   r   r   
layer_types       rY   r   Cohere2Attention.__init__   s   
		4 "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!96=fm6T6TV''	2Z^
7AEX7Xf33^bii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
rm   r(   position_embeddingsr)   r#   rU   returnc                 8   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
Uu  pU R                  b  [        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [        5      nU" U UU	U
U4U R                   (       d  SOU R"                  U R$                  U R                  S.UD6u  pUR&                  " / UQSP76 R)                  5       nU R+                  U5      nX4$ )Nrr   rJ   r   rC   )dropoutr   rE   )r{   rN   r   viewr   r   r   rE   r   updater   r   get_interfacer   _attn_implementationr   trainingrD   r   reshape
contiguousr   )rT   r(   r   r)   r#   rU   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   attention_interfaceattn_outputattn_weightss                   rY   r   Cohere2Attention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&*';LVY'_$L&'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HLL..
%
 
%
! "));;;;FFHkk+.((rm   )rD   r   rN   r   r   r   r   r   r   r   rE   r   N)r[   r\   r]   r^   r_   r    rd   r   r   Tensortupler   r   r   r   rj   rM   rm   rY   r   r      s    G
} 
t 
< )-()||() #5<<#=>() t+	()
 () +,() 
u||U\\D0%2E2LL	M() ()rm   r   c                   4  ^  \ rS rSrS\S\4U 4S jjr    SS\R                  S\	\R                  \R                  4   S-  S\R                  S-  S	\
S-  S
\S-  S\\   S\	\R                  \	\R                  \R                  4   S-  4   4S jjrSrU =r$ )Cohere2DecoderLayer   r   r   c                 $   > [         TU ]  X5        g r   )rR   r   )rT   r   r   rX   s      rY   r   Cohere2DecoderLayer.__init__   s    +rm   Nr(   r   r)   r#   r;   rU   r   c           	          UnU R                  U5      nU R                  " SUUUUUS.UD6u  pU R                  U5      n
Xx-   U
-   nU$ )N)r(   r   r)   r#   r;   rM   )input_layernorm	self_attnmlp)rT   r(   r   r)   r#   r;   rU   residualhidden_states_attention_hidden_states_mlps              rY   r   Cohere2DecoderLayer.forward   sn     !,,];%)^^ &
' 3)+&
 &
" !HH]3 :=NNrm   rM   )NNNF)r[   r\   r]   r^   r    rd   r   r   r   r   r   rQ   r   r   FloatTensorr   rj   rk   rl   s   @rY   r   r      s    ,} , , IM.2(,!&|| #5<<#=>E t+	
  $; +, 
u  %(9(95;L;L(L"MPT"TT	U rm   r   c                   *    \ rS rSr% \\S'   \\S.rSr	g)Cohere2PreTrainedModel   r   )r(   
attentionsrM   N)
r[   r\   r]   r^   r    re   r   r   _can_record_outputsrj   rM   rm   rY   r   r      s    ,&rm   r   c                      ^  \ rS rSrS\4U 4S jjr      SS\R                  S-  S\R                  S-  S\R                  S-  S\	S-  S	\R                  S-  S
\S-  S\\   S\4S jjrSrU =r$ )Cohere2Modeli  r   c                    > [         TU ]  U5        [        UR                  UR                  S9U l        [        R                  " UR                  UR                  U R                  5      U l
        g )N)r.   eps)rR   r   r   r.   r:   r,   r   	Embeddingr-   padding_idxr*   )rT   r   rX   s     rY   r   Cohere2Model.__init__  sR     $&2D2D6K`K`a	LL):):F<N<NPTP`P`arm   Nr&   r)   r   r#   r'   r;   rU   r   c           
         US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  [        U R                  S9nUcU  Ub  UR	                  5       OSn[
        R                  " UR                  S   UR                  S9U-   nUR                  S5      n[        U=n	[        5      (       d)  U R                  UUUUS.n
[        S
0 U
D6[        S
0 U
D6S.n	UnU R                  X5      n[        U R                   5       H-  u  pU" U4XR                  R"                  U      UUUUS.UD6nM/     U R%                  U5      n['        UUS	9$ )Nz:You must specify exactly one of input_ids or inputs_embeds)r   r   rJ   )r}   )r   r'   r)   r#   r   )rL   rK   )r)   r   r#   r;   r   )last_hidden_stater#   rM   )
ValueErrorr*   r   r   get_seq_lengthr   aranger{   r}   	unsqueezer|   ri   r	   r
   
rotary_emb	enumerater+   rF   r,   r   )rT   r&   r)   r   r#   r'   r;   rU   past_seen_tokenscausal_mask_mappingmask_kwargsr(   r   rW   decoder_layers                  rY   r   Cohere2Model.forward  s{    -t";<YZZ  --i8M0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L?-FF++!."0#2 ,K #5"C{"C%F%U%U#
 &"oomJ )$++ 6A)2;;3J3J13MN$7 /#) M !7 		-0&++
 	
rm   )r*   r,   )NNNNNN)r[   r\   r]   r^   r    r   r   
LongTensorr   r   r   rQ   r   r   r   r   rj   rk   rl   s   @rY   r   r     s    b} b .2.204(,26!%7
##d*7
 t+7
 &&-	7

 7
 ((4/7
 $;7
 +,7
 
!7
 7
rm   r   c                       \ rS rSrSrg)Cohere2ForCausalLMiA  rM   Nr   rM   rm   rY   r   r   A  r   rm   r   )r    r   r   r   )5collections.abcr   r   torch.nnr   huggingface_hub.dataclassesr   cache_utilsr   r   configuration_utilsr   masking_utilsr	   r
   modeling_outputsr   modeling_rope_utilsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   cohere.modeling_coherer   r   r   r   r   r   r   r   gemma2.modeling_gemma2r   
get_loggerr[   loggerr    ro   r   r   r   r   r   r   __all__rM   rm   rY   <module>r      s    %   . . 3 R 7 6 & @ @ +	 	 	 1 
		H	% ;<J($ J(  =J(Z<2 <"	 	D) D)N, :2 =
; =
@	* 	 \rm   