
    Z j1                       S SK r S SKJr  S SKJr  S SKJr  S SKJr  S SK	r	S SK
Jr  S SKJs  Jr  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJrJrJr  SSKJr  SSKJ r   SSK!J"r"J#r#J$r$  SSK%J&r&J'r'  SSK(J)r)J*r*  SSK+J,r,  SSK-J.r.J/r/J0r0J1r1  SSK2J3r3J4r4  SSK5J6r6  SSK7J8r8J9r9J:r:  \/\ " S S\#5      5       5       r;S r<\" S5      SeS j5       r=S\	R|                  S\?S\	R|                  4S jr@ SfS \R                  S!\	R|                  S"\	R|                  S#\	R|                  S$\	R|                  S-  S%\BS&\BS'\,\.   4S( jjrC\" \=5       " S) S*\R                  5      5       rD\" S+5       " S, S-\R                  5      5       rE " S. S/\R                  5      rF " S0 S1\ 5      rG " S2 S3\R                  5      rH " S4 S5\R                  5      rI " S6 S7\R                  5      rJ " S8 S9\R                  5      rK " S: S;\R                  5      rL " S< S=\R                  5      rM " S> S?\R                  5      rN " S@ SA\R                  5      rO " SB SC\R                  5      rP " SD SE\R                  5      rQ " SF SG\R                  5      rS " SH SI\R                  5      rT " SJ SK\R                  5      rU " SL SM\R                  5      rV " SN SO\R                  5      rW " SP SQ\R                  5      rX\/" SRSS9 " ST SU\*5      5       rY " SV SW5      rZ\/ " SX SY\*5      5       r[ " SZ S[\R                  5      r\\/ " S\ S]\[5      5       r]\/ " S^ S_\[\5      5       r^ " S` Sa\[5      r_ " Sb Sc\[\5      r`/ SdQrag)g    N)Callable)	dataclass)cached_property)Optional   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)maybe_autocastmerge_with_config_defaults)capture_outputs   )
Emu3ConfigEmu3TextConfigEmu3VQVAEConfigc                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)Emu3VQVAEModelOutput1   z
image_tokens (`torch.LongTensor` of shape `(batch_size, config.vocab_size`):
    Indices of the image tokens predicted by the VQ-VAE model.
Nimage_tokens )
__name__
__module____qualname____firstlineno____doc__r(   torch
LongTensor__annotations____static_attributes__r)       w/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/emu3/modeling_emu3.pyr&   r&   1   s    
 -1L%""T)0r3   r&   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..N   dim)shaper/   cat)xx1x2s      r4   rotate_halfr?   <   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r3   rotary_pos_embc                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXV4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer?   )qkcossinunsqueeze_dimq_embedk_embeds          r4   apply_rotary_pos_embrJ   C   sS    & --
&C
--
&Cw;q>C/0Gw;q>C/0Gr3   hidden_statesn_repreturnc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r!   N)r:   expandreshape)rK   rL   batchnum_key_value_headsslenhead_dims         r4   	repeat_kvrU   ]   s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr3   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub  X-   n
[
        R                  R                  U
S[        R                  S9R                  UR                  5      n
[
        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr7   r   r6   )r9   dtype)ptrainingr!   )rU   num_key_value_groupsr/   matmul	transposenn
functionalsoftmaxfloat32tor_   r\   ra   
contiguous)rV   rW   rX   rY   rZ   r[   r\   r]   
key_statesvalue_statesattn_weightsattn_outputs               r4   eager_attention_forwardro   i   s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r3   c                     ^  \ rS rSrSrS\S\4U 4S jjr   SS\R                  S\
\R                  \R                  4   S-  S	\R                  S-  S
\S-  S\\   S\
\R                  \R                  4   4S jjrSrU =r$ )Emu3Attention   =Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                 P  > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        U R                  S-  U l
        UR                  U l        SU l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR                  U R                  -  UR
                  UR                  S9U l        g )NrT         Tbias)super__init__rt   ru   getattrhidden_sizenum_attention_headsrT   rR   rb   r[   attention_dropout	is_causalre   Linearattention_biasq_projk_projv_projo_projselfrt   ru   	__class__s      r4   r{   Emu3Attention.__init__   sI   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r3   NrK   position_embeddingsrZ   past_key_valuesr]   rM   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [        5      nU" U UU	U
U4U R                  (       d  SOU R                   U R"                  S.UD6u  pUR$                  " / UQSP76 R'                  5       nU R)                  U5      nX4$ )Nr6   r!   r7           )r\   r[   )r:   rT   r   viewrd   r   r   rJ   updateru   r   get_interfacert   _attn_implementationro   ra   r   r[   rP   rj   r   )r   rK   r   rZ   r   r]   input_shapehidden_shapequery_statesrk   rl   rE   rF   attention_interfacern   rm   s                   r4   forwardEmu3Attention.forward   s~    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFHkk+.((r3   )r   rt   rT   r   r   ru   rb   r   r   r[   r   NNN)r*   r+   r,   r-   r.   r"   intr{   r/   Tensortupler
   r   r   r   r2   __classcell__r   s   @r4   rq   rq      s    G
z 
c 
4 IM.2(,&)||&) #5<<#=>E&) t+	&)
 &) +,&) 
u||U\\)	*&) &)r3   rq   RMSNormc                   x   ^  \ rS rSrS
S\SS4U 4S jjjrS\R                  S\R                  4S jrS r	S	r
U =r$ )Emu3RMSNorm   epsrM   Nc                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z*
Emu3RMSNorm is equivalent to T5LayerNorm
N)rz   r{   re   	Parameterr/   onesweightvariance_epsilon)r   r}   r   r   s      r4   r{   Emu3RMSNorm.__init__   s/     	ll5::k#:; #r3   rK   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nr7   r6   T)keepdim)	r_   ri   r/   rh   powmeanrsqrtr   r   )r   rK   input_dtypevariances       r4   r   Emu3RMSNorm.forward   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r3   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r   r   r:   r   r   s    r4   
extra_reprEmu3RMSNorm.extra_repr   s*    ))*+6$2G2G1HIIr3   )r   r   )ư>)r*   r+   r,   r-   floatr{   r/   r   r   r   r2   r   r   s   @r4   r   r      sB    $ $$ $ $;U\\ ;ell ;J Jr3   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Emu3MLP   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " U R                  U R                  UR                  S9U l
        [        UR                     U l        g )Nrx   )rz   r{   rt   r}   intermediate_sizere   r   mlp_bias	gate_projup_proj	down_projr	   
hidden_actact_fnr   rt   r   s     r4   r{   Emu3MLP.__init__   s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r3   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ N)r   r   r   r   )r   r<   r   s      r4   r   Emu3MLP.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	r3   )r   rt   r   r   r}   r   r   r*   r+   r,   r-   r{   r   r2   r   r   s   @r4   r   r      s    0 r3   r   c                     ^  \ rS rSrS\S\4U 4S jjr     SS\R                  S\R                  S-  S\R                  S-  S	\
S-  S
\S-  S\\R                  \R                  4   S-  S\\   S\R                  4S jjrSrU =r$ )Emu3DecoderLayer   rt   ru   c                 V  > [         TU ]  5         UR                  U l        [        XS9U l        [        U5      U l        [        UR                  UR                  S9U l	        [        UR                  UR                  S9U l
        [        R                  " UR                  5      U l        g )N)rt   ru   r   )rz   r{   r}   rq   	self_attnr   mlpr   rms_norm_epsinput_layernormpost_attention_layernormre   Dropoutr   r\   r   s      r4   r{   Emu3DecoderLayer.__init__   s    !--&fJ6?*6+=+=6CVCVW(3F4F4FFL_L_(`%zz&":":;r3   NrK   rZ   position_idsr   	use_cacher   r]   rM   c           
          UnU R                  U5      nU R                  " SUUUUUUS.UD6u  pXR                  U5      -   nUnU R                  U5      nU R	                  U5      nXR                  U5      -   nU$ )N)rK   rZ   r   r   r   r   r)   )r   r   r\   r   r   )
r   rK   rZ   r   r   r   r   r]   residual_s
             r4   r   Emu3DecoderLayer.forward   s     !,,];>> 
')%+ 3
 
 !<<#>> 55mD/ <<#>>r3   )r\   r}   r   r   r   r   )NNNFN)r*   r+   r,   r-   r"   r   r{   r/   r   r0   r
   boolr   r   r   r   r2   r   r   s   @r4   r   r      s    	<z 	<c 	< /304(,!&HL|| t+ &&-	
  $; #5<<#=>E +, 
 r3   r   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )Emu3VQVAEVectorQuantizeri  a  
A module for vector quantization using learned embedding vectors.

This module implements the quantization process similar to te one described in
the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
input vectors into discrete codebook vectors, which are learned during training.
Current implementation improves over previous ones by avoiding costly matrix multiplications
and allowing for post-hoc remapping of indices.
rt   c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        U R                  R                  R                  R                  SUR                  -  SUR                  -  5        g )Ng            ?)
rz   r{   re   	Embeddingcodebook_size	embed_dim	embeddingr   datauniform_r   s     r4   r{   !Emu3VQVAEVectorQuantizer.__init__!  sb    f&:&:F<L<LM""++D63G3G,GvOcOcIcdr3   hidden_statec                    UR                   u  p#pEnUR                  SSSSS5      R                  5       nUR                  SU5      n[        R
                  " US-  SSS9n[        R
                  " U R                  R                  S-  SS	9n	S[        R                  " XpR                  R                  R                  SS5      5      -  n
X-   U
-
  n
[        R                  " U
SS	9nUR                  X#XV5      nU$ )
Nr   r!   r      r7   r6   T)r9   r   r8   )r:   permuterj   r   r/   sumr   r   rc   rd   argmin)r   r   
batch_sizetemporalchannelsheightwidthhidden_state_flattenedhidden_state_sumembedding_sum	distancesmin_encoding_indicess               r4   r    Emu3VQVAEVectorQuantizer.forward&  s    8D8J8J5
h#++Aq!Q:EEG!-!2!22x!@ !99%;Q%>AtT		$.."7"7":B %;^^=R=R=\=\]^`a=bcc	$4y@	$||I1=388v]##r3   )r   )r*   r+   r,   r-   r.   r$   r{   r/   r   r   r2   r   r   s   @r4   r   r     s+    e e
$ELL $ $r3   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Emu3VQVAEEncoderConvDownsamplei8  c                 Z   > [         TU ]  5         [        R                  " XSSSS9U l        g )Nr   r7   r   kernel_sizestridepaddingrz   r{   re   Conv2dconvr   in_channelsr   s     r4   r{   'Emu3VQVAEEncoderConvDownsample.__init__9  %    IIkAaYZ[	r3   c                 V    [         R                  " USSSS9nU R                  U5      nU$ )N)r   r!   r   r!   constantr   )padmoderY   )Fr  r  r   rK   s     r4   r   &Emu3VQVAEEncoderConvDownsample.forward=  s+    mJVWX		-0r3   r  r   r   s   @r4   r   r   8  s    \ r3   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Emu3VQVAEEncoderConvUpsampleiD  c                 Z   > [         TU ]  5         [        R                  " XSSSS9U l        g )Nr   r!   r   r   r  s     r4   r{   %Emu3VQVAEEncoderConvUpsample.__init__E  r  r3   c                 T    [         R                  " USSS9nU R                  U5      nU$ )N       @nearestscale_factorr	  )r
  interpolater  r  s     r4   r   $Emu3VQVAEEncoderConvUpsample.forwardI  s(    m#IV		-0r3   r  r   r   s   @r4   r  r  D  s    \ r3   r  c            	       j   ^  \ rS rSrS\S\S\\   S\\   4U 4S jjrS\R                  4S jr	S	r
U =r$ )
Emu3VQVAEConv3diO  
in_channelout_channelr   r   c                 R  > [         T	U ]  5         [        USS  USS  5       VVs/ s H	  u  pVXV-
  PM     nnnSU l        US S S2    H&  nU =R                  US-  US-  -   US-  4-  sl        M(     U =R                  S-  sl        [        R
                  " UUUUS9U l        g s  snnf )Nr!   r)   r6   r7   )r7   r   )r   )rz   r{   zipr   re   Conv3dr  )
r   r  r  r   r   
one_kernel
one_stridepadding_sizespad_sizer   s
            r4   r{   Emu3VQVAEConv3d.__init__P  s     	ORS^_`_aSbdjklkmdnOopOo5KZ0Oop%dd+HLLX]X\98q=IIL ,II	
	 qs   B#rK   c                 h    [         R                  " XR                  5      nU R                  U5      nU$ r   )r
  r  r   r  r  s     r4   r   Emu3VQVAEConv3d.forwardf  s(    m\\:		-0r3   )r  r   )r*   r+   r,   r-   r   r   r{   r/   r   r   r2   r   r   s   @r4   r  r  O  sK    

 
 3Z	

 c

,U\\  r3   r  c                   n   ^  \ rS rSrS\S\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	Emu3VQVAESpatialNormil  r  out_channelsc                    > [         TU ]  5         [        R                  " USSSS9U l        [        R
                  " UUSSSS9U l        [        R
                  " UUSSSS9U l        g )N    r   Tnum_channels
num_groupsr   affiner!   r   r   )rz   r{   re   	GroupNorm
norm_layerr   conv_yconv_br   r  r)  r   s      r4   r{   Emu3VQVAESpatialNorm.__init__m  sn    
 	,,%	
 ii
 ii
r3   rK   quant_statesc                     [         R                  " X!R                  SS  SS9nU R                  U5      nXR	                  U5      -  U R                  U5      -   nU$ )Nr  )sizer	  )r
  r  r:   r1  r2  r3  )r   rK   r6  s      r4   r   Emu3VQVAESpatialNorm.forward  sT    }}\8K8KBC8PW`a6%L(AADKKP\D]]r3   )r3  r2  r1  r*   r+   r,   r-   r   r{   r/   r   r   r2   r   r   s   @r4   r(  r(  l  s:    

 
8U\\   r3   r(  c                   V   ^  \ rS rSrS\S\4U 4S jjrS\R                  4S jrSr	U =r
$ )Emu3VQVAETemporalUpsamplei  r  r  c                 D   > [         TU ]  5         [        UUSSS9U l        g )Nr   r   r   r!   r!   r!   r   r   rz   r{   r  r  r   r  r  r   s      r4   r{   "Emu3VQVAETemporalUpsample.__init__  (    
 	#!	
	r3   rK   c                 D   UR                   u  p#pEnUR                  SSSSS5      R                  5       R                  USU5      n[        R
                  " USSS	9nUR                  X#XVS5      R                  SSSSS5      R                  5       nU R                  U5      nU$ )
Nr   r!   r   r   r7   r6   r  r  r  )r:   r   rj   r   r
  r  r  )r   rK   r   r   r   r   r   s          r4   r   !Emu3VQVAETemporalUpsample.forward  s    8E8K8K5
h%--aAq!<GGINNz[]_ghm#IV%**:PRS[[\]_`bcefhijuuw		-0r3   r  r;  r   s   @r4   r=  r=    s/    

 
U\\  r3   r=  c                   V   ^  \ rS rSrS\S\4U 4S jjrS\R                  4S jrSr	U =r
$ )Emu3VQVAETemporalDownsamplei  r  r  c                 D   > [         TU ]  5         [        UUSSS9U l        g )N)r   r   r   )r7   r!   r!   rA  rB  rC  s      r4   r{   $Emu3VQVAETemporalDownsample.__init__  rE  r3   rK   c                 (    U R                  U5      nU$ r   r  r  s     r4   r   #Emu3VQVAETemporalDownsample.forward  s    		-0r3   r  r;  r   s   @r4   rI  rI    s/    

 
U\\  r3   rI  c                   4   ^  \ rS rSr SU 4S jjrS rSrU =r$ )Emu3VQVAETemporalResnetBlocki  c                 f  > [         TU ]  5         Xl        Uc  UOUU l        [        R
                  " U5      U l        [        UUSSS9U l        [        R
                  " U5      U l	        [        UUSSS9U l
        U R                  U R                  :w  a  [        R                  " UUSSSS9U l        g g )Nr?  r@  rA  r!   r   r   )rz   r{   r  r)  re   BatchNorm3dnorm1r  conv1norm2conv2r  nin_shortcutr4  s      r4   r{   %Emu3VQVAETemporalResnetBlock.__init__  s    
 	&+7+?K\^^K0
$!	

 ^^L1
$!	

 t000 "		!D 1r3   c                 P   UnU R                  U5      nU[        R                  " U5      -  nU R                  U5      nU R	                  U5      nU[        R                  " U5      -  nU R                  U5      nU R                  U R                  :w  a  U R                  U5      nX!-   $ r   )	rR  r/   sigmoidrS  rT  rU  r  r)  rV  )r   rK   r   s      r4   r   $Emu3VQVAETemporalResnetBlock.forward  s     

=1}55

=1

=1}55

=1t000((2H''r3   )rS  rU  r  rV  rR  rT  r)  r   r   r   s   @r4   rO  rO    s     @( (r3   rO  c                      ^  \ rS rSr  S
S\S\S-  S\S-  4U 4S jjjrSS\R                  S\R                  S-  4S jjrS	r	U =r
$ )Emu3VQVAEResnetBlocki  Nr  r)  quant_channelsc                   > [         TU ]  5         Xl        Uc  UOUnX l        X0l        Uc9  [
        R                  " USSSS9U l        [
        R                  " USSSS9U l        O [        X15      U l        [        X25      U l        [
        R                  " UUSSSS9U l        [
        R                  " UUSSSS9U l        U R                  U R                  :w  a  [
        R                  " UUSSSS9U l        g g )	Nr+  r   Tr,  r   r!   r   r   )rz   r{   r  r)  r]  re   r0  rR  rT  r(  r   rS  rU  rV  )r   r  r)  r]  r   s       r4   r{   Emu3VQVAEResnetBlock.__init__  s     	&&2&:{(,!;2SW`deDJ<BTXaefDJ-nJDJ-nKDJYY

 YY

 t000 "		!D 1r3   rK   c                 |   U R                   c  SOU4nUnU R                  " U/UQ76 nU[        R                  " U5      -  nU R	                  U5      nU R
                  " U/UQ76 nU[        R                  " U5      -  nU R                  U5      nU R                  U R                  :w  a  U R                  U5      nXA-   $ Nr)   )
r]  rR  r/   rY  rS  rT  rU  r  r)  rV  )r   rK   r]  	norm_argsr   s        r4   r   Emu3VQVAEResnetBlock.forward  s    --5BN;L	 

==9=}55

=1

==9=}55

=1t000((2H''r3   )rS  rU  r  rV  rR  rT  r)  r]  )NNr   r;  r   s   @r4   r\  r\    s`     $(%)	** Dj* d
	* *X(U\\ (5<<RVCV ( (r3   r\  c            
          ^  \ rS rSrSrS\4U 4S jjr SS\R                  S\R                  S-  S\	\R                  \R                  S-  4   4S	 jjr
S
rU =r$ )Emu3VQVAEAttentionBlocki*  rs   rt   c                 .  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        SU l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).rw   Fr!   )rz   r{   rt   r}   r   r~   	num_headsrT   
ValueErrorscaler   r\   r   re   r   r   r   r   out_projrb   r   s     r4   r{    Emu3VQVAEAttentionBlock.__init__-  s"   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..A %&!r3   NrK   rZ   rM   c                    UR                   SS n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n[        R                  " U R                  R                  [        5      n	U	" U UUUUU R                  U R                  U R                  (       d  SOU R                  S9u  pU
R                   " / UQSP76 R#                  5       n
U R%                  U
5      n
X4$ )z#Input shape: Batch x Time x ChannelNr6   r!   r7   r   )r   r[   r\   )r:   rT   r   r   rd   r   r   r   r   rt   r   ro   r   ri  ra   r\   rP   rj   rj  )r   rK   rZ   r]   r   r   querieskeysvaluesr   rn   rm   s               r4   r   Emu3VQVAEAttentionBlock.forwardD  s6    $))#2.88b8$--8++m,11,?II!QO{{=)..|<FFq!L]+00>HHAN(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
! "));;;;FFHmmK0((r3   )rt   r\   r   rT   r   r   rg  rb   rj  r   ri  r   r   )r*   r+   r,   r-   r.   r$   r{   r/   r   r   r   r2   r   r   s   @r4   re  re  *  sa    G& &4 /3!)||!) t+!)
 
u||U\\D00	1!) !)r3   re  c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )Emu3VQVAEGroupNormih  z
Same as the torch GroupNorm with the only difference that this ones accepts
an optional kwarg `quant_states` which is not used. This class makes it easier to
use SpatialNorm or GroupNorm without conditionals
c                 &   > [         TU ]  " S0 UD6  g ra  )rz   r{   )r   r]   r   s     r4   r{   Emu3VQVAEGroupNorm.__init__o  s    "6"r3   c                     [         R                  " XR                  U R                  U R                  U R
                  5      $ r   )r
  
group_normr.  r   ry   r   )r   inputr6  s      r4   r   Emu3VQVAEGroupNorm.forwardr  s'    ||E??DKKDHHUUr3   r)   r   )	r*   r+   r,   r-   r.   r{   r   r2   r   r   s   @r4   rr  rr  h  s    #V Vr3   rr  c                   p   ^  \ rS rSrSU 4S jjrSS\R                  S\R                  S-  4S jjrSrU =r	$ )	Emu3VQVAEMiddleBlockiv  Nc                    > [         TU ]  5         [        UUUS9U l        [	        U5      U l        Uc  [        USSSS9U l        O[        X25      U l        [        UUUS9U l	        g )Nr  r)  r]  r+  r   Tr,  )
rz   r{   r\  block_1re  attn_1rr  	attn_normr(  block_2)r   rt   r  r]  r   s       r4   r{   Emu3VQVAEMiddleBlock.__init__w  sm    +#$)

 .f5!/[UW]ajnoDN1.NDN+#$)
r3   rK   r6  c                 N   U R                  X5      nUnU R                  X5      nUR                  u  pEpgUR                  XEXg-  5      R	                  SS5      nU R                  U5      S   nUR                  XFXu5      R                  SSSS5      nX1-   nU R                  X5      nU$ )Nr!   r7   r   r   )	r}  r  r:   r   rd   r~  rP   r   r  )r   rK   r6  r   r   r   r   r   s           r4   r   Emu3VQVAEMiddleBlock.forward  s    ]A }C.;.A.A+
f%**:PZZ[\^_`M215%--j%RZZ[\^_abdef 0]Ar3   )r~  r  r}  r  r   
r*   r+   r,   r-   r{   r/   FloatTensorr   r2   r   r   s   @r4   rz  rz  v  s2    
(
U%6%6 
eFWFWZ^F^ 
 
r3   rz  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )Emu3VQVAEDownBlocki  c                   > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nS[        U5      -   nX@l        [        R                  " 5       U l        [        U R                  5       GHL  n[        R                  " 5       n[        R                  " 5       n[        R                  " 5       nX$U   -  n	X#U   -  n
[        U R
                  5       H~  nUR                  [        U	U
S95        U
n	UR                  c  M-  XQR                  ;   d  M>  UR                  [!        U5      5        UR                  [        R"                  " U	SSSS95        M     [        R$                  " 5       nXll        X|l        Xl        XPR                  S-
  :w  a  [-        U	5      Ul        U R                  R                  U5        GMO     g )Nr!   r  r)  r+  r   Tr,  r!   )rz   r{   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsr   in_channel_multiplierre   
ModuleListdownrangeappendr\  attn_resolutionsre  r0  Moduleblockattn
attn_normsr   
downsample)r   rt   r  r  r  i_levelr  r  r  block_in	block_outi_blockr  r   s                r4   r{   Emu3VQVAEDownBlock.__init__  s   "6#<#<=$33,,#66 $u-?'@ @%:"MMO	T112GMMOE==?DJ$W'EEH%7(CCI !4!45($,%. %**67F]F];]KK 7 ?@%%bllUW]ajn&op 6 99;DJI(O..22"@"JIIT"1 3r3   rK   c                 <   [        U R                  5       GH  u  p#[        U R                  5       H  nUR                  U   " U5      n[        UR                  5      S:  d  M3  UnUR                  U   " U5      nUR                  u  pgpUR                  XgX-  5      R                  SS5      nUR                  U   " U5      S   nUR                  XhX5      R                  SSSS5      nXQ-   nM     X R                  S-
  :w  d  M  UR                  U5      nGM     U$ )Nr   r!   r7   r   )	enumerater  r  r  r  r  r  r  r:   r   rd   rP   r   r  r  )
r   rK   r  blocksr  r   r   r   r   r   s
             r4   r   Emu3VQVAEDownBlock.forward  s   (3OG !4!45 &W 5m Dv{{#a',H$*$5$5g$>}$MM:G:M:M7J&$1$6$6zV^$\$f$fghjk$lM$*KK$8$G$JM$1$9$9*e$^$f$fghjkmnpq$rM$,$<M 6 ..22 & 1 1- @  4" r3   )r  r  r  r  r  r   s   @r4   r  r    s     ##JU%6%6  r3   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )Emu3VQVAEUpBlocki  c           
        > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  UR                  S   -  n[        R                  " 5       U l
        [        [        U R                  5      5       GH8  n[        R                  " 5       n[        R                  " 5       n[        R                  " 5       nUR                  UR                  U   -  n[        U R
                  S-   5       Hd  n	UR                  [        UUUS95        UnXAR                  ;   d  M0  UR                  [!        U5      5        UR                  [#        X#5      5        Mf     [        R$                  " 5       n
XZl        Xjl        Xzl        US:w  a  [-        U5      U
l        U R                  R1                  SU
5        GM;     g )Nr6   r!   r|  r   )rz   r{   r  r  r  r  r   r  re   r  upreversedr  r  r\  r  re  r(  r  r  r  r  r  upsampleinsert)r   rt   r]  r  r  r  r  r  r  r  r  r   s              r4   r{   Emu3VQVAEUpBlock.__init__  si   "6#<#<=$33))''&*C*CB*GG--/d&:&: ;<GMMOE==?DJ,,v/H/H/QQI !4!4q!89($,%.'5 %555KK 7 ?@%%&:>&TU : BHG&M!|:8DGGNN1b!3 =r3   rK   r6  c                 b   [        U R                  S S S2   5       GH  u  p4[        U R                  S-   5       H  nUR                  U   " X5      n[        UR                  5      S:  d  M3  UnUR                  U   " X5      nUR                  u  pxpUR                  XxX-  5      R                  SS5      nUR                  U   " U5      S   nUR                  XyX5      R                  SSSS5      nXa-   nM     U[        U R                  5      S-
  :w  d  M  UR                  U5      nGM     U$ )Nr6   r!   r   r7   r   )r  r  r  r  r  r  r  r  r:   r   rd   rP   r   r  )r   rK   r6  r  r  r  r   r   r   r   r   s              r4   r   Emu3VQVAEUpBlock.forward  s   (27OG !4!4q!89 &W 5m Rv{{#a',H$*$5$5g$>}$[M:G:M:M7J&$1$6$6zV^$\$f$fghjk$lM$*KK$8$G$JM$1$9$9*e$^$f$fghjkmnpq$rM$,$<M : #dgg,** & >  8  r3   )r  r  r  r  r   s   @r4   r  r    s-    #"JU%6%6 eFWFW  r3   r  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )Emu3VQVAEEncoderi  c                   > [         TU ]  5         UR                  nUR                  nUR                  nUR
                  nUR                  nU(       a  SU-  OUnX&S   -  n[        R                  R                  X2SSSS9U l
        [        U5      U l        [        X5      U l        [        R                  R                  SUSSS	9U l        [        R                  R                  UUSSSS9U l        [%        [&        R(                  " UR*                  5      5      n	[        R,                  " 5       U l        [        R,                  " 5       U l        [3        U	5       H)  n
[5        Xw5      nU R.                  R7                  U5        M+     [3        UR8                  5       H(  n[;        UUS
9nU R0                  R7                  U5        M*     g )Nr7   r6   r   r!   r   r+  r   T)r.  r-  r   r/  r  )rz   r{   r  r  double_latentlatent_channelsr  r/   re   r   conv_inr  
down_blockrz  middle_blockr0  norm_outconv_outr   mathlog2temporal_downsample_factorr  	time_convtime_res_stackr  rI  r  r  rO  )r   rt   r  r  r  r  r  r)  r  temporal_down_blocksir  r   time_res_convr   s                 r4   r{   Emu3VQVAEEncoder.__init__  s   ,,((,, 00#66.;q?* b#99xx{qYZdef,V40B**bxUYbf*g ( 
  #499V-N-N#OP mmo+,A.|JDNN!!$' - v,,-A8()M &&}5 .r3   pixel_valuesc                 t   UR                   S   nUR                  " S/UR                   SS  Q76 nU R                  U5      nU R                  U5      nU R	                  U5      nU R                  U5      nU[        R                  " U5      -  nU R                  U5      nUR                  " SU/UR                   SS  Q76 nUR                  SSSSS5      nU R                   H$  nU" U5      nU[        R                  " U5      -  nM&     U R                   H  nU" U5      nM     UR                  SSSSS5      nU$ )Nr!   r6   r7   r   r   r   )r:   rP   r  r  r  r  r/   rY  r  r   r  r  )r   r  temporal_dimrK   r  layers         r4   r   Emu3VQVAEEncoder.forward5  s:   #))!,#++BH1C1CAB1GH \26))-8 m4}55m4%--b,YATATUVUWAXY%--aAq!< NND /MU]]=99M # ((E!-0M ) &--aAq!<r3   )r  r  r  r  r  r  r  )
r*   r+   r,   r-   r{   r/   r0   r   r2   r   r   s   @r4   r  r    s     %6NE$4$4  r3   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Emu3VQVAEDecoderiS  rt   c                   > [         T	U ]  5         UR                  nUR                  UR                  S   -  n[
        R                  " 5       U l        [        UR                  5       H<  n[        UR                  UR                  S9nU R                  R                  U5        M>     [        [        R                  " UR                   5      5      n[
        R                  " 5       U l        [        U5       H>  n[%        UR                  UR                  5      nU R"                  R                  U5        M@     [
        R&                  " UR                  USSSS9U l        [+        XUS9U l        [/        U5      U l        UR                  UR                  S   -  n[3        X#5      U l        [
        R&                  " UUR6                  SSSS9U l        g )Nr6   r  r   r!   r   )r]  r   )rz   r{   r   r  r  re   r  r  r  r  rO  r  r  r   r  r  r  r  r=  r   r  rz  r  r  up_blockr(  r  r)  r  )
r   rt   r]  r  r   r  temp_upsample_block_numr  r  r   s
            r4   r{   Emu3VQVAEDecoder.__init__T  s|   ))''&*C*CB*GG mmov,,-A8"22AWAWM &&}5	 . #&dii0Q0Q&R"S./A,V-C-CVE[E[\DNN!!$' 0 yy""
 1R`a(0''&*C*CA*FF,^F		
r3   rK   r6  c                    [         R                  " X4SS9nUR                  SSSSS5      nU R                   H  nU" U5      nM     U R                   H$  nU" U5      nU[         R
                  " U5      -  nM&     UR                  SSSSS5      n[         R                  " USSS9u  pUR                  " S/UR                  SS  Q76 nUR                  " S/UR                  SS  Q76 nU R                  U5      nU R                  X5      nU R                  X5      nU R                  X5      nU[         R
                  " U5      -  nU R                  U5      nU$ )Nr   r8   r7   r!   r   r   r6   )r/   r;   r   r  r  rY  chunkrP   r:   r  r  r  r  r  )r   rK   r6  hidden_quant_statesr  s        r4   r   Emu3VQVAEDecoder.forward{  sV   #ii(E1M199!Q1aH ((E"'(;"< ) ^^E"'(;"<5==1D#EE $ 299!Q1aH&+kk2Eqa&P#%--bK=3F3Fqr3JK#++BH1C1CAB1GH]3 ))-FmBmB}55m4r3   )r  r  r  r  r  r  r  )r*   r+   r,   r-   r$   r{   r/   r   r   r2   r   r   s   @r4   r  r  S  s0    %
 %
NU\\   r3   r  aR  
    The VQ-VAE model used in Emu3 for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    custom_introc            
         ^  \ rS rSr% \\S'   SrSrSrSr	Sr
SrSr/ SQr\\/\S.r\R&                  " 5       S	 5       rS\4U 4S
 jjr\\S\R0                  S\R0                  S\\   S\4S j5       5       rS\R0                  4S jrSrU =r$ )	Emu3VQVAEi  rt   
emuvideovqr  )imageT)rO  re  r\  r   rK   
attentionsc                 f   [        U[        R                  [        R                  45      (       a  [        R
                  " UR                  SSS9  UR                  br  [        R                  R                  R                  UR                  5      u  p#S[        R                  " U5      -  n[        R                  " UR                  U* U5        g g [        U[        R                  5      (       a  [        R                  " UR                  [        R                  " S5      S9  UR                  bz  [        R                  R                  R                  UR                  5      u  p#US:  a  S[        R                  " U5      -  OSn[        R                  " UR                  U* U5        g g [        U[        R                  [        R                   [        R"                  45      (       a  [        R$                  " UR                  S5        [        R$                  " UR                  S	5        ['        US
S 5      ba  [        R(                  " UR*                  5        [        R,                  " UR.                  5        [        R(                  " UR0                  5        g g [        U[        R2                  5      (       ay  [        R4                  " UR                  5        UR6                  bK  ['        UR                  SS5      (       d.  [        R(                  " UR                  UR6                     5        g g g g )Nfan_outrelu)r	  nonlinearityr!      )ar   r   r   running_mean_is_hf_initializedF)
isinstancere   r   r  initkaiming_normal_r   ry   r/   _calculate_fan_in_and_fan_outr  sqrtr   r   kaiming_uniform_BatchNorm2drQ  r0  	constant_r|   zeros_r  ones_running_varnum_batches_trackedr   normal_padding_idx)r   rV   fan_inr   bounds        r4   _init_weightsEmu3VQVAE._init_weights  s   fryy"))455  YVT{{&!HHMMGGV	DIIf--fkkE659 ' 		**!!&--499Q<@{{&!HHMMGGV	17!DIIf--fkkE659 '  NOONN6==#.NN6;;,v~t4@F//0

6--.F667 A --LL'!!-gfmmMach6i6iFMM&*<*<=> 7j- .r3   c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        U5      U l        S[        UR                  5      S-
  -  U l        [        UR                  UR                  SSS9U l        [        UR                  UR                  SSS9U l        S[        UR                  5      S-
  -  U l        U R%                  5         U R'                  5         g )Nr7   r!   )r   r!   r!   r@  rA  )rz   r{   rt   r  encoderr  decoderr   quantizer  r  vision_spatial_factorr  r  r   
quant_convpost_quant_convspatial_scale_factoreval	post_initr   s     r4   r{   Emu3VQVAE.__init__  s     '/'/08%&3v/H/H+IA+M%N")""F$4$4)T]
  /f44)T] 
 %&#f.G.G*H1*L$M!		r3   image_sizesr]   rM   c                    UR                   S:H  nU(       aJ  U R                  R                  nUR                  u  pgpUR	                  S5      R                  SUSSS5      nOUR                  u  pepxn	U R                  U5      n
U
R                  SSSSS5      nU R                  U5      nUR                  SSSSS5      nU R                  U5      nU(       a  UR                  S5      OUn[        X5       VVs/ s HB  u  pUS [        US   U R                  -  5      2S [        US   U R                  -  5      24   PMD     nnn[        U
US9$ s  snnf )Nr   r!   r   r7   r   )last_hidden_stater(   )ndimrt   r  r:   rB   repeatr  r   r  r  squeezer  r   r  r&   )r   r  r  r]   is_imager   r   r   r   r   rK   conv_hidden_statescodesr(   single_imager9  s                   r4   encodeEmu3VQVAE.encode  sd   
  $$){{==H2>2D2D/J&'11!4;;AxAqQL<H<N<N9J(E\2 +221aAqA!__-?@ 0771aAF01+3u}}Q' '*,&D
&D" D3tAw)C)CCDDFqDQRGVZVpVpLpHqFqqr&D 	 

 $+%
 	

s   6A	ErK   c                    UR                   S:H  nU(       a  UR                  S5      nUR                  u  p4pVU R                  R	                  UR                  5       5      nUR                  S   nUR                  X4XVU5      R                  SSSSS5      R                  5       nU R                  U5      n	UR                  SSSSS5      nU	R                  SSSSS5      n	U R                  X5      n
U
R                  UX@R                  R                  -  U R                  R                  XPR                  -  X`R                  -  5      n
U(       a	  U
S S 2S4   $ U
$ )Nr   r!   r6   r   r   r7   )r   rB   r:   r  r   flattenr   r   rj   r  r  rP   rt   r  r)  r  )r   rK   r  r   r   r   r   quantr   
post_quantvideos              r4   decodeEmu3VQVAE.decode  s;    %%*)33A6M.;.A.A+
f''(=(=(?@;;r?

:IQQRSUVXY[\^_`kkm))%0
aAq!,''1aA6
Z/{{===KK$$...---
 'uQT{1E1r3   )rt   r  r  r  r  r  r  r  ) r*   r+   r,   r-   r$   r1   base_model_prefixmain_input_nameinput_modalities_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backend_no_split_modulesr\  rO  re  _can_record_outputsr/   no_gradr  r{   r   r    r   r   r   r&   r  r  r2   r   r   s   @r4   r  r    s     $$O!N"& /0LM-
 ]]_? ?4 *  
!LL
7<||
OUVhOi
	
   
B2ELL 2 2r3   r  c                       \ rS rSrSrS r\S 5       r\S 5       r\S 5       r	\S 5       r
\S 5       r\S	 5       rS
\\R                     S\R                  4S jrS
\R                  S\R                  4S jrSrg)Emu3ImageVocabularyMappingi"  zE
A class for mapping discrete image tokens from VQGAN to BPE tokens.
c                 h    Xl         UR                  S5      U l        UR                  S5      U l        g )Nz<|extra_200|>z<image>)	vocab_mapgeteol_token_idimage_token_id)r   r  s     r4   r{   #Emu3ImageVocabularyMapping.__init__'  s)    "%MM/:'mmI6r3   c           	          [        U R                  R                  5        VVs/ s H  u  pUR                  S5      (       d  M  UPM!     snn5      $ s  snnf Nz<|visual tokensortedr  items
startswithr   namevals      r4   r(   'Emu3ImageVocabularyMapping.image_tokens,  s<    DNN,@,@,Bh,BytdooVfFgs,Bhiih   A
A
c           	          [        U R                  R                  5        VVs/ s H  u  pUR                  S5      (       d  M  UPM!     snn5      $ s  snnf r#  r$  r(  s      r4   image_tokens_str+Emu3ImageVocabularyMapping.image_tokens_str0  s<    T^^-A-A-Ci-C	tWgGht-Cijjir,  c                 z    U R                    Vs0 s H  n[        USS 5      U R                  U   _M!     sn$ s  snf )Nir8  )r.  r   r  )r   tokens     r4   img2bpe"Emu3ImageVocabularyMapping.img2bpe4  s;    FJF[F[\F[UE"RL!4>>%#88F[\\\s   &8c                 l    U R                   R                  5        VVs0 s H  u  pX!_M	     snn$ s  snnf r   )r2  r&  )r   rD   vs      r4   bpe2img"Emu3ImageVocabularyMapping.bpe2img8  s-    !%!3!3!56!5!5666s   0c                     [         R                  " [        U R                  R	                  5       5      S-   [         R
                  S9nU R                  R                  5        H	  u  p#X1U'   M     U$ Nr!   r_   )r/   zerosmaxr6  rn  r   r&  r   mappingrD   r5  s       r4   bpe2img_mapping_tensor1Emu3ImageVocabularyMapping.bpe2img_mapping_tensor<  R    ++c$,,"3"3"56:%))LLL&&(DAAJ )r3   c                     [         R                  " [        U R                  R	                  5       5      S-   [         R
                  S9nU R                  R                  5        H	  u  p#X1U'   M     U$ r9  )r/   r;  r<  r2  rn  r   r&  r=  s       r4   img2bpe_mapping_tensor1Emu3ImageVocabularyMapping.img2bpe_mapping_tensorC  rA  r3   	img_batchrM   c                 "   UR                   n[        R                  " UR                  S   S4[        R                  S9U R
                  -  nU R                  UR                  S5         n[        R                  " XC/SS9nUR                  U5      $ )Nr   r!   r:  cpur6   r8   )	devicer/   r   r:   r   r  rC  ri   r;   )r   rE  rH  eol_row
img_tokenss        r4   convert_img2bpe*Emu3ImageVocabularyMapping.convert_img2bpeJ  su    !!**iooa0!4EIIFIZIZZ00e1DE
YY
4"=
}}V$$r3   c                     UR                   nUSS S24   nU R                  UR                  S5         nUR                  U5      $ )N.r6   rG  )rH  r?  ri   )r   rE  rH  rJ  s       r4   convert_bpe2img*Emu3ImageVocabularyMapping.convert_bpe2imgQ  sG    !!c3B3h'	00e1DE
}}V$$r3   )r  r   r  N)r*   r+   r,   r-   r.   r{   r   r(   r.  r2  r6  r?  rC  listr/   r   rK  rN  r2   r)   r3   r4   r  r  "  s    7
 j j k k ] ] 7 7    %ell); % %% %%,, %r3   r  c                   X    \ rS rSr% \\S'   SrSrSrS/r	SS/r
SrSrSrSrSr\\S	.rS
rg)Emu3PreTrainedModeliX  rt   modelr  textTr   r   causal_maskr  r)   N)r*   r+   r,   r-   r"   r1   r  r  supports_gradient_checkpointingr  _skip_keys_device_placementr  r  _can_compile_fullgraphr  r  r   rq   r  r2   r)   r3   r4   rR  rR  X  s]    (&*# $5m"DN!"&)#r3   rR  c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\	   SS\S-  S\
S   S\S-  S	\S
\4   4S jj5       r\R                  " 5       \S 5       5       rSrU =r$ )Emu3RotaryEmbeddingin  inv_freqNrt   c                   > [         TU ]  5         UR                  U l        UR                  U l        Xl        U R
                  R                  S   U l        U R                  nU R                  S:w  a  [        U R                     nU" U R
                  U5      u  o@l
        U R                  SUSS9  U R                  SUR                  5       SS9  g )N	rope_typedefaultr\  F)
persistentoriginal_inv_freq)rz   r{   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrt   rope_parametersr^  compute_default_rope_parametersr   attention_scalingregister_bufferclone)r   rt   rH  rope_init_fnr\  r   s        r4   r{   Emu3RotaryEmbedding.__init__q  s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L((ZeD0(..2BuUr3   rH  ztorch.deviceseq_lenrM   ztorch.Tensorc           	         U R                   S   n[        U SS5      =(       d    U R                  U R                  -  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXe4$ )	aH  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetarT   Nr   r   r7   r:  )rH  r_   )	re  r|   r}   r~   r/   arangeint64ri   r   )rt   rH  rl  baser9   attention_factorr\  s          r4   rf  3Emu3RotaryEmbedding.compute_default_rope_parameters  s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 ))r3   c                 L   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r6   r!   mpsrG  F)device_typeenabledr7   r8   r:  )r\  r   rO   r:   ri   rH  r  typestrr   rd   r/   r;   rE   rg  rF   r_   )
r   r<   r   inv_freq_expandedposition_ids_expandedrv  freqsembrE   rF   s
             r4   r   Emu3RotaryEmbedding.forward  sN    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   BF
F#)rg  rt   rc  rd  r^  r   r   )r*   r+   r,   r-   r/   r   r1   r"   r{   staticmethodr   r   r   r   rf  r  r   r   r2   r   r   s   @r4   r[  r[  n  s    llVz V V  $(+/"*T!*(* t* 
~u$	%	* *: ]]_<  <r3   r[  c                     ^  \ rS rSr% \\S'   S\4U 4S jjr\\\	      SS\
R                  S-  S\
R                  S-  S\
R                  S-  S\S-  S	\
R                  S-  S
\S-  S\\   S\4S jj5       5       5       rSrU =r$ )Emu3TextModeli  rt   c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        [#        US9U l        SU l        U R)                  5         g s  snf )Nr   rt   F)rz   r{   pad_token_idr  
vocab_sizere   r   r}   embed_tokensr  r  num_hidden_layersr   layersr   r   normr[  
rotary_embgradient_checkpointingr  r   s      r4   r{   Emu3TextModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammBGH`H`BabBaYf0Bab
   2 28K8KL	-V<&+# 	 cs   C?N	input_idsrZ   r   r   inputs_embedsr   r]   rM   c           
      >   US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  [        U R                  S9nUcU  Ub  UR	                  5       OSn[
        R                  " UR                  S   UR                  S9U-   nUR                  S5      n[        U R                  UUUUS9n	Un
U R                  XS9nU R                  S U R                  R                    H  nU" U
4U	UUUUS.UD6n
M     U R                  U
5      n
[        U
US	9$ )
Nz:You must specify exactly one of input_ids or inputs_embedsr  r   r!   )rH  )rt   r  rZ   r   r   )r   )rZ   r   r   r   r   )r  r   )rh  r  r   rt   get_seq_lengthr/   ro  r:   rH  rB   r   r  r  r  r  r   )r   r  rZ   r   r   r  r   r]   past_seen_tokensrV  rK   r   decoder_layers                r4   r   Emu3TextModel.forward  sF    -t";<YZZ *.*;*;I*FM0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 &"oomoW![[)H4;;+H+HIM)*$7) /# M J 		-0&++
 	
r3   )r  r  r  r  r  r  r  )NNNNNN)r*   r+   r,   r-   r#   r1   r{   r   r    r   r/   r0   r   r
   r  r   r   r   r   r   r2   r   r   s   @r4   r  r    s    ~     .2.204(,26!%2
##d*2
 t+2
 &&-	2

 2
 ((4/2
 $;2
 +,2
 
!2
    2
r3   r  c                   \  ^  \ rS rSr% SS0rSS0rSS/S/40r\\S'   U 4S	 jr	\
\        SS\R                  S
-  S\R                  S
-  S\R                  S
-  S\S
-  S\R                   S
-  S\R                  S
-  S\S
-  S\\R                  -  S\\   S\4S jj5       5       rSrU =r$ )Emu3ForCausalLMi  lm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputrK   logitsrt   c                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g NFrx   )
rz   r{   r  rS  r  re   r   r}   r  r  r   s     r4   r{   Emu3ForCausalLM.__init__  sU     "6*
 ++yy!3!3V5F5FUS 	r3   Nr  rZ   r   r   r  labelsr   logits_to_keepr]   rM   c	           
      |   U R                   " SUUUUUUS.U	D6n
U
R                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nSnUb)  U R                  " SXU R                  R                  S.U	D6n[        UUU
R                  U
R                  U
R                  S9$ )a[  
Example:

```python
>>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
>>> import torch
>>> import httpx
>>> from io import BytesIO
>>> from PIL import Image

>>> model = Emu3ForCausalLM.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
>>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

>>> inputs = processor(text=["Can you write me a poem about winter."], return_tensors="pt").to(model.device)

>>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
>>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
```r  rZ   r   r   r  r   Nr  r  r  lossr  r   rK   r  r)   )rS  r  r  r   slicer  loss_functionrt   r  r   r   rK   r  )r   r  rZ   r   r   r  r  r   r  r]   outputsrK   slice_indicesr  r  s                  r4   r   Emu3ForCausalLM.forward  s    @ ,0:: ,
)%+',
 ,
  118B>SV8W8W~ot4]kmA}a,?@A%%pVt{{OeOepiopD%#33!//))
 	
r3   )r  rS  r  )NNNNNNNr   )r*   r+   r,   r-   _tied_weights_keys_tp_plan_pp_planr#   r1   r{   r   r   r/   r0   r   r
   r  r   r   r   r   r   r   r2   r   r   s   @r4   r  r    s   *,GH23H_-z:;H  .2.204(,26*.!%-.7
##d*7
 t+7
 &&-	7

 7
 ((4/7
   4'7
 $;7
 ell*7
 +,7
 
 7
  7
r3   r  c                     ^  \ rS rSrU 4S jrS rS rS\R                  S\R                  S\R                  4S jr
\\" S	S
9S\R                  S\R                  S\\   S\\-  4S j5       5       r\R$                  " 5       S\R                  S\S\4S j5       rS\R                  S\R                  S\R                  4S jr\\        SS\R                  S-  S\R                  S-  S\R,                  S-  S\R,                  S-  S\R                  S-  S\S-  S\R                  S-  S\S-  S\\   S\\-  4S jj5       5       rSrU =r$ )	Emu3ModeliG  c                    > [         TU ]  U5        [        R                  UR                  5      U l        [        UR                  5      U l        [        UR                  5      U l        U R                  5         g r   )rz   r{   r  _from_configtext_config
text_modelr  	vq_configvqmodelr  vocabulary_mapvocabulary_mappingr  r   s     r4   r{   Emu3Model.__init__H  sY     '44V5G5GH !1!12"<V=R=R"S 	r3   c                 6    U R                   R                  5       $ r   )r  get_input_embeddingsr   s    r4   r  Emu3Model.get_input_embeddingsQ  s    3355r3   c                 :    U R                   R                  U5        g r   )r  set_input_embeddingsr   rY   s     r4   r  Emu3Model.set_input_embeddingsT  s    ,,U3r3   r  r  rM   c                     U R                   R                  XSS9nUR                   Vs/ s H+  o@R                  R	                  U5      R                  5       PM-     nn[        R                  " U5      nU$ s  snf )a  
Tokenizes images into discrete tokens with VQGAN module. Converts
obtained image tokens into BPE tokens and wraps with "boi" and "eoi"
special tokens.

Args:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        The tensors corresponding to the input images.
    image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
        The sizes of the images in the batch, being (height, width) for each image.
T)return_dict)r  r  r(   r  rK  r
  r/   r;   )r   r  r  vqmodel_outputstokensbpe_tokens_list
bpe_tokenss          r4   get_image_tokensEmu3Model.get_image_tokensW  st     150C0CLko0C0pTcTpTp
Tp&##33F;CCETp 	 
 YY/
	
s   2A5zbTokenizes images into discrete tokens with VQGAN module and embeds them with text embeddings layerr  r]   c                    U R                   R                  " X4SS0UD6nU VVs/ s H9  u  pVXPR                   R                  -  X`R                   R                  -  S-   -  PM;     nnnUR                   Vs/ s H+  oR                  R                  U5      R                  5       PM-     n	n[        R                  " U	5      n
U R                  5       " U
5      n[        R                  " X5      nXl        U$ s  snnf s  snf )z
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
    The tensors corresponding to the input images.
r  Tr!   )r  r  r  r(   r  rK  r
  r/   r;   r  splitpooler_output)r   r  r  r]   r  r   r   split_sizesr  r  r  image_embeddingsimage_featuress                r4   get_image_featuresEmu3Model.get_image_featuresj  s     150C0C1
371
;A1

 "-
!, ||999e||GiGi>ilm>mn!, 	 

 UdTpTp
Tp&##33F;CCETp 	 
 YY/
446zB%5C(6%

s   A C462C:r(   r   r   c                     USS2SS24   R                  SX#S-   5      nU R                  R                  U5      nU R                  R	                  U5      nU$ )a  
Decodes generated image tokens from language model to continuous pixel values
with VQGAN module via upsampling.

Args:
    image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
        The tensors corresponding to the input images.
    height (`int`):
        Height of the generated image before upsampling.
    width (`int`):
        Width of the generated image before upsampling.
Nr6   r!   )r   r  rN  r  r  )r   r(   r   r   	sequencesr  s         r4   decode_image_tokensEmu3Model.decode_image_tokens  sV     !CRC(--b&!)D	..>>yI##L1r3   r  r  r  c           	      F   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S   UR                  S   -  nUR                  S5      R                  U5      R                  UR                  5      n[        X$   R                  5       UR                  5       :H  SU SU 35        U$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
)r_   rH  r6   r   r!   z6Image features and image tokens do not match, tokens: z, features: )r  r/   tensorr  r   longrH  allr   r:   rB   	expand_asri   r   numel)r   r  r  r  special_image_maskn_image_tokensn_image_featuress          r4   get_placeholder_maskEmu3Model.get_placeholder_mask  s    !.2K2K2MT44CC5::^k^r^rs3 " "4!7!7!;!*.E.E.T.T!T+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno-3359M9M9OOD^DTT`aq`rs	
 "!r3   NrZ   r   r   r   c	           	      B   USL USL-  (       a  [        S5      eUc  U R                  5       " U5      nUbQ  U R                  X#5      R                  n
[        R
                  " U
SS9n
U R                  XU
S9nUR                  X5      nU R                  " SUUUUUS.U	D6nU$ )aH  
image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
    The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
    [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
    [`Emu3ImageProcessor`] for processing images).
NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oner   r8   )r  r  )rZ   r   r   r  r   r)   )	rh  r  r  r  r/   r;   r  masked_scatterr  )r   r  r  r  rZ   r   r   r  r   r]   r  r  r  s                r4   r   Emu3Model.forward  s    ( -t";<s    557	BM#!44\O]]N"YY~1=N!%!:!:~ "; " *889K\M // 
)%+'
 
 r3   )r  r  r  )NNNNNNNN)r*   r+   r,   r-   r{   r  r  r/   r  r0   r  r   r   r   r   r   r&   r  r  r   r  r  r   r
   r   r   r   r2   r   r   s   @r4   r  r  G  s   64U->-> UM]M] bgbrbr & y!--<A<L<LX^_qXr	%	% 0 ]]_0@0@ # VY  $"))":?:K:K"]b]n]n"0  .215+/.204(,26!%,##d*, ''$., \\D(	,
 t+, &&-, , ((4/, $;, +,, 
'	',  ,r3   r  c                     ^  \ rS rSrSrSS0rU 4S jrS rS rS\	R                  4S	 jrS
 r\\          SS\R                   S-  S\R"                  S-  S\R$                  S-  S\R$                  S-  S\R                   S-  S\S-  S\R"                  S-  S\S-  S\R                   S-  S\\R$                  -  S\\   S\\-  4S jj5       5       r       SU 4S jjrSrU =r$ )Emu3ForConditionalGenerationi  rT  r  z$model.text_model.embed_tokens.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g r  )rz   r{   r  rS  re   r   r  r}   r  r  r  r   s     r4   r{   %Emu3ForConditionalGeneration.__init__  sS     v&
yy!3!3!?!?ASASA^A^ejkr3   c                 6    U R                   R                  5       $ r   )rS  r  r   s    r4   r  1Emu3ForConditionalGeneration.get_input_embeddings  s    zz..00r3   c                 :    U R                   R                  U5        g r   )rS  r  r  s     r4   r  1Emu3ForConditionalGeneration.set_input_embeddings  s    

''.r3   rM   c                     U R                   $ r   )r  r   s    r4   get_output_embeddings2Emu3ForConditionalGeneration.get_output_embeddings  s    ||r3   c                 :    U R                   R                  " S0 UD6$ ra  )rS  r  )r   r]   s     r4   r  0Emu3ForConditionalGeneration.decode_image_tokens  s    zz--777r3   Nr  r  r  rZ   r   r   r  r   r  r  r]   c           
         U R                   " SUUUUUUS.UD6nUS   n[        U
[        5      (       a  [        U
* S5      OU
nU R	                  USS2USS24   5      nSnU	b3  U R
                  " SXU R                  R                  R                  S.UD6n[        UUUR                  UR                  UR                  S9$ )a  
image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
    The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
    [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
    [`Emu3ImageProcessor`] for processing images).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
>>> import torch
>>> import httpx
>>> from io import BytesIO
>>> from PIL import Image

>>> model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
>>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

>>> conversation = [
...     {
...     "role": "system",
...     "content": [
...         {"type": "text", "text": "You are a helpful assistant."},
...         ],
...     },
...     {
...     "role": "user",
...     "content": [
...         {"type": "image"},
...         {"type": "text", "text": "Please describe the image."},
...         ],
...     },
... ]

>>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=[image], text=[prompt], return_tensors="pt").to(model.device, torch.bfloat16)

>>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
>>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
```r  r   Nr  r  r)   )rS  r  r   r  r  r  rt   r  r  r   r   rK   r  )r   r  r  r  rZ   r   r   r  r   r  r  r]   r  rK   r  r  r  s                    r4   r   $Emu3ForConditionalGeneration.forward  s    @ ** 
)%+'
 
  
8B>SV8W8W~ot4]kmA}a,?@A%% 9P9P9[9[_eD &#33!//))
 	
r3   c	                 `   > [         TU ]  " U4UUUUUUUS.U	D6n
U(       d  U(       a  S U
S'   U
$ )N)r   rZ   r  r   r  r   is_first_iterationr  )rz   prepare_inputs_for_generation)r   r  r   rZ   r  r   r   r  r  r]   model_inputsr   s              r4   r  :Emu3ForConditionalGeneration.prepare_inputs_for_generationV  sR     w<

+)'%%1

 

 "i+/L(r3   )r  rS  )
NNNNNNNNNr   )NNNNTNF)r*   r+   r,   r-   output_modalitiesr  r{   r  r  re   r  r  r  r   r   r/   r0   r  r   r
   r   r   r   r   r   r   r   r  r2   r   r   s   @r4   r  r    s   )*,RS1/ryy 8  .215+/.204(,26!%*.-.Y
##d*Y
 ''$.Y
 \\D(	Y

 t+Y
 &&-Y
 Y
 ((4/Y
 $;Y
   4'Y
 ell*Y
 +,Y
 
'	'Y
  Y
|   r3   r  )r  r  r  rR  r  r  r  )r   )br  collections.abcr   dataclassesr   	functoolsr   typingr   r/   torch.nnre   torch.nn.functionalrf   r
   r   r  activationsr	   cache_utilsr
   r   
generationr   integrationsr   r   r   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr    configuration_emu3r"   r#   r$   r&   r?   rJ   r   r   rU   r  r   ro   rq   r   r   r   r   r   r  r  r(  r=  rI  rO  r\  re  r0  rr  rz  r  r  r  r  r  r  rR  r[  r  r  r  r  __all__r)   r3   r4   <module>r     s  ,  $ ! %      & ! . ) f f / 9 k k K F & a a G 5 K K 
15 1  1( *+ ,2	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%2 )*@)BII @) +@)F Y'J")) J (J(bii  (1 (V$ryy $D	RYY 	299 bii :!299 !H		 .")) &.(299 .(b<(299 <(~;)bii ;)|V V299 D8 8v7ryy 7tCryy CLCryy CL ~2 ~2~2B3% 3%l /  *><")) ><B H
' H
 H
V H
)? H
 H
VX# XvQ#6 Qhr3   