
    Z j                        S SK r S SKJr  S SKJr  S SKrS SKJr  S SKJs  J	r
  SSKJr  SSKJr  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJrJrJrJr  SSKJr  SSK J!r!  SSK"J#r#J$r$  SSK%J&r&J'r'J(r(J)r)J*r*  SSK+J,r,  SSK-J.r.J/r/J0r0  \Rb                  " \25      r3\\ " S S\5      5       5       r4 " S S\&5      r5 " S S\'5      r6 " S S\Rn                  5      r8 " S S\$5      r9 " S S\Rn                  5      r: " S  S!\Rn                  5      r; " S" S#\Rn                  5      r< " S$ S%\Rn                  5      r= " S& S'\Rn                  5      r> " S( S)\Rn                  5      r? " S* S+\Rn                  5      r@ " S, S-\,5      rA " S. S/\R                  5      rC " S0 S1\Rn                  5      rD " S2 S3\Rn                  5      rE " S4 S5\Rn                  5      rF " S6 S7\Rn                  5      rG " S8 S9\Rn                  5      rH\" S:S;9 " S< S=\5      5       rI " S> S?5      rJ " S@ SA\#5      rK " SB SC\)\K5      rL " SD SE\(\K\5      rM " SF SG\K5      rN " SH SI\K\5      rO/ SJQrPg)K    N)	dataclass)cached_property   )initialization)Cache)GenerationMixin)BaseModelOutputWithPoolingCausalLMOutputWithPast)PreTrainedModel)Unpack)auto_docstringcan_return_tupleloggingtorch_compilable_check)merge_with_config_defaults)capture_outputs   )ChameleonPreTrainedModel#ChameleonVQVAEEncoderConvDownsample)LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModelTransformersKwargs)SiglipAttention   )
Emu3ConfigEmu3TextConfigEmu3VQVAEConfigc                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)Emu3VQVAEModelOutput-   z
image_tokens (`torch.LongTensor` of shape `(batch_size, config.vocab_size`):
    Indices of the image tokens predicted by the VQ-VAE model.
Nimage_tokens )
__name__
__module____qualname____firstlineno____doc__r#   torch
LongTensor__annotations____static_attributes__r$       v/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/emu3/modular_emu3.pyr!   r!   -   s    
 -1L%""T)0r.   r!   c                       \ rS rSrSrg)Emu3Attention8   r$   Nr%   r&   r'   r(   r-   r$   r.   r/   r1   r1   8       r.   r1   c                     ^  \ rS rSrS\S\4U 4S jjr     SS\R                  S\R                  S-  S\R                  S-  S	\
S-  S
\S-  S\\R                  \R                  4   S-  S\\   S\R                  4S jjrSrU =r$ )Emu3DecoderLayer=   config	layer_idxc                 n   > [         TU ]  X5        [        R                  " UR                  5      U l        g N)super__init__nnDropoutattention_dropoutdropoutselfr8   r9   	__class__s      r/   r=   Emu3DecoderLayer.__init__>   s&    +zz&":":;r.   Nhidden_statesattention_maskposition_idspast_key_values	use_cacheposition_embeddingskwargsreturnc           
          UnU R                  U5      nU R                  " SUUUUUUS.UD6u  pXR                  U5      -   nUnU R                  U5      nU R	                  U5      nXR                  U5      -   nU$ )N)rF   rG   rH   rI   rJ   rK   r$   )input_layernorm	self_attnrA   post_attention_layernormmlp)
rC   rF   rG   rH   rI   rJ   rK   rL   residual_s
             r/   forwardEmu3DecoderLayer.forwardB   s     !,,];>> 
')%+ 3
 
 !<<#>> 55mD/ <<#>>r.   )rA   )NNNFN)r%   r&   r'   r(   r   intr=   r*   Tensorr+   r   booltupler   r   rU   r-   __classcell__rD   s   @r/   r6   r6   =   s    <z <c < /304(,!&HL|| t+ &&-	
  $; #5<<#=>E +, 
 r.   r6   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )Emu3VQVAEVectorQuantizera   a  
A module for vector quantization using learned embedding vectors.

This module implements the quantization process similar to te one described in
the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
input vectors into discrete codebook vectors, which are learned during training.
Current implementation improves over previous ones by avoiding costly matrix multiplications
and allowing for post-hoc remapping of indices.
r8   c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        U R                  R                  R                  R                  SUR                  -  SUR                  -  5        g )Ng            ?)
r<   r=   r>   	Embeddingcodebook_size	embed_dim	embeddingweightdatauniform_rC   r8   rD   s     r/   r=   !Emu3VQVAEVectorQuantizer.__init__l   sb    f&:&:F<L<LM""++D63G3G,GvOcOcIcdr.   hidden_statec                    UR                   u  p#pEnUR                  SSSSS5      R                  5       nUR                  SU5      n[        R
                  " US-  SSS9n[        R
                  " U R                  R                  S-  SS	9n	S[        R                  " XpR                  R                  R                  SS5      5      -  n
X-   U
-
  n
[        R                  " U
SS	9nUR                  X#XV5      nU$ )
Nr   r   r      r   T)dimkeepdimro   )shapepermute
contiguousviewr*   sumre   rf   matmul	transposeargmin)rC   rk   
batch_sizetemporalchannelsheightwidthhidden_state_flattenedhidden_state_sumembedding_sum	distancesmin_encoding_indicess               r/   rU    Emu3VQVAEVectorQuantizer.forwardq   s    8D8J8J5
h#++Aq!Q:EEG!-!2!22x!@ !99%;Q%>AtT		$.."7"7":B %;^^=R=R=\=\]^`a=bcc	$4y@	$||I1=388v]##r.   )re   )r%   r&   r'   r(   r)   r   r=   r*   rX   rU   r-   r[   r\   s   @r/   r^   r^   a   s+    e e
$ELL $ $r.   r^   c                       \ rS rSrSrg)Emu3VQVAEEncoderConvDownsample   r$   Nr3   r$   r.   r/   r   r      r4   r.   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Emu3VQVAEEncoderConvUpsample   c                 Z   > [         TU ]  5         [        R                  " XSSSS9U l        g )Nr   r   kernel_sizestridepadding)r<   r=   r>   Conv2dconv)rC   in_channelsrD   s     r/   r=   %Emu3VQVAEEncoderConvUpsample.__init__   s%    IIkAaYZ[	r.   c                 T    [         R                  " USSS9nU R                  U5      nU$ )N       @nearestscale_factormode)Finterpolater   rC   rF   s     r/   rU   $Emu3VQVAEEncoderConvUpsample.forward   s(    m#IV		-0r.   r   r%   r&   r'   r(   r=   rU   r-   r[   r\   s   @r/   r   r      s    \ r.   r   c            	       j   ^  \ rS rSrS\S\S\\   S\\   4U 4S jjrS\R                  4S jr	S	r
U =r$ )
Emu3VQVAEConv3d   
in_channelout_channelr   r   c                 R  > [         T	U ]  5         [        USS  USS  5       VVs/ s H	  u  pVXV-
  PM     nnnSU l        US S S2    H&  nU =R                  US-  US-  -   US-  4-  sl        M(     U =R                  S-  sl        [        R
                  " UUUUS9U l        g s  snnf )Nr   r$   rn   r   )r   r   )r   )r<   r=   zipr   r>   Conv3dr   )
rC   r   r   r   r   
one_kernel
one_stridepadding_sizespad_sizerD   s
            r/   r=   Emu3VQVAEConv3d.__init__   s     	ORS^_`_aSbdjklkmdnOopOo5KZ0Oop%dd+HLLX]X\98q=IIL ,II	
	 qs   B#rF   c                 h    [         R                  " XR                  5      nU R                  U5      nU$ r;   )r   padr   r   r   s     r/   rU   Emu3VQVAEConv3d.forward   s(    m\\:		-0r.   )r   r   )r%   r&   r'   r(   rW   rZ   r=   r*   rX   rU   r-   r[   r\   s   @r/   r   r      sK    

 
 3Z	

 c

,U\\  r.   r   c                   n   ^  \ rS rSrS\S\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	Emu3VQVAESpatialNorm   r   out_channelsc                    > [         TU ]  5         [        R                  " USSSS9U l        [        R
                  " UUSSSS9U l        [        R
                  " UUSSSS9U l        g )N    ư>Tnum_channels
num_groupsepsaffiner   r   r   )r<   r=   r>   	GroupNorm
norm_layerr   conv_yconv_brC   r   r   rD   s      r/   r=   Emu3VQVAESpatialNorm.__init__   sn    
 	,,%	
 ii
 ii
r.   rF   quant_statesc                     [         R                  " X!R                  SS  SS9nU R                  U5      nXR	                  U5      -  U R                  U5      -   nU$ )Nr   )sizer   )r   r   rr   r   r   r   )rC   rF   r   s      r/   rU   Emu3VQVAESpatialNorm.forward   sT    }}\8K8KBC8PW`a6%L(AADKKP\D]]r.   )r   r   r   r%   r&   r'   r(   rW   r=   r*   rX   rU   r-   r[   r\   s   @r/   r   r      s:    

 
8U\\   r.   r   c                   V   ^  \ rS rSrS\S\4U 4S jjrS\R                  4S jrSr	U =r
$ )Emu3VQVAETemporalUpsample   r   r   c                 D   > [         TU ]  5         [        UUSSS9U l        g )Nr   r   r   r   r   r   r   r   r<   r=   r   r   rC   r   r   rD   s      r/   r=   "Emu3VQVAETemporalUpsample.__init__   (    
 	#!	
	r.   rF   c                 D   UR                   u  p#pEnUR                  SSSSS5      R                  5       R                  USU5      n[        R
                  " USSS	9nUR                  X#XVS5      R                  SSSSS5      R                  5       nU R                  U5      nU$ )
Nr   r   r   rm   r   rn   r   r   r   )rr   rs   rt   ru   r   r   r   )rC   rF   rz   r|   r{   r}   r~   s          r/   rU   !Emu3VQVAETemporalUpsample.forward   s    8E8K8K5
h%--aAq!<GGINNz[]_ghm#IV%**:PRS[[\]_`bcefhijuuw		-0r.   r   r   r\   s   @r/   r   r      s/    

 
U\\  r.   r   c                   V   ^  \ rS rSrS\S\4U 4S jjrS\R                  4S jrSr	U =r
$ )Emu3VQVAETemporalDownsample   r   r   c                 D   > [         TU ]  5         [        UUSSS9U l        g )N)rm   r   r   )r   r   r   r   r   r   s      r/   r=   $Emu3VQVAETemporalDownsample.__init__   r   r.   rF   c                 (    U R                  U5      nU$ r;   r   r   s     r/   rU   #Emu3VQVAETemporalDownsample.forward   s    		-0r.   r   r   r\   s   @r/   r   r      s/    

 
U\\  r.   r   c                   4   ^  \ rS rSr SU 4S jjrS rSrU =r$ )Emu3VQVAETemporalResnetBlock   c                 f  > [         TU ]  5         Xl        Uc  UOUU l        [        R
                  " U5      U l        [        UUSSS9U l        [        R
                  " U5      U l	        [        UUSSS9U l
        U R                  U R                  :w  a  [        R                  " UUSSSS9U l        g g )Nr   r   r   r   r   r   )r<   r=   r   r   r>   BatchNorm3dnorm1r   conv1norm2conv2r   nin_shortcutr   s      r/   r=   %Emu3VQVAETemporalResnetBlock.__init__   s    
 	&+7+?K\^^K0
$!	

 ^^L1
$!	

 t000 "		!D 1r.   c                 P   UnU R                  U5      nU[        R                  " U5      -  nU R                  U5      nU R	                  U5      nU[        R                  " U5      -  nU R                  U5      nU R                  U R                  :w  a  U R                  U5      nX!-   $ r;   )	r   r*   sigmoidr   r   r   r   r   r   )rC   rF   rS   s      r/   rU   $Emu3VQVAETemporalResnetBlock.forward  s     

=1}55

=1

=1}55

=1t000((2H''r.   )r   r   r   r   r   r   r   r;   r   r\   s   @r/   r   r      s     @( (r.   r   c                      ^  \ rS rSr  S
S\S\S-  S\S-  4U 4S jjjrSS\R                  S\R                  S-  4S jjrS	r	U =r
$ )Emu3VQVAEResnetBlocki.  Nr   r   quant_channelsc                   > [         TU ]  5         Xl        Uc  UOUnX l        X0l        Uc9  [
        R                  " USSSS9U l        [
        R                  " USSSS9U l        O [        X15      U l        [        X25      U l        [
        R                  " UUSSSS9U l        [
        R                  " UUSSSS9U l        U R                  U R                  :w  a  [
        R                  " UUSSSS9U l        g g )	Nr   r   Tr   r   r   r   r   )r<   r=   r   r   r   r>   r   r   r   r   r   r   r   r   )rC   r   r   r   rD   s       r/   r=   Emu3VQVAEResnetBlock.__init__/  s     	&&2&:{(,!;2SW`deDJ<BTXaefDJ-nJDJ-nKDJYY

 YY

 t000 "		!D 1r.   rF   c                 |   U R                   c  SOU4nUnU R                  " U/UQ76 nU[        R                  " U5      -  nU R	                  U5      nU R
                  " U/UQ76 nU[        R                  " U5      -  nU R                  U5      nU R                  U R                  :w  a  U R                  U5      nXA-   $ Nr$   )
r   r   r*   r   r   r   r   r   r   r   )rC   rF   r   	norm_argsrS   s        r/   rU   Emu3VQVAEResnetBlock.forward[  s    --5BN;L	 

==9=}55

=1

==9=}55

=1t000((2H''r.   )r   r   r   r   r   r   r   r   )NNr;   r   r\   s   @r/   r   r   .  s`     $(%)	** Dj* d
	* *X(U\\ (5<<RVCV ( (r.   r   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )Emu3VQVAEAttentionBlockim  r8   c                 2   > [         TU ]  U5        SU l        g )Nr   )r<   r=   num_key_value_groupsri   s     r/   r=    Emu3VQVAEAttentionBlock.__init__n  s      %&!r.   )r   )r%   r&   r'   r(   r   r=   r-   r[   r\   s   @r/   r   r   m  s    & & &r.   r   c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )Emu3VQVAEGroupNormiu  z
Same as the torch GroupNorm with the only difference that this ones accepts
an optional kwarg `quant_states` which is not used. This class makes it easier to
use SpatialNorm or GroupNorm without conditionals
c                 &   > [         TU ]  " S0 UD6  g r   )r<   r=   )rC   rL   rD   s     r/   r=   Emu3VQVAEGroupNorm.__init__|  s    "6"r.   c                     [         R                  " XR                  U R                  U R                  U R
                  5      $ r;   )r   
group_normr   rf   biasr   )rC   inputr   s      r/   rU   Emu3VQVAEGroupNorm.forward  s'    ||E??DKKDHHUUr.   r$   r;   )	r%   r&   r'   r(   r)   r=   rU   r-   r[   r\   s   @r/   r   r   u  s    #V Vr.   r   c                   p   ^  \ rS rSrSU 4S jjrSS\R                  S\R                  S-  4S jjrSrU =r	$ )	Emu3VQVAEMiddleBlocki  Nc                    > [         TU ]  5         [        UUUS9U l        [	        U5      U l        Uc  [        USSSS9U l        O[        X25      U l        [        UUUS9U l	        g )Nr   r   r   r   r   Tr   )
r<   r=   r   block_1r   attn_1r   	attn_normr   block_2)rC   r8   r   r   rD   s       r/   r=   Emu3VQVAEMiddleBlock.__init__  sm    +#$)

 .f5!/[UW]ajnoDN1.NDN+#$)
r.   rF   r   c                 N   U R                  X5      nUnU R                  X5      nUR                  u  pEpgUR                  XEXg-  5      R	                  SS5      nU R                  U5      S   nUR                  XFXu5      R                  SSSS5      nX1-   nU R                  X5      nU$ )Nr   r   r   r   )	r  r  rr   ru   rx   r  reshapers   r  )rC   rF   r   rS   rz   r|   r}   r~   s           r/   rU   Emu3VQVAEMiddleBlock.forward  s    ]A }C.;.A.A+
f%**:PZZ[\^_`M215%--j%RZZ[\^_abdef 0]Ar.   )r  r  r  r  r;   
r%   r&   r'   r(   r=   r*   FloatTensorrU   r-   r[   r\   s   @r/   r  r    s2    
(
U%6%6 
eFWFWZ^F^ 
 
r.   r  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )Emu3VQVAEDownBlocki  c                   > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nS[        U5      -   nX@l        [        R                  " 5       U l        [        U R                  5       GHL  n[        R                  " 5       n[        R                  " 5       n[        R                  " 5       nX$U   -  n	X#U   -  n
[        U R
                  5       H~  nUR                  [        U	U
S95        U
n	UR                  c  M-  XQR                  ;   d  M>  UR                  [!        U5      5        UR                  [        R"                  " U	SSSS95        M     [        R$                  " 5       nXll        X|l        Xl        XPR                  S-
  :w  a  [-        U	5      Ul        U R                  R                  U5        GMO     g )N)r   r   r   r   r   Tr   r   )r<   r=   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsrZ   in_channel_multiplierr>   
ModuleListdownrangeappendr   attn_resolutionsr   r   Moduleblockattn
attn_normsr   
downsample)rC   r8   r  r  r  i_levelr  r   r!  block_in	block_outi_blockr  rD   s                r/   r=   Emu3VQVAEDownBlock.__init__  s   "6#<#<=$33,,#66 $u-?'@ @%:"MMO	T112GMMOE==?DJ$W'EEH%7(CCI !4!45($,%. %**67F]F];]KK 7 ?@%%bllUW]ajn&op 6 99;DJI(O..22"@"JIIT"1 3r.   rF   c                 <   [        U R                  5       GH  u  p#[        U R                  5       H  nUR                  U   " U5      n[        UR                  5      S:  d  M3  UnUR                  U   " U5      nUR                  u  pgpUR                  XgX-  5      R                  SS5      nUR                  U   " U5      S   nUR                  XhX5      R                  SSSS5      nXQ-   nM     X R                  S-
  :w  d  M  UR                  U5      nGM     U$ )Nr   r   r   r   )	enumerater  r  r  r  r  r   r!  rr   ru   rx   r  rs   r  r"  )
rC   rF   r#  blocksr&  rS   rz   r|   r}   r~   s
             r/   rU   Emu3VQVAEDownBlock.forward  s   (3OG !4!45 &W 5m Dv{{#a',H$*$5$5g$>}$MM:G:M:M7J&$1$6$6zV^$\$f$fghjk$lM$*KK$8$G$JM$1$9$9*e$^$f$fghjkmnpq$rM$,$<M 6 ..22 & 1 1- @  4" r.   )r  r  r  r  r  r\   s   @r/   r  r    s     ##JU%6%6  r.   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )Emu3VQVAEUpBlocki  c           
        > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  UR                  S   -  n[        R                  " 5       U l
        [        [        U R                  5      5       GH8  n[        R                  " 5       n[        R                  " 5       n[        R                  " 5       nUR                  UR                  U   -  n[        U R
                  S-   5       Hd  n	UR                  [        UUUS95        UnXAR                  ;   d  M0  UR                  [!        U5      5        UR                  [#        X#5      5        Mf     [        R$                  " 5       n
XZl        Xjl        Xzl        US:w  a  [-        U5      U
l        U R                  R1                  SU
5        GM;     g )Nrn   r   r  r   )r<   r=   r  r  r  r  rd   r  r>   r  upreversedr  r  r   r  r   r   r  r  r   r!  r   upsampleinsert)rC   r8   r   r$  r#  r  r   r!  r%  r&  r/  rD   s              r/   r=   Emu3VQVAEUpBlock.__init__  si   "6#<#<=$33))''&*C*CB*GG--/d&:&: ;<GMMOE==?DJ,,v/H/H/QQI !4!4q!89($,%.'5 %555KK 7 ?@%%&:>&TU : BHG&M!|:8DGGNN1b!3 =r.   rF   r   c                 b   [        U R                  S S S2   5       GH  u  p4[        U R                  S-   5       H  nUR                  U   " X5      n[        UR                  5      S:  d  M3  UnUR                  U   " X5      nUR                  u  pxpUR                  XxX-  5      R                  SS5      nUR                  U   " U5      S   nUR                  XyX5      R                  SSSS5      nXa-   nM     U[        U R                  5      S-
  :w  d  M  UR                  U5      nGM     U$ )Nrn   r   r   r   r   )r)  r/  r  r  r  r  r   r!  rr   ru   rx   r  rs   r1  )rC   rF   r   r#  r*  r&  rS   rz   r|   r}   r~   s              r/   rU   Emu3VQVAEUpBlock.forward  s   (27OG !4!4q!89 &W 5m Rv{{#a',H$*$5$5g$>}$[M:G:M:M7J&$1$6$6zV^$\$f$fghjk$lM$*KK$8$G$JM$1$9$9*e$^$f$fghjkmnpq$rM$,$<M : #dgg,** & >  8  r.   )r  r  r/  r  r\   s   @r/   r-  r-    s-    #"JU%6%6 eFWFW  r.   r-  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )Emu3VQVAEEncoderi  c                   > [         TU ]  5         UR                  nUR                  nUR                  nUR
                  nUR                  nU(       a  SU-  OUnX&S   -  n[        R                  R                  X2SSSS9U l
        [        U5      U l        [        X5      U l        [        R                  R                  SUSSS	9U l        [        R                  R                  UUSSSS9U l        [%        [&        R(                  " UR*                  5      5      n	[        R,                  " 5       U l        [        R,                  " 5       U l        [3        U	5       H)  n
[5        Xw5      nU R.                  R7                  U5        M+     [3        UR8                  5       H(  n[;        UUS
9nU R0                  R7                  U5        M*     g )Nr   rn   r   r   r   r   r   T)r   r   r   r   r  )r<   r=   r  r   double_latentlatent_channelsr  r*   r>   r   conv_inr  
down_blockr  middle_blockr   norm_outconv_outrW   mathlog2temporal_downsample_factorr  	time_convtime_res_stackr  r   r  r  r   )rC   r8   r  r   r9  r:  r  r   r$  temporal_down_blocksir   rT   time_res_convrD   s                 r/   r=   Emu3VQVAEEncoder.__init__  s   ,,((,, 00#66.;q?* b#99xx{qYZdef,V40B**bxUYbf*g ( 
  #499V-N-N#OP mmo+,A.|JDNN!!$' - v,,-A8()M &&}5 .r.   pixel_valuesc                 t   UR                   S   nUR                  " S/UR                   SS  Q76 nU R                  U5      nU R                  U5      nU R	                  U5      nU R                  U5      nU[        R                  " U5      -  nU R                  U5      nUR                  " SU/UR                   SS  Q76 nUR                  SSSSS5      nU R                   H$  nU" U5      nU[        R                  " U5      -  nM&     U R                   H  nU" U5      nM     UR                  SSSSS5      nU$ )Nr   rn   r   r   r   rm   )rr   r  r;  r<  r=  r>  r*   r   r?  rs   rC  rD  )rC   rI  temporal_dimrF   r   layers         r/   rU   Emu3VQVAEEncoder.forwardB  s:   #))!,#++BH1C1CAB1GH \26))-8 m4}55m4%--b,YATATUVUWAXY%--aAq!< NND /MU]]=99M # ((E!-0M ) &--aAq!<r.   )r;  r?  r<  r=  r>  rC  rD  )
r%   r&   r'   r(   r=   r*   r+   rU   r-   r[   r\   s   @r/   r7  r7    s     %6NE$4$4  r.   r7  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Emu3VQVAEDecoderi`  r8   c                   > [         T	U ]  5         UR                  nUR                  UR                  S   -  n[
        R                  " 5       U l        [        UR                  5       H<  n[        UR                  UR                  S9nU R                  R                  U5        M>     [        [        R                  " UR                   5      5      n[
        R                  " 5       U l        [        U5       H>  n[%        UR                  UR                  5      nU R"                  R                  U5        M@     [
        R&                  " UR                  USSSS9U l        [+        XUS9U l        [/        U5      U l        UR                  UR                  S   -  n[3        X#5      U l        [
        R&                  " UUR6                  SSSS9U l        g )Nrn   r  r   r   r   )r   r   )r<   r=   rd   r  r  r>   r  rD  r  r  r   r:  r  rW   r@  rA  rB  rC  r   r   r;  r  r=  r-  up_blockr   r>  r   r?  )
rC   r8   r   r$  rT   rG  temp_upsample_block_numrF  r   rD   s
            r/   r=   Emu3VQVAEDecoder.__init__a  s|   ))''&*C*CB*GG mmov,,-A8"22AWAWM &&}5	 . #&dii0Q0Q&R"S./A,V-C-CVE[E[\DNN!!$' 0 yy""
 1R`a(0''&*C*CA*FF,^F		
r.   rF   r   c                    [         R                  " X4SS9nUR                  SSSSS5      nU R                   H  nU" U5      nM     U R                   H$  nU" U5      nU[         R
                  " U5      -  nM&     UR                  SSSSS5      n[         R                  " USSS9u  pUR                  " S/UR                  SS  Q76 nUR                  " S/UR                  SS  Q76 nU R                  U5      nU R                  X5      nU R                  X5      nU R                  X5      nU[         R
                  " U5      -  nU R                  U5      nU$ )Nr   rq   r   r   r   rm   rn   )r*   catrs   rD  rC  r   chunkr  rr   r;  r=  rQ  r>  r?  )rC   rF   r   hidden_quant_statesrL  s        r/   rU   Emu3VQVAEDecoder.forward  sV   #ii(E1M199!Q1aH ((E"'(;"< ) ^^E"'(;"<5==1D#EE $ 299!Q1aH&+kk2Eqa&P#%--bK=3F3Fqr3JK#++BH1C1CAB1GH]3 ))-FmBmB}55m4r.   )r;  r?  r=  r>  rC  rD  rQ  )r%   r&   r'   r(   r   r=   r*   rX   rU   r-   r[   r\   s   @r/   rO  rO  `  s0    %
 %
NU\\   r.   rO  aR  
    The VQ-VAE model used in Emu3 for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    custom_introc            
         ^  \ rS rSr% \\S'   SrSrSrSr	Sr
SrSr/ SQr\\/\S.r\R&                  " 5       S	 5       rS\4U 4S
 jjr\\S\R0                  S\R0                  S\\   S\4S j5       5       rS\R0                  4S jrSrU =r$ )	Emu3VQVAEi  r8   
emuvideovqrI  )imageT)r   r   r   r^   rF   
attentionsc                 f   [        U[        R                  [        R                  45      (       a  [        R
                  " UR                  SSS9  UR                  br  [        R                  R                  R                  UR                  5      u  p#S[        R                  " U5      -  n[        R                  " UR                  U* U5        g g [        U[        R                  5      (       a  [        R                  " UR                  [        R                  " S5      S9  UR                  bz  [        R                  R                  R                  UR                  5      u  p#US:  a  S[        R                  " U5      -  OSn[        R                  " UR                  U* U5        g g [        U[        R                  [        R                   [        R"                  45      (       a  [        R$                  " UR                  S5        [        R$                  " UR                  S	5        ['        US
S 5      ba  [        R(                  " UR*                  5        [        R,                  " UR.                  5        [        R(                  " UR0                  5        g g [        U[        R2                  5      (       ay  [        R4                  " UR                  5        UR6                  bK  ['        UR                  SS5      (       d.  [        R(                  " UR                  UR6                     5        g g g g )Nfan_outrelu)r   nonlinearityr      )ar   ra   g        running_mean_is_hf_initializedF)
isinstancer>   r   r   initkaiming_normal_rf   r   r*   _calculate_fan_in_and_fan_outr@  sqrtrh   Linearkaiming_uniform_BatchNorm2dr   r   	constant_getattrzeros_rg  ones_running_varnum_batches_trackedrb   normal_padding_idx)rC   modulefan_inrT   bounds        r/   _init_weightsEmu3VQVAE._init_weights  s   fryy"))455  YVT{{&!HHMMGGV	DIIf--fkkE659 ' 		**!!&--499Q<@{{&!HHMMGGV	17!DIIf--fkkE659 '  NOONN6==#.NN6;;,v~t4@F//0

6--.F667 A --LL'!!-gfmmMach6i6iFMM&*<*<=> 7j- .r.   c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        U5      U l        S[        UR                  5      S-
  -  U l        [        UR                  UR                  SSS9U l        [        UR                  UR                  SSS9U l        S[        UR                  5      S-
  -  U l        U R%                  5         U R'                  5         g )Nr   r   )r   r   r   r   r   )r<   r=   r8   r7  encoderrO  decoderr^   quantizer  r  vision_spatial_factorr   r:  rd   
quant_convpost_quant_convspatial_scale_factoreval	post_initri   s     r/   r=   Emu3VQVAE.__init__  s     '/'/08%&3v/H/H+IA+M%N")""F$4$4)T]
  /f44)T] 
 %&#f.G.G*H1*L$M!		r.   image_sizesrL   rM   c                    UR                   S:H  nU(       aJ  U R                  R                  nUR                  u  pgpUR	                  S5      R                  SUSSS5      nOUR                  u  pepxn	U R                  U5      n
U
R                  SSSSS5      nU R                  U5      nUR                  SSSSS5      nU R                  U5      nU(       a  UR                  S5      OUn[        X5       VVs/ s HB  u  pUS [        US   U R                  -  5      2S [        US   U R                  -  5      24   PMD     nnn[        U
US9$ s  snnf )Nrm   r   r   r   r   )last_hidden_stater#   )ndimr8   rB  rr   	unsqueezerepeatr  rs   r  r  squeezer   rW   r  r!   )rC   rI  r  rL   is_imager{   rz   r|   r}   r~   rF   conv_hidden_statescodesr#   single_imager   s                   r/   encodeEmu3VQVAE.encode  sd   
  $$){{==H2>2D2D/J&'11!4;;AxAqQL<H<N<N9J(E\2 +221aAqA!__-?@ 0771aAF01+3u}}Q' '*,&D
&D" D3tAw)C)CCDDFqDQRGVZVpVpLpHqFqqr&D 	 

 $+%
 	

s   6A	ErF   c                    UR                   S:H  nU(       a  UR                  S5      nUR                  u  p4pVU R                  R	                  UR                  5       5      nUR                  S   nUR                  X4XVU5      R                  SSSSS5      R                  5       nU R                  U5      n	UR                  SSSSS5      nU	R                  SSSSS5      n	U R                  X5      n
U
R                  UX@R                  R                  -  U R                  R                  XPR                  -  X`R                  -  5      n
U(       a	  U
S S 2S4   $ U
$ )Nr   r   rn   r   rm   r   )r  r  rr   r  re   flattenru   rs   rt   r  r  r  r8   rB  r   r  )rC   rF   r  rz   r{   r}   r~   quantr|   
post_quantvideos              r/   decodeEmu3VQVAE.decode  s;    %%*)33A6M.;.A.A+
f''(=(=(?@;;r?

:IQQRSUVXY[\^_`kkm))%0
aAq!,''1aA6
Z/{{===KK$$...---
 'uQT{1E1r.   )r8   r  r  r  r  r  r  r  ) r%   r&   r'   r(   r   r,   base_model_prefixmain_input_nameinput_modalities_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backend_no_split_modulesr   r   r   _can_record_outputsr*   no_gradr|  r=   r   r   rX   r   r   r!   r  r  r-   r[   r\   s   @r/   r\  r\    s     $$O!N"& /0LM-
 ]]_? ?4 *  
!LL
7<||
OUVhOi
	
   
B2ELL 2 2r.   r\  c                       \ rS rSrSrS r\S 5       r\S 5       r\S 5       r	\S 5       r
\S 5       r\S	 5       rS
\\R                     S\R                  4S jrS
\R                  S\R                  4S jrSrg)Emu3ImageVocabularyMappingi/  zE
A class for mapping discrete image tokens from VQGAN to BPE tokens.
c                 h    Xl         UR                  S5      U l        UR                  S5      U l        g )Nz<|extra_200|>z<image>)	vocab_mapgeteol_token_idimage_token_id)rC   r  s     r/   r=   #Emu3ImageVocabularyMapping.__init__4  s)    "%MM/:'mmI6r.   c           	          [        U R                  R                  5        VVs/ s H  u  pUR                  S5      (       d  M  UPM!     snn5      $ s  snnf Nz<|visual tokensortedr  items
startswithrC   namevals      r/   r#   'Emu3ImageVocabularyMapping.image_tokens9  s<    DNN,@,@,Bh,BytdooVfFgs,Bhiih   A
A
c           	          [        U R                  R                  5        VVs/ s H  u  pUR                  S5      (       d  M  UPM!     snn5      $ s  snnf r  r  r  s      r/   image_tokens_str+Emu3ImageVocabularyMapping.image_tokens_str=  s<    T^^-A-A-Ci-C	tWgGht-Cijjir  c                 z    U R                    Vs0 s H  n[        USS 5      U R                  U   _M!     sn$ s  snf )Nir   )r  rW   r  )rC   tokens     r/   img2bpe"Emu3ImageVocabularyMapping.img2bpeA  s;    FJF[F[\F[UE"RL!4>>%#88F[\\\s   &8c                 l    U R                   R                  5        VVs0 s H  u  pX!_M	     snn$ s  snnf r;   )r  r  )rC   kvs      r/   bpe2img"Emu3ImageVocabularyMapping.bpe2imgE  s-    !%!3!3!56!5!5666s   0c                     [         R                  " [        U R                  R	                  5       5      S-   [         R
                  S9nU R                  R                  5        H	  u  p#X1U'   M     U$ Nr   dtype)r*   zerosmaxr  keysrW   r  rC   mappingr  r  s       r/   bpe2img_mapping_tensor1Emu3ImageVocabularyMapping.bpe2img_mapping_tensorI  R    ++c$,,"3"3"56:%))LLL&&(DAAJ )r.   c                     [         R                  " [        U R                  R	                  5       5      S-   [         R
                  S9nU R                  R                  5        H	  u  p#X1U'   M     U$ r  )r*   r  r  r  r  rW   r  r  s       r/   img2bpe_mapping_tensor1Emu3ImageVocabularyMapping.img2bpe_mapping_tensorP  r  r.   	img_batchrM   c                 "   UR                   n[        R                  " UR                  S   S4[        R                  S9U R
                  -  nU R                  UR                  S5         n[        R                  " XC/SS9nUR                  U5      $ )Nr   r   r  cpurn   rq   )	devicer*   onesrr   rW   r  r  torU  )rC   r  r  eol_row
img_tokenss        r/   convert_img2bpe*Emu3ImageVocabularyMapping.convert_img2bpeW  su    !!**iooa0!4EIIFIZIZZ00e1DE
YY
4"=
}}V$$r.   c                     UR                   nUSS S24   nU R                  UR                  S5         nUR                  U5      $ )N.rn   r  )r  r  r  )rC   r  r  r  s       r/   convert_bpe2img*Emu3ImageVocabularyMapping.convert_bpe2img^  sG    !!c3B3h'	00e1DE
}}V$$r.   )r  r  r  N)r%   r&   r'   r(   r)   r=   r   r#   r  r  r  r  r  listr*   rX   r  r  r-   r$   r.   r/   r  r  /  s    7
 j j k k ] ] 7 7    %ell); % %% %%,, %r.   r  c                   ,    \ rS rSrS/rSrSr\\S.r	Sr
g)Emu3PreTrainedModelie  r6   Tr_  r$   N)r%   r&   r'   r(   r  r  r  r6   r1   r  r-   r$   r.   r/   r  r  e  s)     "&)#r.   r  c                   <   ^  \ rS rSr% \\S'   S\4U 4S jjrSrU =r$ )Emu3TextModeliq  r8   c           	         > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        g s  snf r;   )r<   r=   r>   r  r  num_hidden_layersr6   layersrB   s      r/   r=   Emu3TextModel.__init__t  sH     mmBGH`H`BabBaYf0Bab
bs   A)r  )	r%   r&   r'   r(   r   r,   r=   r-   r[   r\   s   @r/   r  r  q  s    
~ 
 
r.   r  c                   @   ^  \ rS rSr% \\S'   U 4S jrU 4S jrSrU =r	$ )Emu3ForCausalLMi{  r8   c                 D   > [         TU ]  U5        [        U5      U l        g r;   )r<   r=   r  modelri   s     r/   r=   Emu3ForCausalLM.__init__~  s     "6*
r.   c                  6   > [        5       R                  5         g)a[  
Example:

```python
>>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
>>> import torch
>>> import httpx
>>> from io import BytesIO
>>> from PIL import Image

>>> model = Emu3ForCausalLM.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
>>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

>>> inputs = processor(text=["Can you write me a poem about winter."], return_tensors="pt").to(model.device)

>>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
>>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
```N)r<   rU   )super_kwargsrD   s    r/   rU   Emu3ForCausalLM.forward  s    & 	r.   )r  )
r%   r&   r'   r(   r   r,   r=   rU   r-   r[   r\   s   @r/   r  r  {  s    + r.   r  c                     ^  \ rS rSrU 4S jrS rS rS\R                  S\R                  S\R                  4S jr
\\" S	S
9S\R                  S\R                  S\\   S\\-  4S j5       5       r\R$                  " 5       S\R                  S\S\4S j5       rS\R                  S\R                  S\R                  4S jr\\        SS\R                  S-  S\R                  S-  S\R,                  S-  S\R,                  S-  S\R                  S-  S\S-  S\R                  S-  S\S-  S\\   S\\-  4S jj5       5       rSrU =r$ )	Emu3Modeli  c                    > [         TU ]  U5        [        R                  UR                  5      U l        [        UR                  5      U l        [        UR                  5      U l        U R                  5         g r;   )r<   r=   r  _from_configtext_config
text_modelr\  	vq_configvqmodelr  vocabulary_mapvocabulary_mappingr  ri   s     r/   r=   Emu3Model.__init__  sY     '44V5G5GH !1!12"<V=R=R"S 	r.   c                 6    U R                   R                  5       $ r;   )r  get_input_embeddingsrC   s    r/   r  Emu3Model.get_input_embeddings  s    3355r.   c                 :    U R                   R                  U5        g r;   )r  set_input_embeddingsrC   values     r/   r  Emu3Model.set_input_embeddings  s    ,,U3r.   rI  r  rM   c                     U R                   R                  XSS9nUR                   Vs/ s H+  o@R                  R	                  U5      R                  5       PM-     nn[        R                  " U5      nU$ s  snf )a  
Tokenizes images into discrete tokens with VQGAN module. Converts
obtained image tokens into BPE tokens and wraps with "boi" and "eoi"
special tokens.

Args:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        The tensors corresponding to the input images.
    image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
        The sizes of the images in the batch, being (height, width) for each image.
T)return_dict)r  r  r#   r  r  r  r*   rU  )rC   rI  r  vqmodel_outputstokensbpe_tokens_list
bpe_tokenss          r/   get_image_tokensEmu3Model.get_image_tokens  st     150C0CLko0C0pTcTpTp
Tp&##33F;CCETp 	 
 YY/
	
s   2A5zbTokenizes images into discrete tokens with VQGAN module and embeds them with text embeddings layerrY  rL   c                    U R                   R                  " X4SS0UD6nU VVs/ s H9  u  pVXPR                   R                  -  X`R                   R                  -  S-   -  PM;     nnnUR                   Vs/ s H+  oR                  R                  U5      R                  5       PM-     n	n[        R                  " U	5      n
U R                  5       " U
5      n[        R                  " X5      nXl        U$ s  snnf s  snf )z
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
    The tensors corresponding to the input images.
r  Tr   )r  r  r  r#   r  r  r  r*   rU  r  splitpooler_output)rC   rI  r  rL   r	  r}   r~   split_sizesr
  r  r  image_embeddingsimage_featuress                r/   get_image_featuresEmu3Model.get_image_features  s     150C0C1
371
;A1

 "-
!, ||999e||GiGi>ilm>mn!, 	 

 UdTpTp
Tp&##33F;CCETp 	 
 YY/
446zB%5C(6%

s   A C462C:r#   r}   r~   c                     USS2SS24   R                  SX#S-   5      nU R                  R                  U5      nU R                  R	                  U5      nU$ )a  
Decodes generated image tokens from language model to continuous pixel values
with VQGAN module via upsampling.

Args:
    image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
        The tensors corresponding to the input images.
    height (`int`):
        Height of the generated image before upsampling.
    width (`int`):
        Width of the generated image before upsampling.
Nrn   r   )ru   r  r  r  r  )rC   r#   r}   r~   	sequencesr^  s         r/   decode_image_tokensEmu3Model.decode_image_tokens  sV     !CRC(--b&!)D	..>>yI##L1r.   	input_idsinputs_embedsr  c           	      F   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S   UR                  S   -  nUR                  S5      R                  U5      R                  UR                  5      n[        X$   R                  5       UR                  5       :H  SU SU 35        U$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
)r  r  rn   r   r   z6Image features and image tokens do not match, tokens: z, features: )r  r*   tensorr  r  longr  allrv   rr   r  	expand_asr  r   numel)rC   r  r  r  special_image_maskn_image_tokensn_image_featuress          r/   get_placeholder_maskEmu3Model.get_placeholder_mask  s    !.2K2K2MT44CC5::^k^r^rs3 " "4!7!7!;!*.E.E.T.T!T+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno-3359M9M9OOD^DTT`aq`rs	
 "!r.   NrG   rH   rI   rJ   c	           	      B   USL USL-  (       a  [        S5      eUc  U R                  5       " U5      nUbQ  U R                  X#5      R                  n
[        R
                  " U
SS9n
U R                  XU
S9nUR                  X5      nU R                  " SUUUUUS.U	D6nU$ )aH  
image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
    The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
    [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
    [`Emu3ImageProcessor`] for processing images).
NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oner   rq   )r  r  )rG   rH   rI   r  rJ   r$   )	
ValueErrorr  r  r  r*   rU  r'  masked_scatterr  )rC   r  rI  r  rG   rH   rI   r  rJ   rL   r  r$  outputss                r/   rU   Emu3Model.forward  s    ( -t";<s    557	BM#!44\O]]N"YY~1=N!%!:!:~ "; " *889K\M // 
)%+'
 
 r.   )r  r  r  )NNNNNNNN)r%   r&   r'   r(   r=   r  r  r*   r  r+   r  r   r   r   r   rZ   r!   r  r  rW   r  r'  rX   r   rY   r
   rU   r-   r[   r\   s   @r/   r  r    s   64U->-> UM]M] bgbrbr & y!--<A<L<LX^_qXr	%	% 0 ]]_0@0@ # VY  $"))":?:K:K"]b]n]n"0  .215+/.204(,26!%,##d*, ''$., \\D(	,
 t+, &&-, , ((4/, $;, +,, 
'	',  ,r.   r  c                     ^  \ rS rSrSrSS0rU 4S jrS rS rS\	R                  4S	 jrS
 r\\          SS\R                   S-  S\R"                  S-  S\R$                  S-  S\R$                  S-  S\R                   S-  S\S-  S\R"                  S-  S\S-  S\R                   S-  S\\R$                  -  S\\   S\\-  4S jj5       5       r       SU 4S jjrSrU =r$ )Emu3ForConditionalGenerationi3  )r^  textzlm_head.weightz$model.text_model.embed_tokens.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g )NF)r   )r<   r=   r  r  r>   rn  r  hidden_size
vocab_sizelm_headr  ri   s     r/   r=   %Emu3ForConditionalGeneration.__init__7  sS     v&
yy!3!3!?!?ASASA^A^ejkr.   c                 6    U R                   R                  5       $ r;   )r  r  r   s    r/   r  1Emu3ForConditionalGeneration.get_input_embeddings>  s    zz..00r.   c                 :    U R                   R                  U5        g r;   )r  r  r  s     r/   r  1Emu3ForConditionalGeneration.set_input_embeddingsA  s    

''.r.   rM   c                     U R                   $ r;   )r4  r   s    r/   get_output_embeddings2Emu3ForConditionalGeneration.get_output_embeddingsD  s    ||r.   c                 :    U R                   R                  " S0 UD6$ r   )r  r  )rC   rL   s     r/   r  0Emu3ForConditionalGeneration.decode_image_tokensG  s    zz--777r.   Nr  rI  r  rG   rH   rI   r  rJ   labelslogits_to_keeprL   c           
         U R                   " SUUUUUUS.UD6nUS   n[        U
[        5      (       a  [        U
* S5      OU
nU R	                  USS2USS24   5      nSnU	b3  U R
                  " SXU R                  R                  R                  S.UD6n[        UUUR                  UR                  UR                  S9$ )a  
image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
    The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
    [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
    [`Emu3ImageProcessor`] for processing images).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
>>> import torch
>>> import httpx
>>> from io import BytesIO
>>> from PIL import Image

>>> model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
>>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

>>> conversation = [
...     {
...     "role": "system",
...     "content": [
...         {"type": "text", "text": "You are a helpful assistant."},
...         ],
...     },
...     {
...     "role": "user",
...     "content": [
...         {"type": "image"},
...         {"type": "text", "text": "Please describe the image."},
...         ],
...     },
... ]

>>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=[image], text=[prompt], return_tensors="pt").to(model.device, torch.bfloat16)

>>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
>>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
```)r  rG   rH   rI   r  rJ   r   N)logitsr?  r3  )lossrB  rI   rF   r`  r$   )r  ri  rW   slicer4  loss_functionr8   r  r3  r
   rI   rF   r`  )rC   r  rI  r  rG   rH   rI   r  rJ   r?  r@  rL   r,  rF   slice_indicesrB  rC  s                    r/   rU   $Emu3ForConditionalGeneration.forwardJ  s    @ ** 
)%+'
 
  
8B>SV8W8W~ot4]kmA}a,?@A%% 9P9P9[9[_eD &#33!//))
 	
r.   c	                 `   > [         TU ]  " U4UUUUUUUS.U	D6n
U(       d  U(       a  S U
S'   U
$ )N)rI   rG   r  rH   rI  rJ   is_first_iterationrI  )r<   prepare_inputs_for_generation)rC   r  rI   rG   r  rH   rJ   rI  rI  rL   model_inputsrD   s              r/   rJ  :Emu3ForConditionalGeneration.prepare_inputs_for_generation  sR     w<

+)'%%1

 

 "i+/L(r.   )r4  r  )
NNNNNNNNNr   )NNNNTNF)r%   r&   r'   r(   output_modalities_tied_weights_keysr=   r  r  r>   r  r;  r  r   r   r*   r+   r  rX   r   rY   rW   r   r   rZ   r
   rU   rJ  r-   r[   r\   s   @r/   r/  r/  3  s   )*,RS1/ryy 8  .215+/.204(,26!%*.-.Y
##d*Y
 ''$.Y
 \\D(	Y

 t+Y
 &&-Y
 Y
 ((4/Y
 $;Y
   4'Y
 ell*Y
 +,Y
 
'	'Y
  Y
|   r.   r/  )r/  r  r  r  r\  r  )Qr@  dataclassesr   	functoolsr   r*   torch.nnr>   torch.nn.functional
functionalr    r   rj  cache_utilsr   
generationr   modeling_outputsr	   r
   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   chameleon.modeling_chameleonr   r   llama.modeling_llamar   r   r   r   r   siglip.modeling_siglipr   configuration_emu3r   r   r   
get_loggerr%   loggerr!   r1   r6   r  r^   r   r   r   r   r   r   r   r   r   r   r   r  r  r-  r7  rO  r\  r  r  r  r  r  r/  __all__r$   r.   r/   <module>rd     s4     ! %     &   ) R - & V V 7 5 w v 4 K K 
		H	% 
15 1  1	N 	
!( !H$ryy $D	%H 	299 bii :!299 !H		 .")) &.(299 .(b<(299 <(~&o &V V299 D8 8v7ryy 7tCryy CLCryy CL ~2 ~2~2B3% 3%l	2 	
J 3 
&(;_ :X# XvQ#6 Qhr.   