
    Z j                     P   S SK Jr  S SKJr  S SKrS SKJs  Jr  S SK	J
r
  S SKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJrJr  SSKJr  SSKJrJrJr  SSKJ r J!r!  SSK"J#r#  SSK$J%r%J&r&J'r'J(r(J)r)J*r*  SSK+J,r,J-r-J.r.  SSK/J0r0  SSK1J2r2  SSK3J4r4J5r5J6r6J7r7J8r8  SSK9J:r:J;r;  SSK<J=r=  SSK>J?r?  SSK@JArAJBrBJCrC  \(" 5       (       a   \)R                  " \E5      rF\&" SS9\
 " S S\?5      5       5       rG\&" SS9\
 " S S\25      5       5       rH\&" SS9\
 " S  S!\5      5       5       rI\& " S" S#\!5      5       rJ\&" S$S%9\ " S& S'\5      5       5       rK " S( S)\:5      rL " S* S+\;5      rM " S, S-\C5      rN " S. S/\R                  5      rP " S0 S1\R                  5      rQ " S2 S3\B5      rR " S4 S5\A5      rS " S6 S7\05      rT " S8 S9\R                  5      rU " S: S;\85      rV " S< S=\75      rW " S> S?\55      rX " S@ SA\65      rY " SB SC\R                  5      rZ " SD SE\R                  5      r[ " SF SG\R                  5      r\ " SH SI\R                  5      r] " SJ SK\45      r^ " SL SM\R                  5      r_ " SN SO\R                  5      r`\&" SPS%9 " SQ SR\J5      5       ra " SS ST\J\5      rb/ SUQrcg)V    )Callable)	dataclassN)strict)nn   )initialization)ACT2FN)Cache)PreTrainedConfig)%ClassifierFreeGuidanceLogitsProcessorGenerationMixinGenerationModeLogitsProcessorList)GenerateDecoderOnlyOutput)BaseModelOutputBaseModelOutputWithPoolingModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_vision_availableloggingtorch_compilable_check   )CONFIG_MAPPING
AutoConfig	AutoModel)Blip2VisionModel)ChameleonVQVAEConfig)ChameleonVQVAEChameleonVQVAEEncoderAttnBlock#ChameleonVQVAEEncoderConvDownsample ChameleonVQVAEEncoderResnetBlockChameleonVQVAEVectorQuantizer)IdeficsBaseModelOutputWithPastIdeficsCausalLMOutputWithPast)eager_attention_forward)SiglipVisionConfig)SiglipEncoderSiglipEncoderLayerSiglipVisionEmbeddingszdeepseek-community/Janus-Pro-1B)
checkpointc                      \ rS rSr% SrSr\\S'   Sr\\S'   Sr	\\S'   S	r
\\\   -  \\\4   -  \S
'   Sr\\S'   Sr\\-  \S'   Sr\\S'   Sr\\-  \S'   Sr\\S'   Sr\\-  \S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   \" 5       rSrg) JanusVisionConfig@   z
projection_dropout (`float`, *optional*, defaults to 0.0):
    Dropout probability for the projection layer.
num_image_tokens (`int`, *optional*, defaults to 576):
    Number of image tokens.
i   hidden_size   num_hidden_layers   num_attention_headsi  
image_sizegelu
hidden_actg      @	mlp_ratioTattention_bias        hidden_dropout_rate   projection_dimprojection_dropoutFuse_qk_norm{Gz?initializer_ranger   depthi@  num_image_tokens N)__name__
__module____qualname____firstlineno____doc__r3   int__annotations__r5   r7   r8   listtupler:   strr;   floatr<   boolr>   r@   rA   rB   rD   rE   rF   AttributeErrorintermediate_size__static_attributes__rG       x/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/janus/modular_janus.pyr1   r1   @   s     Ks!!47Jd3i%S/17J Ius{ ND'**NC&))K#u#E3Nc&(rW   r1   c                   @   \ rS rSr% SrSr\\S'   Sr\\S'   Sr	\
\S'   S	r\\S
'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\   \\S4   -  \S'   Sr\\S'   Sr\\-  \S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S '   \" 5       r\" 5       r\" 5       rS!r g")#JanusVQVAEConfig[   al  
base_channels (`int`, *optional*, defaults to 128):
    Base channel count.
channel_multiplier (`list[int]`, *optional*, defaults to `[1, 1, 2, 2, 4]`):
    Channel multipliers for each resolution.
num_res_blocks (`int`, *optional*, defaults to 2):
    Number of residual blocks.
num_patches (`int`, *optional*, defaults to 32):
    Num of patches the input images can be divided into.
out_channels (`int`, *optional*, defaults to 3):
    Number of out channels.
image_token_embed_dim (`int`, *optional*, defaults to 2048):
    Dimension of image embeddings. It should be same as the dimensionality of text embeddings.
   	embed_dimi @  num_embeddingsFdouble_latent   latent_channels    num_patchesr   in_channelsout_channels   base_channels)   rh   r   r      .channel_multiplierr   num_res_blocksr=   dropoutrC   rD   r?   r@   r5   r9   r:   image_token_embed_dimrG   N)!rH   rI   rJ   rK   rL   r]   rM   rN   r^   r_   rS   ra   rc   rd   re   rg   rj   rO   rP   rk   rl   rR   rD   r@   r5   r:   rQ   rm   rT   
resolutionattn_resolutions	attn_typerV   rG   rW   rX   rZ   rZ   [   s     IsNCM4OSKKL#M36ES	E#s(O3ENCGUS[#u#NCsJ!%3%!J%' IrW   rZ   c                      ^  \ rS rSr% SrSr\\\S.r	Sr
\\-  S-  \S'   Sr\\-  S-  \S'   Sr\\-  S-  \S'   S	r\\S
'   Sr\\S'   U 4S jrSrU =r$ )JanusConfig   a  
Example:

```python
>>> from transformers import JanusForConditionalGeneration, JanusConfig, JanusVisionConfig, JanusVQVAEConfig, LlamaConfig

>>> # Initializing a Janus vision config
>>> vision_config = JanusVisionConfig()

>>> # Initializing a Llama config
>>> text_config = LlamaConfig()

>>> # Initializing a VQ config
>>> vq_config = JanusVQVAEConfig()

>>> # Initializing a Janus Pro 1B style configuration
>>> configuration = JanusConfig(vision_config=vision_config, text_config=text_config, vq_config=vq_config)

>>> # Initializing a model from the Janus Pro 1B style configuration
>>> model = JanusForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```janus)text_configvision_config	vq_configNru   rv   rw   i image_token_idTtie_word_embeddingsc                   > [        U R                  [        5      (       aU  U R                  R                  SS5      U R                  S'   [        U R                  S      " S0 U R                  D6U l        O5U R                  c(  [
        R                  S5        [        S   " 5       U l        U R                  c%  [
        R                  S5        [        5       U l        O9[        U R                  [        5      (       a  [        S0 U R                  D6U l        U R                  c%  [
        R                  S5        [        5       U l	        O9[        U R                  [        5      (       a  [        S0 U R                  D6U l	        U R                  R                  U R                  R                  -  U R                  l        [        TU ]<  " S0 UD6  g )N
model_typellamaz7`text_config` is None. Initializing with default valueszK`vision_config` is None. Initializing with default JanusVisionConfig valueszF`vq_config` is None. Initializing with default JanusVQVAEConfig valuesrG   )
isinstanceru   dictgetr   loggerinforv   r1   rw   rZ   r8   
patch_sizerc   super__post_init__)selfkwargs	__class__s     rX   r   JanusConfig.__post_init__   sW   d&&---1-=-=-A-A,PW-XD\*-d.>.>|.LMaPTP`P`aD%KKQR-g68D%KKef!2!4D**D11!2!HT5G5G!HD>>!KK`a-/DN---??DN &*%7%7%B%BdFXFXFcFc%c"''rW   )rH   rI   rJ   rK   rL   r{   r   r1   rZ   sub_configsru   r~   r   rN   rv   rw   rx   rM   ry   rS   r   rV   __classcell__r   s   @rX   rr   rr      s    2 J!*%K 37K((4/648M4**T1804It&&-4 NC  $$( (rW   rr   c                   \   ^  \ rS rSr% \\S'   SrSrSrSS/r	SS	/r
SrSrSrU 4S
 jrSrU =r$ )JanusPreTrainedModel   configmodelimagetextTLlamaDecoderLayerJanusVisionEncoderLayerpast_key_valuescausal_maskc                   > [         TU ]  U5        [        U[        5      (       a\  [        R
                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      5        g g )N)rh   r   )r   _init_weightsr}   JanusVisionEmbeddingsinitcopy_position_idstorcharangeshapeexpand)r   moduler   s     rX   r   "JanusPreTrainedModel._init_weights   s^    f%f344JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 5rW   rG   )rH   rI   rJ   rK   rr   rN   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraphr   rV   r   r   s   @rX   r   r      sO    (&*#,.GH#4m"DN!i irW   r   z9
    Base class for Janus VQ-VAE mode model outputs.
    )custom_introc                   j    \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
g)JanusVQVAEOutput   z
decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
    Reconstructed pixel values after encoding and decoding the input.
embedding_loss (`torch.FloatTensor`):
    Embedding loss.
Ndecoded_pixel_valuesembedding_lossrG   )rH   rI   rJ   rK   rL   r   r   FloatTensorrN   r   rV   rG   rW   rX   r   r      s4     6:%++d29/3NE%%,3rW   r   c                       \ rS rSrSrg)JanusBaseModelOutputWithPast   rG   NrH   rI   rJ   rK   rV   rG   rW   rX   r   r          rW   r   c                       \ rS rSrSrg)JanusCausalLMOutputWithPast   rG   Nr   rG   rW   rX   r   r      r   rW   r   c                   V    \ rS rSrSS\R
                  S\S\R
                  4S jjrSrg)	r      pixel_valuesinterpolate_pos_encodingreturnc                 V   UR                   u    p4nU R                  R                  R                  nU R                  UR	                  US95      nUR                  S5      R                  SS5      nU(       a  U R                  XU5      n	OU R                  U R                  5      n	X-   nU$ )N)dtyper   rh   )
r   patch_embeddingweightr   toflatten	transposer   position_embeddingr   )
r   r   r   _heightwidthtarget_dtypepatch_embeds
embeddings
pos_embedss
             rX   forwardJanusVisionEmbeddings.forward   s    *001e++2288++LOO,O,OP!))!,66q!<
#66z5QJ001B1BCJ,
rW   rG   N)F)	rH   rI   rJ   rK   r   TensorrS   r   rV   rG   rW   rX   r   r      s,    ELL D ]b]i]i  rW   r   c                      ^  \ rS rSrSrS\4U 4S jjr SS\R                  S\R                  S-  S\	\
   4S	 jjrS
rU =r$ )JanusVisionAttentioni  z(Attention Class for Janus Vision Encoderr   c                 ^  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l
        UR                  nUR                  nSU l        SU l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  5      U l        US:  a  [        R,                  " U5      O[        R.                  " 5       U l        U(       a   [        R0                  " U R                  5      O[        R.                  " 5       U l        U(       a&  [        R0                  " U R                  5      U l        g [        R.                  " 5       U l        g )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Frh   biasr   )r   __init__r   r3   r]   r7   	num_headshead_dim
ValueErrorscaleattention_dropoutrA   rB   	is_causalnum_key_value_groupsr   Linearr<   q_projk_projv_projprojection_layerDropoutIdentity	LayerNormq_normk_norm)r   r   proj_dropoutqk_normr   s       rX   r   JanusVisionAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ %&!ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta6=bll4>>22;;=6=bll4>>22;;=rW   Nhidden_statesattention_maskr   c                     UR                  5       u  pEnU R                  U5      nU R                  U5      nU R                  U5      n	UR	                  SU R
                  U R                  5      nU R                  U5      nUR	                  SU R
                  U R                  5      nU R                  U5      nUR	                  XEU R
                  U R                  5      R                  SS5      nUR	                  XEU R
                  U R                  5      R                  SS5      nU	R                  XEU R
                  U R                  5      R                  SS5      n	[        R                  " U R                  R                  [        5      n
U
" U UUU	U4U R                   (       d  SOU R"                  U R$                  U R&                  S.UD6u  pUR	                  XEU R(                  5      nU R+                  U5      nU R-                  U5      nX4$ )Nr   rh   r   r=   )rl   scalingr   )sizer   r   r   reshaper   r   r   r   r   viewr   get_interfacer   _attn_implementationr*   trainingr   r   r   r]   r   rA   )r   r   r   r   
batch_sizeseq_lenr   query_states
key_statesvalue_statesattention_interfaceattn_outputattn_weightsoutputs                 rX   r   JanusVisionAttention.forward#  s    "/!3!3!5
Q{{=1[[/
{{=1#++BN{{<0''DNNDMMJ
[[,
#++JQUQ^Q^_iijkmno''
T^^T]][eefgijk
#((dnndmm\ffghjkl(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HJJnn
%
 
%
! "))*t~~N&&{3((0##rW   )r   r   r]   r   r   r   r   r   r   rA   r   r   r   r   r   N)rH   rI   rJ   rK   rL   r1   r   r   r   r   r   r   rV   r   r   s   @rX   r   r     sT    2Q0 Q@ /3)$||)$ t+)$ +,	)$ )$rW   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )JanusVisionMLPiO  r   c                    > [         TU ]  5         Xl        [        UR                  UR
                  -  5      U l        [        UR                     U l	        [        R                  " UR                  U R                  5      U l        [        R                  " U R                  UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        g r  )r   r   r   rM   r3   r;   rU   r	   r:   activation_fnr   r   fc1fc2r   r>   dropout1dropout2r   r   r   s     rX   r   JanusVisionMLP.__init__P  s    !$V%7%7&:J:J%J!K#F$5$5699V//1G1GH99T33V5G5GH

6#=#=>

6#=#=>rW   r   r   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ r  )r  r  r  r  r	  r   r   s     rX   r   JanusVisionMLP.forwardZ  sP    /**=9m4/m4rW   )r  r   r  r	  r  r  rU   )rH   rI   rJ   rK   r1   r   r   r   r   rV   r   r   s   @rX   r  r  O  s0    ?0 ?U\\ ell  rW   r  c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )r   ic  r   c                 J  > [         TU ]  U5        Xl        UR                  U l        [        U5      U l        [        R                  " U R                  UR                  S9U l
        [        R                  " U R                  UR                  S9U l        [        U5      U l        g )N)eps)r   r   r   r3   r]   r   	self_attnr   r   layer_norm_epslayer_norm1layer_norm2r  mlpr
  s     rX   r    JanusVisionEncoderLayer.__init__d  st     ++-f5<<F<Q<QR<<F<Q<QR!&)rW   )r   r]   r  r  r  r  rH   rI   rJ   rK   r1   r   rV   r   r   s   @rX   r   r   c  s    *0 * *rW   r   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )JanusVisionEncoderin  r   c                    > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        U5      PM     sn5      U l        g s  snf r  )r   r   r   
ModuleListranger5   r   layersr   r   r   r   s      rX   r   JanusVisionEncoder.__init__o  sF     mmeTZTlTlNm$nNm%<V%DNm$no$ns   A)r  r  r   s   @rX   r  r  n  s    p0 p prW   r  c            
          ^  \ rS rSr\\S.rS\4U 4S jjr  SS\	R                  S-  S\S\\   S	\\-  4S
 jjrSrU =r$ )JanusVisionModelit  r   
attentionsr   c                 D   > [         TU ]  U5        [        U5      U l        g r  )r   r   r  encoderr
  s     rX   r   JanusVisionModel.__init__z  s     )&1rW   Nr   r   r   r   c                     Uc  [        S5      eU R                  XS9nU R                  " SSU0UD6nUR                  nU R	                  U5      nUS S 2SS S 24   nU R	                  U5      n[        UUS9$ )Nz You have to specify pixel_values)r   inputs_embedsr   )last_hidden_statepooler_outputrG   )r   r   r&  r*  post_layernormr   )r   r   r   r   r   encoder_outputsr*  pooled_outputs           rX   r   JanusVisionModel.forward~  s     ?@@h+/<< ,
',
,

 ,== //0AB)!Q'2++M:)/'
 	
rW   )r&  NF)rH   rI   rJ   rK   r   r   _can_record_outputsr1   r   r   r   rS   r   r   rP   r   r   rV   r   r   s   @rX   r"  r"  t  sm    0*
20 2 26).
''$.
 #'
 +,	

 
+	+
 
rW   r"  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )JanusVisionAlignerMLPi  r   c           	        > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " [        SUR                  5       Vs/ s H.  n[        R                  " UR
                  UR
                  5      PM0     sn5      U l
        [        UR                     U l        g s  snf Nrh   )r   r   r   r   r3   r@   r  r  r  rE   hidden_layersr	   r:   r  r  s      rX   r   JanusVisionAlignerMLP.__init__  s    99V//1F1FG]]NSTUW]WcWcNdeNdRYYv,,f.C.CDNde
 $F$5$56 f   (5Cc                     U R                  U5      nU R                   H  nU R                  U5      nU" U5      nM     U$ r  r  r6  r  r   r   layers      rX   r   JanusVisionAlignerMLP.forward  B    /''E ..}=M!-0M ( rW   r  r  r6  )	rH   rI   rJ   rK   r1   r   r   rV   r   r   s   @rX   r3  r3    s    70 7 rW   r3  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )JanusVQVAEVectorQuantizeri  r   c                 N   > [         TU ]  U5        UR                  /S-  U l        g )Nr   )r   r   rc   quant_state_dimsr
  s     rX   r   "JanusVQVAEVectorQuantizer.__init__  s&     !'!3!3 4q 8rW   image_tokensr   c                 >   UR                   S   nU R                  R                  R                   S   nU R                  U5      n[        R                  " USSS9nUR                  U/U R                  QUP75      nUR                  SSSS5      R                  5       nU$ )Nr   r   r   )pdimr   rh   )	r   	embeddingr   F	normalizer   rC  permute
contiguous)r   rE  r   emb_dimhidden_state_quants        rX   get_codebook_entry,JanusVQVAEVectorQuantizer.get_codebook_entry  s    !''*
~~,,2226 "^^L9[[);qbI 044j5b4CXCX5bZa5bc/771aCNNP!!rW   )rC  )rH   rI   rJ   rK   rZ   r   r   
LongTensorr   rP  rV   r   r   s   @rX   rA  rA    s4    9/ 9"u/?/? "EDUDU " "rW   rA  c                       \ rS rSrSrg)JanusVQVAEResnetBlocki  rG   Nr   rG   rW   rX   rT  rT    r   rW   rT  c                       \ rS rSrSrg)JanusVQVAEAttnBlocki  rG   Nr   rG   rW   rX   rV  rV    r   rW   rV  c                       \ rS rSrSrg)JanusVQVAEConvDownsamplei  rG   Nr   rG   rW   rX   rX  rX    r   rW   rX  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JanusVQVAEConvUpsamplei  c                 l   > [         TU ]  5         [        R                  R	                  XSSSS9U l        g )Nr   rh   kernel_sizestridepadding)r   r   r   r   Conv2dconv)r   rd   r   s     rX   r   JanusVQVAEConvUpsample.__init__  s,    HHOOK!TU_`Oa	rW   c                 T    [         R                  " USSS9nU R                  U5      nU$ )Ng       @nearest)scale_factormode)rJ  interpolatera  r  s     rX   r   JanusVQVAEConvUpsample.forward  s(    m#IV		-0rW   )ra  )rH   rI   rJ   rK   r   r   rV   r   r   s   @rX   rZ  rZ    s    b rW   rZ  c                   n   ^  \ rS rSrS\S\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	JanusVQVAEMidBlocki  r   channelsc                    > [         TU ]  5         [        UUUS9U l        [	        U5      U l        [        UUUS9U l        g )Nr   rd   re   )r   r   rT  block_1rV  attn_1block_2)r   r   rk  r   s      rX   r   JanusVQVAEMidBlock.__init__  sF    , !

 *(3, !
rW   r   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r  )rn  ro  rp  r  s     rX   r   JanusVQVAEMidBlock.forward  s2    ]3M2]3rW   )ro  rn  rp  )rH   rI   rJ   rK   rZ   rM   r   r   r   r   rV   r   r   s   @rX   rj  rj    s7    
/ 
3 
U\\ ell  rW   rj  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )JanusVQVAEEncoderi  c           
        > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nUR                  nUR                  nUR                  n[        R                  R                  X2SSSS9U l        S[        U5      -   nXpl        [        R                   " 5       U l        [%        U R                  5       GH   n[        R                   " 5       n	[        R                   " 5       n
X'U   -  nX&U   -  n[%        U R
                  5       HM  nU	R'                  [)        UUUS95        UnXR                  S-
  :X  d  M3  U
R'                  [+        U5      5        MO     [        R,                  " 5       nXl        Xl        XR                  S-
  :w  a  [3        U5      Ul        U R"                  R'                  U5        GM     [7        UW5      U l        [        R                  R;                  SUSSS	9U l        [        R                  R                  UU(       a  S
U-  OUSSSS9U l        g )Nr   rh   r\  )rh   rm  rb   ư>T
num_groupsnum_channelsr  affiner   ) r   r   lenrj   num_resolutionsrk   rg   rd   r_   ra   r   r   r`  conv_inrP   in_channel_multiplierr  downr  appendrT  rV  ModuleblockattnrX  
downsamplerj  mid	GroupNormnorm_outconv_out)r   r   rg   rd   r_   ra   rj   r  i_levelr  r  block_in	block_outi_blockr  r   s                  rX   r   JanusVQVAEEncoder.__init__  s   "6#<#<=$33,,((,, 00#66xx{qYZdef $u-?'@ @%:"MMO	T112GMMOE==?D$W'EEH%7(CCI !4!45)%$,%. %22Q66KK 3H => 6 99;DJI..22":8"DIIT"- 30 &fh7**bxUYbf*g#0Ao ( 
rW   r   c                    U R                  U5      /n[        U R                  5       H  n[        U R                  5       H  nU R                  U   R
                  U   " US   5      n[        U R                  U   R                  5      S:  a"  U R                  U   R                  U   " U5      nUR                  U5        M     X0R                  S-
  :w  d  M  UR                  U R                  U   R                  US   5      5        M     US   nU R                  U5      nU R                  U5      nU[        R                  " U5      -  nU R                  U5      nU$ )Nr   r   rh   )r~  r  r}  rk   r  r  r|  r  r  r  r  r  r   sigmoidr  )r   r   r   r  r  hidden_stater*  s          rX   r   JanusVQVAEEncoder.forward"  sB   l34T112G !4!45#yy177@!"%  tyy)../!3#'99W#5#:#:7#CL#QL$$\2 6 ..22$$TYYw%7%B%B=QSCT%UV 3 *"- HH%67 !MM*;<U]]+<== MM*;<  rW   )r~  r  r  r  r  r  rk   r}  )
rH   rI   rJ   rK   r   r   rR  r   rV   r   r   s   @rX   ru  ru    s     1
f!E$4$4 ! !rW   ru  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )JanusVQVAEDecoderi;  c           
      d  > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nUR                  nX!R                  U R                  S-
     -  n[        R                  R                  X5SSSS9U l        [        X5      U l        [        R                  " 5       U l        [#        [%        U R                  5      5       H  n[        R                  " 5       n[        R                  " 5       nX!R                  U   -  n	[%        U R
                  S-   5       HM  n
UR'                  [)        UUU	S95        U	nX`R                  S-
  :X  d  M3  UR'                  [+        U5      5        MO     [        R,                  " 5       nX{l        Xl        US:w  a  [3        U5      Ul        U R                   R'                  U5        M     [        R                  R7                  SUSSS	9U l        [        R                  R                  XTSSSS9U l        g )
Nrh   r   r\  rm  r   rb   rw  Trx  )r   r   r|  rj   r}  rk   rg   ra   re   r   r   r`  r~  rj  r  r  upreversedr  r  rT  rV  r  r  r  rZ  upsampler  r  r  )r   r   rg   ra   re   r  r  r  r  r  r  r  r   s               rX   r   JanusVQVAEDecoder.__init__<  s   "6#<#<=$33,, 00** !#<#<T=Q=QTU=U#VV xxaXYcde &f7 --/d&:&: ;<GMMOE==?D%(A(A'(JJI !4!4q!89)%$,%. %22Q66KK 3H => : BHG!|4X>GGNN2) =. **bxUYbf*gAVWabcrW   r  r   c                 r   U R                  U5      nU R                  U5      n[        U R                  5       H  n[        U R                  S-   5       Ho  nU R
                  U   R                  U   " U5      n[        U R
                  U   R                  5      S:  d  MM  U R
                  U   R                  U   " U5      nMq     X R                  S-
  :w  d  M  U R
                  U   R                  U5      nM     U R                  U5      nU[        R                  " U5      -  nU R                  U5      nU$ )Nrh   r   )r~  r  r  r}  rk   r  r  r|  r  r  r  r   r  r  )r   r  r  r  s       rX   r   JanusVQVAEDecoder.forwardj  s   ||L1 xx- T112G !4!4q!89#www/55g>|Ltwww',,-1#'777#3#8#8#A,#OL : ..22#www/88F 3 }}\2l33}}\2rW   )r~  r  r  r  rk   r}  r  )
rH   rI   rJ   rK   r   r   r   r   rV   r   r   s   @rX   r  r  ;  s.    ,d\E$5$5 %:K:K  rW   r  c                      ^  \ rS rSr/ SQr\\S.rSrS\	4U 4S jjr
S\R                  S\R                  4S	 jr\\S\R                  S\\R                  \R                  4   4S
 j5       5       rSrU =r$ )
JanusVQVAEi  )rV  rT  rA  r#  r   r   c                 r   > [         TU ]  U5        [        U5      U l        SU l        U R                  5         g r0  )r   r   r  decodergradient_checkpointing	post_initr
  s     rX   r   JanusVQVAE.__init__  s0     (0&+# 	rW   rE  r   c                    UR                   S   U R                  R                  S   U R                  R                  S   -  :w  aM  [        SU R                  R                  S   U R                  R                  S   -   SUR                    S35      eU R                  R	                  U5      nU R                  U5      nU R                  U5      nU$ )a  
Decodes quantized token IDs into pixel values.
Args:
    image_tokens (torch.LongTensor): Batch of token IDs.
Returns:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Pixel values decoded from the token IDs.
rh   r   z4Expected `image_tokens` to have shape `(batch_size, z)`, but got shape `z`.)r   quantizerC  r   rP  post_quant_convr  )r   rE  codebook_entryr   r   s        rX   decodeJanusVQVAE.decode  s     a DMM$B$B1$EHfHfghHi$iiFt}}GeGefgGhkokxkx  lJ  lJ  KL  lM  HM  GN N""."4"4!5R9  99,G,,^<||M2rW   c                     UR                   S   nU R                  " U4SS0UD6nU R                  UR                  R	                  US5      5      n[        XTR                  5      $ )Nr   return_dictTr   )r   encoder  rE  r   r   r   )r   r   r   r   encode_outputsr   s         rX   r   JanusVQVAE.forward  sa     "''*
\NtNvN#{{>+F+F+K+KJXZ+[\ 46S6STTrW   )r  r  )rH   rI   rJ   rK   r   rT  rV  r1  main_input_namerZ   r   r   rR  r   r  r   r   rP   r   rV   r   r   s   @rX   r  r    s     /) %O/ 5#3#3 8I8I & 	U''	U 
u  %"3"33	4		U  	UrW   r  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )JanusVQVAEAlignerMLPi  r   c           	        > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " [        SUR                  5       Vs/ s H.  n[        R                  " UR
                  UR
                  5      PM0     sn5      U l
        [        UR                     U l        g s  snf r5  )r   r   r   r   r]   r@   r  r  r  r5   r6  r	   r:   r  r  s      rX   r   JanusVQVAEAlignerMLP.__init__  s    99V--v/D/DE]]NSTUW]WoWoNpqNpRYYv,,f.C.CDNpq
 $F$5$56 rr8  c                     U R                  U5      nU R                   H  nU R                  U5      nU" U5      nM     U$ r  r:  r;  s      rX   r   JanusVQVAEAlignerMLP.forward  r>  rW   r?  )	rH   rI   rJ   rK   rZ   r   r   rV   r   r   s   @rX   r  r    s    7/ 7 rW   r  c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr
SrU =r$ )	JanusVQVAEHeadi  zOHead used for sampling tokens in image generation, replacing the usual lm head.r   c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                     U l	        [        R                  " UR
                  UR                  5      U l        g r  )r   r   r   r   rm   r@   proj_outr	   r:   r  r^   vision_headr
  s     rX   r   JanusVQVAEHead.__init__  s^    		&">">@U@UV#F$5$5699V%:%:F<Q<QRrW   r   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r  )r  r  r  r  s     rX   r   JanusVQVAEHead.forward  s6    m4**=9((7rW   )r  r  r  )rH   rI   rJ   rK   rL   rZ   r   r   r   tensorr   rV   r   r   s   @rX   r  r    s5    YS/ SU\\ ell  rW   r  zl
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    c                     ^  \ rS rSrS\4U 4S jjrS rS r\\	S\
R                  S\\   S\\-  4S	 j5       5       rS
\
R"                  S\
R                  S\
R                  4S jr\\	        SS
\
R"                  S-  S\
R                  S-  S\
R&                  S-  S\
R"                  S-  S\S-  S\
R                  S-  S\S-  S\\
R&                  -  S\4S jj5       5       rSrU =r$ )
JanusModeli  r   c                   > [         TU ]  U5        Xl        [        R	                  UR
                  5      U l        [        U R                  R                  5      U l        [        R	                  UR                  5      U l        [        R                  " U R                  R                  R                  U R                  R                  R                  5      U l        [#        U R                  R                  5      U l        ['        U R                  R                  5      U l        [*        R,                  " UR.                  S9U l        SU l        U R5                  5         g )N)r   F)r   r   r   r"  _from_configrv   vision_modelr3  alignerr  rw   vqmodelr   	Embeddingr^   r]   generation_embeddingsr  generation_alignerr  generation_headr    from_configru   language_modelr  r  r
  s     rX   r   JanusModel.__init__  s     ,99&:N:NO,T->->-E-EF!..v/?/?@ &(\\$,,2E2E2T2TVZVbVbViViVsVs%t""6t||7J7J"K-dll.A.AB'336;M;MN&+#rW   c                 6    U R                   R                  5       $ r  )r  get_input_embeddingsr   s    rX   r  JanusModel.get_input_embeddings  s    ""7799rW   c                 :    U R                   R                  U5        g r  )r  set_input_embeddingsr   values     rX   r  JanusModel.set_input_embeddings  s    007rW   r   r   r   c                 p    U R                   " U4SS0UD6nU R                  UR                  5      Ul        U$ )Nr  T)r  r  r*  r+  )r   r   r   vision_outputss       rX   get_image_featuresJanusModel.get_image_features  s;    
 **<TTTVT'+||N4T4T'U$rW   	input_idsr)  image_featuresc           	      F   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S   UR                  S   -  nUR                  S5      R                  U5      R                  UR                  5      n[        X$   R                  5       UR                  5       :H  SU SU 35        U$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
r   devicer   r   rh   z6Image features and image tokens do not match, tokens: z, features: )r  r   r  r   rx   longr  allsumr   	unsqueeze	expand_asr   r   numel)r   r  r)  r  special_image_maskn_image_tokensn_image_featuress          rX   get_placeholder_maskJanusModel.get_placeholder_mask  s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno-3359M9M9OOD^DTT`aq`rs	
 "!rW   Nr   r   r   	use_cachelogits_to_keepc	           
         US L US L-  (       a  [        S5      eUc  U R                  5       " U5      nUb  U R                  USS9R                  n
U
R	                  SUR
                  S   5      nUR                  UR                  UR                  5      nU R                  XUS9nUR                  X5      nU R                  " SUUUUUUS.U	D6n[        UR                  UR                  UR                  UR                   Ub  W
S9$ S S9$ )	NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oneT)r  r   )r)  r  )r)  r   r   r   r  r  )r*  r   r   r$  image_hidden_statesrG   )r   r  r  r+  r   r   r   r  r   r  masked_scatterr  r   r*  r   r   r$  )r   r  r   r   r   r   r)  r  r  r   image_embedsr  image_attention_mask	lm_outputs                 rX   r   JanusModel.forward  sH    -t";<s    557	BM#22<T2R``L)11"m6I6I"6MNN+..}/C/C]EXEXYN#'#<#<~ $= $  *889M^M'' 
')%+)
 
	 ,'99%55#11 ++0<0H
 	

 OS
 	
rW   )	r  r   r  r  r  r  r  r  r  )NNNNNNNr   )rH   rI   rJ   rK   rr   r   r  r  r   r   r   r   r   r   rP   r   r  rR  r  r   r
   rS   rM   r   r   rV   r   r   s   @rX   r  r    sb   { *:8 !--9?@R9S	+	+  "))":?:K:K"]b]n]n"0  .215.204(,26!%-.,
##d*,
 ''$.,
 t+	,

 &&-,
 ,
 ((4/,
 $;,
 ell*,
 
&,
  ,
rW   r  c                   t  ^  \ rS rSrSS0rSrSrS\4U 4S jjrS r	S	 r
S
\R                  S\R                  4S jr\\         SS\R                   S-  S\R"                  S-  S\R                  S-  S\R                   S-  S\S-  S\R"                  S-  S\R                   S-  S\S-  S\\R                  -  S\\   S\4S jj5       5       r      S U 4S jjrS\R                  4S jr\R6                  " 5          S!S
\R                  S-  S\R                   S-  S\S-  4U 4S jjj5       rSrU =r$ )"JanusForConditionalGenerationiJ  zlm_head.weightz(model.language_model.embed_tokens.weightr   Tr   c                    > [         TU ]  U5        Xl        [        U5      U l        [
        R                  " UR                  R                  UR                  R                  SS9U l
        U R                  5         g )NFr   )r   r   r   r  r   r   r   ru   r3   
vocab_sizelm_headr  r
  s     rX   r   &JanusForConditionalGeneration.__init__O  sZ     '
yy!3!3!?!?ASASA^A^ejk 	rW   c                 J    U R                   R                  R                  5       $ r  )r   r  r  r  s    rX   r  2JanusForConditionalGeneration.get_input_embeddingsX  s    zz((==??rW   c                 N    U R                   R                  R                  U5        g r  )r   r  r  r  s     rX   r  2JanusForConditionalGeneration.set_input_embeddings[  s    

!!66u=rW   inputsr   c                 r    U R                   R                  U5      nU R                   R                  U5      nU$ r  )r   r  r  )r   r  r  s      rX   'prepare_embeddings_for_image_generationEJanusForConditionalGeneration.prepare_embeddings_for_image_generation^  s0    zz77?zz44\BrW   Nr  r   r   r   r   r)  labelsr  r  r   c
                    U R                   " SUUUUUUUS.U
D6nUR                  n[        U	[        5      (       a  [	        U	* S5      OU	nU R                  USS2USS24   5      nSnUb3  U R                  " SXU R                  R                  R                  S.U
D6n[        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
)r  r   r   r   r   r)  r  N)logitsr   r  )lossr  r   r   r$  r  rG   )r   r*  r}   rM   slicer  loss_functionr   ru   r  r   r   r   r$  r  )r   r  r   r   r   r   r)  r   r  r  r   outputsr   slice_indicesr  r  s                   rX   r   %JanusForConditionalGeneration.forwardc  s    * ** 	
%)%+'	
 	
  118B>SV8W8W~ot4]kmA}a,?@A%% 9P9P9[9[_eD +#33!//)) ' ; ;
 	
rW   c           	      z   > [         T
U ]  " U4UUUUUS.UD6n	U(       d  UR                  SS5      (       d  X)S'   U	$ )N)r   r)  r   r  is_first_iterationr  Tr   )r   prepare_inputs_for_generationr   )r   r  r   r   r   r)  r  r
  r   model_inputsr   s             rX   r  ;JanusForConditionalGeneration.prepare_inputs_for_generation  sX     w<
+'))1
 
 VZZT%B%B+7(rW   rE  c                 x    U R                   R                  R                  U5      nUR                  SSSS5      nU$ )z
Decodes generated image tokens from language model to continuous pixel values
with VQGAN module via upsampling.
Args:
    image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
        The tensors corresponding to the input images.
r   r   r   rh   )r   r  r  rL  )r   rE  decoded_images      rX   decode_image_tokens1JanusForConditionalGeneration.decode_image_tokens  s:     

**11,?%--aAq9rW   logits_processorc           	        > UR                  SS5      nU R                  " UR                  SS 5      40 UD6u  pgUS:X  a  [        T%U ]  " SUUUS S.UD6$ UR	                  5       [
        R                  [
        R                  4;  a  [        S5      eUR                  5         U R                  UR                  5       5        Ub  UO	[        5       nSUS'   UR                  c  [        R                  S5        S	Ul        UR                  US
'   U R!                  XR"                  U5      u  pnUR$                  UR&                  p[)        UR*                  5      S:w  a  [        SUR*                   S35      eUS LnU R-                  XlUR&                  S9  UR                  (       a;  UR                  S:  a+  UR/                  [1        UR                  5      5        S Ul        U R3                  UUR*                  S   US UUS9nU R4                  " SUUUR6                  S.UD6u  pU R8                  R:                  R<                  R>                  nUR*                  u  pURA                  SS5      nUR                  SS 5      nURA                  SS5      nX'S'   UUS 2S S 24   UR"                  :g  UUS 2S S 24   URB                  S   :g  -  nUUS 2S S 24   RE                  UURF                  5        U RI                  5       " U5      nURK                  SS 5      cA  U RM                  URN                  =(       d    SUS-  [Q        URR                  X-   5      US9US'   [T        RV                  " X4XS9nURX                  nURZ                  nUR\                  nUR^                  nUR`                  nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS n[c        U5       GH_  nU Rd                  " SUUSS.UD6nSU;   a!  US   Rg                  UR&                  5      US'   U R8                  Rh                  " S0 UDUUS.D6nU Rk                  UU5      nURl                  S S 2SS S 24   Ro                  5       n U R8                  Rq                  U 5      n!U" UU!5      n"URr                  (       a:  [T        Rt                  " U"SS9n#[T        Rv                  " U#SS9Ry                  S5      n$O[T        Rz                  " U"SS9n$U$US S 2U4'   [T        R|                  " U$U$/5      n$U$R                  S5      n$U R                  U$5      nGMb     U(       aT  U(       a  UW!4-  nU(       a  UW R                  5       4-  nU(       a  UWR                  -  nU(       a  UWR                  -  nU(       a  [        UW!UUUWR                  S9$ U$ )Ngeneration_moder   generation_config)r  r   r  guidance_scalezGot incompatible mode for Image Generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1`.Tr  zU`guidance_scale` is required for CFG but not provided. Setting to default value of 5.   r  r   z;Expected input ids of shape (batch_size, seq_len), but got z3Passing `inputs embeds` is not supported currently.)r  rh   )r  input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnr  r  )r  r   expand_sizer   boi_token_idr   static)cache_implementationr   max_cache_lenmodel_kwargsr  rG   )r)  r  r
  )output_attentionsoutput_hidden_statesr   )rH  )num_samples)	sequencesscoresr  r$  r   r   )Fpop_prepare_generation_configr   generateget_generation_moder   SAMPLEGREEDY_SEARCHr   validate_validate_model_kwargscopyr   r  r   warning_prepare_model_inputsbos_token_idr   r  r|  r   _prepare_special_tokensr  r   _get_logits_processor_expand_inputs_for_generationnum_return_sequencesr   r  r   rF   repeatgeneration_kwargsmasked_fill_pad_token_idr  r   _prepare_static_cacher  max
max_lengthr   zerosr!  r"  output_scoresoutput_logitsreturn_dict_in_generater  r  r   r  #_update_model_kwargs_for_generationr*  cloner  	do_samplesoftmaxmultinomialsqueezeargmaxcatr  r  rR   r$  r   r   r   )&r   r  r   r  r   r  r  r   r  model_input_namer   r  kwargs_has_attention_maskrF   r   r   input_tokensmaskr)  generated_tokensr!  r"  r>  r?  r@  
raw_scores
raw_logitsdecoder_hidden_statesdecoder_attentionsir  r  r  r%  next_token_scoresprobs
next_tokenr   s&                                        rX   r(  &JanusForConditionalGeneration.generate  s    !**%6?*.*I*IJJ*D1+
5;+
'
 f$7# -"3#	
   002>;P;PR`RnRn:ooT  	""$##L$5$5$78 0@/K+QdQf %)[!++3NNrs/0,):)I)I%& 594N4N22L5
1	\ ")9)9vy1$MiooM^EF  %3$$>!$$%6ZcZjZj$k ++0A0P0PST0T##$IJ[JjJj$kl/3,  55/!*!3'%)- 6 
 #'"D"D #
))>>#
 	#
	  ::2299JJ'oo
 ''1-%))*:DA'..q!4)7%& Z[!^,0A0N0NNa(,=,O,OP^,__
 	Z[!^$11$8I8V8VW113LA-t4<.2.H.H%6%K%K%Wx%>!"3">">@P@Z[) /I /L*+ !;;
'EUb .??0EE)77)77"3"K"K3RD
3RD
'>CW^b$;@QRX\'(A  == +|X\`lL  </1=>N1O1R1RS`SgSg1h-.jj// "3%9G  CCG\ZL"44QAX>DDFL ZZ//=F 0F C !**&7R@"..u!DLLRP
"\\*;D
%/QT" J
#;<J#--b1J HHTMQ )T #vi'
|11355
 "g&8&88"#%)>)>>%",*!-3 ' 7 7  $#rW   )r   r  r   )	NNNNNNNNr   )NNNNNF)NNN) rH   rI   rJ   rK   _tied_weights_keysoutput_modalitiesr   rr   r   r  r  r   r   r  r   r   rR  r   r
   rS   rM   r   r   r   r   r  r  no_gradr   r(  rV   r   r   s   @rX   r  r  J  s   *,VW)!{ @>ell u|| 
  .215.204(,26*.!%-./
##d*/
 ''$./
 t+	/

 &&-/
 /
 ((4//
   4'/
 $;/
 ell*/
 +,/
 
%/
  /
h  @
 
 ]]_ '+267;	$t#$ ((4/$ .4	$ $rW   r  )r   r  r  r  r"  rZ   r1   rr   )dcollections.abcr   dataclassesr   r   torch.nn.functionalr   
functionalrJ  huggingface_hub.dataclassesr    r   r   activationsr	   cache_utilsr
   configuration_utilsr   
generationr   r   r   r   generation.utilsr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   autor   r   r    blip_2.modeling_blip_2r!   !chameleon.configuration_chameleonr"   chameleon.modeling_chameleonr#   r$   r%   r&   r'   idefics.modeling_ideficsr(   r)   llama.modeling_llamar*   siglip.configuration_siglipr+   siglip.modeling_siglipr,   r-   r.   
get_loggerrH   r   r1   rZ   rr   r   r   r   r   r   r  r   r  r   r  r"  r3  rA  rT  rV  rX  rZ  rj  ru  r  r  r  r  r  r  __all__rG   rW   rX   <module>rs     s   % !    .  & !   3 u u 9 X X F &  9 8 5 D  e : < ^ ^ 			H	%
 <=)* )  >)2 <=#!+ #!  >#!L <==(" =(  >=(@ i? i i$ 
 	4{ 	4 	4	#A 		"? 	2 "I$299 I$XRYY (*0 *p p#
' #
LBII $" = "*	< 		8 		B 	RYY  ,J!		 J!ZA		 AH2U 2Uj299 $RYY   
l
% l

l
^x$$8/ x$v		rW   