
    Z jX                        S SK Jr  S SKJr  S SKrS SKJs  Jr  S SKJr  SSK	J
r  SSKJr  SSKJr  SS	KJrJrJrJr  SS
KJr  SSKJr  SSKJrJrJr  SSKJrJr  SSK J!r!  SSK"J#r#J$r$J%r%J&r&J'r'J(r(  SSK)J*r*  SSK+J,r,  SSK-J.r.  SSK/J0r0J1r1J2r2  \&Rf                  " \45      r5\$ " S S\5      5       r6\$" SS9\ " S S\5      5       5       r7\$" SS9\ " S S\5      5       5       r8\$" SS9\ " S  S!\5      5       5       r9 " S" S#\Rt                  5      r;S$\Rx                  S%\=S&\Rx                  4S' jr> S\S(\Rt                  S)\Rx                  S*\Rx                  S+\Rx                  S,\Rx                  S-  S-\?S.\?S/\!\#   4S0 jjr@ " S1 S2\Rt                  5      rA " S3 S4\Rt                  5      rB " S5 S6\5      rC " S7 S8\Rt                  5      rD\$ " S9 S:\65      5       rE " S; S<\Rt                  5      rF " S= S>\Rt                  5      rG " S? S@\Rt                  5      rH " SA SB\Rt                  5      rI " SC SD\Rt                  5      rJ " SE SF\Rt                  5      rK " SG SH\Rt                  5      rL " SI SJ\Rt                  5      rM " SK SL\Rt                  5      rN\$\ " SM SN\5      5       5       rO\$" SOS9 " SP SQ\65      5       rP " SR SS\Rt                  5      rQ " ST SU\Rt                  5      rR\$" SVS9 " SW SX\65      5       rS " SY SZ\6\5      rT/ S[QrUg)]    )Callable)	dataclassN)nn   )initialization)ACT2FN)Cache)%ClassifierFreeGuidanceLogitsProcessorGenerationMixinGenerationModeLogitsProcessorList)GenerateDecoderOnlyOutput)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check	torch_int)merge_with_config_defaults)capture_outputs   )	AutoModel   )JanusConfigJanusVisionConfigJanusVQVAEConfigc                   \   ^  \ rS rSr% \\S'   SrSrSrSS/r	SS	/r
SrSrSrU 4S
 jrSrU =r$ )JanusPreTrainedModel/   configmodelimagetextTLlamaDecoderLayerJanusVisionEncoderLayerpast_key_valuescausal_maskc                   > [         TU ]  U5        [        U[        5      (       a\  [        R
                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      5        g g )Nr    r1   )super_init_weights
isinstanceJanusVisionEmbeddingsinitcopy_position_idstorcharangeshapeexpand)selfmodule	__class__s     y/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/janus/modeling_janus.pyr4   "JanusPreTrainedModel._init_weights<   s^    f%f344JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 5     )__name__
__module____qualname____firstlineno__r!   __annotations__base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraphr4   __static_attributes____classcell__r@   s   @rA   r%   r%   /   sO    (&*#,.GH#4m"DN!i irC   r%   z9
    Base class for Janus VQ-VAE mode model outputs.
    )custom_introc                   j    \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
g)JanusVQVAEOutputB   z
decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
    Reconstructed pixel values after encoding and decoding the input.
embedding_loss (`torch.FloatTensor`):
    Embedding loss.
Ndecoded_pixel_valuesembedding_lossrD   )rE   rF   rG   rH   __doc__rY   r:   FloatTensorrI   rZ   rR   rD   rC   rA   rW   rW   B   s4     6:%++d29/3NE%%,3rC   rW   zy
    Base class for Janus model's outputs that may also contain a past key/values (to speed up sequential decoding).
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\
S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   S	rg)
JanusBaseModelOutputWithPastT   aa  
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
    Sequence of hidden-states at the output of the last layer of the model.

    If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
    hidden_size)` is output.
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
    `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
    input) to speed up sequential decoding.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
    sequence_length, hidden_size)`.

    image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
Nlast_hidden_stater.   hidden_states
attentionsimage_hidden_statesrD   )rE   rF   rG   rH   r[   r`   r:   r\   rI   r.   r	   ra   tuplerb   rc   rR   rD   rC   rA   r^   r^   T   s|    & 37u((4/6$(OUT\(59M5**+d2926Je''(4/6;?u001D8?rC   r^   zQ
    Base class for Janus causal language model (or autoregressive) outputs.
    c                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S	'   S
rg)JanusCausalLMOutputWithPastu   a1  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
    sequence_length, hidden_size)`.

    image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
Nlosslogitsr.   ra   rb   rc   rD   )rE   rF   rG   rH   r[   rh   r:   r\   rI   ri   r.   r	   ra   rd   rb   rc   rR   rD   rC   rA   rf   rf   u   s    " &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/6;?u001D8?rC   rf   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S
\
S\R                  4S jjrSrU =r$ )r6      r'   c                 ^  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l
        U R
                  U R                  -  S-  U l        U R                  U l        [        R                  " U R                  U R                  5      U l        U R                  S[         R"                  " U R                  5      R%                  S5      SS9  g )Nvalid)in_channelsout_channelskernel_sizestridepaddingr   r9   r2   F)
persistent)r3   __init__r'   hidden_size	embed_dim
image_size
patch_sizer   Conv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr:   r;   r=   r>   r'   r@   s     rA   rt   JanusVisionEmbeddings.__init__   s    ++ ++ ++!yy++?? 
 !OOt>1D!--"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]joprC   
embeddingsheightwidthreturnc                    UR                   S   nU R                  R                  R                   S   n[        R                  R                  5       (       d%  XE:X  a   X#:X  a  U R                  U R                  5      $ U R                  R                  R                  S5      nUR                   S   nX R                  -  nX0R                  -  n	[        US-  5      n
UR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SSS	9nUR                  SSSS5      R                  SSU5      nU$ )
a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing and no class embeddings.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r    r   r1   g      ?r   r   bicubicF)sizemodealign_corners)r<   r   weightr:   jit
is_tracingr9   	unsqueezerx   r   reshapepermuter   
functionalinterpolateview)r>   r   r   r   r|   r}   patch_pos_embeddim
new_height	new_widthsqrt_num_positionss              rA   interpolate_pos_encoding.JanusVisionEmbeddings.interpolate_pos_encoding   s:    !&&q)//66<<Q? yy##%%+*F6?**4+<+<==1188BB1Er".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#NrC   pixel_valuesr   c                 V   UR                   u    p4nU R                  R                  R                  nU R                  UR	                  US95      nUR                  S5      R                  SS5      nU(       a  U R                  XU5      n	OU R                  U R                  5      n	X-   nU$ )N)dtyper   r    )
r<   r{   r   r   toflatten	transposer   r   r9   )
r>   r   r   _r   r   target_dtypepatch_embedsr   
pos_embedss
             rA   forwardJanusVisionEmbeddings.forward   s    *001e++2288++LOO,O,OP!))!,66q!<
#66z5QJ001B1BCJ,
rC   )r'   rv   rw   r|   r}   r{   rx   r   )F)rE   rF   rG   rH   r"   rt   r:   Tensorintr   boolr   rR   rS   rT   s   @rA   r6   r6      sj    q0 q($5<< $ $UX $]b]i]i $LELL D ]b]i]i  rC   r6   ra   n_repr   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r    N)r<   r=   r   )ra   r   batchnum_key_value_headsslenhead_dims         rA   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTrC   r?   querykeyvalueattention_maskscalingdropoutkwargsc                    [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub  X-   n
[
        R                  R                  U
S[        R                  S9R                  UR                  5      n
[
        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr   r   r1   )r   r   )ptrainingr    )r   num_key_value_groupsr:   matmulr   r   r   softmaxfloat32r   r   r   r   
contiguous)r?   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsattn_outputs               rA   eager_attention_forwardr      s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$rC   c                      ^  \ rS rSrSrS\4U 4S jjr SS\R                  S\R                  S-  S\	\
   4S	 jjrS
rU =r$ )JanusVisionAttentioni  z(Attention Class for Janus Vision Encoderr'   c                 ^  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l
        UR                  nUR                  nSU l        SU l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  5      U l        US:  a  [        R,                  " U5      O[        R.                  " 5       U l        U(       a   [        R0                  " U R                  5      O[        R.                  " 5       U l        U(       a&  [        R0                  " U R                  5      U l        g [        R.                  " 5       U l        g )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      Fr    biasr   )r3   rt   r'   ru   rv   num_attention_heads	num_headsr   
ValueErrorscaleattention_dropoutprojection_dropoutuse_qk_norm	is_causalr   r   Linearattention_biasq_projk_projv_projprojection_layerDropoutIdentity	LayerNormq_normk_norm)r>   r'   proj_dropoutqk_normr@   s       rA   rt   JanusVisionAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ %&!ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta6=bll4>>22;;=6=bll4>>22;;=rC   Nra   r   r   c                     UR                  5       u  pEnU R                  U5      nU R                  U5      nU R                  U5      n	UR	                  SU R
                  U R                  5      nU R                  U5      nUR	                  SU R
                  U R                  5      nU R                  U5      nUR	                  XEU R
                  U R                  5      R                  SS5      nUR	                  XEU R
                  U R                  5      R                  SS5      nU	R                  XEU R
                  U R                  5      R                  SS5      n	[        R                  " U R                  R                  [        5      n
U
" U UUU	U4U R                   (       d  SOU R"                  U R$                  U R&                  S.UD6u  pUR	                  XEU R(                  5      nU R+                  U5      nU R-                  U5      nX4$ )Nr1   r    r           )r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   get_interfacer'   _attn_implementationr   r   r   r   r   rv   r   r   )r>   ra   r   r   
batch_sizeseq_lenr   query_statesr   r   attention_interfacer   r   outputs                 rA   r   JanusVisionAttention.forward%  s    "/!3!3!5
Q{{=1[[/
{{=1#++BN{{<0''DNNDMMJ
[[,
#++JQUQ^Q^_iijkmno''
T^^T]][eefgijk
#((dnndmm\ffghjkl(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HJJnn
%
 
%
! "))*t~~N&&{3((0##rC   )r   r'   rv   r   r   r   r   r   r   r   r   r   r   r   r   N)rE   rF   rG   rH   r[   r"   rt   r:   r   r   r   r   rR   rS   rT   s   @rA   r   r     sT    2Q0 Q@ /3)$||)$ t+)$ +,	)$ )$rC   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )JanusVisionMLPiQ  r'   c                    > [         TU ]  5         Xl        [        UR                  UR
                  -  5      U l        [        UR                     U l	        [        R                  " UR                  U R                  5      U l        [        R                  " U R                  UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        g r   )r3   rt   r'   r   ru   	mlp_ratiointermediate_sizer   
hidden_actactivation_fnr   r   fc1fc2r   hidden_dropout_ratedropout1dropout2r   s     rA   rt   JanusVisionMLP.__init__R  s    !$V%7%7&:J:J%J!K#F$5$5699V//1G1GH99T33V5G5GH

6#=#=>

6#=#=>rC   ra   r   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ r   )r   r   r   r   r   r>   ra   s     rA   r   JanusVisionMLP.forward\  sP    /**=9m4/m4rC   )r   r'   r   r   r   r   r   )rE   rF   rG   rH   r"   rt   r:   r   r   rR   rS   rT   s   @rA   r   r   Q  s0    ?0 ?U\\ ell  rC   r   c            	          ^  \ rS rSrS\4U 4S jjr\S\R                  S\R                  S\	\
   S\R                  4S j5       rS	rU =r$ )
r-   ie  r'   c                 H  > [         TU ]  5         UR                  U l        [        R
                  " U R                  UR                  S9U l        [        U5      U l	        [        R
                  " U R                  UR                  S9U l
        [        U5      U l        Xl        g N)eps)r3   rt   ru   rv   r   r   layer_norm_epslayer_norm1r   	self_attnlayer_norm2r   mlpr'   r   s     rA   rt    JanusVisionEncoderLayer.__init__f  sr    ++<<F<Q<QR-f5<<F<Q<QR!&)rC   ra   r   r   r   c                     UnU R                  U5      nU R                  " SUUS.UD6u  pXA-   nUnU R                  U5      nU R                  U5      nXA-   nU$ )N)ra   r   rD   )r  r  r  r  )r>   ra   r   r   residualr   s         rA   r   JanusVisionEncoderLayer.forwardo  sz     !((7>> 
')
 

 !0 ((7/ 0rC   )r'   rv   r  r  r  r  )rE   rF   rG   rH   r"   rt   r   r:   r   r   r   r\   r   rR   rS   rT   s   @rA   r-   r-   e  s^    0  ||  +,	
 
		 rC   r-   c                   z   ^  \ rS rSrSrS\4U 4S jjr\ SS\R                  S-  S\
\   S\4S	 jj5       rS
rU =r$ )JanusVisionEncoderi  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`JanusVisionEncoderLayer`].

Args:
    config: JanusVisionConfig
r'   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf NF)
r3   rt   r'   r   
ModuleListrangenum_hidden_layersr-   layersgradient_checkpointingr>   r'   r   r@   s      rA   rt   JanusVisionEncoder.__init__  sT    mmeTZTlTlNm$nNm%<V%DNm$no&+# %os   A&Nr   r   r   c                 R    UnU R                    H  nU" UU40 UD6nM     [        US9$ )N)r`   )r  r   )r>   inputs_embedsr   r   ra   encoder_layers         rA   r   JanusVisionEncoder.forward  s>     &![[M) M ) ??rC   )r'   r  r  r   )rE   rF   rG   rH   r[   r"   rt   r   r:   r   r   r   r   r   rR   rS   rT   s   @rA   r  r    s`    ,0 ,  /3@ t+@ +,	@
 
@ @rC   r  c                      ^  \ rS rSr% SrSr\\S'   \\	S.r
S\4U 4S jjr\\" SS9\  SS\R                   S	-  S
\S\\   S\\-  4S jj5       5       5       rS rSrU =r$ )JanusVisionModeli  r   )r*   r'   ra   rb   c                    > [         TU ]  U5        Xl        UR                  n[	        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l        U R                  5         g r   )r3   rt   r'   ru   r6   r   r  encoderr   r   r  post_layernorm	post_init)r>   r'   rv   r@   s      rA   rt   JanusVisionModel.__init__  sY     &&	/7)&1 ll9:O:OPrC   F)tie_last_hidden_statesNr   r   r   c                     Uc  [        S5      eU R                  XS9nU R                  " SSU0UD6nUR                  nU R	                  U5      nUS S 2SS S 24   nU R	                  U5      n[        UUS9$ )Nz You have to specify pixel_values)r   r  r   )r`   pooler_outputrD   )r   r   r  r`   r  r   )r>   r   r   r   ra   encoder_outputsr`   pooled_outputs           rA   r   JanusVisionModel.forward  s     ?@@h+/<< ,
',
,

 ,== //0AB)!Q'2++M:)/'
 	
rC   c                     U R                   $ r   )r   r>   s    rA   get_input_embeddings%JanusVisionModel.get_input_embeddings  s    rC   )r'   r   r  r  r  )rE   rF   rG   rH   main_input_namerK   r"   rI   r-   r   _can_record_outputsrt   r   r   r   r:   r\   r   r   r   rd   r   r   r)  rR   rS   rT   s   @rA   r  r    s    $O!0*
	0 	  E2 26).
''$.
 #'
 +,	

 
+	+
  3  
6 rC   r  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )JanusVisionAlignerMLPi  r'   c           	        > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " [        SUR                  5       Vs/ s H.  n[        R                  " UR
                  UR
                  5      PM0     sn5      U l
        [        UR                     U l        g s  snf Nr    )r3   rt   r   r   ru   projection_dimr   r  r  depthhidden_layersr   r   r   r  s      rA   rt   JanusVisionAlignerMLP.__init__  s    99V//1F1FG]]NSTUW]WcWcNdeNdRYYv,,f.C.CDNde
 $F$5$56 f   (5Cc                     U R                  U5      nU R                   H  nU R                  U5      nU" U5      nM     U$ r   r   r3  r   r>   ra   layers      rA   r   JanusVisionAlignerMLP.forward  B    /''E ..}=M!-0M ( rC   r   r   r3  )	rE   rF   rG   rH   r"   rt   r   rR   rS   rT   s   @rA   r.  r.    s    70 7 rC   r.  c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	S\R                  S\R                  4S	 jrS
rU =r$ )JanusVQVAEVectorQuantizeri  a  
A module for vector quantization using learned embedding vectors.

This module implements the quantization process similar to te one described in
the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
input vectors into discrete codebook vectors, which are learned during training.
Current implementation improves over previous ones by avoiding costly matrix multiplications
and allowing for post-hoc remapping of indices.
r'   c                   > [         TU ]  5         UR                  U l        UR                  U l        [        USS5      U l        [        R                  " U R                  U R                  5      U l	        UR                  /S-  U l        g )Nbetag      ?r   )r3   rt   num_embeddingsrv   embedding_dimgetattrr@  r   r~   	embeddingr|   quant_state_dimsr   s     rA   rt   "JanusVQVAEVectorQuantizer.__init__  sn    $33#--FFD1	d&9&94;M;MN!'!3!3 4q 8rC   hidden_statec           
      >   UR                  SSSS5      R                  5       nUR                  SU R                  5      n[        R
                  " US-  SSS9[        R
                  " U R                  R                  S-  SS9-   S[        R                  " S	X R                  R                  R                  SS5      5      -  -
  n[        R                  " USS9nU R                  U5      R                  UR                  5      n[        R                  " UR                  5       U-
  S-  5      U R                  [        R                  " XQR                  5       -
  S-  5      -  -   nXU-
  R                  5       -   nUR                  SSSS5      R                  5       nXVU4$ )
Nr   r   r   r    r1   T)r   keepdimr   z	bd,dn->bn)r   r   r   rB  r:   sumrD  r   einsumr   argminr<   meandetachr@  )r>   rG  hidden_state_flattened	distancesmin_encoding_indiceshidden_state_quantrh   s          rA   r   !JanusVQVAEVectorQuantizer.forward  s   #++Aq!Q7BBD!-!2!22t7I7I!J II,a/QEii--q0a89%,,{,BNNDYDYDcDcdeghDijjk 	  %||I1=!^^,@AFF|GYGYZ zz-446E!KLtyy[`[e[e"5"5"77A=\
 P
 

 *,-N,V,V,XX 0771aCNNP!)===rC   image_tokensr   c                 >   UR                   S   nU R                  R                  R                   S   nU R                  U5      n[        R                  " USSS9nUR                  U/U R                  QUP75      nUR                  SSSS5      R                  5       nU$ )Nr   r1   r   )r   r   r   r    )	r<   rD  r   F	normalizer   rE  r   r   )r>   rU  r   emb_dimrS  s        rA   get_codebook_entry,JanusVQVAEVectorQuantizer.get_codebook_entry"  s    !''*
~~,,2226 "^^L9[[);qbI 044j5b4CXCX5bZa5bc/771aCNNP!!rC   )r@  rD  rB  rA  rE  )rE   rF   rG   rH   r[   r#   rt   r:   r   r   
LongTensorr\   rZ  rR   rS   rT   s   @rA   r>  r>    sI    9/ 9>ELL >6"u/?/? "EDUDU " "rC   r>  c                   6   ^  \ rS rSr  SU 4S jjrS rSrU =r$ )JanusVQVAEResnetBlocki2  c                   > [         TU ]  5         X l        Uc  UOUU l        X@l        [
        R                  R                  SUSSS9U l        [
        R                  R                  X#SSSS9U l
        [
        R                  R                  SUSSS9U l        [
        R                  R                  UR                  5      U l        [
        R                  R                  X3SSSS9U l        U R                  U R                  :w  a]  U R                  (       a&  [
        R                  R                  X#SSSS9U l        g [
        R                  R                  X#SSSS9U l        g g )	N    ư>T
num_groupsrz   r   affiner   r    rp   rq   rr   r   )r3   rt   rn   ro   use_conv_shortcutr:   r   	GroupNormnorm1ry   conv1norm2r   r   conv2conv_shortcutnin_shortcut)r>   r'   rn   ro   rl  r@   s        rA   rt   JanusVQVAEResnetBlock.__init__3  s%    	&+7+?K\!.XX''2KUYbf'g
XX__[AVWab_c
XX''2LVZcg'h
xx''7XX__\QWXbc_d
t000%%%*XX__[\]fgqr_%s"$)HHOOK[\efpqO$r!	 1rC   c                    UnU R                  U5      nU[        R                  " U5      -  nU R                  U5      nU R	                  U5      nU[        R                  " U5      -  nU R                  U5      nU R                  U5      nU R                  U R                  :w  a7  U R                  (       a  U R                  U5      nX!-   $ U R                  U5      nX!-   $ r   )rh  r:   sigmoidri  rj  r   rk  rn   ro   rf  rl  rm  )r>   ra   r  s      rA   r   JanusVQVAEResnetBlock.forwardJ  s     

=1}55

=1

=1}55]3

=1t000%%--h7 ''  ,,X6''rC   )
ri  rk  rl  r   rn   rm  rh  rj  ro   rf  r  rE   rF   rG   rH   rt   r   rR   rS   rT   s   @rA   r^  r^  2  s    
 s.( (rC   r^  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JanusVQVAEAttnBlocki^  c                   > [         TU ]  5         Xl        [        R                  R                  SUSSS9U l        [        R                  R                  XSSSS9U l        [        R                  R                  XSSSS9U l	        [        R                  R                  XSSSS9U l
        [        R                  R                  XSSSS9U l        g )Nr`  ra  Trb  r    r   re  )r3   rt   rn   r:   r   rg  normry   qkvproj_outr>   rn   r@   s     rA   rt   JanusVQVAEAttnBlock.__init___  s    &HH&&";TXae&f	qQR\]^qQR\]^qQR\]^aXYcderC   c                 Z   UnU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nUR                  u  pgpUR                  XgX-  5      R                  SSS5      nUR                  XgX-  5      n[        R                  " X45      n
U
[        U5      S-  -  n
[        R                  " U
SS9n
UR                  XgX-  5      nU
R                  SSS5      n
[        R                  " XZ5      R                  XgX5      nU R                  U5      nX+-   $ )Nr   r   r    r   rJ  )rv  rw  rx  ry  r<   r   r   r:   bmmr   rW  r   rz  )r>   ra   r  r   r   r   r   channelsr   r   r   r   s               rA   r   JanusVQVAEAttnBlock.forwardi  s    		-0vvm,VVM*
vvm, /;.@.@+
f#++J&.QYYZ[]^`ab''
fnM
yy:#s8}'>?yy15 $++J&.Q#++Aq!4ii;CCJZ`hmmK0%%rC   )rn   rx  rv  rz  rw  ry  rr  rT   s   @rA   rt  rt  ^  s    f& &rC   rt  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JanusVQVAEConvDownsamplei  c                 Z   > [         TU ]  5         [        R                  " XSSSS9U l        g )Nr   r   r   re  )r3   rt   r   ry   convr{  s     rA   rt   !JanusVQVAEConvDownsample.__init__  s%    IIkAaYZ[	rC   c                 V    [         R                  " USSSS9nU R                  U5      nU$ )N)r   r    r   r    constantr   )padr   r   )rW  r  r  r   s     rA   r    JanusVQVAEConvDownsample.forward  s+    mJVWX		-0rC   r  rr  rT   s   @rA   r  r    s    \ rC   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JanusVQVAEConvUpsamplei  c                 l   > [         TU ]  5         [        R                  R	                  XSSSS9U l        g )Nr   r    re  )r3   rt   r:   r   ry   r  r{  s     rA   rt   JanusVQVAEConvUpsample.__init__  s,    HHOOK!TU_`Oa	rC   c                 T    [         R                  " USSS9nU R                  U5      nU$ )Ng       @nearest)scale_factorr   )rW  r   r  r   s     rA   r   JanusVQVAEConvUpsample.forward  s(    m#IV		-0rC   r  rr  rT   s   @rA   r  r    s    b rC   r  c                   n   ^  \ rS rSrS\S\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	JanusVQVAEMidBlocki  r'   r  c                    > [         TU ]  5         [        UUUS9U l        [	        U5      U l        [        UUUS9U l        g )Nr'   rn   ro   )r3   rt   r^  block_1rt  attn_1block_2)r>   r'   r  r@   s      rA   rt   JanusVQVAEMidBlock.__init__  sF    , !

 *(3, !
rC   ra   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r  r  r  r   s     rA   r   JanusVQVAEMidBlock.forward  s2    ]3M2]3rC   )r  r  r  )rE   rF   rG   rH   r#   r   rt   r:   r   r   rR   rS   rT   s   @rA   r  r    s7    
/ 
3 
U\\ ell  rC   r  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )JanusVQVAEEncoderi  c           
        > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nUR                  nUR                  nUR                  n[        R                  R                  X2SSSS9U l        S[        U5      -   nXpl        [        R                   " 5       U l        [%        U R                  5       GH   n[        R                   " 5       n	[        R                   " 5       n
X'U   -  nX&U   -  n[%        U R
                  5       HM  nU	R'                  [)        UUUS95        UnXR                  S-
  :X  d  M3  U
R'                  [+        U5      5        MO     [        R,                  " 5       nXl        Xl        XR                  S-
  :w  a  [3        U5      Ul        U R"                  R'                  U5        GM     [7        UW5      U l        [        R                  R;                  SUSSS	9U l        [        R                  R                  UU(       a  S
U-  OUSSSS9U l        g )Nr   r    re  )r    r  r`  ra  Trb  r   ) r3   rt   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsrn   double_latentlatent_channelsr:   r   ry   conv_inrd   in_channel_multiplierr  downr  appendr^  rt  Moduleblockattnr  
downsampler  midrg  norm_outconv_out)r>   r'   r  rn   r  r  r  r  i_levelr  r  block_in	block_outi_blockr  r@   s                  rA   rt   JanusVQVAEEncoder.__init__  s   "6#<#<=$33,,((,, 00#66xx{qYZdef $u-?'@ @%:"MMO	T112GMMOE==?D$W'EEH%7(CCI !4!45)%$,%. %22Q66KK 3H => 6 99;DJI..22":8"DIIT"- 30 &fh7**bxUYbf*g#0Ao ( 
rC   r   c                    U R                  U5      /n[        U R                  5       H  n[        U R                  5       H  nU R                  U   R
                  U   " US   5      n[        U R                  U   R                  5      S:  a"  U R                  U   R                  U   " U5      nUR                  U5        M     X0R                  S-
  :w  d  M  UR                  U R                  U   R                  US   5      5        M     US   nU R                  U5      nU R                  U5      nU[        R                  " U5      -  nU R                  U5      nU$ )Nr1   r   r    )r  r  r  r  r  r  r  r  r  r  r  r  r:   rp  r  )r>   r   ra   r  r  rG  r`   s          rA   r   JanusVQVAEEncoder.forward  sB   l34T112G !4!45#yy177@!"%  tyy)../!3#'99W#5#:#:7#CL#QL$$\2 6 ..22$$TYYw%7%B%B=QSCT%UV 3 *"- HH%67 !MM*;<U]]+<== MM*;<  rC   )r  r  r  r  r  r  r  r  )
rE   rF   rG   rH   rt   r:   r\  r   rR   rS   rT   s   @rA   r  r    s     1
f!E$4$4 ! !rC   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )JanusVQVAEDecoderi  c           
      d  > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nUR                  nX!R                  U R                  S-
     -  n[        R                  R                  X5SSSS9U l        [        X5      U l        [        R                  " 5       U l        [#        [%        U R                  5      5       H  n[        R                  " 5       n[        R                  " 5       nX!R                  U   -  n	[%        U R
                  S-   5       HM  n
UR'                  [)        UUU	S95        U	nX`R                  S-
  :X  d  M3  UR'                  [+        U5      5        MO     [        R,                  " 5       nX{l        Xl        US:w  a  [3        U5      Ul        U R                   R'                  U5        M     [        R                  R7                  SUSSS	9U l        [        R                  R                  XTSSSS9U l        g )
Nr    r   re  r  r   r`  ra  Trb  )r3   rt   r  r  r  r  r  r  ro   r:   r   ry   r  r  r  r  upreversedr  r  r^  rt  r  r  r  r  upsamplerg  r  r  )r>   r'   r  r  ro   r  r  r  r  r  r  r  r@   s               rA   rt   JanusVQVAEDecoder.__init__  s   "6#<#<=$33,, 00** !#<#<T=Q=QTU=U#VV xxaXYcde &f7 --/d&:&: ;<GMMOE==?D%(A(A'(JJI !4!4q!89)%$,%. %22Q66KK 3H => : BHG!|4X>GGNN2) =. **bxUYbf*gAVWabcrC   rG  r   c                 r   U R                  U5      nU R                  U5      n[        U R                  5       H  n[        U R                  S-   5       Ho  nU R
                  U   R                  U   " U5      n[        U R
                  U   R                  5      S:  d  MM  U R
                  U   R                  U   " U5      nMq     X R                  S-
  :w  d  M  U R
                  U   R                  U5      nM     U R                  U5      nU[        R                  " U5      -  nU R                  U5      nU$ )Nr    r   )r  r  r  r  r  r  r  r  r  r  r  r:   rp  r  )r>   rG  r  r  s       rA   r   JanusVQVAEDecoder.forward*  s   ||L1 xx- T112G !4!4q!89#www/55g>|Ltwww',,-1#'777#3#8#8#A,#OL : ..22#www/88F 3 }}\2l33}}\2rC   )r  r  r  r  r  r  r  )
rE   rF   rG   rH   rt   r:   r\   r   rR   rS   rT   s   @rA   r  r    s.    ,d\E$5$5 %:K:K  rC   r  c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Srg)JanusVQVAEModelOutputi?  a  
quantized_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
    Quantized last hidden state from the VQ-VAE model.
image_tokens (`torch.FloatTensor` of shape `(batch_size, config.vocab_size`):
    Indices of the image tokens predicted by the VQ-VAE model.
embedding_loss (`torch.FloatTensor`):
    The embedding loss computed during quantization.
Nquantized_last_hidden_staterU  rZ   rD   )rE   rF   rG   rH   r[   r  r:   r\   rI   rU  rZ   rR   rD   rC   rA   r  r  ?  sJ     =A!2!2T!9@-1L%##d*1/3NE%%,3rC   r  aS  
    The VQ-VAE model used in Janus for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    c                   <  ^  \ rS rSr% \\S'   / SQr\\S.r	Sr
S\4U 4S jjr\\S\R                  S\\   S\4S	 j5       5       rS
\R                  S\R(                  4S jr\\S\R(                  S\\R(                  \R(                  4   4S j5       5       rSrU =r$ )
JanusVQVAEiP  r'   )rt  r^  r>  r  r   c                   > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        R                  R                  UR                  UR                  S5      U l        [        R                  R                  UR                  UR                  S5      U l        U R                  5         [        U5      U l        SU l        U R#                  5         g )Nr    F)r3   rt   r  r  r>  quantizer:   r   ry   r  rv   
quant_convpost_quant_convevalr  decoderr  r  r   s     rA   rt   JanusVQVAE.__init__e  s     (01&9((//&*@*@&BRBRTUV$xxv/?/?AWAWYZ[		(0&+#rC   r   r   c                     U R                  U5      nU R                  U5      nU R                  U5      u  pVn[        UUUUS9$ )N)r`   r  rU  rZ   )r  r  r  r  )r>   r   r   ra   conv_hidden_statesr  emb_lossindicess           rA   encodeJanusVQVAE.encodeq  sO     \2!__];9=GY9Z6#w$+(C #	
 	
rC   rU  c                    UR                   S   U R                  R                  S   U R                  R                  S   -  :w  aM  [        SU R                  R                  S   U R                  R                  S   -   SUR                    S35      eU R                  R	                  U5      nU R                  U5      nU R                  U5      nU$ )a  
Decodes quantized token IDs into pixel values.
Args:
    image_tokens (torch.LongTensor): Batch of token IDs.
Returns:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Pixel values decoded from the token IDs.
r    r   z4Expected `image_tokens` to have shape `(batch_size, z)`, but got shape `z`.)r<   r  rE  r   rZ  r  r  )r>   rU  codebook_entryra   r   s        rA   decodeJanusVQVAE.decode~  s     a DMM$B$B1$EHfHfghHi$iiFt}}GeGefgGhkokxkx  lJ  lJ  KL  lM  HM  GN N""."4"4!5R9  99,G,,^<||M2rC   c                     UR                   S   nU R                  " U4SS0UD6nU R                  UR                  R	                  US5      5      n[        XTR                  5      $ )Nr   return_dictTr1   )r<   r  r  rU  r   rW   rZ   )r>   r   r   r   encode_outputsrY   s         rA   r   JanusVQVAE.forward  sa     "''*
\NtNvN#{{>+F+F+K+KJXZ+[\ 46S6STTrC   )r  r  r  r  r  r  )rE   rF   rG   rH   r#   rI   rM   r^  rt  r,  r+  rt   r   r   r:   r\  r   r   r  r  r\   r  r   r   rd   r   rR   rS   rT   s   @rA   r  r  P  s      /) %O
/ 
  	
5#3#3 	
vFX?Y 	
^s 	
   	
5#3#3 8I8I & 	U''	U 
u  %"3"33	4		U  	UrC   r  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )JanusVQVAEAlignerMLPi  r'   c           	        > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " [        SUR                  5       Vs/ s H.  n[        R                  " UR
                  UR
                  5      PM0     sn5      U l
        [        UR                     U l        g s  snf r0  )r3   rt   r   r   rv   r1  r   r  r  r  r3  r   r   r   r  s      rA   rt   JanusVQVAEAlignerMLP.__init__  s    99V--v/D/DE]]NSTUW]WoWoNpqNpRYYv,,f.C.CDNpq
 $F$5$56 rr5  c                     U R                  U5      nU R                   H  nU R                  U5      nU" U5      nM     U$ r   r7  r8  s      rA   r   JanusVQVAEAlignerMLP.forward  r;  rC   r<  )	rE   rF   rG   rH   r#   rt   r   rR   rS   rT   s   @rA   r  r    s    7/ 7 rC   r  c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr
SrU =r$ )	JanusVQVAEHeadi  zOHead used for sampling tokens in image generation, replacing the usual lm head.r'   c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                     U l	        [        R                  " UR
                  UR                  5      U l        g r   )r3   rt   r   r   image_token_embed_dimr1  rz  r   r   r   rA  vision_headr   s     rA   rt   JanusVQVAEHead.__init__  s^    		&">">@U@UV#F$5$5699V%:%:F<Q<QRrC   ra   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )rz  r   r  r   s     rA   r   JanusVQVAEHead.forward  s6    m4**=9((7rC   )r   rz  r  )rE   rF   rG   rH   r[   r#   rt   r:   r   tensorr   rR   rS   rT   s   @rA   r  r    s5    YS/ SU\\ ell  rC   r  zl
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    c                     ^  \ rS rSrS\4U 4S jjrS rS r\\	S\
R                  S\\   S\\-  4S	 j5       5       rS
\
R"                  S\
R                  S\
R                  4S jr\\	        SS
\
R"                  S-  S\
R                  S-  S\
R&                  S-  S\
R"                  S-  S\S-  S\
R                  S-  S\S-  S\\
R&                  -  S\4S jj5       5       rSrU =r$ )
JanusModeli  r'   c                   > [         TU ]  U5        Xl        [        R	                  UR
                  5      U l        [        U R                  R                  5      U l        [        R	                  UR                  5      U l        [        R                  " U R                  R                  R                  U R                  R                  R                  5      U l        [#        U R                  R                  5      U l        ['        U R                  R                  5      U l        [*        R,                  " UR.                  S9U l        SU l        U R5                  5         g )N)r'   F)r3   rt   r'   r  _from_configvision_configvision_modelr.  alignerr  	vq_configvqmodelr   r~   rA  rv   generation_embeddingsr  generation_alignerr  generation_headr   from_configtext_configlanguage_modelr  r  r   s     rA   rt   JanusModel.__init__  s     ,99&:N:NO,T->->-E-EF!..v/?/?@ &(\\$,,2E2E2T2TVZVbVbViViVsVs%t""6t||7J7J"K-dll.A.AB'336;M;MN&+#rC   c                 6    U R                   R                  5       $ r   )r  r)  r(  s    rA   r)  JanusModel.get_input_embeddings  s    ""7799rC   c                 :    U R                   R                  U5        g r   )r  set_input_embeddingsr>   r   s     rA   r  JanusModel.set_input_embeddings  s    007rC   r   r   r   c                 p    U R                   " U4SS0UD6nU R                  UR                  5      Ul        U$ )Nr  T)r  r  r`   r#  )r>   r   r   vision_outputss       rA   get_image_featuresJanusModel.get_image_features  s;    
 **<TTTVT'+||N4T4T'U$rC   	input_idsr  image_featuresc           	      F   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S   UR                  S   -  nUR                  S5      R                  U5      R                  UR                  5      n[        X$   R                  5       UR                  5       :H  SU SU 35        U$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
r   devicer1   r   r    z6Image features and image tokens do not match, tokens: z, features: )r)  r:   r  r'   image_token_idlongr	  allrK  r<   r   	expand_asr   r   numel)r>   r  r  r  special_image_maskn_image_tokensn_image_featuress          rA   get_placeholder_maskJanusModel.get_placeholder_mask  s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno-3359M9M9OOD^DTT`aq`rs	
 "!rC   Nr   r9   r.   	use_cachelogits_to_keepc	           
         US L US L-  (       a  [        S5      eUc  U R                  5       " U5      nUb  U R                  USS9R                  n
U
R	                  SUR
                  S   5      nUR                  UR                  UR                  5      nU R                  XUS9nUR                  X5      nU R                  " SUUUUUUS.U	D6n[        UR                  UR                  UR                  UR                   Ub  W
S9$ S S9$ )	NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oneT)r  r1   )r  r  )r  r   r9   r.   r  r  )r`   r.   ra   rb   rc   rD   )r   r)  r  r#  r   r<   r   r	  r   r  masked_scatterr  r^   r`   r.   ra   rb   )r>   r  r   r   r9   r.   r  r  r  r   image_embedsr  image_attention_mask	lm_outputs                 rA   r   JanusModel.forward  sH    -t";<s    557	BM#22<T2R``L)11"m6I6I"6MNN+..}/C/C]EXEXYN#'#<#<~ $= $  *889M^M'' 
')%+)
 
	 ,'99%55#11 ++0<0H
 	

 OS
 	
rC   )	r  r'   r  r  r  r  r  r  r  )NNNNNNNr   )rE   rF   rG   rH   r!   rt   r)  r  r   r   r:   r\   r   r   rd   r   r  r\  r  r   r	   r   r   r^   r   rR   rS   rT   s   @rA   r  r    sb   { *:8 !--9?@R9S	+	+  "))":?:K:K"]b]n]n"0  .215.204(,26!%-.,
##d*,
 ''$.,
 t+	,

 &&-,
 ,
 ((4/,
 $;,
 ell*,
 
&,
  ,
rC   r  c                   t  ^  \ rS rSrSS0rSrSrS\4U 4S jjrS r	S	 r
S
\R                  S\R                  4S jr\\         SS\R                   S-  S\R"                  S-  S\R                  S-  S\R                   S-  S\S-  S\R"                  S-  S\R                   S-  S\S-  S\\R                  -  S\\   S\4S jj5       5       r      S U 4S jjrS\R                  4S jr\R6                  " 5          S!S
\R                  S-  S\R                   S-  S\S-  4U 4S jjj5       rSrU =r$ )"JanusForConditionalGenerationi5  zlm_head.weightz(model.language_model.embed_tokens.weightr)   Tr'   c                    > [         TU ]  U5        Xl        [        U5      U l        [
        R                  " UR                  R                  UR                  R                  SS9U l
        U R                  5         g )NFr   )r3   rt   r'   r  r(   r   r   r  ru   
vocab_sizelm_headr  r   s     rA   rt   &JanusForConditionalGeneration.__init__:  sZ     '
yy!3!3!?!?ASASA^A^ejk 	rC   c                 J    U R                   R                  R                  5       $ r   )r(   r  r)  r(  s    rA   r)  2JanusForConditionalGeneration.get_input_embeddingsC  s    zz((==??rC   c                 N    U R                   R                  R                  U5        g r   )r(   r  r  r  s     rA   r  2JanusForConditionalGeneration.set_input_embeddingsF  s    

!!66u=rC   inputsr   c                 r    U R                   R                  U5      nU R                   R                  U5      nU$ r   )r(   r  r  )r>   r&  rG  s      rA   'prepare_embeddings_for_image_generationEJanusForConditionalGeneration.prepare_embeddings_for_image_generationI  s0    zz77?zz44\BrC   Nr  r   r   r9   r.   r  labelsr  r  r   c
                    U R                   " SUUUUUUUS.U
D6nUR                  n[        U	[        5      (       a  [	        U	* S5      OU	nU R                  USS2USS24   5      nSnUb3  U R                  " SXU R                  R                  R                  S.U
D6n[        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
)r  r   r   r9   r.   r  r  N)ri   r*  r  )rh   ri   r.   ra   rb   rc   rD   )r(   r`   r5   r   slicer   loss_functionr'   r  r  rf   r.   ra   rb   rc   )r>   r  r   r   r9   r.   r  r*  r  r  r   outputsra   slice_indicesri   rh   s                   rA   r   %JanusForConditionalGeneration.forwardN  s    * ** 	
%)%+'	
 	
  118B>SV8W8W~ot4]kmA}a,?@A%% 9P9P9[9[_eD +#33!//)) ' ; ;
 	
rC   c           	      z   > [         T
U ]  " U4UUUUUS.UD6n	U(       d  UR                  SS5      (       d  X)S'   U	$ )N)r.   r  r   r  is_first_iterationr  Tr   )r3   prepare_inputs_for_generationget)r>   r  r   r.   r   r  r  r2  r   model_inputsr@   s             rA   r3  ;JanusForConditionalGeneration.prepare_inputs_for_generation  sX     w<
+'))1
 
 VZZT%B%B+7(rC   rU  c                 x    U R                   R                  R                  U5      nUR                  SSSS5      nU$ )z
Decodes generated image tokens from language model to continuous pixel values
with VQGAN module via upsampling.
Args:
    image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
        The tensors corresponding to the input images.
r   r   r   r    )r(   r  r  r   )r>   rU  decoded_images      rA   decode_image_tokens1JanusForConditionalGeneration.decode_image_tokens  s:     

**11,?%--aAq9rC   logits_processorc           	        > UR                  SS5      nU R                  " UR                  SS 5      40 UD6u  pgUS:X  a  [        T%U ]  " SUUUS S.UD6$ UR	                  5       [
        R                  [
        R                  4;  a  [        S5      eUR                  5         U R                  UR                  5       5        Ub  UO	[        5       nSUS'   UR                  c  [        R                  S5        S	Ul        UR                  US
'   U R!                  XR"                  U5      u  pnUR$                  UR&                  p[)        UR*                  5      S:w  a  [        SUR*                   S35      eUS LnU R-                  XlUR&                  S9  UR                  (       a;  UR                  S:  a+  UR/                  [1        UR                  5      5        S Ul        U R3                  UUR*                  S   US UUS9nU R4                  " SUUUR6                  S.UD6u  pU R8                  R:                  R<                  R>                  nUR*                  u  pURA                  SS5      nUR                  SS 5      nURA                  SS5      nX'S'   UUS 2S S 24   UR"                  :g  UUS 2S S 24   URB                  S   :g  -  nUUS 2S S 24   RE                  UURF                  5        U RI                  5       " U5      nURK                  SS 5      cA  U RM                  URN                  =(       d    SUS-  [Q        URR                  X-   5      US9US'   [T        RV                  " X4XS9nURX                  nURZ                  nUR\                  nUR^                  nUR`                  nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS n[c        U5       GH_  nU Rd                  " SUUSS.UD6nSU;   a!  US   Rg                  UR&                  5      US'   U R8                  Rh                  " S0 UDUUS.D6nU Rk                  UU5      nURl                  S S 2SS S 24   Ro                  5       n U R8                  Rq                  U 5      n!U" UU!5      n"URr                  (       a:  [T        Rt                  " U"SS9n#[T        Rv                  " U#SS9Ry                  S5      n$O[T        Rz                  " U"SS9n$U$US S 2U4'   [T        R|                  " U$U$/5      n$U$R                  S5      n$U R                  U$5      nGMb     U(       aT  U(       a  UW!4-  nU(       a  UW R                  5       4-  nU(       a  UWR                  -  nU(       a  UWR                  -  nU(       a  [        UW!UUUWR                  S9$ U$ )Ngeneration_moder+   generation_config)r&  r   r>  guidance_scalezGot incompatible mode for Image Generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1`.Tr  zU`guidance_scale` is required for CFG but not provided. Setting to default value of 5.   r?  r   z;Expected input ids of shape (batch_size, seq_len), but got z3Passing `inputs embeds` is not supported currently.)r	  r    )r>  input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnr;  r	  )r  r   expand_sizer   boi_token_idr.   static)cache_implementationr   max_cache_lenmodel_kwargsr  rD   )r  r  r2  )output_attentionsoutput_hidden_statesr1   rJ  )num_samples)	sequencesscoresri   rb   ra   r.   )Fpop_prepare_generation_configr3   generateget_generation_moder   SAMPLEGREEDY_SEARCHr   validate_validate_model_kwargscopyr   r?  loggerwarning_prepare_model_inputsbos_token_idr   r	  r  r<   _prepare_special_tokensr  r
   _get_logits_processor_expand_inputs_for_generationnum_return_sequencesr(   r  r'   num_image_tokensrepeatgeneration_kwargsmasked_fill_pad_token_idr)  r4  _prepare_static_cacherG  max
max_lengthr:   zerosrJ  rK  output_scoresoutput_logitsreturn_dict_in_generater  r3  r   r  #_update_model_kwargs_for_generationr`   cloner  	do_sampler   multinomialsqueezeargmaxcatr   r(  floatrb   ra   r   r.   )&r>   r&  r   r;  r   r=  r>  rI  r  model_input_namer   r	  kwargs_has_attention_maskr`  r   r   input_tokensmaskr  generated_tokensrJ  rK  ri  rj  rk  
raw_scores
raw_logitsdecoder_hidden_statesdecoder_attentionsir5  r.  rG  rN  next_token_scoresprobs
next_tokenr@   s&                                        rA   rQ  &JanusForConditionalGeneration.generate  s    !**%6?*.*I*IJJ*D1+
5;+
'
 f$7# -"3#	
   002>;P;PR`RnRn:ooT  	""$##L$5$5$78 0@/K+QdQf %)[!++3NNrs/0,):)I)I%& 594N4N22L5
1	\ ")9)9vy1$MiooM^EF  %3$$>!$$%6ZcZjZj$k ++0A0P0PST0T##$IJ[JjJj$kl/3,  55/!*!3'%)- 6 
 #'"D"D #
))>>#
 	#
	  ::2299JJ'oo
 ''1-%))*:DA'..q!4)7%& Z[!^,0A0N0NNa(,=,O,OP^,__
 	Z[!^$11$8I8V8VW113LA-t4<.2.H.H%6%K%K%Wx%>!"3">">@P@Z[) /I /L*+ !;;
'EUb .??0EE)77)77"3"K"K3RD
3RD
'>CW^b$;@QRX\'(A  == +|X\`lL  </1=>N1O1R1RS`SgSg1h-.jj// "3%9G  CCG\ZL"44QAX>DDFL ZZ//=F 0F C !**&7R@"..u!DLLRP
"\\*;D
%/QT" J
#;<J#--b1J HHTMQ )T #vi'
|11355
 "g&8&88"#%)>)>>%",*!-3 ' 7 7  $#rC   )r'   r   r(   )	NNNNNNNNr   )NNNNNF)NNN) rE   rF   rG   rH   _tied_weights_keysoutput_modalitiesrQ   r!   rt   r)  r  r:   r   r(  r   r   r\  r\   r	   r   r   r   r   rf   r   r3  r9  no_gradr   rQ  rR   rS   rT   s   @rA   r  r  5  s   *,VW)!{ @>ell u|| 
  .215.204(,26*.!%-./
##d*/
 ''$./
 t+	/

 &&-/
 /
 ((4//
   4'/
 $;/
 ell*/
 +,/
 
%/
  /
h  @
 
 ]]_ '+267;	$t#$ ((4/$ .4	$ $rC   r  )r%   r  r  r  r  )r   )Vcollections.abcr   dataclassesr   r:   torch.nn.functionalr   r   rW   r   r7   activationsr   cache_utilsr	   
generationr
   r   r   r   generation.utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.genericr   utils.output_capturingr   autor   configuration_janusr!   r"   r#   
get_loggerrE   rX  r%   rW   r^   rf   r  r6   r   r   r   rs  r   r   r   r-   r  r  r.  r>  r^  rt  r  r  r  r  r  r  r  r  r  r  r  __all__rD   rC   rA   <module>r     so  * % !     & !   u u 9 9 X X F & u u 7 5  Q Q 
		H	% i? i i$ 
 	4{ 	4 	4 
 @; @ @6 
 @+ @ @4HBII HV	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%2I$299 I$XRYY ( 8  F@ @D 3+ 3 3lBII $<"		 <"~)(BII )(X &"))  &F	ryy 	RYY  ,J!		 J!ZA		 AH 
46 4  4 DU% DUDUN299 $RYY   
l
% l

l
^x$$8/ x$v	 trC   