
    Z jx                     D   S SK r S SKJr  S SKJr  S SKrS SKJr  SSKJr	  SSK
Jr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJrJrJr  SSKJrJr  SSKJr  SSKJrJrJ r J!r!J"r"  SSK#J$r$  SSK%J&r&  SSK'J(r(  SSK)J*r*J+r+  \ \ " S S\5      5       5       r,\ " SS9\ " S S\5      5       5       r-\ " SS9\ " S S\5      5       5       r.\" S5       " S  S!\R^                  5      5       r0 " S" S#\R^                  5      r1 " S$ S%\R^                  5      r2 SGS&\R^                  S'\Rf                  S(\Rf                  S)\Rf                  S*\Rf                  S-  S+\4S,\44S- jjr5 " S. S/\R^                  5      r6 " S0 S1\R^                  5      r7 " S2 S3\5      r8 " S4 S5\R^                  5      r9 " S6 S7\R^                  5      r: " S8 S9\Rv                  5      r< " S: S;\5      r=S<\Rf                  S=\>4S> jr? " S? S@\=5      r@\ " SAS9 " SB SC\=5      5       rA\  " SD SE\=\5      5       rB/ SFQrCg)H    N)Callable)	dataclass)nn   )initialization)ACT2FN)Cache)GenerationMixin)use_kernel_forward_from_hub)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)merge_with_config_defaults)capture_outputs   )	AutoModel   )Ovis2ConfigOvis2VisionConfigc                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)*BaseModelOutputWithVisualIndicatorFeatures,   z
visual_indicator_features (`torch.FloatTensor` of shape `(batch_size, visual_indicator_size)`):
    Visual indicator features extracted from the model, which can be used for auxiliary tasks or further processing.
Nvisual_indicator_features )
__name__
__module____qualname____firstlineno____doc__r"   torchFloatTensor__annotations____static_attributes__r#       y/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/ovis2/modeling_ovis2.pyr    r    ,   s    
 ;?u0047>r-   r    zJ
    Base class for Llava outputs, with hidden states and attentions.
    custom_introc                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)Ovis2ModelOutputWithPast7   a  
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nimage_hidden_statesr#   )
r$   r%   r&   r'   r(   r4   r)   r*   r+   r,   r#   r-   r.   r2   r2   7   s    	 59**T18r-   r2   zQ
    Base class for Ovis2 causal language model (or autoregressive) outputs.
    c                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S	'   S
rg)Ovis2CausalLMOutputWithPastL   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size (batch_size * num_patches, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nlosslogitspast_key_valueshidden_states
attentionsr4   r#   )r$   r%   r&   r'   r(   r8   r)   r*   r+   r9   r:   r	   r;   tupler<   r4   r,   r#   r-   r.   r6   r6   L   s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r-   r6   RMSNormc                   x   ^  \ rS rSrS
S\SS4U 4S jjjrS\R                  S\R                  4S jrS r	S	r
U =r$ )Ovis2RMSNormj   epsreturnNc                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z+
Ovis2RMSNorm is equivalent to T5LayerNorm
N)super__init__r   	Parameterr)   onesweightvariance_epsilon)selfhidden_sizerB   	__class__s      r.   rF   Ovis2RMSNorm.__init__l   s/     	ll5::k#:; #r-   r;   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nr   Tkeepdim)	dtypetor)   float32powmeanrsqrtrJ   rI   )rK   r;   input_dtypevariances       r.   forwardOvis2RMSNorm.forwardt   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r-   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r=   rI   shaperJ   rK   s    r.   
extra_reprOvis2RMSNorm.extra_repr{   s*    ))*+6$2G2G1HIIr-   )rJ   rI   )gư>)r$   r%   r&   r'   floatrF   r)   Tensorr[   r`   r,   __classcell__rM   s   @r.   r@   r@   j   sB    $ $$ $ $;U\\ ;ell ;J Jr-   r@   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Ovis2VisionMLP   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " U R                  U R                  UR                  S9U l
        [        UR                     U l        g NbiasrE   rF   configrL   intermediate_sizer   Linearmlp_bias	gate_projup_proj	down_projr   
hidden_actact_fnrK   rn   rM   s     r.   rF   Ovis2VisionMLP.__init__       !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r-   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ Nrt   rv   rr   rs   rK   xrt   s      r.   r[   Ovis2VisionMLP.forward   6    NN4;;t~~a/@#ADLLQRO#ST	r-   rv   rn   rt   rr   rL   ro   rs   r$   r%   r&   r'   rF   r[   r,   rd   re   s   @r.   rg   rg          0 r-   rg   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )Ovis2VisionEmbeddings   rn   c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l
        U R
                  U R                  -  S-  U l        U R                  U l        [        R                  " U R                  U R                  5      U l        U R                  S[         R"                  " U R                  5      R%                  S5      SS9  ['        UR                  UR(                  5      U l        g )Nvalid)in_channelsout_channelskernel_sizestridepaddingr   position_idsr   rP   F)
persistent)rE   rF   rn   rL   	embed_dim
image_size
patch_sizer   Conv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr)   arangeexpandr@   rms_norm_epsrms_normrw   s     r.   rF   Ovis2VisionEmbeddings.__init__   s    ++ ++ ++!yy++?? 
 !OOt>1D!--"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jop$V%7%79L9LMr-   pixel_valuesrC   c                     U R                   R                  R                  nU R                  UR                  US95      nUR	                  S5      R                  SS5      nU R                  U5      nX@R                  U R                  5      -   nU$ )NrS   r   r   )	r   rI   rS   rT   flatten	transposer   r   r   )rK   r   target_dtypepatch_embeds
embeddingss        r.   r[   Ovis2VisionEmbeddings.forward   s~    ++2288++LOO,O,OP!))!,66q!<
]]:.
"9"9$:K:K"LL
r-   )	rn   r   r   r   r   r   r   r   r   )r$   r%   r&   r'   r   rF   r)   r*   rc   r[   r,   rd   re   s   @r.   r   r      s4    N0 N*E$5$5 %,,  r-   r   modulequerykeyvalueattention_maskscalingdropoutc                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )NrP   )dimrS   )ptrainingr   r   )r)   matmulr   r   
functionalsoftmaxrU   rT   rS   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r.   eager_attention_forwardr      s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r-   c            
          ^  \ rS rSrSrU 4S jr S
S\R                  S\R                  S-  S\\R                  \R                  S-  4   4S jjr	S	r
U =r$ )Ovis2VisionAttention   z=Multi-headed attention from 'Attention Is All You Need' paperc                 h  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Frk   )rE   rF   rn   rL   r   num_attention_heads	num_headshead_dim
ValueErrorscaleattention_dropoutr   	is_causalr   rp   qkv_biask_projv_projq_projout_projrw   s     r.   rF   Ovis2VisionAttention.__init__   s0   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//iiV__UiiV__UiiV__U		$..$..vWr-   Nr;   r   rC   c                    UR                   SS n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n[        R                  " U R                  R                  [        5      n	U	" U UUUUU R                  U R                  U R                  (       d  SOU R                  S9u  pU
R                   " / UQSP76 R#                  5       n
U R%                  U
5      n
X4$ )z#Input shape: Batch x Time x ChannelNrP   r   r           )r   r   r   )r^   r   r   viewr   r   r   r   get_interfacern   _attn_implementationr   r   r   r   r   reshaper   r   )rK   r;   r   r   input_shapehidden_shapequerieskeysvaluesattention_interfacer   r   s               r.   r[   Ovis2VisionAttention.forward   s6    $))#2.88b8$--8++m,11,?II!QO{{=)..|<FFq!L]+00>HHAN(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
! "));;;;FFHmmK0((r-   )rn   r   r   r   r   r   r   r   r   r   r   r{   )r$   r%   r&   r'   r(   rF   r)   rc   r=   r[   r,   rd   re   s   @r.   r   r      s[    GX, /3!)||!) t+!)
 
u||U\\D00	1!) !)r-   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Ovis2MLPi  c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " U R                  U R                  UR                  S9U l
        [        UR                     U l        g rj   rm   rw   s     r.   rF   Ovis2MLP.__init__  ry   r-   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ r{   r|   r}   s      r.   r[   Ovis2MLP.forward  r   r-   r   r   re   s   @r.   r   r     r   r-   r   c            	          ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S-  S\\	   S\R                  4S	 jjr
S
rU =r$ )Ovis2VisionEncoderLayeri  rn   c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        UR                  UR                  5      U l	        [        UR                  UR                  5      U l
        g r{   )rE   rF   r   	attentionr   ffnr@   rL   r   	rms_norm1	rms_norm2rw   s     r.   rF    Ovis2VisionEncoderLayer.__init__  sZ    -f5F#%f&8&8&:M:MN%f&8&8&:M:MNr-   Nr;   r   r   rC   c                     U R                  U5      nU R                  " SXBS.UD6u  pVX-   nU R                  U5      nU R                  U5      nX-   nU$ )N)r;   r   r#   )r   r   r   r   )rK   r;   r   r   norm_hidden_statesr   _
mlp_outputs           r.   r[   Ovis2VisionEncoderLayer.forward  sa     "^^M:r6Hrkqr%3!^^M:XX01
%2r-   )r   r   r   r   r{   )r$   r%   r&   r'   r   rF   r)   rc   r   r   r[   r,   rd   re   s   @r.   r   r     s^    O0 O /3|| t+ +,	
 
 r-   r   c            	          ^  \ rS rSrSrS\4U 4S jjr\\ SS\	R                  S-  S\\   S\4S	 jj5       5       rS
rU =r$ )Ovis2VisionEncoderi*  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`Ovis2VisionEncoderLayer`].

Args:
    config: Ovis2VisionConfig
rn   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf NF)
rE   rF   rn   r   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointing)rK   rn   r   rM   s      r.   rF   Ovis2VisionEncoder.__init__3  sT    mmeTZTlTlNm$nNm%<V%DNm$no&+# %os   A&Nr   r   rC   c                 P    UnU R                    H  nU" XB40 UD6nM     [        US9$ )Nlast_hidden_state)r   r   )rK   inputs_embedsr   r   r;   encoder_layers         r.   r[   Ovis2VisionEncoder.forward:  s3     &![[M)-R6RM ) ??r-   )rn   r   r   r{   )r$   r%   r&   r'   r(   r   rF   r   r   r)   rc   r   r   r   r[   r,   rd   re   s   @r.   r   r   *  sh    ,0 ,  /3
@ t+
@ +,	
@
 

@  
@r-   r   c                   h   ^  \ rS rSrS\4U 4S jjr\ SS\R                  S-  4S jj5       r	Sr
U =r$ )	Ovis2VisionTransformeriI  rn   c                    > [         TU ]  5         Xl        [        U5      U l        [        U5      U l        [        UR                  UR                  5      U l
        SU l        g r   )rE   rF   rn   r   r   r   encoderr@   rL   r   r   r   rw   s     r.   rF   Ovis2VisionTransformer.__init__J  sM    /7)&1$V%7%79L9LM&+#r-   Nr   c                     U R                  U5      nU R                  " SUUS.UD6nUR                  nU R                  U5      n[	        US9$ )N)r   r   r   r#   )r   r  r   r   r   )rK   r   r   r   r;   encoder_outputsr   s          r.   r[   Ovis2VisionTransformer.forwardR  s_     5+/<< ,
'),
 ,
 ,== MM*;<1BCCr-   )rn   r   r  r   r   r{   )r$   r%   r&   r'   r   rF   r   r)   rc   r[   r,   rd   re   s   @r.   r   r   I  s?    ,0 ,  /3D t+D Dr-   r   c                   \   ^  \ rS rSrS\R
                  S\R
                  4U 4S jjrSrU =r$ )Ovis2VisualEmbeddingTableig  visual_tokensrC   c                   > UR                   [        R                  [        R                  [        R                  [        R
                  [        R                  4;   a  [        TU ]!  U5      $ [        R                  " XR                  5      $ r{   )rS   r)   int8int16int32int64longrE   r[   r   rI   )rK   r	  rM   s     r.   r[   !Ovis2VisualEmbeddingTable.forwardh  sU    5::u{{EKKV[V`V`"aa7?=11||M;;77r-   r#   )	r$   r%   r&   r'   r)   rc   r[   r,   rd   re   s   @r.   r  r  g  s#    8U\\ 8ell 8 8r-   r  c                   b   ^  \ rS rSr% \\S'   SrSrSrS/r	Sr
SrSrSrSrSrSrU 4S jrS	rU =r$ )
Ovis2PreTrainedModelin  rn   model)imagetextTr   r:   c                   > [         TU ]  U5        [        U[        5      (       a\  [        R
                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      5        g g )NrP   r   )rE   _init_weights
isinstancer   initcopy_r   r)   r   r^   r   )rK   r   rM   s     r.   r  "Ovis2PreTrainedModel._init_weights}  s^    f%f344JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 5r-   r#   )r$   r%   r&   r'   r   r+   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_cache_class_supports_flash_attn_supports_flex_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendr  r,   rd   re   s   @r.   r  r  n  sY    (&*#/0"3 N!"&i ir-   r  r9   r   c                     U R                  U5      nUR                  USS9S   n[        R                  " U [        R                  S9R                  XS5      nXBR                  5       -
  U-   nU$ )NTrQ   r   )memory_formatg      ?)r   maxr)   
zeros_likelegacy_contiguous_formatscatter_detach)r9   r   y_softindexy_hardrets         r.   hard_softmaxr2    sf    ^^C FJJsDJ)!,EfE4R4RS\\]`ilmF
==?
"V
+CJr-   c            	          ^  \ rS rSr% \\S'   \\S.rS\4U 4S jjr	\
\S\R                  S\\   S\\-  4S j5       5       rS	rU =r$ )
Ovis2VisionModeli  rn   )r;   r<   c                   > [         TU ]  U5        Xl        [        U5      U l        UR
                  U l        UR                  U l        [        R                  " UR                  UR                  -  UR                  -  U R                  U R
                  -
  SS9U l        [        R                  " U R                  U R
                  -
  5      U l        U R                  5         g NFrk   )rE   rF   rn   r   transformernum_visual_indicator_tokens
vocab_sizer   rp   rL   hidden_stridehead_linear	LayerNorm	head_norm	post_initrw   s     r.   rF   Ovis2VisionModel.__init__  s     1&9+1+M+M( ++99!5!558L8LLOOd>>>

 doo8X8X&XYr-   r   r   rC   c           	         U R                   " U40 UD6nUS   nU R                  R                  S:  a  UR                  u  pVnU R                  R                  n[	        [
        R                  " U5      5      n	X-  U:w  a  [        S5      eXU-  -
  U-  n
[        R                  R                  USSSU
SU
4SS5      nX-  n	UR                  XYU-  XU-  X5      nUR                  SSSSSS5      nUR                  US	X-  U-  5      nU R                  U5      nU R                  U5      nU R                  R                  S
:X  a   [        R                  R!                  US	SS9nO]U R                  R                  S:X  a  [#        US	S9nO8U R                  R                  S:X  a  [        R                  R%                  US	S9n['        UWS9$ )Nr   r   z.Token sequence length must be a perfect squareconstantr   r         rP   gumbel_argmaxT)r   hard	st_argmaxr   r   )r   pooler_output)r7  rn   r:  r^   intmathsqrtr   r   r   padr   permuter;  r=  tokenize_functiongumbel_softmaxr2  r   r    )rK   r   r   outputsr   
num_imagesseq_len
hidden_dimr:  sqrt_lpad_sizer9   
prob_tokens                r.   r[   Ovis2VisionModel.forward  s   
 ""<:6:#AJ;;$$q(.?.E.E+J KK55M7+,F') !QRR%-)?@MQH " 1 12CaAxYZ\dEegqst uF 1 9 9m3]mD[]j! !2 9 9!Q1a K 1 9 9B =
 J! !!"34';;((O;55f"45PJ[[**k9%f"5J[[**i7..v2.>J9/$
 	
r-   )rn   r;  r=  r8  r7  r9  )r$   r%   r&   r'   r   r+   r   r   _can_record_outputsrF   r   r   r)   r*   r   r   r=   r    r[   r,   rd   re   s   @r.   r4  r4    sj    0*
0   &
!--&
9?@R9S&
	;	;&
   &
r-   r4  zu
    The Ovis2 model which consists of a vision backbone and a language model, without a language modeling head.
    c                     ^  \ rS rSrS\4U 4S jjrS rS r\\	" SS9S\
R                  S	\\   S
\\-  4S j5       5       rS\
R"                  S\
R                  S\
R                  4S jr\\	         SS\
R"                  S-  S\
R                  S-  S\
R&                  S-  S\
R"                  S-  S\S-  S\
R                  S-  S\
R"                  S-  S\S-  S\\
R&                  -  S
\\-  4S jj5       5       rSrU =r$ )
Ovis2Modeli  rn   c                   > [         TU ]  U5        [        UR                  5      U l        [
        R                  " UR                  5      U l        [        UR                  R                  UR                  5      U l        UR                  R                  U l        UR                  U l
        UR                  U l        U R                  5         g r{   )rE   rF   r4  vision_configvision_towerr   from_configtext_configlanguage_modelr  r9  rL   visual_embeddings_tablevisual_vocab_sizevisual_indicator_token_idsr>  rw   s     r.   rF   Ovis2Model.__init__  s     ,V-A-AB'33F4F4FG'@AUAUA`A`bhbtbt'u$!'!5!5!@!@ ++*0*K*K'r-   c                 6    U R                   R                  5       $ r{   )r`  get_input_embeddingsr_   s    r.   rf  Ovis2Model.get_input_embeddings  s    ""7799r-   c                 :    U R                   R                  U5        g r{   )r`  set_input_embeddingsrK   r   s     r.   ri  Ovis2Model.set_input_embeddings  s    007r-   zWObtains image last hidden states from the vision tower and apply multimodal projection.r/   r   r   rC   c                 T   U R                   " U4SS0UD6nUR                  nUR                  u  pVn[        R                  " XVU R                   R
                  4UR                  UR                  SUR                  S9n[        R                  " XH/SS9nU R                  U5      n[        R                  " U R                  U R                   R
                  -
  U R                  [        R                  S9R                  UR                  5      n	XCl        U R                  U	5      Ul        U$ )Nreturn_dictTF)rS   devicerequires_gradlayoutr   rG  r   )r]  rH  r^   r)   zerosr8  rS   rn  rp  catra  r   rb  r  rT   r"   )
rK   r   r   image_outputsimage_features
batch_sizeimg_seq_lenr   padding_tensorvisual_indicators
             r.   get_image_featuresOvis2Model.get_image_features  s    )),SDSFS&44%3%9%9"
d&7&7&S&ST &&!((!((
 N#CK55nE <<""T%6%6%R%RR""**
 "^""
#	 	
 '5#262N2NO_2`/r-   	input_idsr   rt  c           	      F   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S   UR                  S   -  nUR                  S5      R                  U5      R                  UR                  5      n[        X$   R                  5       UR                  5       :H  SU SU 35        U$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
rS   rn  rP   r   r   z6Image features and image tokens do not match, tokens: z, features: )rf  r)   tensorrn   image_token_idr  rn  allsumr^   	unsqueeze	expand_asrT   r   numel)rK   r{  r   rt  special_image_maskn_image_tokensn_image_featuress          r.   get_placeholder_maskOvis2Model.get_placeholder_mask  s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno-3359M9M9OOD^DTT`aq`rs	
 "!r-   Nr   r   r:   labels	use_cachelogits_to_keepc
           
         US L US L-  (       a  [        S5      eUc  U R                  5       " U5      nUGb4  U R                  USS9nUR                  nUR                  nU R                  UUUS9nUR                  X5      n[        U R                  5       H  u  nnUcV  X`R                  5       " [        R                  " U[        R                  UR                  S95      :H  nUR                  S5      nOUU:H  R                  UR                  5      nUR                  5       (       d  M  X   R!                  UU   5      R                  UR                  UR"                  5      UU'   M     U R$                  " S	UUUUUU	S.U
D6n['        UR(                  UR*                  UR,                  UR.                  Ub  WS9$ S S9$ )
Nz:You must specify exactly one of input_ids or inputs_embedsT)r   rm  )r   rt  r}  rP   )r   r   r:   r   r  r  )r   r:   r;   r<   r4   r#   )r   rf  ry  rH  r"   r  masked_scatter	enumeraterc  r)   r~  r  rn  r  rT   anyr  rS   r`  r2   r   r:   r;   r<   )rK   r{  r   r   r   r:   r   r  r  r  r   rs  rt  r"   r  ivisual_indicator_idmaskrP  s                      r.   r[   Ovis2Model.forward  s    -t";<YZZ  557	BM# 33[_3`M*88N(5(O(O%!%!:!:+- "; "
 *889K\M*3D4S4S*T&&$(,E,E,G%8

S`SgSgh- D  88B<D%)<<@@AUAUVD88::14"=#67M00-2E2EF "$' +U  %% 
)%+')
 
 (%77#33!//))2>2J
 	

 QU
 	
r-   )r`  r]  ra  rc  rb  r9  	NNNNNNNNr   )r$   r%   r&   r'   r   rF   rf  ri  r   r   r)   r*   r   r   r=   r    ry  
LongTensorr  rc   r	   boolrI  r2   r[   r,   rd   re   s   @r.   rZ  rZ    s   	{ 	:8 n'' +, 
;	;	 8"))":?:K:K"]b]n]n"0  .215.204(,26*.!%-.?
##d*?
 ''$.?
 t+	?

 &&-?
 ?
 ((4/?
   4'?
 $;?
 ell*?
 
)	)?
  ?
r-   rZ  c                     ^  \ rS rSrSS0rS\4U 4S jjrS rS rS\	R                  4S	 jr\S
\R                  S\\   S\\-  4S j5       r\\         SS\R*                  S-  S
\R                  S-  S\R,                  S-  S\R*                  S-  S\S-  S\R                  S-  S\R*                  S-  S\S-  S\\R,                  -  S\\-  4S jj5       5       r      SU 4S jjrSrU =r$ )Ovis2ForConditionalGenerationia  zlm_head.weightz(model.language_model.embed_tokens.weightrn   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g r6  )
rE   rF   rZ  r  r   rp   rL   r9  lm_headr>  rw   s     r.   rF   &Ovis2ForConditionalGeneration.__init__e  sF     '
yy!3!3V5F5FUSr-   c                 6    U R                   R                  5       $ r{   )r  rf  r_   s    r.   rf  2Ovis2ForConditionalGeneration.get_input_embeddingsk  s    zz..00r-   c                 :    U R                   R                  U5        g r{   )r  ri  rj  s     r.   ri  2Ovis2ForConditionalGeneration.set_input_embeddingsn  s    

''.r-   rC   c                     U R                   $ r{   )r  r_   s    r.   get_output_embeddings3Ovis2ForConditionalGeneration.get_output_embeddingsq  s    ||r-   r   r   c                 >    U R                   R                  " SSU0UD6$ )Nr   r#   )r  ry  )rK   r   r   s      r.   ry  0Ovis2ForConditionalGeneration.get_image_featurest  s!     zz,,Q,Q&QQr-   Nr{  r   r   r:   r   r  r  r  c
                    U R                   " SUUUUUUUS.U
D6nUS   n[        U	[        5      (       a  [        U	* S5      OU	nU R	                  USS2USS24   5      nSnUb3  U R
                  " SXU R                  R                  R                  S.U
D6n[        UUUR                  UR                  UR                  UR                  S9$ )a2  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Ovis2ForConditionalGeneration

>>> model = Ovis2ForConditionalGeneration.from_pretrained("thisisiron/Ovis2-2B-hf")
>>> processor = AutoProcessor.from_pretrained("thisisiron/Ovis2-2B-hf")

>>> prompt = "<|im_start|>user\n<image>\nDescribe the image.<|im_end|>\n<|im_start|>assistant\n"
>>> url = "http://images.cocodataset.org/val2014/COCO_val2014_000000537955.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, text=prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs, max_new_tokens=15)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True)[0]
"user\n\nDescribe the image.\nassistant\nThe image features a brown dog standing on a wooden floor, looking up with"
```)r{  r   r   r   r:   r   r  r   N)r9   r  r9  )r8   r9   r:   r;   r<   r4   r#   )r  r  rI  slicer  loss_functionrn   r_  r9  r6   r:   r;   r<   r4   )rK   r{  r   r   r   r:   r   r  r  r  r   rP  r;   slice_indicesr9   r8   s                   r.   r[   %Ovis2ForConditionalGeneration.forwardz  s    X ** 	
%)%+'	
 	
  
8B>SV8W8W~ot4]kmA}a,?@A%% 9P9P9[9[_eD +#33!//)) ' ; ;
 	
r-   c           	      z   > [         T
U ]  " U4UUUUUS.UD6n	U(       d  UR                  SS5      (       d  XIS'   U	$ )N)r:   r   r   r  is_first_iterationr  Tr   )rE   prepare_inputs_for_generationget)rK   r{  r:   r   r   r   r  r  r   model_inputsrM   s             r.   r  ;Ovis2ForConditionalGeneration.prepare_inputs_for_generation  sZ     w<
+'))1
 
 VZZT%B%B
 ,8(r-   )r  r  r  )NNNNNF)r$   r%   r&   r'   _tied_weights_keysr   rF   rf  ri  r   Moduler  r   r)   r*   r   r   r=   r    ry  r   r  rc   r	   r  rI  r6   r[   r  r,   rd   re   s   @r.   r  r  a  s   *,VW{ 1/ryy  R!--R9?@R9SR	;	;R R
  .215.204(,26*.!%-.G
##d*G
 ''$.G
 t+	G

 &&-G
 G
 ((4/G
   4'G
 $;G
 ell*G
 
,	,G
  G
X   r-   r  )r  rZ  r  )r   )DrJ  collections.abcr   dataclassesr   r)   r    r   r  activationsr   cache_utilsr	   
generationr
   integrationsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   utils.output_capturingr   autor   configuration_ovis2r   r   r    r2   r6   r  r@   rg   r   rc   rb   r   r   r   r   r   r   r   r  r  rI  r2  r4  rZ  r  __all__r#   r-   r.   <module>r     s_  *  $ !   & !   ) 7 9 d d F & n n 7 5  ? 
?1K ?  ? 
 96 9 9 
 9+ 9 90 Y'J299 J (J(RYY  BII P %II%<<% 
% <<	%
 LL4'% % %.7)299 7)tryy  8 2@ @>DRYY D<8 8i? i* C >
+ >
B 
K
% K

K
\ A$8/ A AH Rr-   