
    Z jRE                        S SK r S SKJr  S SKrS SKJr  SSKJr  SSKJ	r	  SSK
Jr  SSKJrJr  SS	KJr  SS
KJr  SSKJrJrJr  SSKJr  SSKJr  SSKJrJr  SSKJr  SSK J!r!J"r"  SSK#J$r$J%r%  SSK&J'r'J(r(  SSK)J*r*J+r+  SSK,J-r-J.r.  S\R^                  S\04S jr1\\ " S S\5      5       5       r2 " S S\(5      r3 " S S\'5      r4 " S  S!\"5      r5 " S" S#\!5      r6 " S$ S%\+5      r7 " S& S'\5      r8 " S( S)\5      r9 " S* S+\*5      r: " S, S-\Rv                  5      r< " S. S/\Rz                  5      r> " S0 S1\5      r? " S2 S3\?5      r@ " S4 S5\%5      rA\ " S6 S7\$\5      5       rB/ S8QrCg)9    N)	dataclass)nn   )initialization)Cache)GenerationMixin)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)merge_with_config_defaults)capture_outputs   )Aimv2AttentionAimv2EncoderLayer)	AutoModel)LlamaMLPLlamaRMSNorm)LlavaForConditionalGeneration
LlavaModel)LlavaNextCausalLMOutputWithPastLlavaNextModelOutputWithPast)SiglipEncoderSiglipVisionEmbeddings   )Ovis2ConfigOvis2VisionConfiglogitsdimc                     U R                  U5      nUR                  USS9S   n[        R                  " U [        R                  S9R                  XS5      nXBR                  5       -
  U-   nU$ )NT)keepdimr   )memory_formatg      ?)softmaxmaxtorch
zeros_likelegacy_contiguous_formatscatter_detach)r!   r"   y_softindexy_hardrets         x/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/ovis2/modular_ovis2.pyhard_softmaxr2   '   sf    ^^C FJJsDJ)!,EfE4R4RS\\]`ilmF
==?
"V
+CJ    c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)*BaseModelOutputWithVisualIndicatorFeatures1   z
visual_indicator_features (`torch.FloatTensor` of shape `(batch_size, visual_indicator_size)`):
    Visual indicator features extracted from the model, which can be used for auxiliary tasks or further processing.
Nvisual_indicator_features )
__name__
__module____qualname____firstlineno____doc__r7   r(   FloatTensor__annotations____static_attributes__r8   r3   r1   r5   r5   1   s    
 ;?u0047>r3   r5   c                       \ rS rSrSrg)Ovis2ModelOutputWithPast<   r8   Nr9   r:   r;   r<   r@   r8   r3   r1   rB   rB   <       r3   rB   c                       \ rS rSrSrg)Ovis2CausalLMOutputWithPast@   r8   NrD   r8   r3   r1   rG   rG   @   rE   r3   rG   c                       \ rS rSrSrg)Ovis2RMSNormD   r8   NrD   r8   r3   r1   rJ   rJ   D   rE   r3   rJ   c                       \ rS rSrSrg)Ovis2VisionMLPH   r8   NrD   r8   r3   r1   rM   rM   H   rE   r3   rM   c                   p   ^  \ rS rSrS\4U 4S jjrS rS\R                  S\R                  4S jr
SrU =r$ )	Ovis2VisionEmbeddingsL   configc                 n   > [         TU ]  U5        [        UR                  UR                  5      U l        g N)super__init__rJ   hidden_sizerms_norm_epsrms_normselfrR   	__class__s     r1   rV   Ovis2VisionEmbeddings.__init__M   s*     $V%7%79L9LMr3   c                     [        S5      e)NzNot needed for Ovis2)NotImplementedError)r[   s    r1   interpolate_pos_encoding.Ovis2VisionEmbeddings.interpolate_pos_encodingQ   s    !"899r3   pixel_valuesreturnc                     U R                   R                  R                  nU R                  UR                  US95      nUR	                  S5      R                  SS5      nU R                  U5      nX@R                  U R                  5      -   nU$ )Ndtyper   r   )	patch_embeddingweightrf   toflatten	transposerY   position_embeddingposition_ids)r[   rb   target_dtypepatch_embeds
embeddingss        r1   forwardOvis2VisionEmbeddings.forwardT   s~    ++2288++LOO,O,OP!))!,66q!<
]]:.
"9"9$:K:K"LL
r3   )rY   )r9   r:   r;   r<   r    rV   r`   r(   r>   Tensorrq   r@   __classcell__r\   s   @r1   rP   rP   L   s9    N0 N:E$5$5 %,,  r3   rP   c                       \ rS rSrSrg)Ovis2VisionAttention_   r8   NrD   r8   r3   r1   rw   rw   _   rE   r3   rw   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )Ovis2VisionEncoderLayerc   rR   c                 B   > [         TU ]  5         [        U5      U l        g rT   )rU   rV   rw   	attentionrZ   s     r1   rV    Ovis2VisionEncoderLayer.__init__d   s    -f5r3   )r}   )r9   r:   r;   r<   r    rV   r@   rt   ru   s   @r1   rz   rz   c   s    60 6 6r3   rz   c            	          ^  \ rS rSrS\4U 4S jjr\\ S
S\R                  S-  S\
\   S\4S jj5       5       rS	rU =r$ )Ovis2VisionEncoderi   rR   c                    > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        U5      PM     sn5      U l        g s  snf rT   )rU   rV   r   
ModuleListrangenum_hidden_layersrz   layers)r[   rR   _r\   s      r1   rV   Ovis2VisionEncoder.__init__j   sF     mmeTZTlTlNm$nNm%<V%DNm$no$ns   ANattention_maskkwargsrc   c                 P    UnU R                    H  nU" XB40 UD6nM     [        US9$ )Nlast_hidden_state)r   r	   )r[   inputs_embedsr   r   hidden_statesencoder_layers         r1   rq   Ovis2VisionEncoder.forwardn   s3     &![[M)-R6RM ) ??r3   )r   rT   )r9   r:   r;   r<   r    rV   r   r   r(   rs   r   r   r	   rq   r@   rt   ru   s   @r1   r   r   i   se    p0 p  /3
@ t+
@ +,	
@
 

@  
@r3   r   c                   h   ^  \ rS rSrS\4U 4S jjr\ SS\R                  S-  4S jj5       r	Sr
U =r$ )	Ovis2VisionTransformer}   rR   c                    > [         TU ]  5         Xl        [        U5      U l        [        U5      U l        [        UR                  UR                  5      U l
        SU l        g )NF)rU   rV   rR   rP   rp   r   encoderrJ   rW   rX   rY   gradient_checkpointingrZ   s     r1   rV   Ovis2VisionTransformer.__init__~   sM    /7)&1$V%7%79L9LM&+#r3   Nr   c                     U R                  U5      nU R                  " SUUS.UD6nUR                  nU R                  U5      n[	        US9$ )N)r   r   r   r8   )rp   r   r   rY   r	   )r[   rb   r   r   r   encoder_outputsr   s          r1   rq   Ovis2VisionTransformer.forward   s_     5+/<< ,
'),
 ,
 ,== MM*;<1BCCr3   )rR   rp   r   r   rY   rT   )r9   r:   r;   r<   r    rV   r   r(   rs   rq   r@   rt   ru   s   @r1   r   r   }   s?    ,0 ,  /3D t+D Dr3   r   c                   \   ^  \ rS rSrS\R
                  S\R
                  4U 4S jjrSrU =r$ )Ovis2VisualEmbeddingTable   visual_tokensrc   c                   > UR                   [        R                  [        R                  [        R                  [        R
                  [        R                  4;   a  [        TU ]!  U5      $ [        R                  " XR                  5      $ rT   )rf   r(   int8int16int32int64longrU   rq   matmulrh   )r[   r   r\   s     r1   rq   !Ovis2VisualEmbeddingTable.forward   sU    5::u{{EKKV[V`V`"aa7?=11||M;;77r3   r8   )	r9   r:   r;   r<   r(   rs   rq   r@   rt   ru   s   @r1   r   r      s#    8U\\ 8ell 8 8r3   r   c                   b   ^  \ rS rSr% \\S'   SrSrSrS/r	Sr
SrSrSrSrSrSrU 4S jrS	rU =r$ )
Ovis2PreTrainedModel   rR   model)imagetextTrw   past_key_valuesc                   > [         TU ]  U5        [        U[        5      (       a\  [        R
                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      5        g g )N)r   r   )rU   _init_weights
isinstancerP   initcopy_rm   r(   arangeshapeexpand)r[   moduler\   s     r1   r   "Ovis2PreTrainedModel._init_weights   s^    f%f344JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 5r3   r8   )r9   r:   r;   r<   r   r?   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_cache_class_supports_flash_attn_supports_flex_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendr   r@   rt   ru   s   @r1   r   r      sY    (&*#/0"3 N!"&i ir3   r   c            	          ^  \ rS rSr% \\S'   \\S.rS\4U 4S jjr	\
\S\R                  S\\   S\\-  4S j5       5       rS	rU =r$ )
Ovis2VisionModel   rR   )r   
attentionsc                   > [         TU ]  U5        Xl        [        U5      U l        UR
                  U l        UR                  U l        [        R                  " UR                  UR                  -  UR                  -  U R                  U R
                  -
  SS9U l        [        R                  " U R                  U R
                  -
  5      U l        U R                  5         g NF)bias)rU   rV   rR   r   transformernum_visual_indicator_tokens
vocab_sizer   LinearrW   hidden_stridehead_linear	LayerNorm	head_norm	post_initrZ   s     r1   rV   Ovis2VisionModel.__init__   s     1&9+1+M+M( ++99!5!558L8LLOOd>>>

 doo8X8X&XYr3   rb   r   rc   c           	         U R                   " U40 UD6nUS   nU R                  R                  S:  a  UR                  u  pVnU R                  R                  n[	        [
        R                  " U5      5      n	X-  U:w  a  [        S5      eXU-  -
  U-  n
[        R                  R                  USSSU
SU
4SS5      nX-  n	UR                  XYU-  XU-  X5      nUR                  SSSSSS5      nUR                  US	X-  U-  5      nU R                  U5      nU R                  U5      nU R                  R                  S
:X  a   [        R                  R!                  US	SS9nO]U R                  R                  S:X  a  [#        US	S9nO8U R                  R                  S:X  a  [        R                  R%                  US	S9n['        UWS9$ )Nr   r   z.Token sequence length must be a perfect squareconstantr   r         r   gumbel_argmaxT)r"   hard	st_argmaxr"   r&   )r   pooler_output)r   rR   r   r   intmathsqrt
ValueErrorr   
functionalpadreshapepermuter   r   tokenize_functiongumbel_softmaxr2   r&   r5   )r[   rb   r   outputsr   
num_imagesseq_len
hidden_dimr   sqrt_lpad_sizer!   
prob_tokens                r1   rq   Ovis2VisionModel.forward   s   
 ""<:6:#AJ;;$$q(.?.E.E+J KK55M7+,F') !QRR%-)?@MQH " 1 12CaAxYZ\dEegqst uF 1 9 9m3]mD[]j! !2 9 9!Q1a K 1 9 9B =
 J! !!"34';;((O;55f"45PJ[[**k9%f"5J[[**i7..v2.>J9/$
 	
r3   )rR   r   r   r   r   r   )r9   r:   r;   r<   r    r?   rz   rw   _can_record_outputsrV   r   r   r(   r>   r   r   tupler5   rq   r@   rt   ru   s   @r1   r   r      sj    0*
0   &
!--&
9?@R9S&
	;	;&
   &
r3   r   c                     ^  \ rS rSrS\4U 4S jjr\\" SS9S\R                  S\
\   S\\-  4S	 j5       5       r\\         SS\R                  S
-  S\R                  S
-  S\R                   S
-  S\R                  S
-  S\S
-  S\R                  S
-  S\R                  S
-  S\S
-  S\\R                   -  S\\-  4S jj5       5       rSrU =r$ )
Ovis2Model   rR   c                 ~  > [         TU ]  U5        [        UR                  5      U l        [        UR                  R                  UR                  5      U l        UR                  R                  U l	        UR                  U l        UR                  U l
        [        R                  " UR                  5      U l        U ?g rT   )rU   rV   r   vision_configvision_towerr   r   rW   visual_embeddings_tablevisual_vocab_sizevisual_indicator_token_idsr   from_configtext_configlanguage_modelmulti_modal_projectorrZ   s     r1   rV   Ovis2Model.__init__   s     ,V-A-AB'@AUAUA`A`bhbtbt'u$!'!5!5!@!@ ++*0*K*K''33F4F4FG&r3   zWObtains image last hidden states from the vision tower and apply multimodal projection.)custom_introrb   r   rc   c                 T   U R                   " U4SS0UD6nUR                  nUR                  u  pVn[        R                  " XVU R                   R
                  4UR                  UR                  SUR                  S9n[        R                  " XH/SS9nU R                  U5      n[        R                  " U R                  U R                   R
                  -
  U R                  [        R                  S9R                  UR                  5      n	XCl        U R                  U	5      Ul        U$ )Nreturn_dictTF)rf   devicerequires_gradlayoutr   r   re   )r   r   r   r(   zerosr   rf   r  r  catr   r   r   r   ri   r7   )
r[   rb   r   image_outputsimage_features
batch_sizeimg_seq_lenr   padding_tensorvisual_indicators
             r1   get_image_featuresOvis2Model.get_image_features  s    )),SDSFS&44%3%9%9"
d&7&7&S&ST &&!((!((
 N#CK55nE <<""T%6%6%R%RR""**
 "^""
#	 	
 '5#262N2NO_2`/r3   N	input_idsr   rm   r   r   labels	use_cachelogits_to_keepc
           
         US L US L-  (       a  [        S5      eUc  U R                  5       " U5      nUGb4  U R                  USS9nUR                  nUR                  nU R                  UUUS9nUR                  X5      n[        U R                  5       H  u  nnUcV  X`R                  5       " [        R                  " U[        R                  UR                  S95      :H  nUR                  S5      nOUU:H  R                  UR                  5      nUR                  5       (       d  M  X   R!                  UU   5      R                  UR                  UR"                  5      UU'   M     U R$                  " S	UUUUUU	S.U
D6n['        UR(                  UR*                  UR,                  UR.                  Ub  WS9$ S S9$ )
Nz:You must specify exactly one of input_ids or inputs_embedsT)rb   r  )r   r
  )rf   r  r   )r   rm   r   r   r  r  )r   r   r   r   image_hidden_statesr8   )r   get_input_embeddingsr  r   r7   get_placeholder_maskmasked_scatter	enumerater   r(   tensorr   r  allri   any	expand_asrf   r   rB   r   r   r   r   )r[   r  rb   r   rm   r   r   r  r  r  r   r	  r
  r7   special_image_maskivisual_indicator_idmaskr   s                      r1   rq   Ovis2Model.forward$  s    -t";<YZZ  557	BM# 33[_3`M*88N(5(O(O%!%!:!:+- "; "
 *889K\M*3D4S4S*T&&$(,E,E,G%8

S`SgSgh- D  88B<D%)<<@@AUAUVD88::14"=#67M00-2E2EF "$' +U  %% 
)%+')
 
 (%77#33!//))2>2J
 	

 QU
 	
r3   )r   r   r   r   r   r   	NNNNNNNNr   )r9   r:   r;   r<   r   rV   r   r   r(   r>   r   r   r   r5   r  
LongTensorrs   r   boolr   rB   rq   r@   rt   ru   s   @r1   r   r      sS   	'{ 	' n'' +, 
;	;	 8  .215.204(,26*.!%-.?
##d*?
 ''$.?
 t+	?

 &&-?
 ?
 ((4/?
   4'?
 $;?
 ell*?
 
)	)?
  ?
r3   r   c                     ^  \ rS rSrS\4U 4S jjr\S\R                  S\	\
   S\\-  4S j5       r\\         SS	\R                  S-  S\R                  S-  S
\R                   S-  S\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\S-  S\\R                   -  S\\-  4S jj5       5       rSrU =r$ )Ovis2ForConditionalGenerationih  rR   c                    > [         TU ]  U5        [        R                  " UR                  UR
                  SS9U l        g r   )rU   rV   r   r   rW   r   lm_headrZ   s     r1   rV   &Ovis2ForConditionalGeneration.__init__j  s0     yy!3!3V5F5FUSr3   rb   r   rc   c                 >    U R                   R                  " SSU0UD6$ )Nrb   r8   )r   r  )r[   rb   r   s      r1   r  0Ovis2ForConditionalGeneration.get_image_featuresn  s!     zz,,Q,Q&QQr3   Nr  r   rm   r   r   r  r  r  c
                    U R                   " SUUUUUUUS.U
D6nUS   n[        U	[        5      (       a  [        U	* S5      OU	nU R	                  USS2USS24   5      nSnUb3  U R
                  " SXU R                  R                  R                  S.U
D6n[        UUUR                  UR                  UR                  UR                  S9$ )a2  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Ovis2ForConditionalGeneration

>>> model = Ovis2ForConditionalGeneration.from_pretrained("thisisiron/Ovis2-2B-hf")
>>> processor = AutoProcessor.from_pretrained("thisisiron/Ovis2-2B-hf")

>>> prompt = "<|im_start|>user\n<image>\nDescribe the image.<|im_end|>\n<|im_start|>assistant\n"
>>> url = "http://images.cocodataset.org/val2014/COCO_val2014_000000537955.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, text=prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs, max_new_tokens=15)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True)[0]
"user\n\nDescribe the image.\nassistant\nThe image features a brown dog standing on a wooden floor, looking up with"
```)r  rb   r   rm   r   r   r  r   N)r!   r  r   )lossr!   r   r   r   r  r8   )r   r   r   slicer*  loss_functionrR   r   r   rG   r   r   r   r  )r[   r  rb   r   rm   r   r   r  r  r  r   r   r   slice_indicesr!   r/  s                   r1   rq   %Ovis2ForConditionalGeneration.forwardt  s    X ** 	
%)%+'	
 	
  
8B>SV8W8W~ot4]kmA}a,?@A%% 9P9P9[9[_eD +#33!//)) ' ; ;
 	
r3   )r*  r$  )r9   r:   r;   r<   r   rV   r   r(   r>   r   r   r   r5   r  r   r%  rs   r   r&  r   rG   rq   r@   rt   ru   s   @r1   r(  r(  h  sR   T{ T R!--R9?@R9SR	;	;R R
  .215.204(,26*.!%-.G
##d*G
 ''$.G
 t+	G

 &&-G
 G
 ((4/G
   4'G
 $;G
 ell*G
 
,	,G
  G
r3   r(  )r   r   r(  )Dr   dataclassesr   r(   r    r   r   cache_utilsr   
generationr   modeling_outputsr	   r
   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   aimv2.modeling_aimv2r   r   autor   llama.modeling_llamar   r   llava.modeling_llavar   r   llava_next.modeling_llava_nextr   r   siglip.modeling_siglipr   r   configuration_ovis2r   r    rs   r   r2   r5   rB   rG   rJ   rM   rP   rw   rz   r   Moduler   	Embeddingr   r   r   r   r(  __all__r8   r3   r1   <module>rH     s^    !   &   ) K - & I I 7 5 D  9 L j J ? C  
?1K ?  ?	; 		"A 		< 		X 	2 &	> 	6/ 6@ @(DRYY D<8 8i? i*>
+ >
Bm
 m
` T
$A? T
 T
n Rr3   