
    Z j h                     ,   S SK r S SKJr  S SKJrJrJr  S SKrS SK	r	SSK
Jr  SSKJrJrJr  SSKJrJrJrJr  SSKJrJrJr  SS	KJr  SS
KJr  \(       a  SSKJr   " S S\SS9rS\4S jr S r!S r"S r#S r$\" SS9\ " S S\5      5       5       r%S/r&g)    N)
accumulate)TYPE_CHECKINGOptionalUnion   )BatchFeature)
ImageInputis_valid_image
load_image)MultiModalDataProcessingKwargsProcessorMixinUnpack)
AddedTokenBatchEncoding	TextInput)auto_docstring)requires)PreTokenizedInputc                   0    \ rS rSrSS0SSSS.SS0S	.rS
rg)ColModernVBertProcessorKwargs(   paddinglongestTchannels_first)return_row_col_infodata_formatdo_convert_rgbreturn_tensorspt)text_kwargsimages_kwargscommon_kwargs N)__name__
__module____qualname____firstlineno__	_defaults__static_attributes__r$       ڍ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/colmodernvbert/processing_colmodernvbert.pyr   r   (   s/     y
 $(+"

 +D1
Ir+   r   F)totalreturnc                 R    [        U [        5      =(       a    U R                  S5      $ )Nhttp)
isinstancestr
startswith)vals    r,   is_urlr5   6   s    c3:CNN6$::r+   c                 <    [        U 5      =(       d    [        U 5      $ N)r5   r
   )elems    r,   is_image_or_image_urlr9   :   s    $</>$//r+   c           	          Sn[        U5       H7  n[        U5       H   nUU SUS-    SUS-    S3-   U U -  -   -  nM"     US-  nM9     USU 3U -   U U -  -   U -   -  nU$ )zKPrompt with expanded image tokens for when the image is split into patches. <row_   _col_>
)range)	image_seq_len
image_rows
image_colsfake_token_around_imageimage_tokenglobal_img_tokentext_split_imagesn_hn_ws	            r,   _prompt_split_imagerK   >   s    Z $C*+sQwiuS1WIQ/OOU`TaerRrr % 	T! ! 
$%&	 M]
*	+ %%	' r+   c                 &    U U -   U U -  -   U -   $ )z5Prompt with expanded image tokens for a single image.r$   )rB   rE   rF   rG   s       r,   _prompt_single_imagerM   Q   s6     #
#	 M]
*	+ %%	'r+   c                 L    U S:X  a  US:X  a  [        UUUUS9$ [        X XXE5      $ )Nr   )rE   rF   rG   )rM   rK   )rC   rD   rB   rE   rF   rG   s         r,   get_image_prompt_stringrO   [   s@     Q:?#$;#-	
 	
 : r+   )torch)backendsc                     ^  \ rS rSrSr     SS\S\S-  S\S-  4U 4S jjjrS r\	   S S	\
\\
   -  \\\
      -  S
\\S\\   \S   4   S\S-  S\\   S\4
S jj5       rS\S\\   S\\\      4S jrS!S jr S!S	\
S-  S\\   S\4S jjrS
\\\   -  S\\   S\4S jr   S"S\S\S   4   S\S\S   4   S\S\S   S\S\4   SS4S jjrSrU =r$ )#ColModernVBertProcessorj   a  
Constructs a ColModernVBert processor which wraps a ModernVBertProcessor and special methods to process images and queries, as
well as to compute the late-interaction retrieval score.

[`ColModernVBertProcessor`] offers all the functionalities of [`ModernVBertProcessor`]. See the [`~ModernVBertProcessor.__call__`]
for more information.

Args:
        image_processor ([`Idefics3ImageProcessor`]): An instance of [`Idefics3ImageProcessor`]. The image processor is a required input.
        tokenizer (`PreTrainedTokenizerFast`, *optional*): An instance of [`PreTrainedTokenizerFast`]. This should correspond with the model's text model. The tokenizer is a required input.
        image_seq_len (`int`, *optional*, defaults to 64): The length of the image sequence i.e. the number of <image> tokens per image in the input.
        visual_prompt_prefix (`Optional`, *optional*): A prefix to be prepended to visual prompts.
        query_prefix (`Optional`, *optional*): A prefix to be prepended to query prompts.
NrB   visual_prompt_prefixquery_prefixc                   > Sn[        SSSS9R                  U l        [        SSSS9R                  U l        [        SSSS9R                  U l        SU l        X@l        UR                  U R                  5      U l        UR                  U R                  5      U l	        UR                  U R
                  5      U l
        [        S	5       VV	s/ s H3  n[        S	5        H   oR                  S
US-    SU	S-    S35      PM"     M5     sn	nU l        [        R                  " S5      U l        SU R                  U R                  U R                  /0n
UR!                  U
5        UR                  U R                  5      U l        ["        TU ]H  " X4SU0UD6  U=(       d    SU R                   S3U l        U=(       d    SU l        U R                  U l        gs  sn	nf )aJ  
image_seq_len (`int`, *optional*, defaults to 64):
    The length of the image sequence i.e. the number of <image> tokens per image in the input.
visual_prompt_prefix (`str`, *optional*):
    A string that gets tokenized and prepended to the image tokens.
query_prefix (`str`, *optional*):
    A prefix to be used for the query.
Nz<fake_token_around_image>FT)
normalizedspecialz<image>z<end_of_utterance>z<global-img>   r<   r=   r>   r?   z*(\n?<global-img>\n?|<row_\d+_col_\d+>\n?)+additional_special_tokenschat_templatez<|begin_of_text|>User:z0Describe the image.<end_of_utterance>
Assistant:r;   )r   contentfake_image_tokenrF   end_of_utterance_tokenglobal_image_tagrB   convert_tokens_to_idsimage_token_idfake_image_token_idglobal_image_token_idrA   row_col_idsrecompile%_regex_to_remove_extra_special_tokensadd_special_tokenssuper__init__rU   rV   query_augmentation_token)selfimage_processor	tokenizerr\   rB   rU   rV   kwargsijtokens_to_add	__class__s              r,   rk    ColModernVBertProcessor.__init__|   s   $  *+FSXbf g o o%iE4PXX&01ERWae&f&n&n# .*'==d>N>NO#,#B#B4CXCX#Y %.%D%DTEZEZ%["SXYZS[
S[aejklem`a++eAE7%Awa,HIemIS[
 68ZZ@m5n2 (%%  ++*
 	$$]3'==d>N>NO[=[TZ[$8 %
$T%5%5$66gh 	! ).B(,(C(C%1
s   :Gc                     / nU Hn  n/ nU HR  n[        U5      (       a  UR                  U5        M&  [        U5      (       d  M8  UR                  [        U5      5        MT     UR                  U5        Mp     U$ r7   )r
   appendr5   r   )rm   promptsprompt_imagespromptimagesr8   s         r,   _extract_images_from_prompts4ColModernVBertProcessor._extract_images_from_prompts   si    FF!$''MM$'D\\MM*T"23	 
   (  r+   r{   textr   rp   r.   c                 v
   Uc  Uc  [        S5      eU R                  " [        4SU R                  R                  0UD6nUb  UOU R
                  nUS   R                  SS5      nUS   R                  SS5      n/ n/ n	0 n
Ub{  [        U[        5      (       a  U/nO8[        U[        5      (       d#  [        US   [        5      (       d  [        S	5      eU Vs/ s H  oR                  U R                  5      PM     nnUGb  [        U5      (       a  U//nGO,[        U[        [        45      (       a  [        US   5      (       a  Ub  [        U5      [        U5      :w  a>  [        S
U R                   S[        U5       SU R                   S[        U5       S3	5      eS/[        [!        U5      5      -   n[#        [        U5      5       Vs/ s H  nXU   XS-       PM     nnO^U/nOZ[        U[        [        45      (       d?  [        US   [        [        45      (       d!  [        US   S   5      (       d  [        S5      eU Vs/ s H  n[        U5      PM     n	nU VVs/ s H0  o Vs/ s H   n[%        U5      (       a  ['        U5      OUPM"     snPM2     nnnU R(                  " U40 US   D6nU
R+                  U5        UGb  X:w  a  [        SU SU	 S35      eU
R                  SU Vs/ s H	  nS/U-  PM     sn5      nU
R                  SU Vs/ s H	  nS/U-  PM     sn5      nU R,                  nU R                  nU R.                  n/ n/ n[1        UUU5       H  u  nnn/ n/ n[1        UU5       H^  u  nn[3        UUUUUUS9nU R
                  S-   U-  S-   nUR5                  U R
                  S-   UU-  -   5        UR5                  U5        M`     UR5                  U5        UR7                  U5      n [        U 5      S:X  a  [        S5      eU S   n[9        U5       H  u  nnUUU US-      -   -  nM     UR5                  U5        M     U R                  " U40 US   D6n!U R;                  UU!S/S9  U
R+                  U!5        O`Ub]  [=        U5      (       a%  [        S[        U5       SU R                   S35      eU R                  " S#SU0US   D6n!U
R+                  U!5        U(       a  U R?                  U
S    W5      U
S!'   [A        XS"9$ s  snf s  snf s  snf s  snf s  snnf s  snf s  snf )$z
image_seq_len (`int`, *optional*):
    The length of the image sequence. If not provided, the default value of self.image_seq_len is used.
    image_seq_len should be equal to int(((image_size // patch_size) ** 2) / (scale_factor**2))
Nz+You must provide either `text` or `images`.tokenizer_init_kwargsr!   return_mm_token_type_idsFr   r   zAInvalid input text. Please provide a string, or a list of stringszThe total number of zP tokens in the prompts should be the same as the number of images passed. Found  z tokens and z images.r=   zdInvalid input images. Please provide a single image or a list of images or a list of list of images.r"   z!The number of images in the text z and images z should be the same.rowscols)rF   rE   rG      r   z.The image token should be present in the text.image)
modalitieszFound z. tokens in the text but no images were passed.r~   	input_idsmm_token_type_ids)datatensor_typer$   )!
ValueError_merge_kwargsr   ro   init_kwargsrB   popr1   r2   listcountrF   r9   tuplesumlenr   rA   r5   r   rn   updater^   r`   ziprO   rw   split	enumerate_check_special_mm_tokensanycreate_mm_token_type_idsr   )"rm   r{   r~   rB   rp   output_kwargsr   r   n_images_in_textn_images_in_imagesinputssamplecumsum_images_in_textrq   imimage_inputsn_imagesrC   rD   r^   rF   rG   prompt_stringsbatch_image_seq_lengthssample_rowssample_colsimage_prompt_stringsimage_seq_lengthsn_rowsn_colsimage_prompt_string
row_lengthsplit_sampletext_inputss"                                     r,   __call__ ColModernVBertProcessor.__call__   s    <FNJKK**)
"&.."<"<
 
 *7)BHZHZ#0#?#C#CD^`e#f &}599:JDQ$$$vd++JtAw4L4L !deeMQRT6T-=-= >TR$V,,!(FT5M227LVTUY7W7W#+,F;(243C3C2D E&&)*:&;%<Ad>N>N=O|\_`f\g[hhpr 
 ./C$zBR7S2T,T) "'s+;'<!=!=A Q7:OTUPU:VW!=  F
 %XFve}55"6!9tUm<<-fQil;; z  =C!CF&#f+F!C ]cc\bRXfMfz"~;fM\bFc//Y-:XYLMM,'%9$;<L;M\ZlYm  nB  C  $ZZP`0aP`H!xP`0ab
#ZZP`0aP`H!xP`0ab
#'#8#8 "..#'#8#8 !#*,'8;D*j8Y4FK+-((*%*-k;*G.E"")(34D-=/+ '+&8&81&<%F%J
)00$2D2Dq2HJY_L_1_`,334GH +H ,223DE#)<<#<L<(A-()YZZ *!_F2;<P2Q.."5QU8K"KK 3R"))&17 9Z: #nn^\}]?[\--nkW^V_-`k*#$$ S!1231T5E5E4FFtu  ..SdSmM6RSKMM+&#*.*G*G{H[]t*uF&'DD}  S "D Nc 1b0as6   $TTT!$	T+-'T&T+0T1
T6
&T+r   r   c                    / n[        U5       H  u  pE[        R                  " X   5      n[        R                  " U5      n[        R                  " X`R
                  :H  5      S   nSn	U H6  n
U	[        U5      :  a    O&X   nX-   nSX{U& [        R                  " X5      n	M8     UR                  UR                  5       5        M     U$ )Nr   r=   )
r   nparray
zeros_likewhererc   r   searchsortedrw   tolist)rm   r   r   r   rq   seq_lengths	array_idsmm_token_typesimage_start_positionsrr   seq_lenstartends                r,   r   0ColModernVBertProcessor.create_mm_token_type_ids@  s     '(?@NA.I]]95N$&HHY:R:R-R$STU$V!A&122-0o,-S)OO$9? ' $$^%:%:%<= A ! r+   c                    0 nUb  [         R                  R                  S0 5      nUR                  U5        U Vs/ s H!  nU R                  R
                  " / UQUP76 PM#     nnU R                  S-   nU R                  S-   n/ n	/ n
U H4  u  pnX-  S-   nU	R                  X~U-  -   5        U
R                  U5        M6     UR                  XS.5        [        S0 UD6$ s  snf )a{  
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

Args:
    image_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (height, width) per each image.

Returns:
    `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
    input modalities, along with other useful data.
r"   r   r   r=   )num_image_tokensnum_image_patchesr$   )	r   r)   getr   rn   get_number_of_image_patchesrB   rw   r   )rm   image_sizesrp   vision_datar"   
image_sizenum_image_row_colsbase_image_length
col_lengthr   r   num_patchesnum_rowsnum_colsr   s                  r,   _get_num_multimodal_tokens2ColModernVBertProcessor._get_num_multimodal_tokensU  s	    "9CCGGY[\M  ( #.""-J $$@@\*\m\"-  "
 !% 2 2Q 6++a/J! "3E/x'2Q6
 ''(9(=R(ST!((5 4F
 4Dmn,,,#"s   (Cc                    U R                   " [        4SU R                  R                  0UD6nUS   R	                  SS5      nUSLn[        U5      (       a  U/nOw[        U[        5      (       a  [        US   5      (       a  ON[        U[        5      (       a.  [        US   [        5      (       a  [        US   S   5      (       d  [        S5      eU Vs/ s H  ofR                  S5      PM     nnU R                  U R                  /[        U5      -  UUS   US   S	9nU(       a.  US
   R                  US   S:H  S5      nUR                  SU05        U$ s  snf )au  
Prepare for the model one or several image(s). Handles input validation, RGB conversion,
and prepends the `visual_prompt_prefix` to each image. Optionally computes labels from
`token_type_ids` when a `suffix` is provided in `text_kwargs`.

Args:
    images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
        The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
        tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
        number of channels, H and W are image height and width.
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:

        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return NumPy `np.ndarray` objects.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
r   r!   suffixNr   zAimages must be an image, list of images or list of list of imagesRGBr"   )r~   r{   r"   r!   r   token_type_idsilabels)r   r   ro   r   r   r
   r1   r   r   convertr   rU   r   masked_fillr   )	rm   r{   rp   r   r   return_token_type_idsr   	batch_docr   s	            r,   process_images&ColModernVBertProcessor.process_imagesz  sj   < **)
"&.."<"<
 
 }-11(DA &d 2 &!!XF%%.*C*CVT**z&)T/J/J~^def^ghi^jOkOk`aa 5;;F5--&F; MM++,s6{:'8%m4	 " 
	 !{+77	BR8SWX8XZ^_Fh/0 <s   Ec                    U R                   " [        4SU R                  R                  0UD6nUS   R	                  SS5      n[        U[        5      (       a  U/nO8[        U[        5      (       a  [        US   [        5      (       d  [        S5      eUc  U R                  S-  nU Vs/ s H  oPR                  U-   U-   PM     nnU R                  USUS   S	9nU$ s  snf )
a  
Prepare for the model one or several text queries. Handles input validation, prepends the
`query_prefix`, and appends query augmentation tokens (used to pad query embeddings for
better late-interaction retrieval performance).

Args:
    text (`str`, `list[str]`, `list[list[str]]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
        (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
        `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:

        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return NumPy `np.ndarray` objects.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
r   r!   r   Nr   z*Text must be a string or a list of strings
   F)r~   r   r!   )r   r   ro   r   r   r1   r2   r   r   rl   rV   r   )rm   r~   rp   r   r   querytexts_querybatch_querys           r,   process_queries'ColModernVBertProcessor.process_queries  s    : **)
"&.."<"<
 
 }-11(DAdC  6DT4((ZQ-E-EIJJ >22R7F SW!WRV"3"3e";f"DRV!Wmm"'%m4 $ 
  "Xs   *Cquery_embeddingsztorch.Tensorpassage_embeddings
batch_sizeoutput_dtypeztorch.dtypeoutput_deviceztorch.devicec           	         [        U5      S:X  a  [        S5      e[        U5      S:X  a  [        S5      eUS   R                  US   R                  :w  a  [        S5      eUS   R                  US   R                  :w  a  [        S5      eUc  US   R                  n/ n[	        S[        U5      U5       GH  n/ n[
        R                  R                  R                  R                  XXs-    SSS9n	[	        S[        U5      U5       H}  n
[
        R                  R                  R                  R                  X*X-    SSS9nUR                  [
        R                  " SX5      R                  S	S
9S   R                  SS
95        M     UR                  [
        R                  " USS
9R                  U5      R                  U5      5        GM     [
        R                  " USS
9$ )a  
Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector
query embeddings (`qs`) and passage embeddings (`ps`). For ColQwen2, a passage is the
image of a document page.

Because the embedding tensors are multi-vector and can thus have different shapes, they
should be fed as:
(1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim)
(2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually
    obtained by padding the list of tensors.

Args:
    query_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Query embeddings.
    passage_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Passage embeddings.
    batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores.
    output_dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): The dtype of the output tensor.
        If `None`, the dtype of the input embeddings is used.
    output_device (`torch.device` or `str`, *optional*, defaults to "cpu"): The device of the output tensor.

Returns:
    `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score
    tensor is saved on the "cpu" device.
r   zNo queries providedzNo passages providedz/Queries and passages must be on the same devicez-Queries and passages must have the same dtypeT)batch_firstpadding_valuezbnd,csd->bcnsr   )dimr   r=   )r   r   devicedtyperA   rP   nnutilsrnnpad_sequencerw   einsummaxr   catto)rm   r   r   r   r   r   scoresrq   batch_scoresbatch_queriesrr   batch_passagess               r,   score_retrieval'ColModernVBertProcessor.score_retrieval  s   @  A%233!"a'344A%%);A)>)E)EENOOA$$(:1(=(C(CCLMM+A.44L%'q#./<A/1L!HHNN..;; Q^4$VW < M 1c"45zB!&!3!3!@!@&1>:\] "A " ##LL-PTTYZT[\]^bbghbi	 C MM%))La8;;LILL][\ = yyQ''r+   )rh   r_   r^   rc   r`   rd   rB   rF   rb   rl   rV   re   rU   )NN@   NN)NNNr7   )   Ncpu)r%   r&   r'   r(   __doc__intr2   rk   r|   r   r	   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r*   __classcell__)rt   s   @r,   rS   rS   j   s   $ +/#'3D
 3D "Dj3D Dj3D 3Dj
  JNbf$(	@ET*--T*5E0FF@E I2DOTJ]E^^_@E Tz	@E
 67@E 
@E @ED!$ !QUVYQZ !_cdhildm_n !*#-N %)@T!@ 67@ 
	@D7$y/)7 677 
	7z 0449>(^0D DE>( ".$~2F"FG>( 	>(
 }->( ^S01>( 
>( >(r+   rS   )'rf   	itertoolsr   typingr   r   r   numpyr   rP   feature_extraction_utilsr   image_utilsr	   r
   r   processing_utilsr   r   r   r   tokenization_utils_baser   r   r   r   r   utils.import_utilsr   r   r   boolr5   r9   rK   rM   rO   rS   __all__r$   r+   r,   <module>r     s   * 
   1 1   4 A A X X K K # * <$4E ;4 ;0& 
:G(n G(  G(T %
%r+   