
    Z jA                         S SK JrJr  SSKJr  SSKJrJr  SSKJ	r	J
r
JrJr  SSKJrJr  SSKJrJr  \" 5       (       a  S SKr " S	 S
\
SS9r\ " S S\5      5       rS/rg)    )OptionalUnion   )BatchFeature)
ImageInputis_valid_image)MultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)auto_docstringis_torch_availableNc                   .    \ rS rSrSS0SSS.SS0S	.rS
rg)ColQwen2ProcessorKwargs"   paddinglongestchannels_firstT)data_formatdo_convert_rgbreturn_tensorspt)text_kwargsimages_kwargscommon_kwargs N)__name__
__module____qualname____firstlineno__	_defaults__static_attributes__r       ځ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/colqwen2/processing_colqwen2.pyr   r   "   s,     y
 ,"
 +D1	Ir%   r   F)totalc                     ^  \ rS rSr     SS\S-  S\S-  4U 4S jjjr\  SS\S-  S\\	-  \
\   -  \
\	   -  S\\   S	\4S
 jj5       rSS jr\S 5       r\S	\4S j5       r SS\S-  S\\   S	\4S jjrS\\
\   -  S\\   S	\4S jr   SS\S\
S   4   S\S\
S   4   S\S\S   S\S\4   S	S4S jjrSrU =r$ )ColQwen2Processor/   Nvisual_prompt_prefixquery_prefixc                    > [         TU ]  XUS9  [        US5      (       d  SOUR                  U l        [        US5      (       d  SOUR                  U l        U=(       d    SU l        U=(       d    SU l        g)	aJ  
visual_prompt_prefix (`str`, *optional*, defaults to `"<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|endoftext|>"`):
    A string that gets tokenized and prepended to the image tokens.
query_prefix (`str`, *optional*, defaults to `"Query: "`):
    A prefix to be used for the query.
)chat_templateimage_tokenz<|image_pad|>video_tokenz<|video_pad|>zf<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|endoftext|>zQuery: N)super__init__hasattrr/   r0   r+   r,   )selfimage_processor	tokenizerr.   r+   r,   kwargs	__class__s          r&   r2   ColQwen2Processor.__init__1   ss     	=Q29)]2S2S?YbYnYn29)]2S2S?YbYnYn$8 %
u 	! )5Ir%   imagestextr7   returnc                    U R                   " [        4SU R                  R                  0UD6nUS   R	                  SS5      nUSLnUc  Uc  [        S5      eUb  Ub  [        S5      eUGbV  [        U5      (       a  U/nOw[        U[        5      (       a  [        US   5      (       a  ON[        U[        5      (       a.  [        US   [        5      (       a  [        US   S   5      (       d  [        S5      eU R                  /[        U5      -  nU R                  " SS	U0US
   D6nUS   n	U	b  U R                  R                  S-  n
Sn[        [        U5      5       H  nU R                  X|   ;   aP  X|   R                  U R                  SX   R!                  5       U
-  -  S5      X|'   US-  nU R                  X|   ;   a  MP  X|   R                  SU R                  5      X|'   M     U R                  " U4SS0US   D6n[#        0 UEUES9nUS   SS2S4   US   SS2S4   -  n[        [$        R&                  " US   UR)                  5       5      5      n[$        R*                  R,                  R.                  R1                  USS9US'   U(       a.  US   R3                  US   S:H  S5      nUR5                  SU05        U$ Ub  [        U[6        5      (       a  U/nO8[        U[        5      (       a  [        US   [6        5      (       d  [        S5      eUc  U R8                  S-  n/ nU H&  nU R:                  U-   U-   nUR=                  U5        M(     U R                  " U4SS0US   D6nU$ g)a  
Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
tokenizer_init_kwargsr   suffixNz&Either text or images must be providedz5Only one of text or images can be processed at a timer   zAimages must be an image, list of images or list of list of imagesr:   r   image_grid_thw   z<|placeholder|>   return_token_type_idsF)datapixel_valuesT)batch_first	input_idstoken_type_idsilabelsz*Text must be a string or a list of strings
   r   )_merge_kwargsr   r6   init_kwargspop
ValueErrorr   
isinstancelistr+   lenr5   
merge_sizeranger/   replaceprodr   torchsplittolistnnutilsrnnpad_sequencemasked_fillupdatestrquery_augmentation_tokenr,   append)r4   r:   r;   r7   output_kwargsr?   rC   	texts_docimage_inputsr@   merge_lengthindexitext_inputsreturn_dataoffsetsrE   rI   texts_queryqueryaugmented_querybatch_querys                         r&   __call__ColQwen2Processor.__call__I   s   " **#
"&.."<"<
 

 }-11(DA &d 2<FNEFF 2TUUf%% FD))nVAY.G.G ..:fQi3N3NSabhijbklmbnSoSo !dee223c&kAI//`v`A_`L)*:;N)#33>>As9~.A**il:'0|';'; ,,.?>CXC]C]C_coCo.prs(	 
	 **il:
 $-<#7#78I4K[K[#\IL / ..&+  .K ',K{,Kl,KLK ""23AqD9KHX<YZ[]^Z^<__G  K79IJL
 +0((..*<*<*I*I$ +J +K' %$[1==kJZ>[_`>`bfg""Hf#56$$$v t,,DGS1I1I !MNN~66;%'K"&"3"3e";f"D""?3  ..&+  .K + r%   c                    0 nUb  [         R                  R                  S0 5      nUR                  U5        UR                  SS5      =(       d    U R                  R
                  nU Vs/ s H!  nU R                  R                  " / UQUP76 PM#     nnU Vs/ s H
  oUS-  -  PM     n	nUR                  XS.5        [        S0 UD6$ s  snf s  snf )ay  
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
Args:
    image_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (height, width) per each image.
Returns:
    `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
    input modalities, along with other useful data.
Nr   rR   rA   )num_image_tokensnum_image_patchesr   )r   r#   getr^   r5   rR   get_number_of_image_patchesr	   )
r4   image_sizesr7   vision_datar   rR   
image_sizers   num_patchesrr   s
             r&   _get_num_multimodal_tokens,ColQwen2Processor._get_num_multimodal_tokens   s     "3==AA/SUVM  (&**<>a$BVBVBaBaJ #.!"-J $$@@\*\m\"-  ! SddRc;
A!=Rcd4Dmn,,,!  es   *(C	Cc                     U R                   R                  nU R                  R                  nU Vs/ s H  o3S;  d  M
  UPM     nnX-   $ s  snf )N)pixel_values_videosvideo_grid_thw)r6   model_input_namesr5   )r4   tokenizer_input_namesimage_processor_input_namesnames       r&   r   #ColQwen2Processor.model_input_names   sY     $ @ @&*&:&:&L&L#
 9'
8THq<qD8 	$ '
 %BB'
s
   	A
A
c                 .    U R                   R                  $ )zr
Return the query augmentation token.

Query augmentation buffers are used as reasoning buffers during inference.
)r6   	pad_token)r4   s    r&   r`   *ColQwen2Processor.query_augmentation_token   s     ~~'''r%   c                 *    U R                   " SSU0UD6$ )ax  
Prepare for the model one or several image(s). This method is a wrapper around the `__call__` method of the ColQwen2Processor's
[`ColQwen2Processor.__call__`].

This method forwards the `images` and `kwargs` arguments to the image processor.

Args:
    images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
        The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
        tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
        number of channels, H and W are image height and width.
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:

        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return NumPy `np.ndarray` objects.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
r:   r   ro   )r4   r:   r7   s      r&   process_images ColQwen2Processor.process_images   s    > }}5F5f55r%   c                 *    U R                   " SSU0UD6$ )a  
Prepare for the model one or several texts. This method is a wrapper around the `__call__` method of the ColQwen2Processor's
[`ColQwen2Processor.__call__`].

This method forwards the `text` and `kwargs` arguments to the tokenizer.

Args:
    text (`str`, `list[str]`, `list[list[str]]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
        (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
        `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:

        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return NumPy `np.ndarray` objects.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
r;   r   r   )r4   r;   r7   s      r&   process_queries!ColQwen2Processor.process_queries  s    < }}1$1&11r%   query_embeddingsztorch.Tensorpassage_embeddings
batch_sizeoutput_dtypeztorch.dtypeoutput_deviceztorch.devicec           	         [        U5      S:X  a  [        S5      e[        U5      S:X  a  [        S5      eUS   R                  US   R                  :w  a  [        S5      eUS   R                  US   R                  :w  a  [        S5      eUc  US   R                  n/ n[	        S[        U5      U5       GH  n/ n[
        R                  R                  R                  R                  XXs-    SSS9n	[	        S[        U5      U5       H}  n
[
        R                  R                  R                  R                  X*X-    SSS9nUR                  [
        R                  " SX5      R                  S	S
9S   R                  SS
95        M     UR                  [
        R                  " USS
9R                  U5      R                  U5      5        GM     [
        R                  " USS
9$ )a  
Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector
query embeddings (`qs`) and passage embeddings (`ps`). For ColQwen2, a passage is the
image of a document page.

Because the embedding tensors are multi-vector and can thus have different shapes, they
should be fed as:
(1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim)
(2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually
    obtained by padding the list of tensors.

Args:
    query_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Query embeddings.
    passage_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Passage embeddings.
    batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores.
    output_dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): The dtype of the output tensor.
        If `None`, the dtype of the input embeddings is used.
    output_device (`torch.device` or `str`, *optional*, defaults to "cpu"): The device of the output tensor.

Returns:
    `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score
    tensor is saved on the "cpu" device.
r   zNo queries providedzNo passages providedz/Queries and passages must be on the same devicez-Queries and passages must have the same dtypeT)rF   padding_valuezbnd,csd->bcnsr   )dimrA   rB   )rQ   rN   devicedtyperS   rV   rY   rZ   r[   r\   ra   einsummaxsumcatto)r4   r   r   r   r   r   scoresrg   batch_scoresbatch_queriesjbatch_passagess               r&   score_retrieval!ColQwen2Processor.score_retrieval"  s   @  A%233!"a'344A%%);A)>)E)EENOOA$$(:1(=(C(CCLMM+A.44L%'q#./<A/1L!HHNN..;; Q^4$VW < M 1c"45zB!&!3!3!@!@&1>:\] "A " ##LL-PTTYZT[\]^bbghbi	 C MM%))La8;;LILL][\ = yyQ''r%   )r/   r,   r0   r+   )NNNNN)NN)N)   Ncpu)r   r    r!   r"   r_   r2   r   r   r   r   rP   r   r   r   ro   rz   propertyr   r`   r   r   r   intr   r   r$   __classcell__)r8   s   @r&   r)   r)   /   s    +/#'6
 "Dj6 Dj6 60  %)Z^fT!f ++d9o=EV@WWf 01	f
 
f fP-4 	C 	C (# ( ( %)6T!6 016 
	6B2$y/)2 012 
	2H 0449>(^0D DE>( ".$~2F"FG>( 	>(
 }->( ^S01>( 
>( >(r%   r)   )typingr   r   feature_extraction_utilsr   image_utilsr   r   processing_utilsr	   r
   r   r   tokenization_utils_baser   r   rZ   r   r   rV   r   r)   __all__r   r%   r&   <module>r      sb   * # 4 5 X X C 7 
.e 
 p( p( p(f	 
r%   