
    Z jX                         S r SSKJr  SSKJrJrJr  SSKJrJ	r	J
r
Jr  SSKJrJr  SSKJrJr  \R$                  " \5      r " S S\	S	S
9r\ " S S\
5      5       rS/rg)z
Processor class for Llava.
   )BatchFeature)
ImageInputget_image_sizeto_numpy_array)MultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)auto_docstringloggingc                   "    \ rS rSrSSSS.0rSrg)LlavaProcessorKwargs!   text_kwargsF)paddingreturn_mm_token_type_ids N)__name__
__module____qualname____firstlineno__	_defaults__static_attributes__r       {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/llava/processing_llava.pyr   r   !   s    5eLIr   r   F)totalc            
          ^  \ rS rSr       SU 4S jjr\  SS\S-  S\\-  \	\   -  \	\   -  S\
\   S\4S jj5       rSS	 jrS
rU =r$ )LlavaProcessor'   Nc                    > X0l         Xpl        X@l        [        US5      (       a  UR                  OUU l        UR                  U R                  SS9S   U l        [        T	U ]!  XUS9  g)aV  
patch_size (`int`, *optional*):
    Patch size from the vision tower.
vision_feature_select_strategy (`str`, *optional*):
    The feature selection strategy used to select the vision feature from the vision backbone.
    Should be same as in model's config
image_token (`str`, *optional*, defaults to `"<image>"`):
    Special token used to denote image location.
num_additional_image_tokens (`int`, *optional*, defaults to 0):
    Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other
    extra tokens appended, no need to set this arg.
image_tokenF)add_special_tokens    )chat_templateN)	
patch_sizenum_additional_image_tokensvision_feature_select_strategyhasattrr#   encodeimage_token_idsuper__init__)
selfimage_processor	tokenizerr'   r)   r&   r#   r(   kwargs	__class__s
            r   r.   LlavaProcessor.__init__)   sj    . %+F(.L+4;I}4U4U900[f'..t/?/?TY.Z[\]=Qr   imagestextr2   returnc                    Uc  Uc  [        S5      eU R                  " [        4SU R                  R                  0UD6nUb  U R
                  " U40 US   D6nO0 n[        U[        5      (       a  U/nO8[        U[        5      (       d#  [        US   [        5      (       d  [        S5      eUnUR                  S5      b  US   n[        [        US   5      5      u  pXR                  -  XR                  -  -  U R                  -   n
U R                  S:X  a  U
S	-  n
/ nU H=  nUR!                  U R"                  U R"                  U
-  5      nUR%                  U5        M?     US
   R'                  SS5      nUS
   R'                  SS5      nU R                  " U40 US
   DSS0D6nU R)                  XnS/S9  U(       a  U R+                  US   5      US'   [-        0 UEUEUS9$ )a  
Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
Nz7You have to specify at least one of `images` or `text`.tokenizer_init_kwargsimages_kwargsr%   zAInvalid input text. Please provide a string, or a list of stringspixel_valuesdefault   r   return_tensorsr   Fimage)
modalities	input_idsmm_token_type_ids)datatensor_type)
ValueError_merge_kwargsr   r1   init_kwargsr0   
isinstancestrlist	TypeErrorgetr   r   r'   r(   r)   replacer#   appendpop_check_special_mm_tokenscreate_mm_token_type_idsr   )r/   r5   r6   r2   output_kwargsimage_inputsprompt_stringsr;   heightwidthnum_image_tokenssampler>   r   text_inputss                  r   __call__LlavaProcessor.__call__G   s   " >dlVWW** 
"&.."<"<
 

 //Y-:XYLLdC  6DD$''
47C0H0H_`` N+7'7L*>,q/+JKMF &// 9( 00 1 22i? A% N(8(8$:J:JM]:]^%%f-  '}599:JDQ#0#?#C#CD^`e#f nn^i}]7Sidhi%%nwi%X#/3/L/L[YdMe/fK+,!@K!@<!@n]]r   c                    0 nUb  [         R                  R                  S0 5      nUR                  U5        UR                  SS5      =(       d    U R                  R
                  nUS   US   pvX`R                  -  XpR                  -  -  nXR                  -  nU R                  S:X  a  US-  nU/[        U5      -  nS/[        U5      -  n	UR                  XS.5        [        S	0 UD6$ )
a{  
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

Args:
    image_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (height, width) per each image.

Returns:
    `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
    input modalities, along with other useful data.
Nr:   	crop_sizerU   rV   r<   r=   )rW   num_image_patchesr   )r   r   rL   updater0   r]   r'   r(   r)   lenr   )
r/   image_sizesr2   vision_datar:   r]   resized_heightresized_widthrW   r^   s
             r   _get_num_multimodal_tokens)LlavaProcessor._get_num_multimodal_tokens   s     "0::>>PRSM  (%))+t<^@T@T@^@^I,5h,?7ASM .// AmWfWfFfg @ @@22i? A%  01C4DD!"c+&6 64Dmn,,,r   )r#   r,   r(   r'   r)   )NNNNNz<image>r%   )NN)N)r   r   r   r   r.   r   r   r   r   rJ   r
   r   r   rZ   re   r   __classcell__)r3   s   @r   r    r    '   s     '+$%R<  %)Z^:^T!:^ ++d9o=EV@WW:^ -.	:^
 
:^ :^x- -r   r    N)__doc__feature_extraction_utilsr   image_utilsr   r   r   processing_utilsr   r   r	   r
   tokenization_utils_baser   r   utilsr   r   
get_loggerr   loggerr   r    __all__r   r   r   <module>rq      sr    5 E E  D , 
		H	%+5  y-^ y- y-x 
r   