
    Z j;                     .   S r SSKrSSKJr  SSKJrJr  SSKrSSK	J
r
  SSKJrJrJr  SSKJrJrJrJr  SS	KJrJrJr  SS
KJrJr  \(       a  SSKJr  \R8                  " \5      rS\4S jr S r!S r"S r#S r$ " S S\SS9r%\ " S S\5      5       r&S/r'g)z
Processor class for Idefics3.
    N)
accumulate)TYPE_CHECKINGUnion   )BatchFeature)
ImageInputis_valid_image
load_image)MultiModalDataProcessingKwargsProcessorMixinUnpack)
AddedTokenBatchEncoding	TextInput)auto_docstringlogging)PreTokenizedInputreturnc                 R    [        U [        5      =(       a    U R                  S5      $ )Nhttp)
isinstancestr
startswith)vals    ځ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/idefics3/processing_idefics3.pyis_urlr   %   s    c3:CNN6$::    c                 <    [        U 5      =(       d    [        U 5      $ N)r   r	   )elems    r   is_image_or_image_urlr"   )   s    $</>$//r   c           	          Sn[        U5       H7  n[        U5       H   nUU SUS-    SUS-    S3-   U U -  -   -  nM"     US-  nM9     USU 3U -   U U -  -   U -   -  nU$ )zKPrompt with expanded image tokens for when the image is split into patches. <row_   _col_>
)range)	image_seq_len
image_rows
image_colsfake_token_around_imageimage_tokenglobal_img_tokentext_split_imagesn_hn_ws	            r   _prompt_split_imager4   -   s    Z $C*+sQwiuS1WIQ/OOU`TaerRrr % 	T! ! 
$%&	 M]
*	+ %%	' r   c                 &    U U -   U U -  -   U -   $ )z5Prompt with expanded image tokens for a single image. )r+   r.   r/   r0   s       r   _prompt_single_imager7   @   s6     #
#	 M]
*	+ %%	'r   c                 L    U S:X  a  US:X  a  [        UUUUS9$ [        X XXE5      $ )Nr   )r.   r/   r0   )r7   r4   )r,   r-   r+   r.   r/   r0   s         r   get_image_prompt_stringr9   J   s@     Q:?#$;#-	
 	
 : r   c                   ,    \ rS rSrSSSSS.SS0S.rSrg)	Idefics3ProcessorKwargsY   TF)add_special_tokenspaddingis_split_into_wordsreturn_mm_token_type_idsreturn_row_col_info)text_kwargsimages_kwargsr6   N)__name__
__module____qualname____firstlineno__	_defaults__static_attributes__r6   r   r   r;   r;   Y   s(     #'#((-	
 "4

Ir   r;   F)totalc                      ^  \ rS rSr SS\S\S-  4U 4S jjjrS r\   SS\	\
\	   -  \
\
\	      -  S\\S	\
\   \
S	   4   S\S-  S
\\   S\4
S jj5       rS\
S\
\   S\
\
\      4S jrSS jrSrU =r$ )Idefics3Processorg   Nr+   chat_templatec                 2  > [        SSSS9R                  U l        [        SSSS9R                  U l        [        SSSS9R                  U l        SU l        X0l        UR                  U R                  5      U l        UR                  U R                  5      U l	        UR                  U R
                  5      U l
        [        S5       VVs/ s H3  n[        S5        H   orR                  S	US
-    SUS
-    S35      PM"     M5     snnU l        [        R                  " S5      U l        SU R                  U R                  U R                  /0nUR!                  U5        UR                  U R                  5      U l        ["        T	U ]H  " X4SU0UD6  gs  snnf )aw  
image_seq_len (`int`, *optional*, defaults to 169):
    The length of the image sequence i.e. the number of <image> tokens per image in the input.
    This parameter is used to build the string from the input prompt and image tokens and should match the
    value the model used. It is computed as: image_seq_len = int(((image_size // patch_size) ** 2) / (scale_factor**2))
z<fake_token_around_image>FT)
normalizedspecialz<image>z<end_of_utterance>z<global-img>   r%   r&   r'   r(   z*(\n?<global-img>\n?|<row_\d+_col_\d+>\n?)+additional_special_tokensrN   N)r   contentfake_image_tokenr/   end_of_utterance_tokenglobal_image_tagr+   convert_tokens_to_idsimage_token_idfake_image_token_idglobal_image_token_idr*   row_col_idsrecompile%_regex_to_remove_extra_special_tokensr=   super__init__)
selfimage_processor	tokenizerr+   rN   kwargsijtokens_to_add	__class__s
            r   ra   Idefics3Processor.__init__i   s    !++FSXbf g o o%iE4PXX&01ERWae&f&n&n# .*'==d>N>NO#,#B#B4CXCX#Y %.%D%DTEZEZ%["SXYZS[
S[aejklem`a++eAE7%Awa,HIemIS[
 68ZZ@m5n2 (%%  ++*
 	$$]3'==d>N>NO[=[TZ[%
s   :Fc                     / nU Hn  n/ nU HR  n[        U5      (       a  UR                  U5        M&  [        U5      (       d  M8  UR                  [        U5      5        MT     UR                  U5        Mp     U$ r    )r	   appendr   r
   )rb   promptsprompt_imagespromptimagesr!   s         r   _extract_images_from_prompts.Idefics3Processor._extract_images_from_prompts   si    FF!$''MM$'D\\MM*T"23	 
   (  r   rp   textr   re   r   c                 v
   Uc  Uc  [        S5      eU R                  " [        4SU R                  R                  0UD6nUb  UOU R
                  nUS   R                  SS5      nUS   R                  SS5      n/ n/ n	0 n
Ub{  [        U[        5      (       a  U/nO8[        U[        5      (       d#  [        US   [        5      (       d  [        S	5      eU Vs/ s H  oR                  U R                  5      PM     nnUGb  [        U5      (       a  U//nGO,[        U[        [        45      (       a  [        US   5      (       a  Ub  [        U5      [        U5      :w  a>  [        S
U R                   S[        U5       SU R                   S[        U5       S3	5      eS/[        [!        U5      5      -   n[#        [        U5      5       Vs/ s H  nXU   XS-       PM     nnO^U/nOZ[        U[        [        45      (       d?  [        US   [        [        45      (       d!  [        US   S   5      (       d  [        S5      eU Vs/ s H  n[        U5      PM     n	nU VVs/ s H0  o Vs/ s H   n[%        U5      (       a  ['        U5      OUPM"     snPM2     nnnU R(                  " U40 US   D6nU
R+                  U5        UGb  X:w  a  [        SU SU	 S35      eU
R                  SU Vs/ s H	  nS/U-  PM     sn5      nU
R                  SU Vs/ s H	  nS/U-  PM     sn5      nU R,                  nU R                  nU R.                  n/ n/ n[1        UUU5       H  u  nnn/ n/ n[1        UU5       H^  u  nn[3        UUUUUUS9nU R
                  S-   U-  S-   nUR5                  U R
                  S-   UU-  -   5        UR5                  U5        M`     UR5                  U5        UR7                  U5      n [        U 5      S:X  a  [        S5      eU S   n[9        U5       H  u  nnUUU US-      -   -  nM     UR5                  U5        M     U R                  " U40 US   D6n!U R;                  UU!S/S9  U
R+                  U!5        O`Ub]  [=        U5      (       a%  [        S[        U5       SU R                   S35      eU R                  " S#SU0US   D6n!U
R+                  U!5        U(       a  U R?                  U
S    W5      U
S!'   [A        XS"9$ s  snf s  snf s  snf s  snf s  snnf s  snf s  snf )$z
image_seq_len (`int`, *optional*):
    The length of the image sequence. If not provided, the default value of self.image_seq_len is used.
    image_seq_len should be equal to int(((image_size // patch_size) ** 2) / (scale_factor**2))
Nz+You must provide either `text` or `images`.tokenizer_init_kwargsrB   r@   Freturn_tensorsr   zAInvalid input text. Please provide a string, or a list of stringszThe total number of zP tokens in the prompts should be the same as the number of images passed. Found  z tokens and z images.r&   zdInvalid input images. Please provide a single image or a list of images or a list of list of images.rC   z!The number of images in the text z and images z should be the same.rowscols)r/   r.   r0      r   z.The image token should be present in the text.image)
modalitieszFound z. tokens in the text but no images were passed.rs   	input_idsmm_token_type_ids)datatensor_typer6   )!
ValueError_merge_kwargsr;   rd   init_kwargsr+   popr   r   listcountr/   r"   tuplesumlenr   r*   r   r
   rc   updaterU   rW   zipr9   rl   split	enumerate_check_special_mm_tokensanycreate_mm_token_type_idsr   )"rb   rp   rs   r+   re   output_kwargsr@   rv   n_images_in_textn_images_in_imagesinputssamplecumsum_images_in_textrf   imimage_inputsn_imagesr,   r-   rU   r/   r0   prompt_stringsbatch_image_seq_lengthssample_rowssample_colsimage_prompt_stringsimage_seq_lengthsn_rowsn_colsimage_prompt_string
row_lengthsplit_sampletext_inputss"                                     r   __call__Idefics3Processor.__call__   s    <FNJKK**#
"&.."<"<
 
 *7)BHZHZ#0#?#C#CD^`e#f &}599:JDQ$$$vd++JtAw4L4L !deeMQRT6T-=-= >TR$V,,!(FT5M227LVTUY7W7W#+,F;(243C3C2D E&&)*:&;%<Ad>N>N=O|\_`f\g[hhpr 
 ./C$zBR7S2T,T) "'s+;'<!=!=A Q7:OTUPU:VW!=  F
 %XFve}55"6!9tUm<<-fQil;; z  =C!CF&#f+F!C ]cc\bRXfMfz"~;fM\bFc//Y-:XYLMM,'%9$;<L;M\ZlYm  nB  C  $ZZP`0aP`H!xP`0ab
#ZZP`0aP`H!xP`0ab
#'#8#8 "..#'#8#8 !#*,'8;D*j8Y4FK+-((*%*-k;*G.E"")(34D-=/+ '+&8&81&<%F%J
)00$2D2Dq2HJY_L_1_`,334GH +H ,223DE#)<<#<L<(A-()YZZ *!_F2;<P2Q.."5QU8K"KK 3R"))&17 9Z: #nn^\}]?[\--nkW^V_-`k*#$$ S!1231T5E5E4FFtu  ..SdSmM6RSKMM+&#*.*G*G{H[]t*uF&'DD}  S "D Nc 1b0as6   $TTT!$	T+-'T&T+0T1
T6
&T+r}   r   c                    / n[        U5       H  u  pE[        R                  " X   5      n[        R                  " U5      n[        R                  " X`R
                  :H  5      S   nSn	U H6  n
U	[        U5      :  a    O&X   nX-   nSX{U& [        R                  " X5      n	M8     UR                  UR                  5       5        M     U$ )Nr   r&   )
r   nparray
zeros_likewhererZ   r   searchsortedrl   tolist)rb   r}   r   r~   rf   seq_lengths	array_idsmm_token_typesimage_start_positionsrg   seq_lenstartends                r   r   *Idefics3Processor.create_mm_token_type_ids  s     '(?@NA.I]]95N$&HHY:R:R-R$STU$V!A&122-0o,-S)OO$9? ' $$^%:%:%<= A ! r   c                    0 nUb  [         R                  R                  S0 5      nUR                  U5        U Vs/ s H!  nU R                  R
                  " / UQUP76 PM#     nnU R                  S-   nU R                  S-   n/ n	/ n
U H4  u  pnX-  S-   nU	R                  X~U-  -   5        U
R                  U5        M6     UR                  XS.5        [        S0 UD6$ s  snf )a{  
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

Args:
    image_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (height, width) per each image.

Returns:
    `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
    input modalities, along with other useful data.
rC   r   rz   r&   )num_image_tokensnum_image_patchesr6   )	r;   rH   getr   rc   get_number_of_image_patchesr+   rl   r   )rb   image_sizesre   vision_datarC   
image_sizenum_image_row_colsbase_image_length
col_lengthr   r   num_patchesnum_rowsnum_colsr   s                  r   _get_num_multimodal_tokens,Idefics3Processor._get_num_multimodal_tokens2  s	    "3==AA/SUVM  ( #.""-J $$@@\*\m\"-  "
 !% 2 2Q 6++a/J! "3E/x'2Q6
 ''(9(=R(ST!((5 4F
 4Dmn,,,#"s   (C)
r_   rV   rU   rZ   rW   r[   r+   r/   rY   r\   )N   N)NNNr    )rD   rE   rF   rG   intr   ra   rq   r   r   r   r   r   r   r;   r   r   r   r   rI   __classcell__)ri   s   @r   rL   rL   g   s    fj#\>A#\X[^bXb#\ #\J
  JNbf$(	@ET*--T*5E0FF@E I2DOTJ]E^^_@E Tz	@E
 01@E 
@E @ED!$ !QUVYQZ !_cdhildm_n !*#- #-r   rL   )(__doc__r]   	itertoolsr   typingr   r   numpyr   feature_extraction_utilsr   image_utilsr   r	   r
   processing_utilsr   r   r   r   tokenization_utils_baser   r   r   utilsr   r   r   
get_loggerrD   loggerboolr   r"   r4   r7   r9   r;   rL   __all__r6   r   r   <module>r      s    
   '  4 A A X X K K , <			H	%;4 ;0&.e  m- m- m-` 
r   