
    Z j6              
       (   S r SSKrSSKJr  SSKJrJr  SSKJ	r	J
r
Jr  SSKJrJr  SSKJr   " S	 S
\	SS9rS\\   S\S\\\      4S jrS\\\\         S\\\      S\S\S\R*                  4
S jrS\S\S\S\4S jr\ " S S\
5      5       rS/rg)zProcessor class for Mllama.    N   )BatchFeature)
ImageInputmake_nested_list_of_images)ProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)auto_docstringc                        \ rS rSrSSS00rSrg)MllamaProcessorKwargs   image_kwargsmax_image_tiles    N)__name__
__module____qualname____firstlineno__	_defaults__static_attributes__r       }/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/mllama/processing_mllama.pyr   r      s    q
Ir   r   F)total	input_idsimage_token_idreturnc                    [        U 5       VVs/ s H  u  p#X1:X  d  M  UPM     nnn[        U5      S:X  a  / $ [        U5      S:X  a  US   S//$ [        USS USS 5       VVs/ s H  u  pVXV/PM
     nnnUR                  US   [        U 5      /5        US   S   nUSSS2    H  n	U	S   U	S   S-
  :X  a  XS'   U	S   nM     U$ s  snnf s  snnf )a  
Generate a cross-attention token mask for image tokens in the input sequence.

This function identifies the positions of image tokens in the input sequence and creates
a mask that defines which subsequent tokens each image token should attend to.

Args:
    input_ids (list[int]): A list of token ids representing the input sequence.
    image_token_id (int): The id of the token used to represent images in the sequence.

Returns:
    list[list[int]]: A list of [start, end] pairs, where each pair represents the range
    of tokens an image token should attend to.

Notes:
    - If no image tokens are present, an empty list is returned.
    - For a single image token, it attends to all subsequent tokens until the end of the sequence.
    - For multiple image tokens, each attends to tokens up to the next image token or the end of the sequence.
    - Consecutive image tokens are treated as a group and attend to all subsequent tokens together.
r      N)	enumeratelenzipappend)
r   r   itokenimage_token_locationsloc1loc2vision_maskslast_mask_endvision_masks
             r   get_cross_attention_token_maskr/   "   s   , 09/C_/C81uG^Q/C_
 !Q&	  !Q&&q)2.//367LSb7QShijikSl3mn3mZTTL3mLn .r2C	NCD
 !$Q'M#DbD)q>[^a//*N#A *
 / ` os   CC$Ccross_attention_token_mask	num_tilesmax_num_tileslengthc           	      p   [        U 5      n[        S U  5       5      n[        R                  " XCXR4[        R                  S9n[        [        X5      5       H[  u  nu  p[        [        X5      5       H;  u  n
u  p[        U5      S:X  d  M  Uu  p[        X5      nUS:X  a  UnSXgX2U
SU24'   M=     M]     U$ )a  
Convert the cross attention mask indices to a cross attention mask 4D array.

This function takes a sparse representation of cross attention masks and converts it to a dense 4D numpy array.
The sparse representation is a nested list structure that defines attention ranges for each image in each batch item.

Args:
    cross_attention_token_mask (list[list[list[int]]]): A nested list structure where:
        - The outer list represents the batch dimension.
        - The middle list represents different images within each batch item.
        - The inner list contains pairs of integers [start, end] representing token ranges for each image.
    num_tiles (list[list[int]]): A nested list structure specifying the number of tiles for each image in each batch item.
    max_num_tiles (int): The maximum possible number of tiles.
    length (int): The total sequence length of the input.

Returns:
    np.ndarray: A 4D numpy array of shape (batch_size, length, max_num_images, max_num_tiles)
        The array contains `1` where attention is allowed and `0` where it is not.

Note:
    - Special handling is done for cases where the end token is -1, which is interpreted as attending to the end of the sequence.
c              3   8   #    U  H  n[        U5      v   M     g 7fNr$   ).0maskss     r   	<genexpr>?convert_sparse_cross_attention_mask_to_dense.<locals>.<genexpr>p   s     L1KU1K   )shapedtype   r"   r!   N)r$   maxnpzerosint64r#   r%   min)r0   r1   r2   r3   
batch_sizemax_num_imagescross_attention_mask
sample_idxsample_maskssample_num_tilesmask_idx	locationsmask_num_tilesstartends                  r   ,convert_sparse_cross_attention_mask_to_denserP   R   s    : /0JL1KLLN88>Ahh
 9B#F`Bl8m4
4\5>s<?b5c1H1y9~"&
#&"9 CYZ$Ho~o%UV 6d 9n  r   prompt	bos_tokenimage_tokenc                     X;   a  U $ SnU R                  U5      (       a+  U [        U5      S n US-  nU R                  U5      (       a  M+  X#-   U U  3$ )a  
Builds a string from the input prompt by adding `bos_token` if not already present.

Args:
    prompt (`str`):
        The input prompt string.
    bos_token (`str`):
        The beginning of sentence token to be added.
    image_token (`str`):
        The image token used to identify the start of an image sequence.

Returns:
    str: The modified prompt string with the `bos_token` added if necessary.

Examples:
    >>> build_string_from_input("Hello world", "<begin_of_text>", "<|image|>")
    '<begin_of_text>Hello world'

    >>> build_string_from_input("<|image|>Hello world", "<begin_of_text>", "<|image|>")
    '<|image|><begin_of_text>Hello world'

    >>> build_string_from_input("<begin_of_text>Hello world", "<begin_of_text>", "<|image|>")
    '<begin_of_text>Hello world'
r   Nr!   )
startswithr$   )rQ   rR   rS   num_image_tokens_on_starts       r   build_string_from_inputrW      so    4  !


K
(
(K(*+!Q&! 

K
(
( 56yk&JJr   c            
          ^  \ rS rSrSU 4S jjr\  SS\S-  S\\-  \	\   -  \	\   -  S-  S\
\   S\4S jj5       r SS	 jr\S
 5       rSrU =r$ )MllamaProcessor   Nc                 H  > [        US5      (       d(  SU l        UR                  U R                  5      U l        O"UR                  U l        UR                  U l        SU l        UR                  U R                  5      U l        UR                  U l        [        TU ]!  XUS9  g )NrS   z	<|image|>z<|python_tag|>)chat_template)	hasattrrS   convert_tokens_to_idsr   python_tokenpython_token_idrR   super__init__)selfimage_processor	tokenizerr\   	__class__s       r   rb   MllamaProcessor.__init__   s    y-00*D"+"A"A$BRBR"SD(44D"+":":D,(>>t?P?PQ",,=Qr   imagestextkwargsr   c           
      6   Uc  Uc  [        S5      eU R                  " [        4SU R                  R                  0UD6nUS   R                  SS5      n0 nUGb  [        U[        5      (       a  U/nO=[        U[        [        45      (       a  [        S U 5       5      (       d  [        S5      eU Vs/ s H  owR                  U R                  5      PM     nnU V	s/ s H#  n	[        XR                  U R                  5      PM%     nn	U R                  " U40 US   D6n
U R                  X*S/S	9  U
S
    Vs/ s H  oR                  U R                   5      PM     nnUR#                  U
5        S/nUbA  U R$                  R'                  U5      n[)        U5      nU Vs/ s H  n[+        U5      PM     nnUb  [-        S W 5       5      (       a"  [        S U 5       5      (       d  [        S5      e[/        U5      S:  aW  X:w  d  WU:w  aL  Uc  [        S5      eSn[/        U5      [/        U5      :X  a  X:w  a  SnOWU:w  a  Sn[        SU SU SU 35      eUb8  U R$                  " U40 US   D6nUR                  S5      nUR#                  U5        Ubc  Ub`  W
S
    Vs/ s H  n[1        XR                   5      PM     nn[3        UWU R$                  R4                  [7        S U
S
    5       5      S9nUUS'   [9        XeS9$ s  snf s  sn	f s  snf s  snf s  snf )aO  
Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
    TODO: add aspect_ratio_ids and aspect_ratio_mask and cross_attention_mask
Nz'You must specify either text or images.tokenizer_init_kwargstext_kwargsreturn_tensorsc              3   B   #    U  H  n[        U[        5      v   M     g 7fr6   )
isinstancestr)r8   ts     r   r:   +MllamaProcessor.__call__.<locals>.<genexpr>   s     =_Z^UVjC>P>PZ^s   zAInvalid input text. Please provide a string, or a list of stringsimage)
modalitiesr   r   c              3   *   #    U  H	  oS :H  v   M     g7fr   Nr   r8   	batch_imgs     r   r:   rs      s     D3Ci>3C   c              3   *   #    U  H	  oS :H  v   M     g7frw   r   rx   s     r   r:   rs      s      Q0@9Q0@rz   zaIf a batch of text is provided, there should be either no images or at least one image per samplez@No image were provided, but there are image tokens in the prompt zZMake sure to pass your images as a nested list, where each sub-list holds images per batchzhIf you activated truncation with `max_length`, increase the `max_length` so image tokens aren't cropped.z)The number of image tokens in each text (zA) should be the same as the number of provided images per batch (z). images_kwargsr1   c              3   8   #    U  H  n[        U5      v   M     g 7fr6   r7   )r8   r   s     r   r:   rs     s     Q;Pi3y>>;Pr<   )r1   r2   r3   rG   )datatensor_type)
ValueError_merge_kwargsr   re   init_kwargspoprp   rq   listtupleallcountrS   rW   rR   _check_special_mm_tokensr   updaterd   fetch_imagesr   r$   anysumr/   rP   r   r@   r   )rc   rh   ri   rj   output_kwargsrn   r   rr   n_images_in_text	text_itemencoding	token_idsn_images_in_idsn_images_in_imagessampleadd_messageimage_featuresr1   r0   rG   s                       r   __call__MllamaProcessor.__call__   si   $ <FNFGG**!
"&.."<"<
 

 '}599:JDQ$$$v e}55#=_Z^=_:_:_ !deeCGH4a(8(8 94Hjnojn]f+I~~tGWGWXjnDo~~dKmM.JKH))$gY)OU]^iUjkUj	t/B/BCUjOkKK!S))66v>F/7F<B!CF&#f+F!CD3CDDDS Q0@Q N N !w  #$q("6/M_:_>$%ghh"$K-.#6F2GGL^Lr 'C(,>> 'Q$CDTCU V@@R?SSVWbVce 
 !11&[M/<Z[N&**;7IKK' $"2`hit`u*`uS\.y:M:MN`u ' * $P*#"22BBQ8K;PQQ	$  ,@D'(BBo  Io l "DB*s   4$L*L8$LL Lc                 B    U R                   R                  " U4UUS.UD6$ )a*  
Post-process the output of the model to decode the text.

Args:
    generated_outputs (`torch.Tensor` or `np.ndarray`):
        The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
        or `(sequence_length,)`.
    skip_special_tokens (`bool`, *optional*, defaults to `True`):
        Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
    clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
        Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
    **kwargs:
        Additional arguments to be passed to the tokenizer's `batch_decode method`.

Returns:
    `list[str]`: The decoded text.
)skip_special_tokensclean_up_tokenization_spaces)re   batch_decode)rc   generated_outputsr   r   rj   s        r   post_process_image_text_to_text/MllamaProcessor.post_process_image_text_to_text  s3    ( ~~**
 3)E
 	
 	
r   c                     U R                   R                  nU R                  R                  nU Vs/ s H  o3S:w  d  M
  UPM     nn[        X-   S/-   5      $ s  snf )Nr1   rG   )re   model_input_namesrd   r   )rc   tokenizer_input_namesimage_processor_input_namesnames       r   r   !MllamaProcessor.model_input_names,  sb     $ @ @&*&:&:&L&L# 9T&k8S_jWjt8S#&k)GKaJbbcc 'ls
   	AA)rR   rS   r   r_   r`   r6   )NN)TF)r   r   r   r   rb   r   r   r   r
   r   r	   r   r   r   r   propertyr   r   __classcell__)rf   s   @r   rY   rY      s    R  %)aeXCT!XC ++d9o=EV@WWZ^^XC ./	XC
 
XC XCv Y^
6 d dr   rY   )__doc__numpyrA   feature_extraction_utilsr   image_utilsr   r   processing_utilsr   r   r	   tokenization_utils_baser
   r   utilsr   r   r   intr/   ndarrayrP   rq   rW   rY   __all__r   r   r   <module>r      s    "  4 A H H C #,E -d3i - -QUVZ[^V_Q` -`-  $T$s)_ 5- DI-  -  	- 
 ZZ- `"KC "KC "Kc "Kc "KJ Ldn Ld Ld^ 
r   