
    Z j@                     (   S SK r S SKrSSKJr  SSKJr  SSKJrJ	r	  SSK
JrJrJrJr  SSKJrJr  SSKJrJrJr  SS	KJr  SS
KJr  \" 5       (       a  SSKJrJr  \R:                  " \5      r " S S\SS9r \\" SS9 " S S\5      5       5       r!S/r"g)    N   )
AudioInput)BatchFeature)
ImageInputmake_nested_list_of_images)MultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)auto_docstringis_vision_availablelogging)requires)
VideoInput   )Gemma4ImageProcessorKwargs get_aspect_ratio_preserving_sizec                   <    \ rS rSr% \\S'   SSS.SS00 SS0S.rSrg	)
Gemma4ProcessorKwargs$   images_kwargsT)paddingreturn_mm_token_type_idsdo_convert_rgbreturn_metadata)text_kwargsr   audio_kwargsvideos_kwargs N)__name__
__module____qualname____firstlineno__r   __annotations__	_defaults__static_attributes__r!       }/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/gemma4/processing_gemma4.pyr   r   $   s5    -- (,

 d
 +T2
Ir)   r   F)total)vision)backendsc                      ^  \ rS rSr    SS\S\S\4U 4S jjjr\    SS\S-  S\\	-  \
\   -  \
\	   -  S	\S-  S
\S-  S\\   S\4S jj5       rS\S\4S jrSS jr\U 4S j5       rSrU =r$ )Gemma4Processor3   Nimage_seq_lengthaudio_seq_lengthaudio_ms_per_tokenc	           	        > X`l         UR                  U l        UR                  U l        UR                  U l        UR                  U l        UR                  SS/05        SU l        UR                  U R                  5      U l        Xpl	        Xl
        [        USS5      U l        [        USS5      U l        [        USS5      U l        [        USS5      U l        [         T
U ]D  " S	UUUUUS.U	D6  g)
u  
image_seq_length (`int`, *optional*, defaults to 280):
    The number of soft tokens per image used for placeholder expansion.
audio_seq_length (`int`, *optional*, defaults to 750):
    The maximum number of audio soft tokens per audio segment. Serves as an
    upper-bound cap when dynamic audio token counts are computed.
audio_ms_per_token (`int`, *optional*, defaults to 40):
    Milliseconds of audio per output soft token. Used to dynamically compute
    the number of audio placeholder tokens as ``ceil(duration_ms / audio_ms_per_token)``.
    The default of 40 comes from the SSCP convolution's 4× time reduction on 10ms frames.
additional_special_tokensz	<|video|>audio_token_idNaudio_token	boa_token	eoa_token)feature_extractorimage_processor	tokenizervideo_processorchat_templater!   )r1   image_token_id	boi_token	eoi_tokenimage_tokenadd_special_tokensvideo_tokenconvert_tokens_to_idsvideo_token_idr2   r3   getattrr6   r7   r8   r9   super__init__)selfr:   r;   r<   r=   r>   r1   r2   r3   kwargs	__class__s             r*   rI   Gemma4Processor.__init__6   s    . !1'66",,",,$00 	$$&AK=%QR&'==d>N>NO !1 #5%i1A4H"9mTB K> K> 	
/++'	
 	
r)   imagestextaudiovideosrK   returnc                   ^ ^! Uc  Uc  Uc  Uc  [        S5      eU R                  " [        4SU R                  R                  0UD6n[        U[        5      (       a  U/nO8[        U[        5      (       d#  [        US   [        5      (       d  [        S5      e0 nUGbW  U R                  R                  U5      n[        U5      nU R                  " U40 US   D6nUR                  S5      n	U(       d8  U Vs/ s H+  nSR                  U R                  /[        U5      -  5      PM-     nn[        U5      [        U5      :w  a$  [        S[        U5       S	[        U5       S
35      eU	 V
s/ s H+  oR                    U R                  U
-   U R"                   3PM-     nn
[%        U5      m [&        R(                  " U R                  5      nU Vs/ s H  n[&        R*                  " UU 4S jU5      PM!     nn0 nUGb  U R,                  " S'SU0US   D6nUR                  S5      nUR/                  S5      (       d  UR                  S5      nOUS   n/ m![1        UU5       H  u  nnUR2                  c  [4        R7                  S5        UR2                  c  SOUR2                  Ul        UR8                   Vs/ s H$  n[;        US-  5      S S[;        US-  5      S 3PM&     nnT!R=                  SR                  U Vs/ s H/  nU SU R                    U R>                  U-   U R"                   3PM1     sn5      5        M     [%        T!5      m![&        R(                  " U R>                  5      nU Vs/ s H  n[&        R*                  " UU!4S jU5      PM!     nn0 nUGbp  U R@                  b  U RB                  b  U RD                  c  [        S5      e[        U[F        RH                  5      (       a  URJ                  S:X  a  U/nU(       d  U R@                  /[        U5      -  nUR/                  S0 5      nU RL                  " U40 UD6nU RL                  RN                  nU Vs/ s H  nU RQ                  UU5      PM     nnU V
s/ s H+  oRB                   U R@                  U
-   U RD                   3PM-     nn
[%        U5      m [&        R(                  " U R@                  5      nU Vs/ s H  n[&        R*                  " UU 4S jU5      PM!     nnUS   R                  SS 5      nUS   R                  SS5      nU R                  " S'SU0US   D6n/ nWb  UR=                  S 5        Ub  UR=                  S!5        Ub  UR=                  S"5        U(       a  U RS                  UUUS#9  U(       a  U RU                  US$   5      US%'   [W        0 UEUEUEUEUS&9$ s  snf s  sn
f s  snf s  snf s  snf s  snf s  snf s  sn
f s  snf )(Nz?Provide at least one of `text`, `images`, `audio`, or `videos`.tokenizer_init_kwargsr   zAInvalid input text. Please provide a string, or a list of stringsr   num_soft_tokens_per_image z1Received inconsistently sized batches of images (z) and text (z).c                    > [        T5      $ Nnext_replacements_iters    r*   <lambda>*Gemma4Processor.__call__.<locals>.<lambda>   s    d3D.Er)   rQ   r    num_soft_tokens_per_videor   video_metadataa  Gemma 4 requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results.   <   02d:c                    > [        T5      $ rX   rY   )r\   video_replacementss    r*   r^   r_      s    d3E.Fr)   zUAudio inputs were provided, but the tokenizer does not have an `audio_token` defined.r   r   c                    > [        T5      $ rX   rY   r[   s    r*   r^   r_      s    D9J4Kr)   r   return_tensorsr   FrO   imagevideorP   )
modalities	input_idsmm_token_type_ids)datatensor_typer!   ),
ValueError_merge_kwargsr   r<   init_kwargs
isinstancestrlist	TypeErrorr;   fetch_imagesr   popjoinrB   lenr@   rA   iterreescapesubr=   getzipfpsloggerwarning_once
timestampsintappendrD   r7   r8   r9   npndarrayndimr:   sampling_rate_compute_audio_num_tokens_check_special_mm_tokenscreate_mm_token_type_idsr   )"rJ   rN   rO   rP   rQ   rK   output_kwargsimage_inputsbatched_imagesnum_soft_tokensnreplacementspatternpromptvideo_inputsnum_video_tokensra   metadatan_tokenssecondstimestamp_strtaudio_inputsr   r   anum_audio_tokensaudio_patternri   r   text_inputsactive_modalitiesr]   rg   s"                                   @@r*   __call__Gemma4Processor.__call__n   s    <FNu}^__**!
"&.."<"<
 
 dC  6DD$''
47C0H0H_``))66v>F7?N//Y-:XYL*../JKO Q_`Q_v$"2"2!3c&k!ABQ_`>"c$i/ GNH[G\\hilmqirhssuv  `oo_nZ[~~.t/?/?!/C.DT^^DTU_nLo $\ 2 ii 0 01G]ab]aSYBFF7$EvN]aDb //`v`A_`L+//0KL ::/00!-!1!12B!C!-.>!?!#&).:J&K"(<<''', &.\\%9rx|| X`WjWj!WjGs7b=)#.aGbL0A#/FGWj  ! #))HHgtugtbcA3a/0@0@80K/LT^^L\]gtu 'L& "&&8!9ii 0 01G^bc^bTZBFF7$FO^bDc '4>>+AT^^E[ k 
 %,,q (()CJ6 ),,^R@L11%H<HL 22@@MZ_`Z_UV > >q- PZ_`_op_oZ[~~.t/?/?!/C.DT^^DTU_oLp $\ 2IId&6&67McghcgY_BFF=*KVTcgDh&}599:JDQ#0#?#C#CD^`e#f nnO$O-2NO $$W-$$W-$$W-))$HY)Z#/3/L/L[YdMe/fK+,PKP<P<P<P&
 	
G a p c2!
 v d4  ap is6   62W*2W&W4+W#;6W(/&W- W22W7%&W<r   c                 0   [        U5      n[        [        US-  S-  5      5      n[        [        US-  S-  5      5      nUS-   nUS-  nX7-   nX-
  U-  S-   n	U	S::  a  gU	n
[        S5       H  nU
S-   nUS-
  S-  S-   n
M     [	        XR
                  5      $ )a  Compute the number of audio soft tokens for a single waveform.

Replicates the exact sequence-length arithmetic of the audio encoder
so that the processor inserts the correct number of placeholder tokens.
The computation mirrors:

1. Mel framing via ``_unfold`` in ``Gemma4AudioFeatureExtractor``
2. Two ``Conv2d`` subsampling layers in ``Gemma4AudioSubSampleConvProjection``
   (each: kernel=3, stride=2, semicausal padding top=1, bottom=1)

The result is capped at ``self.audio_seq_length`` (the configured maximum).

Args:
    audio_waveform: A 1-D numpy array or list containing the raw audio samples.
    sampling_rate: The sampling rate of the audio waveform in Hz.

Returns:
    The number of audio soft tokens to insert as placeholders.
g      4@g     @@g      $@r      r   r   )r{   r   roundrangeminr2   )rJ   audio_waveformr   num_samplesframe_length
hop_lengthframe_size_for_unfoldpad_leftpadded_samplesnum_mel_framesr   r\   t_paddeds                r*   r   )Gemma4Processor._compute_audio_num_tokens   s    ( .) 5!5!>?@}t3f<=>
 ,q 0
  1$$/(@ZORSSQ qA1uHA!#a'A 
 1++,,r)   c           	      D   [         R                  R                  S0 5      nUR                  U5        UR                  SS5      =(       d    U R                  R
                  nUR                  SS5      =(       d    U R                  R                  nUR                  SS5      =(       d    U R                  R                  nXvS-  -  n0 n	Ubd  / n
U H9  n[        US   US   UUUS	9u  pX-  nX-  nU
R                  X-  US-  -  5        M;     S/[        U5      -  nU	R                  U
US
.5        Ub`  [        U R                  SS5      nU Vs/ s H)  nU R                  [        R                  " U5      U5      PM+     nnU	R                  SU05        [!        S0 U	D6$ s  snf )a  
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

Args:
    image_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (height, width) per each image.
    audio_lengths (`list[int]`, *optional*):
        The lengths of audio inputs in number of samples. Used to dynamically
        compute per-audio token counts.

Returns:
    `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
    input modalities, along with other useful data.
r   
patch_sizeNpooling_kernel_sizemax_soft_tokensr   r   r   )heightwidthr   max_patchesr   )num_image_tokensnum_image_patchesr   i>  r   r!   )r   r'   r   updater;   r   r   r   r   r   r{   rG   r:   r   r   zerosr   )rJ   image_sizesaudio_lengthsrK   r   r   r   r   r   vision_datar   
image_sizetarget_htarget_wpatch_heightpatch_widthr   r   lengthr   s                       r*   _get_num_multimodal_tokens*Gemma4Processor._get_num_multimodal_tokens&  s     .77;;ORPV$"&&|T:]d>R>R>]>]
3T:fd>R>R>f>f 	 (++,=tDlH\H\HlHl%Q(>>"!)
%E%a=$Q-) +(;&"  (5&4 ''(BFY[\F\(\] * "#c+&6 64D[lmn$ $D$:$:OVTM^k ^kTZ..rxx/?O^k     24DEF,,, s   0Fc                   > [         TU ]  nU Vs/ s H  nUS;  d  M  UPM     nnU R                  b?  U R                  R                  nUR                  U Vs/ s H  o"U;  d  M
  UPM     sn5        US/-   $ s  snf s  snf )N)rU   r`   rn   )rH   model_input_namesr:   extend)rJ   r   namefeature_extractor_input_namesrL   s       r*   r   !Gemma4Processor.model_input_names]  s    !G5 *
)UU ) 	 
 !!-,0,B,B,T,T)$$7T%v7TtduXud7T%vw $7#888
 &ws   
A8A8	A=%A=)r3   r2   r7   r6   r8   r@   r9   rA   r1   rB   r?   rD   rF   )Ni  i  (   )NNNN)NN)r"   r#   r$   r%   r   rI   r   r   r   r   rv   r   r   r   r   r   r   r   r   propertyr   r(   __classcell__)rL   s   @r*   r/   r/   3   s      # #"$6
 6
 6
  6
 6
p  %)Z^#'$(F
T!F
 ++d9o=EV@WWF
 D 	F

 T!F
 ./F
 
F
 F
P--s --s --^5-n 9 9r)   r/   )#r}   numpyr   audio_utilsr   image_processing_utilsr   image_utilsr   r   processing_utilsr   r	   r
   r   tokenization_utils_baser   r   utilsr   r   r   utils.import_utilsr   video_utilsr   image_processing_pil_gemma4r   r   
get_loggerr"   r   r   r/   __all__r!   r)   r*   <module>r      s    
  % 2 A X X C A A * % i 
		H	%,E  	;v9n v9   v9r	 
r)   