
    Z j$                         S r SSKrSSKrSSKrSSKJr  SSKJr  SSKJ	r	J
r
Jr  \	" 5       (       a  SSKr\
" 5       (       a  SSKr\R                  " \5      rSS0rS	 r " S
 S\5      rS/rg)zTokenization class for VITS.    N)Any   )PreTrainedTokenizer)is_phonemizer_availableis_uroman_availablelogging
vocab_filez
vocab.jsonc                 \    [         R                  " S5      nUR                  U 5      nUS LnU$ )Nz[^\x00-\x7F])recompilesearch)input_stringnon_roman_patternmatchhas_non_romans       {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/vits/tokenization_vits.pyhas_non_roman_charactersr   $   s3    

?3 $$\2E%M    c                     ^  \ rS rSrSr\rSS/r       S SU 4S jjjr\	S 5       r
S	 rS
 rS r SS\S\S\S-  S\\\\\4   4   4S jjrS\S\\   4S jrS\\   S\4S jrS rS rSS\S\S-  S\\   S-  4S jjrSrU =r$ )VitsTokenizer.   a|  
Construct a VITS tokenizer. Also supports MMS-TTS.

This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.

Args:
    vocab_file (`str`):
        Path to the vocabulary file.
    language (`str`, *optional*):
        Language identifier.
    add_blank (`bool`, *optional*, defaults to `True`):
        Whether to insert token id 0 in between the other tokens.
    normalize (`bool`, *optional*, defaults to `True`):
        Whether to normalize the input text by removing all casing and punctuation.
    phonemize (`bool`, *optional*, defaults to `True`):
        Whether to convert the input text into phonemes.
    is_uroman (`bool`, *optional*, defaults to `False`):
        Whether the `uroman` Romanizer needs to be applied to the input text prior to tokenizing.
	input_idsattention_maskNreturnc	                 f  > [        USS9 n
[        R                  " U
5      U l        S S S 5        U R                  R	                  5        VVs0 s H  u  pX_M	     snnU l        X@l        XPl        X`l        Xpl	        Xl
        [        TU ]0  " SUUUUUUUSS.U	D6  g ! , (       d  f       Nz= fs  snnf )Nutf-8encodingnone)	pad_token	unk_tokenlanguage	add_blank	normalize	phonemize	is_uromanspecial_tokens_pattern )openjsonloadencoderitemsdecoderr"   r#   r$   r%   r&   super__init__)selfr	   r    r!   r"   r#   r$   r%   r&   kwargsvocab_handlekv	__class__s                r   r0   VitsTokenizer.__init__G   s     *w/<99\2DL 0 *.););)=>)=)=> """" 
	
#)
	
 
	
 0/ ?s   BB-
B*c                 ,    [        U R                  5      $ N)lenr,   )r1   s    r   
vocab_sizeVitsTokenizer.vocab_sizej   s    4<<  r   c                     [        U R                  5       Vs0 s H  oR                  U5      U_M     nnUR                  U R                  5        U$ s  snf r9   )ranger;   convert_ids_to_tokensupdateadded_tokens_encoder)r1   ivocabs      r   	get_vocabVitsTokenizer.get_vocabn   sL    ;@;QR;Qa++A.1;QRT../ Ss   Ac                    [        U R                  R                  5       5      [        U R                  R                  5       5      -   nSnSnU[	        U5      :  ag  SnU H-  nXU[	        U5      -    U:X  d  M  X6-  nU[	        U5      -  nSn  O   U(       d  X1U   R                  5       -  nUS-  nU[	        U5      :  a  Mg  U$ )zfLowercase the input string, respecting any special token ids that may be part or entirely upper-cased. r   FT   )listr,   keysrA   r:   lower)r1   r   all_vocabularyfiltered_textrB   found_matchwords          r   normalize_textVitsTokenizer.normalize_texts   s    dll//12T$:S:S:X:X:Z5[[#l##K&AD	M2d:!)MTNA"&K ' a!6!6!88Q #l## r   c                 J    U R                   S:X  a  UR                  SS5      nU$ )z4Special treatment of characters in certain languagesronu   țu   ţ)r"   replace)r1   texts     r   _preprocess_charVitsTokenizer._preprocess_char   s#    ==E!<<d+Dr   rU   is_split_into_wordsr$   c           	        ^  Ub  UOT R                   nU(       a  T R                  U5      nT R                  U5      n[        U5      (       a\  T R                  (       aK  [        5       (       d  [        R                  S5        O&[        R                  " 5       nUR                  U5      nT R                  (       aN  [        5       (       d  [        S5      e[        R                  " USSSSSS9n[        R                   " SSU5      nXT4$ U(       a6  S	R#                  [%        ['        U 4S
 jU5      5      5      R)                  5       nXT4$ )am  
Performs any necessary transformations before tokenization.

This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
`kwargs` at the end of the encoding process to be sure all the arguments have been used.

Args:
    text (`str`):
        The text to prepare.
    is_split_into_words (`bool`, *optional*, defaults to `False`):
        Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
        tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
        which it will tokenize.
    normalize (`bool`, *optional*, defaults to `None`):
        Whether or not to apply punctuation and casing normalization to the text inputs. Typically, VITS is
        trained on lower-cased and un-punctuated text. Hence, normalization is used to ensure that the input
        text consists only of lower-case characters.
    kwargs (`dict[str, Any]`, *optional*):
        Keyword arguments to use for the tokenization.

Returns:
    `tuple[str, dict[str, Any]]`: The prepared text and the unused kwargs.
aC  Text to the tokenizer contains non-Roman characters. To apply the `uroman` pre-processing step automatically, ensure the `uroman` Romanizer is installed with: `pip install uroman` Note `uroman` requires python version >= 3.10Otherwise, apply the Romanizer manually as per the instructions: https://github.com/isi-nlp/uromanzEPlease install the `phonemizer` Python package to use this tokenizer.zen-usespeakT)r"   backendstrippreserve_punctuationwith_stressz\s+ rG   c                 "   > U TR                   ;   $ r9   )r,   )charr1   s    r   <lambda>8VitsTokenizer.prepare_for_tokenization.<locals>.<lambda>   s    TT\\=Qr   )r$   rP   rV   r   r&   r   loggerwarningurUromanromanize_stringr%   r   ImportError
phonemizerr   subjoinrI   filterr\   )r1   rU   rX   r$   r2   rM   uromans   `      r   prepare_for_tokenization&VitsTokenizer.prepare_for_tokenization   s   4 "+!6IDNN	&&t,D--d3#M22t~~&((y  & 6 6} E>>*,,!"ijj&00  %) M FF63>M
 $$	 GGD0QS`)a$bciikM$$r   c                     [        U5      nU R                  (       a-  U R                  S5      /[        U5      S-  S-   -  nX#SSS2'   UnU$ )z]Tokenize a string by inserting the `<pad>` token at the boundary between adjacent characters.r      rH   N)rI   r#   _convert_id_to_tokenr:   )r1   rU   tokensintersperseds       r   	_tokenizeVitsTokenizer._tokenize   sO    d>> 55a89S[1_q=PQL!'A!Fr   rt   c                 t    U R                   (       a  [        U5      S:  a  USS S2   nSR                  U5      $ )NrH   rr   rG   )r#   r:   rl   )r1   rt   s     r   convert_tokens_to_string&VitsTokenizer.convert_tokens_to_string   s0    >>c&kAoADqD\Fwwvr   c                 V    XR                   ;   a  U R                   U   $ U R                  $ )z0Converts a token (str) in an id using the vocab.)r,   unk_token_id)r1   tokens     r   _convert_token_to_id"VitsTokenizer._convert_token_to_id   s'    LL <<&&   r   c                 8    U R                   R                  U5      $ )z=Converts an index (integer) in a token (str) using the vocab.)r.   get)r1   indexs     r   rs   "VitsTokenizer._convert_id_to_token   s    ||&&r   save_directoryfilename_prefixc           
         [         R                  R                  U5      (       d  [        R	                  SU S35        g [         R                  R                  X(       a  US-   OS[        S   -   5      n[        USSS9 nUR                  [        R                  " U R                  S	S
SS9S-   5        S S S 5        U4$ ! , (       d  f       U4$ = f)NzVocabulary path (z) should be a directory-rG   r	   wr   r   rr   TF)indent	sort_keysensure_ascii
)ospathisdirrd   errorrl   VOCAB_FILES_NAMESr)   writer*   dumpsr,   )r1   r   r   r	   fs        r   save_vocabularyVitsTokenizer.save_vocabulary   s    ww}}^,,LL,^,<<STUWW\\o_s22QbcoQpp

 *cG4GGDJJt||ATYZ]aab 5 } 54 }s   ?4B>>
C)r#   r.   r,   r&   r"   r$   r%   )z<pad>z<unk>NTTTF)r   N)FNr9   )__name__
__module____qualname____firstlineno____doc__r   vocab_files_namesmodel_input_namesr0   propertyr;   rD   rP   rV   strbooltupledictr   ro   rI   rv   ry   r~   rs   r   __static_attributes____classcell__)r6   s   @r   r   r   .   s   * *$&67
 !
 
!
 !
F ! !
* VZ?%?%.2?%GKd{?%	sDcN"	#?%B	c 	d3i 	tCy S 
!'c C$J Z_`cZdgkZk  r   r   )r   r*   r   r   typingr   tokenization_pythonr   utilsr   r   r   rj   rn   rf   
get_loggerr   rd   r   r   r   __all__r(   r   r   <module>r      ss    #  	 	  6 J J 			H	%!<0 G' GT 
r   