
    Z j                          S r SSKJr  SSKJr  SSKJr  SSKJr  SSK	J
r
  \R                  " \5      rS	S
0r\" SS9 " S S\5      5       rS/rg)z Tokenization class for SpeechT5.    )Any   )SentencePieceBackend)logging)requires   )EnglishNumberNormalizer
vocab_filezspm_char.model)sentencepiece)backendsc            
       0  ^  \ rS rSrSr\rSS/rSr      SS\	\
\4   S-  SS4U 4S	 jjjrSS
 jr\S 5       r\R                   S 5       rSS\\   4S jjr SS\\   S\\   S-  S\S\\   4U 4S jjjr SS\\   S\\   S-  S\\   4S jjrSrU =r$ )SpeechT5Tokenizer   a  
Construct a SpeechT5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.

Args:
    vocab_file (`str`):
        [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
        contains the vocabulary necessary to instantiate a tokenizer.
    bos_token (`str`, *optional*, defaults to `"<s>"`):
        The beginning of sequence token.
    eos_token (`str`, *optional*, defaults to `"</s>"`):
        The end of sequence token.
    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    pad_token (`str`, *optional*, defaults to `"<pad>"`):
        The token used for padding, for example when batching sequences of different lengths.
    normalize (`bool`, *optional*, defaults to `False`):
        Whether to convert numeric quantities in the text to their spelt-out English counterparts.
    sp_model_kwargs (`dict`, *optional*):
        Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
        SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
        to set:

        - `enable_sampling`: Enable subword regularization.
        - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

          - `nbest_size = {0,1}`: No sampling is performed.
          - `nbest_size > 1`: samples from the nbest_size results.
          - `nbest_size < 0`: assuming that nbest_size is infinite, samples from all hypotheses (lattice)
            using the forward-filtering-and-backward-sampling algorithm.

        - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
          BPE-dropout.

Attributes:
    sp_model (`SentencePieceProcessor`):
        The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
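
Example:

    An illustrative sketch; the checkpoint name below points to the public `microsoft/speecht5_tts` model and is
    used purely for demonstration, and the `normalize` keyword is assumed to be forwarded to
    `prepare_for_tokenization` as it is for other slow tokenizers.

    ```python
    >>> from transformers import SpeechT5Tokenizer

    >>> tokenizer = SpeechT5Tokenizer.from_pretrained("microsoft/speecht5_tts")
    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

    >>> # Spell out numeric quantities ("20" -> "twenty") before tokenizing.
    >>> normalized = tokenizer("I owe you 20 dollars", normalize=True)
    ```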
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    is_fast = False

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        normalize=False,
        sp_model_kwargs: dict[str, Any] | None = None,
        **kwargs,
    ) -> None:
        self.normalize = normalize
        self._normalizer = None

        if sp_model_kwargs is not None:
            kwargs["sp_model_kwargs"] = sp_model_kwargs

        super().__init__(
            vocab_file,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            normalize=normalize,
            **kwargs,
        )

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        normalize = kwargs.pop("normalize", self.normalize)
        if is_split_into_words:
            text = " " + text
        if normalize:
            text = self.normalizer(text)
        return (text, kwargs)

    @property
    def normalizer(self):
        if self._normalizer is None:
            self._normalizer = EnglishNumberNormalizer()
        return self._normalizer

    @normalizer.setter
    def normalizer(self, value):
        self._normalizer = value

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> list[int]:
        """Build model inputs from a sequence by appending eos_token_id."""
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        return token_ids_0 + token_ids_1 + [self.eos_token_id]

    def get_special_tokens_mask(
        self, token_ids_0: list[int], token_ids_1: list[int] | None = None, already_has_special_tokens: bool = False
    ) -> list[int]:
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        suffix_ones = [1]
        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + suffix_ones
        return ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    def create_token_type_ids_from_sequences(
        self, token_ids_0: list[int], token_ids_1: list[int] | None = None
    ) -> list[int]:
        """
Create a mask from the two sequences passed to be used in a sequence-pair classification task. SpeechT5 does not
make use of token type ids, therefore a list of zeros is returned.

Args:
    token_ids_0 (`list[int]`):
        List of IDs.
    token_ids_1 (`list[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
    `list[int]`: List of zeros.
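
For illustration, assuming `tokenizer` is an instantiated `SpeechT5Tokenizer` and the IDs below are arbitrary,
a three-token sequence yields four zeros (one per token plus the appended EOS):

```python
>>> tokenizer.create_token_type_ids_from_sequences([7, 8, 9])
[0, 0, 0, 0]
```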
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + token_ids_1 + eos) * [0]


__all__ = ["SpeechT5Tokenizer"]