
    Z j                     d    S r SSKJrJr  SSKJr  \R                  " \5      r " S S\5      r	S/r
g)zTokenization class for Dia.   )
AddedTokenPreTrainedTokenizer)loggingc            	          ^  \ rS rSrSrSS/r    SS\S-  S\S-  S\S-  S	\4U 4S
 jjjr\	S 5       r
S rS\S\\   4S jrS rS rS\\   S\4S jrSrU =r$ )DiaTokenizer   a  
Construct a Dia tokenizer. Dia simply uses raw bytes utf-8 encoding except for special tokens `[S1]` and `[S2]`.

This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.

Args:
    pad_token (`str`, *optional*, defaults to `"<pad>"`):
        The token used for padding, for example when batching sequences of different lengths.
    unk_token (`str`, *optional*, defaults to `"<pad>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    max_length (`int`, *optional*, defaults to 1024):
        The maximum length of the sequences when encoding. Sequences longer than this will be truncated.
    offset (`int`, *optional*, defaults to 0):
        The offset of the tokenizer.
	input_idsattention_mask	pad_tokenN	unk_token
max_lengthoffsetc                   > [        U[        5      (       a  [        U5      OUn[        U[        5      (       a  [        U5      OUnSU l        U[        S5      [        S5      S.U l        X@l        [        TU ]  " S	UUUUSSSS.UD6  g )
N   z[S1]z[S2])          	all_zerosTnone)r   r   r   r   token_type_ids_pattern%token_type_ids_include_special_tokensspecial_tokens_pattern )
isinstancestrr   _utf_vocab_size_added_tokens_decoderr   super__init__)selfr   r   r   r   kwargs	__class__s         y/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/dia/tokenization_dia.pyr   DiaTokenizer.__init__,   s     .8	3-G-GJy)Y	-7	3-G-GJy)Y	#)2z&7IjY_N`%a" 		
!#.26#)		
 		
    c                     U R                   $ N)r   )r    s    r#   
vocab_sizeDiaTokenizer.vocab_sizeF   s    ###r%   c                     [        U R                  U R                  -   5       Vs0 s H  oR                  U5      U_M     nnUR	                  U R
                  5        U$ s  snf r'   )ranger(   r   convert_ids_to_tokensupdateadded_tokens_encoder)r    ivocabs      r#   	get_vocabDiaTokenizer.get_vocabJ   sX    ;@SWS^S^A^;_`;_a++A.1;_`T../ as   Atextreturnc                 d    UR                  S5       Vs/ s H  n[        U5      PM     nnU$ s  snf )zPTake as input a string and return a list of strings (tokens) for words/sub-wordsutf-8)encodechr)r    r3   r/   tokenss       r#   	_tokenizeDiaTokenizer._tokenizeO   s/    "&++g"67"6Q#a&"67 8s   -c                 \    [        U5      S:w  a  SnU$ [        U5      U R                  -   nU$ )z0Converts a token (str) in an id using the vocab.r   N)lenordr   )r    tokentoken_ids      r#   _convert_token_to_id!DiaTokenizer._convert_token_to_idT   s4     u:?H  5zDKK/Hr%   c                 4    [        XR                  -
  5      nU$ )z=Converts an index (integer) in a token (str) using the vocab.)r8   r   )r    indexr?   s      r#   _convert_id_to_token!DiaTokenizer._convert_id_to_token^   s    EKK'(r%   r9   c                    SnU Hr  nX0R                   ;   a*  U R                   U   n[        U5      R                  S5      nO2X0R                  ;   a  UR                  S5      nOUR                  S5      nX%-  nMt     UR	                  SSS9nU$ )z:Converts a sequence of tokens (string) in a single string.r%   r6   ignore)errors)added_tokens_decoderr   r7   r.   decode)r    r9   bstringr?   added_token_obj
tok_stringstrings          r#   convert_tokens_to_string%DiaTokenizer.convert_tokens_to_stringc   s    E111"&";";E"B 188A
333"\\'2
"\\'2
!G  9r%   )r   r   r   )<pad>rR   i   r   )__name__
__module____qualname____firstlineno____doc__model_input_namesr   intr   propertyr(   r1   listr:   rA   rE   rP   __static_attributes____classcell__)r"   s   @r#   r   r      s    $ %&67 !( '!%
:
 :
 $J	

 
 
4 $ $
c d3i 

tCy S  r%   r   N)rW   tokenization_pythonr   r   utilsr   
get_loggerrS   loggerr   __all__r   r%   r#   <module>rc      s<    " B  
		H	%Y& Yx 
r%   