
    Z j	                         S r SSKJrJrJrJr  SSKJr  SSKJ	r	  SSK
Jr  \R                  " \5      rSSS	S
.r " S S\	5      rS/rg)z$Tokenization classes for OpenAI GPT.    )	Tokenizerdecodersnormalizerspre_tokenizers)BPE   )TokenizersBackend)loggingz
vocab.jsonz
merges.txtztokenizer.json)
vocab_filemerges_filetokenizer_filec                      ^  \ rS rSrSr\rSS/r\r	   SS\
\\
\4   -  S-  S\
\\
   -  S-  S\
4U 4S	 jjjr\S
 5       rSrU =r$ )OpenAIGPTTokenizer   a`  
    Construct a GPT Tokenizer (backed by HuggingFace's *tokenizers* library). Based on Byte-Pair-Encoding with
    the following peculiarities:

    - lower case all inputs
    - uses BERT's BasicTokenizer for pre-BPE tokenization

    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            Path to a tokenizers JSON file containing the serialization of a tokenizer.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        vocab (`str` or `dict[str, int]`, *optional*):
            Custom vocabulary dictionary. If not provided, a blank vocabulary is initialized.
        merges (`str` or `list[str]`, *optional*):
            Custom merges list. If not provided, an empty list is used.
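
    Example (illustrative sketch: the `"openai-gpt"` checkpoint name and the `from_pretrained` loading path are
    assumed from the usual Transformers workflow rather than defined in this module):

    ```python
    >>> from transformers import OpenAIGPTTokenizer

    >>> tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    >>> # Inputs are lower-cased before BPE, so differently-cased strings map to the same ids
    >>> tokenizer("Hello world")["input_ids"] == tokenizer("hello world")["input_ids"]
    True
    ```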
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    model = BPE

    def __init__(
        self,
        vocab: str | dict[str, int] | None = None,
        merges: str | list[str] | None = None,
        unk_token: str = "<unk>",
        **kwargs,
    ):
        # Fall back to a minimal vocabulary containing only the unknown token when none is supplied.
        self._vocab = vocab if vocab is not None else {str(unk_token): 0}
        self._merges = merges or []
        self._tokenizer = Tokenizer(
            BPE(
                vocab=self._vocab,
                merges=self._merges,
                dropout=None,
                continuing_subword_prefix="",
                end_of_word_suffix="</w>",
                fuse_unk=False,
                unk_token=str(unk_token),
            )
        )

        # Lower-case inputs and apply BERT-style pre-tokenization before BPE; decoding strips the
        # "</w>" end-of-word suffix.
        self._tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
        self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
        self._tokenizer.decoder = decoders.BPEDecoder(suffix="</w>")

        super().__init__(
            unk_token=unk_token,
            **kwargs,
        )

    @property
    def do_lower_case(self):
        return True


__all__ = ["OpenAIGPTTokenizer"]