
    Z j                     n    S SK r S SKrS SKJrJrJrJrJr  S SKJ	r	  SSK
Jr  SSS.r " S	 S
\5      rS
/rg)    N)	Tokenizerdecodersnormalizerspre_tokenizers
processors)Unigram   )TokenizersBackendzspiece.modelztokenizer.json)
vocab_filetokenizer_filec                      ^  \ rS rSrSr\rSS/r\r	        SU 4S jjr
S rS r   SS	\\\   -  S
\S\S-  S\S\4
U 4S jjjrSrU =r$ )LasrTokenizer!   aP  
Construct a LASR tokenizer (backed by HuggingFace's *tokenizers* library). Based on
[Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).

This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.

Args:
    vocab_file (`str`, *optional*):
        [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
        contains the vocabulary necessary to instantiate a tokenizer.
    eos_token (`str`, *optional*, defaults to `"</s>"`):
        The end of sequence token.

        <Tip>

        When building a sequence using special tokens, this is not the token that is used for the end of sequence.
        The token used is the `sep_token`.

        </Tip>

    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    pad_token (`str`, *optional*, defaults to `"<pad>"`):
        The token used for padding, for example when batching sequences of different lengths.
    extra_ids (`int`, *optional*, defaults to 100):
        Add a number of extra ids added to the vocabulary for use as sentinels. These tokens are accessible as
        "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. These tokens can be retrieved by
        calling get_sentinel_tokens method and token ids can be by calling get_sentinel_token_ids method
    additional_special_tokens (`list[str]`, *optional*):
        Additional special tokens used by the tokenizer.
    vocab (`str`, `dict` or `list`, *optional*):
        Custom vocabulary dict. If not provided, a minimal vocabulary is created using the special tokens.
	input_idsattention_maskNc	           	      d  > XPl         Ub~  U V
s/ s H  n
S[        U
5      ;   d  M  U
PM     nn
[        U5      S:  a$  U[        U5       Vs/ s H	  nSU S3PM     sn-  nOIUS:  a!  U[        U5      :w  a  [	        SU SU S35      eO![        U5       Vs/ s H	  nSU S3PM     nnUnUb  Xpl        Od[        U5      S4[        U5      S4[        U5      S4S	/U l        [        US-
  S
S
5       H$  nU R
                  R                  SU S3S45        M&     [        [        U R
                  SSS95      U l	        Ub%  [        R                  " U5      U R                  l        [        R                  " [        R                  " 5       [        R                   " SSSS9/5      U R                  l        [$        R                   " SSSS9U R                  l        [(        TU ]T  " SUUUUUS.U	D6  [,        R.                  " SS// SQSU R0                  4/S9U R                  l        g s  sn
f s  snf s  snf )Nz
<extra_id_   >r   zBoth extra_ids (z!) and additional_special_tokens (zm) are provided to LasrTokenizer. In this case the additional_special_tokens must include the extra_ids tokensg        )   ▁g       r	   F)unk_idbyte_fallbackr   alwaysT)replacementprepend_schemesplit)	eos_token	unk_token	pad_token	extra_idsadditional_special_tokens$A</s>)r"   r#   z$Br#   )singlepairspecial_tokens )
_extra_idsstrlenrange
ValueError_vocab_scoresappendr   r   
_tokenizerr   Precompiled
normalizerr   SequenceWhitespaceSplit	Metaspacepre_tokenizerr   decodersuper__init__r   TemplateProcessingeos_token_idpost_processor)selfr   r   r   _spm_precompiled_charsmapr    r!   vocabr   kwargsxextra_tokensi	__class__s                {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/lasr/tokenization_lasr.pyr8   LasrTokenizer.__init__J   s_    $ %0'@['@!LTWXYTZDZA'@L[< 1$)yIY-ZIYA
1#Q.?IY-ZZ)Q9L0A#A &yk1RSlRm n   8=Y7GH7G!j1-7GLH(4% !& Y%Y%Y%	"D 9q="b1""))Zs!+<c*BC 2#""#
 %0)4)@)@AZ)[DOO&(6(?(?..0((U8[_`)
% #+"4"4W_gk"l 	
&?	
 	
 *4)F)F&>-**+*
&k \-Z Is   H#H#H(H-c                 T    [        [        [        S U R                  5      5      5      $ )zQGet the list of sentinel tokens (extra_id tokens) from additional_special_tokens.c                 F    [        [        R                  " SU 5      5      S L$ )Nz<extra_id_\d+>)boolresearch)r@   s    rD   <lambda>3LasrTokenizer.get_sentinel_tokens.<locals>.<lambda>   s    bii0A1&E!Fd!R    )listsetfilterr!   )r<   s    rD   get_sentinel_tokens!LasrTokenizer.get_sentinel_tokens   s&    RTXTrTrst
 	
rM   c                 j    U R                  5        Vs/ s H  oR                  U5      PM     sn$ s  snf )z&Get the token IDs for sentinel tokens.)rQ   convert_tokens_to_ids)r<   tokens     rD   get_sentinel_token_ids$LasrTokenizer.get_sentinel_token_ids   s.    ?C?W?W?YZ?Ye**51?YZZZs   0	token_idsskip_special_tokensclean_up_tokenization_spacesgroup_tokensreturnc                   > [        U[        5      (       a  U/nU(       a(  [        R                  " U5       Vs/ s H  ofS   PM	     nnU Vs/ s H  owU R                  :w  d  M  UPM     nn[
        TU ]  " SUUUS.UD6$ s  snf s  snf )Nr   )rX   rY   rZ   r'   )
isinstanceint	itertoolsgroupbypad_token_idr7   _decode)	r<   rX   rY   rZ   r[   r?   token_grouprU   rC   s	           rD   rc   LasrTokenizer._decode   s     i%%"I;D;L;LY;WX;WKQ;WIX )2P	ud>O>O5OU		Pw 
 3)E
 	
 	
 Y Qs   B B$B)r(   r/   r-   )r#   z<unk>z<pad>Nd   NNN)FNT)__name__
__module____qualname____firstlineno____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr   modelr8   rQ   rV   r_   rN   rH   r)   rc   __static_attributes____classcell__)rC   s   @rD   r   r   !   s    "H *$&67E "&"&K
Z
[ %*48!
c?
 "
 '+Tk	

 
 

 
rM   r   )r`   rI   
tokenizersr   r   r   r   r   tokenizers.modelsr   tokenization_utils_tokenizersr
   rl   r   __all__r'   rM   rD   <module>rv      sA   *  	 S S % > $2EUV U
% U
p 
rM   