
    Z jR=                         S SK Jr  SSKJr  SSKJr  SSKJr  SSKJ	r	  SSK
Jr  \	R                  " \5      rSrS	S
S.r/ SQ/ SQS.rSSSSSSSS.r\" SS9 " S S\5      5       rS/rg)    )Any   )BatchEncoding)
AddedToken)SentencePieceBackend)logging)requiresu   ▁zsentencepiece.bpe.modelztokenizer.json)
vocab_filetokenizer_file)__java__
__python__	__en_XX__)r   r   r   __javascript____php____ruby____go__)basemultir   r   r   r   r   r   r   )javapythonen_XX
javascriptphprubygo)sentencepiece)backendsc                     ^  \ rS rSr% Sr\rSS/r/ r\	\
   \S'   / r\	\
   \S'                S!S\\\4   S-  4U 4S	 jjjr\S
 5       rS r\S\4S j5       r\R*                  S\SS4S j5       rS\S\S-  S\S-  4S jrS rS r   S"S\	\   S\S\	\   S-  S\S\4
U 4S jjjrS rS rS#S jrS\SS4S jrS\S\4S jrS$U 4S jjr S r!U =r"$ )%PLBartTokenizer/   a  
Construct a PLBART tokenizer.

Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
[SentencePiece](https://github.com/google/sentencepiece).

The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
<tokens> <eos>` for target language documents.
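
For example (an illustrative sketch; the exact ids depend on the checkpoint's vocabulary), loading
with `src_lang="java"` leaves the prefix empty and appends the `__java__` code after `eos`:

```python
>>> tok = PLBartTokenizer.from_pretrained("uclanlp/plbart-base", src_lang="java")
>>> tok.prefix_tokens
[]
>>> tok.suffix_tokens == [tok.eos_token_id, tok.convert_tokens_to_ids("__java__")]
True
```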

Args:
    vocab_file (`str`):
        Path to the vocabulary file.
    src_lang (`str`, *optional*):
        A string representing the source language.
    tgt_lang (`str`, *optional*):
        A string representing the target language.
    bos_token (`str`, *optional*, defaults to `"<s>"`):
        The start of sequence token.
    eos_token (`str`, *optional*, defaults to `"</s>"`):
        The end of sequence token.
    sep_token (`str`, *optional*, defaults to `"</s>"`):
        The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
        sequence classification or for a text and a question for question answering. It is also used as the last
        token of a sequence built with special tokens.
    cls_token (`str`, *optional*, defaults to `"<s>"`):
        The cls token, which is a special token used as the first token for all tasks.
    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    pad_token (`str`, *optional*, defaults to `"<pad>"`):
        The token used for padding, for example when batching sequences of different lengths.
    mask_token (`str`, *optional*, defaults to `"<mask>"`):
        The token used for masking values. This is the token used when training this model with masking tasks. This
        is only used in the `"base"` tokenizer type. For the `"multi"` tokenizer, masking is never done for the
        downstream tasks.
    language_codes (`str`, *optional*, defaults to `"base"`):
        What language codes to use. Should be one of `"base"` or `"multi"`.
    sp_model_kwargs (`dict`, *optional*):
        Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
        SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
        to set:
        - `enable_sampling`: Enable subword regularization.
        - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
          - `nbest_size = {0,1}`: No sampling is performed.
          - `nbest_size > 1`: samples from the nbest_size results.
          - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
            using forward-filtering-and-backward-sampling algorithm.
        - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
          BPE-dropout.
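
        For instance, subword-regularization sampling (useful mainly as training-time data
        augmentation; a sketch, with parameter values chosen only for illustration) can be enabled with:

        ```python
        >>> sp_kwargs = {"enable_sampling": True, "nbest_size": -1, "alpha": 0.1}
        >>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-base", sp_model_kwargs=sp_kwargs)
        ```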

Examples:

```python
>>> from transformers import PLBartTokenizer

>>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX")
>>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])"
>>> expected_translation_english = "Returns the maximum value of a b c."
>>> inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt")
```"""

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    prefix_tokens: list[int] = []
    suffix_tokens: list[int] = []

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        language_codes="base",
        tokenizer_file=None,
        src_lang=None,
        tgt_lang=None,
        sp_model_kwargs: dict[str, Any] | None = None,
        additional_special_tokens=None,
        clean_up_tokenization_spaces=True,
        **kwargs,
    ):
        # Mask token behaves like a normal word, i.e. it includes the space before it
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        src_lang = self._convert_lang_code_special_format(src_lang)
        tgt_lang = self._convert_lang_code_special_format(tgt_lang)

        self.language_codes = language_codes
        fairseq_language_codes = FAIRSEQ_LANGUAGE_CODES[self.language_codes]

        self.lang_code_to_id = {}
        self.id_to_lang_code = {}
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
        self.fairseq_offset = 1

        _additional_special_tokens = list(fairseq_language_codes)
        if additional_special_tokens is not None:
            # Only add those special tokens if they are not already there.
            _additional_special_tokens.extend(
                [t for t in additional_special_tokens if t not in _additional_special_tokens]
            )

        super().__init__(
            vocab_file=vocab_file,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            tokenizer_file=tokenizer_file,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            additional_special_tokens=_additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            language_codes=language_codes,
            special_tokens_pattern="prefix_suffix",
            token_type_ids_pattern="all_zeros",
            **kwargs,
        )

        self.sp_model_size = len(self.sp_model)
        self.lang_code_to_id = {
            code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(fairseq_language_codes)
        }
        self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}

        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
        if self.language_codes == "base":
            self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset
        self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

        # The language codes (and `<mask>` for the base tokenizer) are part of the fairseq vocabulary rather
        # than added tokens, so drop them from the added-token maps and let the fairseq mapping resolve them.
        reserved_tokens = {"<s>", "<pad>", "</s>", "<unk>", "<mask>"}
        reserved_tokens.update(FAIRSEQ_LANGUAGE_CODES[self.language_codes])
        removed = False
        for token in reserved_tokens:
            idx = self._added_tokens_encoder.pop(token, None)
            if idx is None:
                continue
            self._added_tokens_decoder.pop(idx, None)
            removed = True
        if removed:
            self._update_trie()
            self._update_total_vocab_size()

        # Keep the added-token encoder and decoder maps in sync.
        synced = False
        for token, idx in self._added_tokens_encoder.items():
            if idx in self._added_tokens_decoder:
                continue
            self._added_tokens_decoder[idx] = AddedToken(token, special=True, normalized=False)
            synced = True
        if synced:
            self._update_trie()
            self._update_total_vocab_size()

        if self.language_codes == "base":
            self._src_lang = src_lang
            self.cur_lang_code_id = self.lang_code_to_id[self._src_lang] if self._src_lang is not None else None
        else:
            self._src_lang = src_lang if src_lang is not None else "__en_XX__"
            self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]

        self.tgt_lang = tgt_lang
        self.set_src_lang_special_tokens(self._src_lang)

    @property
    def vocab_size(self):
        lang_code_count = len(getattr(self, "lang_code_to_id", {}))
        fairseq_offset = getattr(self, "fairseq_offset", 1)
        base_vocab = len(self.sp_model) if hasattr(self, "sp_model") else 0
        if getattr(self, "language_codes", "base") == "base":
            return base_vocab + lang_code_count + fairseq_offset + 1  # +1 for the <mask> token
        return base_vocab + lang_code_count + fairseq_offset

    def get_vocab(self):
        """Override to use fairseq vocabulary structure"""
        vocab = self.fairseq_tokens_to_ids.copy()
        for i in range(self.sp_model.get_piece_size()):
            sp_token = self.sp_model.IdToPiece(i)
            vocab_id = self.unk_token_id if i == 0 else i + self.fairseq_offset
            if sp_token not in vocab:
                vocab[sp_token] = vocab_id
        vocab.update({k: v for k, v in self.added_tokens_encoder.items() if k not in vocab})
        return vocab

    @property
    def src_lang(self) -> str:
        return self._src_lang

    @src_lang.setter
    def src_lang(self, new_src_lang: str) -> None:
        new_src_lang = self._convert_lang_code_special_format(new_src_lang)
        self._src_lang = new_src_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def _build_translation_inputs(
        self, raw_inputs, return_tensors: str, src_lang: str | None, tgt_lang: str | None, **extra_kwargs
    ):
        """Used by translation pipeline, to prepare inputs for the generate function"""
        if src_lang is None or tgt_lang is None:
            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
        self.src_lang = self._convert_lang_code_special_format(src_lang)
        self.tgt_lang = self._convert_lang_code_special_format(tgt_lang)
        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
        tgt_lang_id = self.convert_tokens_to_ids(self.tgt_lang)
        inputs["forced_bos_token_id"] = tgt_lang_id
        return inputs

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.sp_model.PieceToId(token)
        # Need to return unknown token if the SP model returned 0
        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def prepare_seq2seq_batch(
        self,
        src_texts: list[str],
        src_lang: str = "en_XX",
        tgt_texts: list[str] | None = None,
        tgt_lang: str = "python",
        **kwargs,
    ) -> BatchEncoding:
        self.src_lang = self._convert_lang_code_special_format(src_lang)
        self.tgt_lang = self._convert_lang_code_special_format(tgt_lang)
        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)

    def _switch_to_input_mode(self):
        return self.set_src_lang_special_tokens(self.src_lang)

    def _switch_to_target_mode(self):
        return self.set_tgt_lang_special_tokens(self.tgt_lang)

    def set_src_lang_special_tokens(self, src_lang) -> None:
        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
        src_lang = self._convert_lang_code_special_format(src_lang)
        self.cur_lang_code = self.lang_code_to_id[src_lang] if src_lang is not None else None
        self.prefix_tokens = []
        if self.cur_lang_code is not None:
            self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
        else:
            self.suffix_tokens = [self.eos_token_id]

    def set_tgt_lang_special_tokens(self, lang: str) -> None:
        """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
        lang = self._convert_lang_code_special_format(lang)
        self.cur_lang_code = self.lang_code_to_id[lang] if lang is not None else None
        self.prefix_tokens = []
        if self.cur_lang_code is not None:
            self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
        else:
            self.suffix_tokens = [self.eos_token_id]

    def _convert_lang_code_special_format(self, lang: str) -> str:
        """Convert Language Codes to format tokenizer uses if required"""
        lang = FAIRSEQ_LANGUAGE_CODES_MAP.get(lang, lang)
        return lang

    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=None, **kwargs):
        """Override to use self.clean_up_tokenization_spaces as default for batched input."""
        return super().decode(
            token_ids=token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces
            if clean_up_tokenization_spaces is not None
            else self.clean_up_tokenization_spaces,
            **kwargs,
        )


__all__ = ["PLBartTokenizer"]