
    Z j!                         S SK JrJrJrJr  S SKJr  SSKJr  SSK	J
r
  SSKJr  \R                  " \5      rSSS	.r/ S
Qr " S S\
5      rS/rg)    )	Tokenizerdecoderspre_tokenizers
processors)Unigram   )
AddedToken)TokenizersBackend)loggingzsentencepiece.bpe.modelztokenizer.json)
vocab_filetokenizer_file)ar_ARcs_CZde_DEen_XXes_XXet_EEfi_FIfr_XXgu_INhi_INit_ITja_XXkk_KZko_KRlt_LTlv_LVmy_MMne_NPnl_XXro_ROru_RUsi_LKtr_TRvi_VNzh_CNc                   ,  ^  \ rS rSr% Sr\rSS/r\r	/ r
\\   \S'   / r\\   \S'              SS\\-  \-  S-  4U 4S	 jjjr\S
\4S j5       r\R(                  S\S
S4S j5       rS\S\S-  S\S-  4S jrS rS rSS jrS\S
S4S jrSrU =r$ )MBartTokenizer!   u  
Construct an MBART tokenizer (backed by HuggingFace's *tokenizers* library). Based on
[Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).

This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.

The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
<tokens> <eos>` for target language documents.

Examples:

```python
>>> from transformers import MBartTokenizer

>>> tokenizer = MBartTokenizer.from_pretrained(
...     "facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO"
... )
>>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
```	input_idsattention_maskprefix_tokenssuffix_tokensNvocabc                   > [        U[        5      (       a  [        USSS9OUn[        R	                  5       nUb)  UR                  U Vs/ s H  oU;  d  M
  UPM     sn5        Uct  [        U5      S4[        U5      S4[        U5      S4[        U5      S4/nUS/-  n[         H  nUR                  US45        M     UR                  [        U5      S45        Xl        [        [        U R                  SSS95      U l
        S U R                  l        [        R                  " [        R                  " 5       [        R                  " SS	SS
9/5      U R                  l        ["        R                  " SS	SS
9U R                  l        [&        TU ]P  " SUUUUUUUU	U
US.
UD6  [         Vs0 s H  oU R+                  U5      _M     snU l        SU l        SSSSS.U l        U R0                  R3                  U R,                  5        U R+                  [        U5      5      U R0                  S'   U R0                  R5                  5        VVs0 s H	  u  nnUU_M     snnU l        U	b  U	OSU l        U R+                  U R8                  5      U l        Xl        U R?                  U R8                  5        g s  snf s  snf s  snnf )NTF)lstriprstripg        )   ▁g       r   )unk_idbyte_fallbackr2   always)replacementprepend_schemesplit)
	bos_token	eos_token	sep_token	cls_token	unk_token	pad_token
mask_tokensrc_langtgt_langadditional_special_tokens   r      )<s><pad></s><unk><mask>r    ) 
isinstancestrr	   FAIRSEQ_LANGUAGE_CODEScopyextendappend_vocabr   r   
_tokenizer
normalizerr   SequenceWhitespaceSplit	Metaspacepre_tokenizerr   decodersuper__init__convert_tokens_to_idslang_code_to_idfairseq_offsetfairseq_tokens_to_idsupdateitemsfairseq_ids_to_tokens	_src_langcur_lang_coderA   set_src_lang_special_tokens)selfr.   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   kwargs_additional_special_tokenst	lang_codekv	__class__s                     }/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/mbart/tokenization_mbart.pyrZ   MBartTokenizer.__init__@   s    KUU_adJeJeZ
4Fku
%;%@%@%B"$0&--5]5qB\9\5] =Y%Y%Y%Y%	E m_$E3	i-. 4LL#j/3/0#GDKKQV$WX%)"(6(?(?..0((U8[_`)
% #+"4"4W_gk"l 	
!&@	
 	
 Oe 
Ndt11)<<Nd 
   	&
" 	""))$*>*>?/3/I/I#j//Z""8,7;7Q7Q7W7W7Y%Z7Ytq!ad7Y%Z"%-%9w!77G ((8} ^V 
 &[s   
	J1J1+J6
J;returnc                     U R                   $ N)rb   re   s    rm   r@   MBartTokenizer.src_lang   s    ~~    new_src_langc                 F    Xl         U R                  U R                   5        g rq   )rb   rd   )re   ru   s     rm   r@   rs      s    %((8rt   return_tensorsr@   rA   c                 v    Ub  Uc  [        S5      eX0l        U " U4SUS.UD6nU R                  U5      nXvS'   U$ )zIUsed by translation pipeline, to prepare inputs for the generate functionzATranslation requires a `src_lang` and a `tgt_lang` for this modelT)add_special_tokensrw   forced_bos_token_id)
ValueErrorr@   r[   )re   
raw_inputsrw   r@   rA   extra_kwargsinputstgt_lang_ids           rm   _build_translation_inputs(MBartTokenizer._build_translation_inputs   sU     x/`aa jiT.i\hi00:(3$%rt   c                 8    U R                  U R                  5      $ rq   )rd   r@   rr   s    rm   _switch_to_input_mode$MBartTokenizer._switch_to_input_mode   s    //>>rt   c                 t    U R                   c  U R                  U l         U R                  U R                   5      $ rq   )rA   rb   set_tgt_lang_special_tokensrr   s    rm   _switch_to_target_mode%MBartTokenizer._switch_to_target_mode   s,    ==  NNDM//>>rt   c                    U R                  U5      U l        / U l        U R                  U R                  /U l        U R                  U R                  5      nU R                  U R                  5      n[        R                  " US/-   U-   USS/-   U-   [        [        X#-   U R                  U R                  -   5      5      S9U R                  l        g)z_Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].$A$Bsinglepairspecial_tokensNr[   rc   r,   eos_token_idr-   convert_ids_to_tokensr   TemplateProcessinglistziprR   post_processor)re   r@   prefix_tokens_strsuffix_tokens_strs       rm   rd   *MBartTokenizer.set_src_lang_special_tokens   s    !77A"//1C1CD 66t7I7IJ 66t7I7IJ)3)F)F$v-0AA"dD\14EE$5$I4K]K]`d`r`rKr st*
&rt   langc                    U R                  U5      U l        / U l        U R                  U R                  /U l        U R                  U R                  5      nU R                  U R                  5      n[        R                  " US/-   U-   USS/-   U-   [        [        X#-   U R                  U R                  -   5      5      S9U R                  l        g)zcReset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code].r   r   r   Nr   )re   r   r   r   s       rm   r   *MBartTokenizer.set_tgt_lang_special_tokens   s    !77="//1C1CD 66t7I7IJ 66t7I7IJ)3)F)F$v-0AA"dD\14EE$5$I4K]K]`d`r`rKr st*
&rt   )rb   rR   rQ   rc   ra   r]   r^   r\   r,   r@   r-   rA   )NrE   rG   rG   rE   rH   rF   rI   NNN)ro   N)__name__
__module____qualname____firstlineno____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr   modelr,   r   int__annotations__r-   rL   dictrZ   propertyr@   setterr   r   r   rd   r   __static_attributes____classcell__)rl   s   @rm   r(   r(   !   s   . *$&67E!M49!!M49! +/"&R9TzD 4'R9 R9h #   __9S 9T 9 9
*-
9<t
ORUYz
??


 
 
 
rt   r(   N)
tokenizersr   r   r   r   tokenizers.modelsr   tokenization_pythonr	   tokenization_utils_tokenizersr
   utilsr   
get_loggerr   loggerr   rM   r(   __all__rJ   rt   rm   <module>r      s]     G F % - >  
		H	% $=P`a  { l
& l
^ 
rt   