
    Z j`9                         S SK JrJrJrJrJrJr  S SKJr  SSK	J
r
Jr  SSKJr  SSKJr  \R                   " \5      rSSS	.r/ S
Qr " S S\5      rS/rg)    )Regex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPE   )
AddedTokenBatchEncoding)TokenizersBackend)loggingzsentencepiece.bpe.modelztokenizer.json)
vocab_filetokenizer_file)ace_Arabace_Latnacm_Arabacq_Arabaeb_Arabafr_Latnajp_Arabaka_Latnamh_Ethiapc_Arabarb_Arabars_Arabary_Arabarz_Arabasm_Bengast_Latnawa_Devaayr_Latnazb_Arabazj_Latnbak_Cyrlbam_Latnban_Latnbel_Cyrlbem_Latnben_Bengbho_Devabjn_Arabbjn_Latnbod_Tibtbos_Latnbug_Latnbul_Cyrlcat_Latnceb_Latnces_Latncjk_Latnckb_Arabcrh_Latncym_Latndan_Latndeu_Latndik_Latndyu_Latndzo_Tibtell_Grekeng_Latnepo_Latnest_Latneus_Latnewe_Latnfao_Latnpes_Arabfij_Latnfin_Latnfon_Latnfra_Latnfur_Latnfuv_Latngla_Latngle_Latnglg_Latngrn_Latnguj_Gujrhat_Latnhau_Latnheb_Hebrhin_Devahne_Devahrv_Latnhun_Latnhye_Armnibo_Latnilo_Latnind_Latnisl_Latnita_Latnjav_Latnjpn_Jpankab_Latnkac_Latnkam_Latnkan_Kndakas_Arabkas_Devakat_Georknc_Arabknc_Latnkaz_Cyrlkbp_Latnkea_Latnkhm_Khmrkik_Latnkin_Latnkir_Cyrlkmb_Latnkon_Latnkor_Hangkmr_Latnlao_Laoolvs_Latnlij_Latnlim_Latnlin_Latnlit_Latnlmo_Latnltg_Latnltz_Latnlua_Latnlug_Latnluo_Latnlus_Latnmag_Devamai_Devamal_Mlymmar_Devamin_Latnmkd_Cyrlplt_Latnmlt_Latnmni_Bengkhk_Cyrlmos_Latnmri_Latnzsm_Latnmya_Mymrnld_Latnnno_Latnnob_Latnnpi_Devanso_Latnnus_Latnnya_Latnoci_Latngaz_Latnory_Oryapag_Latnpan_Gurupap_Latnpol_Latnpor_Latnprs_Arabpbt_Arabquy_Latnron_Latnrun_Latnrus_Cyrlsag_Latnsan_Devasat_Bengscn_Latnshn_Mymrsin_Sinhslk_Latnslv_Latnsmo_Latnsna_Latnsnd_Arabsom_Latnsot_Latnspa_Latnals_Latnsrd_Latnsrp_Cyrlssw_Latnsun_Latnswe_Latnswh_Latnszl_Latntam_Tamltat_Cyrltel_Telutgk_Cyrltgl_Latntha_Thaitir_Ethitaq_Latntaq_Tfngtpi_Latntsn_Latntso_Latntuk_Latntum_Latntur_Latntwi_Latntzm_Tfnguig_Arabukr_Cyrlumb_Latnurd_Arabuzn_Latnvec_Latnvie_Latnwar_Latnwol_Latnxho_Latnydd_Hebryor_Latnyue_Hantzho_Hanszho_Hantzul_Latnc                     ^  \ rS rSr% Sr\rSS/r\r	/ r
\\   \S'   / r\\   \S'                  S!S\\\\4   -  S-  S	\\\   -  S-  S
\S-  4U 4S jjjr\S\4S j5       r\R(                  S\SS4S j5       rS\S\S-  S\S-  4S jr        S"S\\   S\S\\   S-  S\S\S-  S\S-  S\S\S-  S\S\4S jjrS rS rS#S jrS\SS4S jrS rU =r$ )$NllbTokenizer!   ad  
Construct an NLLB tokenizer (backed by HuggingFace's *tokenizers* library). Based on
[Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).

This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.

The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
<tokens> <eos>` for target language documents.

Examples:

```python
>>> from transformers import NllbTokenizer

>>> tokenizer = NllbTokenizer.from_pretrained(
...     "facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="fra_Latn"
... )
>>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie."
>>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt")
```

Args:
    vocab_file (`str`, *optional*):
        Path to the vocabulary file.
    bos_token (`str`, *optional*, defaults to `"<s>"`):
        The beginning of sequence token that was used during pretraining.
    eos_token (`str`, *optional*, defaults to `"</s>"`):
        The end of sequence token.
    sep_token (`str`, *optional*, defaults to `"</s>"`):
        The separator token.
    cls_token (`str`, *optional*, defaults to `"<s>"`):
        The classifier token.
    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token.
    pad_token (`str`, *optional*, defaults to `"<pad>"`):
        The token used for padding.
    mask_token (`str`, *optional*, defaults to `"<mask>"`):
        The token used for masking values.
    src_lang (`str`, *optional*):
        The language to use as source language for translation.
    tgt_lang (`str`, *optional*):
        The language to use as target language for translation.
    legacy_behaviour (`bool`, *optional*, defaults to `False`):
        Whether to use legacy behaviour (suffix pattern) or new behaviour (prefix pattern).
	input_idsattention_maskprefix_tokenssuffix_tokensNvocabmerges_spm_precompiled_charsmapc                 2  > Ub  UnO	Uc  [         n[        U	[        5      (       a  [        U	SSSS9OU	n	Xl        Uc.  [        U5      S[        U5      S[        U5      S[        U5      S0nXl        U=(       d    / U l        [        [        U R
                  U R                  S [        U5      SSS95      U l	        UbY  [        R                  " [        R                  " U5      [        R                  " [        S	5      S
5      /5      U R                  l        [         R"                  " SSSS9U R                  l        [&        R"                  " SSSS9U R                  l        [*        TU ]X  " SUUUUUUU
UU	UUS.UD6  SU l        SSSSS.U l        U R0                  R3                  5        VVs0 s H	  u  nnUU_M     snnU l        U
b  U
OSU l        U R9                  U R6                  5      U l        Xl        U R?                  U R6                  5        g s  snnf )NT)
normalizedlstripspecialr         r
   F)r   r   dropout	unk_tokenfuse_unkbyte_fallbackz {2,} u   ▁always)replacementprepend_schemesplit)	bos_token	eos_token	sep_token	cls_tokenr   	pad_tokensrc_langtgt_lang
mask_tokenextra_special_tokenslegacy_behaviour)<s><pad></s><unk>r?    ) FAIRSEQ_LANGUAGE_CODES
isinstancestrr   r   _vocab_mergesr   r	   
_tokenizerr   SequencePrecompiledReplacer   
normalizerr   	Metaspacepre_tokenizerr   decodersuper__init__fairseq_offsetfairseq_tokens_to_idsitemsfairseq_ids_to_tokens	_src_langconvert_tokens_to_idscur_lang_coder   set_src_lang_special_tokens)selfr   r   r   r   r   r   r   r   r   r   r   r   additional_special_tokensr   r   kwargskv	__class__s                      {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/nllb/tokenization_nllb.pyr  NllbTokenizer.__init__Y   s   *  +(<%&.(>% *c** zd4N 	
 !1=IIII	E |#kk||i.#	
 %0)4)=)=++,EF''h=*DOO& )7(@(@Ucksw(x%"*"4"4W_gk"l 	
!!:-	
 	
   	&
" 8<7Q7Q7W7W7Y%Z7Ytq!ad7Y%Z"%-%9z!77G ((8 &[s   ,Hreturnc                     U R                   $ N)r  r  s    r   r   NllbTokenizer.src_lang   s    ~~    new_src_langc                 F    Xl         U R                  U R                   5        g r$  )r  r  )r  r(  s     r   r   r&     s    %((8r'  return_tensorsr   r   c                 v    Ub  Uc  [        S5      eX0l        U " U4SUS.UD6nU R                  U5      nXvS'   U$ )zIUsed by translation pipeline, to prepare inputs for the generate functionzATranslation requires a `src_lang` and a `tgt_lang` for this modelT)add_special_tokensr*  forced_bos_token_id)
ValueErrorr   r  )r  
raw_inputsr*  r   r   extra_kwargsinputstgt_lang_ids           r   _build_translation_inputs'NllbTokenizer._build_translation_inputs   sU     x/`aa jiT.i\hi00:(3$%r'  	src_texts	tgt_texts
max_lengthmax_target_lengthpadding
truncationc
           	          X l         X@l        Uc  U R                  nU " U4SUUUU	S.U
D6nUc  U$ Uc  UnU R                  5         U " U4SUUUU	S.U
D6nUS   US'   U R	                  5         U$ )NT)r,  r*  r7  r9  r:  )r,  r*  r9  r7  r:  r   labels)r   r   model_max_length_switch_to_target_mode_switch_to_input_mode)r  r5  r   r6  r   r7  r8  r9  r*  r:  r  model_inputsr<  s                r   prepare_seq2seq_batch#NllbTokenizer.prepare_seq2seq_batch   s     ! ..J
#)!!
 
  $ * 	##%
#)(!
 
 "(!4X 	""$r'  c                 8    U R                  U R                  5      $ r$  )r  r   r%  s    r   r?  #NllbTokenizer._switch_to_input_mode  s    //>>r'  c                 t    U R                   c  U R                  U l         U R                  U R                   5      $ r$  )r   r  set_tgt_lang_special_tokensr%  s    r   r>  $NllbTokenizer._switch_to_target_mode  s,    ==  NNDM//>>r'  c                 t   U R                  U5      U l        UnU R                  (       a  / U l        U R                  U R                  /U l        [        R                  " SU R                  U/SSU R                  U/U R                  U R                  4X R                  4/S9U R                  l
        gU R                  /U l        U R                  /U l        [        R                  " USU R                  /USSU R                  /U R                  U R                  4X R                  4/S9U R                  l
        g)zReset the special tokens to the source lang setting.
- In legacy mode: No prefix and suffix=[eos, src_lang_code].
- In default mode: Prefix=[src_lang_code], suffix = [eos]
$A$Bsinglepairspecial_tokensNr  r  r   r   eos_token_idr   r   TemplateProcessingr   r  post_processor)r  r   lang_code_tokens      r   r  )NllbTokenizer.set_src_lang_special_tokens  s   
 "77A"  !#D"&"3"3T5G5G!HD-7-J-Jdnno>D$../B!%1B1B CoWiWiEjk.DOO* #'"4"4!5D"&"3"3!4D-7-J-J't~~>%tT4>>B!%1B1B CoWiWiEjk.DOO*r'  langc                 t   U R                  U5      U l        UnU R                  (       a  / U l        U R                  U R                  /U l        [        R                  " SU R                  U/SSU R                  U/U R                  U R                  4X R                  4/S9U R                  l
        gU R                  /U l        U R                  /U l        [        R                  " USU R                  /USSU R                  /U R                  U R                  4X R                  4/S9U R                  l
        g)zReset the special tokens to the target lang setting.
- In legacy mode: No prefix and suffix=[eos, tgt_lang_code].
- In default mode: Prefix=[tgt_lang_code], suffix = [eos]
rI  rJ  rK  NrO  )r  rU  rS  s      r   rF  )NllbTokenizer.set_tgt_lang_special_tokens$  s   
 "77=  !#D"&"3"3T5G5G!HD-7-J-Jdnno>D$../B!%1B1B CoWiWiEjk.DOO* #'"4"4!5D"&"3"3!4D-7-J-J't~~>%tT4>>B!%1B1B CoWiWiEjk.DOO*r'  )r  r  r  r  r  r  r  r  r   r   r   r   r   )NNr   r   r   r   r  r   z<mask>NNNNNF)r?   NrI   NNlongestNT)r"  N)__name__
__module____qualname____firstlineno____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr	   modelr   listint__annotations__r   r  dictr  propertyr   setterr3  boolr   rA  r?  r>  r  rF  __static_attributes____classcell__)r  s   @r   r   r   !   s   .` *$&67E!M49!!M49! .2)-04"&!!]9T#s(^#d*]9 d3i$&]9 $':]9 ]9~ #   __9S 9T 9 9
*-
9<t
ORUYz
 #&*"!%(, %)494 4 9t#	4
 4 $J4 :4 4 d
4 4 
4l??
2   r'  r   N)
tokenizersr   r   r   r   r   r   tokenizers.modelsr	   tokenization_pythonr   r   tokenization_utils_tokenizersr   utilsr   
get_loggerrY  loggerr^  r  r   __all__r  r'  r   <module>rs     s]     [ Z ! < >  
		H	% $=P`a  R& Z% Zz 
r'  