
    Z jF                     0   S SK r S SKrS SKrS SKJr  S SKJr  S SKJr  S SK	r	SSK
Jr  SSKJr  SSKJr  \R                   " \5      rS	S
SSSS.rSr\" SS9 " S S\5      5       rS\S\\\4   S\	R0                  4S jrS\SS4S jrS\S\\-  4S jrS/rg)    N)Path)copyfile)Any   )PreTrainedTokenizer)logging)requiresz
source.spmz
target.spmz
vocab.jsonztarget_vocab.jsonztokenizer_config.json)
source_spm
target_spmvocabtarget_vocab_filetokenizer_config_fileu   ▁)sentencepiece)backendsc            
         ^  \ rS rSrSr\rSS/r         S-S\\	\
4   S-  SS4U 4S jjjrS	 rS
\	S\	4S jrS rS\	4S jrS\	S\\	   4S jrS\S\	4S jrU 4S jrU 4S jr  S.S\S\S-  S\	4U 4S jjjrS\\	   S\	4S jrS/S\\   4S jjrS rS r\S\4S j5       rS/S\	S\	S-  S\\	   4S jjrS\4S  jr S! r!S" r"S\4S# jr#S$\SS4S% jr$S& r%S' r& S0S(\S)\S-  S*\S\\   4S+ jjr'S,r(U =r)$ )1MarianTokenizer,   a  
Construct a Marian tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.

Args:
    source_spm (`str`):
        [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
        contains the vocabulary for the source language.
    target_spm (`str`):
        [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
        contains the vocabulary for the target language.
    source_lang (`str`, *optional*):
        A string representing the source language.
    target_lang (`str`, *optional*):
        A string representing the target language.
    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    eos_token (`str`, *optional*, defaults to `"</s>"`):
        The end of sequence token.
    pad_token (`str`, *optional*, defaults to `"<pad>"`):
        The token used for padding, for example when batching sequences of different lengths.
    model_max_length (`int`, *optional*, defaults to 512):
        The maximum sentence length the model accepts.
    additional_special_tokens (`list[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
        Additional special tokens used by the tokenizer.
    sp_model_kwargs (`dict`, *optional*):
        Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
        SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
        to set:

        - `enable_sampling`: Enable subword regularization.
        - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

          - `nbest_size = {0,1}`: No sampling is performed.
          - `nbest_size > 1`: samples from the nbest_size results.
          - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
            using forward-filtering-and-backward-sampling algorithm.

        - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
          BPE-dropout.

Examples:

```python
>>> from transformers import MarianForCausalLM, MarianTokenizer

>>> model = MarianForCausalLM.from_pretrained("Helsinki-NLP/opus-mt-en-de")
>>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
>>> src_texts = ["I am a small frog.", "Tom asked his teacher for advice."]
>>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
>>> inputs = tokenizer(src_texts, text_target=tgt_texts, return_tensors="pt", padding=True)

>>> outputs = model(**inputs)  # should work
```	input_idsattention_maskNsp_model_kwargsreturnc                   > Uc  0 OUU l         [        U5      R                  5       (       d
   SU 35       eXl        [	        U5      U l        [        U5      U R
                  ;  a  [        S5      eU(       aL  [	        U5      U l        U R                  R                  5        VVs0 s H  u  pX_M	     snnU l
        / U l        OU R
                  R                  5        VVs0 s H  u  pX_M	     snnU l
        U R
                   Vs/ s H4  oR                  S5      (       d  M  UR                  S5      (       d  M2  UPM6     snU l        XPl        X`l        X/U l        [#        XR                   5      U l        [#        X R                   5      U l        U R$                  U l        U R
                  U l        U R-                  5         SU l        [0        TU ]d  " SUUUUU	U
U R                   UUS.	UD6  g s  snnf s  snnf s  snf )Nzcannot find spm source z <unk> token must be in the vocab>><<F)	source_langtarget_lang	unk_token	eos_token	pad_tokenmodel_max_lengthr   r   separate_vocabs )r   r   existsr!   	load_jsonencoderstrKeyErrortarget_encoderitemsdecodersupported_language_codes
startswithendswithr   r   	spm_filesload_spm
spm_source
spm_targetcurrent_spmcurrent_encoder_setup_normalizer_decode_use_source_tokenizersuper__init__)selfr
   r   r   r   r   r   r   r   r   r    r   r!   kwargskv	__class__s                   /root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/marian/tokenization_marian.pyr7   MarianTokenizer.__init__k   s     &5%<r/J&&((P,CJ<*PP(. 'y>-=>>"+,=">D-1-@-@-F-F-HI-HTQAD-HIDL,.D)-1\\-?-?-AB-ATQAD-ABDL>Bll2vlll[_N`1efeoeopteu1l2vD)&&$1 #:/C/CD":/C/CD??#|| 	 ,1) 	
##- 00/+	
 	
- J C2vs   $G. G4G:"G::G:c                      SSK Jn  U" U R                  5      R                  U l        g ! [
        [        4 a!    [        R                  " S5        S U l         g f = f)Nr   )MosesPunctNormalizerz$Recommended: pip install sacremoses.c                     U $ Nr"   )xs    r=   <lambda>3MarianTokenizer._setup_normalizer.<locals>.<lambda>   s    Q    )	
sacremosesr@   r   	normalizepunc_normalizerImportErrorFileNotFoundErrorwarningswarn)r8   r@   s     r=   r4   !MarianTokenizer._setup_normalizer   sM    	/7#78H8H#I#S#SD ./ 	/MM@A#.D 	/s   '* .AArC   c                 6    U(       a  U R                  U5      $ S$ )zHCover moses empty string edge case. They return empty list for '' input! )rI   )r8   rC   s     r=   rH   MarianTokenizer.normalize   s    *+t##A&33rF   c                 p    XR                   ;   a  U R                   U   $ U R                   U R                     $ rB   )r3   r   )r8   tokens     r=   _convert_token_to_id$MarianTokenizer._convert_token_to_id   s6    (((''.. ##DNN33rF   textc                     / nUR                  S5      (       a5  UR                  S5      =nS:w  a  UR                  USUS-    5        XS-   S nX!4$ )z6Remove language codes like >>fr<< before sentencepiecer   r   N   )r,   findappend)r8   rV   codeend_locs       r=   remove_language_code$MarianTokenizer.remove_language_code   sW    ??4  4&@gR%GKK]w{+,!&DzrF   c                 l    U R                  U5      u  p!U R                  R                  U[        S9nX#-   $ )N)out_type)r^   r2   encoder&   )r8   rV   r\   piecess       r=   	_tokenizeMarianTokenizer._tokenize   s7    ..t4
!!(((<}rF   indexc                     XR                   ;   a  U R                   U   $ U R                  (       a  U R                  OU R                  nUR	                  U5      nU(       a  U$ U R
                  $ )z?Converts an index (integer) in a token (str) using the decoder.)r*   r5   r0   r1   	IdToPiecer   )r8   rf   	spm_modelpieces       r=   _convert_id_to_token$MarianTokenizer._convert_id_to_token   sS    LL <<&&'+'H'HDOOdoo	##E*u14>>1rF   c                 &   > [         TU ]  " U40 UD6$ )a  
Convert a list of lists of token ids into a list of strings by calling decode.

Args:
    sequences (`Union[list[int], list[list[int]], np.ndarray, torch.Tensor]`):
        List of tokenized input ids. Can be obtained using the `__call__` method.
    skip_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not to remove special tokens in the decoding.
    clean_up_tokenization_spaces (`bool`, *optional*):
        Whether or not to clean up the tokenization spaces. If `None`, will default to
        `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
    use_source_tokenizer (`bool`, *optional*, defaults to `False`):
        Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
        problems).
    kwargs (additional keyword arguments, *optional*):
        Will be passed to the underlying model specific decode method.

Returns:
    `list[str]`: The list of decoded sentences.
)r6   batch_decode)r8   	sequencesr9   r<   s      r=   rn   MarianTokenizer.batch_decode   s    * w#I888rF   c                 &   > [         TU ]  " U40 UD6$ )a_  
Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
tokens and clean up tokenization spaces.

Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

Args:
    token_ids (`Union[int, list[int], np.ndarray, torch.Tensor]`):
        List of tokenized input ids. Can be obtained using the `__call__` method.
    skip_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not to remove special tokens in the decoding.
    clean_up_tokenization_spaces (`bool`, *optional*):
        Whether or not to clean up the tokenization spaces. If `None`, will default to
        `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
    use_source_tokenizer (`bool`, *optional*, defaults to `False`):
        Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
        problems).
    kwargs (additional keyword arguments, *optional*):
        Will be passed to the underlying model specific decode method.

Returns:
    `str`: The decoded sentence.
)r6   decode)r8   	token_idsr9   r<   s      r=   rr   MarianTokenizer.decode   s    0 w~i2622rF   skip_special_tokensclean_up_tokenization_spacesc                 |   > U R                   (       + nUR                  SU5      U l        [        TU ]  " SUUUS.UD6$ )zCInternal decode method that handles use_source_tokenizer parameter.use_source_tokenizer)rs   ru   rv   r"   )r!   popr5   r6   _decode)r8   rs   ru   rv   r9   default_use_sourcer<   s         r=   rz   MarianTokenizer._decode  sR     "&!5!55,2JJ7MOa,b)w 
 3)E
 	
 	
rF   tokensc                 Z   U R                   (       a  U R                  OU R                  n/ nSnU H@  nXPR                  ;   a  XBR	                  U5      U-   S-   -  n/ nM/  UR                  U5        MB     XBR	                  U5      -  nUR                  [        S5      nUR                  5       $ )zQUses source spm if _decode_use_source_tokenizer is True, and target spm otherwiserP    )	r5   r0   r1   all_special_tokensdecode_piecesr[   replaceSPIECE_UNDERLINEstrip)r8   r}   sp_modelcurrent_sub_tokens
out_stringrS   s         r=   convert_tokens_to_string(MarianTokenizer.convert_tokens_to_string  s    &*&G&G4??T__
E///445GH5PSVVV
%'""))%0  	,,-?@@
''(8#>
!!rF   c                 J    Uc  XR                   /-   $ X-   U R                   /-   $ )z=Build model inputs from a sequence by appending eos_token_id.)eos_token_id)r8   token_ids_0token_ids_1s      r=    build_inputs_with_special_tokens0MarianTokenizer.build_inputs_with_special_tokens&  s1    "3"3!444(D,=,=+>>>rF   c                 H    U R                   U l        U R                  U l        g rB   )r0   r2   r%   r3   r8   s    r=   _switch_to_input_mode%MarianTokenizer._switch_to_input_mode-  s    ??#||rF   c                 l    U R                   U l        U R                  (       a  U R                  U l        g g rB   )r1   r2   r!   r(   r3   r   s    r=   _switch_to_target_mode&MarianTokenizer._switch_to_target_mode1  s*    ??#'#6#6D   rF   c                 ,    [        U R                  5      $ rB   )lenr%   r   s    r=   
vocab_sizeMarianTokenizer.vocab_size6  s    4<<  rF   save_directoryfilename_prefixc                    [         R                  R                  U5      (       d  [        R	                  SU S35        g / nU R
                  (       a  [         R                  R                  UU(       a  US-   OS[        S   -   5      n[         R                  R                  UU(       a  US-   OS[        S   -   5      n[        U R                  U5        [        U R                  U5        UR                  U5        UR                  U5        O\[         R                  R                  X(       a  US-   OS[        S   -   5      n[        U R                  U5        UR                  U5        [        [        S   [        S   /U R                  U R                  U R                  /5       GH$  u  pxn	[         R                  R                  X(       a  US-   OSU-   5      n
[         R                  R!                  U5      [         R                  R!                  U
5      :w  aB  [         R                  R#                  U5      (       a  [%        X5        UR                  U
5        M  [         R                  R#                  U5      (       a  M  ['        U
S	5       nU	R)                  5       nUR+                  U5        S S S 5        UR                  U
5        GM'     [-        U5      $ ! , (       d  f       N/= f)
NzVocabulary path (z) should be a directory-rP   r   r   r
   r   wb)ospathisdirloggererrorr!   joinVOCAB_FILES_NAMES	save_jsonr%   r(   r[   zipr.   r0   r1   abspathisfiler   openserialized_model_protowritetuple)r8   r   r   saved_filesout_src_vocab_fileout_tgt_vocab_fileout_vocab_filespm_save_filenamespm_orig_pathri   spm_save_pathficontent_spiece_models                r=   save_vocabularyMarianTokenizer.save_vocabulary:  sE   ww}}^,,LL,^,<<STU!#*93&rEVW^E__" "$*93&rEVWjEkk" dll$67d))+=>1212WW\\/3!6rUfgnUo oN dllN3~.;>|,.?.MNNN__doo.<
7i
 GGLL/3!6rUf fM ww}-1OOTVT[T[TbTbcpTqTq6""=1WW^^M22-."+4+K+K+M(HH12 / ""=1<
" [!! /.s   "K
K(	c                 "    U R                  5       $ rB   )get_src_vocabr   s    r=   	get_vocabMarianTokenizer.get_vocabg  s    !!##rF   c                 B    [        U R                  40 U R                  D6$ rB   )dictr%   added_tokens_encoderr   s    r=   r   MarianTokenizer.get_src_vocabj  s    DLL>D$=$=>>rF   c                 B    [        U R                  40 U R                  D6$ rB   )r   r(   added_tokens_decoderr   s    r=   get_tgt_vocabMarianTokenizer.get_tgt_vocabm  s    D''E4+D+DEErF   c                     U R                   R                  5       nUR                  [        R	                  / SQ5      5        U$ )N)r0   r1   r2   rI   r   )__dict__copyupdater   fromkeys)r8   states     r=   __getstate__MarianTokenizer.__getstate__p  s4    ""$MMmn	
 rF   dc                    ^  UT l         [        T S5      (       d  0 T l        [        T S5      (       d  ST l        U 4S jT R                   5       u  T l        T l        T R
                  T l        T R                  5         g )Nr   r5   Fc              3   P   >#    U  H  n[        UTR                  5      v   M     g 7frB   )r/   r   ).0fr8   s     r=   	<genexpr>/MarianTokenizer.__setstate__.<locals>.<genexpr>  s$     +fWeRSHQ8L8L,M,MWes   #&)	r   hasattrr   r5   r.   r0   r1   r2   r4   )r8   r   s   ` r=   __setstate__MarianTokenizer.__setstate__w  sj     t.//#%D t;<<05D-+fW[WeWe+f(?? rF   c                     g)zJust EOS   r"   )r8   argsr9   s      r=   num_special_tokens_to_add)MarianTokenizer.num_special_tokens_to_add  s    rF   c                     [        U R                  5      nUR                  U R                  5        U Vs/ s H  o3U;   a  SOSPM     sn$ s  snf )Nr   r   )setall_special_idsremoveunk_token_id)r8   seqr   rC   s       r=   _special_token_mask#MarianTokenizer._special_token_mask  sH    d223t001:=>#Q/)q0#>>>s   Ar   r   already_has_special_tokensc                     U(       a  U R                  U5      $ Uc  U R                  U5      S/-   $ U R                  X-   5      S/-   $ )zCGet list where entries are [1] if a token is [eos] or [pad] else 0.r   )r   )r8   r   r   r   s       r=   get_special_tokens_mask'MarianTokenizer.get_special_tokens_mask  sQ     &++K88 ++K8A3>>++K,EF!LLrF   )r   r5   r3   r2   r*   r%   rI   r!   r   r   r.   r0   r1   r+   r(   r   )	NNNz<unk>z</s>z<pad>i   NF)FNrB   )NF)*__name__
__module____qualname____firstlineno____doc__r   vocab_files_namesmodel_input_namesr   r&   r   r7   r4   rH   rT   r^   listrd   intrk   rn   rr   boolrz   r   r   r   r   propertyr   r   r   r   r   r   r   r   r   r   r   __static_attributes____classcell__)r<   s   @r=   r   r   ,   s   8t *$&67 15=
 c3h$.=
 
=
 =
~/43 43 44 c d3i 
2# 2# 29.3: %*48	
 "
 '+Tk	
 

 
""tCy "S " ?QUVYQZ ?,7
 !C ! !+"c +"C$J +"Z_`cZd +"Z$4 $?Fd !d !t !? fk	M	M.2Tk	M^b	M	c	M 	MrF   r   r   r   r   c                 T    [         R                  " S0 UD6nUR                  U 5        U$ )Nr"   )r   SentencePieceProcessorLoad)r   r   spms      r=   r/   r/     s%    

.
.
A
ACHHTNJrF   c                 z    [        US5       n[        R                  " XSS9  S S S 5        g ! , (       d  f       g = f)NwrY   )indent)r   jsondump)datar   r   s      r=   r   r     s%    	dCA		$!$ 
s   ,
:c                 |    [        U S5       n[        R                  " U5      sS S S 5        $ ! , (       d  f       g = f)Nr)r   r   load)r   r   s     r=   r$   r$     s"    	dCAyy| 
s   -
;)r   r   rL   pathlibr   shutilr   typingr   r   tokenization_pythonr   utilsr   utils.import_utilsr	   
get_loggerr   r   r   r   r   r&   r   r   r/   r   r   r$   __all__r"   rF   r=   <module>r     s     	      6  * 
		H	% ,4   
 
%&iM) iM 'iMX3 c3h M<`<` %# %$ %
C D4K 
 
rF   