
    Z jC_              	       6   S r SSKrSSKrSSKrSSKrSSKJr  SSKJr  \R                  " \
5      rSSS.rS	 r " S
 S\5      r SrSr\S\SSSSSS4	r\R$                  " SSR'                  \5      -  \R(                  \R*                  -  \R,                  -  5      r\R$                  " S5      r\R$                  " \\R(                  \R*                  -  \R,                  -  5      r\R$                  " S5      rS S jrS!S jr " S S5      rS rS rS"S jr S/r!g)#z!Tokenization classes for BERTweet    N   )PreTrainedTokenizer)logging	vocab.txt	bpe.codes)
vocab_filemerges_filec                 z    [        5       nU S   nU SS  H  nUR                  X#45        UnM     [        U5      nU$ )zy
Return set of symbol pairs in a word.

Word is represented as tuple of symbols (symbols being variable-length strings).
r      N)setadd)wordpairs	prev_charchars       ڃ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/bertweet/tokenization_bertweet.py	get_pairsr   #   sH     EEQIQR		9#$	  JEL    c            	          ^  \ rS rSrSr\r        SU 4S jjr\S 5       r	S r
S rS rS rS	 rS
 rS rS rSS\S\S-  S\\S4   4S jjrS rSrU =r$ )BertweetTokenizer3   a8	  
Constructs a BERTweet tokenizer, using Byte-Pair-Encoding.

This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.

Args:
    vocab_file (`str`):
        Path to the vocabulary file.
    merges_file (`str`):
        Path to the merges file.
    normalization (`bool`, *optional*, defaults to `False`):
        Whether or not to apply a normalization preprocess.
    bos_token (`str`, *optional*, defaults to `"<s>"`):
        The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

        <Tip>

        When building a sequence using special tokens, this is not the token that is used for the beginning of
        sequence. The token used is the `cls_token`.

        </Tip>

    eos_token (`str`, *optional*, defaults to `"</s>"`):
        The end of sequence token.

        <Tip>

        When building a sequence using special tokens, this is not the token that is used for the end of sequence.
        The token used is the `sep_token`.

        </Tip>

    sep_token (`str`, *optional*, defaults to `"</s>"`):
        The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
        sequence classification or for a text and a question for question answering. It is also used as the last
        token of a sequence built with special tokens.
    cls_token (`str`, *optional*, defaults to `"<s>"`):
        The classifier token which is used when doing sequence classification (classification of the whole sequence
        instead of per-token classification). It is the first token of the sequence when built with special tokens.
    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    pad_token (`str`, *optional*, defaults to `"<pad>"`):
        The token used for padding, for example when batching sequences of different lengths.
    mask_token (`str`, *optional*, defaults to `"<mask>"`):
        The token used for masking values. This is the token used when training this model with masked language
        modeling. This is the token which the model will try to predict.
c                   >  SSK Jn  Xl        Xl        X l        0 U l        SU R                  [        U5      '   SU R                  [        U	5      '   SU R                  [        U5      '   SU R                  [        U5      '   U R                  U5        U R                  R                  5        VVs0 s H  u  pX_M	     snnU l        [        USS9 nUR                  5       R                  S	5      S S
 nS S S 5        W Vs/ s H  n[!        UR                  5       S S
 5      PM!     nn[#        [%        U['        [)        U5      5      5      5      U l        0 U l        X0l        [1        5       U l        SSS.U l        [6        TU ]p  " SUUUUUUU	U
SSSS.UD6  g ! [         a     [        R                  S5        S U l         GNf = fs  snnf ! , (       d  f       N= fs  snf )Nr   )demojizezsemoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0r      r   utf-8encoding
'z...)u   ’u   …	all_zerosTcls_double_sep)normalization	bos_token	eos_token	sep_token	cls_token	unk_token	pad_token
mask_tokentoken_type_ids_pattern%token_type_ids_include_special_tokensspecial_tokens_pattern )emojir   	demojizerImportErrorloggerwarningr   r	   encoderstradd_from_fileitemsdecoderopenreadsplittupledictziprangelen	bpe_rankscacher#   TweetTokenizertweetPreprocessorspecial_punctssuper__init__)selfr   r	   r#   r$   r%   r&   r'   r(   r)   r*   kwargsr   kvmerges_handlemergesmerge	__class__s                     r   rG   BertweetTokenizer.__init__h   s   		"&%N %&'(S^$'(S^$'(S^$'(S^$:&)-););)=>)=)=>+0M"'')//5cr:F 19?@%cr*+@c&%F*<=>
*!/!1&)%8 	
'!#.26#3	
 	
=  	"NN( "DN	"$ ?00@s)   F 0G#G
 &G&G G

Gc                 ,    [        U R                  5      $ N)r@   r4   rH   s    r   
vocab_sizeBertweetTokenizer.vocab_size   s    4<<  r   c                 B    [        U R                  40 U R                  D6$ rR   )r=   r4   added_tokens_encoderrS   s    r   	get_vocabBertweetTokenizer.get_vocab   s    DLL>D$=$=>>r   c                 6  ^  UT R                   ;   a  T R                   U   $ [        U5      n[        [        US S 5      US   S-   /-   5      n[        U5      nU(       d  U$  [	        UU 4S jS9nUT R
                  ;  a  OUu  pV/ nSnU[        U5      :  a   UR                  XX5      n	UR                  X(U	 5        U	nX(   U:X  a6  U[        U5      S-
  :  a$  X(S-      U:X  a  UR                  XV-   5        US-  nOUR                  X(   5        US-  nU[        U5      :  a  M  [        U5      nUn[        U5      S:X  a  O[        U5      nM  SR                  U5      nUS S	 nUT R                   U'   U$ ! [         a    UR                  X(S  5         Mq  f = f)
Nr   z</w>c                 N   > TR                   R                  U [        S5      5      $ )Ninf)rA   getfloat)pairrH   s    r   <lambda>'BertweetTokenizer.bpe.<locals>.<lambda>   s    1C1CD%PU,1Wr   keyr   r   r   @@ )rB   r<   listr   minrA   r@   indexextend
ValueErrorappendjoin)
rH   tokenr   r   bigramfirstsecondnew_wordijs
   `         r   bpeBertweetTokenizer.bpe   s   DJJ::e$$U|T$s)_R6(9'::;$L$WXFT^^+"MEHAc$i-

5,A
 OOD1I.A7e#CIM(9dq5kV>SOOEN3FAOODG,FA c$i-  XHD4yA~!$9 : zz$CRy 

5- " OODH-s   E7 7FFc                     U R                   (       a  U R                  U5      n/ n[        R                  " SU5      nU H;  nUR	                  [        U R                  U5      R                  S5      5      5        M=     U$ )zTokenize a string.z\S+\n? )r#   normalizeTweetrefindallri   rf   rt   r;   )rH   textsplit_tokenswordsrm   s        r   	_tokenizeBertweetTokenizer._tokenize   sf    &&t,D

9d+ETXXe_%:%:3%? @A r   c                 .   U R                    H!  nUR                  X R                   U   5      nM#     U R                  R                  U5      nSR	                  U Vs/ s H  o@R                  U5      PM     sn5      nUR                  SS5      R                  SS5      R                  SS5      R                  SS5      R                  S	S
5      nUR                  SS5      R                  SS5      R                  SS5      R                  SS5      R                  SS5      R                  SS5      nUR                  SS5      R                  SS5      R                  SS5      R                  SS5      nSR	                  UR                  5       5      $ s  snf )z
Normalize a raw Tweet
rw   zcannot zcan not zn't z n't zn 't zca n'tzcan'tzai n'tzain'tz'm z 'm z're z 're z's z 's z'll z 'll z'd z 'd z've z 've z p . m .z  p.m.z p . m z p.m z a . m .z a.m.z a . m z a.m )rE   replacerD   tokenizerl   normalizeTokenr;   )rH   tweetpuncttokensrm   	normTweets         r   rx    BertweetTokenizer.normalizeTweet   s^    ((EMM%)<)<U)CDE ) ''007HHfMfU11%8fMN	 i4WVW%WWg&WXw'WXw' 	 eV,WVW%WUF#WVW%WUF#WVW% 	 j(3WY(WZ)WY(	 	 xx	)**1 Ns   Fc                 H   UR                  5       nUR                  S5      (       a  gUR                  S5      (       d  UR                  S5      (       a  g[        U5      S:X  a>  XR                  ;   a  U R                  U   $ U R                  b  U R	                  U5      $ U$ U$ )z
Normalize tokens in a Tweet
@z@USERhttpwwwHTTPURLr   )lower
startswithr@   rE   r0   )rH   rm   lowercased_tokens      r   r    BertweetTokenizer.normalizeToken  s     !;;=C  ((004D4O4OPU4V4VZ1_+++**511~~)~~e,,Lr   c                 ~    U R                   R                  XR                   R                  U R                  5      5      $ )z0Converts a token (str) in an id using the vocab.)r4   r]   r(   )rH   rm   s     r   _convert_token_to_id&BertweetTokenizer._convert_token_to_id  s*    ||||'7'7'GHHr   c                 L    U R                   R                  XR                  5      $ )z=Converts an index (integer) in a token (str) using the vocab.)r8   r]   r(   )rH   rh   s     r   _convert_id_to_token&BertweetTokenizer._convert_id_to_token  s    ||~~66r   c                 d    SR                  U5      R                  SS5      R                  5       nU$ )z:Converts a sequence of tokens (string) in a single string.rw   rd    )rl   r   strip)rH   r   
out_strings      r   convert_tokens_to_string*BertweetTokenizer.convert_tokens_to_string#  s,    XXf%--eR8>>@
r   Nsave_directoryfilename_prefixreturn.c           	      "   [         R                  R                  U5      (       d  [        R	                  SU S35        g[        U S0 5      nU(       a  U S3OSn[         R                  R                  XUR                  SS5      -   5      n[        US	S
S9 n[        U R                  R                  5       S S9 H$  u  pxUS:  d  M  UR                  U SU S35        M&     SSS5        [         R                  R                  XUR                  SS5      -   5      n	[        U	S	S
S9 n
U
R                  S [        U R                  R                  5       S S9 5       5        SSS5        XY4$ ! , (       d  f       N= f! , (       d  f       XY4$ = f)z6
Save the vocabulary and merges files to a directory.
zVocabulary path (z) should be a directoryr.   vocab_files_names-r   r   r   wr   r   c                     U S   $ Nr   r.   kvs    r   r`   3BertweetTokenizer.save_vocabulary.<locals>.<lambda>=  s	    rRSur   rb      rw   r   Nr	   r   c              3   N   #    U  H  u  pS R                  U5      S-   v   M     g7f)rw   r   N)rl   ).0
bpe_tokenstoken_indexs      r   	<genexpr>4BertweetTokenizer.save_vocabulary.<locals>.<genexpr>E  s(      /c+J $t+/cs   #%c                     U S   $ r   r.   r   s    r   r`   r   G  s    ]_`a]br   )ospathisdirr2   errorgetattrrl   r]   r9   sortedr4   r7   write
writelinesrA   )rH   r   r   r   prefixr   frm   token_id
merge_filewriters              r   save_vocabulary!BertweetTokenizer.save_vocabulary.  se    ww}}^,,LL,^,<<STU#D*=rB*9O$A&r WW\\.;L;P;PQ]_j;k2kl
*cG4#)$,,*<*<*>DT#Uq=GGugQxj34 $V 5 WW\\.;L;P;PQ^`k;l2lm
*cG4 /5dnn6J6J6LRb/c  5 '' 54 54 ''s   -E-E-(:E>-
E;>
Fc                    [        U[        5      (       a'   [        USSS9 nU R                  U5        SSS5        gUR                  5       nU H\  nUR                  5       nUR                  S5      nUS:X  a  [        S	5      eUSU n[        U R                  5      U R                  U'   M^     g! , (       d  f       g= f! [         a  nUeSnAf[
         a    [        SU S35      ef = f)
zY
Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
rr   r   NzIncorrect encoding detected in z, please rebuild the datasetrw   r   z5Incorrect dictionary format, expected '<token> <cnt>')
isinstancer5   r9   r6   FileNotFoundErrorUnicodeError	Exception	readlinesr   rfindrj   r@   r4   )	rH   r   fdfnfelineslineTmplineidxr   s	            r   r6   BertweetTokenizer.add_from_fileL  s     ac!S73r&&r* 4 G==?D**S/Cby !XYY:D!$T\\!2DLL  43 	 % 
 c"A!D` abbcs3   C B0C 0
B>:C >C 
C*CC*)
rA   rB   r8   r0   r4   r	   r#   rE   rD   r   )F<s></s>r   r   z<unk>z<pad>z<mask>rR   )__name__
__module____qualname____firstlineno____doc__VOCAB_FILES_NAMESr   rG   propertyrT   rX   rt   r~   rx   r   r   r   r   r5   r<   r   r6   __static_attributes____classcell__)rO   s   @r   r   r   3   s    0d * >
@ ! !?*X	 +D&I7(c (C$J (Z_`ceh`hZi (<3 3r   r   ac  
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      <3                         # heart
    )u  			# Capture 1: entire matched URL
  (?:
  https?:				# URL protocol and colon
    (?:
      /{1,3}				# 1-3 slashes
      |					#   or
      [a-z0-9%]				# Single letter or digit or '%'
                                       # (Trying not to match e.g. "URI::Escape")
    )
    |					#   or
                                       # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:					# One or more:
    [^\s()<>{}\[\]]+			# Run of non-space, non-()<>{}[]
    |					#   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
  )+
  (?:					# End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
    |					#   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]	# not a space or one of these punct chars
  )
  |					# OR, the following to match naked domains:
  (?:
    (?<!@)			        # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)			        # not succeeded by a @,
                            # avoid matching "foo.na" in "foo.na@example.com"
  )
a	  
    (?:
      (?:            # (international)
        \+?[01]
        [ *\-.\)]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [ *\-.\)]*
      )?
      \d{3}          # exchange
      [ *\-.\)]*
      \d{4}          # base
    )z	<[^>\s]+>z[\-]+>|<[\-]+z(?:@[\w_]+)z(?:\#+[\w_]+[\w\'_\-]*[\w_]+)z#[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]a  
    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    z(%s)|z([^a-zA-Z0-9])\1{3,}z&(#?(x?))([^&;\s]+);c                 \    Uc  Sn[        U [        5      (       a  U R                  X5      $ U $ )Nr   )r   bytesdecode)r{   r   errorss      r   _str_to_unicoder     s.    ${{8,,Kr   c                 R   ^^ UU4S jn[         R                  U[        X5      5      $ )u  
Remove entities from text by converting them to their corresponding unicode character.

Args:
    text:
        A unicode string or a byte string encoded in the given *encoding* (which defaults to 'utf-8').
    keep (list):
        List of entity names which should not be replaced. This supports both numeric entities (`&#nnnn;` and
        `&#hhhh;`) and named entities (such as `&nbsp;` or `&gt;`).
    remove_illegal (bool):
        If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are
        kept "as is".
Returns: A unicode string with the entities removed.

See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py

Examples:

```python
>>> from nltk.tokenize.casual import _replace_html_entities

>>> _replace_html_entities(b"Price: &pound;100")
'Price: \xa3100'

>>> print(_replace_html_entities(b"Price: &pound;100"))
Price: £100
```c                 ,  > U R                  S5      nU R                  S5      (       a\   U R                  S5      (       a  [        US5      nO[        US5      nSUs=::  a  S::  a  O  O\[        U45      R                  S5      $ O@UT;   a  U R                  S	5      $ [
        R                  R                  R                  U5      nUb   [        U5      $ T(       a  S
$ U R                  S	5      $ ! [         a    S n N7f = f! [        [        4 a     N>f = f)Nr   r   r      
         cp1252r   r   )groupintr   r   rj   htmlentitiesname2codepointr]   chrOverflowError)matchentity_bodynumberkeepremove_illegals      r   _convert_entity/_replace_html_entities.<locals>._convert_entity;  s    kk!n;;q>>;;q>> b1F b1F
 6)T) &+228<< *
 d"{{1~%5599+F6{" $r7Q7   . s)   >C. )C. 	
D  .C=<C= DD)ENT_REsubr   )r{   r   r   r   r   s    ``  r   _replace_html_entitiesr     s     :8: ::ot'FGGr   c                   (    \ rS rSrSrSS jrS rSrg)rC   i^  a  
Examples:

```python
>>> # Tokenizer for tweets.
>>> from nltk.tokenize import TweetTokenizer

>>> tknzr = TweetTokenizer()
>>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
>>> tknzr.tokenize(s0)
['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']

>>> # Examples using *strip_handles* and *reduce_len parameters*:
>>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
>>> s1 = "@remy: This is waaaaayyyy too much for you!!!!!!"
>>> tknzr.tokenize(s1)
[':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
```c                 (    Xl         X l        X0l        g rR   preserve_case
reduce_lenstrip_handles)rH   r   r   r   s       r   rG   TweetTokenizer.__init__r  s    *$*r   c                    [        U5      nU R                  (       a  [        U5      nU R                  (       a  [	        U5      n[
        R                  SU5      n[        R                  U5      nU R                  (       d<  U Vs/ s H/  n[        R                  U5      (       a  UOUR                  5       PM1     nnU$ s  snf )z
Args:
    text: str

Returns: list(str) A tokenized list of strings; concatenating this list returns the original string if
`preserve_case=False`
\1\1\1)r   r   remove_handlesr   reduce_lengtheningHANG_REr   WORD_RErz   r   EMOTICON_REsearchr   )rH   r{   	safe_textr}   xs        r   r   TweetTokenizer.tokenizew  s     &d+!$'D??%d+DKK	40		*!!HMN1+,,Q//QQWWY>EN Os   6B>r   NTFF)r   r   r   r   r   rG   r   r   r.   r   r   rC   rC   ^  s    &+
r   rC   c                 R    [         R                  " S5      nUR                  SU 5      $ )zY
Replace repeated character sequences of length 3 or greater with sequences of length 3.
z	(.)\1{2,}r  regexcompiler   r{   patterns     r   r  r    s#     mmL)G;;y$''r   c                 R    [         R                  " S5      nUR                  SU 5      $ )z,
Remove Twitter username handles from text.
zv(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)rw   r  r  s     r   r  r    s+     mm 	BG ;;sD!!r   c                 4    [        XUS9R                  U 5      $ )z2
Convenience function for wrapping the tokenizer.
r   )rC   r   )r{   r   r   r   s       r   casual_tokenizer    s"     \ijss r   )Nstrict)r.   Tr   r  )"r   r   r   ry   r  tokenization_pythonr   utilsr   
get_loggerr   r2   r   r   r   	EMOTICONSURLSREGEXPSr  rl   VERBOSEIUNICODEr  r  r  r   r   r   rC   r  r  r  __all__r.   r   r   <module>r      sG   (  	 	  6  
		H	%   n3+ n3x	L		$)\ 		  (.
A+` --chhw&779PSXS`S`9`
a --/
0 mmIu}}uww'>'NO 
.	/:H@0 0p("  
r   