
    Z jN                     f    S r SSKJr  SSKJr  SSKJrJrJr  SSK	J
r
  \
 " S S\5      5       rS/rg)	z
Processor class for MarkupLM.
   )
TensorType)ProcessorMixin)BatchEncodingPaddingStrategyTruncationStrategy)auto_docstringc                       ^  \ rS rSrSrU 4S jr\                   SS\S\\-  \	-  S\\-  \
-  S\S-  S	\S
\S-  S\S-  S\S-  S\S\S\S\S\S\\-  S-  S\4S jj5       rSrU =r$ )MarkupLMProcessor   Tc                 $   > [         TU ]  X5        g )N)super__init__)selffeature_extractor	tokenizer	__class__s      ځ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/markuplm/processing_markuplm.pyr   MarkupLMProcessor.__init__   s    *6    Nadd_special_tokenspadding
truncation
max_lengthstridepad_to_multiple_ofreturn_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverbosereturn_tensorsreturnc                    U R                   (       a>  Uc  [        S5      eUc  Uc  Ub  [        S5      eU R                  U5      nUS   nUS   nOUb  [        S5      eUb  Uc  [        S5      eUb)  U R                   (       a  [        U[        5      (       a  U/nU R
                  " S0 SUb  UOU_S	Ub  UOS_SU_S
U_SU_SU_SU_SU	_SU
_SU_SU_SU_SU_SU_SU_SU_SU_SU_UD6nU$ )a  
html_strings (`str` or `list[str]`, *optional*):
    Raw HTML strings to parse and process. When `parse_html=True` (default), these strings are parsed
    to extract nodes and xpaths automatically. If provided, `nodes`, `xpaths`, and `node_labels` should
    not be provided. Required when `parse_html=True`.
nodes (`list[list[str]]`, *optional*):
    Pre-extracted HTML nodes as a list of lists, where each inner list contains the text content of nodes
    for a single document. Required when `parse_html=False`. Should not be provided when `parse_html=True`.
xpaths (`list[list[str]]`, *optional*):
    Pre-extracted XPath expressions corresponding to the nodes. Should be a list of lists with the same
    structure as `nodes`, where each XPath identifies the location of the corresponding node in the HTML
    tree. Required when `parse_html=False`. Should not be provided when `parse_html=True`.
node_labels (`list[list[int]]`, *optional*):
    Labels for the nodes, typically used for training or fine-tuning tasks. Should be a list of lists
    with the same structure as `nodes`, where each label corresponds to a node. Optional and only used
    when `parse_html=False`.
questions (`str` or `list[str]`, *optional*):
    Question strings for question-answering tasks. When provided, the tokenizer processes questions
    as the first sequence and nodes as the second sequence (text_pair). If a single string is provided,
    it is converted to a list to match the batch dimension of the parsed HTML.
NzDMake sure to pass HTML strings in case `parse_html` is set to `True`zUPlease don't pass nodes, xpaths nor node labels in case `parse_html` is set to `True`nodesxpathsz@You have passed HTML strings but `parse_html` is set to `False`.zIMake sure to pass nodes and xpaths in case `parse_html` is set to `False`text	text_pairnode_labelsr   r   r   r   r   r   r   r   r   r   r    r!   r"   r#    )
parse_html
ValueErrorr   
isinstancestrr   )r   html_stringsr&   r'   r*   	questionsr   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   kwargsfeaturesencoded_inputss                          r   __call__MarkupLMProcessor.__call__   s   \ ??# !ghh F$6+:Q k  --l;HW%Eh'F' !cdd} !lmm  T__)S))&K	 
'3
(4e$
 
 $	

  2
 
 "
 "
 
  2
 #8
 #8
 '@
 (B
 $:
  (!
" #
$ *'
, r   r+   )NNNNNTFNN    NNNFFFFTN)__name__
__module____qualname____firstlineno__r,   r   r   boolr/   r   r   intr   r   r5   __static_attributes____classcell__)r   s   @r   r
   r
      s0   J7  #'056:!%)--1-1*/+0',#26)Z !Z o-Z 3J!33Z $JZ Z  $JZ  $d{Z  $d{Z $(Z  %)!Z" !%#Z$ %Z& 'Z( j(4/)Z, 
-Z Zr   r
   N)__doc__
file_utilsr   processing_utilsr   tokenization_utils_baser   r   r   utilsr   r
   __all__r+   r   r   <module>rF      sD    % . Y Y # a a aH 
r   