
    Z j#                     J   S SK r S SKrS SKJrJr  S SKJr  S SKrS SKJ	r	  S SK
Jr  SSKJr  SSKJr  SS	KJrJr  S
SKJrJrJrJr  \R0                  " \5      r\" \R8                  " 5       5      r\" S \ 5       5      r\ " S S5      5       r  " S S\5      r! " S S\5      r"g)    N)	dataclassfield)Enum)FileLock)Dataset   )$MODEL_FOR_QUESTION_ANSWERING_MAPPING)PreTrainedTokenizer)check_torch_load_is_safelogging   )SquadFeaturesSquadV1ProcessorSquadV2Processor"squad_convert_examples_to_featuresc              #   8   #    U  H  oR                   v   M     g 7fN)
model_type).0confs     q/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/data/datasets/squad.py	<genexpr>r   !   s     E0DOO0Ds   c                      \ rS rSr% Sr\" SSSSR                  \5      -   0S9r\	\
S'   \" SSS	0S9r\	\
S
'   \" SSS0S9r\\
S'   \" SSS0S9r\\
S'   \" SSS0S9r\\
S'   \" SSS0S9r\\
S'   \" SSS0S9r\\
S'   \" SSS0S9r\\
S'   \" SSS0S9r\\
S'   \" SSS0S9r\\
S'   \" S SS!0S9r\\
S"'   \" S#SS$0S9r\\
S%'   S&rg)'SquadDataTrainingArguments$   zZ
Arguments pertaining to what data we are going to input our model for training and eval.
Nhelpz!Model type selected in the list: z, )defaultmetadatar   zFThe input data dir. Should contain the .json files for the SQuAD task.data_dir   zThe maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.max_seq_lengthzVWhen splitting up a long document into chunks, how much stride to take between chunks.
doc_stride@   zkThe maximum number of tokens for the question. Questions longer than this will be truncated to this length.max_query_length   zThe maximum length of an answer that can be generated. This is needed because the start and end predictions are not conditioned on one another.max_answer_lengthFz1Overwrite the cached training and evaluation setsoverwrite_cachezDIf true, the SQuAD examples contain some that do not have an answer.version_2_with_negativeg        zIIf null_score - best_non_null is greater than the threshold predict null.null_score_diff_threshold   n_best_sizer   zjlanguage id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)lang_id   z3multiple threads for converting example to featuresthreads )__name__
__module____qualname____firstlineno____doc__r   joinMODEL_TYPESr   str__annotations__r   r!   intr"   r$   r&   r'   boolr(   r)   floatr+   r,   r.   __static_attributes__r/       r   r   r   $   s    (KdiiXcNd(deJ  (pqHc   Q
NC  rsJ  "/
c  #J
s  ")\ ]OT  %*)o p%T  (-v'rs(u  f&qrK  C
GS  f6k-lmGSmr=   r   c                       \ rS rSrSrSrSrg)Splitg   traindevr/   N)r0   r1   r2   r3   rA   rB   r<   r/   r=   r   r?   r?   g   s    E
Cr=   r?   c                       \ rS rSr% \\S'   \\   \S'   \\S'   \	\S'   S\R                  SSS4S\S	\S
\S-  S\\-  S\	S\S-  S\4S jjrS rS\\\R$                  4   4S jrSrg)SquadDatasetl   argsfeaturesmodeis_language_sensitiveNFpt	tokenizerlimit_length	cache_dirdataset_formatc                    Xl         XPl        UR                  (       a
  [        5       O	[	        5       U l        [        U[        5      (       a
   [        U   nX@l
        UR                  (       a  SOSn[        R                  R                  Ub  UOUR                  SUR                   SUR                   R"                   SUR$                   SU 35      n	U	S-   n
['        U
5         [        R                  R)                  U	5      (       Ga  UR*                  (       d  [,        R,                  " 5       n[/        5         [0        R2                  " U	SS9U l        U R4                  S	   U l        U R4                  R9                  S
S 5      U l        U R4                  R9                  SS 5      U l        [>        RA                  SU	 S3[,        R,                  " 5       U-
  5        U R:                  b  U R<                  c  [>        RC                  SU	 S35        GOJU[        RD                  :X  a+  U R
                  RG                  UR                  5      U l        O*U R
                  RI                  UR                  5      U l        [K        U R<                  UUR$                  URL                  URN                  U[        RP                  :H  URR                  US9u  U l        U l        [,        R,                  " 5       n[0        RT                  " U R6                  U R:                  U R<                  S.U	5        [>        RA                  SU	 S[,        R,                  " 5       U-
  S S35        S S S 5        g ! [         a    [        S5      ef = f! , (       d  f       g = f)Nzmode is not a valid split namev2v1cached__z.lockT)weights_onlyrG   datasetexamplesz"Loading features from cached file z [took %.3f s]zDeleting cached file z; will allow dataset and examples to be cached in future run)rV   rK   r!   r"   r$   is_trainingr.   return_dataset)rG   rU   rV   z!Saving features into cached file z [took z.3fz s])+rF   rI   r(   r   r   	processor
isinstancer7   r?   KeyErrorrH   ospathr5   r   value	__class__r0   r!   r   existsr'   timer   torchloadold_featuresrG   getrU   rV   loggerinfowarningrB   get_dev_examplesget_train_examplesr   r"   r$   rA   r.   save)selfrF   rK   rL   rH   rI   rM   rN   version_tagcached_features_file	lock_pathstarts               r   __init__SquadDataset.__init__r   s    	%:"/3/K/K)+QaQcdC  AT{ 	"::d!ww||".IDMMdjj\9#6#6#?#?"@$BUBUAVVWXcWde 
 )72	i ww~~233D<P<P		(*$)JJ/CRV$W! !% 1 1* =#0044YE $ 1 1 5 5j$ G89M8Nn]_c_h_h_jmr_r <<'4==+@NN/0D/E F& &
 599$$(NN$C$CDMM$RDM$(NN$E$Edmm$TDM.P!]]'#'#6#6#%)%:%: $ 3 LL#1	/+t| 		

!%4<<UYUbUbc(
 78L7MWUYU^U^U`chUhilTmmpqW !   A?@@A ! s   	M" $I5M;"M8;
N	c                 ,    [        U R                  5      $ r   )lenrG   )rl   s    r   __len__SquadDataset.__len__   s    4==!!r=   returnc                    U R                   U   n[        R                  " UR                  [        R                  S9n[        R                  " UR
                  [        R                  S9n[        R                  " UR                  [        R                  S9n[        R                  " UR                  [        R                  S9n[        R                  " UR                  [        R                  S9n[        R                  " UR                  [        R                  S9nUUUS.n	U R                  R                  S;   a  U	S	 U R                  R                  S;   a  U	R                  XgS.5        U R                  R                  (       a  U	R                  SU05        U R                  (       aU  U	R                  S[        R                   " UR"                  [        R$                  S9U R                  R&                  -  05        U R(                  [*        R,                  :X  am  [        R                  " UR.                  [        R                  S9n
[        R                  " UR0                  [        R                  S9nU	R                  XS	.5        U	$ )
N)dtype)	input_idsattention_masktoken_type_ids)xlmroberta
distilbert	camembertr|   )xlnetr}   )	cls_indexp_maskis_impossiblelangs)start_positionsend_positions)rG   rb   tensorrz   longr{   r|   r   r   r;   r   rF   r   updater(   rI   onesshapeint64r,   rH   r?   rA   start_positionend_position)rl   ifeaturerz   r{   r|   r   r   r   inputsr   r   s               r   __getitem__SquadDataset.__getitem__   s   --"LL!2!2%**E	g&<&<EJJOg&<&<EJJOLL!2!2%**E	gnnEKK@W%:%:%++N #,,
 99#PP'(99#33MM	DEyy00>?))wIOO5;;)WZ^ZcZcZkZk)kmn99##ll7+A+ATO!LL)=)=UZZPMMMo^_r=   )rF   rU   rV   rG   rI   rH   rd   rY   )r0   r1   r2   r3   r   r8   listr   r?   r:   rA   r
   r9   r7   rq   ru   dictrb   Tensorr   r<   r/   r=   r   rD   rD   l   s    
$$=!!
K $(!KK&+ $"J(J 'J Dj	J
 EkJ  $J :J JX" S%,,%6 7  r=   rD   )#r\   ra   dataclassesr   r   enumr   rb   filelockr   torch.utils.datar   models.auto.modeling_autor	   tokenization_pythonr
   utilsr   r   processors.squadr   r   r   r   
get_loggerr0   rf   r   keysMODEL_CONFIG_CLASSEStupler6   r   r?   rD   r/   r=   r   <module>r      s    
  (    $ M 6 6 t t 
		H	%@EEGH E0DEE ?n ?n ?nDD 
u7 ur=   