
    Z j                         S r SSKJr  SSKJr  SSKJrJr  \R                  " \	5      r
\" SS9\ " S S	\5      5       5       rS	/rg
)zXLNet configuration    )strict   )PreTrainedConfig)auto_docstringloggingzxlnet/xlnet-large-cased)
checkpointc                   D  ^  \ rS rSr% SrSrS/rSSSSS	.rS
r\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	S-  \
S'   Sr\\
S'   Sr\\
S'   Sr\\
S'   Sr\\
S'   Sr\\	-  \
S'   Sr\	S-  \
S'   Sr\	S-  \
S'   Sr\\
S '   S!r\\
S"'   S!r\\
S#'   S$r\	\
S%'   S!r\\
S&'   S'r\\
S('   Sr \\
S)'   S*r!\\
S+'   Sr"\\	-  \
S,'   S-r#\	\
S.'   S-r$\	\
S/'   S-r%\	S-  \
S0'   S1r&\	S-  \
S2'   S3r'\	\(\	   -  S-  \
S4'   Sr)\\
S5'   U 4S6 jr*S7 r+\,S8 5       r-\-R\                  S9 5       r-S:r/U =r0$ );XLNetConfig   a  
ff_activation (`str` or `Callable`, *optional*, defaults to `"gelu"`):
    The non-linear activation function (function or string) in the If string, `"gelu"`, `"relu"`, `"silu"` and
    `"gelu_new"` are supported.
attn_type (`str`, *optional*, defaults to `"bi"`):
    The attention type used by the model. Set `"bi"` for XLNet, `"uni"` for Transformer-XL.
mem_len (`int` or `None`, *optional*):
    The number of tokens to cache. The key/value pairs that have already been pre-computed in a previous
    forward pass won't be re-computed. See the
    [quickstart](https://huggingface.co/transformers/quickstart.html#using-the-past) for more information.
reuse_len (`int`, *optional*):
    The number of tokens in the current batch to be cached and reused in the future.
use_mems_eval (`bool`, *optional*, defaults to `True`):
    Whether or not the model should make use of the recurrent memory mechanism in evaluation mode.
use_mems_train (`bool`, *optional*, defaults to `False`):
    Whether or not the model should make use of the recurrent memory mechanism in train mode.
    <Tip>
    For pretraining, it is recommended to set `use_mems_train` to `True`. For fine-tuning, it is recommended to
    set `use_mems_train` to `False` as discussed
    [here](https://github.com/zihangdai/xlnet/issues/41#issuecomment-505102587). If `use_mems_train` is set to
    `True`, one has to make sure that the train batches are correctly pre-processed, *e.g.* `batch_1 = [[This
    line is], [This is the]]` and `batch_2 = [[ the first line], [ second line]]` and that all batches are of
    equal size.
    </Tip>
bi_data (`bool`, *optional*, defaults to `False`):
    Whether or not to use bidirectional input pipeline. Usually set to `True` during pretraining and `False`
    during finetuning.
clamp_len (`int`, *optional*, defaults to -1):
    Clamp all relative distances larger than clamp_len. Setting this attribute to -1 means no clamping.
same_length (`bool`, *optional*, defaults to `False`):
    Whether or not to use the same attention length for each token.
summary_type (`str`, *optional*, defaults to "last"):
    Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
    Has to be one of the following options:
        - `"last"`: Take the last token hidden state (like XLNet).
        - `"first"`: Take the first token hidden state (like BERT).
        - `"mean"`: Take the mean of all tokens hidden states.
        - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
        - `"attn"`: Not implemented now, use multi-head attention.
summary_use_proj (`bool`, *optional*, defaults to `True`):
    Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
    Whether or not to add a projection after the vector extraction.
summary_activation (`str`, *optional*):
    Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
    Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
summary_last_dropout (`float`, *optional*, defaults to 0.1):
    Used in the sequence classification and multiple choice models.
    The dropout ratio to be used after the projection and activation.
start_n_top (`int`, *optional*, defaults to 5):
    Used in the SQuAD evaluation script.
end_n_top (`int`, *optional*, defaults to 5):
    Used in the SQuAD evaluation script.

Examples:

```python
>>> from transformers import XLNetConfig, XLNetModel

>>> # Initializing a XLNet configuration
>>> configuration = XLNetConfig()

>>> # Initializing a model (with random weights) from the configuration
>>> model = XLNetModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```xlnetmems
vocab_sized_modeln_headn_layer)n_tokenhidden_sizenum_attention_headsnum_hidden_layersi }  i         i   d_innerNd_headgeluff_activationbi	attn_typeg{Gz?initializer_rangeg-q=layer_norm_epsg?dropouti   mem_len	reuse_lenTuse_mems_evalFuse_mems_trainbi_data	clamp_lensame_lengthlastsummary_typesummary_use_projtanhsummary_activationsummary_last_dropout   start_n_top	end_n_toppad_token_id   bos_token_id   eos_token_idtie_word_embeddingsc                    > U R                   =(       d    U R                  U R                  -  U l         [        TU ]  " S0 UD6  g )N )r   r   r   super__post_init__)selfkwargs	__class__s     ~/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/xlnet/configuration_xlnet.pyr;   XLNetConfig.__post_init__   s0    kk@T\\T[[%@''    c                 >   U R                   U R                  -  S:w  a&  [        SU R                   U R                  -   S35      eU R                  U R                   U R                  -  :w  a3  [        SU R                   SU R                   U R                  -   S35      eg)zOPart of `@strict`-powered validation. Validates the architecture of the config.r   z'd_model % n_head' (z) should be equal to 0z
`d_head` (z*) should be equal to `d_model // n_head` ()N)r   r   
ValueErrorr   r<   s    r?   validate_architecture!XLNetConfig.validate_architecture   s    <<$++%*3DLL4;;4N3OOefgg;;$,,$++55T[[M)STXT`T`dhdodoToSppqr  6rA   c                 J    [         R                  SU R                   S35        g)N
The model < is one of the few models that has no sequence length limit.r&   )loggerinfo
model_typerE   s    r?   max_position_embeddings#XLNetConfig.max_position_embeddings   s     j 11mnorA   c                 4    [        SU R                   S35      e)NrI   rJ   )NotImplementedErrorrM   )r<   values     r?   rN   rO      s#     "))ef
 	
rA   )r   )1__name__
__module____qualname____firstlineno____doc__rM   keys_to_ignore_at_inferenceattribute_mapr   int__annotations__r   r   r   r   r   r   strr   r   floatr   r    r!   r"   r#   boolr$   r%   r'   r(   r*   r+   r-   r.   r0   r1   r2   r4   r6   listr7   r;   rF   propertyrN   setter__static_attributes____classcell__)r>   s   @r?   r
   r
      s   BH J#)( '&	M JGSGSFCGSFC$JM3Is#u#!NE!GUS[GS4Z IsTz M4 ND GTIsKL#!d!$$(+%#++KIs L#*  L#* +,L#S	/D(, $$(   ##
 $
rA   r
   N)rW   huggingface_hub.dataclassesr   configuration_utilsr   utilsr   r   
get_loggerrS   rK   r
   __all__r9   rA   r?   <module>ri      s\     . 3 , 
		H	% 45B
" B
  6B
J /rA   