
    Z j                     d    S r SSKJr  SSKJr  SSKJr  \" SS9\ " S S	\5      5       5       rS	/rg
)zLongT5 model configuration    )strict   )PreTrainedConfig)auto_docstringzgoogle/long-t5-local-base)
checkpointc                     ^  \ rS rSr% SrSrS/rSSSSS	.rS
r\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	S-  \
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\\	-  \
S'   Sr\\
S'   S r\\
S!'   S"r\\
S#'   S$r\\
S%'   S&r\\
S''   S$r\\
S('   S)r\	S-  \
S*'   S+r \	\!\	   -  S-  \
S,'   Sr"\	S-  \
S-'   S.r#\\
S/'   S$r$\\
S0'   U 4S1 jr%S2 r&S3r'U =r($ )4LongT5Config   a  
    d_ff (`int`, *optional*, defaults to 2048):
        Size of the intermediate feed forward layer in each `LongT5Block`.
    local_radius (`int`, *optional*, defaults to 127):
        Number of tokens to the left/right that each token locally self-attends to in the local attention
        mechanism.
    global_block_size (`int`, *optional*, defaults to 16):
        Length of the blocks an input sequence is divided into for a global token representation. Used only for
        `encoder_attention_type = "transient-global"`.
    relative_attention_num_buckets (`int`, *optional*, defaults to 32):
        The number of buckets to use for each attention layer.
    relative_attention_max_distance (`int`, *optional*, defaults to 128):
        The maximum distance of the longer sequences for the bucket separation.
    feed_forward_proj (`string`, *optional*, defaults to `"relu"`):
        Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`. Both the original
        LongT5 implementation and LongT5v1.1 use the `"gated-gelu"` feed forward projection.
    encoder_attention_type (`string`, *optional*, defaults to `"local"`):
        Type of encoder attention to be used. Should be one of `"local"` or `"transient-global"`, which are
        supported by the LongT5 implementation.
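
    Example (a minimal usage sketch following the usual `transformers` config/model pattern; the last two
    snippets trace the attribute aliasing and the activation resolution performed in `__post_init__` below):

    ```python
    >>> from transformers import LongT5Config, LongT5Model

    >>> # Initializing a google/long-t5-local-base style configuration
    >>> configuration = LongT5Config()

    >>> # Initializing a model (with random weights) from that configuration
    >>> model = LongT5Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # `attribute_map` aliases the generic names onto the T5-style ones
    >>> configuration.hidden_size == configuration.d_model
    True

    >>> # "gated-gelu" resolves to the `gelu_new` activation with a gated feed forward
    >>> LongT5Config(feed_forward_proj="gated-gelu").dense_act_fn
    'gelu_new'
    ```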
longt5past_key_valuesd_model	num_heads
num_layersd_kv)hidden_sizenum_attention_headsnum_hidden_layershead_dimi}  
    vocab_size: int = 32128
    d_model: int = 512
    d_kv: int = 64
    d_ff: int = 2048
    num_layers: int = 6
    num_decoder_layers: int | None = None
    num_heads: int = 8
    local_radius: int = 127
    global_block_size: int = 16
    relative_attention_num_buckets: int = 32
    relative_attention_max_distance: int = 128
    dropout_rate: float = 0.1
    layer_norm_epsilon: float = 1e-6
    initializer_factor: float = 1.0
    feed_forward_proj: str = "relu"
    is_encoder_decoder: bool = True
    encoder_attention_type: str = "local"
    use_cache: bool = True
    pad_token_id: int | None = 0
    eos_token_id: int | list[int] | None = 1
    bos_token_id: int | None = None
    is_decoder: bool = False
    tie_word_embeddings: bool = True

    def __post_init__(self, **kwargs):
        # The decoder defaults to the same depth as the encoder unless set explicitly.
        self.num_decoder_layers = self.num_decoder_layers if self.num_decoder_layers is not None else self.num_layers

        # Split `feed_forward_proj` into an optional "gated" prefix and the activation name.
        act_info = self.feed_forward_proj.split("-")
        self.dense_act_fn = act_info[-1]
        self.is_gated_act = act_info[0] == "gated"

        # For backwards compatibility, `gated-gelu` uses the `gelu_new` activation.
        if self.feed_forward_proj == "gated-gelu":
            self.dense_act_fn = "gelu_new"

        super().__post_init__(**kwargs)

    def validate_architecture(self):
        """Part of `@strict`-powered validation. Validates the architecture of the config."""
        act_info = self.feed_forward_proj.split("-")
        if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2:
            raise ValueError(
                f"`feed_forward_proj`: {self.feed_forward_proj} is not a valid activation function of the dense layer. "
                "Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. "
                "'gated-gelu' or 'relu'"
            )


__all__ = ["LongT5Config"]