"""Perceiver model configuration"""

from huggingface_hub.dataclasses import strict

from ...configuration_utils import PreTrainedConfig
from ...utils import auto_docstring


@strict(accept_kwargs=True)
@auto_docstring(checkpoint="deepmind/language-perceiver")
class PerceiverConfig(PreTrainedConfig):
    r"""
    num_latents (`int`, *optional*, defaults to 256):
        The number of latents.
    d_latents (`int`, *optional*, defaults to 1280):
        Dimension of the latent embeddings.
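    d_model (`int`, *optional*, defaults to 768):
        Dimension of the inputs. Should only be provided in case [`PerceiverTextPreprocessor`] is used or no
        preprocessor is provided.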
    num_blocks (`int`, *optional*, defaults to 1):
        Number of blocks in the Transformer encoder.
    num_self_attends_per_block (`int`, *optional*, defaults to 26):
        The number of self-attention layers per block.
    num_self_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each self-attention layer in the Transformer encoder.
    num_cross_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each cross-attention layer in the Transformer encoder.
    qk_channels (`int`, *optional*):
        Dimension to project the queries + keys before applying attention in the cross-attention and self-attention
        layers of the encoder. Will default to preserving the dimension of the queries if not specified.
    v_channels (`int`, *optional*):
        Dimension to project the values before applying attention in the cross-attention and self-attention layers
        of the encoder. Will default to preserving the dimension of the queries if not specified.
    cross_attention_shape_for_attention (`str`, *optional*, defaults to `"kv"`):
        Dimension to use when downsampling the queries and keys in the cross-attention layer of the encoder. Can
        be either `"kv"` or `"q"`.
    self_attention_widening_factor (`int`, *optional*, defaults to 1):
        Widening factor of the feed-forward layer in the self-attention layers of the Transformer encoder.
    cross_attention_widening_factor (`int`, *optional*, defaults to 1):
        Widening factor of the feed-forward layer in the cross-attention layer of the Transformer encoder.
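    hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` are supported.
    attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    layer_norm_eps (`float`, *optional*, defaults to 1e-12):
        The epsilon used by the layer normalization layers.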
    use_query_residual (`bool`, *optional*, defaults to `True`):
        Whether to add a query residual in the cross-attention layer of the encoder.
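    vocab_size (`int`, *optional*, defaults to 262):
        Vocabulary size for the masked language modeling model.
    max_position_embeddings (`int`, *optional*, defaults to 2048):
        The maximum sequence length that this model might ever be used with, for the masked language modeling model.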
    image_size (`int`, *optional*, defaults to 56):
        Size of the images after preprocessing, for [`PerceiverForImageClassificationLearned`].
    train_size (`list[int]`, *optional*, defaults to `(368, 496)`):
        Training size of the images for the optical flow model.
    num_frames (`int`, *optional*, defaults to 16):
        Number of video frames used for the multimodal autoencoding model.
    audio_samples_per_frame (`int`, *optional*, defaults to 1920):
        Number of audio samples per frame for the multimodal autoencoding model.
    samples_per_patch (`int`, *optional*, defaults to 16):
        Number of audio samples per patch when preprocessing the audio for the multimodal autoencoding model.
    output_shape (`list[int]`, *optional*, defaults to `(1, 16, 224, 224)`):
        Shape of the output (batch_size, num_frames, height, width) for the video decoder queries of the multimodal
        autoencoding model. This excludes the channel dimension.
    output_num_channels (`int`, *optional*, defaults to 512):
        Number of output channels for each modality decoder.

    Example:

    ```python
    >>> from transformers import PerceiverModel, PerceiverConfig

    >>> # Initializing a Perceiver deepmind/language-perceiver style configuration
    >>> configuration = PerceiverConfig()

    >>> # Initializing a model from the deepmind/language-perceiver style configuration
    >>> model = PerceiverModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
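
    >>> # Any of the fields documented above can be overridden by keyword; the
    >>> # values below are purely illustrative, not tuned settings.
    >>> small_configuration = PerceiverConfig(num_latents=64, d_latents=512)
    >>> small_configuration.num_latents
    64

    >>> # With the defaults, each frame's audio (1920 samples) is preprocessed
    >>> # into patches of 16 samples, i.e. 1920 // 16 = 120 patches per frame.
    >>> configuration.audio_samples_per_frame // configuration.samples_per_patch
    120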
    ```
    """

    model_type = "perceiver"

    num_latents: int = 256
    d_latents: int = 1280
    d_model: int = 768
    num_blocks: int = 1
    num_self_attends_per_block: int = 26
    num_self_attention_heads: int = 8
    num_cross_attention_heads: int = 8
    qk_channels: int | None = None
    v_channels: int | None = None
    cross_attention_shape_for_attention: str = "kv"
    self_attention_widening_factor: int = 1
    cross_attention_widening_factor: int = 1
    hidden_act: str = "gelu"
    attention_probs_dropout_prob: float = 0.1
    initializer_range: float = 0.02
    layer_norm_eps: float = 1e-12
    use_query_residual: bool = True
    # Masked language modeling model
    vocab_size: int = 262
    max_position_embeddings: int = 2048
    # Image classification model
    image_size: int = 56
    # Optical flow model
    train_size: list[int] | tuple[int, int] = (368, 496)
    # Multimodal autoencoding model
    num_frames: int = 16
    audio_samples_per_frame: int = 1920
    samples_per_patch: int = 16
    output_shape: list[int] | tuple[int, ...] = (1, 16, 224, 224)
    output_num_channels: int = 512
    _label_trainable_num_channels: int = 1024


__all__ = ["PerceiverConfig"]