
    Z jem                        S r SSKrSSKJr  SSKrSSKrSSKJs  J	r
  SSKJr  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJrJr  SSKJr  SSK J!r!J"r"  SSK#J$r$J%r%  SSK&J'r'J(r(J)r)  SSK*J+r+  SSK,J-r-  SSK.J/r/J0r0J1r1  SSK2J3r3  SSK4J5r5  \)Rl                  " \75      r8\' " S S\55      5       r9 " S S\!SS9r:\'\-" SS9 " S  S!\35      5       5       r;\'" S"S#9\ " S$ S%\5      5       5       r<S&\Rz                  S'\4S( jr> " S) S*\R~                  5      r@ " S+ S,\R~                  5      rA\' " S- S.\5      5       rB\' " S/ S0\B5      5       rC " S1 S2\B5      rD/ S3QrEg)4zTPI0 model: PaliGemma + Action Expert with flow matching for robot action prediction.    N)Callable)strict)nn   )initialization)Cache)PreTrainedConfig)BatchFeature)
ImageInputmake_nested_list_of_images)create_bidirectional_mask)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)ProcessingKwargsUnpack)PreTokenizedInput	TextInput)auto_docstringcan_return_tuplelogging)maybe_autocast)requires   )CONFIG_MAPPING
AutoConfig	AutoModel)PaligemmaProcessor)SiglipImageProcessorc                   ,    \ rS rSrSSS.rSSS.rSrSrg)PI0ImageProcessor.      )
max_height	max_width)heightwidthT N)__name__
__module____qualname____firstlineno__sizepad_sizedo_pad__static_attributes__r(       t/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/pi0/modular_pi0.pyr!   r!   .   s    C0D,HFr1   r!   c                   *    \ rS rSrSSSS.SS0S.rS	rg
)PI0ProcessorKwargs5   
max_length0   right)paddingr6   padding_sidereturn_tensorspt)text_kwargscommon_kwargsr(   N)r)   r*   r+   r,   	_defaultsr0   r(   r1   r2   r4   r4   5   s#     $#

 +D1Ir1   r4   F)total)visiontorch)backendsc                   0  ^  \ rS rSrSU 4S jjr   SS\\\   -  \\\      -  S-  S\\-  \\   -  \\   -  S-  S\\	R                  -  \R                  -  S-  S\\	R                  -  \R                  -  S-  S\\   S	\4S
 jjr\U 4S j5       rSrU =r$ )PI0Processor@   Nc                 J  > UR                   S   UR                   S   sU l        U l        UR                  S/ SQ5      nUR                  S/ SQ5      nUR                  S/ SQ5      nUR                  S	/ S
Q5      n[        R
                  " U5      U l        [        R
                  " U5      U l        [        R
                  " U5      U l        [        R
                  " U5      U l	        UR                  SS5      U l
        UR                  SS5      U l        [        T	U ]5  X5        g )Nr&   r'   
state_mean)ggsgr?g	h"l?gW2D@g\ AcgZd;OſgB>٬?gQI	state_std)gt$~?gL
F%u?g.!u?g/n?g6?gx?g]K=?gF%u?actions_mean)g&S?gX ?gW[재gHPsr?gg?g 	gHPsactions_std)gGz?g`"?g9#J{?gvOjM?g>yX5ͫ?g46<R?gj+?max_state_dim    
chunk_size2   )r-   r&   r'   getrB   tensorrH   rI   rJ   rK   rL   rN   super__init__)
selfimage_processor	tokenizerchat_templatekwargsrH   rI   rJ   rK   	__class__s
            r2   rS   PI0Processor.__init__C   s    "1"6"6x"@/BVBVW^B_TZZZ.rs
JJ{,lm	zz.2mnjj0hi,,z2i0!LL6 <<4#ZZ< **\264r1   imagestextactionsstaterX   returnc                    U R                   " [        4SU R                  R                  0UD6nUc  [        R                  S5        Sn[        U[        5      (       a  U/n[        U5      n[        U5      [        U5      :w  a$  [        S[        U5       S[        U5       S35      eUS   R                  S	S5      nUS
   R                  S	S5        / n	[        X'5       HV  u  pU R                  U R                  -  [        U5      -   U R                  R                   U
 S3n
U	R!                  U
5        MX     U R                  " U	40 US   D6n[#        S U 5       5      n[$        R&                  " [        U5      U4[$        R(                  S9n[$        R&                  " [        U5      USU R*                  U R,                  5      n[/        U5       H>  u  nnU R0                  " U4S	S0US
   D6n[        U5      nSUUSU24'   US   UUSU24'   M@     0 UEUUS.EnUb  [$        R2                  " U5      U R4                  -
  U R6                  S-   -  nUR8                  S   U R:                  :  a3  [<        R>                  " USU R:                  UR8                  S   -
  45      nURA                  SU RB                  U R:                  5      US'   Ub  [$        R2                  " U5      U RD                  -
  U RF                  S-   -  nUR8                  S   U R:                  :  a3  [<        R>                  " USU R:                  UR8                  S   -
  45      nURA                  SU R:                  5      US'   [I        UUS9$ )a  
actions (`list | np.ndarray | torch.Tensor`, *optional*):
    Actions to be predicted by the model. If provided, padding, mean and std normalization will be applied.
state (`list | np.ndarray | torch.Tensor`, *optional*):
    Robotic states to be predicted by the model. If provided, padding, mean and std normalization will be applied.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. If `suffix`
      is provided, the `input_ids` will also contain the suffix input ids.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
    - **pixel_attention_mask** -- Pixel values padding mask to be fed to a model. Returned when `images` is not `None`.
    - **state** -- Robot state compatible with model if `state` is not None
    - **actions** -- Label-actions compatible with training if `actions` is not None
tokenizer_init_kwargsNzPYou are using PI0 without a text prefix. The processor will use an empty prompt. z	Received z image samples for z\ prompts. Each prompt should be associated with one sample (with one or more camera images).r=   r;   images_kwargs
c              3   8   #    U  H  n[        U5      v   M     g 7fN)len).0sample_imagess     r2   	<genexpr>(PI0Processor.__call__.<locals>.<genexpr>   s     Un]c-00ns   dtyper   r<   Tpixel_values)rn   pixel_attention_maskg:0yE>r   r]   r^   )datatensor_type)%_merge_kwargsr4   rV   init_kwargsloggerwarning_once
isinstancestrr   rg   
ValueErrorpopzipimage_tokenimage_seq_length	bos_tokenappendmaxrB   zerosboolr&   r'   	enumeraterU   rQ   rJ   rK   shaperL   FpadviewrN   rH   rI   r
   )rT   r[   r\   r]   r^   rX   output_kwargsbatched_imagesr;   prompt_stringssample
image_listtext_inputsmax_num_camerasro   padded_pixel_valuesbatchri   	processednum_camerasreturn_datas                        r2   __call__PI0Processor.__call__R   si   6 **
6:nn6P6P
TZ
 < rsDdC  6D3F;~#d)+C/00CCI; Oe e 
 '}599:JDQo&**+;TB"%d";F##d&;&;;c*oMNt~~OgOgNhiohpprs  !!&)	 #< nn^T}]7ST UnUU${{C,?+QY^YcYcd#kk#n*=PQSWS^S^`d`j`jk$-n$= E=,,]r4rS`apSqrIm,K8< !457@7P|| 34 %>

/$8
 ||G,t/@/@@TEUEUX]E]^G}}R 4#5#55%%!T-?-?'--PRBS-S)TU%,\\"dootGYGY%ZK	"\\%(4??:t~~PU?UVE{{2!3!33ea););ekk"o)M%NO#(::b$2D2D#EK .IIr1   c                     > [         TU ]  S/-   $ )Nro   )rR   model_input_names)rT   rY   s    r2   r   PI0Processor.model_input_names   s    w(,B+CCCr1   )rJ   rK   rN   r&   rL   rH   rI   r'   )NNN)r)   r*   r+   r,   rS   r   listr   r   npndarrayrB   Tensorr   r4   r
   r   propertyr   r0   __classcell__rY   s   @r2   rE   rE   @   s    5$ bf;?9=WJT*--T*5E0FFMWJ ++d9o=EV@WWZ^^WJ 

"U\\1D8	WJ
 bjj 5<</$6WJ +,WJ 
WJr D Dr1   rE   zlerobot/pi0_base)
checkpointc                     ^  \ rS rSr% SrSr\\S.rSr\	\
-  S-  \S'   Sr\	\
-  S-  \S'   Sr\\S	'   S
r\\S'   S
r\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   U 4S jrS rSrU =r$ ) 	PI0Config   aX  
vlm_config (`dict`, *optional*):
    Configuration for the vlm backbone (PaliGemmaModel).
dit_config (`dict`, *optional*):
    Configuration for the DiT backbone. Defaults to a Gemma 300M variant.
chunk_size (`int`, *optional*, defaults to 50):
    Number of action steps to predict per chunk.
max_state_dim (`int`, *optional*, defaults to 32):
    Maximum state vector dimension (shorter vectors are zero-padded).
max_action_dim (`int`, *optional*, defaults to 32):
    Maximum action vector dimension (shorter vectors are zero-padded).
num_inference_steps (`int`, *optional*, defaults to 10):
    Number of denoising steps during inference.
time_sampling_beta_alpha (`float`, *optional*, defaults to 1.5):
    Alpha parameter for Beta distribution used to sample diffusion time during training.
time_sampling_beta_beta (`float`, *optional*, defaults to 1.0):
    Beta parameter for Beta distribution used to sample diffusion time during training.
time_sampling_scale (`float`, *optional*, defaults to 0.999):
    Scale factor for sampled time values.
time_sampling_offset (`float`, *optional*, defaults to 0.001):
    Offset added to sampled time values.
min_period (`float`, *optional*, defaults to 0.004):
    Minimum period for sinusoidal time embedding.
max_period (`float`, *optional*, defaults to 4.0):
    Maximum period for sinusoidal time embedding.
loss_reduction (`str`, *optional*, defaults to `"mean"`):
    The reduction to use on MSE loss.

Example:
```python
>>> from transformers import PI0ForConditionalGeneration, PI0Config

>>> config = PI0Config()
>>> model = PI0ForConditionalGeneration(config)
```
pi0)
vlm_config
dit_configNr   r   rO   rN   rM   rL   max_action_dim
   num_inference_stepsg      ?time_sampling_beta_alpha      ?time_sampling_beta_betag+?time_sampling_scalegMbP?time_sampling_offsetgMbp?
min_periodg      @
max_periodmeanloss_reductionc                   > [        U R                  [        5      (       a;  U R                  R                  SS5      n[        U   " S0 U R                  D6U l        O4U R                  c'  [        S   " SSSSSSS	S
.SSSSSSSS	SS.	SS	S9U l        [        U R
                  [        5      (       a;  U R
                  R                  SS5      n[        U   " S0 U R
                  D6U l        OCU R
                  c6  [        S   " SSSSSSU R                  R                  R                  S9U l        SU R
                  l        SU R
                  l	        SU R                  R                  l	        [        TU ],  " S0 UD6  g )N
model_type	paligemmagemmai      i @        i )r   hidden_sizenum_hidden_layersintermediate_sizenum_attention_headsnum_key_value_heads
vocab_sizesiglip_vision_modeli  i     r#         F)	r   r   r   
patch_size
image_sizer   r   r   vision_use_head)text_configvision_configprojection_dimimage_token_idi   i      )r   r   r   r   r   head_dimr   Tr(   )rw   r   dictrP   r   r   r   r   	is_causaluse_bidirectional_attentionrR   __post_init__)rT   rX   vlm_model_typedit_model_typerY   s       r2   r   PI0Config.__post_init__   s^   doot,,!__00{KN,^<OtODO__$,[9")#')+).+,+,"( #8)-#'"$"%)++-"(',
  $%-DO2 doot,,!__00wGN,^<OtODO__$,W5 "$"&$%$%??66AADO %*!6:3BF##?''r1   c                     U R                   R                  S-  S:w  a-  [        SU R                  R                   R                   S35      eg)zOPart of `@strict`-powered validation. Validates the architecture of the config.r   r   zDiT hidden dim=(z) must be divisible by 2N)r   r   ry   configrT   s    r2   validate_architecturePI0Config.validate_architecture  sE    ??&&*a//0F0F0R0R/SSklmm 0r1   )r   r   )r)   r*   r+   r,   __doc__r   r   sub_configsr   r   r	   __annotations__r   rN   intrL   r   r   r   floatr   r   r   r   r   r   rx   r   r   r0   r   r   s   @r2   r   r      s    #J J!+:FK15J''$.515J''$.5JM3NC!!&)e)%(U(!&&"'%'JJ NC 0(dn nr1   r   block_boundariesr_   c           
      T   ^  S[         S[         S[         S[         S[        4
U 4S jjnU$ )N	batch_idxhead_idxq_idxkv_idxr_   c                 h   > [         R                  " UT5      n[         R                  " UT5      nXT:*  $ rf   )rB   	bucketize)r   r   r   r   q_blockkv_blockr   s         r2   
inner_mask0blockwise_bidirectional_mask.<locals>.inner_mask"  s.    //%)9:??6+;<""r1   )r   r   )r   r   s   ` r2   blockwise_bidirectional_maskr   !  s3    #c #S # #c #d #
 r1   c                   >   ^  \ rS rSrU 4S jr\S 5       rS rSrU =r	$ )PI0TimestepEmbeddingsi*  c                 r   > [         TU ]  5         Xl        U R                  U5      nU R	                  SUSS9  g )Nsinusoid_freqF)
persistent)rR   rS   r   compute_freqsregister_buffer)rT   r   r   rY   s      r2   rS   PI0TimestepEmbeddings.__init__+  s8    **62_mNr1   c                    [         R                  " SSU R                  R                  S-  [         R                  S9nU R
                  U R                  U R
                  -  U-  -  nSU-  S-  [        R                  -  nU$ )N        r   r   rl   )	rB   linspacer   r   float32r   r   mathpi)r   fractionperiodr   s       r2   r   #PI0TimestepEmbeddings.compute_freqs1  sr    >>#sF,=,=,I,IQ,NV[VcVcd""f&7&7&:K:K&KPX%XXfq(4772r1   c                    [        UR                  R                  [        5      (       a0  UR                  R                  S:w  a  UR                  R                  OSn[	        USS9   U R
                  S S S 24   nX1S S 2S 4   -  n[        R                  " UR                  5       UR                  5       /SS9nS S S 5        U$ ! , (       d  f       W$ = f)NmpscpuF)device_typeenabledr   dim)
rw   devicetyperx   r   r   rB   catsincos)rT   timer   r   embtime_embedss         r2   forwardPI0TimestepEmbeddings.forward8  s    *4T[[5E5Es*K*KPTP[P[P`P`diPidkk&&otUC ..tQw7Mq$w-/C))SWWY	$:BK D 	 DC s   &AC
C)r   )
r)   r*   r+   r,   rS   staticmethodr   r  r0   r   r   s   @r2   r   r   *  s'    O   r1   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )PI0ActionTimeEmbeddingiA  c                 @  > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  R                  5      U l	        [        R
                  " UR                  UR                  R                  5      U l        [        R
                  " SUR                  R                  -  UR                  R                  5      U l        [        R
                  " UR                  R                  UR                  R                  5      U l        g )Nr   )rR   rS   r   sinusoid_embedsr   Linearr   r   r   action_in_projrL   
state_projaction_time_mlp_inaction_time_mlp_outrT   r   rY   s     r2   rS   PI0ActionTimeEmbedding.__init__B  s    4V< ii(=(=v?P?P?\?\]))F$8$8&:K:K:W:WX"$))A0A0A0M0M,MvO`O`OlOl"m#%99V->->-J-JFL]L]LiLi#j r1   c                    U R                  U5      nU R                  U5      nU R                  U5      nUS S 2S S S 24   R                  U5      R	                  UR
                  S9n[        R                  " XV/SS9nU R                  [        R                  " U R                  U5      5      5      n[        R                  " US S 2S S S 24   U/SS9nU$ )Nrl   r   r  r   )r  r  r  	expand_astorm   rB   r  r  r   silur  )	rT   r^   noisetimestepstate_embedsaction_embedsr
  action_time_embedsaction_embeds_mergeds	            r2   r  PI0ActionTimeEmbedding.forwardJ  s    u-++E2**84!!T1*-77FIIP]PcPcId"YY'CK!55affT=T=TUg=h6ij$yy,q$z*BDV)W]^_##r1   )r  r  r  r  r  )r)   r*   r+   r,   rS   r  r0   r   r   s   @r2   r  r  A  s    k
$ 
$r1   r  c                   ^   ^  \ rS rSr% \\S'   SrSrSrS/r	Sr
SrSrSrSrSrU 4S jrS	rU =r$ )
PI0PreTrainedModeliW  r   modelr^   Tpast_key_values)imager\   c                    > [         TU ]  U5        [        U[        5      (       a;  [        R
                  " UR                  UR                  UR                  5      5        g g rf   )	rR   _init_weightsrw   r   initcopy_r   r   r   )rT   modulerY   s     r2   r*   PI0PreTrainedModel._init_weightse  sF    f%f344JJv++V-A-A&---PQ 5r1   r(   )r)   r*   r+   r,   r   r   base_model_prefixmain_input_namesupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendinput_modalitiesr*  r0   r   r   s   @r2   r%  r%  W  sR    O&*##4"5N!"&(R Rr1   r%  c                   P  ^  \ rS rSrS\4U 4S jjrS rS rSS jr\	\
       SS\R                  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\4S jj5       5       rSrU =r$ )PI0Modelik  r   c                    > [         TU ]  U5        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        U R                  5         g rf   )	rR   rS   r   from_configr   ditr   vlm	post_initr  s     r2   rS   PI0Model.__init__m  sJ     (():):;(():):;r1   c                 6    U R                   R                  5       $ rf   )r>  get_input_embeddingsr   s    r2   rB  PI0Model.get_input_embeddingss  s    xx,,..r1   c                 :    U R                   R                  U5        g rf   )r>  set_input_embeddings)rT   values     r2   rE  PI0Model.set_input_embeddingsv  s    %%e,r1   Nc                    UR                   S   nUR                  SS5      nU R                  R                  U5      R                  nUR                  SXVR                   S   UR                   S   5      n/ n[        U5       H  u  pXh   U	   n
UR                  U
5        M     [        R                  " USS9nUR                  5       nSXU R                  R                  R                  :H  '   U R                  R                  5       " U5      nXR                  R                  R                  :H  R                  S5      R!                  U5      R#                  UR$                  5      nUR'                  X5      nU$ )Nr   r   rp   r   r  )r   flattenr>  get_image_featurespooler_outputreshaper   r   rB   r  cloner   r   r   rB  	unsqueezer  r  r  masked_scatter)rT   	input_idsrn   ro   attention_maskr   image_featurestotal_image_featuresr   maskunpadded_image_featuresllm_input_idsinputs_embedsspecial_image_masks                 r2   embed_prefixPI0Model.embed_prefixy  sK   .44Q7#++Aq144\BPP'//OEYEYZ[E\^l^r^rst^uv!()=>OI&4&?&E# ''(?@  ?  %yy)=1E!)LM4;;#9#9#H#HHI557F++00???Yr]Y}%R$$%	 	 &445G^r1   r   rP  rn   rQ  ro   position_idsrW  r'  r_   c	           	         Ubo  Ucl  Ub  Uc  UR                  S5      S-
  nUc  U R                  X#U5      n[        R                  " U5      SS2SS2S4   n
U R	                  UUUU
SS9R
                  nUb  UR                  S:w  a  [        S5      eS=pUb  [        R                  " UR                  S   UR                  S   UR                  UR                  S	9n[        R                  " XM/SS
9n[        R                   " USS
9S-
  SS2UR                  S   * S24   nUR                  5       n[        R                  " US-   UR                  S   S-
  /UR                  S9n[        R                   " USS
9S-
  n[        U R                   R"                  UUU[%        U5      S9nU R&                  " SUUUUS.U	D6nU$ )z
action_embeds (`torch.Tensor`, *optional*):
    The embeddings of input actions and robot states.
pixel_attention_mask (`torch.Tensor`, *optional*):
    The mask indicating padded positions in the input image.
Nrp   r   r   T)rW  rQ  r[  token_type_ids	use_cacher   z:Only two-dimensional attention masks are accepted for now!rm   r  r  )r  )r   rW  rQ  r'  and_mask_function)rW  rQ  r[  r'  r(   )cumsumrY  rB   
zeros_liker>  r'  ndimry   onesr   rm   r  r  get_seq_lengthrQ   r   r   r   r   r=  )rT   r   rP  rn   rQ  ro   r[  rW  r'  rX   r]  dit_position_idsdit_attention_mask
noise_maskvlm_input_lengthblock_sizesr   bidirectional_mask
dit_outputs                      r2   r  PI0Model.forward  s   ( #(?)l.B-44R81<$ $ 1 1)K_ `"--m<Q1WEN"hh+-)- '  o  %.*=*=*BYZZ 154%##A&##A&$**%,,	J "'N+GQ!O %-?Q G! KQQ^QdQdefQgPgPiMij +99;ll$4q$8-:M:Ma:PST:T#U^k^r^rs <<;a?6;;))'-+:;KL
 XX 
'-)+	

 

 r1   )r=  r>  rf   )NNNNNNN)r)   r*   r+   r,   r   rS   rB  rE  rY  r   r   rB   r   
LongTensorr   r   r  r0   r   r   s   @r2   r:  r:  k  s    y /-2  *.,0.24804-1(,E||E <<$&E llT)	E
 t+E $llT1E &&-E ||d*E E 
!E  Er1   r:  c                     ^  \ rS rSrSrSS0rS\4U 4S jjr\\	          SS\
R                  S	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\S-  S\
R                  S\4S jj5       5       r\
R$                  " 5           SS\
R                  S\
R                  S\
R                  S	\
R                  S-  S\
R                  S-  S\
R                  S-  S\S-  S\
R                  4S jj5       rSrU =r$ )PI0ForConditionalGenerationi  z9PI0 model with action projection heads and flow matching.action_out_projcolwise_gather_outputr   c                   > [         TU ]  U5        [        U5      U l        UR                  R
                  U l        [        U5      U l        [        R                  " U R                  UR                  5      U l        U R                  5         g rf   )rR   rS   r:  r&  r   r   expert_hidden_sizer  embed_action_timer   r  r   rq  r?  r  s     r2   rS   $PI0ForConditionalGeneration.__init__  sf     f%
"("3"3"?"?!7!?!yy)@)@&BWBWXr1   Nr^   r  r  rP  rn   ro   rQ  r[  rW  r'  r]   r_   c                    UR                   S   nUc  [        R                  " U R                  R                  [        R
                  S9n[        R                  " U R                  R                  [        R
                  S9n[        R                  R                  X5      nUR                  U45      R                  UR                  5      nUU R                  R                  -  U R                  R                  -   R                  5       nUcT  [        R                  " UU R                  R                   U R                  R"                  UR                  UR$                  S9nUb6  USS2SS4   nUU-  SU-
  U-  -   R                  UR$                  5      nX+-
  nOUnU R'                  UUU5      nU R(                  " S	UUUUUU	UU
S.UD6nUR*                  SS2U R                  R                   * S24   nU R-                  U5      nSnUb*  [.        R0                  " WUU R                  R2                  S9n[5        UUUR6                  UR8                  UR:                  S9$ )
a  
state (`torch.Tensor`, *optional*):
    Current robot state.
noise (`torch.Tensor`, *optional*):
    Random noise at current timestep that needs to be denoised
timestep (`torch.Tensor`, *optional*):
    Current denoising timestep.
pixel_attention_mask (`torch.Tensor`, *optional*):
    The mask indicating padded positions in the input image.
actions (`torch.Tensor`, *optional*):
    Input actions that need to be predicted. Used only when training to compiute loss.
r   Nrl   )r  rm   r   )rP  rn   rQ  ro   r[  rW  r   r'  )	reduction)losslogitsr'  hidden_states
attentionsr(   )r   rB   rQ   r   r   r   r   distributionsBetar   r  r  r   r   r   randnrN   r   rm   ru  r&  last_hidden_staterq  r   mse_lossr   r   r'  r{  r|  )rT   r^   r  r  rP  rn   ro   rQ  r[  rW  r'  r]   rX   
batch_sizealpha_tbeta_tdist	time_betatime_expandednoisy_actionstarget_velocityr!  outputslast_hidden_statespredicted_velocityry  s                             r2   r  #PI0ForConditionalGeneration.forward  s!   : [[^
 ll4;;#G#Gu}}]G\\$++"E"EU]][F&&++G<DZM255ellCI!DKK$C$CCdkkFfFffmmoH =KK&&**||kkE $Qd]3M*U2a-6G75RRVVW^WdWdeM#oO!M "33E=(S** 

%)!5%',+

 

 %66q4;;;Q;Q:Q:S7ST!112DE::o/AT[[MgMghD%%#33!//))
 	
r1   	num_stepsc           
         U=(       d    U R                   R                  nUR                  S   n	UR                  n
UcM  [        R
                  " SSU	U R                   R                  U R                   R                  4UR                  U
S9nUb  UR                  S5      S-
  nU R                  R                  X#U5      nU R                  R                  UUWSSS9R                  nUR                  5       nS	U-  n[        U5       Hk  nSUU-  -   n[        R                   " U[        R"                  U
S
9R%                  U	5      nU " UUUUUUS9nUR'                  U5        XOUR(                  -  -   nMm     U$ )z0Run flow matching inference to generate actions.r   r   r   )r   stdr-   rm   r  rp   r   T)rW  rQ  r[  r^  return_dictg      r_  )r^   r  r  ro   rQ  r'  )r   r   r   r  rB   normalrN   r   rm   ra  r&  rY  r>  r'  re  rangerQ   r   expandcroprz  )rT   r^   rP  rn   r  rQ  ro   r  rX   r  r  r[  rW  r'  prefix_lengthdtstepr  time_tensoroutputs                       r2   sample_actions*PI0ForConditionalGeneration.sample_actionsA  s~    @!@!@	__Q'
!! =LLKK**KK..
 #((
E %)004q8L

//	I]^**..')% ) 
 / 	 (668 I)$D?D,,t5==PWWXbcK$%9- /F   /..E % r1   )rq  ru  rt  r&  )
NNNNNNNNNN)NNNN)r)   r*   r+   r,   r   _tp_planr   rS   r   r   rB   FloatTensorr   
BoolTensorrn  r   r   r  no_gradr   r  r0   r   r   s   @r2   rp  rp    s   C!#:;Hy   +/-1)-,08<.204-1(,%)T
  T
   4'T
 ##d*	T

 <<$&T
 llT)T
 $..5T
 t+T
 &&-T
 ||d*T
 T
 ""T
 
 T
  T
l ]]_ +/.28< $=  = ##= ''	=
   4'= t+= $..5= := 
		= =r1   rp  )r   r%  r:  rp  rE   r!   )Fr   r   collections.abcr   numpyr   rB   torch.nn.functionalr   
functionalr   huggingface_hub.dataclassesr   rb   r   r+  cache_utilsr   configuration_utilsr	   feature_extraction_utilsr
   image_utilsr   r   masking_utilsr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   r   tokenization_utils_baser   r   utilsr   r   r   utils.genericr   utils.import_utilsr   autor   r   r   paligemma.processing_paligemmar   siglip.image_processing_siglipr   
get_loggerr)   ru   r!   r4   rE   r   r   r   Moduler   r  r%  r:  rp  __all__r(   r1   r2   <module>r     s   [  $     .  &   3 4 A 6 O - 8 C > > + * 8 8 ? A 
		H	% ,  )  	&'kD% kD ( kD\ -.ln  ln  /ln^5<< H BII .$RYY $, R R R& m! m m`c"4 cLr1   