
    N j                   R   S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKJrJ	r	J
r
Jr  S SKrS SKrS SKJr  S SKJr  SSKJr  SSKJr  SS	KJr  SS
KJrJr  SSKJr  SSKJr  SSKJ r J!r!J"r"J#r#J$r$  SSK%J&r&J'r'   " S S\$5      r(\(" 5       RR                  r*\
(       a  S SK+J,r,J-r-  SSKJ.r.  SSK/J0r0  SSK1J2r2  Sr3Sr4S&S jr5\Rl                  Ro                  \8S5      r9 " S S5      r: " S S\;5      r< " S S\#5      r=\R|                   " S  S!5      5       r? " S" S#\&5      r@ " S$ S%\'5      rAg)'    )annotationsN)AnyOptionalTYPE_CHECKINGUnion)
OrderedSet)ModularIndexing   )config)ComputedBuffertorch_dtype_to_jax)get_fused_kernel_nameget_kernel_metadata)V   )BlockPatternMatcher)BackendFeatureCSEVariableIndentedBufferOpOverridesPythonPrinter)
SIMDKernelSIMDSchedulingc                  6    \ rS rSrSrSS jrSS jrSS jrSrg)	PallasPrinter    zG
Custom sympy printer for Pallas that handles JAX-specific constructs.
c                    U R                  UR                  S   5      nU R                  UR                  S   5      nU R                  UR                  S   5      nSU SU SU S3$ )z!Convert sympy Where to jnp.where.r   r   r
   
jnp.where(, ))doprintargs)selfexprcpqs        o/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/torch/_inductor/codegen/pallas.py_print_WherePallasPrinter._print_Where%   s_    LL1&LL1&LL1&A3b2aS**    c                    UR                    Vs/ s H  o R                  U5      PM     nnUS   nUSS  H  nSU SU S3nM     U$ s  snf )z7Convert sympy Min to jnp.minimum for JAX compatibility.r   r   Njnp.minimum(r    r!   r#   r"   r$   r%   argr#   results        r)   
_print_MinPallasPrinter._print_Min,   X    -1YY7YcS!Y7a8C#F82cU!4F 	 8   Ac                    UR                    Vs/ s H  o R                  U5      PM     nnUS   nUSS  H  nSU SU S3nM     U$ s  snf )z7Convert sympy Max to jnp.maximum for JAX compatibility.r   r   Njnp.maximum(r    r!   r/   r0   s        r)   
_print_MaxPallasPrinter._print_Max4   r5   r6    N)r%   
sympy.Exprreturnstr)	__name__
__module____qualname____firstlineno____doc__r*   r3   r9   __static_attributes__r;   r,   r)   r   r       s    +r,   r   )CallableSequence)IRNode)ReductionType)BaseSchedulerNodemain   c                6    U [         -   S-
  [         -  [         -  $ )z@Align size to WARPGROUP_SIZE (128) for Mosaic GPU compatibility.r   )WARPGROUP_SIZE)sizes    r)   _align_to_warpgrouprO   R   s    N"Q&>9^KKr,   kernel_codec                  <    \ rS rSrSr S   S	S jjrSS.S jrSrg)
PallasKernelWrapper[   z6Wrapper to provide .run() interface for Pallas kernelsNc                H    Xl         X l        [        R                  SU5        g )NzPallas kernel path: %s)	kernel_fnkernel_pathkernel_code_loginfo)r$   rU   rV   s      r)   __init__PallasKernelWrapper.__init__^   s      #&5{Cr,   )streamc               *    U R                   " USU0UD6$ )z
Execute the Pallas kernel.

Args:
    *args: Arguments to pass to the kernel function
    stream: CUDA stream to pass to the kernel function
    **kwargs: Additional keyword arguments for the kernel

Returns:
    Result of the kernel execution
r[   )rU   )r$   r[   r#   kwargss       r)   runPallasKernelWrapper.rune   s     ~~t=F=f==r,   )rU   rV   N)rU   zCallable[..., Any]rV   Optional[str])r?   r@   rA   rB   rC   rY   r^   rD   r;   r,   r)   rR   rR   [   s3    @ KOD+D:GD !% > >r,   rR   c                      \ rS rSrSrSrg)Unsupportedt   zJException raised when an operation is not supported by the Pallas backend.r;   N)r?   r@   rA   rB   rC   rD   r;   r,   r)   rc   rc   t   s    Tr,   rc   c                  	   \ rS rSrSr\SuS j5       r\SuS j5       r\SuS j5       r\SuS j5       r	\SuS j5       r
\SuS j5       r\SuS	 j5       r\SuS
 j5       r\SuS j5       r\SuS j5       r\SuS j5       r\SuS j5       r\SuS j5       r\SuS j5       r\SuS j5       r\SuS j5       r\SuS j5       r\SuS j5       r\SuS j5       r\SuS j5       r\SuS j5       r\SuS j5       r\SuS j5       r\SuS j5       r\SuS j5       r\SuS j5       r\SvS j5       r \SvS j5       r!\SvS j5       r"\SwS  j5       r#\SxS! j5       r$\  Sy         SzS# jj5       r%\S{S$ j5       r&\S|S% j5       r'\S}S& j5       r(\SuS' j5       r)\SuS( j5       r*\SuS) j5       r+\SuS* j5       r,\SuS+ j5       r-\SuS, j5       r.\SvS- j5       r/\SvS. j5       r0\SvS/ j5       r1\SvS0 j5       r2\SvS1 j5       r3\SuS2 j5       r4\SuS3 j5       r5\SuS4 j5       r6\SvS5 j5       r7\SvS6 j5       r8\SvS7 j5       r9\SuS8 j5       r:\SvS9 j5       r;\SvS: j5       r<\SvS; j5       r=\SvS< j5       r>\SvS= j5       r?\SvS> j5       r@\SvS? j5       rA\S~S@ j5       rB\BrC\SuSA j5       rD\SuSB j5       rE\SuSC j5       rF\SuSD j5       rG\SuSE j5       rH\SuSF j5       rI\SuSG j5       rJ\SuSH j5       rK\SuSI j5       rL\SuSJ j5       rM\SuSK j5       rN\SuSL j5       rO\MrP\SuSM j5       rQ\NrR\SuSN j5       rS\SSO j5       rT\SSP j5       rU\TrV\UrW\SSQ j5       rX\SuSR j5       rY\SSS j5       rZ\SST j5       r[\SSU j5       r\\SSV j5       r]\SSW j5       r^\SSX j5       r_\SSY j5       r`\SSZ j5       ra\SS[ j5       rb\SS\ j5       rc\SS] j5       rd\SS^ j5       re\SS_ j5       rf\SS` j5       rg\SSa j5       rh\SuSb j5       ri\SuSc j5       rj\SSd j5       rk\SvSe j5       rl\SvSf j5       rm\SvSg j5       rn\SuSh j5       ro\SuSi j5       rp\SvSj j5       rq\SvSk j5       rr\SvSl j5       rs\SuSm j5       rt\SvSn j5       ru\SvSo j5       rv\SSp j5       rw\SSq j5       rx\SSr j5       ry\SSs j5       rzStr{g")PallasKernelOverridesx   z
Map element-wise ops to JAX/Pallas operations.

For now, we use the default Python operators which are compatible
with JAX numpy broadcasting semantics.
c                    SU  S3$ )Nzjnp.sin(r!   r;   xs    r)   sinPallasKernelOverrides.sin       !Ar,   c                    SU  S3$ )Nzjnp.cos(r!   r;   ri   s    r)   cosPallasKernelOverrides.cos   rm   r,   c                    SU  S3$ )Nzjnp.tan(r!   r;   ri   s    r)   tanPallasKernelOverrides.tan   rm   r,   c                    SU  S3$ )Nz	jnp.sinh(r!   r;   ri   s    r)   sinhPallasKernelOverrides.sinh       1#Qr,   c                    SU  S3$ )Nz	jnp.cosh(r!   r;   ri   s    r)   coshPallasKernelOverrides.cosh   rw   r,   c                    SU  S3$ )Nz	jnp.tanh(r!   r;   ri   s    r)   tanhPallasKernelOverrides.tanh   rw   r,   c                    SU  S3$ )Nzjnp.arcsin(r!   r;   ri   s    r)   asinPallasKernelOverrides.asin       QCq!!r,   c                    SU  S3$ )Nzjnp.arccos(r!   r;   ri   s    r)   acosPallasKernelOverrides.acos   r   r,   c                    SU  S3$ )Nzjnp.arctan(r!   r;   ri   s    r)   atanPallasKernelOverrides.atan   r   r,   c                    SU  S3$ )Nzjnp.exp(r!   r;   ri   s    r)   expPallasKernelOverrides.exp   rm   r,   c                    SU  S3$ )Nz	jnp.exp2(r!   r;   ri   s    r)   exp2PallasKernelOverrides.exp2   rw   r,   c                    SU  S3$ )Nz
jnp.expm1(r!   r;   ri   s    r)   expm1PallasKernelOverrides.expm1       A3a  r,   c                    SU  S3$ )Nzjnp.log(r!   r;   ri   s    r)   logPallasKernelOverrides.log   rm   r,   c                    SU  S3$ )Nz
jnp.log10(r!   r;   ri   s    r)   log10PallasKernelOverrides.log10   r   r,   c                    SU  S3$ )Nz	jnp.log2(r!   r;   ri   s    r)   log2PallasKernelOverrides.log2   rw   r,   c                    SU  S3$ )Nz
jnp.log1p(r!   r;   ri   s    r)   log1pPallasKernelOverrides.log1p   r   r,   c                    SU  S3$ )Nz	jnp.sqrt(r!   r;   ri   s    r)   sqrtPallasKernelOverrides.sqrt   rw   r,   c                    SU  S3$ )Nzjax.lax.rsqrt(r!   r;   ri   s    r)   rsqrtPallasKernelOverrides.rsqrt   s    s!$$r,   c                    SU  S3$ )Nzjnp.abs(r!   r;   ri   s    r)   absPallasKernelOverrides.abs   rm   r,   c                    SU  S3$ )Nz(-r!   r;   ri   s    r)   negPallasKernelOverrides.neg   s    A3ayr,   c                    SU  S3$ )Nz
jnp.floor(r!   r;   ri   s    r)   floorPallasKernelOverrides.floor   r   r,   c                    SU  S3$ )Nz	jnp.ceil(r!   r;   ri   s    r)   ceilPallasKernelOverrides.ceil   rw   r,   c                    SU  S3$ )Nz
jnp.trunc(r!   r;   ri   s    r)   truncPallasKernelOverrides.trunc   r   r,   c                    SU  S3$ )Nz
jnp.round(r!   r;   ri   s    r)   roundPallasKernelOverrides.round   r   r,   c                    SU  S3$ )Nzjax.nn.sigmoid(r!   r;   ri   s    r)   sigmoidPallasKernelOverrides.sigmoid        1%%r,   c                    SU  S3$ )Nr8   z, 0)r;   ri   s    r)   reluPallasKernelOverrides.relu   s    aS%%r,   c                    SU  SU S3$ )Nz
jnp.power(r    r!   r;   abs     r)   powPallasKernelOverrides.pow       A3b1%%r,   c                    SU  SU S3$ )Nr8   r    r!   r;   r   s     r)   maximumPallasKernelOverrides.maximum       aS1#Q''r,   c                    SU  SU S3$ )Nr.   r    r!   r;   r   s     r)   minimumPallasKernelOverrides.minimum   r   r,   c                    SU  SU SU S3$ )Nr   r    r!   r;   )condr   r   s      r)   wherePallasKernelOverrides.where   s    D6A3b1--r,   c                   U" 5       n[        U[        5      (       aP  [        R                  " U5      (       a  SnO=[        R                  " U5      (       a  US:  a  SOSnO[        U5      nO[        U5      nSU  SU SU S3$ )zr
Computes body, but only uses the result where mask is true.
Where mask is false, uses the 'other' value instead.
jnp.nanr   jnp.inf-jnp.infr   r    r!   )
isinstancefloatmathisnanisinfrepr)maskbodyotherr2   	other_strs        r)   maskedPallasKernelOverrides.masked   su     eU##zz%  %	E"").I
	 K	UID6F82i[::r,   Nc                *    [        U5      nSU  SU S3$ )Nzjnp.asarray(	).astype(r!   r   )rj   dtype	src_dtypeuse_compute_types	jax_dtypes        r)   to_dtypePallasKernelOverrides.to_dtype  s#     'u-	aS	)A66r,   c                F    [        U5      n[        U5      nSU  SU SU S3$ )z=Bitcast a value from one dtype to another with the same size.z)jax.lax.bitcast_convert_type(jnp.asarray(r   z), r!   r   )rj   r   r   r   jax_src_dtypes        r)   to_dtype_bitcast&PallasKernelOverrides.to_dtype_bitcast  s7     'u-	*95:1#Y}oUXYbXccdeer,   c                
   SSK Jn  [        R                  R                  R                  [        R                  R                  U 5      5        [        R                  R                  U 5      n[        R                  R                  U5      n[        R                  R                  U5      n[        R                  R                  R                  [        R                  R                  XR" U 5      S9n[        R                  Xa5      $ )z>Convert a sympy expression to a JAX array indexing expression.r
   )get_bounds_index_expr)bounds)utilsr   r   kernelused_iter_varsupdate_get_used_iter_varsprepare_indexingrename_indexingkexprcsegeneratecomputerf   r   )r%   r   r   preparedrenamedidx_strvars          r)   
index_expr PallasKernelOverrides.index_expr  s     	2 	
&&qxx'C'CD'IJ 88,,T2((**84((..)hhll##HHg.CD.I $ 
 %--c99r,   c                   [        U5      nU[        R                  :X  a  U (       a  S$ S$ [        U [        5      (       aA  [
        R                  " U 5      (       a  g[
        R                  " U 5      (       a
  U S:  a  S$ S$ SU  SU S	3$ )
z/Convert a constant value to JAX representation.TrueFalser   r   r   r   z
jnp.array(z, dtype=r!   )r   torchboolr   r   r   r   r   )valr   r   s      r)   constantPallasKernelOverrides.constant0  sw     'u-	EJJ 6-g-c5!!zz# zz#$'!Gy;;C5155r,   c                    SU  S3$ )Nz	jnp.real(r!   r;   ri   s    r)   realPallasKernelOverrides.real>  rw   r,   c                    SU  S3$ )Nz	jnp.imag(r!   r;   ri   s    r)   imagPallasKernelOverrides.imagB  rw   r,   c                    SU  S3$ )Nz	jnp.conj(r!   r;   ri   s    r)   conjPallasKernelOverrides.conjF  rw   r,   c                    SU  S3$ )Nz
jnp.angle(r!   r;   ri   s    r)   anglePallasKernelOverrides.angleJ  r   r,   c                    SU  SU  S3$ )z8View complex tensor as real tensor with extra dimension.zjnp.stack([jnp.real(z), jnp.imag(z)], axis=-1)r;   ri   s    r)   view_as_real"PallasKernelOverrides.view_as_realN  s     &aSQC|DDr,   c                    SU  SU  S3$ )z#View real tensor as complex tensor.(z[..., 0] + 1j * z	[..., 1])r;   ri   s    r)   view_as_complex%PallasKernelOverrides.view_as_complexS  s     1#%aS	22r,   c                    SU  SU S3$ )Nr  z == r!   r;   r   s     r)   eqPallasKernelOverrides.eqY      1#T!Ar,   c                    SU  SU S3$ )Nr  z != r!   r;   r   s     r)   nePallasKernelOverrides.ne]  r  r,   c                    SU  SU S3$ )Nr  z < r!   r;   r   s     r)   ltPallasKernelOverrides.lta      1#S1~r,   c                    SU  SU S3$ )Nr  z <= r!   r;   r   s     r)   lePallasKernelOverrides.lee  r  r,   c                    SU  SU S3$ )Nr  z > r!   r;   r   s     r)   gtPallasKernelOverrides.gti  r$  r,   c                    SU  S3$ )Nz
jnp.isnan(r!   r;   ri   s    r)   r   PallasKernelOverrides.isnanm  r   r,   c                    SU  S3$ )Nz
jnp.isinf(r!   r;   ri   s    r)   r   PallasKernelOverrides.isinfq  r   r,   c                    SU  S3$ )Nzjnp.isfinite(r!   r;   ri   s    r)   isfinitePallasKernelOverrides.isfiniteu  s    qc##r,   c                    SU  SU S3$ )Nr  z >= r!   r;   r   s     r)   gePallasKernelOverrides.gey  r  r,   c                    SU  SU S3$ )Nzjnp.logical_and(r    r!   r;   r   s     r)   logical_and!PallasKernelOverrides.logical_and~      !!Bqc++r,   c                    SU  SU S3$ )Nzjnp.logical_or(r    r!   r;   r   s     r)   
logical_or PallasKernelOverrides.logical_or       2aS**r,   c                    SU  S3$ )Nzjnp.logical_not(r!   r;   ri   s    r)   logical_not!PallasKernelOverrides.logical_not      !!A&&r,   c                    SU  SU S3$ )Nzjnp.logical_xor(r    r!   r;   r   s     r)   logical_xor!PallasKernelOverrides.logical_xor  r8  r,   c                    SU  SU S3$ )Nzjnp.arctan2(r    r!   r;   r   s     r)   atan2PallasKernelOverrides.atan2  r   r,   c                    SU  SU S3$ )Nz
jnp.hypot(r    r!   r;   r   s     r)   hypotPallasKernelOverrides.hypot  r   r,   c                    SU  SU S3$ )Nz	jnp.fmod(r    r!   r;   r   s     r)   fmodPallasKernelOverrides.fmod  s    1#Rs!$$r,   c                    SU  SU S3$ )Nzjnp.remainder(r    r!   r;   r   s     r)   	remainderPallasKernelOverrides.remainder      s"QCq))r,   c                &    SU  SU SU  SU SU  S3$ )Nz
(jnp.sign(z) * jnp.sign(z) * (jnp.abs(z) // jnp.abs(z))).astype(.dtype)r;   r   s     r)   truncdivPallasKernelOverrides.truncdiv  s.     A3mA3mA3mA3kZ[Y\\cddr,   c                    SU  SU S3$ )Nr  z // r!   r;   r   s     r)   floordivPallasKernelOverrides.floordiv  r  r,   c                    SU  SU SU S3$ )Nz	jnp.clip(r    r!   r;   )rj   min_valmax_vals      r)   clampPallasKernelOverrides.clamp  s    1#Ry7)155r,   c                    SU  S3$ )Nz	jnp.sign(r!   r;   ri   s    r)   signPallasKernelOverrides.sign  rw   r,   c                    SU  S3$ )Nzjnp.signbit(r!   r;   ri   s    r)   signbitPallasKernelOverrides.signbit  s    aS""r,   c                    SU  S3$ )Nzjax.scipy.special.erf(r!   r;   ri   s    r)   erfPallasKernelOverrides.erf  s    's!,,r,   c                    SU  S3$ )Nzjax.scipy.special.erfc(r!   r;   ri   s    r)   erfcPallasKernelOverrides.erfc  s    (1--r,   c                    SU  S3$ )Nzjax.scipy.special.erfinv(r!   r;   ri   s    r)   erfinvPallasKernelOverrides.erfinv  s    *1#Q//r,   c                    SU  S3$ )Nzjax.scipy.special.gammaln(r!   r;   ri   s    r)   lgammaPallasKernelOverrides.lgamma      +A3a00r,   c                    SU  S3$ )Nzjax.scipy.special.digamma(r!   r;   ri   s    r)   digammaPallasKernelOverrides.digamma  ro  r,   c                    SU  SU  SU  S3$ )Nr   z>.astype(jnp.float64) == 0.0, 1.0, jax.scipy.special.bessel_jn(z&.astype(jnp.float64), v=0)[0]).astype(rR  r;   ri   s    r)   	bessel_j0PallasKernelOverrides.bessel_j0  +      ++,# .c"	
r,   c                    SU  SU  SU  S3$ )Nr   z>.astype(jnp.float64) == 0.0, 0.0, jax.scipy.special.bessel_jn(z&.astype(jnp.float64), v=1)[1]).astype(rR  r;   ri   s    r)   	bessel_j1PallasKernelOverrides.bessel_j1  rv  r,   c                    SU  SU  S3$ )Njax.lax.bessel_i0e() * jnp.exp(jnp.abs())r;   ri   s    r)   modified_bessel_i0(PallasKernelOverrides.modified_bessel_i0       %QC';A3bAAr,   c                    SU  SU  S3$ )Njax.lax.bessel_i1e(r|  r}  r;   ri   s    r)   modified_bessel_i1(PallasKernelOverrides.modified_bessel_i1  r  r,   c                    SU  SU  SU  S3$ )Nr   z == 0.0, 1.0, jnp.sin(z) / r!   r;   ri   s    r)   spherical_bessel_j0)PallasKernelOverrides.spherical_bessel_j0  s      A34QCtA3a@@r,   c                    SU  S3$ )Nr{  r!   r;   ri   s    r)   i0ePallasKernelOverrides.i0e       %QCq))r,   c                    SU  S3$ )Nr  r!   r;   ri   s    r)   i1ePallasKernelOverrides.i1e   r  r,   c                    SU  SU S3$ )Nzjax.scipy.special.gammainc(r    r!   r;   rj   ys     r)   gammaincPallasKernelOverrides.gammainc  s     -QCr!A66r,   c                    SU  SU S3$ )Nzjax.scipy.special.gammaincc(r    r!   r;   r  s     r)   	gammainccPallasKernelOverrides.gammaincc  s     .aS1#Q77r,   c                    SU  SU S3$ )Nzjax.scipy.special.polygamma(z.astype(jnp.int32), r!   r;   r  s     r)   	polygammaPallasKernelOverrides.polygamma  s     .aS0DQCqIIr,   c                    SU  S3$ )Nzjax.scipy.special.ndtri(r!   r;   ri   s    r)   ndtriPallasKernelOverrides.ndtri  s     *!A..r,   c                    SU  SU S3$ )Nzjax.scipy.special.zeta(r    r!   r;   r  s     r)   zetaPallasKernelOverrides.zeta  s     )2aS22r,   c                    SU  SU S3$ )Nzjax.scipy.special.xlogy(r    r!   r;   r  s     r)   xlogyPallasKernelOverrides.xlogy$  s     *!Bqc33r,   c                    SU  SU S3$ )Nzjax.scipy.special.xlog1py(r    r!   r;   r  s     r)   xlog1pyPallasKernelOverrides.xlog1py)  s     ,A3b155r,   c                >    SU  SU SU  SU  SU SU  SU SU S	U  S
3$ )Njnp.where(jnp.abs(z) <= 1, jnp.cos(z * jnp.arccos(jnp.clip(z, -1, 1))), jnp.where(z > 1, jnp.cosh(z * jnp.arccosh(jnp.maximum(z, 1.0))), ((-1.0) ** z) * jnp.cosh(z * jnp.arccosh(jnp.maximum(-z
, 1.0)))))r;   rj   ns     r)   chebyshev_polynomial_t,PallasKernelOverrides.chebyshev_polynomial_t.  s[     ! $c0 4 s5aS 9M!,H:	W	
r,   c                    SR                  / SPU  PSPU PSPU  PSPU  PSPU  PSPU  PSPU PS	PU PS
PU  PSPU  PSPU  PSPU PSPU PSPU PSPU PSPU  PSPU  PSP5      $ )N r  z) < 1, jnp.sin((z + 1) * jnp.arccos(jnp.clip(z&, -1, 1))) / jnp.sqrt(jnp.maximum(1 - z**2, 1e-10)), jnp.where(z >= 1, jnp.where( == 1, z + 1.0, jnp.sinh((z  + 1) * jnp.arccosh(jnp.maximum(z , 1.0))) / jnp.sqrt(jnp.maximum(z**2 - 1, 1e-10))), jnp.where(z == -1, ((-1.0) ** ) * (z + 1.0), ((-1.0) ** z) * jnp.sinh((z! + 1) * jnp.arccosh(jnp.maximum(-z**2 - 1, 1e-10)))))joinr  s     r)   chebyshev_polynomial_u,PallasKernelOverrides.chebyshev_polynomial_u<  s   	; 	;  	; 	; $ 	;s	;6	;78c	;:(	;()s	;+	; 	;	; 		; #		; $%#		;&		;
 	;
 <	;
 =>3	;
?$	; %&3	;'	; 	; /	; 01c	; 27	; 89c	;:	; 	; +	; ,-#	; .O	; PQc	;R$	; %&3	; ':	;	
r,   c                   SR                  / SPU PSPU  PSPU PSPU  PSPU PSPU  PSPU  PSPU PS	PU  PS
PU  PSPU  PSPU PSPU  PSPU  PSPU  PSPU  PSPU PSPU  PSPU  PSPU  PSPU  PSPU  PSPU  PSP5      $ )Nr  r    == 0, jnp.ones_like(), jnp.where(	 == 1, 2* - 1, jnp.where(	 == 2, 4*z**2 - 2*	 == 3, 8*z**3 - 4***2 - 4* + 1, jnp.where(
 == 4, 16*z**4 - 8*	**3 - 12*z**2 + 4*
 == 5, 32*z	**5 - 16*	**4 - 32*z	**3 + 12***2 + 6*z - 1, jnp.zeros_like()))))))r  r  s     r)   chebyshev_polynomial_v,PallasKernelOverrides.chebyshev_polynomial_vQ  
   ) )j ) )0 ) ) 4 ))$)%&C)())$)%&C)'/)01s)3) ) %) &'C) (0) 12s) 3;) <=#)>) 	) &	) '(S	) )1	) 23	) 4=	) >?C	) @H	) IJs	)K	)
 )
 &)
 '(S)
 )2)
 34)
 5>)
 ?@S)
 AJ)
 KL)
 MU)
 VWTW)
X)  S) !()	
r,   c                   SR                  / SPU PSPU  PSPU PSPU  PSPU PSPU  PSPU  PS	PU PS
PU  PSPU  PSPU  PS	PU PSPU  PSPU  PSPU  PSPU  PSPU PSPU  PSPU  PSPU  PSPU  PSPU  PSPU  PSP5      $ )Nr  r   r  r  r  r  r  z**2 + 2*r  r  z**3 + 4*r  r  z**4 + 8*r  r  z	**5 + 16*r  r  z + 1, jnp.zeros_like(r  r  r  s     r)   chebyshev_polynomial_w,PallasKernelOverrides.chebyshev_polynomial_wa  r  r,   c                6    [         R                  SU  S3U5      $ Nz(2 * z - 1))rf   r  r  s     r)   shifted_chebyshev_polynomial_t4PallasKernelOverrides.shifted_chebyshev_polynomial_tq      $;;eA3e<LaPPr,   c                6    [         R                  SU  S3U5      $ r  )rf   r  r  s     r)   shifted_chebyshev_polynomial_u4PallasKernelOverrides.shifted_chebyshev_polynomial_uu  r  r,   c                6    [         R                  SU  S3U5      $ r  )rf   r  r  s     r)   shifted_chebyshev_polynomial_v4PallasKernelOverrides.shifted_chebyshev_polynomial_vy  r  r,   c                6    [         R                  SU  S3U5      $ r  )rf   r  r  s     r)   shifted_chebyshev_polynomial_w4PallasKernelOverrides.shifted_chebyshev_polynomial_w}  r  r,   c                    SR                  / SPU PSPU  PSPU PSPU  PSPU PSPU  PSPU PS	PU  PS
PU  PSPU PSPU  PSPU  PSPU PSPU  PSPU  PSPU  PSPU  PSP5      $ )Nr  r   r  r  z == 1, 2 * , jnp.where(z == 2, 4 * z**2 - 2, jnp.where(z == 3, 8 * z**3 - 12 * z == 4, 16 * z**4 - 48 * z**2 + 12, jnp.where(z == 5, 32 * z**5 - 160 * z**3 + 120 * , jnp.zeros_like(r  r  r  s     r)   hermite_polynomial_h*PallasKernelOverrides.hermite_polynomial_h  s~   ) )j ) )0 ) ) 4 ))&)'(c)*))&)'(c)*) ) ') ()c) *5) 67C)8) 	) (	) )*s	) +6	) 78S	)9	)
 )
 ()
 )*s)
 +7)
 89c)
 :F)
 GHS)
I)  S) !()	
r,   c                    SR                  / SPU PSPU  PSPU PSPU  PSPU PSPU  PSPU PS	PU  PS
PU  PSPU PSPU  PSPU  PSPU PSPU  PSPU  PSPU  PSPU  PSP5      $ )Nr  r   r  r  r  r  z == 2, z**2 - 1, jnp.where(z == 3, 
**3 - 3 * z == 4, z
**4 - 6 * z**2 + 3, jnp.where(z == 5, z**5 - 10 * **3 + 15 * r  r  r  r  s     r)   hermite_polynomial_he+PallasKernelOverrides.hermite_polynomial_he  s}   
) )j ) )0 ) ) 4 ))")#$#)&))")#$#)&) ) #) $%#) &0) 12s)3) 	) #	) $%#	) &0	) 12s	)3	)
 )
 #)
 $%#)
 &1)
 23)
 4?)
 @Ac)
B)  S) !()	
r,   c                   SR                  / SPU PSPU  PSPU PSPU  PSPU PSPU  PSPU  PS	PU PS
PU  PSPU  PSPU  PSPU PSPU  PSPU  PSPU  PSPU  PSPU PSPU  PSPU  PSPU  PSPU  PSPU  PSPU  PSP5      $ )Nr  r   r  r  z == 1, 1 - r  z == 2, (r  z + 2) / 2, jnp.where(z	 == 3, (-z**3 + 9*z	**2 - 18*z + 6) / 6, jnp.where(z == 4, (z	**4 - 16*z	**3 + 72*z	**2 - 96*z + 24) / 24, jnp.where(z	 == 5, (-z	**5 + 25*z
**4 - 200*z
**3 + 600*z
**2 - 600*z + 120) / 120, jnp.zeros_like(r  r  r  s     r)   laguerre_polynomial_l+PallasKernelOverrides.laguerre_polynomial_l  s	   
) )j ) )0 ) ) 4 ))&)'(c)*))#)$%3)&.)/0c)2) ) %) &'C) (0) 12s) 3<) =>3)?) 	) $	) %&3	) '0	) 12s	) 3<	) =>3	) ?H	) IJs	)K	)
 )
 %)
 &'C)
 (1)
 23)
 4>)
 ?@S)
 AK)
 LM#)
 NX)
 YZWZ)
[)  S) !()	
r,   c                    SR                  / SPU PSPU  PSPU PSPU  PSPU PSPU  PSPU PS	PU  PS
PU  PSPU PSPU  PSPU  PSPU PSPU  PSPU  PSPU  PSPU  PSP5      $ )Nr  r   r  r  r  r  z == 2, (3 * z**2 - 1) / 2, jnp.where(z == 3, (5 * r  z) / 2, jnp.where(z == 4, (35 * z**4 - 30 * z**2 + 3) / 8, jnp.where(z == 5, (63 * z**5 - 70 * r  z) / 8, jnp.zeros_like(r  r  r  s     r)   legendre_polynomial_p+PallasKernelOverrides.legendre_polynomial_p  s~   
) )j ) )0 ) ) 4 ))")#$#)&))')()s)+) ) () )*s) +5) 67C)8) 	) )	) *+	) ,7	) 89c	):	)
 )
 ))
 *+)
 ,7)
 89c)
 :E)
 FGC)
H)  S) !()	
r,   c                    SU  S3$ )Nzjnp.reciprocal(r!   r;   ri   s    r)   
reciprocal PallasKernelOverrides.reciprocal  r   r,   c                    SU  S3$ )Nzjnp.square(r!   r;   ri   s    r)   squarePallasKernelOverrides.square  r   r,   c                    SU  SU SU S3$ )zFused multiply-add: a * b + c

JAX doesn't have jnp.fma, so we use the unfused version.
The compiler may still fuse this on supported hardware.
z((r  z) + (r}  r;   )r   r   r&   s      r)   fmaPallasKernelOverrides.fma  s     A3eA3eA3b))r,   c                    SU  SU S3$ )Nzjnp.copysign(r    r!   r;   r   s     r)   copysignPallasKernelOverrides.copysign  s    qcA3a((r,   c                    SU  SU S3$ )Nzjnp.nextafter(r    r!   r;   r   s     r)   	nextafterPallasKernelOverrides.nextafter  rP  r,   c                    SU  SU S3$ )Nz
jnp.ldexp(r    r!   r;   r   s     r)   ldexpPallasKernelOverrides.ldexp  r   r,   c                    SU  S3$ )Nz
jnp.frexp(r!   r;   ri   s    r)   frexpPallasKernelOverrides.frexp  r   r,   c                    SU  S3$ )Nz	jnp.modf(r!   r;   ri   s    r)   modfPallasKernelOverrides.modf  rw   r,   c                    SU  SU S3$ )Nzjnp.bitwise_and(r    r!   r;   r   s     r)   bitwise_and!PallasKernelOverrides.bitwise_and  r8  r,   c                    SU  SU S3$ )Nzjnp.bitwise_or(r    r!   r;   r   s     r)   
bitwise_or PallasKernelOverrides.bitwise_or  r<  r,   c                    SU  SU S3$ )Nzjnp.bitwise_xor(r    r!   r;   r   s     r)   bitwise_xor!PallasKernelOverrides.bitwise_xor  r8  r,   c                    SU  S3$ )Nzjnp.bitwise_not(r!   r;   ri   s    r)   bitwise_not!PallasKernelOverrides.bitwise_not  r@  r,   c                    SU  SU S3$ )Nzjnp.left_shift(r    r!   r;   r   s     r)   
left_shift PallasKernelOverrides.left_shift  r<  r,   c                    SU  SU S3$ )Nzjnp.right_shift(r    r!   r;   r   s     r)   right_shift!PallasKernelOverrides.right_shift  r8  r,   c                    [         R                  R                  R                  SU5      nS[         R                  R                  R	                  U 5       SU S3$ )z)Load the random seed value from a buffer.load_seed_offsetr  z[0] + r!   )r   r   r#   seed_offsetinput)nameoffsetr  s      r)   	load_seedPallasKernelOverrides.load_seed  sH     hhmm//0BFK188==&&t,-VK=BBr,   c                    SU  SU SU S3$ )zGenerate uniform random numbers in [0, 1).

Uses JAX's threefry2x32 PRNG directly for vectorized random generation.
The seed provides the base key, offset provides per-element uniqueness.
zWjax.vmap(lambda o: jax.random.uniform(jax.random.fold_in(jax.random.PRNGKey(jnp.uint32(8)), jnp.uint32(o)), (), dtype=jnp.float32))(jnp.asarray(!).flatten()).reshape(jnp.asarray().shape)r;   seedr  s     r)   randPallasKernelOverrides.rand  s,    @@Dv F"8#DVHHV	
r,   c                    SU  SU SU S3$ )zGenerate standard normal random numbers.

Uses JAX's threefry2x32 PRNG directly for vectorized random generation.
The seed provides the base key, offset provides per-element uniqueness.
zVjax.vmap(lambda o: jax.random.normal(jax.random.fold_in(jax.random.PRNGKey(jnp.uint32(r  r  r  r;   r  s     r)   randnPallasKernelOverrides.randn  s,    @@Dv F"8#DVHHV	
r,   c                &    SU  SU SU SU SU S3$ )z,Generate random int64 values in [low, high).zWjax.vmap(lambda o: jax.random.randint(jax.random.fold_in(jax.random.PRNGKey(jnp.uint32(z)), jnp.uint32(o)), (), r    z , dtype=jnp.int64))(jnp.asarray(r  r  r;   )r  r  lowhighs       r)   	randint64PallasKernelOverrides.randint64$  s>    
@@DvE]^a]bbdeidj k"8#DVHHV	
r,   r;   )rj   r>   r=   r>   )r   r>   r   r>   r=   r>   )r   r>   r   r>   r   r>   r=   r>   )r   r>   r   zCallable[[], str]r   r   r=   r>   )NT)
rj   r>   r   torch.dtyper   zOptional[torch.dtype]r   r  r=   r>   )rj   r>   r   r)  r   r)  r=   r>   )r%   r<   r   r)  r=   r>   )r   r)  r=   r>   )rj   r>   rY  r>   rZ  r>   r=   r>   )rj   r>   r  r>   r=   r>   )rj   r>   r  r>   r=   r>   )r   r>   r   r>   r&   r>   r=   r>   )r  r>   r  r>   r=   r>   )r  r>   r  r>   r=   r>   )
r  r>   r  r>   r%  r>   r&  r>   r=   r>   )|r?   r@   rA   rB   rC   staticmethodrk   ro   rr   ru   ry   r|   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r  r"  r&  r)  r   r   r0  r3  r6  r:  r>  rB  rE  rH  rK  rN  rS  rV  r[  clipr^  ra  rd  rg  rj  rm  rq  rt  rx  r~  r  r  i0r  i1r  r  r  igammaigammacr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r	  r  r  r  r  r"  r'  rD   r;   r,   r)   rf   rf   x   s0	                      " " " " " "       ! !   ! !     ! !     % %     ! !     ! ! ! ! & & & & & & ( ( ( ( . . ; ;&  ,0"&	777 )7  	7
 
7 7 f f : :  6 6             ! ! E E 3 3
           ! ! ! ! $ $   , , + + ' ' , , ( ( & & % % * * e e
   6 6 D     # # - - . . 0 0 1 1 1 1 	
 	
 	
 	
 B B
 B B
 A A
 
B* * 
B* * 7 7
 8 8 FGJ J
 / / 3 3 4 4 6 6 
 
 
 
( 
 
 
 
 Q Q Q Q Q Q Q Q 
 
" 
 
 
 
 
 
 & & " " * * ) ) * * & & ! !     , , + + , , ' ' + + , , C C 
 
 
 
 
 
r,   rf   c                      \ rS rSr% SrS\S'   S\S'   S\S'   S\S	'   S\S
'   S\S'   S\S'   S\S'   S\S'   S\S'   S\S'   S\S'   S\S'   S\S'   S\S'   S\S'   S\S'   Srg)_CodegenContexti/  z@Bundles local state shared across codegen_kernel helper methods.r   coder>   kernel_namer  is_tpuinterpret_is_cpuinterpret_literal	list[str]kernel_paramspure_out_paramsoutput_paramssize_var_paramszdict[str, str]output_buffer_lookupzdict[str, bool]aliasable_flagsalias_paramspointer_tailkernel_input_paramsfull_kernel_paramszOrderedSet[str]non_alias_out_setz	list[int]copy_output_indicesr;   N)r?   r@   rA   rB   rC   __annotations__rD   r;   r,   r)   r1  r1  /  sj    J
L(($$""!!&&""r,   r1  c                    ^  \ rS rSr% Sr\r\rS\	S'   U 4S jr
          S=S jrS>S jrS>S jrS>S	 jrS?S
 jrS@S jrSAS jrSBS jrSAS jrSCS jrSDS jrSES jrSFS jr\SGS j5       rSHS jrSIS jrSES jrSJS jr    SKS jr      SLS jr          SMS jr           SNS jr!          SNS jr"          SNS jr#            SOS jr$SPS jr%        SQS  jr&        SRS! jr'        SSS" jr(STS# jr)        SUS$ jr* SV               SWS% jjr+            SXS& jr,\-R\                  SYS' j5       r/S>S( jr0\-R\                   SV         SZS) jj5       r1\ S[       S\S* jj5       r2 S]     S^S+ jjr3        S_S, jr4        S`S- jr5          SaS. jr6\SbS/ j5       r7SVScS0 jjr8SdS1 jr9      SeS2 jr:\        SfS3 j5       r;SgS4 jr<      SgS5 jr=          ShS6 jr>SiS7 jr?      SiS8 jr@      SiS9 jrA\          SjS: j5       rBSVSkS; jjrCS<rDU =rE$ )lPallasKerneliF  a  
Pallas kernel for elementwise operations with support for strided/scatter access.

Strategy:
- Convert index expressions to JAX-compatible array slicing
- Load/store using indexed access: "in_ptrX[slice]" or full-array "in_ptrX[...]"
- Compute expression with Python operators (compatible with jax.numpy broadcasting)
- Generate Python code that defines a Pallas kernel and a host entrypoint.
- Use async_compile.pallas path to compile and load Python code.

For GPU (Mosaic backend):
- Use TMA (Tensor Memory Accelerator) for automatic OOB masking
- Falls back to legacy padding approach for reductions, broadcasting, non-contiguous tensors
zCallable[[sympy.Expr], str]r   c                d  > [         TU ]  " U0 UD6  [        R                  R	                  5       nUR
                  S:H  U l        U R                  U l        U R                  =(       a    U R                  (       + U l        / U l	        0 U l
        [        5       U l        SU l        [        5       U l        g )NcudaF)superrY   r   graphget_current_device_or_throwtypeis_gpuuse_emit_pipelineuse_warpgroup_paddingstore_with_outputload_index_exprsr   outputs_need_readhas_transposed_loadr   )r$   r#   r]   device	__class__s       r)   rY   PallasKernel.__init__Y  s    $)&)446kkV+ "&%)[[%O9O9O5O"8:792<, $) 8Br,   c                    g)z)Check array bounds for indirect indexing.Nr;   )r$   r%   rN   loweruppers        r)   check_boundsPallasKernel.check_boundsp  s    r,   c                    U R                  U5      n[        U[        R                  5      (       a  gUR                  (       a  [        U5      $ U R                  U5      $ )a  
Convert an index expression to a string suitable for Pallas indexing.

Pallas operates on full arrays, so we need to convert index expressions
to JAX array slicing. For example:
- x0 -> "..." (contiguous access, full array)
- 2*x0 -> "::2" (strided access with stride 2)
- 2*x0 + 1 -> "1::2" (strided access with offset 1, stride 2)

Args:
    index: The indexing expression to convert

Returns:
    The indexing string to use in generated code
...)r   r   sympySymbol
is_Integerr>   _convert_to_jax_slice)r$   indexprepared_indexs      r)   _get_index_strPallasKernel._get_index_strw  sQ    " ..u5 nell33&&~&& --n==r,   c                    U R                   (       d  gU R                  U5      nUR                  [        5      (       a;  U R                  R                  U R                  U5      5        U R                  U5      $ [        R                  R                  R                  U5      nU R                  U5      nU R                  R                  U5        [        U5      S:X  a  [        U5      $ [        U5      S:X  Ga  [        [        U5      5      n[         R"                  " X5      n[         R$                  " XC5      nUb  X-
  n[        R                  R                  R                  U5      nUS:  a  U R                  U5      $ US:X  a  gUS:w  a  U R                  U5      $  ['        U5      nUS:  a  U R                  U5      $  U R                  U5       S3$ X-
  n[        R                  R                  R                  U5      nUS:X  a  XC:X  a  gg[        U5      S:  aJ  SnU H9  n[         R"                  " X5      n[         R$                  " XC5      nUS:w  d  M7  Sn  O   U(       a  ggg! [(        [*        4 a    U R                  U5      s $ f = f)a  
Convert a sympy index expression to JAX slice notation.

Handles common patterns like:
- stride*var -> ::stride
- stride*var + offset -> offset::stride

For more complex patterns, falls back to explicit indexing.
Uses BlockPatternMatcher for robust pattern matching.
r]  r   r   z::1TF)range_treesr   hasr	   r   r   r   r   r   rJ  sizevarssimplifylenr>   nextiterr   get_subexpr_involving_symbolmatch_affine_block_exprint	TypeError
ValueError)	r$   rb  	used_varsr   var_exprstrider  
offset_valall_unit_strides	            r)   ra  "PallasKernel._convert_to_jax_slice  sX     $$U+ 99_%%&&t'?'?'FG ::e$$   ))%0,,U3	 	""9-y>Qu:^q tI'C +GGSH )@@OF!)))226:A:::e,,Q;  Q;::e,,-!$VJ!A~#zz%00 &
 **V,-S11 )))226:Q;8? , + ^a #O .KKEW,DDXSQ;&+O !   A ":. -::e,,-s   !I) )!JJc                    UR                   nU R                  5       nX#-  nXB:w  a  [        SU 35      eU R                  R	                  U5        U R                  U5      nU$ )a~  
Generate JAX code to compute an index array for strided/complex indexing patterns.

For expressions like `2 * x3 + 32 * x2 + 256 * x1 + 1024 * x0`, we generate
code that computes the flattened index array using broadcasting.

The iteration variables (x0, x1, x2, x3) are already defined as jnp.arange arrays
in the kernel. We just need to convert the sympy expression to JAX code.
z9Pallas backend does not yet support mixed index pattern: )free_symbols_get_iter_varsrc   r   r   r   )r$   rb  rz  	iter_varsrs  	index_strs         r)   _generate_strided_index$PallasKernel._generate_strided_index  sr     ))'')	 !,	$KE7S 
 	""9- JJu%	 r,   c                H    [        U R                  R                  5       5      $ )z*Get the set of iteration variable symbols.)r   range_tree_nodeskeys)r$   s    r)   r{  PallasKernel._get_iter_vars  s    $//44677r,   c                <    UR                   U R                  5       -  $ )z4Get iteration variables used in an index expression.)rz  r{  r$   rb  s     r)   r    PallasKernel._get_used_iter_vars   s    !!D$7$7$999r,   c                6    [        U R                  U5      5      $ )z7Check if index expression contains iteration variables.)r  r   r  s     r)   _has_iteration_vars PallasKernel._has_iteration_vars$  s    D,,U344r,   c                    UR                    Vs/ s H&  n[        U5      R                  S5      (       d  M$  UPM(     sn$ s  snf )zDGet list of indirect variable symbols (tmp*) in an index expression.tmp)rz  r>   
startswith)r$   rb  ss      r)   _get_indirect_varsPallasKernel._get_indirect_vars(  s3     --J-aQ1B1B51I-JJJs   #??c                <    [        U R                  U5      5      S:  $ )z6Check if index expression contains indirect variables.r   )rk  r  r  s     r)   _has_indirect_varsPallasKernel._has_indirect_vars,  s    4**512Q66r,   c                   [        U R                  R                  5       5      n/ nU H6  u  p4U R                  UR                  5      nUc  M%  UR                  U5        M8     [        U5      S::  a  U$ [        [        U5      5      $ )a  Get the expected output shape from iteration variables.

Iteration variables are shaped for broadcasting. For 2D outputs:
- First var (e.g., y0) gets shape (1, N) - innermost dimension
- Second var (e.g., x1) gets shape (M, 1) - outermost dimension
The broadcast result is (M, N).
r   )listr  items	_safe_intlengthappendrk  reversed)r$   	var_itemsbroadcast_varsvar_symentryr  s         r)   _get_expected_output_shape'PallasKernel._get_expected_output_shape0  s{     ..4467	'NG^^ELL1F!%%f- (
 ~!#!!
 H^,--r,   c                V   U R                  U5      nUc  gUu  pEpFn[        U5      S:w  d  [        U5      S:w  a  gU R                  US   5      nU R                  US   5      nUb  Ub  US::  d  US::  a  gUS   n	US   n
U	b  U
c  g[        U R                  R                  5       5      n[        U5      S:  a  g[        S U 5       5      (       a  gUS   S   nUS   S   n[        R                  R                  R                  U5      nU R                  X,5      nU R                  X-5      nUS:w  aw  US:w  aq  X:H  =(       a    U
S:H  nU(       d  gU R                  5       nU(       d  g[        X-
  5      [        X-
  5      :  n[        X-
  5      [        X-
  5      :  nU=(       a    U$ g)a  Check if buffer access needs transpose.

Transpose on load is needed when:
1. Non-square buffers: dimensions are swapped relative to iteration vars
2. Square buffers: index coefficient pattern indicates transposed access
   (first iteration var has larger coefficient than second)
Fr
   r   r   c              3  >   #    U  H  u  pUR                   v   M     g 7fr`   is_reduction).0_r  s      r)   	<genexpr>5PallasKernel._is_transposed_access.<locals>.<genexpr>k  s     <)hau!!)s   )_get_buffer_infork  r  r  r  r  anyr   rJ  ri  rj  _get_index_coefficient_has_column_major_outputr   )r$   r  rb  rX   r  buf_sizeactual_stridessize0size1s0s1r  	inner_var	outer_varinner_coeffouter_coeffis_standard_row_majoroutput_is_column_majorinner_matches_s0outer_matches_s1s                       r)   _is_transposed_access"PallasKernel._is_transposed_accessH  s    $$T*<,0)Q x=A^!4!9x{+x{+=EMUaZ5A:AA: ..4467	y>A <)<<< aLO	aLO	  ))%011%C11%C!q 0$&K$;B!G!( &*%B%B%D")  #;#34s;;K7LL";#34s;;K7LL#8(88r,   c                   [        U R                  S0 5      n[        R                  " U[        R
                  R                  5      nU H  n[        R
                  R                  U5      nUc  M'  X1;  a  [        U[        5      (       d  MC  [        USS 5      " 5       nUc  M[  [        USS5      nUb  [        U5      S:  a  M|  U R                  US   5      nU R                  US   5      nUc  M  Uc  M  Xx:  d  M    g	   g
)z:Check if any output buffer has column-major stride layout.output_buffersN
get_layoutc                     g r`   r;   r;   r,   r)   <lambda>7PallasKernel._has_column_major_output.<locals>.<lambda>  s    Dr,   ru  r
   r   r   TF)getattrr#   	itertoolschainr   rJ  name_to_buffer
get_bufferr   r   rk  r  )	r$   r  	buf_namesbuf_nameout_buflayout
out_strideout_s0out_s1s	            r)   r  %PallasKernel._has_column_major_output  s     ,<bA OONAGG4J4JK	!Hgg((2G-j7 7 WlLACF~ 48J!S_q%8^^JqM2F^^JqM2F!f&8V_# "& r,   c                   U R                  U5      nU R                  U5      nU(       a  U(       a  U R                  U5      S4$ U(       a  U R                  U5      S4$ U R	                  U5      nUR                  [        5      =(       a    US:g  nU(       d2  US:w  a,  SU;   d&  UR                  S5      R                  5       (       d  SnXE4$ )z@Get the index expression string and whether it needs flattening.TFr]  ::-)	r  r  _handle_mixed_indexingr   rd  rh  r	   lstripisdigit)r$   rb  has_indirecthas_iter_varsr}  needs_flattens         r)   _get_index_exprPallasKernel._get_index_expr  s    ..u5007M..u5t;;::e$e++++E2I "IIo6M9;MM !Y%%7	)Y-=-=c-B-J-J-L-L$(M++r,   c                F     [        U 5      $ ! [        [        4 a     gf = f)z0Convert value to int, returning None on failure.N)rp  rq  rr  )r  s    r)   r  PallasKernel._safe_int  s'    	s8O:& 		s   
   c                    SnU H;  nX0R                   ;   d  M  U R                  U R                   U   5      nUc    gX$-  nM=     U$ )zBCompute total numel for given prefixes (e.g., pointwise prefixes).r   N)numelsr  )r$   prefixesr2   r'   numels        r)   _compute_prefix_numel"PallasKernel._compute_prefix_numel  sG    AKKt{{1~6=  r,   c                    SnU R                    H:  nUR                  (       d  M  U R                  UR                  5      nUc    gX-  nM<     U$ )zCompute total reduction numel.r   N)rg  r  r  r  )r$   r2   treer  s       r)   _compute_reduction_numel%PallasKernel._compute_reduction_numel  sK    $$D   tzz2= % r,   c                  ^  T R                  5       nUb  US:  a  g/ nT R                  R                   H  nT R                  U5      nUc    gUu  pVpxn	U	(       d    g[	        USS 5      " 5       n
U
b  SSKnXR                  :X  a    g[        U 4S jU 5       5      nSU;   a    gUR                  U5        M     U(       a  [        [        U5      5      S:  a  gU(       ae  SnUS    H  nX-  nM	     SnT R                   H;  nUR                  (       a  M  T R                  UR                  5      nUc    gUU-  nM=     X:w  a  gg)	aV  
Check if TMA (Tensor Memory Accelerator) approach can be used.
TMA works for simple element-wise ops but not for:
- Reductions (need different accumulation patterns)
  TODO: TMA supports float64 for loading but not for reductions
- Broadcasting (inputs have different shapes or output differs)
- Non-contiguous tensors (strided, transposed)
Nr   F	get_dtypec                     g r`   r;   r;   r,   r)   r  4PallasKernel._can_use_tma_approach.<locals>.<lambda>  s    dr,   r   c              3  F   >#    U  H  nTR                  U5      v   M     g 7fr`   r  r  r  r$   s     r)   r  5PallasKernel._can_use_tma_approach.<locals>.<genexpr>  s     D8aq 1 18s   !T)r  r#   input_buffersr  r  r  float64tupler  rk  r   rg  r  r  r  )r$   reduction_numelinput_shapesr  rX   buf_objr  	buf_numelr  is_contiguous	buf_dtyper  shape_tupleinput_numelr  output_numelr  r  s   `                 r)   _can_use_tma_approach"PallasKernel._can_use_tma_approach  sJ    779&?Q+> %'II++D((.D|JNGGy- 
  lCEI$-   D8DDK{",- ,2 C
< 89A= K!!_  % L((((( NN4::6E}$ E)L ) *r,   c                   [         R                  R                  U5      nUc  gUR                  5       nSnU H  nU R	                  U5      nXFb  UOU-  nM     / nSn[        USS 5      " 5       n	U	(       a  [        U	SS5      OSn
U
b  [        [        U5      5       H'  nU R	                  X   5      nUR                  U5        M)     [        U5      S:X  a  US   b  US   S:w  a  SnO[[        U5      S:  aL  Sn[        [        U5      S-
  S	S	5       H-  nX{   nUb  X:w  a  SnU R	                  X;   5      nUc  M)  X-  nM/     X#XGU4$ )
z~Get buffer metadata (buf_obj, buf_size, buf_numel, actual_strides, is_contiguous).

Returns None if the buffer doesn't exist.
Nr   Tr  c                     g r`   r;   r;   r,   r)   r  /PallasKernel._get_buffer_info.<locals>.<lambda>)  s    r,   ru  r   F)	r   rJ  r  get_sizer  r  rangerk  r  )r$   r  r  r  r  r  svalr  r  r  
buf_strideiactual_strideexpected_stridedim_sizes                  r)   r  PallasKernel._get_buffer_info  s]   
 ''$$T*?##%	A>>!$D!1q8I 
  ",=?8>WVXt4D
!3x=) $z} =%%m4 *
 8}!!!$0^A5F!5K$)MX""#s8}q0"b9A$2$5M$,0P(-#~~hk:H+'3 : )]JJr,   c                    U R                  U5      n/ nU HT  nX@R                  ;   d  M  U R                  U   nU R                  UR                  5      nUc  MC  UR	                  U5        MV     SnU H  nXx-  nM	     Xr4$ )zNCompute expected output numel and used vars from iteration variables in index.r   )r   r  r  r  r  )	r$   rb  rs  used_range_lengthsr   r  
length_valr  ls	            r)    _compute_output_numel_from_index-PallasKernel._compute_output_numel_from_indexA  s     ,,U3	C+++--c2!^^ELL9
)&--j9  #AL $ &&r,   c                    [        5       nU H[  n[        R                  " X5      n[        R                  " XT5      nUc  SnU R	                  U5      nUR                  Ub  UOU5        M]     U$ )zD
Extract coefficients of iteration variables from index expression.
r   )r   r   rn  ro  r  add)r$   rb  rs  coefficientsr   rt  ru  coefs           r)   _get_index_coefficients$PallasKernel._get_index_coefficientsU  sj     $.<C*GGSH(@@OF~>>&)DT%5T6B  r,   c                V   S/n[        U5      S:  aR  Sn/ n[        [        U5      S-
  SS5       H1  nUR                  SU5        U R                  X   5      nUc  M-  Xh-  nM3     U(       a  [	        U5      n	U H
  n
X;  d  M
    g   g[	        S U 5       5      nU H
  n
X;  d  M
    g   g)zB
Check if access pattern requires gather (non-standard striding).
r   r  r   Tc              3  .   #    U  H  oc  M  Uv   M     g 7fr`   r;   r  r  s     r)   r  5PallasKernel._check_gather_pattern.<locals>.<genexpr>  s     *Vn11ns   	F)rk  r  insertr  r   )r$   r  r  r  r  expected_stridesr  r  r  expected_stride_setr  actual_stride_sets               r)   _check_gather_pattern"PallasKernel._check_gather_patterne  s     3x=1O!3x=1,b"5 ''?;>>(+6'#/O	 6 ",-=">$2 %  !+*Vn*V V$0 % r,   c                H  ^  US:w  d  U(       a  X44$ T R                  U5      nUc  X44$ Uu  pgpn
T R                  U5      u  pT R                  5       nT R                  X,5      nT R	                  XyX5      n[        U 4S jU 5       5      n[        U5      [        U5      :  =(       a8    [        U5      S:  =(       a#    US:  =(       a    [        U5      [        U5      :  n[        R                  R                  5       R                  S:H  nU
(       + =(       a    [        S U	 5       5      n[        S U 5       5      nU=(       a    U(       + =(       a    X:H  nUS:  aC  X:w  d  U(       d  U(       a0  [        U5      S:  a!  U(       d  U(       d  T R                  U5      S4$ X44$ )	z
Check if buffer access needs strided indexing due to size mismatch or gather patterns.

This handles cases like:
- Pooling operations where input/output have different sizes
- im2col-like gather patterns
- Transposed or strided buffer access
r]  c              3  V   >#    U  H  nTR                  U5      S :w  d  M  S v   M      g7fr   Nr  r  s     r)   r  7PallasKernel._needs_strided_indexing.<locals>.<genexpr>  s#      OHqq8IQ8NHs   )	)r   r   tpuc              3  (   #    U  H  oS Lv   M
     g 7fr`   r;   r  s     r)   r  r"    s      <
#1aTM>   c              3  Z   #    U  H!  n[        U[        [        -  5      (       + v   M#     g 7fr`   )r   rp  r   )r  r&   s     r)   r  r"    s!     U1Jq#+$> > >s   )+T)r  r  r{  r  r  sumrk  r   rJ  rK  rL  allr  r~  )r$   r  rb  r}  r  rX   r  r  r  r  r  r  rs  all_iter_varsr  has_non_unit_stridesbuf_effective_dimsnot_all_vars_usedr4  is_known_non_contiguoushas_symbolic_coefskip_for_non_contiguouss   `                     r)   _needs_strided_indexing$PallasKernel._needs_strided_indexing  s     ++$$T*<++FJC9m"&"G"G"N++-33EE  $99m 

 ! OH OO	NS// /I"/"Q&/ IX.	 	 446;;uD&3"3 #
 <
#1<
 9
  UUU#PF
Py7P 	  1*.?CWI"+%//6<<''r,   c                   U(       d  US:X  a  X44$ [         R                  R                  U5      nUc  X44$ UR                  5       n[	        U5      S:X  a  SU4$ [	        U5      S:  a5  U R                  U5      nU(       d  US4$ SU;   a  U R                  U5      S4$ U R                  (       a  SU;   a  U R                  U5      S4$ X44$ )zP
Adjust index expression based on buffer shape (0-dim scalar, multi-dim, etc.).
r]  r   r   Tr  )r   rJ  r  r   rk  r  r~  rM  )r$   r  rb  r}  r  r  r  r  s           r)   _adjust_index_for_buffer_shape+PallasKernel._adjust_index_for_buffer_shape  s     I.++''$$T*?++##% x=A-'' x=1 44U;M  $&"33E:D@@ ;;49,//6<<''r,   c                   U(       d  X44$ [         R                  R                  U5      nUc  X44$ UR                  5       n[	        U5      nUS:  a  X44$ U R                  U5      n[	        U5      S:w  a  X44$ [        [        U5      5      n	[        R                  " X)5      n
U R                  [        R                  " X5      5      nUb  US::  a  X44$ [         R                  R                  R                  X*-
  5      n [        U5      nUS:  d  X:  a  X44$ U R                  US   5      nUb  X-  S:w  a  X44$ U R"                  R%                  U	5      nUc  X44$ U R                  UR&                  5      nSnU H!  nU R                  U5      nUc  X44s  $ UU-  nM#     Ub	  UU-  U:w  a  X44$ SUS-
  -  nUS:X  a  U SU 3nUS4$ U U SU 3nUS4$ ! [        [         4 a    X44s $ f = f)z
Try to emit multi-dim slice notation instead of flatten + gather.

For a buffer with shape (d0, ..., dk) and index `stride * var + offset`,
emit `buf[:, ..., :, offset::stride]` when stride divides dk.
r
   r   r   r  z:, r  F)r   rJ  r  r   rk  r   rl  rm  r   rn  r  ro  ri  rj  rp  rq  rr  r  getr  )r$   r  rb  r}  r  r  r  ndimrs  r   rt  ru  r  rv  last_dimr  
var_lengthr  r  dprefix	slice_strs                         r)   _try_multidim_slice PallasKernel._try_multidim_slice  s9    ++''$$T*?++##%8}!8++ ,,U3	y>Q++4	?#&CCEO77F
 >Vq[++!!**5+;<	,VJ >Z1++>>(2,/x0A5++ %%))#.=++^^ELL1
	Aq!Ay //NI	 
 f!4	!A++$(#?!("VH-I % "(:,b9I%A :& 	,++	,s   >G G)(G)c                .   U(       a\  UR                  [        R                  5      =(       d    UR                  [        R                  5      nU(       a  SU S3OUnU SU S3$ U SU S3nUS:X  a#  U R	                  X#5      (       a  SU S3nS	U l        U$ )
z3
Build the load expression based on indexing mode.
r  z).astype(jnp.int64)z[...].flatten()[][r]  zjnp.transpose(r!   T)rh  r^  MinMaxr  rS  )	r$   bufr  rb  r}  r  
has_minmaxidx	load_exprs	            r)   _build_load_exprPallasKernel._build_load_expr;  s     599-E5991EJ8BAi[ 34	CU*3%q11 %q1-I E!d&@&@&M&M,YKq9	+/(r,   c                8  ^ UR                  S5      (       d  U$ [        U4S jU R                  R                   5       5      nU(       aP  [        R
                  R                  U5      mTb.  TR                  5       n[        U5      S:X  a  US   S:X  a  SU S3$ U$ )z
Squeeze (N,1) intermediate buffers when kernel has 1D graph inputs.

This avoids wrong broadcasting: (N,) op (N,1) -> (N,N) instead of (N,)
rD  c              3     >#    U  Hh  nUR                  S 5      (       + =(       aF    [        R                  R                  U5      =mSL=(       a    [	        TR                  5       5      S:H  v   Mj     g7f)rD  Nr   )r  r   rJ  r  rk  r   )r  r  r  s     r)   r  BPallasKernel._maybe_squeeze_intermediate_buffer.<locals>.<genexpr>`  sk      
 4 ##E** -GG..x88E-G$$&'1,- 4s   A0A3r
   r  r   zjnp.squeeze(z
, axis=-1))	r  r  r#   r  r   rJ  r  r   rk  )r$   r  rG  has_1d_inputr  r  s        @r)   "_maybe_squeeze_intermediate_buffer/PallasKernel._maybe_squeeze_intermediate_bufferV  s     u%%  
 !II33	
 
 gg((.G""++-x=A%(2,!*;))J??r,   c                   [         R                  R                  U5      nUb  [        UR	                  5       5      S:w  a  U$ U R                  UR	                  5       S   5      nUc  U$ UR                  S5      (       a  U$ [         R                  R                  U5      nUb  UR                  (       d  U$ SnU R                  R                   H  n[         R                  R                  U5      n	U	c  M'  [        U	R	                  5       5      S:  d  MF  U	R	                  5        V
s/ s H  oR                  U
5      PM     nn
[        S U 5       5      (       a    OSnM     Ub  [        U5      S::  a  U$ U R                  U5      n[        U5      S:w  a  U$ [        [        U5      5      nXR                  ;  a  U$ U R                  U   nU R                  UR                   5      U:w  a  U$ U R                  R#                  5        VVs/ s H;  u  pU R                  UR                   5      U:X  d  M&  UR$                  (       a  M9  UPM=     nnn[        U5      S:w  a  U$ ['        U5       VV
s/ s H  u  noU:X  d  M  UPM     nnn
[        U5      S:w  a  U$ US   nU[        U5      S-
  :X  a  U$ S/[        U5      -  nSUU'   U SSR)                  [+        [,        U5      5       S	3$ s  sn
f s  snnf s  sn
nf )
zGReshape 1D buffers (e.g., batch norm mean) for higher-dim broadcasting.Nr   r   rD  c              3  (   #    U  H  oS Lv   M
     g 7fr`   r;   r  s     r)   r  :PallasKernel._maybe_broadcast_1d_buffer.<locals>.<genexpr>  s     ;l}lr%  r  	.reshape(r    r!   )r   rJ  r  rk  r   r  r  r  is_floating_pointr#   r  r(  r   rl  rm  r  r  r  r  	enumerater  mapr>   )r$   r  rb  rG  r  
buf_lengthr   ref_buf_sizer  	other_bufr  rs  used_varr  vematching_varsr  matching_dimsaxis_posreshape_dimss                        r)   _maybe_broadcast_1d_buffer'PallasKernel._maybe_broadcast_1d_bufferp  s    ''$$T*?c'"2"2"45:^^G$4$4$6q$9:
 ??5!!!!$'U%<%< 		//H**84I$Y-?-?-A)BQ)F;D;M;M;OP;Oaq 1;OP;l;;;# 0 3|#4#9 ,,U3	y>QY(000 %%h/>>%,,':5
 --335
5~~ahh':5 >?nn 5 	 

 }" (1'>R'>tq!z/'>R}" #s<(1,,sS..!#XIdiiC0F&G%HJJQ  Q*
 Ss$   &K)	%K.2K.K.-K4=K4c                   US:w  d  U(       a  X#4$ U R                  U5      nU R                  5       nU R                  U5      n[        US5      (       a  UR                  O	[        5       U-  nXv-
  nU(       a  [        U5      S::  a  X#4$ Sn	U R                  R                  5        H  u  pU R                  U5      nU(       d  M  X:w  a  M&  U R                  U5      n[        US5      (       a  UR                  O	[        5       U-  nX:w  d  Xv:X  a  Mm  U R                  XU5      (       a  M  Sn	  O   U	(       a  U R                  U5      S4$ X#4$ )z
Check for im2col-like patterns where store uses block variables but load doesn't.

For cat/expand patterns, both load and store prepared indices share block vars.
For im2col patterns, store compresses to block vars but load doesn't.
r]  rz  r   FT)r   r{  r   hasattrrz  r   rk  rQ  r  _check_load_is_strided_inputr~  )r$   rb  r}  r  rc  r|  store_orig_varsstore_prep_varsnew_varshas_im2col_patternr  
load_indexload_orig_vars	prep_loadload_prep_varss                  r)   _check_im2col_pattern"PallasKernel._check_im2col_pattern  sl    ++..u5'')	2259 ~~66 ''	
 #4 3/14++ #$($9$9$?$?$A H!55jAN! 0 --j9I 9n55 &&\	N /?3U 44n  &*"5 %B8 //?EE''r,   c                `   [         R                  R                  U5      nUc  g[        USS 5      " 5       nUc  g[        USS5      nUc  gUR	                  5       n/ nU H[  n	[
        R                  " X)5      n
[
        R                  " X5      nUc  M4  U R                  U5      nUR                  Ub  UOU5        M]     [        5       n[        U5       HJ  u  pU R                  X~   5      nUb  US:  d  M#  U R                  U5      nUR                  Ub  UOU5        ML     [        U5      U:H  $ )zL
Check if load coefficients match buffer strides (strided input vs im2col).
NFr  c                     g r`   r;   r;   r,   r)   r  ;PallasKernel._check_load_is_strided_input.<locals>.<lambda>  s    Dr,   ru  r   )r   rJ  r  r  r   r   rn  ro  r  r  r   rU  r  )r$   r  rj  rk  rD  r  buf_strides	buf_sizesload_coeffsr   rt  r  int_coefbuf_stride_setr  r  r  int_ss                     r)   re  )PallasKernel._check_load_is_strided_input  s     gg  *;lL9;>fh5LLN	 !C*GG
XH&>>xMD>>$/""x/C8N " $k*DA~~il3H8a<q)""E,=51E	 + +&.88r,   c                   U R                   (       a  gU R                  U5      nUc  gUu  p4p5n[        U5      S:w  d  [        U5      S:w  a  gU R                  US   5      nU R                  US   5      nUS   nUS   n	Ub  U	b  X:  a  Ub  Ub  US:  a  US:  d  gU R                  R
                   HL  n
U R                  U
5      nUc  M  Uu      p<n[        U5      S:w  a  M1  US   nUS   nUc  M@  Uc  ME  X:  d  ML    g   g)z
Check if output needs transpose for column-major storage.

Transpose on store is needed when:
- Output has column-major stride (s0 < s1)
- But input(s) have row-major stride
- And we haven't already transposed on load
Fr
   r   r   T)rS  r  rk  r  r#   r  )r$   r  rX   r  r  r  r  r  r  r  inp_nameinp_infoinp_stridesinp_s0inp_s1s                  r)   _check_store_needs_transpose)PallasKernel._check_store_needs_transpose  s+    ##$$T*<,0)Q~!#s8}'9x{+x{+AA N!!		 		//H,,X6H&.#Aq!!;1$ ^F ^F!f&8V_ 0 r,   c                    SU S3/nU(       a  UR                  U SU S35        U$ UR                  U SU SU SU SU S3
5        U$ )	z
Build store expression for full array assignment.

Handles scalar broadcast, shape matching, and optional transpose.
Returns a list of lines to emit (variable assignment + store).
_val = jnp.asarray(r!   [...] = jnp.full(z8.shape, _val) if _val.ndim == 0 else jnp.transpose(_val)z3.shape, _val) if _val.ndim == 0 else (_val.reshape(z.shape) if _val.size == z".size else jnp.broadcast_to(_val, z.shape)))r  )r$   outvalueneeds_transposeliness        r)   _build_full_array_store_expr)PallasKernel._build_full_array_store_exprK  s     'ugQ/0LL% 5 !+,  LL% 5 !&&)U*B3% H//2e8= r,   c                   US:X  a#  U R                  U5      nU R                  XU5      $ U(       a  US:X  a  SOSn	U SU SU SU	 SU S	U S
3/$ U R                  U5      n
[        R                  R                  U5      nUbG  UR                  5       n[        U5      S:  a(  U R                  U5      (       d  U R                  XS5      $ U
(       a  US:X  a  SOSn	SU S3/nSU SU S3nUS:X  aE  U R                  R                  U5        U S3nUR                  U SU SU SU	 SU SU S
35        U$ UR                  U SU SU 35        U$ U SU SU 3/$ )z
Build the store expression based on indexing mode.
mode can be None (set) or "atomic_add" (accumulate).
Returns a list of lines to emit.
r]  
atomic_addr  set[...] = z[...].flatten().at[(z).flatten()].z(jnp.asarray(z).flatten()).reshape(.shape)r   Fr  r!   z
(jnp.full(z%.shape, _val) if _val.ndim == 0 else _aliasr  z.flatten()).reshape(rA  z] = )r  r  r  r   rJ  r  r   rk  r  rR  r  r  )r$   r  r  rb  r  r}  r  moder  
scatter_opr  rD  r  r  
value_expralias_params                   r)   _build_store_exprPallasKernel._build_store_exprd  s    "??EO44SQQ"&,"6EJ%xu$8=Q[P\ ]$g%:3%wH  ..u5gg  &?||~H8}q )A)A%)H)H88UKK"&,"6EJ*5'34EYK'LUGSTU  |#&&**3/!$Vne8K=0DYK}]g\hhi!l"6se7D L uAi[ZLABL%q4w/00r,   c           
     f   UR                  SS5      nU R                  R                  U5        U S3nUS:X  a  SOSnU(       aw  US   n	US   n
US	   n/ n[        [	        U5      5       H,  nX:X  a  UR                  U	5        M  UR                  S
5        M.     SR                  U5      nU SU SU SU SU S3
$ US   n	US   nUS   n[        R                  R                  U5      nUb  [	        UR                  5       5      OSn[	        U5      [	        U5      -   n[	        U R                  5      nUS-
  nUU:H  =(       a    UU:H  nU(       a{  U VVs/ s H  u  nnUPM
     nnn[	        U5      n[	        U5      nUS:  a  US:  a  SU-  nSU-  nU	 SU SU S3nOU	nUR                  U5        UR                  S U 5       5        O;U Vs/ s H  nSPM     nnUR                  U	5        UR                  S U 5       5        SR                  U5      nU SU SU SU SU S3
$ s  snnf s  snf )zBBuild store expression for scatter operations (indirect indexing).is_point_scatterFr  r  r  r  indirect_varindirect_dimoutput_shape0r    r  z	[...].at[z].r  r!   dims_before
dims_afterr   r   None, , NonerA  r]  r@  c              3  *   #    U  H	  u  pUv   M     g 7fr`   r;   )r  var_namerN   s      r)   r  9PallasKernel._build_scatter_store_expr.<locals>.<genexpr>  s     IjNHxjs   :c              3  &   #    U  H  nS v   M	     g7f)r  Nr;   )r  r  s     r)   r  r    s     7JqsJs   )r6  rR  r  r  rk  r  r  r   rJ  r  r   r  extend)r$   r  r  scatter_infor  r  r  r  r  r  r  r  index_partsdimindex_tupler  r  rD  output_ndimnum_iter_vars_in_storetotal_kernel_iter_varsremaining_dimsis_element_wiser  rN   	n_leading
n_trailingleading_onestrailing_nonesindirect_reshapedr  s                                  r)   _build_scatter_store_expr&PallasKernel._build_scatter_store_expr  s    (++,>F 	""3'Vn #l2U
'7L'7L'7L KS./&&&|4&&s+	 0 ))K0KU(;-yR
|STUZT[[\]] $N3"=1!,/
 gg  &-0_c#,,.)!!$[!1C
O!C!$T%:%:!;$q #n4 A&*@@ 	
 :EF+$8+KF K(IZJ1}a')3!)J!6'3nAl^3~FVVW$X!$0!01IjII )4413K4|,7J77ii,e8K=	+bAeWTUV	
+ G  5s   H(H.c                    U R                   R                  U5      n[        R                  R	                  U5      nX R
                  U'   U R                  U5      u  pVU R                  XXV5      u  pVU R                  XXV5      u  pVU R                  XXV5      u  pVU R                  X1X%U5      nU(       d)  US:X  a#  U R                  X5      nU R                  XU5      nU R                  R                  U R                  UUS9$ )Nr]  r   )r#   r  r   rJ  r  rQ  r  r0  r3  r=  rH  rN  ra  r   r   r   )r$   r  rb  rD  r   r}  r  rG  s           r)   loadPallasKernel.load  s   iiood#!!$' ',d# $(#7#7#> 	 $(#?#?$
 	
 $(#F#F$
 	
 $(#;#;$
 	
 ))#U}U	 e!3??PI77YOIxx  LL ! 
 	
r,   c                z	  ^ ^^$^% T R                  T5      nT R                  R                  U5        [        U5      S:X  a  T R	                  T5      $ UU 4S jn[        X#SS9nU Vs/ s H
  oS" U5      PM     nnT R	                  T R                  T5      5      nT R                  T5      nU V	s/ s H  n	[        U	5      PM     n
n	U Vs0 s H  n[        U5      U" U5      _M     nn[        U5      S:X  a  [        U
5      S:X  a  US   n[        U5      nUT R                  ;   =(       a    T R                  U   R                  nU(       ae  UT R                  ;   aS  T R                  U   nUR                  nT R                  U5      nST R	                  U5       S3nUR                  UU5      nU$ Sn[        U
5      S:  a!  T R                  5       U-
  n[        U5      S:H  nU(       a  S[        U5      -   nU
 H+  nS	[        U5      -  nU S
U S3nUR                  UU5      nM-     [        U5       H  u  nn[        U5      nUT R                  ;   d  M#  T R                  U   nUR                  nT R                  U5      nS/U-  nT R	                  U5      UUS-   '   SR                  U5      nST R	                  U5       SU S3nUR                  UU5      nM     U$ / nU H  nUR!                  U" U5      SU45        M     U H  n	UR!                  U" U	5      SU	45        M     UR#                  S SS9  [        U5       H  u  nn[        U5      nUT R                  ;   d  M#  T R                  U   nUR                  nT R                  U5      nU" U5      m%ST R	                  U5       S3n[%        U%4S jU 5       5      n[%        U%4S jUR'                  5        5       5      nUU-   nUS:  a  SU-  n U SU  S3nUR                  UU5      nM     U
 H  nUU   m$[%        U$4S jU 5       5      n![%        U$4S jU 5       5      nU!S:  a  US:  a  SU!-  n"SU-  n#U SU" SU# S3nO*U!S:  a  SU!-  n"U SU" S3nOUS:  a  SU-  n#U SU# S3nOUnUR                  UU5      nM     U$ s  snf s  sn	f s  snf )a9  
Handle indexing with both indirect variables and iteration variables.

For example, x[indices, :] generates index = i0 + stride * tmp0
where tmp0 is loaded from indices and i0 is the iteration variable.

We need to convert this to JAX advanced indexing with proper broadcasting.
When there are multiple iteration variables, they need different shapes
to form an outer product (grid) rather than broadcasting together.

Special case: For gather operations where a single iteration variable
and single indirect variable have the same extent, they should be
element-wise aligned, not broadcast into an outer product.

PyTorch advanced indexing semantics: When multiple indirect indices have
the same shape, they are paired element-wise (not outer product), and
the combined result dimension appears at the FRONT of the output.
r   c                8   > TR                  TU [        S5      S9$ )Ninf)default)r  r   )r   rb  r$   s    r)   _coeff3PallasKernel._handle_mixed_indexing.<locals>._coeff6  s    ..uc5<.PPr,   Tkeyreverser   jnp.arange(r!   Fz, 1z.reshape(-11r    z
).reshape(rm  indirectc                    U S   $ )Nr   r;   ri   s    r)   r  5PallasKernel._handle_mixed_indexing.<locals>.<lambda>  s    !A$r,   c              3  6   >#    U  H  oT:  d  M
  S v   M     g7fr!  r;   r  r&   	var_coeffs     r)   r  6PallasKernel._handle_mixed_indexing.<locals>.<genexpr>  s     %NAIaa   		c              3  6   >#    U  H  oT:  d  M
  S v   M     g7f)r
   Nr;   r  s     r)   r  r    s      *7!y=AA7r  r  z[:r@  c              3  6   >#    U  H  oT:  d  M
  S v   M     g7fr!  r;   r  r&   indirect_coeffs     r)   r  r    s     I{!.6HAA{r  c              3  6   >#    U  H  oT:  d  M
  S v   M     g7fr!  r;   r  s     r)   r  r    s     J1>7IQQr  r  rA  r]  z...]z[...)r   r   r   rk  r   sortedr   r  r>   r  r  r  replacer{  rU  r  r  sortr'  values)&r$   rb  used_iter_vars_setr  r   r   iter_coeffsr}  indirect_var_symssymindirect_varsr  indirect_coeffsr  is_reduction_varrange_entry
range_sizerenamed_sizearange_exprpaired_indirectunused_iter_varsn_output_dimsr  trailing_onesreshape_exprr  shape_parts	shape_strall_componentsn_trailing_itern_trailing_indirectr  trailing_dimsr  leading_nonesr  r  r  s&   ``                                  @@r)   r  #PallasKernel._handle_mixed_indexing  s8   & "55e< 	""#56!"a'::e$$
	Q   2M.<=nsvc{n= JJt33E:;	 33E:->?->cS->? 7HH6G3q66!9,6GH ~!#M(:a(? #C3xHt,,,X1F1Fs1K1X1X   $///"&"7"7"<K!,!3!3J#'#7#7
#CL$/

<0H/I"KK ) 1 1(K HI    }!#2247II ""23q8O N 33M !. %N(; ;".{=/K%--lLI	 !. $N33s8$///"&"7"7"<K!,!3!3J#'#7#7
#CL $'%-"7K)-L)AKA& $		+ 6I%djj&>%?z)TUV   !* 1 1(K HI# 4& 
 !C!!6#;"<= "$C!!6#;
C"@A %=  /FAs3xHd+++"33C8(//
#33J?"3K	 +DJJ|,D+EQG #&%N%N"N&) *.557* '# -/BB
>$,z$9M%0MM?!"DK%--hD	5 0: *L,\:N I{IIIJJJJ 1}a (9 4!)J!6".qs>BRRSTQ (9 4".qtDa!)J!6".tN3C1E+!)),EI+ *. } >
 @ Is   &R..R3	R8c           	        Ub  US:w  a  [        SU S35      eU R                  R                  U5      nU R                  R	                  U5        [
        R                  R                  U5      nUS L=(       a    [        UR                  5       5      S:H  nU(       a  SU S3U SU SU S	3/nOU R                  X!5      n	U	b?  U R                  R                  U R                  U5      5        U R                  XSXU5      /nO;U R                  U5      u  pU R!                  X*U5      u  pU R#                  XQX#XU5      nU H:  nU R$                  R'                  U5        U R(                  R+                  X\45        M<     g )
Nr  zpallas store mode 'z' not supportedr   r  r!   r  z2.shape, _val) if _val.ndim == 0 else _val.reshape(r  )rc   r#   outputstore_buffer_namesr  r   rJ  r  rk  r   _detect_scatter_patternr   r   r   r  r  rn  r  stores	writelinerP  r  )r$   r  rb  r  r  r  rD  	is_scalarstore_linesr  r}  r  lines                r)   storePallasKernel.store  s   
  4 3D6IJJiit$##D) gg  &tO@CLLN(;q(@	%eWA.%(-_`c_ddklK  77DL'##**4+C+CE+JK223|SWX
 ,0+?+?+F(	 ,0+E+Em,(	
 #44uYt  DKK!!$'""))3+6  r,   c                    U R                  U5      nUS:X  a  [        R                  " X5      n [        U5      $ ! [        [
        4 a    Us $ f = f)z=Get integer coefficient of a variable in an index expression.r   )coeffr^  diffrp  rq  rr  )rb  r   r  r  s       r)   r  #PallasKernel._get_index_coefficient
	  sO    
 C A:JJu*E	u::& 	N	s   
: AAc                   U R                  U5      n[        U5      S:w  a  gUS   n[        U5      n[        U R	                  X5      5      nUS:X  a  gU R                  U5      (       d  U R                  X%U5      $ U R                  XU5      $ )zDDetect scatter operation pattern. Returns scatter info dict or None.r   Nr   )r  rk  r>   rp  r  r  _detect_point_scatter_detect_iter_scatter)r$   rb  output_nameindirect_symsindirect_symr  r  s          r)   r  $PallasKernel._detect_scatter_pattern	  s     //6}"$Q'<(!$"="=e"RSQ ''..--kXX ((nMMr,   c                   U(       d  g [         R                  R                  U5      nUR                  5        Vs/ s H  n[	        U5      PM     nn[        U5      S:  a  gSn[        U5      S-
  n[        [        U5      S-
  SS5       H  n	X7:X  a  U	n  OXvU	   -  nM     UU/ / SUS.$ s  snf ! [
         a     gf = f)z&Detect single-element scatter pattern.Nr
   r   r  Tr  r  r  r  r  r  )r   rJ  r  r   rp  	Exceptionrk  r  )
r$   r  r  r  rD  r  r  
cumulativer  r  s
             r)   r  "PallasKernel._detect_point_scatter,	  s     	''$$[1C,/LLN;NqCFNL; |q  
<(1,\*Q.B7C+"s++J	 8 )( $(
 	
! < 		s"   1B0 B+B0 +B0 0
B=<B=c                  ^ U R                  U5      n/ nU H  n[        U R                  X5      5      nUS:  d  M%  X`R                  ;   d  M6  U R	                  U R                  U   R
                  5      nUc    gUR                  [        U5      Xx45        M     UR                  TUS45        UR                  S SS9  [        U4S j[        U5       5       S5      n	U	c  gSn
[        XYS-   S 5       H  u  pnXz:w  a    gX-  n
M     X::w  a  gTU	USU	  VVVs/ s H	  u  poU4PM     snnnXYS-   S  VVVs/ s H	  u  poU4PM     snnnS	SS
.$ s  snnnf s  snnnf )z0Detect scatter pattern with iteration variables.r   Nr  c                    U S   $ )Nr   r;   ri   s    r)   r  3PallasKernel._detect_iter_scatter.<locals>.<lambda>^	  s    AaDr,   Tr  c              3  D   >#    U  H  u  nu  n  o2T:X  d  M  Uv   M     g 7fr`   r;   )r  r  r  r  r  s       r)   r  4PallasKernel._detect_iter_scatter.<locals>.<genexpr>b	  s$     R&9?1ltQ\=QQQ&9s    	 r   Fr  )r   rp  r  r  r  r  r  r>   r  rl  rU  r  )r$   rb  r  r  r   all_varsr   r  r  indirect_posexpectedr  r  r  s     `           r)   r   !PallasKernel._detect_iter_scatterM	  s    11%8 02!C33E?@EqyS$9$99(=(=c(B(I(IJ>S5 9: " 	~r:;.$7 Ri&9R
   (2B2D)E FAf H !G % )(2:=L2IJ2IwqQF2IJ19:J:L1MN1MgaAq61MN % 
 	
 KNs   E9Ec           	     P  ^  T R                   (       d   eUS:X  a  T R                  X5      $ [        U[        5      (       a  [	        S5      eX#U4nUT R
                  R                  ;   a  T R
                  R                  U   $ SSSSSSS	S
.n[        / SQ5      n[        U 4S jU 5       5      nT R                  U5      n	T R                  5       n
[        S T R                  R                  5        5       5      nUS:X  a(  U(       a  U	(       a  U
(       a  SU SU	 S3nGO<SU S3nGO4US;   Ga  Xc   nU=(       a    U	=(       a    U	S:  =(       a    U
nU(       GaW  US:  GaP  SnT R                  (       Ga0  [        [!        T R                  R#                  5       5      5      nT R                  R                  5        VVs/ s H  u  nnUR$                  (       d  M  UPM     nnnU(       a  US   nUR'                  U5      nUS:w  a  T R)                  U5      OSnUc  SnT R                  R                  5        VVs/ s H  u  nnUR$                  (       a  M  UPM     nnnU(       a>  US   nUR'                  U5      nUS:w  a  T R)                  U5      OSnUc  SnUU:  a  SOSnU SU SU S3nOU SU S3nOX6;   as  U=(       a    U	SL=(       a    U	S:  =(       a    U
nU=(       a    US:  =(       a    U	SL nU(       a  Xc   nSU SU SU	 SU
 S3	nOFU(       a  Xc    SU S3nO4Xc    SU S3nO)[	        SU S[+        UR-                  5       5       S35      eT R
                  R/                  T R0                  UUS 9nUT R
                  R                  U'   U$ s  snnf s  snnf )!a  
Generate code for reduction operations in JAX/Pallas.

Reductions in Pallas work by:
1. Loading the input data into the kernel
2. Applying JAX reduction operations (jnp.sum, jnp.max, etc.)
3. Storing the reduced result

The reduction happens over the loaded block of data.
welford_reducezHTuple reductions (e.g., welford_combine) not supported in Pallas backendzjnp.sumzjnp.prodzjnp.maxzjnp.minzjnp.anyz
jnp.argmaxz
jnp.argmin)r'  prodmaxminr  argmaxargmin)rj   r  zc              3  @   >#    U  H  oTR                   ;   v   M     g 7fr`   )r  )r  r'   r$   s     r)   r  )PallasKernel.reduction.<locals>.<genexpr>	  s     I6H,6Hs   c              3  P   #    U  H  u  pUR                   (       d  M  S v   M     g7fr!  r  )r  r   r  s      r)   r  r  	  s       
 =*#ASASAA =s   &	&xor_sumzjnp.bitwise_xor.reduce(rS  z, -1), axis=-1)r!   )r  r  r   r   r  Nr  z, axis=zpallas_partial_reduce(r    z	, axis=0)zReduction type 'z8' not yet supported in Pallas backend. Supported types: z	, xor_sumr  )inside_reductionwelford_reduce_fallbackr   r  rc   r   reduction_cacher   r  r  r  r'  r  r  rQ  rl  rm  r  r  r  r  r  r  r   r   )r$   r   r   reduction_typer  	cache_keyreduction_opspointwise_prefixeshas_pointwisepointwise_numelr  n_reduction_dimsreduction_exprreduction_opis_partial_reductionreduction_axisrj  r   r  reduction_varsr_varr_coeffr_stridepw_varspw_varpw_coeff	pw_strideis_symbolic_partialr2   s   `                            r)   	reductionPallasKernel.reductionz	  s/   " $$$$ --//==eU##Z 
 6	00088++I66 ""
 (8I6HII *.)C)CDV)W)-)F)F)H  
 $ 5 5 ; ; =
 
 Y&_#:5'?J[[j!k#:5'!C33 )8L  $#$#a'$ $	 ! $(81(< "$(((!%d4+@+@+G+G+I&J!KJ +/*?*?*E*E*G&*GJC -- *G # &
 & .q 1","2"25"9>El4>>'#:PQ#+'(H /3.C.C.I.I.K#.K
U#(#5#5  .K   #
 #%,QZF'1'7'7'?HDLPQMx(@WXI(0,-	 3;Y2FQBN$0>5'@PPQ!R %1>5'!;,  $#4/$#a'$ $	 ! R"2Q"6R?d;R   $  -<#9,r%PRSbRccefuevvw!x$$1$A#B!E7)!T %2$A#B!E7!!L">"2 3$$(););)=$>#?yJ  ""LL # 
 /5  +U&#s   ?NN<N"N"c                    [         R                  R                  U 5      nUR                  5       nUR	                  5       $ r`   )r   rJ  r  r  r  )buffer_namerD  r  s      r)   _buffer_is_contiguous"PallasKernel._buffer_is_contiguous
  s1    gg  -!##%%r,   c                `   [        5       nU R                  R                  5       u  p4  nU Vs/ s H  ofR                  PM     nnU Vs/ s H  oR	                  S5      (       d  M  UPM     n	nU Vs/ s H  oR	                  S5      (       d  M  UPM     n
n[        U R                  R                  R                  5       5      nU Vs/ s H  oU;   d  M
  UPM     nnU
(       d  [        S5      eU R                  R                  R                  5        VVs0 s H  u  p[        U[        5      (       d  M  X_M      nnnU=(       d    Sn[        R                  R                  5       R                   S:H  n[        R                  R                  5       R                   S:H  nU(       a  SOSn0 nU	 H  nS	UU'   M
     U	 Vs/ s H  nUU   (       d  M  U S
3PM     nnU Vs/ s H  oR	                  S5      (       d  M  UPM     nnUU-   nUU-   n[        UR                  5        VVs/ s H  u  nnU(       a  M  UPM     snn5      nU(       d  U(       a  [#        [%        ['        U
5      5      5      nO([)        U
5       VVs/ s H  u  nnUU;   d  M  UPM     nnn[+        SA0 SU_SU_SU_SU_SU_SU_SU	_SU
_SU_SU_SU_SU_SU_SU_SU_SU_SU_6nUU l        U R/                  U5        [        5       nUR1                  5          U R3                  UU5        U R4                  R6                   H  n UR9                  [        U 5      5        M     SSS5        U R                  R                  5       u  p4  nU Vs/ s H  ofR                  PM     nn[        U R                  R                  R                  5       5      nU Vs/ s H  oU;   d  M
  UPM     snUl        U Vs/ s H  oR	                  S5      (       d  M  UPM     snUl        UUR<                  -   Ul        UU-   Ul         SU SS RC                  UR@                  5       S!3n!UR9                  U!5        UR1                  5          UR6                   HT  n [        U [        5      (       a!  UR9                  U RE                  5       5        M9  UR6                  RG                  U 5        MV     U RH                   H)  u  n"n#U"UR@                  ;   d  M  UR9                  U#5        M+     SSS5        UR9                  S"5        U S#3n$/ n%S$['        UR:                  5      -   n&[)        UR>                  5       H8  u  nnUU;   d  UR	                  S%5      (       d  M$  U%RG                  UU&-   5        M:     U%(       a  S&S RC                  S' U% 5       5      -   S(-   n'OS)n'[#        [%        S$['        UR:                  5      -   5      5      n(S&S RC                  S* U( 5       5      -   S(-   n)UR9                  S+U) S,U' S-35        S.S//UR:                  -   UR>                  -   n*UR9                  SU$ S&S RC                  U*5       S!35        UR1                  5          UR9                  S05        UR9                  S15        UR9                  S25        UR9                  S-5        UR9                  S35        UR9                  S45        UR9                  S55        UR9                  S25        UR9                  S-5        UR9                  S65        UR9                  S75        UR9                  S8S RC                  UR>                  5      -   S9-   5        UR9                  S-5        / n+[)        URJ                  5       H  u  n,nUR	                  S5      (       aN  URM                  US:5      (       a5  U S
3n-UR>                  RO                  U-5      n.U+RG                  U.U,45        Mh  Mj  UR>                  RO                  U5      n.U+RG                  U.U,45        M     S RC                  S; U+ 5       5      n// n0UR:                   H  n1U0RG                  U1 S<U1 35        M     U0(       a  S=U S>S RC                  U05       S?3n2OU S@3n2U RP                  =(       a#    U RR                  =(       a    U RU                  5       n3U3(       a  U RW                  UU25        O8U RP                  (       a  U RY                  UU25        OU R[                  UU2U+U/5        SSS5        U R]                  UU$5        UR_                  5       $ s  snf s  snf s  snf s  snf s  snnf s  snf s  snf s  snnf s  snnf ! , (       d  f       GN"= fs  snf s  snf s  snf ! , (       d  f       GN= f! , (       d  f       N= f)Ba}  
Generate the complete Pallas kernel code as a Python string.

This includes:
- Import statements for JAX/Pallas
- The kernel function that operates on refs
- The main wrapper function that handles PyTorch<->JAX conversions via DLPack

Args:
    name: Optional kernel name (will use placeholder if not provided)

Returns:
    str: Complete Python source code for the Pallas kernel
out_ptr)r=  
in_out_ptrz2Pallas backend requires at least one output buffer<KERNEL_NAME>r#  cpur   r  Tr  )r>  in_ptrr2  r3  r4  r5  r6  r8  r9  r:  r;  r<  r=  r>  r?  r@  rA  rB  rC  Ndef z_kernel(r    ):r  _jit_wrapperr
   r>  r  c              3  8   #    U  H  n[        U5      v   M     g 7fr`   r>   r  rj   s     r)   r  .PallasKernel.codegen_kernel.<locals>.<genexpr>
  s     ,L^SVV^   ,)z()c              3  8   #    U  H  n[        U5      v   M     g 7fr`   rF  rG  s     r)   r  rH  
  s     0PAQrI  z+@functools.partial(jax.jit, static_argnums=z, donate_argnums=r!   
out_shapes
out_dtypeszout_shapes_pallas = tuple(z&    jax.ShapeDtypeStruct(shape, dtype)z3    for shape, dtype in zip(out_shapes, out_dtypes)z%indexer = lambda n: lambda i: [i] * nzout_specs_pallas = tuple(z,    pl.BlockSpec(shape, indexer(len(shape)))zin_specs_pallas = tuple(z0    pl.BlockSpec(i.shape, indexer(len(i.shape)))z    for i in [r@  Fc              3  4   #    U  H  u  pU S U 3v   M     g7f)z: Nr;   )r  r  os      r)   r  rH  
  s     )PK&1QCr!+Ks   =zfunctools.partial(z	_kernel, z),z_kernel,r;   )0r   r#   python_argdefsr  r  r   ri  r  RuntimeErrorr  r  r   r>   r   rJ  rK  rL  r  r  rk  rU  r1  aliasable_out_ptrs_codegen_importsindent_codegen_iteration_varsr   _linesr  r;  r?  r@  rA  r  r  r  rP  r:  r6  rb  rM  rN  r  _codegen_jit_wrapper_tma_codegen_jit_wrapper_legacy_gpu_codegen_jit_wrapper_cpu_tpu_codegen_main_entrygetvalue)4r$   r  r2  arg_defs	call_argsr  r   r8  r'   r9  r:  size_var_namesr;  outerinnerr<  r3  r4  r5  r6  r=  paramr>  r?  r@  rA  flagrB  rC  rF  ctxkernel_bodyr  kernel_signaturer=  
store_linejit_wrapper_namedonate_indicesbase_offsetdonate_literalstatic_argnumsstatic_argnums_literalwrapper_paramsalias_pairsout_idx
alias_name	input_idxalias_map_literalpartial_argssv_param
kernel_arguse_tmas4                                                       r)   codegen_kernelPallasKernel.codegen_kernel!
  s	     %)II$<$<$>!Q)12A2&3Om||I7N1mO$
$!5N(OA} 	 
 $DII$6$6$=$=$?@&3KmN7J1mKSTT !%		 8 8 > > @ 
 @%% EL @ 	  
 -o446;;uD77>>@EEN&6FG+-$E%)OE" % +:
*9_U=SugV/ 	 
 %
$!5M(NA} 	 
 +\9)M9&$3$9$9$;H$;jdD4T$;H
 v"&uS-?'@"A "+=!9#!9IC,, !9   #  

#
 
 .	

 0
 (
 ,
 (
 ,
 "6
 ,
 &
 &
 !4
  2
  0!
" !4#
& #2c"
 %&!((c:++%%c$i0 , " %)II$<$<$>!Q)12A2#DII$6$6$=$=$?@*7O-Q;Nq-O$
$!5M(NA}
 #/1A1A"A!-!= ;-x		#2H2H(I'J"M 	 	'([[]#**dC((NN4;;=1KK&&t,	 + (,'='=#c444NN:. (>  	r)],7#c1122"3#:#:;IC$)F)F%%cK&78 <  499,L^,L#LLtSN!NeAC,?,?(@$@AB!$tyy0P0P'P!PSW!W''=&>>Oa!	
 <(3+>+>>AXAXX 	 	./q>1J0K2NO[[]NN78NNCDNNPQNN3NNBCNN67NNIJNNPQNN3NN56NNMNNN+dii8O8O.PPSVVWNN313K!*3+<+<!=??9--&**477(,vV_
$'$;$;$A$A*$M	#**Iw+?@ 8
 !$ 7 7 = =d CI&&	7';< "> !%		)PK)P P L//##xj($<= 0 1+i		R^H_G``bc
 +}H5
 W 6 6W4;U;U;W  --c:>44S*E11[2C[ b 	  &67}}u 3O

 L 


 I#B "! 3O
 ]D ]s   d<e%e1ee	eee4e=e	ee;e%e 
7e 
?e&e&A
e,;e>	ff"f?f.Bf7f2J f,
e;
f
f-c                    SnUR                   (       a  US-  nUS-  nOUR                  (       d  US-  nUR                  R                  USS9  g )Na=  
import functools
import math
import torch
import jax
import jax.numpy as jnp
from jax.experimental import pallas as pl
from torch._inductor.runtime.runtime_utils import (
    pallas_gpu_align_output_specs, pallas_gpu_pad_inputs,
    pallas_gpu_unpad_results, pallas_partial_reduce,
    torch_dtype_to_jax_runtime,
)
z
import jax.exportz8
from torch_tpu._internal.pallas import tpu_torch_pallasz8
from jax.experimental.pallas import mosaic_gpu as plgpuTstrip)r4  r5  r2  splice)r$   rd  importss      r)   rT  PallasKernel._codegen_imports
  sP     ::,,GRRG%%RRGt,r,   c           
       ^^ U R                   (       a"  U R                  (       d  U R                  (       d  g UR                  S5        [	        U R                   R                  5        Vs/ s HI  n[        UR                  [        [        R                  45      (       d  M4  [        UR                  5      PMK     sn5      mU4S jn/ nUR                  (       a@  UR                  R                  UR                  S   5      nU(       a  UR                  U5        UR                  U R                   R"                  5        Su  pxU H  nU" U5      n	U	S   (       d  M  U	u  px  O   [%        U R                   R'                  5       5      n
/ nS n[)        U
5       HC  u  mu  pU R+                  UR                  5      nUb	  X:X  a  TnM/  UR                  TXU45        ME     [-        U5      n[)        U
5       GH  u  mu  pXR                  ;  a  M  [/        U5      nUR                  nU R1                  U5      nU R3                  U5      nU R+                  U5      nUc  U(       a  US:  ay  TU:w  as  [5        U4S j[)        U5       5       S 5      nUbQ  U R7                  UUU5      nS/U-  nUUU'   SR9                  U5      nS	U S
3nUR                  U SU SU S
35        M  UR                  U SU S
35        GM  U(       aO  [-        U5      S:  a@  X:X  a;  SR9                  S U 5       5      nS	U S
3nUR                  U SU SU S
35        GM]  US:  av  TU:w  ap  [5        U4S j[)        U5       5       5      nU R7                  UUU5      nS/U-  nUUU'   SR9                  U5      nS	U S
3nUR                  U SU SU S
35        GM  UR                  U SU S
35        GM     g s  snf )Nz*# Define iteration variables as JAX arraysc                  > [         R                  R                  U 5      nUb  [        UR	                  5       5      S::  a  g[        S UR	                  5        5       5      n[        R                  " U5      nUT;   a  X#4$ S$ )Nr   NNc              3     #    U  H7  n[        U[        [        R                  45      (       a  [        U5      OUv   M9     g 7fr`   )r   rp  r^  Integerr  s     r)   r  YPallasKernel._codegen_iteration_vars.<locals>._get_nd_shape_if_matches.<locals>.<genexpr>  s5      'A %Qemm(<==A1D's   ?A)r   rJ  try_get_bufferrk  r   r  r   r  )r  rD  shaper  iter_lengthss       r)   _get_nd_shape_if_matchesFPallasKernel._codegen_iteration_vars.<locals>._get_nd_shape_if_matches  sr    ''((2C{c#,,.1Q6!  E IIe$E%*l%:E>LLr,   r   r  r   c              3  H   >#    U  H  u  nu  n    nUT:X  d  M  Uv   M     g 7fr`   r;   r  r  vidxr  rF  s       r)   r  7PallasKernel._codegen_iteration_vars.<locals>.<genexpr>O  s.      6O 2?D!Q#s{ A6Os   "	"r  r    r  r!   z = rS  z = jnp.arange(c              3  8   #    U  H  n[        U5      v   M     g 7fr`   rF  r  s     r)   r  r  j  s     %K6Jc!ff6JrI  c              3  F   >#    U  H  u  nu  n    o2T:X  d  M  Uv   M     g 7fr`   r;   r  s       r)   r  r  n  s)      %0I,!_dAq!UX[AA0Is   !	!)r  rM  r   r  r   r  r   r  rp  r^  r  r:  r<  r6  r  r  r#   r  r  r  rU  r  rk  r>   r   r   rl  _broadcast_axis_idxr  )r$   re  rd  r\  r  candidate_buf_namesr  reshape_target_shapereshape_target_numelr2   r  r  total_var_idxr  r  r  num_broadcast_dimsr  r  renamed_length
length_strbroadcast_idxaxis_idxr  r  arangerF  r  s                             @@r)   rV  $PallasKernel._codegen_iteration_vars  s   
 %%dkkd>Q>QJK " ..5577Aahhemm(<= AHH7
		M !//33C4E4Ea4HIH#**84""499#:#:;5?2+H-h7Fayy=C:$	 , ..4467	%.y%9!C!'5J%**L #%%sGJ&GH &: !0%.y%9!C!'1117|H\\F!11&9NN3J/J!(*Q.},$(6?6O
 %M %0#'#;#;*M;M$ (+e.@&@0:H-$(IIk$:	#.zl!!<#--'jF89YKqI !%%
.A&NO %,-16 II%K6J%KK	&zl!4%%
#fXYykQR&ST#a'C=,@ $ %09.0I% !  33"M3E  #e&88(2H% IIk2	&zl!4%%
#fXYykQR&ST%%
.A&NOw &:[s   )3O( O(c                    [        S U  5       5      n[        S U  5       5      nU=(       a    UnU(       a  U$ US-
  U-
  $ )Nc              3  ^   #    U  H#  u  p  n[        U5      R                  S 5      v   M%     g7frNr>   r  r  r  r[  s      r)   r  3PallasKernel._broadcast_axis_idx.<locals>.<genexpr>  s+      !
5CzqQCFc""^s   +-c              3  h   #    U  H(  u  p  n[        U5      R                  S 5      (       + v   M*     g7fr  r  r  s      r)   r  r    s.      !
9G:1AA!!#&&&s   02r   )r  )r  r  r  has_reduction_varshas_pointwise_varsis_mixeds         r)   r   PallasKernel._broadcast_axis_idx|  s\     ! !
5C!
 
 ! !
9G!
 
 &<*<  !A%55r,   c                   UR                   nUR                  nUR                  nUR                  S5        UR                  S5        UR                  S5        UR                  S5        UR                  S5        U H  nUR                  SU S35        M     UR                  S5        UR                  S	5        UR                  S
5        U Vs/ s H  ow S3PM	     nnU Vs/ s H  ow S3PM	     n	nU Vs/ s H  ow S3PM	     n
nU Vs/ s H  ow S3PM	     nnUR                  S5        UR                  S5        X-   nX-   n[	        [        U5      5       Vs/ s H  nSU 3PM
     nnSR                  X-   5      nUR                  SSR                  U5       SU S35        UR                  5          UR                  S5        UR                  S5        UR                  5          UR                  S5        UR                  S5        UR                  S5        [        [        X5      5       H$  u  nu  nnUR                  SU SU SU S35        M&     UR                  S5        UR                  S5        [        U5       H  u  nnUR                  SU S35        M     UR                  S5        UR                  S5        X-   nUR                  S5      R                  5       nUR                  U SSR                  U5       S35        UR                  S5        UR                  S 5        UR                  S!5        [        X5       H  u  nnUR                  S"U SU S#35        M      UR                  S$5        UR                  S5        UR                  S%5        S S S 5        UR                  S5        UR                  S&5        UR                  S'5        S S S 5        UR                  S5        UR                  S(5        UR                  S)5        [        U
5       H"  u  nnXN   nUR                  S*U S+U S,35        M$     [        U5       H  u  nnUR                  S*U S-U S.35        M      U H  nUR                  S*U S/35        M     UR                  S5        UR                  S05        UR                  S15        UR                  S5        UR                  S25        UR                  S35        UR                  5          UR                  S45        UR                  S55        UR                  S65        S S S 5        UR                  S75        U H  nUR                  S8U S935        M     UR                  S5        UR                  S5        UR                  S:5        UR                  S;5        g s  snf s  snf s  snf s  snf s  snf ! , (       d  f       GN8= f! , (       d  f       GN= f! , (       d  f       N= f)<Nz6# Use lax.fori_loop with TMA for automatic OOB maskingzfrom jax import laxz"_tile_size = 128  # Warpgroup sizez_orig_out_shapes = out_shapesz_max_numel = 0z_max_numel = max(_max_numel, .size)for shape in out_shapes:z2    _max_numel = max(_max_numel, math.prod(shape))z8_num_tiles = (_max_numel + _tile_size - 1) // _tile_size_gmem_smemr  z4# Wrapper kernel using lax.fori_loop with direct TMA	_barrier_r    zdef _tma_kernel(z, *, rC  zdef _tile_body(_tile_idx, _):z$_tile_start = _tile_idx * _tile_sizez5# TMA load inputs from GMEM to SMEM (OOB auto-masked)zplgpu.copy_gmem_to_smem(z%.at[pl.ds(_tile_start, _tile_size)], z, _barrier_r!   z # Wait for TMA loads to completezplgpu.barrier_wait(_barrier_z# Compute on SMEM tiles,r  z7# TMA store outputs from SMEM to GMEM (OOB auto-masked)zplgpu.commit_smem()zplgpu.copy_smem_to_gmem(z$.at[pl.ds(_tile_start, _tile_size)])zplgpu.wait_smem_to_gmem(0)zreturn Nonez# Iterate over all tilesz.lax.fori_loop(0, _num_tiles, _tile_body, None)zA# Build SMEM scratch shapes for inputs, outputs, and TMA barriersz_scratch_shapes = {}z_scratch_shapes['z'] = plgpu.SMEM((_tile_size,), rR  z*'] = plgpu.SMEM((_tile_size,), out_dtypes[])z"'] = plgpu.Barrier(num_arrivals=1)z4# Create flattened output specs aligned to tile sizezV_flat_out_specs, _ = pallas_gpu_align_output_specs(out_shapes, out_dtypes, _tile_size)z## Call plgpu.kernel with TMA kernelz_result = plgpu.kernel(z_tma_kernel,zout_shape=_flat_out_specs,zscratch_shapes=_scratch_shapes,)(    z.flatten(),z$# Reshape results to original shapesz:return pallas_gpu_unpad_results(_result, _orig_out_shapes))r2  r@  r:  r  r  rk  r  rU  rU  ziprstripr|  )r$   rd  rv  r2  r@  r:  rb  r'   gmem_input_paramsgmem_output_paramssmem_input_paramssmem_output_paramswrapper_kernel_paramsall_smem_paramsr  barrier_paramsscratch_paramsgmem_insmem_inr  kernel_call_argsrU   gmem_outsmem_out
smem_param
orig_parambarrier_params                              r)   rX  %PallasKernel._codegen_jit_wrapper_tma  s~   xx!55)) 	OP,-;<67'((ENN:5'HI )12KLQR2EF2EQs%[2EF3@A=a5k=A2EF2EQs%[2EF3@A=a5k=ArMN 1 F+@38=P9Q3RS3RaIaS/3RS?#CDtyy)>?@nEUUWX	
 [[]NN2NN:;EFr"VW-6)=.)A) NN27);`ah`iituvtwwxy. r"AB%&78DAqNN%A!A#FG 9 r"89#4#I &--c288:	)Adii8H.I-J!LMr"M 45*-.@*U&HhNN28*BxjHlm +V ;<r"}-E H NN2NN56NNKLS X 	rO	
 	-.&'89MAz,/JNN#J</NzlZab :
 ''9:MAzNN#J</YZ[Y\\^_ ; ,MNN#M?2TU ,
 	rMNd	
 	r<=01[[]NN>*NN78NN<=  	t(ENNT%45 )sr=>STG GAFA T  ]P ]sO   V;#W 7WW
W63W&)F:W#;W&4W8
W#	W&&
W58
Xc                   UR                   nUR                  nSSR                  U5       S3nUR                  S5        UR                  S5        U H  nUR                  SU S35        M     UR                  S5        UR                  S	5        UR                  S
5        UR                  S5        UR                  S5        UR                  S5        UR                  S5        UR                  SU S35        UR                  S5        UR                  S5        UR                  SU-   5        UR                  S5        UR                  S5        UR                  S5        UR                  S5        UR                  S5        UR                  S5        UR                  S5        UR                  S5        UR                  S5        UR                  SU S35        UR                  S5        UR                  S5        UR                  S 5        UR                  S!5        UR                  S"5        UR                  S#U-   5        UR                  S$5        UR                  S%5        UR                  S&5        UR                  S'5        UR                  S(5        UR                  S)5        UR                  S*5        UR                  S+U 35        UR                  S,5        UR                  S-5        UR                  S.5        UR                  S"5        UR                  S#U-   5        UR                  S$5        UR                  S%5        UR                  S/5        g )0NrA  r    r@  z7# Check if all tensors have same size (no broadcasting)z_all_sizes = []z_all_sizes.append(r  r  z'    _all_sizes.append(math.prod(shape))z_unique_sizes = set(_all_sizes)zH_can_pad = len(_unique_sizes) == 1 and all(s > 1 for s in _unique_sizes)r  zif _can_pad:z5    # All tensors same size - safe to flatten and padz+    _padded_inputs = pallas_gpu_pad_inputs(r!   zZ    _aligned_out_specs, _is_scalar = pallas_gpu_align_output_specs(out_shapes, out_dtypes)z    _result = plgpu.kernel(z        z%        out_shape=_aligned_out_specs,z    )(*_padded_inputs)zD    return pallas_gpu_unpad_results(_result, out_shapes, _is_scalar)zelse:zA    # Different sizes - check if it's a reduction (scalar output)z)    _out_numel = math.prod(out_shapes[0])r  z    if _out_numel <= 1:zG        # Scalar output (reduction) - pad inputs but keep scalar outputz/        _padded_inputs = pallas_gpu_pad_inputs(z#        _aligned_out_specs = tuple(z.            jax.ShapeDtypeStruct(shape, dtype)z;            for shape, dtype in zip(out_shapes, out_dtypes)z	        )z        _result = plgpu.kernel(z            z)            out_shape=_aligned_out_specs,z        )(*_padded_inputs)z        return _resultz	    else:zP        # Non-scalar output with broadcasting - broadcast inputs to output shapez%        _target_shape = out_shapes[0]z        _broadcasted = [z>            jnp.broadcast_to(_inp, _target_shape) for _inp in z	        ]z<        _padded_inputs = pallas_gpu_pad_inputs(_broadcasted)z^        _aligned_out_specs, _is_scalar = pallas_gpu_align_output_specs(out_shapes, out_dtypes)zH        return pallas_gpu_unpad_results(_result, out_shapes, _is_scalar))r2  r@  r  r  )r$   rd  rv  r2  r@  
input_listrb  s          r)   rY  ,PallasKernel._codegen_jit_wrapper_legacy_gpu  s    xx!55#678:

 	PQ()(ENN/wf=> )12@A89V	
 	r~&NODZLPQRSh	
 	45zJ./>?/0R	
 	wO	
 	BCv01U	
 	HTUVW<=GHTU{#89~
23BC34/0{#^	
 	>?12LZLY	
 	{#UVl	
 	89~
23BC34V	
r,   c                *   UR                   nUR                  S5        UR                  SU-   5        UR                  S5        UR                  S5        UR                  S5        UR                  SUR                   S35        UR                  S5        UR                  U(       a  S	U S
3OS5        UR                  S5        UR                  (       a.  UR                  SSR	                  UR                  5       S35        UR                  S5        g )Nzreturn pl.pallas_call(r  z     out_shape=out_shapes_pallas,z    out_specs=out_specs_pallas,z    in_specs=in_specs_pallas,z    interpret=r  z    grid=(1,),z    input_output_aliases={ z },z    input_output_aliases={},r  r    r!   )r2  r  r6  r@  r  )r$   rd  rv  ro  rs  r2  s         r)   rZ  )PallasKernel._codegen_jit_wrapper_cpu_tpuV  s     xx/0v
*+9:8967(=(='>a@A'( ++<*=TB/	

 	t""NNT$))C,C,C"D!EQGHsr,   c                l    UR                   (       a  U R                  X5        g U R                  X5        g r`   )r4  _codegen_main_entry_tpu_codegen_main_entry_default)r$   rd  rh  s      r)   r[   PallasKernel._codegen_main_entryo  s%    ::((?,,SCr,   c                	   UR                   nUR                  S5        UR                   S3nUR                  nUR                  SU SSR                  UR                  5       S35        UR                  5          UR                  S5        UR                  S5        UR                  S	5        / nUR                   H1  nUR                  U S
U SU S35        UR                  U S35        M3     UR                   H1  nUR                  U S
U SU S35        UR                  U S35        M3     UR                  SSR                  UR                   V	s/ s H	  n	SU	 S3PM     sn	5      -   S-   5        / n
UR                   Ht  n	UR                  R                  U	5      nUb>  [        R                  R                  U5      nUb  U
R                  [        U5      5        M_  U
R                  SU	 S35        Mv     UR                  SSR                  U
5      -   S-   5        SS/nUR!                  UR"                  5        UR!                  U5        UR                  SU SSR                  U5       S35        UR                  SU SUR                  S    S35        UR                  SU S35        UR                  5          UR                  SU S 35        S S S 5        [%        UR                  5      [%        UR                  5      -   nUR                  S!SR                  U5       S"35        UR                  S#5        UR                  5          UR                   Hv  n	UR                  R                  U	5      nUb=  [        R                  R                  U5      nUb  UR                  S$U	 S%U< S&35        M^  UR                  S$U	 S%U	 S'35        Mx     S S S 5        UR                  S"5        UR                  S(U S)35        UR&                   H)  nUR                  U   nUR                  U S*U S+35        M+     S S S 5        g s  sn	f ! , (       d  f       GN{= f! , (       d  f       N= f! , (       d  f       g = f),Nr  _mainrB  r  r    , stream=None):)jax.config.update('jax_enable_x64', True)jax.clear_caches()z+# Build JAX placeholders for export tracingz$_placeholder = jax.ShapeDtypeStruct(z#.shape, torch_dtype_to_jax_runtime(z.dtype))_placeholderout_shapes = (tuple(r  rJ  torch_dtype_to_jax_runtime(rR  out_dtypes = (rL  rM  zexported = jax.export.export(z, platforms=['tpu'])(r!   zkernel_key = 'z_' + '_'.join(str(s) for s in r   z.if not tpu_torch_pallas.lookup_custom_kernel('z', kernel_key):z)tpu_torch_pallas.register_custom_kernel('z/', kernel_key, exported.mlir_module_serialized)zinput_tensors = [r@  zoutput_shape_tensors = [ztorch.empty(z.shape, dtype=z, device='tpu'),z.dtype, device='tpu'),zTresults = tpu_torch_pallas.call_custom_kernel(input_tensors, output_shape_tensors, 'z', kernel_key)z.copy_(results[r  )r2  r  r3  r  rA  rU  r>  r  r?  r:  r<  r6  r   rJ  r  r   r  r;  r  rC  )r$   rd  rh  r2  	main_namekernel_name_strall_jax_input_namesrq  ptrr  dtype_exprsr  r   wrapper_placeholder_argsinput_tensor_namesrF  out_names                    r)   r  $PallasKernel._codegen_main_entry_tpuu  s    xxr'u-	//9+Qtyy)?)?@AQ	
 [[]NNFGNN/0 NNHI"$!..
!l"F!l"Ej\QY[ $**j\+FG / ''e?e>se8M $**cU,+?@ ( NN ))@Q@QR@QvdV73@QRST
 &(K))3377='GG--h7E(#**+=e+DE ""%@g#NO * NN+dii.DDtKL )5l'C$$++C,?,?@$++,?@NN/#$ %II678; NN  1 2,,/,=,=a,@+AJ NN@@QQ`a '((WY  "&c&6&6!7$s?O?O:P!PNN.tyy9K/L.MQOP NN56--D"77;;DAH+ ! 1 1( ; , NN".tfN5)K[ \ %NN&tfN4&@VW .  NN3NN#$N4 ..,,S1(?3%rBC /s ]. S@  G ]sS   :CQ1Q	'EQ1QA8Q1BQ A'Q1	Q1
Q	Q1 
Q.	*Q11
Q?c                  ^ UR                   nUR                  S5        UR                   S3nUR                  SU SSR                  UR                  5       S35        UR                  5          UR                  S5        UR                  S5        UR                  S	5        UR                  (       aJ  UR                  S
5        UR                   H)  nU R                  UUUR                  UR                  S9  M+     UR                  S5        UR                   H6  nUR                  S5      (       d  M  U R                  X6UR                  SS9  M8     UR                  S5        UR                   H6  nUR                  S5      (       d  M  U R                  X6UR                  SS9  M8     UR                  S5        UR                  SSR                  UR                   Vs/ s H	  nSU S3PM     sn5      -   S-   5        / nUR                   Ht  nUR                  R                  U5      n	U	b>  [        R                   R#                  U	5      n
U
b  UR%                  ['        U
5      5        M_  UR%                  SU S35        Mv     UR                  SSR                  U5      -   S-   5        0 mUR                   H  nU S3TU'   M     UR                   H  nU S3TU'   M     SS/nUR)                  UR*                  5        UR)                  U4S jUR,                   5       5        UR                  SU SSR                  U5       S35        UR                  S 5        UR.                  (       aJ  UR                  S!5        UR.                   H)  nUR                  U   nUR                  U S"U S#35        M+     S S S 5        g s  snf ! , (       d  f       g = f)$Nr  r  rB  r  r    r  z/# Enable JAX x64 mode for float64/int64 supportr  r  z*# Convert Torch -> JAX for donated outputs)
contiguousz+# Convert Torch -> JAX for in-place tensorsr>  Fz!# Convert Torch -> JAX for inputsrA  Tz-# Prepare output metadata from PyTorch tensorr  r  r  rJ  r  rR  r  _jaxrL  rM  c              3  .   >#    U  H
  nTU   v   M     g 7fr`   r;   )r  r  arg_name_maps     r)   r  ;PallasKernel._codegen_main_entry_default.<locals>.<genexpr>  s      %/FtT"/Fs   zres = r!   zjax.block_until_ready(res)z9result_values = res if isinstance(res, tuple) else (res,)z'.copy_(torch.from_dlpack(result_values[z])))r2  r  r3  r  rA  rU  r>  _emit_torch_to_jaxr4  r5  r?  r  r:  r<  r6  r   rJ  r  r  r   r  r;  r@  rC  )r$   rd  rh  r2  r  rq  r  r  r  r  r   wrapper_call_argsrF  r  r  s                 @r)   r  (PallasKernel._codegen_main_entry_default  s    xxr'u-	9+Qtyy)?)?@AQ	
 [[]NNLMNNFGNN/0KL"%"2"2J ++"

#&#7#7	 ,  #3 NNHI''>>,//++Dszze+T ( NN>?''>>(++++Dszzd+S ( NNJKNN ))@Q@QR@QvdV73@QRST
 &(K))3377='GG--h7E(#**+=e+DE ""%@g#NO * NN+dii.DDtKL+-L!..
.8\+>Z( /'''*e4LS! ( ".| <$$S%8%89$$ %/2/F/F%  NNV$4#5QtyyAR7S6TTUVWNN78&&O 22C"005HNN#*$KC5PST 3} ]@ SA ]s.   /CO6AOAO"O2GOO
Oc               L    U(       a  SOSnU R                  U SU U S35        g )Nz.detach().contiguous()z	.detach()z_jax = jax.dlpack.from_dlpack(r!   )r  )r2  r  r4  r  suffixs        r)   r  PallasKernel._emit_torch_to_jax)  s-     .8)[(#A(F8STUVr,   c                2   [         R                  R                  nU R                  R	                  5       u  pE  nU Vs/ s H  owR
                  PM     nnU V	s/ s H  oR                  S5      (       d  M  U	PM     n
n	[        [        [        U5      5      n[        U S0 5      nU
 V	s/ s H/  n	UR                  U	S5      (       d  M  XR                  U	5         PM1     nn	U SSR                  X-   5       S3nUR                  U5        gs  snf s  sn	f s  sn	f )z7Generate the Python code that calls this Pallas kernel.r=  rS  Fz.run(r    r!   N)r   rJ  wrapper_coder#   rQ  r  r  r  rV  r>   r  r6  rb  r  r  )r$   r  nodewrapperr]  r^  r  r   kernel_param_namesr'   r9  call_arg_strs	aliasablealias_call_argskernel_calls                  r)   call_kernelPallasKernel.call_kernel0  s    ''&&$(II$<$<$>!Q.67hffh7&8T&8LL<S1&8TSi01D"6;	 %
$}}Q& 7M22156$ 	 
 eDIIo.M$N#OqQ+& 8T
s   D
D5D'DD)	rS  rS  rM  rQ  rR  rP  rN  rO  r   )
r%   r<   rN   r<   rX  r  rY  r  r=   None)rb  r<   r=   r>   )r=   r   )rb  r<   r=   r   )rb  r<   r=   r  )rb  r<   r=   zlist[sympy.Symbol])r=   r  )r  r>   rb  r<   r=   r  )r=   r  )rb  r<   r=   tuple[str, bool])r  r   r=   Optional[int])r  r   r=   r  )r=   r  )r  r>   r=   z*Optional[tuple[Any, Any, Any, list, bool]])rb  r<   r=   ztuple[int, OrderedSet])rb  r<   rs  r   r=   r   )
r  r  r  r  r  r  r  r   r=   r  )
r  r>   rb  r<   r}  r>   r  r  r=   r  )rD  r>   r  r>   rb  r<   r}  r>   r  r  r=   r>   )r  r>   rG  r>   r=   r>   )r  r>   rb  r<   rG  r>   r=   r>   )rb  r<   r}  r>   r  r  r=   r  )r  r>   rj  r<   rk  r   r=   r  )r  r>   r=   r  )r  r>   r  r   r  r  r=   r7  r`   )r  r>   r  r>   rb  r<   r  r   r}  r>   r  r  r  r   r=   r7  )r  r>   r  r   r  zdict[str, Any]r  r>   r  r   r=   r>   )r  r>   rb  r<   r=   r   )
r  r>   rb  r<   r  r   r  r   r=   r  )r   )rb  r<   r   zsympy.Symbolr  int | floatr=   r  )r  )rb  r<   r  r>   r=   Optional[dict[str, Any]])r  r>   r  r>   r  rp  r=   r  )rb  r<   r  r>   r  rp  r=   r  )
r   r)  r   r)  r"  rH   r  +Union[CSEVariable, tuple[CSEVariable, ...]]r=   r  )r9  r>   r=   r  )r  ra   r=   r>   )rd  r1  r=   r  )re  r   rd  r1  r=   r  )r  zlist[tuple[int, Any, Any, Any]]r  rp  r  rp  r=   rp  )rd  r1  rv  r>   r=   r  )
rd  r1  rv  r>   ro  zlist[tuple[int, int]]rs  r>   r=   r  )rd  r1  rh  r>   r=   r  )
r2  r   r  r>   r4  r  r  r  r=   r  )r  r>   r  zOptional[IRNode]r=   r  )Fr?   r@   rA   rB   rC   rf   	overridespallas_pexprr   rD  rY   rZ  rd  ra  r~  r{  r   r  r  r  r  r  r  r  r*  r  r  r  r  r  r  r  r  r0  r3  r=  rH  rN  ra  rn  re  r  r  r  r  typing_extensionsoverrider  r  r  r  r  r  r   r6  r:  rx  rT  rV  r  rX  rY  rZ  r[  r  r  r  r  rD   __classcell__)rU  s   @r)   rF  rF  F  s    &I)5E&5E.88&089=8FJ8	8!>FbH<8:5K7.0>@6,,  		>@(KT''	'(,6	 ## # 	#
 !# 
#J>(>( >( 	>(
 >( 
>(@$($( $( 	$(
 $( 
$(LI I  I  	I 
 I  
I V  	
   
64@K@K *@K7:@K	@KD;(;(,/;(@D;(	;(z&9&9)3&9EO&9	&9P2h*=A	B 9191 91 	91
 91 91 91 91 
91vN
N
 N
 %	N

 N
 N
 
N
` &
  &
P@D LP-7-7 *-73>-7FI-7	-7  -7^ EF

 ,
7B
	
 
 57NN.1N	!N*

.1
CF
	!
B+
+
/2+
DG+
	!+
Z__ _ &	_
 ;_ 
5_B & &
M^-*uP)uP0?uP	uPn 6766  6 
	6 6&vUpL
"L
03L
	L
\  +	
  
2DeD"eD69eD	eDNK"K69K	KZ WW(+W59WJNW	W W' 'r,   rF  c                  F    \ rS rSr\r\SS j5       r        SS jrSr	g)PallasSchedulingiE  c                6    [        [        R                  /5      $ r`   )r   r   REDUCE_TO_SINGLE_ELEMENT)clsrT  s     r)   get_backend_features%PallasScheduling.get_backend_featuresH  s     >BBCDDr,   c                   [         R                  R                  nXR                  ;   a  UR                  U   $ [        R
                  R                  (       a$  [        U[        R
                  R                  5      OSn[        R                  " UR                  S5      5      R                  5       S S nUS:X  a  SU 3nOSU SU 3nXtR                  U'   UR                  SU5      n[        5       nUR                  SU< S	35        UR                  US
S9  UR                  S5        [!        X$5      u  pU	 SU
 3nUR#                  XxR%                  5       U5        U$ )Nr  zutf-8   fusedpallas_r  r?  zasync_compile.pallas(z, r'''Tr{  z''')
)r   rJ  r  src_to_kernelr   tritondescriptive_namesr   hashlibsha256encode	hexdigestr  r   r  r}  r   define_kernelr\  )r$   src_codenode_scheduler   r  
fused_namekernel_hashr3  compile_wrapperoriginsdetailed_originsmetadata_comments               r)   r  PallasScheduling.define_kernelN  sO    ''&&,,,((22 }}.. "-1P1PQ 	
 nnX__W%=>HHJ2AN #K=1K#J<q>K*5h' ##O[A(*!!$9+"OPxt4!!&)$7$O!%Yb)9(:;k+C+C+EGWXr,   r;   N)rT  ztorch.devicer=   zOrderedSet[BackendFeature])r  r>   r  zSequence[BaseSchedulerNode]r   rF  r=   r>   )
r?   r@   rA   rB   rF  kernel_typeclassmethodr	  r  rD   r;   r,   r)   r  r  E  sF    KE E
"" 3" 	"
 
"r,   r  )rN   rp  r=   rp  )B
__future__r   dataclassesr  r  r   r  typingr   r   r   r   r^  r  torch.utils._ordered_setr   torch.utils._sympy.functionsr	   r  r   irr   runtime.runtime_utilsr   r   r   r   virtualizedr   block_analysisr   commonr   r   r   r   r   simdr   r   r   r"   r   collections.abcrE   rF   rG   ops_handlerrH   	schedulerrI   MAIN_SUFFIXrM   rO   _logginggetArtifactLoggerr?   rW   rR   rR  rc   rf   	dataclassr1  rF  r  r;   r,   r)   <module>r5     s   "      6 6   / 8   6 >  /  -M < && 2+- 
 L ..228]K> >2U, Ut

K t

n # # #,|'': |''~O+~ +r,   