
    Z j                         S r SSKrSSKrSSKJr  SSKrSSKJr  SSKJ	r	  SSK
Jr  \R                  " \5      r " S S	5      rS
\l         g)z+
CLI entry point for `transformers serve`.
    N)	Annotated)logging)is_serve_available   )set_torch_seedc            /          \ rS rSr                    S4S\\S-  \R                  " SS94   S\\\R                  " SS94   S\\
S-  \R                  " S	S94   S
\\
S-  \R                  " SS94   S\\
S-  \R                  " SS94   S\\S-  \R                  " SS94   S\\S-  \R                  " SS94   S\\S-  \R                  " SS94   S\\\R                  " SS94   S\\S-  \R                  " SS94   S\\\R                  " SS94   S\\S-  \R                  " SS94   S\\\R                  " SS94   S\\
\R                  " SS94   S \\\R                  " S!S94   S"\\
\R                  " S#S94   S$\\\R                  " S%S94   S&\\\R                  " S'S94   S(\\
S-  \R                  " S)S94   S*\\\R                  " S+S,S-94   S.S4*S/ jjrS0 rS1 rS2 rS3rg)5Serve!   Nforce_modelz*Model to preload and use for all requests.)helpcontinuous_batchingzMEnable continuous batching with paged attention. Configure with --cb-* flags.cb_block_sizez6KV cache block size in tokens for continuous batching.cb_num_blocksz2Number of KV cache blocks for continuous batching.cb_max_batch_tokensz1Maximum tokens per batch for continuous batching.cb_max_memory_percentz/Max GPU memory fraction for KV cache (0.0-1.0).cb_use_cuda_graphz+Enable CUDA graphs for continuous batching.attn_implementationz2Attention implementation (e.g. flash_attention_2).compilez*Enable torch.compile for faster inference.quantizationz.Quantization method: 'bnb-4bit' or 'bnb-8bit'.devicez4Device for inference (e.g. 'auto', 'cuda:0', 'cpu').dtypez2Override model dtype. 'auto' derives from weights.trust_remote_codezTrust remote code when loading.model_timeoutzGSeconds before idle model is unloaded. Ignored when force_model is set.hostzServer listen address.portzServer listen port.enable_corszEnable permissive CORS.	log_levelz'Logging level (e.g. 'info', 'warning').default_seedzDefault torch seed.non_blockingTz1Run server in a background thread. Used by tests.)hiddenr   returnc           
      \   [        5       (       d  [        S5      eSS KnSSKJn  SSKJn  SSKJn  SSK	J
n  SSKJn  SS	KJn  SS
KJn  Ub  [#        U5        [$        R&                  " S5      nUR)                  [$        R*                  UR-                  5          5        U" UUUUU
UUS9U l        SSKJn  UUUUUS.R5                  5        VV s0 s H  u  nn U c  M  UU _M     n!nn U!(       a  U" S0 U!D6OS n"U" UU	U"S9U l        U" U R.                  U R6                  S9U l        U" U R.                  U R6                  S9U l        U" U R.                  U R6                  S9U l        U" U R.                  U R6                  5      U l        U" U R.                  U R8                  U R:                  U R<                  U R>                  U R6                  US9n#URA                  U#UUSS9n$URC                  U$5      U l"        U(       a  U RG                  5         g U RD                  RI                  5         g s  sn nf )NzRMissing dependencies for serving. Install with `pip install transformers[serving]`r   r   )ChatCompletionHandler)CompletionHandler)ModelManager)ResponseHandler)build_server)TranscriptionHandler)GenerationStatetransformers)r   r   r   r   r   r   r   )ContinuousBatchingConfig)
block_size
num_blocksmax_batch_tokensmax_memory_percentuse_cuda_graph)r   r   	cb_config)model_managergeneration_state)completion_handlerresponse_handlertranscription_handlerr3   r   info)r   r   r    )%r   ImportErroruvicornserving.chat_completionr#   serving.completionr$   serving.model_managerr%   serving.responser&   serving.serverr'   serving.transcriptionr(   serving.utilsr)   r   r   
get_loggersetLevel
log_levelslower_model_managerr*   r+   items_generation_state_chat_handler_completion_handler_response_handler_transcription_handlerConfigServerserverstart_serverrun)%selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r:   r#   r$   r%   r&   r'   r(   r)   transformers_loggerr+   kv	cb_kwargsr1   appconfigs%                                        g/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/cli/serve.py__init__Serve.__init__"   s*   \ "##rssB9750?2 #<( &00@$$W%7%7	8I%JK*/ 3%'#
 	:
 ,+$7&;"3 eg

1  AqD 	 

 >G,9y9D	!0 3"
 3--!33

 $5--!33$
 
 "1--!33"

 ';4;N;NPTPfPf&g##77!33"&"="=!33#
 $TVLnnV,KKOOg

s   
H(H(c                 |   ^  U 4S jn[         R                  " USSS9T l        T R                  R                  5         g )Nc                     > [         R                  " 5       n [         R                  " U 5        U R                  TR                  R                  5       5        g )N)asyncionew_event_loopset_event_looprun_until_completerO   serve)looprR   s    rY   _run Serve.start_server.<locals>._run   s:    ))+D""4(##DKK$5$5$78    zuvicorn-threadF)targetnamedaemon)	threadingThread_threadstart)rR   rd   s   ` rY   rP   Serve.start_server   s2    	9
 !''t:JSXYrf   c                 8    U R                   R                  5         g)z$Clear all loaded models from memory.N)rF   shutdownrR   s    rY   reset_loaded_modelsServe.reset_loaded_models   s    $$&rf   c                 "   U R                   R                  5         U R                  R                  5         U R                  (       a  U R                  R	                  5       (       d  g SU R
                  l        U R                  R                  SS9  g )NT   )timeout)rH   rp   rF   rl   is_aliverO   should_exitjoinrq   s    rY   kill_serverServe.kill_server   sc    '')$$&||4<<#8#8#:#:"&!$rf   )rI   rJ   rH   rF   rK   rl   rL   rO   )NFNNNNNNFNautor|   Fi,  	localhosti@  FwarningNF)__name__
__module____qualname____firstlineno__r   strtyperArgumentboolOptionintfloatrZ   rP   rr   rz   __static_attributes__r8   rf   rY   r	   r	   !   s@    qu
       di lrpvch LWIMUZbkX\ YAsTz5>>?k+llmA 'LLmnp
	A !$J*bcc
A !$J*^__
A '$J*]^^
A"  )DL%,,,]^^ 
#A( %4K+XYY
)A. '$J*^__
/A4 43_!``a5A6  $J*Z[[
7A< #u||1ghhi=A> tU\\7k%llm?A@ %T5<<=^+_%_`AAB !#lmm
CAJ U\\/GHHIKAL U\\/DEEFMAN tU\\7P%QQROAP S%,,4]"^^_QAR  d
ELL>S,T TUSAT  %,,d1dee
UAZ 
[AF'%rf   r	   u  
Run a FastAPI server to serve models on-demand with an OpenAI compatible API.
Models will be loaded and unloaded automatically based on usage and a timeout.


Endpoints:
    POST /v1/chat/completions — Chat completions (streaming + non-streaming).
    POST /v1/completions      — Legacy text completions from a prompt.
    GET  /v1/models           — Lists available models.
    GET  /health              — Health check.

Requires FastAPI and Uvicorn: pip install transformers[serving]
)__doc__r^   rj   typingr   r   transformers.utilsr   transformers.utils.import_utilsr   rA   r   rB   r   loggerr	   r8   rf   rY   <module>r      sI        & > ) 
		H	%W% W%trf   