
    Z jJ                     
   S r SSKrSSKrSSKrSSKrSSKJr  SSKJr  SSK	J
r
  SSKJr  SSKJr  SSKrSSKJrJr  S	S
KJr  SSKJrJrJr  \
(       a
  SSKJrJrJr  \R2                  " \5      r " S S5      r " S S5      rg)z3
Model loading, caching, and lifecycle management.
    N)Callable)	lru_cache)TYPE_CHECKING)scan_cache_dir)tqdm)BitsAndBytesConfigPreTrainedTokenizerBase   )logging   )Modalitymake_progress_tqdm_classreset_torch_cache)PreTrainedModelPreTrainedTokenizerFastProcessorMixinc            	       X    \ rS rSrSr  SSSS\SSS	S
4S jjrSS jrSS jrSS jr	Sr
g)
TimedModel+   aL  Wraps a model + processor and auto-unloads them after a period of inactivity.

Args:
    model: The loaded model.
    timeout_seconds: Seconds of inactivity before auto-unload. Use -1 to disable.
    processor: The associated processor or tokenizer.
    on_unload: Optional callback invoked after the model is unloaded from memory.
Nmodelr   timeout_seconds	processor/ProcessorMixin | PreTrainedTokenizerFast | None	on_unloadzCallable | Nonec                     Xl         [        UR                  5      U l        X0l        X l        X@l        [        R                  " U R
                  U R                  5      U l
        U R                  R                  5         g N)r   strname_or_path_name_or_pathr   r   
_on_unload	threadingTimer_timeout_reached_timerstart)selfr   r   r   r   s        w/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/cli/serving/model_manager.py__init__TimedModel.__init__5   sZ     
 !3!34".#ood&:&:D<Q<QR    c                     U R                   R                  5         [        R                  " U R                  U R
                  5      U l         U R                   R                  5         g)z4Reset the inactivity timer (called on each request).N)r$   cancelr!   r"   r   r#   r%   r&   s    r'   reset_timerTimedModel.reset_timerD   s@    ood&:&:D<Q<QRr*   c                    [        U S5      (       ax  U R                  bj  U ?U ?SU l        SU l        [        R                  " 5         [        5         U R                  R                  5         U R                  b  U R                  5         gggg)z0Delete the model and processor, free GPU memory.r   N)	hasattrr   r   gccollectr   r$   r,   r    r-   s    r'   delete_modelTimedModel.delete_modelJ   sp    4!!djj&<
DJ!DNJJLKK *! + '=!r*   c                     U R                   S:  a@  U R                  5         [        R                  U R                   SU R                    S35        g g )Nr   z was removed from memory after zs of inactivity)r   r4   loggerinfor   r-   s    r'   r#   TimedModel._timeout_reachedW   sJ    !#KK4--..MdNbNbMccrst $r*   )r   r    r$   r   r   r   NNreturnN)__name__
__module____qualname____firstlineno____doc__intr(   r.   r4   r#   __static_attributes__ r*   r'   r   r   +   sM     HL'+   E	
 %"ur*   r   c                      \ rS rSrSr       S$S\S\S-  S\S\S-  S\S-  S	\S
\S-  4S jjr\	S\S-  4S j5       r
S r\	S\S\4S j5       rS\S-  4S jrS\SS4S jr S%S\S\S-  S\S-  SS4S jjr  S%S\S\S-  S\S-  SS4S jjrS\4S jrS&S jr\	 S'SSSSS\4S  jj5       r\	\S'S!\S-  S\\   4S" jj5       5       rS#rg)(ModelManager]   a  Loads, caches, and manages the lifecycle of models.

Handlers receive a reference to this and call `load_model_and_processor()`
to get a model ready for inference.

Args:
    device: Device to place models on (e.g. "auto", "cuda", "cpu").
    dtype: Torch dtype override. "auto" derives from model weights.
    trust_remote_code: Whether to trust remote code when loading models.
    attn_implementation: Attention implementation override (e.g. "flash_attention_2").
    quantization: Quantization method ("bnb-4bit" or "bnb-8bit").
    model_timeout: Seconds before an idle model is unloaded. -1 disables.
    force_model: If set, preload this model at init time.
Ndevicedtypetrust_remote_codeattn_implementationquantizationmodel_timeoutforce_modelc                    0 U l         0 U l        [        R                  " 5       U l        0 U l        0 U l        UR                  5       (       a  [        U5      OUU l	        U R                  U5      U l        X0l        X@l        XPl        X`l        Xpl        U R#                  5         Ub  SU l        Ub!  U R%                  U R'                  U5      5        g g )N)loaded_models_model_locksr!   Lock_model_locks_guard_loading_subscribers_loading_tasksisdigitrB   rH   _resolve_dtyperI   rJ   rK   rL   rM   rN   _validate_argsload_model_and_processorprocess_model_name)r&   rH   rI   rJ   rK   rL   rM   rN   s           r'   r(   ModelManager.__init__m   s     57 8:"+.."2 QS!79 &,^^%5%5c&k6((/
!2#6 (*& "!#D "))$*A*A+*NO #r*   c                     SS K nU S;   a  U $ [        XS 5      n[        X!R                  5      (       d  [	        SU  S35      eU$ )Nr   )autoNzUnsupported dtype: 'zF'. Must be 'auto' or a valid torch dtype (e.g. 'float16', 'bfloat16').)torchgetattr
isinstancerI   
ValueError)rI   r_   resolveds      r'   rX   ModelManager._resolve_dtype   sN    N"L5.(KK00&ug-st  r*   c                 \   U R                   b)  U R                   S;  a  [        SU R                    S35      e1 SknU R                  S L=(       a    U R                  R                  S5      nU R                  b5  U(       d-  U R                  U;  a  [        SU R                   SU S35      eg g g )	N)bnb-4bitbnb-8bitz"Unsupported quantization method: 'z$'. Must be 'bnb-4bit' or 'bnb-8bit'.>   sdpaeagerflex_attentionflash_attention_2flash_attention_3zkernels-community/z'Unsupported attention implementation: 'z'. Must be one of zF or a kernels-community kernel (e.g. 'kernels-community/flash-attn2').)rL   rb   rK   
startswith)r&   VALID_ATTN_IMPLEMENTATIONSis_kernels_communitys      r'   rY   ModelManager._validate_args   s    (T->->F^-^4T5F5F4GGkl  &s"#77tC  
H`H`HkHk I
 $$0(((0JJ9$:R:R9S T""<!=  >DE  K ) 1r*   model_idr<   c                     SU ;   a  U $ U  S3$ )zBCanonicalize to `'model_id@revision'` format. Defaults to `@main`.@z@mainrD   )rq   s    r'   r[   ModelManager.process_model_name   s     (?O5!!r*   c                 l    U R                   S:X  a  [        SSSS9$ U R                   S:X  a	  [        SS9$ g)zIReturn a BitsAndBytesConfig based on the `quantization` setting, or None.rf   Tnf4)load_in_4bitbnb_4bit_quant_typebnb_4bit_use_double_quantrg   )load_in_8bitN)rL   r   r-   s    r'   get_quantization_config$ModelManager.get_quantization_config   sD    
*%!$)*. 
 *,%488r*   model_id_and_revisionz(ProcessorMixin | PreTrainedTokenizerFastc                 j    SSK Jn  UR                  SS5      u  p4UR                  X4U R                  S9$ )ztLoad a processor for the given model.

Args:
    model_id_and_revision: Model ID in ``'model_id@revision'`` format.
r   )AutoProcessorrs   r   )revisionrJ   )transformersr   splitfrom_pretrainedrJ   )r&   r}   r   rq   r   s        r'   _load_processorModelManager._load_processor   s9     	/288a@,,X\`\r\r,ssr*   
tqdm_classprogress_callbackr   c                    SSK Jn  UR                  SS5      u  pVUU R                  U R                  U R
                  U R                  U R                  5       US.nUb  U" SUSS.5        UR                  " U40 UD6nSS	K	J
n	  UR                  U	;   a  SS
K Jn
  U
R                  " U40 UD6$ [        [         UR                  S   5      nUR                  " U40 UD6$ )aK  Load a model.

Args:
    model_id_and_revision (`str`): Model ID in ``'model_id@revision'`` format.
    tqdm_class (*optional*): tqdm subclass for progress bars during ``from_pretrained``.
    progress_callback (`Callable`, *optional*): Called with progress dicts during loading.

Returns:
    `PreTrainedModel`: The loaded model.
r   )
AutoConfigrs   r   )r   rK   rI   
device_maprJ   quantization_configr   loadingconfigstatusr   stage)%MODEL_FOR_MULTIMODAL_LM_MAPPING_NAMES)AutoModelForMultimodalLM)r   r   r   rK   rI   rH   rJ   r{   r   &transformers.models.auto.modeling_autor   
model_typer   r`   architectures)r&   r}   r   r   r   rq   r   model_kwargsr   r   r   architectures               r'   _load_modelModelManager._load_model   s     	,288a@ !#'#;#;ZZ++!%!7!7#'#?#?#A$
 (=R]efg++HEE` EE=+;;HUUU|V-A-A!-DE++HEEEr*   z@tuple[PreTrainedModel, ProcessorMixin | PreTrainedTokenizerFast]c           	        ^  T R                      T R                  R                  U[        R                  " 5       5      nSSS5        W   UT R
                  ;  a  [        R                  SU 35        Ub  U" SUSS.5        T R                  U5      nT R                  XUS9n[        UT R                  UU4U 4S jjS9T R
                  U'   Ub  U" S	US
S.5        O^T R
                  U   R                  5         T R
                  U   R                  nT R
                  U   R                  nUb  U" S	USS.5        SSS5        Xe4$ ! , (       d  f       GN= f! , (       d  f       WW4$ = f)at  Load a model (or return it from cache), resetting its inactivity timer.

Args:
    model_id_and_revision: Model ID in ``'model_id@revision'`` format.
    progress_callback: If provided, called with dicts like
        ``{"status": "loading", "model": ..., "stage": ...}`` during loading.
    tqdm_class: Optional tqdm subclass for progress bars during ``from_pretrained``.
NzLoading r   r   r   )r   r   c                 <   > TR                   R                  U S 5      $ r   )rQ   pop)keyr&   s    r'   <lambda>7ModelManager.load_model_and_processor.<locals>.<lambda>  s    @R@R@V@VWZ\`@ar*   )r   r   r   readyFr   r   cachedT)rT   rR   
setdefaultr!   rS   rQ   r7   warningr   r   r   rM   r.   r   r   )r&   r}   r   r   lockr   r   s   `      r'   rZ   %ModelManager.load_model_and_processor   sq    $$$$//0Ey~~GWXD % $D,>,>>*?)@AB$0%EZep&qr 001FG	(()Te )  =G$($6$6')>a	=""#89 %0%CXdi&jk""#89EEG**+@AGG ../DEOO	$0%CXdh&ij- . 5 %$ T. is   0EC1E
E
E'c                  ^ ^^^^^	#    Um[         R                  " 5       nTT R                  ;   a?  T R                  T   R                  5         S[        R
                  " STSS.5       S37v   gTT R                  ;   aC  T R                  T   R                  U5         UR                  5       I Sh  vN nUc   gU7v   M$  U/T R                  T'   [         R                  " 5       mS[        4UUU 4S jjm[        TT5      m	U	4S	 jmUUUUU U	4S
 jn[         R                  " U" 5       5      T R                  T'    UR                  5       I Sh  vN nUc  gU7v   M#   N N7f)u  Load a model and stream progress as SSE events.

Handles three cases:
1. Model already cached → single ``ready`` event
2. Load already in progress → join existing subscriber stream
3. First request → start loading, broadcast to all subscribers

Args:
    model_id_and_revision (`str`): Model ID in ``'model_id@revision'`` format.

Yields:
    `str`: SSE ``data: ...`` lines with progress updates.
data: r   Tr   

Npayloadc                 n   >^ S[         R                  " U 5       S3mUUU4S jnTR                  U5        g )Nr   r   c                  n   > TR                   R                  T/ 5       H  n U R                  T5        M     g r   )rU   get
put_nowait)qmidmsgr&   s    r'   	broadcastEModelManager.load_model_streaming.<locals>.enqueue.<locals>.broadcastN  s-    2266sB?ALL% @r*   )jsondumpscall_soon_threadsafe)r   r   r   loopr   r&   s     @r'   enqueue2ModelManager.load_model_streaming.<locals>.enqueueK  s0    4::g./t4C& %%i0r*   c                    > T" U0 UD6$ r   rD   )factoryargskwargsr   s      r'   
_tqdm_hook5ModelManager.load_model_streaming.<locals>._tqdm_hookV  s    t.v..r*   c                    >#     [         R                  " T5      n  [        R                  " TR                  TTTS9I S h  vN   [         R                  " U 5        UU4S jnTR                  U5        g  N3! [         R                  " U 5        f = f! [
         a9  n[        R                  ST SU 3SS9  T" ST[        U5      S.5         S nANtS nAff = f! UU4S jnTR                  U5        f = f7f)	N)r   r   zFailed to load z: T)exc_infoerror)r   r   messagec                     > TR                   R                  T/ 5       H  n U R                  S 5        M     TR                  R                  TS 5        g r   )rU   r   r   rV   )r   r   r&   s    r'   _send_sentinelKModelManager.load_model_streaming.<locals>.run_load.<locals>._send_sentinelm  sC    !66::3CT* D''++C6r*   )
r   set_tqdm_hookasyncio	to_threadrZ   	Exceptionr7   r   r   r   )	previous_hooker   r   r   r   r   r&   r   s	      r'   run_load3ModelManager.load_model_streaming.<locals>.run_loadY  s     : !( 5 5j A9!++55*1#-	   ))-87
 )).9% ))-8 Nse2aS9DI7SSVLMMN
7
 )).9sc   C4B $A6  A4A6 B C44A6 6BB 
C/C
C CC C11C4)r   QueuerQ   r.   r   r   rV   rU   appendr   get_running_loopdictr   create_task)
r&   r}   queueitemr   r   r   r   r   r   s
   `    @@@@@r'   load_model_streaming!ModelManager.load_model_streaming&  sS     $+2==? $$$$s#//14::3RV&WXYY]^^ $%%%%%c*11%8"YY[(< 
	  +0!!#&'')	1T 	1 	1 .gs;
	/	: 	:6 $+#6#6xz#BC $D|J	 k )l %s%   B)E$1E 2BE$E"E$"E$c                 x    [        U R                  R                  5       5       H  nUR                  5         M     g)z,Delete all loaded models and free resources.N)listrQ   valuesr4   )r&   timeds     r'   shutdownModelManager.shutdown|  s,    $,,3356E  7r*   r   r   r   c                    Ub%  [        U[        5      (       a  [        R                  $ SSKJnJnJn  U R                  R                  nXTR                  5       ;   a  [        R                  $ XSR                  5       ;   a  [        R                  $ XRR                  5       ;   a  [        R                  $ [        SU 35      e)a  Detect whether a model is an LLM or VLM based on its architecture.

Args:
    model (`PreTrainedModel`): The loaded model.
    processor (`ProcessorMixin | PreTrainedTokenizerFast`, *optional*):
        If a plain tokenizer (not a multi-modal processor), short-circuits to LLM.

Returns:
    `Modality`: The detected modality (``Modality.LLM``, ``Modality.VLM``, or ``Modality.MULTIMODAL``).
r   !MODEL_FOR_CAUSAL_LM_MAPPING_NAMES*MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMESr   zUnknown modality for: )ra   r	   r   LLMr   r   r   r   	__class__r=   r   
MULTIMODALVLMrb   )r   r   r   r   r   model_classnames         r'   get_model_modalityModelManager.get_model_modality  s      Z	;R%S%S<<	
 	
  //22JJLL&&& Q Q SS<< H H JJ<<5o5FGHHr*   	cache_dirc           	      h  ^^^ SSK JnJnJn  / n[        R                  S5        [        [        U 5      R                  5       GHj  nUR                  S:w  a  M  UR                  R                  5        GH5  u  pg[        S UR                   5       S5      nU(       d  M,  [        R                  " UR!                  5       R#                  5       5      n	[%        U	[&        5      (       a  SU	;   d  M{  U	S   n
UR)                  5       mUR)                  5       mUR)                  5       m[+        UUU4S jU
 5       5      (       d  M  S	UR,                  ;   a  UR,                  R/                  S	5      OS
nUR,                  US:w  a  SU 3OS
-   nUR1                  UUSUR2                  S.5        GM8     GMm     U$ )a.  List generative models (LLMs and VLMs) available in the HuggingFace cache.

Args:
    cache_dir (`str`, *optional*): Path to the HuggingFace cache directory.
        Defaults to the standard cache location.

Returns:
    `list[dict]`: OpenAI-compatible model list entries with ``id``, ``object``, etc.
r   r   z/Scanning the cache directory for LLMs and VLMs.r   c              3   \   #    U  H"  oR                   S :X  d  M  UR                  v   M$     g7f)zconfig.jsonN)	file_name	file_path).0fs     r'   	<genexpr>.ModelManager.get_gen_models.<locals>.<genexpr>  s$     #m9LAP[P[_lPlKAKK9Ls   ,,Nr   c              3   B   >#    U  H  o/ TQTQTQ;   d  M  Uv   M     g 7fr   rD   )r   archllms
multimodalvlmss     r'   r   r     s)     ]A\4A\$A\Q[A\9\tts   	/ mainrs   )owned_byidobjectcreated)r   r   r   r   r7   r   r   r   repos	repo_typerefsitemsnextfilesr   loadsopenreadra   r   r   anyrepo_idr   r   last_modified)r   r   r   r   generative_modelsreporefrevision_infoconfig_pathr   r   authorrepo_handler   r   r   s                @@@r'   get_gen_modelsModelManager.get_gen_models  sj   	
 	
 HI	2889D~~(&*iioo&7""#m9L9L#most"K$4$4$6$;$;$=>"6400_5N & 78??AAHHJBIIK
]]]]8;t||8KT\\//4QSF"&,,sf}AcU)RT"UK%,,(."-&-'+'9'9	# '8	 :< ! r*   )rU   rV   rR   rT   rK   rH   rI   rN   rQ   rM   rL   rJ   )r^   r^   FNNi,  Nr:   r;   r   )r=   r>   r?   r@   rA   r   boolrB   r(   staticmethodrX   rY   r[   r   r{   r   typer   r   rZ   r   r   r   r   r   r   r   r  rC   rD   r*   r'   rF   rF   ]   s   " ""'*.#' "&%P%P Tz%P  	%P
 !4Z%P Dj%P %P 4Z%PN 
cDj 
 
& "S "S " "
);d)B 
	tS 	t=g 	t pt'F%('F6:Tk'F]ehl]l'F	'FX .2"&	) ")  $d?)  4K	) 
 
L) VT Tl!
 aeI I-^I	I I@ 1!#* 1!T
 1!  1!r*   rF   )rA   r   r2   r   r!   collections.abcr   	functoolsr   typingr   huggingface_hubr   r   r   r   r	   utilsr   r   r   r   r   r   r   
get_loggerr=   r7   r   rF   rD   r*   r'   <module>r     sl     	   $    *   D  H H UU 
		H	%/u /udx! x!r*   