
    Z j                     *   S r SSKrSSKJr  SSKrSSKJr  SSKJr  SSKJ	r
  SSKJr  SS	KJrJrJr  SS
KJr  SSKJr  SSKJrJrJr  SSKJr  SSKJr  SSKJrJrJ r   SSK!J"r"  SSK#J$r$  \RJ                  " \&5      r' " S S\RP                  5      r) " S S\RP                  5      r* " S S\RP                  5      r+ " S S\5      r,\ " S S\5      5       r-\ " S S\-5      5       r.\" SS 9 " S! S"\-\5      5       r/\" S#S 9 " S$ S%\-5      5       r0/ S&Qr1g)'zPyTorch OpenAI ImageGPT model.    N)Any)nn)CrossEntropyLoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions SequenceClassifierOutputWithPast)PreTrainedModel)Conv1D)auto_docstringloggingtorch_float)maybe_autocast   )ImageGPTConfigc                   x   ^  \ rS rSrS	S\\   S\4U 4S jjjrS\R                  S\R                  4S jr
SrU =r$ )
ImageGPTLayerNorm/   hidden_sizeepsc                    > [         TU ]  5         X l        [        R                  " [
        R                  " U5      5      U l        g N)super__init__r   r   	ParametertorchTensorweight)selfr   r   	__class__s      /root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/imagegpt/modeling_imagegpt.pyr!   ImageGPTLayerNorm.__init__0   s,    ll5<<#<=    tensorreturnc           	          U[         R                  " [         R                  " [         R                  " U5      SSS9U R                  -   5      -  nXR
                  -  nU$ )NT)axiskeepdim)r#   sqrtmeansquarer   r%   )r&   r+   s     r(   forwardImageGPTLayerNorm.forward5   sI    %**UZZV0D2W[%\_c_g_g%ghh++%r*   )r   r%   )gh㈵>)__name__
__module____qualname____firstlineno__tupleintfloatr!   r#   r$   r4   __static_attributes____classcell__r'   s   @r(   r   r   /   s?    >E#J >U > >
ell u||  r*   r   c                     ^  \ rS rSrSS\S-  S\S-  4U 4S jjjrSS jrSS jrS r	S	 r
      SS
\R                  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\4S jjrSrU =r$ )ImageGPTAttention<   Nis_cross_attention	layer_idxc           
        > [         TU ]  5         Xl        UR                  nU R	                  S[
        R                  " [
        R                  " XD4[
        R                  S95      R                  SSXD5      SS9  UR                  U l        UR                  U l        U R                  U R                  -  U l        U R                  U l        U R                  U R                  -  U R                  :w  a&  [!        SU R                   SU R                   S35      eUR"                  U l        X l        UR&                  U l        X0l        UR*                  U l        U R$                  (       aN  [-        S	U R                  -  U R                  5      U l        [-        U R                  U R                  5      U l        O([-        S
U R                  -  U R                  5      U l        [-        U R                  U R                  5      U l        [4        R6                  " UR8                  5      U l        [4        R6                  " UR<                  5      U l        g )Nbiasdtyper   F)
persistentz=`embed_dim` must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).   r   ) r    r!   configmax_position_embeddingsregister_bufferr#   trilonesboolviewr   	embed_dimnum_attention_heads	num_headshead_dim
split_size
ValueErrorscale_attn_weightsrC   scale_attn_by_inverse_layer_idxrD   reorder_and_upcast_attnr   c_attnq_attnc_projr   Dropout
attn_pdropattn_dropoutresid_pdropresid_dropout)r&   rK   rC   rD   max_positionsr'   s        r(   r!   ImageGPTAttention.__init__=   s   66JJuzz="@

STYY1m  	 	
  ++33$..8..==4>>)T^^;OPTP^P^O_ `NN#2' 
 #)";";"4 06/U/U,"'-'E'E$"" T^^!3T^^DDK @DK T^^!3T^^DDKT^^T^^<JJv'8'89ZZ(:(:;r*   c                 F   [         R                  " XR                  SS5      5      nU R                  (       a   U[	        UR                  S5      S-  5      -  nU R                  (       a  U[        U R                  S-   5      -  nU R                  (       d  UR                  S5      UR                  S5      pvU R                  S S 2S S 2Xv-
  U2S U24   n[         R                  " UR                  5      R                  n	[         R                  " XR                  UR                  S9n	[         R                   " XU	5      nUb  XT-   n["        R$                  " SS9" U5      nUR'                  UR                  5      nU R)                  U5      n[         R                  " XS5      n
X4$ )Nr.         ?r   rH   devicedim)r#   matmul	transposerX   r   sizerY   r<   rD   rC   rF   finforH   minr+   ri   wherer   Softmaxtyper`   )r&   querykeyvalueattention_maskattn_weightsquery_length
key_lengthcausal_mask
mask_valueattn_outputs              r(   _attnImageGPTAttention._attne   sP   ||E==R+@A""'+ejjn6K*LLL //'%0B*CCL&&',zz"~sxx|*))Aq**Cj*PR]S]R]$]^K\%7%78<<J j8J8JS_SfSfgJ ;;{*ML%'8Lzzb),7 $((5((6ll<7((r*   c           	      l   UR                  5       u  pVpxUR                  5       u    pn	[        R                  " XV-  Xz[        R                  UR                  S9nSnU R
                  (       a   U[        UR                  S5      5      S-  -  nU R                  (       a  U[        U R                  S-   5      -  n[        UR                  R                  SS9   UR                  SXx5      UR                  SS5      R                  SX5      p[        R                  " XR                  5       UR                  5       S	US
9nUR                  XVXz5      nS S S 5        U R                  (       d  UR                  S5      UR                  S5      nnU R                  S S 2S S 2UU-
  U2S U24   n[        R                   " UR"                  5      R$                  n[        R&                  " UUR"                  UR                  S9n[        R(                  " UUU5      nUb  X-   n[*        R,                  " SS9" U5      nUR"                  [        R                  :w  a  [/        S5      eUR                  UR"                  5      nU R1                  U5      n[        R2                  " X5      nUU4$ ! , (       d  f       GN\= f)Nrh         ?r.   rg   r   F)enabledrf   r   )betaalpharj   zDError with upcasting, attn_weights does not have dtype torch.float32)rn   r#   emptyfloat32ri   rX   r<   rY   rD   r   rs   reshaperm   baddbmmrC   rF   ro   rH   rp   r+   rq   r   rr   RuntimeErrorr`   rl   )r&   rt   ru   rv   rw   bszrT   	q_seq_lendk_	k_seq_lenrx   scale_factorqkry   rz   r{   r|   r}   s                       r(   _upcast_and_reordered_attn,ImageGPTAttention._upcast_and_reordered_attn   s2   (-

%	 XXZ1 {{3?IPUP]P]fkfrfrs ""E%**R.1S88L//E$..1"455L ELL--u===Y3S]]2r5J5R5RSUWY5eq ==wwy!'')RS[ghL'//	UL >
 &&',zz"~sxx|*L))Aq*|*Cj*PR]S]R]$]^K\%7%78<<J j8J8JS_SfSfgJ ;;{L*ML%'8Lzzb),7 .eff#((5((6ll<7L((; >=s   A9J$$
J3c                 v    UR                  5       SS X#4-   nUR                  " U6 nUR                  SSSS5      $ )z:
Splits hidden_size dim into attn_head_size and num_heads
Nr.   r   rJ   r   r   )rn   rQ   permuter&   r+   rT   attn_head_size	new_shapes        r(   _split_headsImageGPTAttention._split_heads   sA     KKM#2&))DD	i(~~aAq))r*   c                     UR                  SSSS5      R                  5       nUR                  5       SS X#-  4-   nUR                  U5      $ )zC
Merges attn_head_size dim and num_attn_heads dim into hidden_size
r   rJ   r   r   Nrf   )r   
contiguousrn   rQ   r   s        r(   _merge_headsImageGPTAttention._merge_heads   sM     1a+668KKM#2&)*D)FF	{{9%%r*   hidden_states
layer_pastrw   encoder_hidden_statesencoder_attention_mask	use_cacheoutput_attentionsr,   c                    US Ln	UR                   u  pnUb]  [        U[        5      (       aF  UR                  R	                  U R
                  5      nU	(       a  UR                  nOUR                  nOUnU	(       a  UOUnU	(       Ga-  [        U S5      (       d  [        S5      eUb`  W(       aY  U R                  U5      nWR                  U R
                     R                  nUR                  U R
                     R                  nGOKU R                  U5      nU R                  U5      R                  U R                   SS9u  nnUR#                  U
SU R$                  U R&                  5      R)                  SS5      nUR#                  U
SU R$                  U R&                  5      R)                  SS5      nOU R                  U5      R                  U R                   SS9u  nnnUR#                  U
SU R$                  U R&                  5      R)                  SS5      nUR#                  U
SU R$                  U R&                  5      R)                  SS5      nUb@  WR+                  UUU R
                  5      u  nnU	(       a  SUR                  U R
                  '   UR#                  XU R$                  U R&                  5      R)                  SS5      nU R,                  (       a  U R/                  UUUU5      u  nnOU R1                  UUUU5      u  nnU R3                  UU R$                  U R&                  5      nU R5                  U5      nU R7                  U5      nUU4$ )Nr\   zIf class is used as cross attention, the weights `q_attn` have to be defined. Please make sure to instantiate class with `ImageGPTAttention(..., is_cross_attention=True)`.rJ   rj   r.   r   T)shape
isinstancer   
is_updatedgetrD   cross_attention_cacheself_attention_cachehasattrrW   r\   layerskeysvaluesr[   splitrV   rQ   rT   rU   rm   updaterZ   r   r~   r   r]   rb   )r&   r   r   rw   r   r   r   r   kwargsrC   r   seq_lenr   r   curr_past_key_valuescurrent_statesrt   ru   rv   r}   rx   s                        r(   r4   ImageGPTAttention.forward   s    3$>'--a!*&9::'2266t~~F
%+5+K+K(+5+J+J('1$2D.-4** t 
 %*M2*11$..AFF,33DNNCJJM2![[8>>tTU>V
UhhsBFPPQRTUV

3DNNDMMJTTUVXYZ $N ; A A$//WX A YE3((3DNNDMMBLLQPQRCJJsBFPPQRTUVE!-44S%PJC!8<
%%dnn5

3GQQRSUVW''(,(G(GsTY[i(j%K(,

5#un(U%K''T^^T]]Skk+.((5L((r*   )r`   r[   r]   rK   rR   rU   rC   rD   rT   r\   rZ   rb   rY   rX   rV   )FNr   NNNNFF)r6   r7   r8   r9   rP   r;   r!   r~   r   r   r   r#   r$   r	   r:   r4   r=   r>   r?   s   @r(   rA   rA   <   s    &<4$; &<SVY]S] &< &<P )D.)`*& $(.2596:!&).B)||B) DLB) t+	B)
  %||d2B) !&t 3B) $;B)  $;B) 
B) B)r*   rA   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )ImageGPTMLPi  c                    > [         TU ]  5         UR                  n[        X5      U l        [        X15      U l        [        UR                     U l        [        R                  " UR                  5      U l        g r   )r    r!   r   r   c_fcr]   r   activation_functionactr   r^   ra   dropout)r&   intermediate_sizerK   rR   r'   s       r(   r!   ImageGPTMLP.__init__  sZ    &&	,8	Y:&445zz&"4"45r*   r   r,   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r]   r   )r&   r   s     r(   r4   ImageGPTMLP.forward  s@    		-0/M2]3r*   )r   r   r]   r   )
r6   r7   r8   r9   r!   r#   r$   r4   r=   r>   r?   s   @r(   r   r     s(    6U\\ ell  r*   r   c                      ^  \ rS rSrSU 4S jjr      SS\R                  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\S-  S
\S-  S\	4S jjr
SrU =r$ )ImageGPTBlocki  Nc                   > [         TU ]  5         UR                  nUR                  b  UR                  OSU-  n[	        X1R
                  S9U l        [        XS9U l        [	        X1R
                  S9U l	        UR                  (       a(  [        USUS9U l        [	        X1R
                  S9U l        [        XA5      U l        g )N   r   rD   T)rC   rD   )r    r!   r   n_innerr   layer_norm_epsilonln_1rA   attnln_2add_cross_attentioncrossattentionln_cross_attnr   mlp)r&   rK   rD   r   	inner_dimr'   s        r(   r!   ImageGPTBlock.__init__  s    ((&,nn&@FNNa+o	%k7P7PQ	%fB	%k7P7PQ	%%"3Ft_h"iD!2;D]D]!^Dy1r*   r   r   rw   r   r   r   r   r,   c           	      z   Un	U R                  U5      nU R                  UUUUUS9n
U
S   nU
SS  nX-   nUbW  [        U S5      (       d  [        SU  S35      eUn	U R	                  U5      nU R                  UUUUUUS9nUS   nX-   nXSS  -   nUn	U R                  U5      nU R                  U5      nX-   nU4U-   $ )N)r   rw   r   r   r   r   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)r   rw   r   r   r   )r   r   r   rW   r   r   r   r   )r&   r   r   rw   r   r   r   r   r   residualattn_outputsr}   outputscross_attn_outputsfeed_forward_hidden_statess                  r(   r4   ImageGPTBlock.forward-  s'    !		-0yy!)/ ! 
 #1oqr"#. ,4!122 =dV DZ Z  %H ..}=M!%!4!4%-&;'="3 "5 " -Q/K$2M12 66G 		-0%)XXm%<" ='))r*   )r   r   r   r   r   r   r   r   )r6   r7   r8   r9   r!   r#   r$   r	   rP   r:   r4   r=   r>   r?   s   @r(   r   r     s    2$ $(.2596:!&).5*||5* DL5* t+	5*
  %||d25* !&t 35* $;5*  $;5* 
5* 5*r*   r   c                   r   ^  \ rS rSr% \\S'   SrSrSrSr	S/r
\R                  " 5       U 4S j5       rS	rU =r$ )
ImageGPTPreTrainedModelie  rK   transformer	input_ids)imageTr   c           
      z  > [         TU ]  U5        [        U[        5      (       a  UR	                  5        Hm  u  p#SU;   d  M  SU;   d  M  [
        R                  " USU R                  R                  [        R                  " SU R                  R                  -  5      -  S9  Mo     g[        U[        5      (       a  UR                  R                  n[
        R                  " UR                  [         R"                  " [         R$                  " XD4[         R&                  S95      R)                  SSXD5      5        gg)	zInitialize the weights.r]   r%   g        rJ   )r2   stdrG   r   N)r    _init_weightsr   r   named_parametersinitnormal_rK   initializer_rangemathr1   n_layerrA   rL   copy_rF   r#   rN   rO   rP   rQ   )r&   modulenameprc   r'   s        r(   r   %ImageGPTPreTrainedModel._init_weightsn  s     	f% fo..!224t#D(8LL$++2O2ORVR[R[\]`d`k`k`s`s\sRt2tu 5  122"MMAAMJJ

5::}&DEJJWX]]q- 3r*    )r6   r7   r8   r9   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modulesr#   no_gradr   r=   r>   r?   s   @r(   r   r   e  sA    %!O!&*#()
]]_ r*   r   c                   ~  ^  \ rS rSrS\4U 4S jjrS rS r\            SS\	R                  S-  S\S-  S	\	R                  S-  S
\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S\S-  S\S-  S\S-  S\S\\-  4S jj5       rSrU =r$ )ImageGPTModeli  rK   c           
      Z  > [         TU ]  U5        UR                  U l        [        R
                  " UR                  U R                  5      U l        [        R
                  " UR                  U R                  5      U l	        [        R                  " UR                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[!        XS9PM     sn5      U l        [%        U R                  UR&                  S9U l        SU l        U R-                  5         g s  snf )Nr   r   F)r    r!   r   rR   r   	Embedding
vocab_sizewterL   wper^   
embd_pdropdrop
ModuleListrangenum_hidden_layersr   hr   r   ln_fgradient_checkpointing	post_init)r&   rK   ir'   s      r(   r!   ImageGPTModel.__init__  s     ++<< 1 14>>B<< > >OJJv001	ERXRjRjLklLkqf BLklm%dnn&:S:ST	&+#  ms   D(c                     U R                   $ r   r   )r&   s    r(   get_input_embeddings"ImageGPTModel.get_input_embeddings  s    xxr*   c                     Xl         g r   r	  )r&   new_embeddingss     r(   set_input_embeddings"ImageGPTModel.set_input_embeddings  s    !r*   Nr   past_key_valuesrw   token_type_idsposition_idsinputs_embedsr   r   r   r   output_hidden_statesreturn_dictr   r,   c                 j   U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  UOU R                   R                  nUb  Ub  [        S5      eUbF  U R                  X5        UR                  5       nUR                  SUS   5      nUR                  S   nO1Ub#  UR                  5       SS nUR                  S   nO[        S5      eUb  UR                  OUR                  nU R                  (       a/  U R                  (       a  U	(       a  [        R                  S5        Sn	Ub  UR                  SUS   5      nU	(       a  Uc  [        U R                   S9nUcA  Ub  UR!                  5       OSn["        R$                  " US   US	9U-   nUR'                  S5      nUby  US::  a  [        S
5      eUR                  US5      nUSS2SSSS24   nUR)                  U R*                  S9nSU-
  ["        R,                  " U R*                  5      R.                  -  nU R                   R0                  (       aE  UbB  UR                  5       u  nnnUU4nUc  ["        R2                  " UUS	9nU R5                  U5      nOSnUc  U R7                  U5      nU R9                  U5      nUUR)                  UR                  5      -   nUb  U R7                  U5      nUU-   nU R;                  U5      nUUR                  S5      4-   nU
(       a  SOSnU
(       a  U R                   R0                  (       a  SOSnU(       a  SOSn[=        U R>                  5       H\  u  nnU(       a  UU4-   nU" UUUUUU	U
S9nUS   nU
(       d  M-  UUS   4-   nU R                   R0                  (       d  MS  UUS   4-   nM^     U RA                  U5      nUR                  " U6 nU(       a  UU4-   nU(       d  [C        S UUUUU4 5       5      $ [E        UUUUUS9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.

Examples:

```python
>>> from transformers import AutoImageProcessor, ImageGPTModel
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
>>> model = ImageGPTModel.from_pretrained("openai/imagegpt-small")

>>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
```NzDYou cannot specify both input_ids and inputs_embeds at the same timer.   r   z5You have to specify either input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)rK   )ri   z$batch_size has to be defined and > 0rG   r   r   )r   r   r   r   rJ   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   ).0vs     r(   	<genexpr>(ImageGPTModel.forward.<locals>.<genexpr>B  s      wA ws   	)last_hidden_stater  r   
attentionscross_attentions)#rK   r   r  r   r  rW   %warn_if_padding_and_no_attention_maskrn   rQ   r   ri   r  trainingloggerwarning_oncer
   get_seq_lengthr#   arange	unsqueezetorH   ro   rp   r   rO   invert_attention_maskr   r   r   	enumerater  r  r:   r   ) r&   r   r  rw   r  r  r  r   r   r   r   r  r  r   input_shape
batch_sizeri   past_seen_tokensencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeposition_embedsr   token_type_embedsoutput_shapeall_self_attentionsall_cross_attentionsall_hidden_statesr  blockr   s                                    r(   r4   ImageGPTModel.forward  sN   ` 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++BYBY ]%>cdd"66yQ#..*K!r;r?;I"+J&',,.s3K&,,Q/JTUU%.%:!!@T@T&&4==##p "	%+00[_EN0*$++>OCRC^==?de <<BGJZZL'11!4L %Q !GHH+00R@N ,AtT1,<=N ,..TZZ.@N!N2ekk$**6M6Q6QQN ;;**/D/P=R=W=W=Y: 7$68O#P %-).4HQW)X&%)%?%?@V%W"%)"  HHY/M((<0%(:(:=;O;O(PP% $ 8),==M		-0"m&8&8&<%>>$5b4%64;;;Z;Zr`d"6BD!$&&)HAu#$58H$H!%'=#"3G $AJM  &9WQZM&I#;;222+?71:-+O(% *( 		-0%**L9   1]4D D ':KM`bvw   9+++*1
 	
r*   )r   rR   r  r  r  r   r   )NNNNNNNNNNNN)r6   r7   r8   r9   r   r!   r
  r  r   r#   r$   r	   rP   r   r:   r   r4   r=   r>   r?   s   @r(   r   r     sF   ~  "  *.(,.2.2,0-1596:!%)-,0#'m
<<$&m
 m
 t+	m

 t+m
 llT)m
 ||d*m
  %||d2m
 !&t 3m
 $;m
  $;m
 #Tkm
 D[m
 m
 
:	:m
 m
r*   r   z
    The ImageGPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc            !         ^  \ rS rSrSS0rS\4U 4S jjr\             SS\R                  S-  S\
S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\S-  S\S-  S\S\\-  4S jj5       rSrU =r$ )ImageGPTForCausalImageModelingiQ  zlm_head.weightztransformer.wte.weightrK   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  S-
  SS9U l        U R                  5         g )Nr   FrF   )
r    r!   r   r   r   Linearn_embdr   lm_headr  r&   rK   r'   s     r(   r!   'ImageGPTForCausalImageModeling.__init__Z  sL     (0yy0A0AA0EER 	r*   Nr   r  rw   r  r  r  r   r   labelsr   r   r  r  r   r,   c                 (   Ub  UOU R                   R                  nU R                  UUUUUUUUU
UUUS9nUS   nU R                  U5      nSnU	br  USSS2SS24   R	                  5       nU	SSS24   R	                  5       n[        5       nU" UR                  SUR                  S5      5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`

Examples:

```python
>>> from transformers import AutoImageProcessor, ImageGPTForCausalImageModeling
>>> import torch
>>> import matplotlib.pyplot as plt
>>> import numpy as np

>>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
>>> model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small")
>>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
>>> model.to(device)  # doctest: +IGNORE_RESULT

>>> # unconditional generation of 8 images
>>> batch_size = 4
>>> context = torch.full((batch_size, 1), model.config.vocab_size - 1)  # initialize with SOS token
>>> context = context.to(device)
>>> output = model.generate(
...     input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40
... )

>>> clusters = image_processor.clusters
>>> height = image_processor.size["height"]
>>> width = image_processor.size["width"]

>>> samples = output[:, 1:].detach().cpu().numpy()
>>> samples_img = [
...     np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [height, width, 3]).astype(np.uint8) for s in samples
... ]  # convert color cluster tokens back to pixels
>>> f, axes = plt.subplots(1, batch_size, dpi=300)

>>> for img, ax in zip(samples_img, axes):  # doctest: +IGNORE_RESULT
...     ax.axis("off")
...     ax.imshow(img)
```N)r  rw   r  r  r  r   r   r   r   r  r  r   .r.   r   )losslogitsr  r   r  r  )rK   r  r   r>  r   r   rQ   rn   r   r  r   r  r  )r&   r   r  rw   r  r  r  r   r   rA  r   r   r  r  r   transformer_outputsr   	lm_logitsrC  shift_logitsshift_labelsloss_fctoutputs                          r(   r4   &ImageGPTForCausalImageModeling.forwardb  sL   J &1%<k$++BYBY"..+))%'"7#9/!5# / 
 ,A.LL/	$S#2#q[1<<>L!#qr'?557L')HL--b,2C2CB2GH,J[J[\^J_`D\$7$;;F)-)9TGf$EvE0/??-;;*550AA
 	
r*   )r>  r   )NNNNNNNNNNNNN)r6   r7   r8   r9   _tied_weights_keysr   r!   r   r#   r$   r	   rP   r   r:   r   r4   r=   r>   r?   s   @r(   r9  r9  Q  sa    +,DE~   *.(,.2.2,0-1596:&*!%)-,0#'l
<<$&l
 l
 t+	l

 t+l
 llT)l
 ||d*l
  %||d2l
 !&t 3l
 t#l
 $;l
  $;l
 #Tkl
 D[l
 l
  
2	2!l
 l
r*   r9  z
    The ImageGPT Model transformer with an image classification head on top (linear layer).
    [`ImageGPTForImageClassification`] average-pools the hidden states in order to do the classification.
    c                   R  ^  \ rS rSrS\4U 4S jjr\           SS\R                  S-  S\	S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\
S-  S\
S-  S\
S-  S\
S-  S\S\\-  4S jj5       rSrU =r$ )ImageGPTForImageClassificationi  rK   c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  U R                  SS9U l        U R                  5         g )NFr;  )
r    r!   
num_labelsr   r   r   r<  r=  scorer  r?  s     r(   r!   'ImageGPTForImageClassification.__init__  sR      ++(0YYv}}dooEJ
 	r*   Nr   r  rw   r  r  r  rA  r   r   r  r  r   r,   c                    Ub  UOU R                   R                  nU R                  UUUUUUUU	U
US9
nUS   nUR                  SS9nU R	                  U5      nSnUb  U R                  UUU R                   5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> from transformers import AutoImageProcessor, ImageGPTForImageClassification
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
>>> model = ImageGPTForImageClassification.from_pretrained("openai/imagegpt-small")

>>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
```N)	r  rw   r  r  r  r   r   r  r  r   r   rj   )rC  rD  r  r   r  )
rK   r  r   r2   rQ  loss_functionr   r  r   r  )r&   r   r  rw   r  r  r  rA  r   r   r  r  r   rE  r   pooled_hidden_statesrD  rC  rJ  s                      r(   r4   &ImageGPTForImageClassification.forward  s    f &1%<k$++BYBY"..+))%'/!5# / 
 ,A.,11a1801%%ffdkkBDY!4QR!88F)-)9TGf$EvE//??-;;*55
 	
r*   )rP  rQ  r   )NNNNNNNNNNN)r6   r7   r8   r9   r   r!   r   r#   r$   r	   rP   r   r:   r   r4   r=   r>   r?   s   @r(   rN  rN    s%   ~   *.(,.2.2,0-1&*!%)-,0#'T
<<$&T
 T
 t+	T

 t+T
 llT)T
 ||d*T
 t#T
 $;T
  $;T
 #TkT
 D[T
 T
 
1	1T
 T
r*   rN  )r9  rN  r   r   )2__doc__r   typingr   r#   r   torch.nnr    r   r   activationsr   cache_utilsr	   r
   r   
generationr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   utils.genericr   configuration_imagegptr   
get_loggerr6   r!  Moduler   rA   r   r   r   r   r9  rN  __all__r   r*   r(   <module>rh     s=   %     % & ! C C ) 9 
 . # 
 , 2 
		H	%
		 
M)		 M)`")) "E*. E*P o  D E
+ E
 E
P x
%<o x
x
v _
%< _
_
Dr*   