
    Z j                     @   S SK r S SKrS SKJrJrJr  S SKJr  SSKJr	  SSK
Jr  SSKJrJrJr  SSKJr  SS	KJr  SS
KJrJrJr  SSKJr  SSKJr  SSKJr  SSKJrJ r J!r!  SSK"J#r#  SSK$J%r%J&r&  SSK'J(r(  \!RR                  " \*5      r+ " S S\RX                  5      r- " S S\RX                  5      r. " S S\RX                  5      r/ " S S\RX                  5      r0 " S S\RX                  5      r1 " S S\RX                  5      r2 " S S \5      r3 " S! S"\RX                  5      r4 " S# S$\RX                  5      r5 " S% S&\RX                  5      r6 " S' S(\RX                  5      r7 " S) S*\RX                  5      r8 " S+ S,\5      r9 " S- S.\95      r: " S/ S0\9\5      r;/ S1Qr<g)2    N)Tensordevicenn)CrossEntropyLoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentions)PreTrainedModel)Unpack)apply_chunking_to_forward)TransformersKwargscan_return_tuplelogging)merge_with_config_defaults)OutputRecordercapture_outputs   )BlipTextConfigc                      ^  \ rS rSrSrU 4S jr    SS\R                  S-  S\R                  S-  S\R                  S-  S\	S	\R                  4
S
 jjrSrU =r$ )BlipTextEmbeddings-   z;Construct the embeddings from word and position embeddings.c                 "  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR
                  UR                  S9U l
        [        R                  " UR                  5      U l        U R                  S[         R"                  " UR                  5      R%                  S5      SS9  Xl        g )N)padding_idxepsposition_idsr   F)
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandconfigselfr:   	__class__s     |/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/blip/modeling_blip_text.pyr)   BlipTextEmbeddings.__init__0   s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
     N	input_idsr$   inputs_embedspast_key_values_lengthreturnc                 ,   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2XFU-   24   nUc  U R                  U5      nUnU R                  U5      nXx-  nU R	                  U5      nU R                  U5      nU$ )Nr&   r   )sizer$   r.   r0   r1   r5   )	r<   rA   r$   rB   rC   input_shape
seq_length
embeddingsr0   s	            r>   forwardBlipTextEmbeddings.forward?   s      #..*K',,.s3K ^
,,Q0FVlIl0l-lmL  00;M"
"66|D)
^^J/
\\*-
r@   )r1   r:   r5   r0   r.   )NNNr   )__name__
__module____qualname____firstlineno____doc__r)   r7   
LongTensorFloatTensorintr   rJ   __static_attributes____classcell__r=   s   @r>   r   r   -   sx    E" .20426&'##d* &&- ((4/	
 !$ 
 r@   r   c                     ^  \ rS rSrSU 4S jjrS rS rS rS r    SS\	R                  S	\	R                  S-  S
\	R                  S-  S\	R                  S-  S\S-  S\\   S\\	R                  \	R                  4   4S jjrSrU =r$ )BlipTextSelfAttention^   Nc                   > [         TU ]  5         Xl        UR                  UR                  -  S:w  a5  [        US5      (       d$  [        SUR                  UR                  4-  5      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        X0l
        [        R                  " UR                  U R                  5      U l        U(       aa  [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        O`[        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R"                  " UR$                  5      U l        g )Nr   embedding_sizezLThe hidden size (%d) is not a multiple of the number of attention heads (%d))r(   r)   r:   r,   num_attention_headshasattr
ValueErrorrS   attention_head_sizeall_head_size	layer_idxr   Linearqueryencoder_hidden_sizekeyvaluer3   attention_probs_dropout_probr5   r<   r:   is_cross_attentionra   r=   s       r>   r)   BlipTextSelfAttention.__init___   s_    : ::a?PVXhHiHi^%%v'A'ABC 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PP"YYv1143E3EF
yy!;!;T=O=OPDH6#=#=t?Q?QRDJyy!3!3T5G5GHDH6#5#5t7I7IJDJzz&"E"EFr@   c                     Xl         g Nattn_gradients)r<   rn   s     r>   save_attn_gradients)BlipTextSelfAttention.save_attn_gradientsw   s    ,r@   c                     U R                   $ rl   rm   r<   s    r>   get_attn_gradients(BlipTextSelfAttention.get_attn_gradientsz   s    """r@   c                     Xl         g rl   attention_map)r<   rw   s     r>   save_attention_map(BlipTextSelfAttention.save_attention_map}   s    *r@   c                     U R                   $ rl   rv   rr   s    r>   get_attention_map'BlipTextSelfAttention.get_attention_map   s    !!!r@   hidden_statesattention_maskencoder_hidden_statesencoder_attention_maskpast_key_valueskwargsrD   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	US Ln
U
(       a  UOUnSnUb]  [        U[        5      (       aF  UR                  R                  U R                  5      nU
(       a  UR                  nOUR                  nOUnU
(       a  UOUnU
(       aQ  UbN  U(       aG  WR                  U R                     R                  nUR                  U R                     R                  nO/ UR                   S S QSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R!                  U5      R                  U5      R	                  SS5      nUbS  WR#                  XU R                  5      u  pU
(       a.  [        U[        5      (       a  SUR                  U R                  '   [$        R&                  " XR	                  SS5      5      nU[(        R*                  " U R                  5      -  nUb  UUR-                  UR.                  5      -   n[0        R2                  " SS9" U5      nU R5                  U5      n[$        R&                  " UU5      nUR7                  SSSS	5      R9                  5       nUR;                  5       S S U R<                  4-   nUR                  " U6 nUU4$ )
Nr&   r      FT)dimr   r   )shaper_   rc   view	transpose
isinstancer   
is_updatedgetra   cross_attention_cacheself_attention_cachelayerskeysvaluesre   rf   updater7   matmulmathsqrttor   r   Softmaxr5   permute
contiguousrF   r`   )r<   r}   r~   r   r   r   r   rG   hidden_shapequery_layerri   r   curr_past_key_valuescurrent_states	key_layervalue_layerkv_shapeattention_scoresattention_probsattention_probs_droppedcontext_layernew_context_layer_shapes                         r>   rJ   BlipTextSelfAttention.forward   s    $))#2.CCbC$*B*BCjj/44\BLLQPQR
 3$>3E/>
&/+>??,77;;DNNK
%+:+P+P(+:+O+O('6$2D.-/"=*,33DNNCHHI.55dnnELLKQ--cr2QBQ8P8PQH055h?II!QOI**^499(CMMaQRSK*)=)D)DY]a]k]k)l&	%*_FY*Z*ZAEO..t~~> !<<5H5HR5PQ+dii8P8P.QQ%/.2C2CDTD[D[2\\ **,-=> #',,"?%<kJ%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CDo--r@   )r`   r_   rw   rn   r:   r5   re   ra   r\   rc   rf   rl   NNNN)rL   rM   rN   rO   r)   ro   rs   rx   r{   r7   r   rR   r
   r   r   tuplerJ   rT   rU   rV   s   @r>   rX   rX   ^   s    G0-#+" 48:>;?(,D.||D. ))D0D.  %0047	D.
 !& 1 1D 8D. D. +,D. 
u||U\\)	*D. D.r@   rX   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )BlipTextSelfOutput   c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr"   )r(   r)   r   rb   r,   denser1   r2   r3   r4   r5   r;   s     r>   r)   BlipTextSelfOutput.__init__   s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r@   r}   input_tensorrD   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ rl   r   r5   r1   r<   r}   r   s      r>   rJ   BlipTextSelfOutput.forward   5    

=1]3}'CDr@   r1   r   r5   
rL   rM   rN   rO   r)   r7   r   rJ   rT   rU   rV   s   @r>   r   r      6    >U\\  RWR^R^  r@   r   c                      ^  \ rS rSrSU 4S jjr   SS\R                  S\R                  S-  S\R                  S-  S\S-  S\	\
   S	\\R                  \R                  4   4S
 jjrSrU =r$ )BlipTextAttention   Nc                 `   > [         TU ]  5         [        XUS9U l        [	        U5      U l        g )Nra   )r(   r)   rX   r<   r   outputrh   s       r>   r)   BlipTextAttention.__init__   s)    )&PYZ	(0r@   r}   r~   r   r   r   rD   c                 R    U R                  UUUUS9u  pgU R                  Xa5      nX4$ )Nr~   r   r   )r<   r   )	r<   r}   r~   r   r   r   r   r   attention_outputs	            r>   rJ   BlipTextAttention.forward   s@     *.)"7+	 *3 *
&  ;;}D00r@   )r   r<   )FN)NNN)rL   rM   rN   rO   r)   r7   r   rR   r
   r   r   r   rJ   rT   rU   rV   s   @r>   r   r      s    1 48:>(,1||1 ))D01  %0047	1
 1 +,1 
u||U\\)	*1 1r@   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BlipTextIntermediate   c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g rl   )r(   r)   r   rb   r,   intermediate_sizer   r   
hidden_actstrr	   intermediate_act_fnr;   s     r>   r)   BlipTextIntermediate.__init__   s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r@   r}   rD   c                 J    U R                  U5      nU R                  U5      nU$ rl   r   r   r<   r}   s     r>   rJ   BlipTextIntermediate.forward   s&    

=100?r@   r   r   rV   s   @r>   r   r      s(    9U\\ ell  r@   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )BlipTextOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )r(   r)   r   rb   r   r,   r   r1   r2   r3   r4   r5   r;   s     r>   r)   BlipTextOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r@   r}   r   rD   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ rl   r   r   s      r>   rJ   BlipTextOutput.forward
  r   r@   r   r   rV   s   @r>   r   r     r   r@   r   c                      ^  \ rS rSrU 4S jr    SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\	\
   S
\R                  4S jjrS rSrU =r$ )BlipTextLayeri  c                 B  > [         TU ]  5         Xl        UR                  U l        SU l        [        XS9U l        X l        U R                  R                  (       a#  [        XR                  R                  US9U l	        [        U5      U l        [        U5      U l        g )Nr   r   )ri   ra   )r(   r)   r:   chunk_size_feed_forwardseq_len_dimr   	attention	layer_num
is_decodercrossattentionr   intermediater   r   )r<   r:   r   r=   s      r>   r)   BlipTextLayer.__init__  s~    '-'E'E$*6G";;!!"3;;+A+AY#D 18$V,r@   Nr}   r   r~   r   r   r   rD   c                     U R                  UUUS9u  pxUb  U R                  UUUUS9u  px[        U R                  U R                  U R
                  U5      n	U	$ )N)r~   r   r   )r   r   r   feed_forward_chunkr   r   )
r<   r}   r   r~   r   r   r   r   _layer_outputs
             r>   rJ   BlipTextLayer.forward   s     #nn)+ - 
 !,"&"5"5 5&; /	 #6 # 1##T%A%A4CSCSUe
 r@   c                 J    U R                  U5      nU R                  X!5      nU$ rl   )r   r   )r<   r   intermediate_outputr   s       r>   r    BlipTextLayer.feed_forward_chunk;  s)    "//0@A{{#6Ir@   )r   r   r:   r   r   r   r   r   r   )rL   rM   rN   rO   r)   r7   r   rR   r
   r   r   rJ   r   rT   rU   rV   s   @r>   r   r     s    -" ;?37;?(,||  %0047 ))D0	
 !& 1 1D 8  +, 
6 r@   r   c                      ^  \ rS rSrU 4S jr     SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\	S-  S
\
\   S\4S jjrSrU =r$ )BlipTextEncoderiB  c           	         > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        SU l	        g s  snf )NF)
r(   r)   r:   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r<   r:   ir=   s      r>   r)   BlipTextEncoder.__init__C  sR    ]]eFLdLdFe#fFeM&$<Fe#fg
&+# $gs   A&Nr}   r~   r   r   r   	use_cacher   rD   c                    U R                   (       a/  U R                  (       a  U(       a  [        R                  S5        SnU(       ad  [	        U[
        5      (       a  [        U[        U R                  S95      nO1Uc.  [        [        U R                  S9[        U R                  S95      nU R                   H  nU" UU4UUUS.UD6nM     [        UUS9$ )NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r:   )r~   r   r   )last_hidden_stater   )
r   trainingloggerwarningr   r   r   r:   r   r   )	r<   r}   r~   r   r   r   r   r   layer_modules	            r>   rJ   BlipTextEncoder.forwardI  s     &&4==p "	 /<88"5o|[_[f[fGg"h ("5 4l$++6V# !JJL(%  .'= / M ' 9++
 	
r@   )r:   r   r   )NNNNN)rL   rM   rN   rO   r)   r7   r   rR   r
   boolr   r   r   rJ   rT   rU   rV   s   @r>   r   r   B  s    , 48:>;?(,!%(
||(
 ))D0(
  %0047	(

 !& 1 1D 8(
 (
 $;(
 +,(
 
3(
 (
r@   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BlipTextPooleriu  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g rl   )r(   r)   r   rb   r,   r   Tanh
activationr;   s     r>   r)   BlipTextPooler.__init__v  s9    YYv1163E3EF
'')r@   r}   rD   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r  )r<   r}   first_token_tensorpooled_outputs       r>   rJ   BlipTextPooler.forward{  s6     +1a40

#566r@   )r  r   r   rV   s   @r>   r   r   u  s(    $
U\\ ell  r@   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BlipTextPredictionHeadTransformi  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g r   )r(   r)   r   rb   r,   r   r   r   r   r	   transform_act_fnr1   r2   r;   s     r>   r)   (BlipTextPredictionHeadTransform.__init__  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr@   r}   rD   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rl   )r   r
  r1   r   s     r>   rJ   'BlipTextPredictionHeadTransform.forward  s4    

=1--m<}5r@   )r1   r   r
  r   rV   s   @r>   r  r    s)    UU\\ ell  r@   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BlipTextLMPredictionHeadi  c                   > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  SS9U l        [        R                  " [        R                  " UR                  5      5      U l        g )NT)bias)r(   r)   r  	transformr   rb   r,   r+   decoder	Parameterr7   zerosr  r;   s     r>   r)   !BlipTextLMPredictionHead.__init__  s[    8@ yy!3!3V5F5FTRLLV->->!?@	r@   c                 J    U R                  U5      nU R                  U5      nU$ rl   )r  r  r   s     r>   rJ    BlipTextLMPredictionHead.forward  s$    }5]3r@   )r  r  r  )rL   rM   rN   rO   r)   rJ   rT   rU   rV   s   @r>   r  r    s    A r@   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BlipTextOnlyMLMHeadi  c                 B   > [         TU ]  5         [        U5      U l        g rl   )r(   r)   r  predictionsr;   s     r>   r)   BlipTextOnlyMLMHead.__init__  s    3F;r@   sequence_outputrD   c                 (    U R                  U5      nU$ rl   r  )r<   r  prediction_scoress      r>   rJ   BlipTextOnlyMLMHead.forward  s     ,,_=  r@   r   r   rV   s   @r>   r  r    s(    <!u|| ! ! !r@   r  c                   h   ^  \ rS rSr% Sr\\S'   Sr/ r\	\
" \SSS9/\
" \SSS9/S	.rU 4S
 jrSrU =r$ )BlipTextPreTrainedModeli  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
r:   bertr   z.attention.)index
layer_namez.crossattention.)r}   
attentionscross_attentionsc                   > [         TU ]  U5        [        U[        5      (       a\  [        R
                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      5        g g )Nr&   r%   )r(   _init_weightsr   r   initcopy_r$   r7   r8   r   r9   )r<   moduler=   s     r>   r+  %BlipTextPreTrainedModel._init_weights  s^    f%f011JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 2r@    )rL   rM   rN   rO   rP   r   __annotations__base_model_prefix_no_split_modulesr   r   rX   _can_record_outputsr+  rT   rU   rV   s   @r>   r$  r$    sZ    
 &0mT
 0FXY
i ir@   r$  c                     ^  \ rS rSrSrSU 4S jjrS rS rS\S\	\
   S\S	\S
\4
S jr\\          SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S	\S-  S\\   S
\4S jj5       5       rSrU =r$ )BlipTextModeli  a  
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in [Attention is
all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. argument and `is_decoder` set to `True`; an
`encoder_hidden_states` is then expected as an input to the forward pass.
c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OS U l        U R                  5         g rl   )
r(   r)   r:   r   rI   r   encoderr   pooler	post_init)r<   r:   add_pooling_layerr=   s      r>   r)   BlipTextModel.__init__  sG     ,V4&v.0AnV,tr@   c                 .    U R                   R                  $ rl   rI   r.   rr   s    r>   get_input_embeddings"BlipTextModel.get_input_embeddings  s    ...r@   c                 $    XR                   l        g rl   r>  )r<   rf   s     r>   set_input_embeddings"BlipTextModel.set_input_embeddings  s    */'r@   r~   rG   r   r   rD   c                    UR                  5       S:X  a  USS2SSS2SS24   nGO1UR                  5       S:X  Ga   U(       a  Uu  pg[        R                  " XsS9nUSSSS24   R                  XgS5      USSS2S4   :*  n	U	R	                  UR
                  5      n	U	R                  S   UR                  S   :  aU  UR                  S   U	R                  S   -
  n
[        R                  " [        R                  " XgU
4X9R
                  S9U	/SS9n	U	SS2SSS2SS24   USS2SSSS24   -  nO*USS2SSSS24   nO[        S	U S
UR                   S35      eUR	                  U R
                  S9nSU-
  S-  nU$ )a  
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

Arguments:
    attention_mask (`torch.Tensor`):
        Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
    input_shape (`tuple[int]`):
        The shape of the input to the model.
    device (`torch.device`):
        The device of the input to the model.

Returns:
    `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
r   Nr   r   r   )r   dtyper&   )axisz!Wrong shape for input_ids (shape z) or attention_mask (shape ))rF  g      ?g     )
r   r7   r8   repeatr   rF  r   catonesr^   )r<   r~   rG   r   r   extended_attention_mask
batch_sizerH   seq_idscausal_maskprefix_seq_lens              r>   get_extended_attention_mask)BlipTextModel.get_extended_attention_mask  s   & 1$&4Qa]&C#!Q& )4&
,,zA%dD!m4;;JTUVZabfhikoboZpp)nn^-A-AB$$Q'.*>*>q*AA%3%9%9!%<{?P?PQR?S%SN"'))!JJ!+ HQW_p_p (	  #K +6aq!m*D~VWY]_cefVfGg*g'*8D$9I*J'3K=@[\j\p\p[qqrs  #:"<"<4::"<"N#&)@#@H"L&&r@   NrA   r$   rB   encoder_embedsr   r   r   r   r   c           	         U
(       d  Sn	Ub  Ub  [        S5      eUb2  U R                  X5        UR                  5       nUu  pUR                  nOYUb$  UR                  5       SS nUu  pUR                  nO2Ub$  UR                  5       SS nUu  pUR                  nO[        S5      eUc  SOUR	                  5       nUc)  [
        R                  " XU-   45      R                  U5      nU R                  X,X5      nUb  [        U[        5      (       a  US   R                  5       u  nnnOUR                  5       u  nnnUU4n[        U[        5      (       a"  U Vs/ s H  nU R                  U5      PM     nnO>Uc'  [
        R                  " UUS9nU R                  U5      nOU R                  U5      nOSnUc  U R                  UUUUS9nOUnU R                  " U4UUUUU	S	.UD6nUR                  nU R                  b  U R                  U5      OSn[!        UUUR"                  UR$                  UR&                  UR(                  S
9$ s  snf )a  
encoder_hidden_states  (`torch.FloatTensor`, *optional*):
    Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
    the model is configured as a decoder.
encoder_attention_mask (`torch.FloatTensor`, *optional*):
    Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
    the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.
past_key_values (`Cache`, *optional*):
    Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
    If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
    don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
    `decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*):
    If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
    `past_key_values`).
FNzDYou cannot specify both input_ids and inputs_embeds at the same timer&   zGYou have to specify either input_ids or inputs_embeds or encoder_embedsr   rE  )rA   r$   rB   rC   )r~   r   r   r   r   )r   pooler_outputr   r}   r(  r)  )r^   %warn_if_padding_and_no_attention_maskrF   r   get_seq_lengthr7   rK  r   rQ  r   listinvert_attention_maskrI   r8  r   r9  r   r   r}   r(  r)  )r<   rA   r~   r$   rB   rS  r   r   r   r   r   r   rG   rM  rH   r   rC   rL  encoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapemaskencoder_extended_attention_maskembedding_outputencoder_outputsr  r  s                               r>   rJ   BlipTextModel.forward$  s   D I ]%>cdd"66yQ#..*K%0"J%%F&',,.s3K%0"J"))F'(--/4K%0"J#**Ffgg&5&=?CaCaCc!"ZZBX5X(YZ]]^deN 150P0P1
 !,/66AVWXAYA^A^A`>"$;QAVA[A[A]>"$;Q$68O#P 0$77`v2w`vX\43M3Md3S`v/2w/'/).4HQW)X&262L2LMc2d/262L2LMc2d/.2+!##)+'=	  /    .EI\\F
2"7#B+F
 F
 *;;8<8OO4UY;-'+;;)77&11,==
 	
? 3xs   I)r:   rI   r8  r9  )T)
NNNNNNNNNF)rL   rM   rN   rO   rP   r)   r?  rB  r   r   rS   r   r   rQ  r   r   r7   r
   r   r   r   rJ   rT   rU   rV   s   @r>   r6  r6    sZ   /0<'$<'38:<'GM<'[_<'	<'|   *..2,0-1.2596:(,!%"'p
<<$&p
 t+p
 llT)	p

 ||d*p
 t+p
  %||d2p
 !&t 3p
 p
 $;p
 4Kp
 +,p
 
6p
   p
r@   r6  c                      ^  \ rS rSrSSS.rU 4S jrS rS rS rS	 r	\
             SS\R                  S
-  S\R                  S
-  S\R                  S
-  S\R                  S
-  S\R                  S
-  S\R                  S
-  S\R                  S
-  S\S
-  S\S
-  S\S
-  S\S
-  S\S
-  S\\R                  -  S\\   S\4S jj5       rSU 4S jjrSrU =r$ )BlipTextLMHeadModeli  zcls.predictions.biasz&bert.embeddings.word_embeddings.weight)zcls.predictions.decoder.biaszcls.predictions.decoder.weightc                    > [         TU ]  U5        [        USS9U l        [	        U5      U l        UR                  U l        U R                  5         g )NF)r;  )r(   r)   r6  r%  r  clslabel_smoothingr:  r;   s     r>   r)   BlipTextLMHeadModel.__init__  sB     !&EB	&v.%55r@   c                 6    U R                   R                  5       $ rl   )r%  r?  rr   s    r>   r?  (BlipTextLMHeadModel.get_input_embeddings  s    yy--//r@   c                 :    U R                   R                  U5        g rl   )r%  rB  r<   new_embeddingss     r>   rB  (BlipTextLMHeadModel.set_input_embeddings  s    		&&~6r@   c                 B    U R                   R                  R                  $ rl   )re  r  r  rr   s    r>   get_output_embeddings)BlipTextLMHeadModel.get_output_embeddings  s    xx##+++r@   c                     XR                   R                  l        UR                  U R                   R                  l        g rl   )re  r  r  r  rk  s     r>   set_output_embeddings)BlipTextLMHeadModel.set_output_embeddings  s*    '5$$2$7$7!r@   NrA   r~   r$   rB   r   r   labelsr   r   return_logitsr   	reductionlogits_to_keepr   rD   c                 <   Ub  Sn	U R                   " U4UUUUUUU	US.UD6nUR                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nU
(       a  USS2SS2SS24   R                  5       $ SnUb  USS2SS2SS24   R                  5       nUSS2SS24   R                  5       R                  UR                  5      n[        XR                  S9nU" UR                  SU R                  R                  5      UR                  S5      5      nUS:X  a0  UR                  UR                  S5      S5      R                  S5      n[!        UUUR"                  UR$                  UR&                  UR(                  S	9$ )
a  
encoder_hidden_states (`torch.FloatTensor`, *optional*): Sequence of
    hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is
    configured as a decoder.
encoder_attention_mask (`torch.FloatTensor`, *optional*):
    Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
    the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.
labels (`torch.LongTensor`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`
past_key_values (`Cache`, *optional*):
    Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
    If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
    don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
    `decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*):
    If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
    `past_key_values`).
NF)r~   r$   rB   r   r   r   r   r   r&   r   )rv  rf  noner   )losslogitsr   r}   r(  r)  )r%  r   r   rS   slicere  r   r   r   r   rf  r   r:   r+   rF   sumr   r   r}   r(  r)  )r<   rA   r~   r$   rB   r   r   rt  r   r   ru  r   rv  rw  r   outputsr}   slice_indicesr!  lm_lossshifted_prediction_scoresloss_fcts                         r>   rJ   BlipTextLMHeadModel.forward  s   P I@D		A
)%'"7#9+!A
 A
  118B>SV8W8W~ot4]k HH]1mQ3F%GH$QQY/::<<(9!SbS!)(D(O(O(Q%AqrE]--/223L3S3STF')MaMabH8==b$++BXBXY[a[f[fgi[jkGF"!,,'8'='=a'@"EII!L0$#33!//))$55
 	
r@   c                 :   > [         TU ]  " U4UUS.UD6nSUS'   U$ )N)r   r~   Tr   )r(   prepare_inputs_for_generation)r<   rA   r   r~   model_kwargsmodel_inputsr=   s         r>   r  1BlipTextLMHeadModel.prepare_inputs_for_generation	  s>     w<
+)
 	
 &*\"r@   )r%  re  rf  )NNNNNNNNNFTmeanr   )NN)rL   rM   rN   rO   _tied_weights_keysr)   r?  rB  ro  rr  r   r7   r   r
   r   r   rS   r   r   r   rJ   r  rT   rU   rV   s   @r>   rc  rc    sy   (>*R
07,8  *..2,0-1596:&*(,!%%*"& &-.P
<<$&P
 t+P
 llT)	P

 ||d*P
  %||d2P
 !&t 3P
 t#P
 P
 $;P
 d{P
 4KP
 :P
 ell*P
 +,P
  
+!P
 P
d r@   rc  )r6  rc  r$  )=r   r7   r   r   r   torch.nnr    r   r,  activationsr	   cache_utilsr
   r   r   
generationr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   processing_utilsr   pytorch_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   r   configuration_blipr   
get_loggerrL   r   Moduler   rX   r   r   r   r   r   r   r   r  r  r  r$  r6  rc  __all__r0  r@   r>   <module>r     sY      $ $ % & ! C C ) 9 
 . & 6 B B 7 E . 
		H	%- -bi.BII i.Z 1		 12299  RYY -. -b/
bii /
fRYY  bii $ryy "!")) !io i4I
+ I
Zz1? zz Nr@   