
    Z j                       S r SSKrSSKrSSKJr  SSKJr  SSKJr  SSK	r	SSK	J
r
  SSKJr  SS	KJr  SS
KJrJrJr  SSKJr  SSKJr  SSKJr  SSKJrJrJrJr  SSKJ r J!r!  SSK"J#r#  SSK$J%r%J&r&J'r'J(r(J)r)J*r*  SSK+J,r,  SSK-J.r.J/r/  SSK0J1r1J2r2J3r3  \)Rh                  " \55      r6\' " S S\!5      5       r7SWS\	Rp                  S\	Rr                  S\:S-  4S jjr; SXS\	Rx                  S\	Rr                  S\	Rz                  S\:4S jjr>\'\ " S  S!\5      5       5       r?\'" S"S#9\ " S$ S%\%5      5       5       r@\'" S&S#9\ " S' S(\%5      5       5       rA " S) S*\
R                  5      rC SYS+\
R                  S,\	Rp                  S-\	Rp                  S.\	Rp                  S/\	Rp                  S-  S0\DS1\D4S2 jjrE " S3 S4\
R                  5      rF " S5 S6\
R                  5      rG " S7 S8\5      rH " S9 S:\
R                  5      rI " S; S<\75      rJ " S= S>\
R                  5      rK " S? S@\
R                  5      rL " SA SB\
R                  5      rM " SC SD\5      rN " SE SF\75      rO " SG SH\75      rP " SI SJ\75      rQ\'" SKS#9 " SL SM\7\5      5       rR " SN SO\
R                  5      rS\'" SPS#9 " SQ SR\75      5       rT\'" SSS#9 " ST SU\7\5      5       rU/ SVQrVg)ZzPyTorch KOSMOS-2 model.    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling!CausalLMOutputWithCrossAttentions)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)merge_with_config_defaults)OutputRecordercapture_outputs   )Kosmos2ConfigKosmos2TextConfigKosmos2VisionConfigc                       \ rS rSr% \\S'   SrSrSS/rSr	Sr
Sr\R                  " 5       S\R                  4S	 j5       rS
rg)Kosmos2PreTrainedModel0   config)imagetextTKosmos2VisionEncoderLayerKosmos2TextBlockFmodulec                 t   [        U R                  S5      (       a  U R                  R                  nO;[        U R                  S5      (       a   U R                  R                  R                  n[        U R                  S5      (       a  U R                  R                  nO;[        U R                  S5      (       a   U R                  R
                  R                  n[        U[        5      (       Ga  [        R                  " UR                  SUR                  S-  W-  S9  [        R                  " UR                  R                  UR                  R                  U-  S9  [        R                  " UR                  R                  UR                  R                  U-  S9  [        R                   " UR"                  [$        R&                  " UR"                  R(                  S	   5      R+                  S
5      5        GO[        U[,        5      (       a  UR                  S-  SUR                  R.                  -  S-  -  W-  nUR                  S-  U-  n[        R                  " UR0                  R                  US9  [        R                  " UR2                  R                  US9  [        R                  " UR4                  R                  US9  [        R                  " UR6                  R                  US9  GO[        U[8        5      (       a  UR                  R:                  S-  SUR                  R.                  -  S-  -  W-  nSUR                  R:                  -  S-  U-  n[        R                  " UR<                  R                  US9  [        R                  " UR>                  R                  US9  GO0[        U[@        5      (       a  [        R                  " UR0                  R                  WS9  [        R                  " UR2                  R                  US9  [        R                  " UR4                  R                  US9  [        R                  " UR6                  R                  US9  GOu[        U[B        5      (       aT  [        R                  " UR<                  R                  WS9  [        R                  " UR>                  R                  US9  GO[        U[D        5      (       a+  [        R                  " URF                  R                  WS9  GO[        U[H        5      (       aK  [        R                  " URJ                  R                  WS9  [        R                  " URL                  5        GOl[        U[N        5      (       a  [        R                  " URP                  R                  SWS9  URP                  RR                  bA  [        RT                  " URP                  R                  URP                  RR                     5        O[        U[V        RX                  5      (       aA  [        RZ                  " UR                  5        [        RT                  " UR\                  5        Ot[        U[^        5      (       a_  URa                  URb                  URd                  -   URf                  URR                  5      n[        R                   " URh                  U5        [        U[V        Rj                  5      (       a/  UR\                  b!  [        RT                  " UR\                  5        ggg)zInitialize the weightsinitializer_factorvision_configinit_stdtext_config              )meanstd)r5   r    r6      N)6hasattrr'   r.   r/   r0   r1   
isinstanceKosmos2VisionEmbeddingsinitnormal_class_embedding	embed_dimpatch_embeddingweightinitializer_rangeposition_embeddingcopy_position_idstorcharangeshapeexpandKosmos2VisionAttentionnum_hidden_layersq_projk_projv_projout_projKosmos2VisionMLPhidden_sizefc1fc2KosmosTextAttentionKosmos2TextFFNKosmos2TextForCausalLMlm_headKosmos2ImageToTextProjectiondenselatent_queryKosmos2TextTransformerembed_tokenspadding_idxzeros_r   	LayerNormones_bias(Kosmos2TextSinusoidalPositionalEmbeddingget_embeddingnum_positionsoffsetembedding_dimweightsLinear)selfr,   factorr5   in_proj_stdout_proj_stdfc_stdemb_weightss           }/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/kosmos2/modeling_kosmos2.py_init_weights$Kosmos2PreTrainedModel._init_weights:   s    4;; 455[[33FT[[/22[[..AAF4;;
++++&&CT[[-00++))22Cf566LL//cv?O?OQU?UX^?^_LL//66FMM<[<[^d<deLL2299v}}?^?^ag?ghJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 677!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LLL--;?LL--;?LL--;?LL//\B 011!==44d:FMMDcDc@chl?lmpvvK&--333<vEFLL**7LL**< 344LL--37LL--37LL--37LL//S9//LL**4LL**4 677LL..C8 <==LL,,#6LL,,- 677LL,,33#3G""..:F//66v7J7J7V7VWX--JJv}}%KK$ HII ..$$v}}4f6J6JFL^L^K JJv~~{3fbii((V[[-DKK$ .E(     N)__name__
__module____qualname____firstlineno__r!   __annotations__input_modalitiessupports_gradient_checkpointing_no_split_modules_supports_attention_backend_supports_flash_attn_supports_sdparF   no_gradr   Modulerp   __static_attributes__rs   rr   ro   r%   r%   0   sT    (&*#46HI"& N
]]_8%BII 8% 8%rr   r%   maskdtypetgt_lenc                 2   U R                  5       u  p4Ub  UOUnU SS2SSSS24   R                  USX$5      R                  U5      nSU-
  nUR                  UR                  [        R
                  5      [        R                  " U5      R                  5      $ )zW
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
Nr          ?)sizerI   tomasked_fillrF   boolfinfomin)r   r   r   bszsrc_lenexpanded_maskinverted_masks          ro   _expand_maskr   v   s     99;LC ,g'GD$)*11#q'KNNuUM-'M$$]%5%5ejj%A5;;uCUCYCYZZrr   input_ids_shapedevicepast_key_values_lengthc           	         U u  pE[         R                  " XU4[         R                  " U5      R                  US9n[         R                  " UR                  S5      US9nUR                  XwS-   R                  UR                  S5      S5      :  S5        UR                  U5      nUS:  a*  [         R                  " [         R                  " XSXS9U/SS9nUSSSS2SS24   R                  USXUU-   5      $ )z:
Make causal mask used for bi-directional self-attention.
)r   r6   r    r   r   r   dimN)rF   fullr   r   rG   r   masked_fill_viewr   catzerosrI   )r   r   r   r   r   r   r   	mask_conds           ro   _make_causal_maskr      s     #LC::w(%++e*<*@*@PDTYYr]6:Iiq="6"6tyy}a"HH!L775>D!yy%++gUbdhioqrdAq !((aDZ:Z[[rr   c                   H    \ rS rSr% SrSr\\R                     S-  \	S'   Sr
g)'BaseModelOutputWithProjectionAttentions   aY  
projection_attentions (`tuple(torch.FloatTensor)`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.

    Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
    the weighted average in the self-attention heads.
Nprojection_attentionsrs   )rt   ru   rv   rw   __doc__r   tuplerF   FloatTensorrx   r   rs   rr   ro   r   r      s%     >B5!2!23d:Arr   r   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    )custom_introc                   *   \ rS rSr% SrSr\R                  S-  \S'   Sr	\
S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S'   Sr\\R                     S-  \S	'   Sr\\S
'   S\\   4S jrSrg)Kosmos2ModelOutput   a  
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
    `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
    input) to speed up sequential decoding.
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.

    Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
    the weighted average in the self-attention heads.
vision_model_output (`BaseModelOutputWithPooling`, *optional*):
    The output of the [`Kosmos2VisionModel`].
Nlast_hidden_statepast_key_valueshidden_states
attentionsimage_embedsr   vision_model_outputreturnc                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   n   >#    U  H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f)text_model_outputr   Ngetattrto_tuple.0kri   s     ro   	<genexpr>.Kosmos2ModelOutput.to_tuple.<locals>.<genexpr>   <      
   LLDGRYZ^`aRbRkRkRmm    25r   keysri   s   `ro   r   Kosmos2ModelOutput.to_tuple   #     
YY[
 
 	
rr   rs   )rt   ru   rv   rw   r   r   rF   r   rx   r   r
   r   r   r   r   r   r   r   r   r   r   rs   rr   ro   r   r      s    & 37u((4/6$(OUT\(59M5**+d2926Je''(4/6-1L%##d*1=A5!2!23d:A6:3:
%* 
rr   r   zC
    Model output class for `Kosmos2ForConditionalGeneration`.
    c                   R   \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S	'   Sr\\R                     S-  \S
'   Sr\\S'   S\\   4S jrSrg)*Kosmos2ForConditionalGenerationModelOutput   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
    `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
    input) to speed up sequential decoding.
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.

    Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
    the weighted average in the self-attention heads.
vision_model_output (`BaseModelOutputWithPooling`, *optional*):
    The output of the [`Kosmos2VisionModel`].
Nlosslogitsr   r   r   r   r   r   r   c                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   n   >#    U  H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7fr   r   r   s     ro   r   FKosmos2ForConditionalGenerationModelOutput.to_tuple.<locals>.<genexpr>   r   r   r   r   s   `ro   r   3Kosmos2ForConditionalGenerationModelOutput.to_tuple   r   rr   rs   )rt   ru   rv   rw   r   r   rF   r   rx   r   r   r
   r   r   r   r   r   r   r   r   r   r   rs   rr   ro   r   r      s    . &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/6-1L%##d*1=A5!2!23d:A6:3:
%* 
rr   r   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S\R                  4S
 jjrSrU =r$ )r;      r'   c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " U R                  5      5      U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R"                  " U R                   U R                  5      U l        U R'                  S[        R(                  " U R                   5      R+                  S5      SS9  g )NF)in_channelsout_channelskernel_sizestridera   r8   r    rE   r7   
persistent)super__init__r'   rQ   r?   
image_size
patch_sizer   	ParameterrF   randnr>   Conv2dnum_channelsr@   num_patchesrd   	EmbeddingrC   register_bufferrG   rI   ri   r'   	__class__s     ro   r    Kosmos2VisionEmbeddings.__init__   s   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]joprr   
embeddingsheightwidthr   c                    UR                   S   S-
  nU R                  R                  R                  S5      nUR                   S   S-
  n[        R
                  R                  5       (       d%  XF:X  a   X#:X  a  U R                  U R                  5      $ USS2SS24   nUSS2SS24   nUR                   S   n	X R                  -  n
X0R                  -  n[        US-  5      nUR                  SXU	5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU	5      n[        R                   " Xx4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r    r   Nr6   g      ?r   r8   bicubicF)r   modealign_cornersr   )rH   rC   rA   	unsqueezerF   jit
is_tracingrE   r   r   reshapepermuter   
functionalinterpolater   r   )ri   r   r   r   r   rC   rd   class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                ro   interpolate_pos_encoding0Kosmos2VisionEmbeddings.interpolate_pos_encoding  si    !&&q)A-!44;;EEaH*003a7 yy##%%+*F6?**4+<+<==,QU3,QU3r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCrr   pixel_valuesc                 ^   UR                   u  p4pVU(       dJ  XPR                  :w  d  X`R                  :w  a,  [        SU SU SU R                   SU R                   S3	5      eU R                  R                  R
                  nU R                  UR                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      n	[        R                  " X/SS	9n
U(       a  XR                  XU5      -   n
U
$ XR                  U R                  5      -   n
U
$ )
NzInput image size (*z) doesn't match model ().r   r8   r    r6   r   )rH   r   
ValueErrorr@   rA   r   r   flatten	transposer>   rI   rF   r   r   rC   rE   )ri   r   r   
batch_size_r   r   target_dtypepatch_embedsclass_embedsr   s              ro   forwardKosmos2VisionEmbeddings.forward<  s$   '3'9'9$
v'V-F%SbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYY;C
##&C&CJX]&^^J  $&=&=d>O>O&PPJrr   )	r>   r'   r?   r   r   rd   r@   r   rC   F)rt   ru   rv   rw   r#   r   rF   Tensorintr   r   r  r   __classcell__r   s   @ro   r;   r;      sj    q2 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf  rr   r;   r,   querykeyvalueattention_maskscalingdropoutc                 `   [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  USS9n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr6   r   ptrainingr    r8   )	rF   matmulr  r   r   softmaxr  r  
contiguous)
r,   r  r  r  r  r  r  kwargsattn_weightsattn_outputs
             ro   eager_attention_forwardr  P  s     <<}}R'<=GL!#4==((2(>L==((6??([L,,|3K''1-88:K$$rr   c                      ^  \ rS rSrSrU 4S jr SS\R                  S\R                  S-  S\\	   S\
\R                  \R                  S-  4   4S	 jjrS
rU =r$ )rJ   if  =Multi-headed attention from 'Attention Is All You Need' paperc                    > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g )N;embed_dim must be divisible by num_heads (got `embed_dim`:  and `num_heads`: r   r3   F)r   r   r'   rQ   r?   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr  	is_causalr   rh   rM   rN   rL   rO   r   s     ro   r   Kosmos2VisionAttention.__init__i  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Arr   Nr   r  r  r   c                    UR                   SS n/ UQSPU R                  P7nU R                  U5      nU R                  U5      nU R	                  U5      nUR                  U5      R                  SS5      nUR                  U5      R                  SS5      nUR                  U5      R                  SS5      n[        R                  " U R                  R                  [        5      n	U	" U UUUU4U R                  U R                  U R                  (       d  SOU R                  S.UD6u  pU
R                   " / UQSP76 R#                  5       n
U R%                  U
5      n
X4$ )#Input shape: Batch x Time x ChannelNr6   r    r8   r2   )r*  r  r  )rH   r'  rL   rM   rN   r   r  r   get_interfacer'   _attn_implementationr  r*  r(  r  r  r   r  rO   )ri   r   r  r  input_shapehidden_shapequeriesr   valuesattention_interfacer  r  s               ro   r  Kosmos2VisionAttention.forward}  sP    $))#2.88b8$--8++m,{{=)]+,,|,66q!<yy&00A6\*44Q:(?(M(MKK,,.E)
 %8
%
 nnJJ#}}C$,,
%
 
%
! "));;;;FFHmmK0((rr   )r'   r  r?   r'  r*  rM   r&  rO   rL   r(  rN   N)rt   ru   rv   rw   r   r   rF   r
  r   r   r   r  r   r  r  s   @ro   rJ   rJ   f  sk    GB. /3%)||%) t+%) +,	%)
 
u||U\\D00	1%) %)rr   rJ   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )rP   i  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g r6  )r   r   r'   r	   
hidden_actactivation_fnr   rh   rQ   intermediate_sizerR   rS   r   s     ro   r   Kosmos2VisionMLP.__init__  sb    #F$5$5699V//1I1IJ99V55v7I7IJrr   r   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r6  )rR   r:  rS   ri   r   s     ro   r  Kosmos2VisionMLP.forward  s4    /**=9/rr   )r:  r'   rR   rS   )
rt   ru   rv   rw   r   rF   r
  r  r   r  r  s   @ro   rP   rP     s)    KU\\ ell  rr   rP   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\\	   S\R                  4S jrS	rU =r$ )
r*   i  r'   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g Neps)r   r   rQ   r?   rJ   	self_attnr   r_   layer_norm_epslayer_norm1rP   mlplayer_norm2r   s     ro   r   "Kosmos2VisionEncoderLayer.__init__  sm    ++/7<<F<Q<QR#F+<<F<Q<QRrr   r   r  r  r   c                     UnU R                  U5      nU R                  " SUUS.UD6u  pXA-   nUnU R                  U5      nU R                  U5      nXA-   nU$ )N)r   r  rs   )rG  rE  rI  rH  )ri   r   r  r  residualr  s         ro   r  !Kosmos2VisionEncoderLayer.forward  sz     !((7>> 
')
 

 !0 ((7/ 0rr   )r?   rG  rI  rH  rE  )rt   ru   rv   rw   r#   r   rF   r
  r   r   r   r  r   r  r  s   @ro   r*   r*     sV    S2 S||  +,	
 
		 rr   r*   c                   v   ^  \ rS rSrSrS\4U 4S jjr SS\R                  S-  S\	\
   S\\-  4S	 jjrS
rU =r$ )Kosmos2VisionEncoderi  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`Kosmos2VisionEncoderLayer`].

Args:
    config: Kosmos2VisionConfig
r'   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf NF)
r   r   r'   r   
ModuleListrangerK   r*   layersgradient_checkpointing)ri   r'   r  r   s      ro   r   Kosmos2VisionEncoder.__init__  sU    mmPUV\VnVnPo$pPo1%>v%FPo$pq&+# %qs   A&Nr  r  r   c                 R    UnU R                    H  nU" UU40 UD6nM     [        US9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
)r   )rT  r   )ri   inputs_embedsr  r  r   encoder_layers         ro   r  Kosmos2VisionEncoder.forward  sC    ( &![[M) M ) 7+
 	
rr   )r'   rU  rT  r6  )rt   ru   rv   rw   r   r#   r   rF   r
  r   r   r   r   r  r   r  r  s   @ro   rO  rO    sV    ,2 , /3
 t+
 +,	

 
	 
 
rr   rO  c                      ^  \ rS rSr\\S.rS\4U 4S jjr\	\
" SS9\  SS\R                  S-  S	\S
\\   S\4S jj5       5       5       rSrU =r$ )Kosmos2VisionTransformeri  )r   r   r'   c                 (  > [         TU ]  U5        UR                  n[        U5      U l        [
        R                  " X!R                  S9U l        [        U5      U l
        [
        R                  " X!R                  S9U l        U R                  5         g rB  )r   r   rQ   r;   r   r   r_   rF  pre_layrnormrO  encoderpost_layernorm	post_init)ri   r'   r?   r   s      ro   r   !Kosmos2VisionTransformer.__init__  sk     &&	1&9LL8M8MN+F3 ll9:O:OPrr   F)tie_last_hidden_statesNr   r   r  r   c                     Uc  [        S5      eU R                  XS9nU R                  U5      nU R                  " SSU0UD6nUS   nUS S 2SS S 24   nU R	                  U5      n[        UUS9$ )Nz You have to specify pixel_values)r   rX  r   )r   pooler_outputrs   )r   r   r^  r_  r`  r   )ri   r   r   r  r   encoder_outputsr   pooled_outputs           ro   r   Kosmos2VisionTransformer.forward  s     ?@@h))-8,, 
'


 ,A.)!Q'2++M:)/'
 	
rr   )r   r_  r`  r^  rQ  )rt   ru   rv   rw   r*   rJ   _can_record_outputsr#   r   r   r   r   rF   r   r   r   r   r   r  r   r  r  s   @ro   r\  r\    s    2,
	2 	  E2 26).
''$.
 #'
 +,	

 
$
  3  
rr   r\  c                   P  ^  \ rS rSrSrSS\S\S\S-  4U 4S jjjrSS\S\S\S-  4S	 jjr\SS\S\S\S-  4S
 jj5       r	\
R                  " 5           SS\
R                  S-  S\
R                  S-  S\S\
R                  S-  4S jj5       r\S 5       r\SS j5       rSrU =r$ )rb   i8  zDThis module produces sinusoidal positional embeddings of any length.Nrd   rf   r]   c                    > [         TU ]  5         SU l        Xl        X l        X0l        U R                  XR                  -   X#5        g )Nr8   )r   r   re   rd   rf   r]   make_weights)ri   rd   rf   r]   r   s       ro   r   1Kosmos2TextSinusoidalPositionalEmbedding.__init__<  s>    **&-++5}Rrr   num_embeddingsc                     U R                  XU5      n[        U S5      (       a8  UR                  U R                  R                  U R                  R
                  S9nU R                  SUSS9  g )Nrg   r   Fr   )rc   r9   r   rg   r   r   r   )ri   rn  rf   r]   rn   s        ro   rl  5Kosmos2TextSinusoidalPositionalEmbedding.make_weightsE  s\    ((T4##%..t||/A/A$,,J]J].^KYFrr   c                    US-  n[         R                  " S5      US-
  -  n[        R                  " [        R                  " U[        R
                  S9R                  5       U* -  5      n[        R                  " U [        R
                  S9R                  5       R                  S5      UR                  S5      -  n[        R                  " [        R                  " U5      [        R                  " U5      /SS9R                  U S5      nUS-  S:X  a,  [        R                  " U[        R                  " U S5      /SS9nUb  SXBSS24'   UR                  [        R                  " 5       5      $ )	z
Build sinusoidal embeddings.

This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
"Attention Is All You Need".
r8   i'  r    r   r   r   r6   N)mathlogrF   exprG   int64floatr   r   sincosr   r   r   get_default_dtype)rn  rf   r]   half_dimembs        ro   rc   6Kosmos2TextSinusoidalPositionalEmbedding.get_embeddingM  s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQvve--/00rr   	input_idsrX  r   rE   c                    UbK  UR                  5       u  pVUc5  U R                  XR                  U5      R                  UR                  5      nO4UR                  5       S S u  pVUc  U R                  X#U R                  5      nU R                  S-   U-   U-   nXpR                  R                  S5      :  a3  U R                  XpR                  -   U R                  U R                  5        U R                  R                  SUR                  S5      5      R                  XVU R                  R                  S   5      R                  5       $ )Nr6   r    r   )r   "create_position_ids_from_input_idsr]   r   r   &create_position_ids_from_inputs_embedsrg   rl  re   rf   index_selectr   rH   detach)ri   r}  rX  r   rE   r   seq_lenmax_poss           ro   r  0Kosmos2TextSinusoidalPositionalEmbedding.forwardc  s*     $>>+LC##FF//1G "Y%%&  )--/4LC##JJ!4;K;K 
 ""Q&03II\\&&q))g3T5G5GIYIYZ||((L,=,=b,ABGGVZVbVbVhVhikVlmttvvrr   c                    U R                  5       SS nUS   n[        R                  " US-   XB-   S-   [        R                  U R                  S9nUR                  S5      R                  U5      R                  5       U-   $ )z
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
Nr6   r    r   r   )r   rF   rG   longr   r   rI   r  )rX  r   r]   r0  sequence_lengthrE   s         ro   r  OKosmos2TextSinusoidalPositionalEmbedding.create_position_ids_from_inputs_embeds  s}     $((*3B/%a.||!O_:Q>ejjYfYmYm
 %%a(//<GGILbbbrr   c                     U R                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    x: torch.Tensor x:

Returns: torch.Tensor
r    r   )ner  rF   cumsumtype_asr  )r}  r]   r   r   incremental_indicess        ro   r  KKosmos2TextSinusoidalPositionalEmbedding.create_position_ids_from_input_ids  sW     ||K(,,.$||Da8@@FI__cgg"'')K77rr   )rf   rd   re   r]   r6  )NNr   Nr   )rt   ru   rv   rw   r   r  r   rl  staticmethodrc   rF   r   r
  r  r  r  r   r  r  s   @ro   rb   rb   8  s   NSc S# SCRVJ S SG3 Gs GQTW[Q[ G 1c 1# 1CRVJ 1 1( ]]_ *.-1&',0w<<$&w ||d*w !$	w
 llT)w w8 c c" 8 8rr   rb   c                   (  ^  \ rS rSrSr     SS\S\S\S\S-  S\S-  S	\S-  S
\S-  4U 4S jjjr   SS\	R                  S\	R                  S-  S\S-  S\	R                  S-  S\\	R                  \	R                  S-  \S-  4   4
S jjrSrU =r$ )rT   i  r!  Nr?   r&  r  
is_decoderadd_inner_attn_layernormra   	layer_idxc	                 X  > [         T	U ]  5         Xl        X l        X0l        X@l        X#-  U l        SU l        U R                  U-  U R                  :w  a  [        SU R                   SU S35      eU R                  S-  U l	        XPl
        Xl        [        R                  " X"US9U l        [        R                  " X"US9U l        [        R                  " X"US9U l        [        R                  " X"US9U l        S U l        U(       a$  [        R&                  " X!R(                  S9U l        g g )NTr#  r$  r   r3   )ra   rC  )r   r   r'   r?   r&  r  r'  r*  r   r  r  r  r   rh   rM   rN   rL   rO   inner_attn_lnr_   rF  )
ri   r'   r?   r&  r  r  r  ra   r  r   s
            ro   r   KosmosTextAttention.__init__  s     	""!.MMI%$..8MdnnM]$YKr3  }}d*$"ii	4@ii	4@ii	4@		)TB "#!#i=R=R!SD $rr   r   encoder_hidden_statesr   r  r   c                 d   USLnUR                   SS n/ UQSPU R                  P7nU R                  U5      n	U	R                  U5      R	                  SS5      n	Sn
Ub]  [        U[        5      (       aF  UR                  R                  U R                  5      n
U(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       aQ  UbN  U
(       aG  WR                  U R                     R                  nUR                  U R                     R                  nO/ UR                   SS QSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R!                  U5      R                  U5      R	                  SS5      nUbS  WR#                  XU R                  5      u  pU(       a.  [        U[        5      (       a  SUR                  U R                  '   [$        R&                  " U R(                  R*                  [,        5      nU" U U	UUU4U R.                  (       d  SOU R0                  U R2                  S.UD6u  nnUR4                  " / UQSP76 R7                  5       nU R8                  b  U R9                  U5      nU R;                  U5      nUU4$ )	r-  Nr6   r    r8   FTr2   )r  r  )rH   r'  rL   r   r  r:   r   
is_updatedgetr  cross_attention_cacheself_attention_cacherT  r   r3  rM   rN   updater   r.  r'   r/  r  r  r  r  r   r  r  rO   )ri   r   r  r   r  r  is_cross_attentionr0  r1  query_statesr  curr_past_key_valuescurrent_states
key_statesvalue_stateskv_shaper4  r  r  s                      ro   r  KosmosTextAttention.forward  sz    3$>#))#2.88b8$--8{{=1#((6@@AF
&/+>??,77;;DNNK
%+:+P+P(+:+O+O('6$2D.-/"=*-44T^^DIIJ/66t~~FMMLF--cr2FBFFH^499(CMMaQRSJ;;~6;;HEOOPQSTUL*+?+F+Fzaeaoao+p(
%*_FY*Z*ZAEO..t~~>(?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
!\ "));;;;FFH),,[9KmmK0L((rr   )r'   r  r?   r'  r  r*  r  rM   r  r&  rO   rL   r  rN   )r2   FFTN)NNN)rt   ru   rv   rw   r   r  rv  r   r   rF   r
  r
   r   r  r   r  r  s   @ro   rT   rT     s   G "'05 !%$T $T 	$T
 $T 4K$T #'+$T Tk$T $;$T $TR 6:(,.2E)||E)  %||d2E) 	E)
 t+E) 
u||U\\D0%$,>	?E) E)rr   rT   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )rU   i  r'   c                   > [         TU ]  5         UR                  U l        [        UR                     U l        UR                  U l        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        g rB  )r   r   r  r	   activation_functionr:  activation_dropoutr   rh   r?   ffn_dimrR   rS   r_   rF  ffn_layernormr   s     ro   r   Kosmos2TextFFN.__init__  s    ~~#F$>$>?"(";";99V--v~~>99V^^V-=-=>\\&..f>S>STrr   c                 R   U R                  U R                  U5      5      n[        R                  R	                  XR
                  U R                  S9nU R                  U5      nU R                  U5      n[        R                  R	                  XR                  U R                  S9nU$ )Nr  )	r:  rR   r   r   r  r  r  r  rS   r>  s     ro   r  Kosmos2TextFFN.forward$  s    **488M+BC--m?V?Vaeanan-o**=9/--m||VZVcVc-drr   )r  r:  r  rR   rS   r  )	rt   ru   rv   rw   r"   r   r  r   r  r  s   @ro   rU   rU     s    
U0 
U rr   rU   c                   ,  ^  \ rS rSrSS\4U 4S jjjr     SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S	\S-  S
\	S-  S\
\R                  \
\R                  \R                  4   S-  4   4S jjrSrU =r$ )r+   i.  Nr'   c           
        > [         TU ]  5         UR                  U l        [        UU R                  UR                  UR
                  SSUS9U l        UR                  U l        [        R                  " U R                  UR                  S9U l        UR                  (       a`  [        UU R                  UR                  UR
                  SSUS9U l        [        R                  " U R                  UR                  S9U l        [        U5      U l        [        R                  " U R                  UR                  S9U l        g )NT)r?   r&  r  r  r  r  rC  F)r   r   r?   rT   attention_headsr)  rE  r  r   r_   rF  self_attn_layer_normadd_cross_attentionencoder_attnencoder_attn_layer_normrU   ffnfinal_layer_norm)ri   r'   r  r   s      ro   r   Kosmos2TextBlock.__init__/  s    )),nn,,,,%)
 ~~$&LLVEZEZ$[!%% 3.. 0000).#!D ,.<<FLaLa+bD(!&) "T^^AVAV Wrr   r   r  r  encoder_attention_maskr   output_attentionsr   c           	      &   UnU R                  U5      nU R                  " SUUUUS.UD6u  p[        R                  R	                  XR                  U R
                  S9nX-   nUb  [        U S5      (       d  [        SU  S35      eUnU R                  U5      nU R                  " SUUUUUS.UD6u  p[        R                  R	                  XR                  U R
                  S9nX-   nUnU R                  U5      nU R                  U5      nX-   nU$ )N)r   r   r  r  r  r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)r   r  r  r   r  rs   )r  rE  r   r   r  r  r9   r   r  r  r  r  )
ri   r   r  r  r  r   r  r  rL  r  s
             ro   r  Kosmos2TextBlock.forwardN  sT    !11-@>> 
'+)/	

 
 --m||VZVcVc-d 0 !,400 =dV DD D 
 %H 88GM#00  +&;5 /"3   M MM11-<<Z^ZgZg1hM$4M !--m< / 0rr   )r  r?   r  r  r  r  rE  r  r6  )NNNNF)rt   ru   rv   rw   r"   r   rF   r
  r
   r   r   r   r  r   r  r  s   @ro   r+   r+   .  s    X0 X XD /3596:(,).6||6 t+6  %||d2	6
 !&t 36 6  $;6 
u  %(9(95;L;L(L"MPT"TT	U6 6rr   r+   c            #       z  ^  \ rS rSr% \\S'   Sr\\" \	SSS9\" \	SSS9S.r
S\4U 4S	 jjrS
 r     S S\R                  S-  S\R                  S-  S\R                  S-  S\S\R                  S-  4
S jjr\\\             S!S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\S-  S\S-  S\\   S\\-  4S jj5       5       5       rSrU =r$ )"r[   i  r'   r)   r    rE  )index
layer_namer  )r   r   cross_attentionsc           
        > [         TU ]  U5        UR                  U l        UR                  U l        UR                  (       a   [
        R                  " UR                  5      OSU l        [        R                  " UR                  UR                  UR                  S9U l        [        UR                  UR                  UR                  S9U l        [        R"                  " [%        UR&                  5       Vs/ s H  n[)        XS9PM     sn5      U l        [        R*                  " UR                  UR,                  5      U l        SU l        U R3                  5         g s  snf )Nr   )r]   )rd   rf   r]   )r  F)r   r   r  	layerdropscale_embeddingrr  sqrtr?   embed_scaler   r   
vocab_sizepad_token_idr\   rb   max_position_embeddingsembed_positionsrR  rS  rT  r+   r_   rF  
layer_normrU  ra  )ri   r'   ir   s      ro   r   Kosmos2TextTransformer.__init__  s    ~~)):@:P:P499V%5%56VYLL):):F<L<LZ`ZmZmnG 88 **++ 
 mmTYZ`ZgZgTh$iThq%5f%JTh$ij,,v'7'79N9NO&+# %js   9E c                     S nUS   S:  a   [        UUR                  UR                  US9nUb9  [        XR                  US   S9R	                  UR                  5      nUc  UOXe-   nU$ )Nr6   r    )r   r   r   )r   r   r   r   r   )ri   r  r0  rX  r   combined_attention_maskexpanded_attn_masks          ro   _prepare_decoder_attention_mask6Kosmos2TextTransformer._prepare_decoder_attention_mask  s     #'r?Q&7##$++'=	'# %!-n>Q>Q[fgi[j!k!n!n$$" '>&E"K]Kw $ '&rr   NrX  r   img_input_maskr   rE   c                    Uc  U R                  U5      nUbW  UR                  UR                  5      R                  SUR	                  S5      5      X$R                  [
        R                  S9'   X R                  -  nU R                  UUUUS9nUR                  UR                  5      nX'-   n[        R                  R                  XR                  U R                  S9nU$ )Nr6   r   )r}  rX  r   rE   r  )r\   r   r   r   r   rF   r   r  r  r   r   r  r  )	ri   r}  rX  r   r  r   rE   	positionsr   s	            ro   forward_embedding(Kosmos2TextTransformer.forward_embedding  s       --i8M#AMQ^QeQeAfAkAkL%%b)BM++%**+=> &(8(88 (('#9%	 ) 
	 LL!5!56	%1--m||VZVcVc-drr   r}  r  image_embeds_position_maskr  r  r   	use_cacher  output_hidden_statesreturn_dictr  r   c           	         Ub  Ub  [        S5      eUb"  UR                  nUR                  SUS   5      nO"Ub  UR                  5       SS nO[        S5      eU
(       ab  Uc_  Uc  U R                  R
                  (       a.  [        [        U R                  S9[        U R                  S95      O[        U R                  S9nUb  UR                  5       OSnUS:  a  SnSnU R                  UUUUUU	S9nU R                  X/UU5      nUb  Ub  [        XhR                  US   S9n[        R                  R                  UU R                  U R                   S	9nU R"                   HL  nU R                   (       a(  [$        R&                  " / 5      nUU R(                  :  a  M<  U" UUU4UUUU
S
.UD6nMN     U R+                  U5      n[-        UUS9$ )  
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
    1]`:

    - 1 for places where to put the image features,
    - 0 for places that are not for image features (i.e. for text tokens).
NzDYou cannot specify both input_ids and inputs_embeds at the same timer6   z5You have to specify either input_ids or inputs_embeds)r'   r   )r}  rX  r   r  r   rE   r  r  )r  r   r  r  )r   r   )r   rH   r   r   r'   is_encoder_decoderr   r   get_seq_lengthr  r  r   r   r   r   r  r  rT  rF   randr  r  r   )ri   r}  r  r   r  r  r  r   rX  rE   r  r  r  r  r  r0  r   r   decoder_layerdropout_probabilitys                       ro   r  Kosmos2TextTransformer.forward  s   <  ]%>cdd"#//K!r;r?;I&',,.s3KTUU0 )48V8V $L$DlZ^ZeZeFfg!5  FUE`!?!?!Afg "A%L)-&..'%5#9% / 
 ==8N

 !,1G1S%12HJ]J]grsugv%w"--mt||VZVcVc-d![[M}}&+jjn#&7)%	 (> /"3#	 	M )& 68++
 	
rr   )r  r  r  r\   rU  r  r  rT  )NNNr   N)NNNNNNNNNNNNN)rt   ru   rv   rw   r"   rx   ry   r+   r   rT   ri  r   r  rF   r
  r  r  r   r   r   r
   r   r   r   r   r   r  r   r  r  s   @ro   r[   r[     s    )$%8kZ*+>aTbc0 *'4 .2,0.2&',0! ||d*! llT)	!
 t+! !$! llT)!F   *..2,0:>596:(,-1,0!%)-,0#'_
<<$&_
 t+_
 llT)	_

 %*LL4$7_
  %||d2_
 !&t 3_
 _
 ||d*_
 llT)_
 $;_
  $;_
 #Tk_
 D[_
 -._
  
:	:!_
    _
rr   r[   c                      ^  \ rS rSr% \\S'   SrSrS\4U 4S jjrS\	R                  4S jr\\  SS\R                  S-  S	\S
\\   S\\-  4S jj5       5       rSrU =r$ )Kosmos2VisionModeliD  r'   r   )r(   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r6  )r   r   r\  modelra  r   s     ro   r   Kosmos2VisionModel.__init__I  s&     -f5
rr   r   c                 B    U R                   R                  R                  $ r6  )r  r   r@   r   s    ro   get_input_embeddings'Kosmos2VisionModel.get_input_embeddingsO  s    zz$$444rr   Nr   r  c                 ,    U R                   " SUUS.UD6$ )N)r   r   rs   r  )ri   r   r   r  s       ro   r  Kosmos2VisionModel.forwardR  s*     zz 
%%=
 
 	
rr   r  rQ  )rt   ru   rv   rw   r#   rx   main_input_namery   r   r   r   r  r   r   rF   r   r   r   r   r   r   r  r   r  r  s   @ro   r  r  D  s    $O!2 5bii 5  26).

''$.

 #'

 +,	


 
8	8

  

rr   r  c                     ^  \ rS rSr% \\S'   SrS\4U 4S jjrS\R                  4S jr
\\          SS\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\S-  S\\   S\\-  4S jj5       5       rSrU =r$ )Kosmos2TextModelia  r'   r  c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r6  )r   r   r[   r  ra  r   s     ro   r   Kosmos2TextModel.__init__e  s&     +F3
rr   r   c                 .    U R                   R                  $ r6  r  r\   r   s    ro   r  %Kosmos2TextModel.get_input_embeddingsk      zz&&&rr   Nr}  r  r   r  r  r  r   rX  rE   r  r  c                 <    U R                   " SUUUUUUUUU	U
S.
UD6$ )r  
r}  r  r   r  r  r  r   rX  rE   r  rs   r  )ri   r}  r  r   r  r  r  r   rX  rE   r  r  s               ro   r  Kosmos2TextModel.forwardn  sB    4 zz 
)%'A"7#9+'%
 
 	
rr   r  )
NNNNNNNNNN)rt   ru   rv   rw   r"   rx   ry   r   r   r   r  r   r   rF   r
  r
   r   r   r   r   r   r  r   r  r  s   @ro   r  r  a  s:    0 'bii '  *..2,0:>596:(,-1,0!%$
<<$&$
 t+$
 llT)	$

 %*LL4$7$
  %||d2$
 !&t 3$
 $
 ||d*$
 llT)$
 $;$
 +,$
 
:	:$
  $
rr   r  z
    The text model from KOSMOS-2 with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                    4  ^  \ rS rSr% \\S'   SS0rS\4U 4S jjrS\R                  4S jr
S\R                  4S jr\\            SS
\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\S	-  S\R                  S	-  S\R                  S	-  S\R"                  S	-  S\S	-  S\\R                  -  S\\   S\\-  4S jj5       5       r       SU 4S jjrSrU =r$ )rV   i  r'   zlm_head.weightzmodel.embed_tokens.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g )NF)in_featuresout_featuresra   )
r   r   r[   r  r   rh   r?   r  rW   ra  r   s     ro   r   Kosmos2TextForCausalLM.__init__  sI     +F3
yyV-=-=FL]L]dij 	rr   r   c                 .    U R                   R                  $ r6  r  r   s    ro   r  +Kosmos2TextForCausalLM.get_input_embeddings  r  rr   c                     U R                   $ r6  )rW   r   s    ro   get_output_embeddings,Kosmos2TextForCausalLM.get_output_embeddings  s    ||rr   Nr}  r  r   r  r  r  r   rX  rE   labelsr  logits_to_keepr  c                    U
b  U(       a  [         R                  S5        SnU R                  " SUUUUUUUUU	US.
UD6nUR                  n[	        U[
        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnU
b)  U R                  " SUXR                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                  S9$ )a  
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
    1]`:

    - 1 for places where to put the image features,
    - 0 for places that are not for image features (i.e. for text tokens).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
NzJThe `use_cache` argument is changed to `False` since `labels` is provided.Fr  )r   r  r  )r   r   r   r   r   r  rs   )loggerwarningr  r   r:   r  slicerW   loss_functionr'   r  r   r   r   r   r  )ri   r}  r  r   r  r  r  r   rX  rE   r  r  r	  r  outputsr   slice_indicesr   r   s                      ro   r  Kosmos2TextForCausalLM.forward  s   @ klI=AZZ >
)%'A"7#9+'%>
 >
  118B>SV8W8W~ot4]kmA}a,?@A%%pVF{{OeOepiopD0#33!//))$55
 	
rr   c	                   > U(       d  U(       a  S nS nOUb  Ub  UR                  5       S S OUR                  5       u  pUR                  5       S   n[        R                  " U[        R                  " XU-
  4[        R                  UR
                  S94SS9n[        TU ]  " U4UUUUUUUS.U	D6nUR                  SS 5        U$ )Nr6   )r   r   r   r    r   )r   r  r   r  rX  r  is_first_iterationrE   )	r   rF   r   r   r   r   r   prepare_inputs_for_generationpop)ri   r}  r   r  r   r  rX  r  r  model_kwargsr  r  mask_lenmodel_inputsr   s                 ro   r  4Kosmos2TextForCausalLM.prepare_inputs_for_generation  s    $ "iL)-& (3?L?X-"4"4"6s";^l^q^q^sJ1668<H)..KKjH2D%EUZZ`i`p`pq *& w<

+)%'A'1

 

 	.rr   )rW   r  )NNNNNNNNNNNr   )NNNNNNF)rt   ru   rv   rw   r"   rx   _tied_weights_keysr   r   r   r  r  r   r   rF   r
  r
   
LongTensorr   r  r   r   r   r   r  r  r   r  r  s   @ro   rV   rV     s    *,GH0 'bii 'ryy   *..2,0:>596:(,-1,0*.!%-.A
<<$&A
 t+A
 llT)	A

 %*LL4$7A
  %||d2A
 !&t 3A
 A
 ||d*A
 llT)A
   4'A
 $;A
 ell*A
 +,A
 
2	2A
  A
L #' 0 0rr   rV   c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ )rX   i(  zmThe layer that transforms the image model's output to part of the text model's input (namely, image features)r'   c           	        > [         TU ]  5         [        R                  " UR                  R
                  UR                  R                  5      U l        [        R                  " [        R                  " UR                  UR                  R                  5      5      U l        [        UR                  UR                  R                  UR                  R                  UR                  R                   SSS9U l        g )NF)r  r  r  )r   r   r   rh   r/   rQ   r1   r?   rY   r   rF   r   latent_query_numrZ   rT   r  r)  x_attnr   s     ro   r   %Kosmos2ImageToTextProjection.__init__+  s    YYv33??ASASA]A]^
LLV5L5LfN`N`NjNj)kl)((..&&88%*
rr   c                     U R                  U5      nU R                  R                  S5      R                  UR	                  S5      SS5      n[
        R                  " X#/SS9nU R                  UUS S S S9u  p%X%4$ )Nr   r6   r    r   )r   r  r   r  r  )rY   rZ   r   rI   r   rF   r   r  )ri   featuresr   rZ   key_value_statesr  s         ro   r  $Kosmos2ImageToTextProjection.forward9  s    

8, ((2215<<]=O=OPQ=RTVXZ[ 99m%BJ&*kk&"2 " '2 '
# **rr   )rY   rZ   r  )
rt   ru   rv   rw   r   r!   r   r  r   r  r  s   @ro   rX   rX   (  s    w
} 
+ +rr   rX   z}
    KOSMOS-2 Model for generating text and image features. The model consists of a vision encoder and a language model.
    c                     ^  \ rS rSr% \\S'   SrS\4U 4S jjrS\R                  4S jr
S r\\ SS\R                  S\S	-  S
\\   S\\-  4S jj5       5       r\\          SS\R,                  S	-  S\R,                  S	-  S\R,                  S	-  S\R,                  S	-  S\S	-  S\R,                  S	-  S\R,                  S	-  S\R,                  S	-  S\S	-  S\S
\\   S\\-  4S jj5       5       rSrU =r$ )Kosmos2ModeliK  r'   r   c                    > [         TU ]  U5        [        UR                  5      U l        [        UR                  5      U l        [        U5      U l	        U R                  5         g r6  )r   r   r  r1   
text_modelr  r/   vision_modelrX   image_to_text_projectionra  r   s     ro   r   Kosmos2Model.__init__T  sN     *6+=+=>.v/C/CD(DV(L% 	rr   r   c                 B    U R                   R                  R                  $ r6  r(  r  r\   r   s    ro   r  !Kosmos2Model.get_input_embeddings^      $$111rr   c                 8    XR                   R                  l        g r6  r-  ri   r  s     ro   set_input_embeddings!Kosmos2Model.set_input_embeddingsa      -2*rr   r   Nr  c                 b   SU;   a-  [         R                  " S[        5        UR                  SS 5        U R                  " SUUSS.UD6nU R                  R
                  R                  US   5      n[        R                  R                  USS9nU R                  U5      u  pVXTl        Xdl        U$ )	Nreturn_attentionsz`return_attentions` is deprecated and will be removed in a future version. Please use `return_dict` and access `projection_attentions` from the returned `ModelOutput` instead.T)r   r   r  r   r6   r   rs   )warningswarnFutureWarningr  r)  r  r`  r   r   	normalizer*  re  r   )ri   r   r   r  vision_outputr   r   s          ro   get_image_featuresKosmos2Model.get_image_featuresd  s     &(MM_
 JJ*D1AEARAR B
%%=B
 	B
 ((..==mA>NO}}..|.D.2.K.KL.Y+&2#.C+rr   r}  r  r  r   r   rX  rE   r  c                 .   SnSnUc<  Uc  [        S5      eU R                  " U4U
SS.UD6nUR                  nUR                  nU R                  " SUUUUUUUU	SS.	UD6n[        UR                  UR                  UR                  UR                  UUUS9$ )a  
image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
    1]`:

    - 1 for places where to put the image features,
    - 0 for places that are not for image features (i.e. for text tokens).
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.

Examples:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Kosmos2Model

>>> model = Kosmos2Model.from_pretrained("microsoft/kosmos-2-patch14-224")
>>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

>>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> text = (
...     "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863>"
...     "</object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911>"
...     "</object>"
... )

>>> inputs = processor(text=text, images=image, return_tensors="pt", add_eos_token=True)

>>> last_hidden_state = model(
...     pixel_values=inputs["pixel_values"],
...     input_ids=inputs["input_ids"],
...     attention_mask=inputs["attention_mask"],
...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
... ).last_hidden_state
>>> list(last_hidden_state.shape)
[1, 91, 2048]
```N<You have to specify either `pixel_values` or `image_embeds`.T)r   r  )	r}  r  r   r  r   rX  rE   r  r  )r   r   r   r   r   r   r   rs   )
r   r<  re  r   r(  r   r   r   r   r   )ri   r   r}  r  r  r   r   rX  rE   r  r   r  r   r   image_featuresr  s                   ro   r  Kosmos2Model.forward  s    t # $# !_``!447O]aekN *77L$2$H$H!// 
)%'A+'%
 
 "%77#33!//))%"7 3
 	
rr   r*  r(  r)  r	  )
NNNNNNNNNF)rt   ru   rv   rw   r!   rx   r  r   r   r   r  r2  r   r   rF   r   r   r   r   r   r   r<  r
  r
   r   r  r   r  r  s   @ro   r&  r&  K  s    $O} 2bii 23  16'' #'+ +,	
 
8	8  <  -1)-:>.2(,,0-1,0!%).X
llT)X
 <<$&X
 %*LL4$7	X

 t+X
 X
 llT)X
 ||d*X
 llT)X
 $;X
 #'X
 +,X
 
#	#X
  X
rr   r&  z
    KOSMOS-2 Model for generating text and bounding boxes given an image. The model consists of a vision encoder and a
    language model.
    c                     ^  \ rS rSr% \\S'   SrSS0rS\4U 4S jjrS\	R                  4S jrS	 rS\	R                  4S
 jrS r\\           SS\R$                  S-  S\R$                  S-  S\R$                  S-  S\R$                  S-  S\S-  S\R$                  S-  S\R$                  S-  S\R$                  S-  S\R(                  S-  S\S-  S\\R$                  -  S\\   S\\-  4S jj5       5       r\R8                  " 5             SS\R$                  S-  S\R$                  S-  S\R$                  S-  S\R$                  S-  S\R$                  S-  S\R$                  S-  4S jj5       rSrU =r$ )Kosmos2ForConditionalGenerationi  r'   r   ztext_model.lm_head.weightz$text_model.model.embed_tokens.weightc                    > [         TU ]  U5        [        UR                  5      U l        [        UR                  5      U l        [        U5      U l	        U R                  5         g r6  )r   r   rV   r1   r(  r  r/   r)  rX   r*  ra  r   s     ro   r   (Kosmos2ForConditionalGeneration.__init__  sN     01C1CD.v/C/CD(DV(L% 	rr   r   c                 B    U R                   R                  R                  $ r6  r-  r   s    ro   r  4Kosmos2ForConditionalGeneration.get_input_embeddings  r/  rr   c                 8    XR                   R                  l        g r6  r-  r1  s     ro   r2  4Kosmos2ForConditionalGeneration.set_input_embeddings  r4  rr   c                 6    U R                   R                  5       $ r6  )r(  r  r   s    ro   r  5Kosmos2ForConditionalGeneration.get_output_embeddings  s    4466rr   c                 :    U R                   R                  U5        g r6  )r(  set_output_embeddings)ri   new_embeddingss     ro   rN  5Kosmos2ForConditionalGeneration.set_output_embeddings   s    --n=rr   Nr}  r  r  r   r   rX  rE   r  r  r	  r  c                    SnSnUcv  Uc  [        S5      eU R                  US9nU R                  R                  R                  US   5      n[        R
                  R                  USS9nU R                  U5      u  pnU R                  " S	UUUUUUUU	U
US.
UD6n[        UR                  UR                  UR                  UR                  UR                  UUUS9$ )
a.
  
image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
    1]`:

    - 1 for places where to put the image features,
    - 0 for places that are not for image features (i.e. for text tokens).
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

Examples:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Kosmos2ForConditionalGeneration

>>> model = Kosmos2ForConditionalGeneration.from_pretrained("microsoft/kosmos-2-patch14-224")
>>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

>>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> prompt = "<grounding> An image of"

>>> inputs = processor(text=prompt, images=image, return_tensors="pt")

>>> generated_ids = model.generate(
...     pixel_values=inputs["pixel_values"],
...     input_ids=inputs["input_ids"],
...     attention_mask=inputs["attention_mask"],
...     image_embeds=None,
...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
...     use_cache=True,
...     max_new_tokens=64,
... )
>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)
>>> processed_text
'<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.'

>>> caption, entities = processor.post_process_generation(generated_text)
>>> caption
'An image of a snowman warming himself by a fire.'

>>> entities
[('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
```Nr?  )r   r   r6   r   )
r}  r  r   r  r   rX  rE   r  r  r	  )r   r   r   r   r   r   r   r   rs   )r   r)  r  r`  r   r   r:  r*  r(  r   r   r   r   r   r   )ri   r   r}  r  r  r   r   rX  rE   r  r  r	  r  r   r   
lm_outputss                   ro   r  'Kosmos2ForConditionalGeneration.forward  s   N # $# !_``"&"3"3) #4 #  ,,22AABUVWBXYL==22<R2HL262O2OP\2]/L8< 9
)%'A+'%)9
 9

 :$$&66$22!,,%"7 3	
 		
rr   c           	         UR                  SS 5      nUb  Ub  [        SU S35      eUc  Ub  UnUcj  U R                  U5      n	U R                  R                  R	                  U	S   5      n[
        R                  R                  USS9nU R                  U5      u  pZU R                  R                  " SUUUUUS.UD6nU$ )	Ninputsz
`inputs`: zp were passed alongside `pixel_values` which is not allowed.Make sure to either pass `inputs` or pixel_values=...r   r6   r   )r}  r  r   r  rX  rs   )r  r   r)  r  r`  r   r   r:  r*  r(  generate)ri   r   r  r}  r  r   rX  r  rU  r   r   outputs               ro   rV  (Kosmos2ForConditionalGeneration.generater  s     Hd+#(:VH %H I  F$6!L"&"3"3L"A,,22AABUVWBXYL==22<R2HL262O2OP\2]/L)) 
)%'A'
 
 rr   rB  )NNNNNNNNNNr   )NNNNNN) rt   ru   rv   rw   r!   rx   r  r  r   r   r   r  r2  r  rN  r   r   rF   r
  r
   r  r   r  r   r   r   r   r  r   rV  r   r  r  s   @ro   rD  rD    s&    $O57]^	} 	2bii 237ryy 7>  -1)-:>.2(,,0-1,0*.!%-.k
llT)k
 <<$&k
 %*LL4$7	k

 t+k
 k
 llT)k
 ||d*k
 llT)k
   4'k
 $;k
 ell*k
 +,k
 
;	;k
  k
Z ]]_ -1:>)-.2,0-1%llT)% %*LL4$7% <<$&	%
 t+% llT)% ||d*% %rr   rD  )rD  r&  r%   r6  r  )r2   )Wr   rr  r7  collections.abcr   dataclassesr   typingr   rF   r    r   r<   activationsr	   cache_utilsr
   r   r   
generationr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.genericr   utils.output_capturingr   r   configuration_kosmos2r!   r"   r#   
get_loggerrt   r  r%   r
  r   r  r   Sizer   r   r   r   r   r   r;   rv  r  rJ   rP   r*   rO  r\  rb   rT   rU   r+   r[   r  r  rV   rX   r&  rD  __all__rs   rr   ro   <module>rl     sJ      $ !    & ! C C ) B 9  G & j j 7 E X X 
		H	% B%_ B% B%J[u|| [EKK [#* [ jk\ZZ\(-\=B\\\cf\" 

B.H 
B  
B 
  
  
  
F 
 %
 %
 %
RPbii Pv %II%<<% 
% <<	%
 LL4'% % %,<)RYY <)@ryy   : B-
299 -
b,
5 ,
`j8ryy j8Zo)")) o)dRYY .V1 Vrz
3 z
z
/ 
:3
- 3
l H3_ HHV +299  +F 
N
) N

N
b q&<o qqh Xrr   