
    Z j                        S r SSKrSSKrSSKJr  SSKJrJrJr  SSKJ	r
  SSKJr  SSKJrJrJrJrJrJrJr  SS	KJr  SS
KJrJr  SSKJr  SSKJrJrJrJ r J!r!J"r"  \RF                  " \$5      r% " S S\RL                  5      r' " S S\RL                  5      r( " S S\RL                  5      r) " S S\RL                  5      r* " S S\RL                  5      r+ " S S\RL                  5      r, " S S\RL                  5      r- " S S\RL                  5      r. " S S\RL                  5      r/\ " S  S!\5      5       r0\ " S" S#\05      5       r1\ " S$ S%\05      5       r2 " S& S'\RL                  5      r3\" S(S)9 " S* S+\05      5       r4\ " S, S-\05      5       r5\ " S. S/\05      5       r6 " S0 S1\RL                  5      r7\ " S2 S3\05      5       r8S6S4 jr9/ S5Qr:g)7zPyTorch I-BERT model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)gelu))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )IBertConfig)IntGELUIntLayerNorm
IntSoftmaxQuantActQuantEmbeddingQuantLinearc                   >   ^  \ rS rSrSrU 4S jr SS jrS rSrU =r	$ )IBertEmbeddings-   zN
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
c                 N  > [         TU ]  5         UR                  U l        SU l        SU l        SU l        SU l        SU l        [        UR                  UR                  UR                  U R                  U R                  S9U l        [        UR                  UR                  U R                  U R                  S9U l        U R                  S[         R"                  " UR$                  5      R'                  S5      S	S
9  UR                  U l        [        UR$                  UR                  U R(                  U R                  U R                  S9U l        [-        U R                  U R                  S9U l        [-        U R                  U R                  S9U l        [3        UR                  UR4                  U R                  U R                  UR6                  S9U l        [-        U R
                  U R                  S9U l        [<        R>                  " UR@                  5      U l!        g )N             )padding_idx
weight_bit
quant_mode)r%   r&   position_idsr   F)
persistentr&   eps
output_bitr&   force_dequant)"super__init__r&   embedding_bitembedding_act_bitact_bitln_input_bitln_output_bitr   
vocab_sizehidden_sizepad_token_idword_embeddingstype_vocab_sizetoken_type_embeddingsregister_buffertorcharangemax_position_embeddingsexpandr$   position_embeddingsr   embeddings_act1embeddings_act2r   layer_norm_epsr/   	LayerNormoutput_activationr   Dropouthidden_dropout_probdropoutselfconfig	__class__s     y/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/ibert/modeling_ibert.pyr1   IBertEmbeddings.__init__2   s    ++!#-++)) 
 &4""F$6$64CUCUbfbqbq&
"
 	ELL)G)GHOOPWXej 	 	

 "..#1**(())$
   ((>(>4??['(>(>4??[%%%)) ..
 "*$,,4??!Szz&"<"<=    c                    UcD  Ub0  [        XR                  U5      R                  UR                  5      nOU R	                  U5      nUb  UR                  5       nOUR                  5       S S nUc8  [        R                  " U[        R                  U R                  R                  S9nUc  U R                  U5      u  pGOS nU R                  U5      u  pU R                  UUUU	S9u  pU R                  U5      u  pU R                  U
UUUS9u  pU R                  X5      u  pU R                  U
5      n
U R!                  X5      u  pX4$ )Nr)   dtypedeviceidentityidentity_scaling_factor)"create_position_ids_from_input_idsr$   torU   &create_position_ids_from_inputs_embedssizer>   zeroslongr'   r:   r<   rC   rB   rF   rJ   rG   )rL   	input_idstoken_type_idsr'   inputs_embedspast_key_values_lengthinput_shapeinputs_embeds_scaling_factorr<   $token_type_embeddings_scaling_factor
embeddingsembeddings_scaling_factorrB   "position_embeddings_scaling_factors                 rO   forwardIBertEmbeddings.forwardc   sq    $A//1G "Y%%&   $JJ=Y #..*K',,.s3K!"[[EJJtO`O`OgOghN :>:N:Ny:Y7M7+/(FJF`F`aoFpC040D0D(*$H	 1E 1
-
 CGBZBZ[gBh?040D0D%($F	 1E 1
-
 15z0e-
\\*-
040F0Fz0m-
44rQ   c                    UR                  5       SS nUS   n[        R                  " U R                  S-   X0R                  -   S-   [        R                  UR
                  S9nUR                  S5      R                  U5      $ )z
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
Nr)   r   rS   r   )r\   r>   r?   r$   r^   rU   	unsqueezerA   )rL   ra   rc   sequence_lengthr'   s        rO   r[   6IBertEmbeddings.create_position_ids_from_inputs_embeds   s~     $((*3B/%a.||q /4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<rQ   )rF   r4   rJ   r3   r2   rC   rD   r5   r6   rG   r$   rB   r&   r<   r:   )NNNNr   )
__name__
__module____qualname____firstlineno____doc__r1   ri   r[   __static_attributes____classcell__rN   s   @rO   r   r   -   s%    />d rs,5\= =rQ   r   c                   6   ^  \ rS rSrU 4S jr  SS jrSrU =r$ )IBertSelfAttention   c           
      $  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        SU l        SU l        SU l	        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        UR                  U R                  SU R                  U R                  U R                  SS	9U l        [        UR                  U R                  SU R                  U R                  U R                  SS	9U l        [        UR                  U R                  SU R                  U R                  U R                  SS	9U l        [#        U R                  U R                  S
9U l        [#        U R                  U R                  S
9U l        [#        U R                  U R                  S
9U l        [#        U R                  U R                  S
9U l        [,        R.                  " UR0                  5      U l        [5        U R                  U R                  UR6                  S9U l        g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r    r#   Tbiasr%   bias_bitr&   per_channelr+   r&   r/   )r0   r1   r8   num_attention_headshasattr
ValueErrorr&   r%   r   r4   intattention_head_sizeall_head_sizer   querykeyvaluer   query_activationkey_activationvalue_activationrG   r   rH   attention_probs_dropout_probrJ   r   r/   softmaxrK   s     rO   r1   IBertSelfAttention.__init__   s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8  !++#)#=#= #&v'9'9F<V<V'V#W !558P8PP !]]

 ]]
 !]]

 !)$// R&t||P ($// R!)$,,4??!Szz&"E"EF!$,,4??Z`ZnZnorQ   c                 l   U R                  X5      u  pVU R                  X5      u  pxU R                  X5      u  pU R                  XV5      u  pU R	                  Xx5      u  pU R                  X5      u  nnUR                  S S n/ UQSPU R                  P7nUR                  U5      R                  SS5      nUR                  U5      R                  SS5      nUR                  U5      R                  SS5      n[        R                  " XR                  SS5      5      n[        R                  " U R                  5      nUU-  nU R                  (       a  X-  U-  nOS nUb  UU-   nU R                  UU5      u  nnU R!                  U5      n[        R                  " UU5      nUb  UU-  nOS nUR#                  SSSS5      R%                  5       nUR'                  5       S S U R(                  4-   nUR                  " U6 nU R+                  UU5      u  nnU(       a  UU4OU4nU(       a  UU4OU4nUU4$ )Nr)   r      r   r   )r   r   r   r   r   r   shaper   view	transposer>   matmulmathsqrtr&   r   rJ   permute
contiguousr\   r   rG   )rL   hidden_stateshidden_states_scaling_factorattention_maskoutput_attentionsmixed_query_layer mixed_query_layer_scaling_factormixed_key_layermixed_key_layer_scaling_factormixed_value_layer mixed_value_layer_scaling_factorquery_layerquery_layer_scaling_factor	key_layerkey_layer_scaling_factorvalue_layervalue_layer_scaling_factorrc   hidden_shapeattention_scoresscaleattention_scores_scaling_factorattention_probsattention_probs_scaling_factorcontext_layercontext_layer_scaling_factornew_context_layer_shapeoutputsoutput_scaling_factors                                rO   ri   IBertSelfAttention.forward   sp    ?Cjj>u;:>((=:o7>Bjj>u; 372G2G3
/ /3.A.A/.r+	262G2G3
//
 $))#2.CCbC$*B*BC!&&|4>>q!DNN<0::1a@	!&&|4>>q!D !<<5H5HR5PQ		$223+e3??.H.cfk.k+.2+%/.@ ;?,,=;
77 ,,7_kB)5+ILf+f(+/(%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CD 7;6L6L77
33 7H=/2mM] ! *+IJ.0 	 ---rQ   )r4   r   r   r   rJ   r   r   r   rG   r&   r   r   r   r   r   r%   NFro   rp   rq   rr   r1   ri   rt   ru   rv   s   @rO   rx   rx      s    5pv H. H.rQ   rx   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )IBertSelfOutputi&  c           
      t  > [         TU ]  5         UR                  U l        SU l        SU l        SU l        SU l        SU l        [        UR                  UR                  SU R                  U R
                  U R                  SS9U l
        [        U R                  U R                  S9U l        [        UR                  UR                  U R                  U R                  UR                  S9U l        [        U R                  U R                  S9U l        [$        R&                  " UR(                  5      U l        g Nr    r#   r"   Tr}   r+   r,   )r0   r1   r&   r4   r%   r   r5   r6   r   r8   denser   ln_input_actr   rE   r/   rF   rG   r   rH   rI   rJ   rK   s     rO   r1   IBertSelfOutput.__init__'  s     ++ ]]

 %T%6%64??S%%%)) ..
 "*$,,4??!Szz&"<"<=rQ   c                     U R                  X5      u  pU R                  U5      nU R                  UUUUS9u  pU R                  X5      u  pU R	                  X5      u  pX4$ NrV   r   rJ   r   rF   rG   rL   r   r   input_tensorinput_tensor_scaling_factors        rO   ri   IBertSelfOutput.forwardD  z    6:jj6m3]36:6G6G(!$?	 7H 7
3 7;nn]6q36:6L6L7
3 ::rQ   rF   r4   r   r   rJ   r   r5   r6   rG   r&   r%   r   rv   s   @rO   r   r   &      >:; ;rQ   r   c                   6   ^  \ rS rSrU 4S jr  SS jrSrU =r$ )IBertAttentioniU  c                    > [         TU ]  5         UR                  U l        [        U5      U l        [        U5      U l        g N)r0   r1   r&   rx   rL   r   outputrK   s     rO   r1   IBertAttention.__init__V  s3     ++&v.	%f-rQ   c                     U R                  UUUU5      u  pVU R                  US   US   X5      u  pxU4USS  -   n	U4USS  -   n
X4$ )Nr   r   )rL   r   )rL   r   r   r   r   self_outputsself_outputs_scaling_factorattention_outputattention_output_scaling_factorr   outputs_scaling_factors              rO   ri   IBertAttention.forward\  s~     59II(	5
1 =AKKO8;]=
9 $%QR(88"A!CFabcbdFe!e..rQ   )r   r&   rL   r   r   rv   s   @rO   r   r   U  s    . / /rQ   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )IBertIntermediateiq  c           
        > [         TU ]  5         UR                  U l        SU l        SU l        SU l        [        UR                  UR                  SU R                  U R
                  U R                  SS9U l	        UR                  S:w  a  [        S5      e[        U R                  UR                  S9U l        [        U R                  U R                  S9U l        g )	Nr    r#   Tr}   r	   z3I-BERT only supports 'gelu' for `config.hidden_act`r   r+   )r0   r1   r&   r4   r%   r   r   r8   intermediate_sizer   
hidden_actr   r   r/   intermediate_act_fnr   rG   rK   s     rO   r1   IBertIntermediate.__init__r  s     ++ $$]]

 &RSS#*dooU[UiUi#j !)$,,4??!SrQ   c                 z    U R                  X5      u  pU R                  X5      u  pU R                  X5      u  pX4$ r   )r   r   rG   )rL   r   r   s      rO   ri   IBertIntermediate.forward  sL    6:jj6m36:6N6N7
3
 7;6L6L7
3 ::rQ   )r4   r   r   r   rG   r&   r%   r   rv   s   @rO   r   r   q  s    T(
; 
;rQ   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )IBertOutputi  c           
      t  > [         TU ]  5         UR                  U l        SU l        SU l        SU l        SU l        SU l        [        UR                  UR                  SU R                  U R
                  U R                  SS9U l        [        U R                  U R                  S9U l        [        UR                  UR                  U R                  U R                  UR                   S9U l        [        U R                  U R                  S9U l        [&        R(                  " UR*                  5      U l        g r   )r0   r1   r&   r4   r%   r   r5   r6   r   r   r8   r   r   r   r   rE   r/   rF   rG   r   rH   rI   rJ   rK   s     rO   r1   IBertOutput.__init__  s     ++ $$]]

 %T%6%64??S%%%)) ..
 "*$,,4??!Szz&"<"<=rQ   c                     U R                  X5      u  pU R                  U5      nU R                  UUUUS9u  pU R                  X5      u  pU R	                  X5      u  pX4$ r   r   r   s        rO   ri   IBertOutput.forward  r   rQ   r   r   rv   s   @rO   r   r     r   rQ   r   c                   <   ^  \ rS rSrU 4S jr  SS jrS rSrU =r$ )
IBertLayeri  c                 L  > [         TU ]  5         UR                  U l        SU l        SU l        [        U5      U l        [        U5      U l        [        U5      U l
        [        U R                  U R                  S9U l        [        U R                  U R                  S9U l        g )Nr    r   r+   )r0   r1   r&   r4   seq_len_dimr   	attentionr   intermediater   r   r   pre_intermediate_actpre_output_actrK   s     rO   r1   IBertLayer.__init__  s}     ++'/-f5!&)$,T\\doo$V!&t||PrQ   c                 ~    U R                  UUUUS9u  pVUS   nUS   nUSS  n	U R                  Xx5      u  pU
4U	-   n	U	$ )N)r   r   r   )r   feed_forward_chunk)rL   r   r   r   r   self_attention_outputs%self_attention_outputs_scaling_factorr   r   r   layer_outputlayer_output_scaling_factors               rO   ri   IBertLayer.forward  sz     IM(/	 IW I
E 2!4*OPQ*R'(,484K4K5
1  /G+rQ   c                     U R                  X5      u  pU R                  X5      u  p4U R                  X45      u  p4U R                  X4X5      u  pVXV4$ r   )r   r   r   r   )rL   r   r   intermediate_output"intermediate_output_scaling_factorr   r   s          rO   r   IBertLayer.feed_forward_chunk  su    <@<U<U=
9 CGBSBSC
? CGBUBUC
? 59KKEU5
1 88rQ   )r4   r   r   r   r   r   r&   r   r   )	ro   rp   rq   rr   r1   ri   r   rt   ru   rv   s   @rO   r   r     s     Q" 29 9rQ   r   c                   :   ^  \ rS rSrU 4S jr    SS jrSrU =r$ )IBertEncoderi  c                    > [         TU ]  5         Xl        UR                  U l        [        R
                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l	        g s  snf r   )
r0   r1   rM   r&   r   
ModuleListrangenum_hidden_layersr   layer)rL   rM   _rN   s      rO   r1   IBertEncoder.__init__  sT     ++]]fF^F^@_#`@_1Jv$6@_#`a
#`s   A0c                 2   U(       a  SOS nU(       a  SOS nS n	[        U R                  5       H2  u  pU(       a  Xq4-   nU" UUUU5      nUS   nU(       d  M*  XS   4-   nM4     U(       a  Xq4-   nU(       d  [        S UUUU	4 5       5      $ [        UUUU	S9$ )N r   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r  ).0vs     rO   	<genexpr>'IBertEncoder.forward.<locals>.<genexpr>!  s"      	A  s   	)last_hidden_stater   
attentionscross_attentions)	enumerater  tupler
   )rL   r   r   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsall_cross_attentionsilayer_modulelayer_outputss                rO   ri   IBertEncoder.forward  s     #7BD$5b4#(4OA#$58H$H!(,!	M *!,M  &91=M<O&O#  5   14D D 	 "%'(		 	 	 9++*1	
 	
rQ   )rM   r  r&   )NFFTr   rv   s   @rO   r   r     s!    b "/
 /
rQ   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )IBertPooleri3  c                    > [         TU ]  5         UR                  U l        [        R                  " UR
                  UR
                  5      U l        [        R                  " 5       U l        g r   )	r0   r1   r&   r   Linearr8   r   Tanh
activationrK   s     rO   r1   IBertPooler.__init__4  sF     ++YYv1163E3EF
'')rQ   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ Nr   )r   r  )rL   r   first_token_tensorpooled_outputs       rO   ri   IBertPooler.forward:  s6     +1a40

#566rQ   )r  r   r&   r   rv   s   @rO   r  r  3  s    $ rQ   r  c                   \    \ rS rSr% \\S'   Sr\R                  " 5       S 5       r	SS jr
Srg)	IBertPreTrainedModeliC  rM   ibertc                 r   [        U[        [        R                  45      (       a  [        R
                  " UR                  SU R                  R                  S9  UR                  b   [        R                  " UR                  5        [        USS5      b@  [        R                  " UR                  5        [        R                  " UR                  5        [        USS5      b!  [        R                  " UR                  5        gg[        U[        [        R                   45      (       a  [        R
                  " UR                  SU R                  R                  S9  UR"                  bI  [        UR                  SS5      (       d-  [        R                  " UR                  UR"                     5        [        USS5      bA  [        R                  " UR$                  5        [        R                  " UR                  5        gg[        U[&        [        R(                  45      (       ap  [        R                  " UR                  5        [        R*                  " UR                  5        [        US	S5      b!  [        R                  " UR,                  5        gg[        U[.        5      (       a!  [        R                  " UR                  5        g[        U[0        5      (       a\  [        R2                  " UR4                  [6        R8                  " UR4                  R:                  S
   5      R=                  S5      5        g[        U[>        5      (       ac  [        R@                  " URB                  S5        [        R@                  " URD                  S5        [        R                  " URF                  5        gg)zInitialize the weightsg        )meanstdNweight_integerbias_integer_is_hf_initializedFweight_scaling_factorshiftr)   r(   gh㈵gh㈵>)$
isinstancer   r   r  initnormal_weightrM   initializer_ranger~   zeros_getattrr+  fc_scaling_factorr,  r   	Embeddingr$   r.  r   rF   ones_r/  IBertLMHeadr   copy_r'   r>   r?   r   rA   r   	constant_x_minx_maxact_scaling_factor)rL   modules     rO   _init_weights"IBertPreTrainedModel._init_weightsH  s_    f{BII677LLSdkk6S6ST{{&FKK(v/6BF112F445v~t4@F//0 A >??LLSdkk6S6ST!!-gfmmMach6i6iFMM&*<*<=>v6=IF889F112 J r|| <==KK$JJv}}%vw-9FLL) :,,KK$00JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh))NN6<</NN6<<.KK112 *rQ   Nc                     [        S5      e)Nz6`resize_token_embeddings` is not supported for I-BERT.)NotImplementedError)rL   new_num_tokenss     rO   resize_token_embeddings,IBertPreTrainedModel.resize_token_embeddingsj  s    !"Z[[rQ   r  r   )ro   rp   rq   rr   r   __annotations__base_model_prefixr>   no_gradrA  rF  rt   r  rQ   rO   r&  r&  C  s-    
]]_3 3B\rQ   r&  c                   <  ^  \ rS rSrSrSU 4S jjrS rS r\        SS\	R                  S-  S\	R                  S-  S	\	R                  S-  S
\	R                  S-  S\	R                  S-  S\S-  S\S-  S\S-  S\\\	R                     -  4S jj5       rSrU =r$ )
IBertModelin  a  

The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in [Attention is
all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

c                    > [         TU ]  U5        Xl        UR                  U l        [	        U5      U l        [        U5      U l        U(       a  [        U5      OSU l	        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)r0   r1   rM   r&   r   rf   r   encoderr  pooler	post_init)rL   rM   add_pooling_layerrN   s      rO   r1   IBertModel.__init__y  sX    
 	  ++)&1#F+->k&)D 	rQ   c                 .    U R                   R                  $ r   rf   r:   rL   s    rO   get_input_embeddingsIBertModel.get_input_embeddings  s    ...rQ   c                 $    XR                   l        g r   rT  )rL   r   s     rO   set_input_embeddingsIBertModel.set_input_embeddings  s    */'rQ   Nr_   r   r`   r'   ra   r   r  r  returnc	           	      H   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  Ub  [	        S5      eUb"  U R                  X5        UR                  5       n
O"Ub  UR                  5       S S n
O[	        S5      eU
u  pUb  UR                  OUR                  nUc  [        R                  " X4US9nUc$  [        R                  " U
[        R                  US9nU R                  X*5      nU R                  UUUUS9u  nnU R                  UUUUUUS9nUS   nU R                  b  U R                  U5      OS nU(       d
  UU4US	S  -   $ [!        UUUR"                  UR$                  UR&                  S
9$ )NzDYou cannot specify both input_ids and inputs_embeds at the same timer)   z5You have to specify either input_ids or inputs_embeds)rU   rS   )r_   r'   r`   ra   )r   r   r  r  r   r   )r  pooler_outputr   r  r  )rM   r   r  r  r   %warn_if_padding_and_no_attention_maskr\   rU   r>   onesr]   r^   get_extended_attention_maskrf   rN  rO  r   r   r  r  )rL   r_   r   r`   r'   ra   r   r  r  kwargsrc   
batch_size
seq_lengthrU   extended_attention_maskembedding_outputembedding_output_scaling_factorencoder_outputssequence_outputr#  s                       rO   ri   IBertModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY ]%>cdd"66yQ#..*K&',,.s3KTUU!,
%.%:!!@T@T!"ZZ*)A6RN!"[[EJJvVN 150P0PQ_0m<@OO%)'	 =L =
99 ,,+2/!5# ' 
 *!,8<8OO4UY#]3oab6III;-')77&11,==
 	
rQ   )rM   rf   rN  rO  r&   )T)NNNNNNNN)ro   rp   rq   rr   rs   r1   rV  rY  r   r>   
LongTensorFloatTensorboolr   r  ri   rt   ru   rv   s   @rO   rL  rL  n  s    "/0  .237260426)-,0#'B
##d*B
 ))D0B
 ((4/	B

 &&-B
 ((4/B
  $;B
 #TkB
 D[B
 
6e>O>O8P	PB
 B
rQ   rL  c                   ^  ^  \ rS rSrSSS.rU 4S jrS rS r\         SS	\	R                  S-  S
\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S\S-  S\S-  S\\\	R                     -  4S jj5       rSrU =r$ )IBertForMaskedLMi  z(ibert.embeddings.word_embeddings.weight$zlm_head.bias)zlm_head.decoder.weightzlm_head.decoder.biasc                    > [         TU ]  U5        [        USS9U l        [	        U5      U l        U R                  5         g NF)rQ  )r0   r1   rL  r'  r:  lm_headrP  rK   s     rO   r1   IBertForMaskedLM.__init__  s6     %@
"6* 	rQ   c                 .    U R                   R                  $ r   )rq  decoderrU  s    rO   get_output_embeddings&IBertForMaskedLM.get_output_embeddings  s    ||###rQ   c                 Z    XR                   l        UR                  U R                   l        g r   )rq  rt  r~   )rL   new_embeddingss     rO   set_output_embeddings&IBertForMaskedLM.set_output_embeddings  s    -*//rQ   Nr_   r   r`   r'   ra   labelsr   r  r  r[  c
                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nSnUbF  [	        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )az  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
Nr   r`   r'   ra   r   r  r  r   r)   r   losslogitsr   r  )
rM   r  r'  rq  r   r   r7   r   r   r  )rL   r_   r   r`   r'   ra   r{  r   r  r  ra  r   rh  prediction_scoresmasked_lm_lossloss_fctr   s                    rO   ri   IBertForMaskedLM.forward  s    ( &1%<k$++BYBY**))%'/!5#  	
 "!* LL9')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
rQ   )r'  rq  	NNNNNNNNN)ro   rp   rq   rr   _tied_weights_keysr1   ru  ry  r   r>   rj  rk  rl  r   r  ri   rt   ru   rv   s   @rO   rn  rn    s    #M .
$0  .237260426*.)-,0#'0
##d*0
 ))D00
 ((4/	0

 &&-0
 ((4/0
   4'0
  $;0
 #Tk0
 D[0
 
% 1 12	20
 0
rQ   rn  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )r:  i!  z)I-BERT Head for masked language modeling.c                   > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  5      U l
        [        R                  " [        R                  " UR                  5      5      U l        g )N)r-   )r0   r1   r   r  r8   r   rF   rE   
layer_normr7   rt  	Parameterr>   r]   r~   rK   s     rO   r1   IBertLMHead.__init__$  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FGLLV->->!?@	rQ   c                     U R                  U5      n[        U5      nU R                  U5      nU R                  U5      nU$ r   )r   r	   r  rt  )rL   featuresra  xs       rO   ri   IBertLMHead.forward,  s;    JJx GOOA LLOrQ   )r~   rt  r   r  	ro   rp   rq   rr   rs   r1   ri   rt   ru   rv   s   @rO   r:  r:  !  s    3A rQ   r:  z
    I-BERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    )custom_introc                   H  ^  \ rS rSrU 4S jr\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	S-  S\	S-  S\	S-  S\
\\R                     -  4S jj5       rSrU =r$ )IBertForSequenceClassificationi7  c                    > [         TU ]  U5        UR                  U l        [        USS9U l        [        U5      U l        U R                  5         g rp  )r0   r1   
num_labelsrL  r'  IBertClassificationHead
classifierrP  rK   s     rO   r1   'IBertForSequenceClassification.__init__>  sC      ++%@
1&9 	rQ   Nr_   r   r`   r'   ra   r{  r   r  r  r[  c
                 .   U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nSnUGb  U R                   R                  c  U R
                  S:X  a  SU R                   l        OoU R
                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R                  S:X  aI  [        5       nU R
                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R                  S:X  a=  [        5       nU" UR                  SU R
                  5      UR                  S5      5      nO,U R                   R                  S:X  a  [        5       nU" X5      nU	(       d  U4US	S -   nUb  U4U-   $ U$ [        UUUR                   UR"                  S
9$ )ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr}  r   r   
regressionsingle_label_classificationmulti_label_classificationr)   r   r~  )rM   r  r'  r  problem_typer  rT   r>   r^   r   r   squeezer   r   r   r   r   r  rL   r_   r   r`   r'   ra   r{  r   r  r  ra  r   rh  r  r  r  r   s                    rO   ri   &IBertForSequenceClassification.forwardH  s   ( &1%<k$++BYBY**))%'/!5#  	
 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
rQ   )r  r'  r  r  )ro   rp   rq   rr   r1   r   r>   rj  rk  rl  r   r  ri   rt   ru   rv   s   @rO   r  r  7  s     .237260426*.)-,0#'A
##d*A
 ))D0A
 ((4/	A

 &&-A
 ((4/A
   4'A
  $;A
 #TkA
 D[A
 
"E%*;*;$<	<A
 A
rQ   r  c                   H  ^  \ rS rSrU 4S jr\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	S-  S\	S-  S\	S-  S\
\\R                     -  4S jj5       rSrU =r$ )IBertForMultipleChoicei  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  S5      U l
        U R                  5         g )Nr   )r0   r1   rL  r'  r   rH   rI   rJ   r  r8   r  rP  rK   s     rO   r1   IBertForMultipleChoice.__init__  sV     '
zz&"<"<=))F$6$6: 	rQ   Nr_   r`   r   r{  r'   ra   r   r  r  r[  c
                 Z   U	b  U	OU R                   R                  n	Ub  UR                  S   OUR                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUU	S9nUS   nU R                  U5      nU R                  U5      nUR                  SU5      nSnUb  [        5       nU" UU5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
Nr   r)   r   )r'   r`   r   ra   r   r  r  r   r~  )rM   r  r   r   r\   r'  rJ   r  r   r   r   r  )rL   r_   r`   r   r{  r'   ra   r   r  r  ra  num_choicesflat_input_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr   r#  r  reshaped_logitsr  r  r   s                           rO   ri   IBertForMultipleChoice.forward  s   X &1%<k$++BYBY,5,Aiooa(}GZGZ[\G]CLCXINN2,>?^bLXLdL--b,2C2CB2GHjnR`Rln11"n6I6I"6MNrvR`Rln11"n6I6I"6MNrv ( r=#5#5b#9=;M;Mb;QR 	 ***..,/!5#  	
  
]3/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
rQ   )r  rJ   r'  r  )ro   rp   rq   rr   r1   r   r>   rj  rk  rl  r   r  ri   rt   ru   rv   s   @rO   r  r    s     .22637*.0426)-,0#'V
##d*V
 ((4/V
 ))D0	V

   4'V
 &&-V
 ((4/V
  $;V
 #TkV
 D[V
 
#U5+<+<%=	=V
 V
rQ   r  c                   H  ^  \ rS rSrU 4S jr\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	S-  S\	S-  S\	S-  S\
\\R                     -  4S jj5       rSrU =r$ )IBertForTokenClassificationi  c                 .  > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g rp  )r0   r1   r  rL  r'  r   rH   rI   rJ   r  r8   r  rP  rK   s     rO   r1   $IBertForTokenClassification.__init__  sk      ++%@
zz&"<"<=))F$6$68I8IJ 	rQ   Nr_   r   r`   r'   ra   r{  r   r  r  r[  c
                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nU R	                  U5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Nr}  r   r)   r   r~  )rM   r  r'  rJ   r  r   r   r  r   r   r  r  s                    rO   ri   #IBertForTokenClassification.forward   s    $ &1%<k$++BYBY**))%'/!5#  	
 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
rQ   )r  rJ   r'  r  r  )ro   rp   rq   rr   r1   r   r>   rj  rk  rl  r   r  ri   rt   ru   rv   s   @rO   r  r    s    	  .237260426*.)-,0#'1
##d*1
 ))D01
 ((4/	1

 &&-1
 ((4/1
   4'1
  $;1
 #Tk1
 D[1
 
u'8'8!9	91
 1
rQ   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )r  i5  z-Head for sentence-level classification tasks.c                 ,  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l
        g r   )r0   r1   r   r  r8   r   rH   rI   rJ   r  out_projrK   s     rO   r1    IBertClassificationHead.__init__8  s`    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHrQ   c                     US S 2SS S 24   nU R                  U5      nU R                  U5      n[        R                  " U5      nU R                  U5      nU R	                  U5      nU$ r!  )rJ   r   r>   tanhr  )rL   r  ra  r   s       rO   ri   IBertClassificationHead.forward>  s^     Aq)]3

=1

=1]3m4rQ   )r   rJ   r  r  rv   s   @rO   r  r  5  s    7I rQ   r  c                   h  ^  \ rS rSrU 4S jr\          SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\	S-  S\	S-  S\	S-  S\
\\R                     -  4S jj5       rSrU =r$ )IBertForQuestionAnsweringiH  c                    > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g rp  )
r0   r1   r  rL  r'  r   r  r8   
qa_outputsrP  rK   s     rO   r1   "IBertForQuestionAnswering.__init__J  sU      ++%@
))F$6$68I8IJ 	rQ   Nr_   r   r`   r'   ra   start_positionsend_positionsr   r  r  r[  c                 "   U
b  U
OU R                   R                  n
U R                  UUUUUUU	U
S9nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      R                  5       nUR                  S5      R                  5       nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" UU5      nUU-   S-  nU
(       d  UU4USS  -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )	Nr}  r   r   r)   dim)ignore_indexr   )r  start_logits
end_logitsr   r  )rM   r  r'  r  splitr  r   lenr\   clampr   r   r   r  )rL   r_   r   r`   r'   ra   r  r  r   r  r  ra  r   rh  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                          rO   ri   !IBertForQuestionAnswering.forwardT  s    &1%<k$++BYBY**))%'/!5#  	
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
rQ   )r'  r  r  )
NNNNNNNNNN)ro   rp   rq   rr   r1   r   r>   rj  rk  rl  r   r  ri   rt   ru   rv   s   @rO   r  r  H  s     .2372604263715)-,0#'=
##d*=
 ))D0=
 ((4/	=

 &&-=
 ((4/=
 ))D0=
 ''$.=
  $;=
 #Tk=
 D[=
 
&e.?.?(@	@=
 =
rQ   r  c                     U R                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )a1  
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's *utils.make_positions*.

Args:
input_ids (`torch.LongTensor`):
       Indices of input sequence tokens in the vocabulary.

Returns: torch.Tensor
r   r  )ner   r>   cumsumtype_asr^   )r_   r$   rb   maskincremental_indicess        rO   rY   rY     sW     <<$((*D <<!4<<TBE[[_cc##%33rQ   )rn  r  r  r  r  rL  r&  )r   );rs   r   r>   r   torch.nnr   r   r    r   r1  activationsr	   modeling_outputsr
   r   r   r   r   r   r   modeling_utilsr   utilsr   r   configuration_ibertr   quant_modulesr   r   r   r   r   r   
get_loggerro   loggerModuler   rx   r   r   r   r   r   r   r  r&  rL  rn  r:  r  r  r  r  r  rY   __all__r  rQ   rO   <module>r     s  "     A A &    . , , c c 
		H	%s=bii s=l@. @.F,;bii ,;^/RYY /8;		 ;D,;")) ,;^59 59p6
299 6
r"))   '\? '\ '\T d
% d
 d
N G
+ G
 G
T")) , M
%9 M
M
` b
1 b
 b
J >
"6 >
 >
Bbii & I
 4 I
 I
X4"rQ   