
    Z jƍ                        S r SSKrSSKrSSKJr  SSKJrJrJr  SSKJ	r
  SSKJr  SSKJrJrJrJrJrJrJr  SS	KJr  SS
KJrJr  SSKJr  \R8                  " \5      r " S S\R>                  5      r  " S S\R>                  5      r! " S S\RD                  5      r# " S S\R>                  5      r$ " S S\R>                  5      r% " S S\R>                  5      r& " S S\R>                  5      r' " S S\R>                  5      r( " S S\R>                  5      r) " S S \R>                  5      r* " S! S"\R>                  5      r+ " S# S$\R>                  5      r,\ " S% S&\5      5       r-\ " S' S(\-5      5       r.\ " S) S*\-5      5       r/\" S+S,9 " S- S.\-5      5       r0\ " S/ S0\-5      5       r1\ " S1 S2\-5      5       r2\ " S3 S4\-5      5       r3/ S5Qr4g)6zPyTorch SqueezeBert model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )SqueezeBertConfigc                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )SqueezeBertEmbeddings,   zGConstruct the embeddings from word, position and token_type embeddings.c                 v  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                   5      U l        U R%                  S[&        R(                  " UR                  5      R+                  S5      SS9  g )N)padding_idxepsposition_idsr   F)
persistent)super__init__r   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormhidden_sizelayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandselfconfig	__class__s     څ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/squeezebert/modeling_squeezebert.pyr"   SqueezeBertEmbeddings.__init__/   s    !||F,=,=v?T?Tbhbubuv#%<<0N0NPVPePe#f %'\\&2H2H&J_J_%`"f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
    c                    Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2S U24   nUc8  [        R                  " U[        R                  U R                  R
                  S9nUc  U R                  U5      nU R                  U5      nU R                  U5      nXG-   U-   n	U R                  U	5      n	U R                  U	5      n	U	$ )Nr   r   dtypedevice)sizer   r3   zeroslongr@   r'   r)   r+   r,   r1   )
r7   	input_idstoken_type_idsr   inputs_embedsinput_shape
seq_lengthr)   r+   
embeddingss
             r:   forwardSqueezeBertEmbeddings.forward=   s     #..*K',,.s3K ^
,,Q^<L!"[[EJJtO`O`OgOghN  00;M"66|D $ : :> J"8;PP
^^J/
\\*-
r<   )r,   r1   r)   r+   r'   )NNNN	__name__
__module____qualname____firstlineno____doc__r"   rJ   __static_attributes____classcell__r9   s   @r:   r   r   ,   s    Q
 r<   r   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )MatMulWrapperV   z
Wrapper for torch.matmul(). This makes flop-counting easier to implement. Note that if you directly call
torch.matmul() in your code, the flop counter will typically ignore the flops of the matmul.
c                 "   > [         TU ]  5         g N)r!   r"   )r7   r9   s    r:   r"   MatMulWrapper.__init__\   s    r<   c                 .    [         R                  " X5      $ )a  

:param inputs: two torch tensors :return: matmul of these tensors

Here are the typical dimensions found in BERT (the B is optional) mat1.shape: [B, <optional extra dims>, M, K]
mat2.shape: [B, <optional extra dims>, K, N] output shape: [B, <optional extra dims>, M, N]
)r3   matmul)r7   mat1mat2s      r:   rJ   MatMulWrapper.forward_   s     ||D''r<    rL   rT   s   @r:   rV   rV   V   s    
( (r<   rV   c                   (    \ rS rSrSrSS jrS rSrg)SqueezeBertLayerNormj   z
This is a nn.LayerNorm subclass that accepts NCW data layout and performs normalization in the C dimension.

N = batch C = channels W = sequence length
c                 @    [         R                  R                  XUS9  g )N)normalized_shaper   )r   r,   r"   )r7   r-   r   s      r:   r"   SqueezeBertLayerNorm.__init__q   s    
dcJr<   c                     UR                  SSS5      n[        R                  R                  X5      nUR                  SSS5      $ )Nr      r   )permuter   r,   rJ   )r7   xs     r:   rJ   SqueezeBertLayerNorm.forwardt   s;    IIaALL  )yyAq!!r<   r`   N)g-q=)rM   rN   rO   rP   rQ   r"   rJ   rR   r`   r<   r:   rb   rb   j   s    K"r<   rb   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )ConvDropoutLayerNormz   z0
ConvDropoutLayerNorm: Conv, Dropout, LayerNorm
c                    > [         TU ]  5         [        R                  " XSUS9U l        [        U5      U l        [        R                  " U5      U l        g Nr   in_channelsout_channelskernel_sizegroups)	r!   r"   r   Conv1dconv1drb   	layernormr/   r1   )r7   cincoutru   dropout_probr9   s        r:   r"   ConvDropoutLayerNorm.__init__   s@    iiCPQZ`a-d3zz,/r<   c                 t    U R                  U5      nU R                  U5      nX2-   nU R                  U5      nU$ rY   rw   r1   rx   )r7   hidden_statesinput_tensorrj   s       r:   rJ   ConvDropoutLayerNorm.forward   s8    KK&LLONN1r<   r~   rL   rT   s   @r:   rm   rm   z   s    0 r<   rm   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )ConvActivation   z"
ConvActivation: Conv, Activation
c                 t   > [         TU ]  5         [        R                  " XSUS9U l        [
        U   U l        g rp   )r!   r"   r   rv   rw   r	   act)r7   ry   rz   ru   r   r9   s        r:   r"   ConvActivation.__init__   s/    iiCPQZ`a#;r<   c                 F    U R                  U5      nU R                  U5      $ rY   )rw   r   )r7   rj   outputs      r:   rJ   ConvActivation.forward   s    Qxxr<   )r   rw   rL   rT   s   @r:   r   r      s    
   r<   r   c                   D   ^  \ rS rSrSU 4S jjrS rS rS rS rSr	U =r
$ )	SqueezeBertSelfAttention   c                 n  > [         TU ]  5         X!R                  -  S:w  a  [        SU SUR                   S35      eUR                  U l        [	        X!R                  -  5      U l        U R                  U R
                  -  U l        [        R                  " X"SUS9U l	        [        R                  " X"SUS9U l
        [        R                  " X"SUS9U l        [        R                  " UR                  5      U l        [        R                  " SS9U l        [#        5       U l        [#        5       U l        g	)
z
config = used for some things; ignored for others (work in progress...) cin = input channels = output channels
groups = number of groups to use in conv1d layers
r   zcin (z6) is not a multiple of the number of attention heads ()r   rq   r   dimN)r!   r"   num_attention_heads
ValueErrorintattention_head_sizeall_head_sizer   rv   querykeyvaluer/   attention_probs_dropout_probr1   SoftmaxsoftmaxrV   	matmul_qk
matmul_qkv)r7   r8   ry   q_groupsk_groupsv_groupsr9   s         r:   r"   !SqueezeBertSelfAttention.__init__   s    
 	+++q0uRSYSmSmRnnop  $*#=#= #&s-G-G'G#H !558P8PPYY3aX`a
99AV^_YY3aX`a
zz&"E"EFzzb)&'/r<   c                     UR                  5       S   U R                  U R                  UR                  5       S   4nUR                  " U6 nUR	                  SSSS5      $ )zg
- input: [N, C, W]
- output: [N, C1, W, C2] where C1 is the head index, and C2 is one head's contents
r   r   r   r   rh   )rA   r   r   viewri   r7   rj   new_x_shapes      r:   transpose_for_scores-SqueezeBertSelfAttention.transpose_for_scores   s[    
 vvx{D$<$<d>V>VXYX^X^X`acXdeFFK yyAq!$$r<   c                     UR                  5       S   U R                  U R                  UR                  5       S   4nUR                  " U6 nU$ )zg
- input: [N, C, W]
- output: [N, C1, C2, W] where C1 is the head index, and C2 is one head's contents
r   r   )rA   r   r   r   r   s      r:   transpose_key_for_scores1SqueezeBertSelfAttention.transpose_key_for_scores   sK    
 vvx{D$<$<d>V>VXYX^X^X`acXdeFFK r<   c                     UR                  SSSS5      R                  5       nUR                  5       S   U R                  UR                  5       S   4nUR                  " U6 nU$ )z-
- input: [N, C1, W, C2]
- output: [N, C, W]
r   r   r   rh   )ri   
contiguousrA   r   r   r   s      r:   transpose_output)SqueezeBertSelfAttention.transpose_output   sZ    
 IIaAq!,,.vvx{D$6$6DFFK r<   c                    U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU R                  U5      n	U R                  Xx5      n
U
[        R                  " U R                  5      -  n
X-   n
U R                  U
5      nU R                  U5      nU R                  X5      nU R                  U5      nSU0nU(       a  XS'   U$ )z
expects hidden_states in [N, C, W] data layout.

The attention_mask data layout is [N, W], and it does not need to be transposed.
context_layerattention_score)r   r   r   r   r   r   mathsqrtr   r   r1   r   r   )r7   r   attention_maskoutput_attentionsmixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerr   attention_probsr   results                 r:   rJ    SqueezeBertSelfAttention.forward   s     !JJ}5((=1 JJ}5//0AB11/B	//0AB ..@)DIId6N6N,OO): ,,7 ,,7E--m<!=1(7$%r<   )
r   r   r1   r   r   r   r   r   r   r   )r   r   r   )rM   rN   rO   rP   r"   r   r   r   rJ   rR   rS   rT   s   @r:   r   r      s!    *0%! !r<   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SqueezeBertModule   c                   > [         TU ]  5         UR                  nUR                  nUR                  nUR                  n[	        XUR
                  UR                  UR                  S9U l        [        X#UR                  UR                  S9U l        [        X4UR                  UR                  S9U l        [        XEUR"                  UR                  S9U l        g)aP  
- hidden_size = input chans = output chans for Q, K, V (they are all the same ... for now) = output chans for
  the module
- intermediate_size = output chans for intermediate layer
- groups = number of groups for all layers in the BertModule. (eventually we could change the interface to
  allow different groups for different layers)
)r8   ry   r   r   r   )ry   rz   ru   r{   )ry   rz   ru   r   N)r!   r"   r-   intermediate_sizer   r   r   r   	attentionrm   post_attention_groupsr0   post_attentionr   intermediate_groups
hidden_actintermediateoutput_groupsr   )r7   r8   c0c1c2c3r9   s         r:   r"   SqueezeBertModule.__init__   s     	%%1FOOfoo`f`o`o
 3F$@$@vOiOi
 +r6C]C]cictctu*F$8$8vGaGa
r<   c                     U R                  XU5      nUS   nU R                  XQ5      nU R                  U5      nU R                  Xv5      nSU0n	U(       a  US   U	S'   U	$ )Nr   feature_mapr   )r   r   r   r   )
r7   r   r   r   attattention_outputpost_attention_outputintermediate_outputlayer_outputoutput_dicts
             r:   rJ   SqueezeBertModule.forward  su    nn]<MN/ $ 3 34D T"//0EF{{#6N$l3-01B-CK)*r<   )r   r   r   r   rM   rN   rO   rP   r"   rJ   rR   rS   rT   s   @r:   r   r      s    
4 r<   r   c                   :   ^  \ rS rSrU 4S jr    SS jrSrU =r$ )SqueezeBertEncoderi!  c                    >^ [         TU ]  5         TR                  TR                  :X  d   S5       e[        R
                  " U4S j[        TR                  5       5       5      U l        g )NzIf you want embedding_size != intermediate hidden_size, please insert a Conv1d layer to adjust the number of channels before the first SqueezeBertModule.c              3   :   >#    U  H  n[        T5      v   M     g 7frY   )r   ).0_r8   s     r:   	<genexpr>.SqueezeBertEncoder.__init__.<locals>.<genexpr>+  s     #gGf!$5f$=$=Gfs   )	r!   r"   r%   r-   r   
ModuleListrangenum_hidden_layerslayersr6   s    `r:   r"   SqueezeBertEncoder.__init__"  sW    $$(:(:: 	
2	
: mm#guVMeMeGf#ggr<   c                    UR                  SSS5      nU(       a  SOS nU(       a  SOS nU R                   H]  nU(       a+  UR                  SSS5      nXa4-  nUR                  SSS5      nUR                  XU5      n	U	S   nU(       d  MU  XyS   4-  nM_     UR                  SSS5      nU(       a  Xa4-  nU(       d  [        S XU4 5       5      $ [	        XUS9$ )	Nr   rh   r   r`   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7frY   r`   )r   vs     r:   r   -SqueezeBertEncoder.forward.<locals>.<genexpr>O  s     h$Vq$Vs   	)last_hidden_stater   
attentions)ri   r   rJ   tupler
   )
r7   r   r   r   output_hidden_statesreturn_dictall_hidden_statesall_attentionslayerr   s
             r:   rJ   SqueezeBertEncoder.forward-  s     &--aA6"6BD0d[[E# - 5 5aA >!%55! - 5 5aA > ==HYZL(7M  0A#B"DD ! &--aA6!11h]~$Vhhh+Yg
 	
r<   )r   )NFFTr   rT   s   @r:   r   r   !  s!    	h "%
 %
r<   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SqueezeBertPooleriU  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g rY   )r!   r"   r   Linearr-   denseTanh
activationr6   s     r:   r"   SqueezeBertPooler.__init__V  s9    YYv1163E3EF
'')r<   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r   )r7   r   first_token_tensorpooled_outputs       r:   rJ   SqueezeBertPooler.forward[  s6     +1a40

#566r<   )r   r   r   rT   s   @r:   r   r   U  s    $
 r<   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )"SqueezeBertPredictionHeadTransformid  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g )Nr   )r!   r"   r   r   r-   r   
isinstancer   strr	   transform_act_fnr,   r.   r6   s     r:   r"   +SqueezeBertPredictionHeadTransform.__init__e  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr<   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rY   )r   r  r,   r7   r   s     r:   rJ   *SqueezeBertPredictionHeadTransform.forwardn  s4    

=1--m<}5r<   )r,   r   r  r   rT   s   @r:   r  r  d  s    U r<   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SqueezeBertLMPredictionHeadiu  c                   > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  SS9U l        [        R                  " [        R                  " UR                  5      5      U l        g )NT)bias)r!   r"   r  	transformr   r   r-   r$   decoder	Parameterr3   rB   r  r6   s     r:   r"   $SqueezeBertLMPredictionHead.__init__v  s[    ;FC yy!3!3V5F5FTRLLV->->!?@	r<   c                 J    U R                  U5      nU R                  U5      nU$ rY   )r  r  r	  s     r:   rJ   #SqueezeBertLMPredictionHead.forward  s$    }5]3r<   )r  r  r  r   rT   s   @r:   r  r  u  s    A r<   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SqueezeBertOnlyMLMHeadi  c                 B   > [         TU ]  5         [        U5      U l        g rY   )r!   r"   r  predictionsr6   s     r:   r"   SqueezeBertOnlyMLMHead.__init__  s    6v>r<   c                 (    U R                  U5      nU$ rY   r  )r7   sequence_outputprediction_scoress      r:   rJ   SqueezeBertOnlyMLMHead.forward  s     ,,_=  r<   r  r   rT   s   @r:   r  r    s    ?! !r<   r  c                   `   ^  \ rS rSr% \\S'   Sr\R                  " 5       U 4S j5       r	Sr
U =r$ )SqueezeBertPreTrainedModeli  r8   transformerc                 r  > [         TU ]  U5        [        U[        5      (       a!  [        R
                  " UR                  5        g[        U[        5      (       a\  [        R                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      5        gg)zInitialize the weightsr   r   N)r!   _init_weightsr  r  initzeros_r  r   copy_r   r3   r4   shaper5   )r7   moduler9   s     r:   r#  (SqueezeBertPreTrainedModel._init_weights  s~     	f%f9::KK$ 566JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 7r<   r`   )rM   rN   rO   rP   r   __annotations__base_model_prefixr3   no_gradr#  rR   rS   rT   s   @r:   r   r     s)    %
]]_i ir<   r   c                     ^  \ rS rSrU 4S jrS rS r\        SS\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\S-  S\S-  S\S-  S\\-  4S jj5       rSrU =r$ )SqueezeBertModeli  c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        U5      U l        U R                  5         g rY   )	r!   r"   r   rI   r   encoderr   pooler	post_initr6   s     r:   r"   SqueezeBertModel.__init__  s@     /7)&1'/ 	r<   c                 .    U R                   R                  $ rY   rI   r'   r7   s    r:   get_input_embeddings%SqueezeBertModel.get_input_embeddings  s    ...r<   c                 $    XR                   l        g rY   r5  r7   new_embeddingss     r:   set_input_embeddings%SqueezeBertModel.set_input_embeddings  s    *8'r<   NrD   r   rE   r   rF   r   r   r   returnc	                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  Ub  [	        S5      eUb"  U R                  X5        UR                  5       n
O"Ub  UR                  5       S S n
O[	        S5      eUb  UR                  OUR                  nUc  [        R                  " XS9nUc$  [        R                  " U
[        R                  US9nU R                  X*5      nU R                  XX5S9nU R                  UUUUUS9nUS   nU R                  U5      nU(       d
  UU4US	S  -   $ [!        UUUR"                  UR$                  S
9$ )NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)r@   r>   )rD   r   rE   rF   )r   r   r   r   r   r   r   )r   pooler_outputr   r   )r8   r   r   r   r   %warn_if_padding_and_no_attention_maskrA   r@   r3   onesrB   rC   get_extended_attention_maskrI   r0  r1  r   r   r   )r7   rD   r   rE   r   rF   r   r   r   kwargsrG   r@   extended_attention_maskembedding_outputencoder_outputsr  r   s                    r:   rJ   SqueezeBertModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY ]%>cdd"66yQ#..*K&',,.s3KTUU%.%:!!@T@T!"ZZCN!"[[EJJvVN"&"B"B>"_??> + 
 ,,*2/!5# ' 
 *!,O4#]3oab6III)-')77&11	
 	
r<   )rI   r0  r1  )NNNNNNNN)rM   rN   rO   rP   r"   r7  r<  r   r3   TensorFloatTensorboolr   r   rJ   rR   rS   rT   s   @r:   r.  r.    s    /9  *..2.2,026)-,0#':
<<$&:
 t+:
 t+	:

 llT):
 ((4/:
  $;:
 #Tk:
 D[:
 
+	+:
 :
r<   r.  c                   D  ^  \ rS rSrSSS.rU 4S jrS rS r\         SS	\	R                  S-  S
\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S\S-  S\S-  S\\-  4S jj5       rSrU =r$ )SqueezeBertForMaskedLMi  zcls.predictions.biasz-transformer.embeddings.word_embeddings.weight)zcls.predictions.decoder.biaszcls.predictions.decoder.weightc                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g rY   )r!   r"   r.  r!  r  clsr2  r6   s     r:   r"   SqueezeBertForMaskedLM.__init__  s5     +F3)&1 	r<   c                 B    U R                   R                  R                  $ rY   )rO  r  r  r6  s    r:   get_output_embeddings,SqueezeBertForMaskedLM.get_output_embeddings  s    xx##+++r<   c                     XR                   R                  l        UR                  U R                   R                  l        g rY   )rO  r  r  r  r:  s     r:   set_output_embeddings,SqueezeBertForMaskedLM.set_output_embeddings  s*    '5$$2$7$7!r<   NrD   r   rE   r   rF   labelsr   r   r   r>  c
                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nSnUbF  [	        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )az  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
Nr   rE   r   rF   r   r   r   r   r   rh   losslogitsr   r   )
r8   r   r!  rO  r   r   r$   r   r   r   )r7   rD   r   rE   r   rF   rW  r   r   r   rD  outputsr  r  masked_lm_lossloss_fctr   s                    r:   rJ   SqueezeBertForMaskedLM.forward  s    ( &1%<k$++BYBY""))%'/!5# # 	
 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r<   )rO  r!  	NNNNNNNNN)rM   rN   rO   rP   _tied_weights_keysr"   rR  rU  r   r3   rI  rK  r   r   rJ   rR   rS   rT   s   @r:   rM  rM    s     )?*Y
,8  *..2.2,0-1&*)-,0#'1
<<$&1
 t+1
 t+	1

 llT)1
 ||d*1
 t#1
  $;1
 #Tk1
 D[1
 
	1
 1
r<   rM  z
    SqueezeBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                   .  ^  \ rS rSrU 4S jr\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\S-  S\	\
-  4S jj5       rSrU =r$ )$SqueezeBertForSequenceClassificationi=  c                 P  > [         TU ]  U5        UR                  U l        Xl        [	        U5      U l        [        R                  " UR                  5      U l	        [        R                  " UR                  U R                  R                  5      U l        U R                  5         g rY   )r!   r"   
num_labelsr8   r.  r!  r   r/   r0   r1   r   r-   
classifierr2  r6   s     r:   r"   -SqueezeBertForSequenceClassification.__init__D  ss      +++F3zz&"<"<=))F$6$68N8NO 	r<   NrD   r   rE   r   rF   rW  r   r   r   r>  c
                 P   U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nU R	                  U5      nSnUGb  U R                   R
                  c  U R                  S:X  a  SU R                   l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R
                  S:X  aI  [        5       nU R                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R
                  S:X  a=  [        5       nU" UR                  SU R                  5      UR                  S5      5      nO,U R                   R
                  S:X  a  [        5       nU" X5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [!        UUUR"                  UR$                  S	9$ )
ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
NrY  r   
regressionsingle_label_classificationmulti_label_classificationr   rh   rZ  )r8   r   r!  r1   rh  problem_typerg  r?   r3   rC   r   r   squeezer   r   r   r   r   r   )r7   rD   r   rE   r   rF   rW  r   r   r   rD  r]  r   r\  r[  r_  r   s                    r:   rJ   ,SqueezeBertForSequenceClassification.forwardP  s   ( &1%<k$++BYBY""))%'/!5# # 	
  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r<   )rh  r8   r1   rg  r!  ra  )rM   rN   rO   rP   r"   r   r3   rI  rK  r   r   rJ   rR   rS   rT   s   @r:   re  re  =  s    
  *..2.2,0-1&*)-,0#'E
<<$&E
 t+E
 t+	E

 llT)E
 ||d*E
 t#E
  $;E
 #TkE
 D[E
 
)	)E
 E
r<   re  c                   .  ^  \ rS rSrU 4S jr\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\S-  S\	\
-  4S jj5       rSrU =r$ )SqueezeBertForMultipleChoicei  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  S5      U l
        U R                  5         g )Nr   )r!   r"   r.  r!  r   r/   r0   r1   r   r-   rh  r2  r6   s     r:   r"   %SqueezeBertForMultipleChoice.__init__  sW     +F3zz&"<"<=))F$6$6: 	r<   NrD   r   rE   r   rF   rW  r   r   r   r>  c
                 X   U	b  U	OU R                   R                  n	Ub  UR                  S   OUR                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUU	S9nUS   nU R                  U5      nU R                  U5      nUR                  SU5      nSnUb  [        5       nU" X5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see
    *input_ids* above)
Nr   r   rY  rh   rZ  )r8   r   r'  r   rA   r!  r1   rh  r   r   r   r   )r7   rD   r   rE   r   rF   rW  r   r   r   rD  num_choicesr]  r   r\  reshaped_logitsr[  r_  r   s                      r:   rJ   $SqueezeBertForMultipleChoice.forward  s   X &1%<k$++BYBY,5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ""))%'/!5# # 	
  
]3/ ++b+6')HO4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r<   )rh  r1   r!  ra  )rM   rN   rO   rP   r"   r   r3   rI  rK  r   r   rJ   rR   rS   rT   s   @r:   rr  rr    s      *..2.2,0-1&*)-,0#'W
<<$&W
 t+W
 t+	W

 llT)W
 ||d*W
 t#W
  $;W
 #TkW
 D[W
 
*	*W
 W
r<   rr  c                   .  ^  \ rS rSrU 4S jr\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\S-  S\	\
-  4S jj5       rSrU =r$ )!SqueezeBertForTokenClassificationi   c                 0  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g rY   )r!   r"   rg  r.  r!  r   r/   r0   r1   r   r-   rh  r2  r6   s     r:   r"   *SqueezeBertForTokenClassification.__init__  sj      +++F3zz&"<"<=))F$6$68I8IJ 	r<   NrD   r   rE   r   rF   rW  r   r   r   r>  c
                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nU R	                  U5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
NrY  r   r   rh   rZ  )r8   r   r!  r1   rh  r   r   rg  r   r   r   )r7   rD   r   rE   r   rF   rW  r   r   r   rD  r]  r  r\  r[  r_  r   s                    r:   rJ   )SqueezeBertForTokenClassification.forward  s    $ &1%<k$++BYBY""))%'/!5# # 	
 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
r<   )rh  r1   rg  r!  ra  )rM   rN   rO   rP   r"   r   r3   rI  rK  r   r   rJ   rR   rS   rT   s   @r:   r{  r{     s    	  *..2.2,0-1&*)-,0#'1
<<$&1
 t+1
 t+	1

 llT)1
 ||d*1
 t#1
  $;1
 #Tk1
 D[1
 
&	&1
 1
r<   r{  c                   N  ^  \ rS rSrU 4S jr\          SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\S-  S\S-  S\S-  S\	\
-  4S jj5       rSrU =r$ )SqueezeBertForQuestionAnsweringiB  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g rY   )
r!   r"   rg  r.  r!  r   r   r-   
qa_outputsr2  r6   s     r:   r"   (SqueezeBertForQuestionAnswering.__init__D  sT      +++F3))F$6$68I8IJ 	r<   NrD   r   rE   r   rF   start_positionsend_positionsr   r   r   r>  c                 "   U
b  U
OU R                   R                  n
U R                  UUUUUUU	U
S9nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      R                  5       nUR                  S5      R                  5       nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" UU5      nUU-   S-  nU
(       d  UU4USS  -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )	NrY  r   r   r   r   )ignore_indexrh   )r[  start_logits
end_logitsr   r   )r8   r   r!  r  splitro  r   lenrA   clampr   r   r   r   )r7   rD   r   rE   r   rF   r  r  r   r   r   rD  r]  r  r\  r  r  
total_lossignored_indexr_  
start_lossend_lossr   s                          r:   rJ   'SqueezeBertForQuestionAnswering.forwardN  s    &1%<k$++BYBY""))%'/!5# # 	
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r<   )rg  r  r!  )
NNNNNNNNNN)rM   rN   rO   rP   r"   r   r3   rI  rK  r   r   rJ   rR   rS   rT   s   @r:   r  r  B  s      *..2.2,0-1/3-1)-,0#'=
<<$&=
 t+=
 t+	=

 llT)=
 ||d*=
 ,=
 ||d*=
  $;=
 #Tk=
 D[=
 
-	-=
 =
r<   r  )rM  rr  r  re  r{  r.  r   r   )5rQ   r   r3   r   torch.nnr   r   r    r   r$  activationsr	   modeling_outputsr
   r   r   r   r   r   r   modeling_utilsr   utilsr   r   configuration_squeezebertr   
get_loggerrM   loggerModuler   rV   r,   rb   rm   r   r   r   r   r   r  r  r  r   r.  rM  re  rr  r{  r  __all__r`   r<   r:   <module>r     s   !    A A & !   . 9 
		H	%'BII 'T(BII (("2<< " 299 ( RYY  Wryy Wt'		 'T1
 1
h		  "")) &!RYY ! i i i L
1 L
 L
^ H
7 H
 H
V S
+E S
S
l c
#= c
 c
L >
(B >
 >
B I
&@ I
 I
X	r<   