
    Z j                     4   S r SSKJr  SSKJr  SSKrSSKJr  SSKJrJ	r	J
r
  SSKJr  SS	KJr  SS
KJr  SSKJrJrJrJrJrJrJr  SSKJrJr  SSKJr  SSKJ r   SSK!J"r"J#r#J$r$J%r%  SSK&J'r'J(r(  SSK)J*r*  SSK+J,r,  \%RZ                  " \.5      r/ " S S\R`                  5      r1  SBS\R`                  S\Rd                  S\Rd                  S\Rd                  S\Rd                  S-  S\3S-  S\3S\\#   4S jjr4 " S S \R`                  5      r5 " S! S"\R`                  5      r6 " S# S$\R`                  5      r7 " S% S&\R`                  5      r8\$ " S' S(\5      5       r9\$" S)S*9\ " S+ S,\"5      5       5       r:\$ " S- S.\95      5       r;\$" S/S*9 " S0 S1\95      5       r< " S2 S3\R`                  5      r= " S4 S5\R`                  5      r>\$ " S6 S7\95      5       r?\$" S8S*9 " S9 S:\95      5       r@\$ " S; S<\95      5       rA\$ " S= S>\95      5       rB\$ " S? S@\95      5       rC/ SAQrDg)CzPyTorch ALBERT model.    )Callable)	dataclassN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)create_bidirectional_mask)BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringlogging)can_return_tuplemerge_with_config_defaults)capture_outputs   )AlbertConfigc                      ^  \ rS rSrSrS\4U 4S jjr    SS\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  4
S jjrSrU =r$ )AlbertEmbeddings1   zI
Construct the embeddings from word, position and token_type embeddings.
configc                 
  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R#                  S[$        R&                  " UR                  5      R)                  S5      SS9  U R#                  S[$        R*                  " U R,                  R/                  5       [$        R0                  S9SS9  g )	N)padding_idxepsposition_idsr   F)
persistenttoken_type_ids)dtype)super__init__r   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandzerosr)   sizelongselfr$   	__class__s     {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/albert/modeling_albert.pyr0   AlbertEmbeddings.__init__6   s   !||F,=,=v?T?Tbhbubuv#%<<0N0NPVPePe#f %'\\&2H2H&J_J_%`"f&;&;AVAVWzz&"<"<= 	ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
    N	input_idsr-   r)   inputs_embedsreturnc                    Ub  UR                  5       nOUR                  5       S S nUu  pgUc  U R                  S S 2S U24   nUc  [        U S5      (       aQ  U R                  R	                  UR
                  S   S5      n[        R                  " USUS9nUR	                  Xg5      nO8[        R                  " U[        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n	XI-   n
U R                  U5      nX-   n
U R                  U
5      n
U R                  U
5      n
U
$ )Nr+   r-   r   r   )dimindex)r.   device)rD   r)   hasattrr-   rB   shaper@   gatherrC   rE   rR   r5   r9   r7   r:   r>   )rG   rL   r-   r)   rM   input_shape
batch_size
seq_lengthbuffered_token_type_idsr9   
embeddingsr7   s               rI   forwardAlbertEmbeddings.forwardG   s@     #..*K',,.s3K!,
,,Q^<L
 !t-..*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
!W!&[

SWSdSdSkSk!l  00;M $ : :> J":
"66|D5
^^J/
\\*-
rK   )r:   r>   r7   r9   r5   )NNNN)__name__
__module____qualname____firstlineno____doc__r    r0   r@   
LongTensorFloatTensorTensorr[   __static_attributes____classcell__rH   s   @rI   r"   r"   1   s    
| 
& .2260426'##d*' ((4/' &&-	'
 ((4/' 
' 'rK   r"   modulequerykeyvalueattention_maskscalingr>   kwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr+            r	   rP   )ptrainingr   )
rD   r@   matmul	transposer   
functionalsoftmaxr>   rt   
contiguous)
rh   ri   rj   rk   rl   rm   r>   rn   attn_weightsattn_outputs
             rI   eager_attention_forwardr|   r   s     **R.D( <<}}Q':;gEL!#4==((2(>L==((6??([L,,|3K''1-88:K$$rK   c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S-  S\	\
   S\\R                  \R                  4   4S	 jjrS
rU =r$ )AlbertAttention   r$   c                 6  > [         TU ]  5         UR                  UR                  -  S:w  a6  [	        US5      (       d%  [        SUR                   SUR                   35      eXl        UR                  U l        UR                  U l        UR                  UR                  -  U l        U R                  U R                  -  U l        U R                  S-  U l	        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        [        R                   " UR                  U R                  5      U l        [        R                   " UR                  U R                  5      U l        [        R                   " UR                  U R                  5      U l        [        R                   " UR                  UR                  5      U l        [        R*                  " UR                  UR,                  S9U l        SU l        g )Nr   r3   zThe hidden size (z6) is not a multiple of the number of attention heads (rp   r'   F)r/   r0   hidden_sizenum_attention_headsrS   
ValueErrorr$   attention_head_sizeall_head_sizerm   r   r<   attention_probs_dropout_probattention_dropoutr=   output_dropoutLinearri   rj   rk   denser:   r;   	is_causalrF   s     rI   r0   AlbertAttention.__init__   s    : ::a?PVXhHiHi#F$6$6#7 8 4457  #)#=#= !--#)#5#59S9S#S !558P8PP//5!#F,O,O!P jj)C)CDYYv1143E3EF
99V//1C1CDYYv1143E3EF
YYv1163E3EF
f&8&8f>S>STrK   Nhidden_statesrl   rn   rN   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  " U6 R	                  SS5      nU R                  U5      R                  " U6 R	                  SS5      nU R                  U5      R                  " U6 R	                  SS5      n[        R                  " U R                  R                  [        5      n	U	" U UUUU4U R                  (       d  SOU R                  R                  U R                  S.UD6u  pU
R                   " / UQSP76 R#                  5       n
U R%                  U
5      n
U R'                  U
5      n
U R)                  X-   5      n
X4$ )Nr+   r   rq           )r>   rm   )rT   r   ri   viewrv   rj   rk   r   get_interfacer$   _attn_implementationr|   rt   r   rs   rm   reshapery   r   r   r:   )rG   r   rl   rn   rV   hidden_shapequery_layer	key_layervalue_layerattention_interfacer{   rz   s               rI   r[   AlbertAttention.forward   so    $))#2.CCbC$*B*BC jj/44lCMMaQRSHH]+00,?II!QO	jj/44lCMMaQRS(?(M(MKK,,.E)
 %8	%
  $}}C$2H2H2J2JLL	%
 	%
! "));;;;FFHjj-))+6nn]%@A((rK   )r:   r   r   r   r$   r   r   r   rj   r   r   ri   rm   rk   Nr]   r^   r_   r`   r    r0   r@   rd   rc   r   r   tupler[   re   rf   rg   s   @rI   r~   r~      sk    | < 48")||") ))D0") +,	")
 
u||U\\)	*") ")rK   r~   c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S-  S\	\
   S\\R                  \R                  4   4S	 jjrS
\R                  S\R                  4S jrSrU =r$ )AlbertLayer   r$   c                   > [         TU ]  5         Xl        UR                  U l        SU l        [
        R                  " UR                  UR                  S9U l	        [        U5      U l        [
        R                  " UR                  UR                  5      U l        [
        R                  " UR                  UR                  5      U l        [         UR"                     U l        [
        R&                  " UR(                  5      U l        g )Nr   r'   )r/   r0   r$   chunk_size_feed_forwardseq_len_dimr   r:   r   r;   full_layer_layer_normr~   	attentionr   intermediate_sizeffn
ffn_outputr   
hidden_act
activationr<   r=   r>   rF   s     rI   r0   AlbertLayer.__init__   s    '-'E'E$%'\\&2D2D&J_J_%`"(099V//1I1IJ))F$<$<f>P>PQ !2!23zz&"<"<=rK   Nr   rl   rn   rN   c                     U R                   " X40 UD6u  pE[        U R                  U R                  U R                  U5      nU R                  Xd-   5      nU$ r   )r   r   ff_chunkr   r   r   )rG   r   rl   rn   attention_output_r   s          rI   r[   AlbertLayer.forward   s\     #nn]UfU.MM((	

 22:3PQrK   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   )rG   r   r   s      rI   r   AlbertLayer.ff_chunk   s3    XX./
__Z0
__Z0
rK   )	r   r   r   r$   r>   r   r   r   r   r   )r]   r^   r_   r`   r    r0   r@   rd   rc   r   r   r   r[   r   re   rf   rg   s   @rI   r   r      s    >| >  48|| ))D0 +,	
 
u||U\\)	*" %,,  rK   r   c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S-  S\	\
   S\\R                  \\R                     -  S	4   4S
 jjrSrU =r$ )AlbertLayerGroup   r$   c                    > [         TU ]  5         [        R                  " [	        UR
                  5       Vs/ s H  n[        U5      PM     sn5      U l        g s  snf r   )r/   r0   r   
ModuleListrangeinner_group_numr   albert_layersrG   r$   r   rH   s      rI   r0   AlbertLayerGroup.__init__   sC    ]]vOeOeIf+gIfAK,?If+gh+gs   ANr   rl   rn   rN   .c                 T    [        U R                  5       H  u  pEU" X40 UD6nM     U$ r   )	enumerater   )rG   r   rl   rn   layer_indexalbert_layers         rI   r[   AlbertLayerGroup.forward   s2     *343E3E)F%K(Q&QM *GrK   )r   r   r   rg   s   @rI   r   r      sw    i| i 48|| ))D0 +,	
 
u||eELL1136	7 rK   r   c            
          ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S-  S\	\
   S\\-  4S	 jjrS
rU =r$ )AlbertTransformeri  r$   c                 (  > [         TU ]  5         Xl        [        R                  " UR
                  UR                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        g s  snf r   )r/   r0   r$   r   r   r3   r   embedding_hidden_mapping_inr   r   num_hidden_groupsr   albert_layer_groupsr   s      rI   r0   AlbertTransformer.__init__  sj    +-99V5J5JFL^L^+_(#%==TYZ`ZrZrTs1tTsq2B62JTs1t#u 1ts   -BNr   rl   rn   rN   c                     U R                  U5      n[        U R                  R                  5       HR  n[	        X@R                  R                  U R                  R
                  -  -  5      nU R                  U   " UU40 UD6nMT     [        US9$ )N)last_hidden_state)r   r   r$   num_hidden_layersintr   r   r   )rG   r   rl   rn   i	group_idxs         rI   r[   AlbertTransformer.forward  s     88Gt{{445AA!>!>A^A^!^_`I 44Y? M	 6 ??rK   )r   r$   r   r   )r]   r^   r_   r`   r    r0   r@   rd   rc   r   r   r   r   r[   re   rf   rg   s   @rI   r   r     sf    v| v 48@||@ ))D0@ +,	@
 
5	 @ @rK   r   c                   d    \ rS rSr\rSrSrSrSr	Sr
\\S.r\R                  " 5       S 5       rSrg)AlbertPreTrainedModeli#  albertT)r   
attentionsc                    [        U[        R                  5      (       ac  [        R                  " UR
                  SU R                  R                  S9  UR                  b!  [        R                  " UR                  5        gg[        U[        R                  5      (       a  [        R                  " UR
                  SU R                  R                  S9  UR                  bK  [        UR
                  SS5      (       d.  [        R                  " UR
                  UR                     5        ggg[        U[        R                  5      (       aA  [        R                  " UR                  5        [        R                  " UR
                  5        g[        U[        5      (       a!  [        R                  " UR                  5        g[        U[         5      (       a|  [        R"                  " UR$                  [&        R(                  " UR$                  R*                  S   5      R-                  S5      5        [        R                  " UR.                  5        gg)zInitialize the weights.r   )meanstdN_is_hf_initializedFr+   r*   )
isinstancer   r   initnormal_weightr$   initializer_rangebiaszeros_r1   r&   getattrr:   ones_AlbertMLMHeadr"   copy_r)   r@   rA   rT   rB   r-   )rG   rh   s     rI   _init_weights#AlbertPreTrainedModel._init_weights0  ss    fbii((LLSdkk6S6ST{{&FKK( '--LLSdkk6S6ST!!-gfmmMach6i6iFMM&*<*<=> 7j---KK$JJv}}%..KK$ 011JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 2rK    N)r]   r^   r_   r`   r    config_classbase_model_prefix_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r~   _can_record_outputsr@   no_gradr   re   r   rK   rI   r   r   #  sL    L N"&$%
 ]]_/ /rK   r   z2
    Output type of [`AlbertForPreTraining`].
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   S	rg)
AlbertForPreTrainingOutputiF  ae  
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
    Total loss as the sum of the masked language modeling loss and the next sequence prediction
    (classification) loss.
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
sop_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
    Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
    before SoftMax).
Nlossprediction_logits
sop_logitsr   r   r   )r]   r^   r_   r`   ra   r   r@   rc   __annotations__r   r   r   r   r   re   r   rK   rI   r   r   F  s}    	 &*D%

d
")26u((4/6+/J!!D(/59M5**+d2926Je''(4/6rK   r   c                   h  ^  \ rS rSr\rSrSS\S\4U 4S jjjrS\	R                  4S jrS\	R                  SS	4S
 jr\\\     SS\R"                  S	-  S\R$                  S	-  S\R"                  S	-  S\R"                  S	-  S\R$                  S	-  S\\   S\\-  4S jj5       5       5       rSrU =r$ )AlbertModeli_  r   r$   add_pooling_layerc                 r  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       aK  [        R                  " UR                  UR                  5      U l
        [        R                  " 5       U l        OSU l
        SU l        UR                  U l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)r/   r0   r$   r"   rZ   r   encoderr   r   r   poolerTanhpooler_activationr   attn_implementation	post_init)rG   r$   r   rH   s      rI   r0   AlbertModel.__init__d  s    
 	 *62(0))F$6$68J8JKDK%'WWYD"DK%)D"#)#>#>  	rK   rN   c                 .    U R                   R                  $ r   rZ   r5   rG   s    rI   get_input_embeddings AlbertModel.get_input_embeddingsz  s    ...rK   rk   Nc                 $    XR                   l        g r   r   )rG   rk   s     rI   set_input_embeddings AlbertModel.set_input_embeddings}  s    */'rK   rL   rl   r-   r)   rM   rn   c                 4   US L US L-  (       a  [        S5      eU R                  XX5S9n[        U R                  UUS9nU R                  " UU4SU0UD6nUS   n	U R
                  b'  U R                  U R                  U	S S 2S4   5      5      OS n
[        U	U
S9$ )Nz:You must specify exactly one of input_ids or inputs_embeds)r)   r-   rM   )r$   rM   rl   r)   r   )r   pooler_output)r   rZ   r   r$   r   r   r   r   )rG   rL   rl   r-   r)   rM   rn   embedding_outputencoder_outputssequence_outputpooled_outputs              rI   r[   AlbertModel.forward  s     -t";<YZZ?? + 
 3;;*)
 ,,
 &
 	
 *!,VZVaVaVm..t{{?1a4;P/QRsw)-'
 	
rK   )r   r$   rZ   r   r   r   )T)NNNNN)r]   r^   r_   r`   r    r   r   boolr0   r   r1   r  r  r   r   r   r@   rb   rc   r   r   r   r   r[   re   rf   rg   s   @rI   r   r   _  s   L |   ,/bll /0",, 04 0   .237260426$
##d*$
 ))D0$
 ((4/	$

 &&-$
 ((4/$
 +,$
 
$e	+$
    $
rK   r   z
    Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `sentence order prediction (classification)` head.
    c                     ^  \ rS rSrSSS.rS\4U 4S jjrS\R                  4S jr	S	\R                  SS
4S jr
S\R                  4S jr\\       SS\R                   S
-  S\R"                  S
-  S\R                   S
-  S\R                   S
-  S\R"                  S
-  S\R                   S
-  S\R                   S
-  S\\   S\\-  4S jj5       5       rSrU =r$ )AlbertForPreTrainingi  (albert.embeddings.word_embeddings.weightpredictions.biaszpredictions.decoder.weightzpredictions.decoder.biasr$   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        U5      U l        U R                  5         g r   )	r/   r0   r   r   r   predictionsAlbertSOPHeadsop_classifierr   rF   s     rI   r0   AlbertForPreTraining.__init__  sB     !&)(0+F3 	rK   rN   c                 .    U R                   R                  $ r   r  decoderr  s    rI   get_output_embeddings*AlbertForPreTraining.get_output_embeddings      '''rK   new_embeddingsNc                 $    XR                   l        g r   r  rG   r  s     rI   set_output_embeddings*AlbertForPreTraining.set_output_embeddings  s    #1 rK   c                 B    U R                   R                  R                  $ r   r   rZ   r5   r  s    rI   r  )AlbertForPreTraining.get_input_embeddings      {{%%555rK   rL   rl   r-   r)   rM   labelssentence_order_labelrn   c           	         U R                   " U4UUUUSS.UD6n	U	SS u  pU R                  U
5      nU R                  U5      nSnUbv  Ubs  [        5       nU" UR	                  SU R
                  R                  5      UR	                  S5      5      nU" UR	                  SS5      UR	                  S5      5      nUU-   n[        UUUU	R                  U	R                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
sentence_order_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
    (see `input_ids` docstring) Indices should be in `[0, 1]`. `0` indicates original order (sequence A, then
    sequence B), `1` indicates switched order (sequence B, then sequence A).

Example:

```python
>>> from transformers import AutoTokenizer, AlbertForPreTraining
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
>>> model = AlbertForPreTraining.from_pretrained("albert/albert-base-v2")

>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
>>> # Batch size 1
>>> outputs = model(input_ids)

>>> prediction_logits = outputs.prediction_logits
>>> sop_logits = outputs.sop_logits
```Trl   r-   r)   rM   return_dictNrq   r+   )r   r   r   r   r   )
r   r  r  r   r   r$   r2   r   r   r   )rG   rL   rl   r-   r)   rM   r(  r)  rn   outputsr  r  prediction_scores
sop_scores
total_lossloss_fctmasked_lm_losssentence_order_losss                     rI   r[   AlbertForPreTraining.forward  s   N ++
))%'
 
 *1!& ,,_=((7

"6"B')H%&7&<&<RAWAW&XZ`ZeZefhZijN"*:??2q+ACWC\C\]_C`"a'*==J)/!!//))
 	
rK   )r   r  r  NNNNNNN)r]   r^   r_   r`   _tied_weights_keysr    r0   r   r   r  r"  r1   r  r   r   r@   rb   rc   r   r   r   r   r[   re   rf   rg   s   @rI   r  r    sE    'Q$6
| (ryy (2BII 2$ 26bll 6  .237260426*.8<A
##d*A
 ))D0A
 ((4/	A

 &&-A
 ((4/A
   4'A
 $..5A
 +,A
 
$e	+A
  A
rK   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )r   i  r$   c                   > [         TU ]  5         [        R                  " UR                  UR
                  S9U l        [        R                  " [        R                  " UR                  5      5      U l
        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        [        UR                      U l        g )Nr'   )r/   r0   r   r:   r3   r;   	Parameterr@   rC   r2   r   r   r   r   r  r   r   r   rF   s     rI   r0   AlbertMLMHead.__init__  s    f&;&;AVAVWLLV->->!?@	YYv1163H3HI
yy!6!68I8IJ !2!23rK   r   rN   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nUnU$ r   )r   r   r:   r  )rG   r   r.  s      rI   r[   AlbertMLMHead.forward  sF    

=16}5]3)  rK   )r:   r   r   r  r   r]   r^   r_   r`   r    r0   r@   rd   r[   re   rf   rg   s   @rI   r   r     s/    4| 4!U\\ !ell ! !rK   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )r  i$  r$   c                    > [         TU ]  5         [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l	        g r   )
r/   r0   r   r<   classifier_dropout_probr>   r   r   
num_labels
classifierrF   s     rI   r0   AlbertSOPHead.__init__%  sB    zz&"@"@A))F$6$68I8IJrK   r  rN   c                 J    U R                  U5      nU R                  U5      nU$ r   )r>   rB  )rG   r  dropout_pooled_outputlogitss       rI   r[   AlbertSOPHead.forward+  s%     $] ;!67rK   )rB  r>   r=  rg   s   @rI   r  r  $  s1    K| KU\\ ell  rK   r  c                     ^  \ rS rSrSSS.rU 4S jrS\R                  4S jrS\R                  SS	4S
 jr	S\R                  4S jr\\      SS\R                  S	-  S\R                   S	-  S\R                  S	-  S\R                  S	-  S\R                   S	-  S\R                  S	-  S\\   S\\-  4S jj5       5       rSrU =r$ )AlbertForMaskedLMi1  r  r  r  c                    > [         TU ]  U5        [        USS9U l        [	        U5      U l        U R                  5         g NF)r   )r/   r0   r   r   r   r  r   rF   s     rI   r0   AlbertForMaskedLM.__init__8  s7     !&EB(0 	rK   rN   c                 .    U R                   R                  $ r   r  r  s    rI   r  'AlbertForMaskedLM.get_output_embeddingsA  r  rK   r  Nc                 Z    XR                   l        UR                  U R                   l        g r   )r  r  r   r!  s     rI   r"  'AlbertForMaskedLM.set_output_embeddingsD  s"    #1  . 3 3rK   c                 B    U R                   R                  R                  $ r   r%  r  s    rI   r  &AlbertForMaskedLM.get_input_embeddingsH  r'  rK   rL   rl   r-   r)   rM   r(  rn   c           
      6   U R                   " SUUUUUSS.UD6nUS   n	U R                  U	5      n
SnUbF  [        5       nU" U
R                  SU R                  R
                  5      UR                  S5      5      n[        UU
UR                  UR                  S9$ )a$  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

Example:

```python
>>> import torch
>>> from transformers import AutoTokenizer, AlbertForMaskedLM

>>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
>>> model = AlbertForMaskedLM.from_pretrained("albert/albert-base-v2")

>>> # add mask_token
>>> inputs = tokenizer("The capital of [MASK] is Paris.", return_tensors="pt")
>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> # retrieve index of [MASK]
>>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
>>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
>>> tokenizer.decode(predicted_token_id)
'france'
```

```python
>>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
>>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
>>> outputs = model(**inputs, labels=labels)
>>> round(outputs.loss.item(), 2)
0.81
```
TrL   rl   r-   r)   rM   r,  r   Nr+   r   rF  r   r   r   )	r   r  r   r   r$   r2   r   r   r   )rG   rL   rl   r-   r)   rM   r(  rn   r-  sequence_outputsr.  r2  r1  s                rI   r[   AlbertForMaskedLM.forwardK  s    ^ ++ 
))%'
 
 #1: ,,-=>')H%&7&<&<RAWAW&XZ`ZeZefhZijN$!//))	
 	
rK   )r   r  NNNNNN)r]   r^   r_   r`   r6  r0   r   r   r  r"  r1   r  r   r   r@   rb   rc   r   r   r   r   r[   re   rf   rg   s   @rI   rI  rI  1  s%    'Q$6
(ryy (4BII 4$ 46bll 6  .237260426*.D
##d*D
 ))D0D
 ((4/	D

 &&-D
 ((4/D
   4'D
 +,D
 
%	D
  D
rK   rI  z
    Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                   &  ^  \ rS rSrS\4U 4S jjr\\      SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\\   S\\-  4S jj5       5       rSrU =r$ )AlbertForSequenceClassificationi  r$   c                 P  > [         TU ]  U5        UR                  U l        Xl        [	        U5      U l        [        R                  " UR                  5      U l	        [        R                  " UR                  U R                  R                  5      U l        U R                  5         g r   )r/   r0   rA  r$   r   r   r   r<   r@  r>   r   r   rB  r   rF   s     rI   r0   (AlbertForSequenceClassification.__init__  sr      ++!&)zz&"@"@A))F$6$68N8NO 	rK   NrL   rl   r-   r)   rM   r(  rn   rN   c           
         U R                   " S
UUUUUSS.UD6nUS   n	U R                  U	5      n	U R                  U	5      n
SnUGb  U R                  R                  c  U R
                  S:X  a  SU R                  l        OoU R
                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                  l        OSU R                  l        U R                  R                  S:X  aI  [        5       nU R
                  S:X  a&  U" U
R                  5       UR                  5       5      nOU" X5      nOU R                  R                  S:X  a=  [        5       nU" U
R                  SU R
                  5      UR                  S5      5      nO,U R                  R                  S:X  a  [        5       nU" X5      n[        UU
UR                   UR"                  S	9$ )ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
TrT  r   N
regressionsingle_label_classificationmulti_label_classificationr+   rU  r   )r   r>   rB  r$   problem_typerA  r.   r@   rE   r   r   squeezer   r   r   r   r   r   )rG   rL   rl   r-   r)   rM   r(  rn   r-  r  rF  r   r1  s                rI   r[   'AlbertForSequenceClassification.forward  s   $ ++ 
))%'
 
  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
rK   )r   rB  r$   r>   rA  rX  )r]   r^   r_   r`   r    r0   r   r   r@   rb   rc   r   r   r   r   r[   re   rf   rg   s   @rI   rZ  rZ    s    
| 
  .237260426*.;
##d*;
 ))D0;
 ((4/	;

 &&-;
 ((4/;
   4';
 +,;
 
"E	);
  ;
rK   rZ  c                   &  ^  \ rS rSrS\4U 4S jjr\\      SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\\   S\\-  4S jj5       5       rSrU =r$ )AlbertForTokenClassificationi  r$   c                 x  > [         TU ]  U5        UR                  U l        [        USS9U l        UR
                  b  UR
                  OUR                  n[        R                  " U5      U l	        [        R                  " UR                  U R                  R                  5      U l        U R                  5         g rK  )r/   r0   rA  r   r   r@  r=   r   r<   r>   r   r   r$   rB  r   )rG   r$   r@  rH   s      rI   r0   %AlbertForTokenClassification.__init__  s      ++!&EB --9 **++ 	 
 zz"9:))F$6$68N8NO 	rK   NrL   rl   r-   r)   rM   r(  rn   rN   c           	      D   U R                   " U4UUUUSS.UD6nUS   n	U R                  U	5      n	U R                  U	5      n
SnUb<  [        5       nU" U
R	                  SU R
                  5      UR	                  S5      5      n[        UU
UR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Tr+  r   Nr+   rU  )	r   r>   rB  r   r   rA  r   r   r   )rG   rL   rl   r-   r)   rM   r(  rn   r-  r  rF  r   r1  s                rI   r[   $AlbertForTokenClassification.forward  s      ++
))%'
 
 "!*,,71')HFKKDOO<fkk"oND$!//))	
 	
rK   )r   rB  r>   rA  rX  )r]   r^   r_   r`   r    r0   r   r   r@   rb   rc   r   r   r   r   r[   re   rf   rg   s   @rI   re  re    s    |    .237260426*.'
##d*'
 ))D0'
 ((4/	'

 &&-'
 ((4/'
   4''
 +,'
 
	&'
  '
rK   re  c                   F  ^  \ rS rSrS\4U 4S jjr\\       SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\\   S\\-  4S jj5       5       rSrU =r$ )AlbertForQuestionAnsweringi%  r$   c                    > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g rK  )
r/   r0   rA  r   r   r   r   r   
qa_outputsr   rF   s     rI   r0   #AlbertForQuestionAnswering.__init__'  sU      ++!&EB))F$6$68I8IJ 	rK   NrL   rl   r-   r)   rM   start_positionsend_positionsrn   rN   c           
         U R                   " S
UUUUUSS.UD6n	U	S   n
U R                  U
5      nUR                  SSS9u  pUR                  S5      R	                  5       nUR                  S5      R	                  5       nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" X5      nUU-   S-  n[        UUUU	R                  U	R                  S	9$ )NTrT  r   r   r+   rr   )ignore_indexrq   )r   start_logits
end_logitsr   r   r   )r   rm  splitrb  ry   lenrD   clampr   r   r   r   )rG   rL   rl   r-   r)   rM   ro  rp  rn   r-  r  rF  rs  rt  r0  ignored_indexr1  
start_lossend_losss                      rI   r[   "AlbertForQuestionAnswering.forward1  sx    ++ 
))%'
 
 "!*#?#)<<r<#: #++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
:H$x/14J+%!!//))
 	
rK   )r   rA  rm  r5  r]   r^   r_   r`   r    r0   r   r   r@   rb   rc   r   r   r   r   r[   re   rf   rg   s   @rI   rk  rk  %  s    |   .23726042637153
##d*3
 ))D03
 ((4/	3

 &&-3
 ((4/3
 ))D03
 ''$.3
 +,3
 
$e	+3
  3
rK   rk  c                   &  ^  \ rS rSrS\4U 4S jjr\\      SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\\   S\\-  4S jj5       5       rSrU =r$ )AlbertForMultipleChoiceii  r$   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  S5      U l
        U R                  5         g )Nr   )r/   r0   r   r   r   r<   r@  r>   r   r   rB  r   rF   s     rI   r0    AlbertForMultipleChoice.__init__k  sV     !&)zz&"@"@A))F$6$6: 	rK   NrL   rl   r-   r)   rM   r(  rn   rN   c           	         Ub  UR                   S   OUR                   S   nUb!  UR                  SUR                  S5      5      OSnUb!  UR                  SUR                  S5      5      OSnUb!  UR                  SUR                  S5      5      OSnUb!  UR                  SUR                  S5      5      OSnUb1  UR                  SUR                  S5      UR                  S5      5      OSnU R                  " U4UUUUSS.UD6n	U	S   n
U R	                  U
5      n
U R                  U
5      nUR                  SU5      nSnUb  [        5       nU" X5      n[        UUU	R                  U	R                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
    [`PreTrainedTokenizer.encode`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see
    *input_ids* above)
Nr   r+   Tr+  rU  )
rT   r   rD   r   r>   rB  r   r   r   r   )rG   rL   rl   r-   r)   rM   r(  rn   num_choicesr-  r  rF  reshaped_logitsr   r1  s                  rI   r[   AlbertForMultipleChoice.forwardu  s   T -6,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	
 ++
))%'
 
  
]3#}= ++b+6')HO4D("!//))	
 	
rK   )r   rB  r>   rX  r|  rg   s   @rI   r~  r~  i  s    |   .237260426*.M
##d*M
 ))D0M
 ((4/	M

 &&-M
 ((4/M
   4'M
 +,M
 
$e	+M
  M
rK   r~  )r   r   r  rI  rZ  re  rk  r~  )Nr   )Era   collections.abcr   dataclassesr   r@   r   torch.nnr   r   r    r
   r   activationsr   masking_utilsr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_albertr    
get_loggerr]   loggerModuler"   rd   floatr|   r~   r   r   r   r   r   r   r  r   r  rI  rZ  re  rk  r~  __all__r   rK   rI   <module>r     s    $ !   A A & ! 6   G & N M I 5 . 
		H	%=ryy =N !%II%<<% 
% <<	%
 LL4'% T\% % '(%8>)bii >)B#")) #Lryy "@		 @: /O / /D 
 7 7 7& G
' G
 G
T \
0 \
\
~!BII !*
BII 
 _
- _
 _
D J
&; J
J
Z :
#8 :
 :
z @
!6 @
 @
F Z
3 Z
 Z
z	rK   