
    Z jL                     *   S r SSKrSSKrSSKJr  SSKrSSKJr  SSKJrJ	r	J
r
  SSKJr  SSKJr  SS	KJr  SS
KJrJrJrJrJrJr  SSKJr  SSKJr  SSKJrJr  SSK J!r!  \RD                  " \#5      r$/ SQr%\" SS9\ " S S\5      5       5       r& " S S\RN                  5      r( " S S\RN                  5      r) " S S\RN                  5      r* " S S\RN                  5      r+ " S S\RN                  5      r, " S S \RN                  5      r- " S! S"\RN                  5      r. " S# S$\RN                  5      r/ " S% S&\5      r0 " S' S(\RN                  5      r1 " S) S*\RN                  5      r2 " S+ S,\RN                  5      r3 " S- S.\RN                  5      r4 " S/ S0\RN                  5      r5\ " S1 S2\5      5       r6\ " S3 S4\65      5       r7\" S5S9 " S6 S7\65      5       r8\ " S8 S9\65      5       r9\ " S: S;\65      5       r:\ " S< S=\65      5       r;/ S>Qr<g)?zPyTorch CANINE model.    N)	dataclass)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputModelOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward)auto_docstringlogging   )CanineConfig)   +   ;   =   I   a   g   q                           a  
    Output type of [`CanineModel`]. Based on [`~modeling_outputs.BaseModelOutputWithPooling`], but with slightly
    different `hidden_states` and `attentions`, as these also include the hidden states and attentions of the shallow
    Transformer encoders.
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                     S-  \S'   Sr\\R                     S-  \S'   Srg)	CanineModelOutputWithPooling0   aW  
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
    Sequence of hidden-states at the output of the last layer of the model (i.e. the output of the final
    shallow Transformer encoder).
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
    Hidden-state of the first token of the sequence (classification token) at the last layer of the deep
    Transformer encoder, further processed by a Linear layer and a Tanh activation function. The Linear layer
    weights are trained from the next sentence prediction (classification) objective during pretraining.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the input to each encoder + one for the output of each layer of each
    encoder) of shape `(batch_size, sequence_length, hidden_size)` and `(batch_size, sequence_length //
    config.downsampling_rate, hidden_size)`. Hidden-states of the model at the output of each layer plus the
    initial input to each Transformer encoder. The hidden states of the shallow encoders have length
    `sequence_length`, but the hidden states of the deep encoder have length `sequence_length` //
    `config.downsampling_rate`.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of the 3 Transformer encoders of shape `(batch_size,
    num_heads, sequence_length, sequence_length)` and `(batch_size, num_heads, sequence_length //
    config.downsampling_rate, sequence_length // config.downsampling_rate)`. Attentions weights after the
    attention softmax, used to compute the weighted average in the self-attention heads.
Nlast_hidden_statepooler_outputhidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r,   torchFloatTensor__annotations__r-   r.   tupler/   __static_attributes__r0       {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/canine/modeling_canine.pyr*   r*   0   sh    , 37u((4/6.2M5$$t+259M5**+d2926Je''(4/6r;   r*   c                      ^  \ rS rSrSrU 4S jrS\S\4S jrS\S\S\4S jr    SS
\	R                  S	-  S\	R                  S	-  S\	R                  S	-  S\	R                  S	-  S\	R                  4
S jjrSrU =r$ )CanineEmbeddingsU   z<Construct the character, position and token_type embeddings.c           	        > [         TU ]  5         Xl        UR                  UR                  -  n[        UR                  5       H3  nSU 3n[        X[        R                  " UR                  U5      5        M5     [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                   5      U l        U R%                  S[&        R(                  " UR*                  5      R-                  S5      SS9  g )NHashBucketCodepointEmbedder_epsposition_idsr   F)
persistent)super__init__confighidden_sizenum_hash_functionsrangesetattrr   	Embeddingnum_hash_bucketschar_position_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferr6   arangemax_position_embeddingsexpand)selfrJ   shard_embedding_sizeiname	__class__s        r<   rI   CanineEmbeddings.__init__X   s     &11V5N5NNv001A1!5DDV-D-DFZ [\ 2 )+V5L5LfN`N`(a%%'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
r;   
num_hashesnum_bucketsc                     U[        [        5      :  a  [        S[        [        5       35      e[        SU n/ nU H  nUS-   U-  U-  nUR                  U5        M!     U$ )aW  
Converts ids to hash bucket ids via multiple hashing.

Args:
    input_ids: The codepoints or other IDs to be hashed.
    num_hashes: The number of hash functions to use.
    num_buckets: The number of hash buckets (i.e. embeddings in each table).

Returns:
    A list of tensors, each of which is the hash bucket IDs from one hash function.
z`num_hashes` must be <= Nr   )len_PRIMES
ValueErrorappend)r]   	input_idsrc   rd   primesresult_tensorsprimehasheds           r<   _hash_bucket_tensors%CanineEmbeddings._hash_bucket_tensorsm   sk     G$7G~FGG*%E 1}-<F!!&)  r;   embedding_sizec                     X#-  S:w  a  [        SU SU S35      eU R                  XUS9n/ n[        U5       H,  u  pxSU 3n	[        X	5      " U5      n
UR	                  U
5        M.     [
        R                  " USS9$ )	zDConverts IDs (e.g. codepoints) into embeddings via multiple hashing.r   zExpected `embedding_size` (z) % `num_hashes` (z) == 0)rc   rd   rA   rF   dim)rh   ro   	enumerategetattrri   r6   cat)r]   rj   rq   rc   rd   hash_bucket_tensorsembedding_shardsr_   hash_bucket_idsr`   shard_embeddingss              r<   _embed_hash_buckets$CanineEmbeddings._embed_hash_buckets   s    &!+:>:JJ\]g\hhnopp"77	fq7r"+,?"@A1!5D&t2?C##$45 #A
 yy)r22r;   Nrj   token_type_idsrD   inputs_embedsreturnc                 @   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2S U24   nUc8  [        R                  " U[        R                  U R                  R
                  S9nUcO  U R                  XR                  R                  U R                  R                  U R                  R                  5      nU R                  U5      nXG-   nU R                  U5      n	X-  nU R                  U5      nU R                  U5      nU$ )NrF   r   dtypedevice)sizerD   r6   zeroslongr   r|   rJ   rK   rL   rP   rS   rQ   rT   rX   )
r]   rj   r~   rD   r   input_shape
seq_lengthrS   
embeddingsposition_embeddingss
             r<   forwardCanineEmbeddings.forward   s     #..*K',,.s3K ^
,,Q^<L!"[[EJJtO`O`OgOghN  44;;22DKK4R4RTXT_T_TpTpM !% : :> J":
";;LI)
^^J/
\\*-
r;   )rT   rQ   rJ   rX   rS   )NNNN)r1   r2   r3   r4   r5   rI   intro   r|   r6   
LongTensorr7   r   r:   __classcell__ra   s   @r<   r>   r>   U   s    F
*# C .3S 3c 3`c 3  .2260426!##d*! ((4/! &&-	!
 ((4/! 
		! !r;   r>   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )CharactersToMolecules   zeConvert character sequence to initial molecule sequence (i.e. downsample) using strided convolutions.c                 6  > [         TU ]  5         [        R                  " UR                  UR                  UR
                  UR
                  S9U l        [        UR                     U l	        [        R                  " UR                  UR                  S9U l
        g )Nin_channelsout_channelskernel_sizestriderB   )rH   rI   r   Conv1drK   downsampling_rateconvr
   
hidden_act
activationrT   rU   r]   rJ   ra   s     r<   rI   CharactersToMolecules.__init__   st    II**++00++	
	 !!2!23f&8&8f>S>STr;   char_encodingr   c                 0   US S 2SS2S S 24   n[         R                  " USS5      nU R                  U5      n[         R                  " USS5      nU R                  U5      nUS S 2SS2S S 24   n[         R                  " X$/SS9nU R                  U5      nU$ )Nr   r      rF   rs   )r6   	transposer   r   rw   rT   )r]   r   cls_encodingdownsampleddownsampled_truncatedresults         r<   r   CharactersToMolecules.forward   s    $Q!QY/ q!<ii.ook1a8ook2 !,AqtQJ 7 L@aH'r;   )rT   r   r   r1   r2   r3   r4   r5   rI   r6   Tensorr   r:   r   r   s   @r<   r   r      s,    oUU\\ ell  r;   r   c                      ^  \ rS rSrSrU 4S jr S
S\R                  S\R                  S-  S\R                  4S jjrS	r	U =r
$ )ConvProjection   z
Project representations from hidden_size*2 back to hidden_size across a window of w = config.upsampling_kernel_size
characters.
c                 ~  > [         TU ]  5         Xl        [        R                  " UR
                  S-  UR
                  UR                  SS9U l        [        UR                     U l
        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )Nr   r   r   rB   )rH   rI   rJ   r   r   rK   upsampling_kernel_sizer   r
   r   r   rT   rU   rV   rW   rX   r   s     r<   rI   ConvProjection.__init__   s    II**Q.++55	
	 !!2!23f&8&8f>S>STzz&"<"<=r;   Ninputsfinal_seq_char_positionsr   c                    [         R                  " USS5      nU R                  R                  S-
  nUS-  nX4-
  n[        R
                  " XE4S5      nU R                  U" U5      5      n[         R                  " USS5      nU R                  U5      nU R                  U5      nU R                  U5      nUnUb  [        S5      eUn	U	$ )Nr   r   r   z,CanineForMaskedLM is currently not supported)r6   r   rJ   r   r   ConstantPad1dr   r   rT   rX   NotImplementedError)
r]   r   r   	pad_totalpad_begpad_endpadr   final_char_seq	query_seqs
             r<   r   ConvProjection.forward   s     A.
 KK66:	q.%1153v;'A.('f%#/
 &&TUU&Ir;   )rT   r   rJ   r   rX   Nr   r   s   @r<   r   r      sI    
>  9="" #(,,"5" 
	" "r;   r   c                      ^  \ rS rSrU 4S jr  SS\R                  S\R                  S\R                  S-  S\S-  S\	\R                  \R                  S-  4   4
S	 jjr
S
rU =r$ )CanineSelfAttentioni  c                   > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        g )Nr   rq   zThe hidden size (z6) is not a multiple of the number of attention heads ())rH   rI   rK   num_attention_headshasattrrh   r   attention_head_sizeall_head_sizer   LinearquerykeyvaluerV   attention_probs_dropout_probrX   r   s     r<   rI   CanineSelfAttention.__init__  s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EFr;   Nfrom_tensor	to_tensorattention_maskoutput_attentionsr   c                    UR                   u  pVnU R                  U5      R                  USU R                  U R                  5      R                  SS5      nU R                  U5      R                  USU R                  U R                  5      R                  SS5      n	U R                  U5      R                  USU R                  U R                  5      R                  SS5      n
[        R                  " XR                  SS5      5      nU[        R                  " U R                  5      -  nUbg  UR                  S:X  aS  [        R                  " USS9nSUR                  5       -
  [        R                  " UR                   5      R"                  -  nX-   n[$        R&                  R)                  USS9nU R+                  U5      n[        R                  " X5      nUR-                  SSSS5      R/                  5       nUR1                  5       S S U R2                  4-   nUR                  " U6 nU(       a  X4nU$ U4nU$ )	NrF   r   r   r   rs   g      ?r   )shaper   viewr   r   r   r   r   r6   matmulmathsqrtndim	unsqueezefloatfinfor   minr   
functionalsoftmaxrX   permute
contiguousr   r   )r]   r   r   r   r   
batch_sizer   _	key_layervalue_layerquery_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                   r<   r   CanineSelfAttention.forward-  s    %0$5$5!
 HHYT*b$":":D<T<TUYq!_ 	 JJy!T*b$":":D<T<TUYq!_ 	 JJ{#T*b$":":D<T<TUYq!_ 	 !<<5H5HR5PQ+dii8P8P.QQ%""a'!&Q!G #&(<(<(>">%++N^NdNdBeBiBi!i/@ --//0@b/I ,,7_B%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CD6G=2 O\M]r;   )r   r   rX   r   r   r   r   NF)r1   r2   r3   r4   rI   r6   r   r7   boolr9   r   r:   r   r   s   @r<   r   r     sy    G, 48).;\\; <<; ))D0	;
  $;; 
u||U\\D00	1; ;r;   r   c                      ^  \ rS rSrU 4S jrS\\R                     S\R                  S\\R                  \R                  4   4S jrSr	U =r
$ )CanineSelfOutputik  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g NrB   )rH   rI   r   r   rK   denserT   rU   rV   rW   rX   r   s     r<   rI   CanineSelfOutput.__init__l  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r;   r.   input_tensorr   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   rX   rT   r]   r.   r   s      r<   r   CanineSelfOutput.forwardr  s7     

=1]3}'CDr;   rT   r   rX   r1   r2   r3   r4   rI   r9   r6   r7   r   r:   r   r   s   @r<   r   r   k  sQ    >"5#4#45EJEVEV	u  %"3"33	4 r;   r   c                      ^  \ rS rSrSr       SS\S\S\S\S\S\4U 4S	 jjjr  SS\\	R                     S\	R                  S
-  S\S
-  S\\	R                  \	R                  S
-  4   4S jjrSrU =r$ )CanineAttentioni{  aB  
Additional arguments related to local attention:

    - **local** (`bool`, *optional*, defaults to `False`) -- Whether to apply local attention.
    - **always_attend_to_first_position** (`bool`, *optional*, defaults to `False`) -- Should all blocks be able to
      attend
    to the `to_tensor`'s first position (e.g. a [CLS] position)? - **first_position_attends_to_all** (`bool`,
    *optional*, defaults to `False`) -- Should the *from_tensor*'s first position be able to attend to all
    positions within the *from_tensor*? - **attend_from_chunk_width** (`int`, *optional*, defaults to 128) -- The
    width of each block-wise chunk in `from_tensor`. - **attend_from_chunk_stride** (`int`, *optional*, defaults to
    128) -- The number of elements to skip when moving to the next block in `from_tensor`. -
    **attend_to_chunk_width** (`int`, *optional*, defaults to 128) -- The width of each block-wise chunk in
    *to_tensor*. - **attend_to_chunk_stride** (`int`, *optional*, defaults to 128) -- The number of elements to
    skip when moving to the next block in `to_tensor`.
always_attend_to_first_positionfirst_position_attends_to_allattend_from_chunk_widthattend_from_chunk_strideattend_to_chunk_widthattend_to_chunk_stridec	                    > [         T	U ]  5         [        U5      U l        [	        U5      U l        X l        XV:  a  [        S5      eXx:  a  [        S5      eX0l        X@l	        XPl
        X`l        Xpl        Xl        g )Nze`attend_from_chunk_width` < `attend_from_chunk_stride` would cause sequence positions to get skipped.z``attend_to_chunk_width` < `attend_to_chunk_stride`would cause sequence positions to get skipped.)rH   rI   r   r]   r   outputlocalrh   r   r   r   r  r  r  
r]   rJ   r  r   r   r   r  r  r  ra   s
            r<   rI   CanineAttention.__init__  s     	'/	&v. 
"=w  !9r  0O,-J*'>$(@%%:"&<#r;   Nr.   r   r   r   c                    U R                   (       d  U R                  XX#5      nUS   nGOUR                  S   =pgU=p/ n
U R                  (       a  U
R	                  S5        SnOSn[        XU R                  5       H-  n[        XlU R                  -   5      nU
R	                  X45        M/     / nU R                  (       a  UR	                  SU45        [        SXpR                  5       H-  n[        X|U R                  -   5      nUR	                  X45        M/     [        U
5      [        U5      :w  a  [        SU
 SU
 S35      e/ n/ n[        X5       H  u  u  nnu  nnUS S 2UU2S S 24   nU	S S 2UU2S S 24   nUS S 2UU2UU24   nU R                  (       aJ  US S 2UU2SS24   n[        R                   " UU/SS9nU	S S 2SS2S S 24   n[        R                   " UU/SS9nU R                  UUUU5      nUR	                  US   5        U(       d  M  UR	                  US   5        M     [        R                   " USS9nU R#                  XQ5      nU4nU R                   (       d
  UWSS  -   nU$ U[%        W5      -   nU$ )	Nr   r   )r   r   z/Expected to have same number of `from_chunks` (z) and `to_chunks` (z). Check strides.r   rs   )r  r]   r   r   ri   rM   r  r   r   r  r  rf   rh   zipr   r6   rw   r  r9   )r]   r.   r   r   self_outputsattention_outputfrom_seq_lengthto_seq_lengthr   r   from_chunks
from_startchunk_start	chunk_end	to_chunksattention_output_chunksattention_probs_chunksfrom_endto_startto_endfrom_tensor_chunkto_tensor_chunkattention_mask_chunkcls_attention_maskcls_positionattention_outputs_chunkr   s                              r<   r   CanineAttention.forward  s    zz99]>eL+A.;.A.A!.DDO&33K K11""6* 

$Z$B_B_`t?[?[1[\	""K#;<  a
 I11  !]!34$Q7R7RST=W=W/WX	  +!9:  T ;3y>1 Ek] S$$/=0AC  ')#%'">A+>Y:&X(:6$/:h3F0I$J!"+Ax,A"B (6aH9LhW]o6]'^$77)7:h;NPQRSPS8S)T&+0996HJ^5_ef+g(#,Q!QY#7L&+ii0OUV&WO*.))%8LN_+' (../Fq/IJ$$*112I!2LM% ?Z(  %yy)@aH;;'7G#%zzQR 00G  &< ==Gr;   )	r   r  r   r  r  r   r  r  r]   FFF   r!  r!  r!  r   )r1   r2   r3   r4   r5   r   r   rI   r9   r6   r7   r   r:   r   r   s   @r<   r   r   {  s    & 05.3'*(+%(&)= *.	=
 (,= "%= #&=  #= !$= =F 48).	GU../G ))D0G  $;	G
 
u  %"3"3d"::	;G Gr;   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )CanineIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )rH   rI   r   r   rK   intermediate_sizer   
isinstancer   strr
   intermediate_act_fnr   s     r<   rI   CanineIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r;   r.   r   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r(  r]   r.   s     r<   r   CanineIntermediate.forward  s&    

=100?r;   r+  )
r1   r2   r3   r4   rI   r6   r7   r   r:   r   r   s   @r<   r#  r#    s,    9U%6%6 5;L;L  r;   r#  c                      ^  \ rS rSrU 4S jrS\\R                     S\R                  S\R                  4S jrSr	U =r
$ )CanineOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )rH   rI   r   r   r%  rK   r   rT   rU   rV   rW   rX   r   s     r<   rI   CanineOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r;   r.   r   r   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   s      r<   r   CanineOutput.forward  s5    

=1]3}'CDr;   r   r   r   s   @r<   r/  r/    s?    >U5+<+<%= UM^M^ chctct  r;   r/  c                      ^  \ rS rSrU 4S jr  SS\\R                     S\R                  S-  S\S-  S\\R                  \R                  S-  4   4S jjr	S	 r
S
rU =r$ )CanineLayeri  c	           
         > [         T	U ]  5         UR                  U l        SU l        [	        UUUUUUUU5      U l        [        U5      U l        [        U5      U l	        g Nr   )
rH   rI   chunk_size_feed_forwardseq_len_dimr   	attentionr#  intermediater/  r  r  s
            r<   rI   CanineLayer.__init__  se     	'-'E'E$(+)#$!"	
 /v6"6*r;   Nr.   r   r   r   c                     U R                  UUUS9nUS   nUSS  n[        U R                  U R                  U R                  U5      nU4U-   nU$ )N)r   r   r   )r:  r   feed_forward_chunkr8  r9  )r]   r.   r   r   self_attention_outputsr  r   layer_outputs           r<   r   CanineLayer.forward/  ss     "&/ "0 "

 2!4(,0##T%A%A4CSCSUe
  /G+r;   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r;  r  )r]   r  intermediate_outputr@  s       r<   r>  CanineLayer.feed_forward_chunkE  s)    "//0@A{{#6Ir;   )r:  r8  r;  r  r9  r   )r1   r2   r3   r4   rI   r9   r6   r7   r   r   r>  r:   r   r   s   @r<   r5  r5    sz    +< 48).	U../ ))D0  $;	
 
u  %"3"3d"::	;, r;   r5  c                      ^  \ rS rSr       SU 4S jjr    SS\\R                     S\R                  S-  S\S-  S\S-  S\S-  S	\\	-  4S
 jjr
SrU =r$ )CanineEncoderiK  c	                    > [         T
U ]  5         Xl        [        R                  " [        UR                  5       V	s/ s H  n	[        UUUUUUUU5      PM     sn	5      U l        SU l	        g s  sn	f r   )
rH   rI   rJ   r   
ModuleListrM   num_hidden_layersr5  layergradient_checkpointing)r]   rJ   r  r   r   r   r  r  r  r   ra   s             r<   rI   CanineEncoder.__init__L  s}     	]] v778 9A 31+,)*	 9

 ',#s   A-Nr.   r   r   output_hidden_statesreturn_dictr   c                 $   U(       a  SOS nU(       a  SOS n[        U R                  5       H0  u  pU(       a  Xa4-   nU	" XU5      n
U
S   nU(       d  M(  XzS   4-   nM2     U(       a  Xa4-   nU(       d  [        S XU4 5       5      $ [        UUUS9$ )Nr0   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r0   .0vs     r<   	<genexpr>(CanineEncoder.forward.<locals>.<genexpr>  s     m$[q$[   	)r,   r.   r/   )ru   rJ  r9   r   )r]   r.   r   r   rM  rN  all_hidden_statesall_self_attentionsr_   layer_modulelayer_outputss              r<   r   CanineEncoder.forwardj  s     #7BD$5b4(4OA#$58H$H!(HYZM)!,M  &91=M<O&O#  5   14D Dm]GZ$[mmm++*
 	
r;   )rJ   rK  rJ  r   )NFFT)r1   r2   r3   r4   rI   r9   r6   r7   r   r   r   r:   r   r   s   @r<   rF  rF  K  s     (-&+ #!$!",B 48).,1#'
U../
 ))D0
  $;	

 #Tk
 D[
 
	 
 
r;   rF  c                   h   ^  \ rS rSrU 4S jrS\\R                     S\R                  4S jrSr	U =r
$ )CaninePooleri  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )rH   rI   r   r   rK   r   Tanhr   r   s     r<   rI   CaninePooler.__init__  s9    YYv1163E3EF
'')r;   r.   r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r   )r]   r.   first_token_tensorpooled_outputs       r<   r   CaninePooler.forward  s6     +1a40

#566r;   )r   r   r   r   s   @r<   r]  r]    s1    $
U5+<+<%= %BSBS  r;   r]  c                   h   ^  \ rS rSrU 4S jrS\\R                     S\R                  4S jrSr	U =r
$ )CaninePredictionHeadTransformi  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g r   )rH   rI   r   r   rK   r   r&  r   r'  r
   transform_act_fnrT   rU   r   s     r<   rI   &CaninePredictionHeadTransform.__init__  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr;   r.   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   rh  rT   r,  s     r<   r   %CaninePredictionHeadTransform.forward  s4    

=1--m<}5r;   )rT   r   rh  r   r   s   @r<   rf  rf    s2    UU5+<+<%= %BSBS  r;   rf  c                   h   ^  \ rS rSrU 4S jrS\\R                     S\R                  4S jrSr	U =r
$ )CanineLMPredictionHeadi  c                   > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  SS9U l        [        R                  " [        R                  " UR                  5      5      U l        g )NT)bias)rH   rI   rf  	transformr   r   rK   
vocab_sizedecoder	Parameterr6   r   ro  r   s     r<   rI   CanineLMPredictionHead.__init__  s[    6v> yy!3!3V5F5FTRLLV->->!?@	r;   r.   r   c                 J    U R                  U5      nU R                  U5      nU$ r   )rp  rr  r,  s     r<   r   CanineLMPredictionHead.forward  s$    }5]3r;   )ro  rr  rp  r   r   s   @r<   rm  rm    s2    AU5+<+<%= %BSBS  r;   rm  c                   n   ^  \ rS rSrU 4S jrS\\R                     S\\R                     4S jrSr	U =r
$ )CanineOnlyMLMHeadi  c                 B   > [         TU ]  5         [        U5      U l        g r   )rH   rI   rm  predictionsr   s     r<   rI   CanineOnlyMLMHead.__init__  s    1&9r;   sequence_outputr   c                 (    U R                  U5      nU$ r   rz  )r]   r|  prediction_scoress      r<   r   CanineOnlyMLMHead.forward  s     !,,_=  r;   r~  )r1   r2   r3   r4   rI   r9   r6   r   r   r:   r   r   s   @r<   rx  rx    s6    :!u||,! 
u||	! !r;   rx  c                   <   ^  \ rS rSr% \\S'   SrSrU 4S jrSr	U =r
$ )CaninePreTrainedModeli  rJ   canineTc                   > [         TU ]  U5        [        U[        5      (       a\  [        R
                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      5        g g )NrF   rE   )rH   _init_weightsr&  r>   initcopy_rD   r6   rZ   r   r\   )r]   modulera   s     r<   r  #CaninePreTrainedModel._init_weights  s^    f%f.//JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 0r;   r0   )r1   r2   r3   r4   r   r8   base_model_prefixsupports_gradient_checkpointingr  r:   r   r   s   @r<   r  r    s!     &*#i ir;   r  c                   |  ^  \ rS rSrSU 4S jjrS rS\R                  S\4S jr	S\R                  S\S	\R                  4S
 jr
\        SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\S-  S	\\-  4S jj5       rSrU =r$ )CanineModeli  c                   > [         TU ]  U5        Xl        [        R                  " U5      nSUl        [        U5      U l        [        USSSUR                  UR                  UR                  UR                  S9U l
        [        U5      U l        [        U5      U l        [        U5      U l        [        U5      U l        U(       a  [#        U5      OSU l        U R'                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
r   TF)r  r   r   r   r  r  r  N)rH   rI   rJ   copydeepcopyrI  r>   char_embeddingsrF  local_transformer_strideinitial_char_encoderr   chars_to_moleculesencoderr   
projectionfinal_char_encoderr]  pooler	post_init)r]   rJ   add_pooling_layershallow_configra   s       r<   rI   CanineModel.__init__  s    
 	 v.+,(/7$1,1*/$*$C$C%+%D%D"("A"A#)#B#B	%
! #8"?$V,(0"/"?.?l6*T 	r;   c                    UR                   S   UR                   S   pCUR                   S   n[        R                  " X#SU45      R                  5       n[        R                  " X4S4[        R
                  UR                  S9nXb-  nU$ )a  
Create 3D attention mask from a 2D tensor mask.

Args:
    from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
    to_mask: int32 Tensor of shape [batch_size, to_seq_length].

Returns:
    float Tensor of shape [batch_size, from_seq_length, to_seq_length].
r   r   )r   r   r   )r   r6   reshaper   onesfloat32r   )r]   r   to_maskr   r  r  broadcast_onesmasks           r<   )_create_3d_attention_mask_from_input_mask5CanineModel._create_3d_attention_mask_from_input_mask  s     '2&7&7&:K<M<Ma<POa(--a)GHNNP
 *q)IQVQ^Q^gnguguv 'r;   char_attention_maskr   c                     UR                   u  p4[        R                  " XSU45      n[        R                  R	                  X"S9" UR                  5       5      n[        R                  " USS9nU$ )z[Downsample 2D character attention mask to 2D molecule attention mask using MaxPool1d layer.r   )r   r   rF   rs   )r   r6   r  r   	MaxPool1dr   squeeze)r]   r  r   r   char_seq_lenpoolable_char_maskpooled_molecule_maskmolecule_attention_masks           r<   _downsample_attention_mask&CanineModel._downsample_attention_mask  sp     $7#<#< 
"]]+>QP\@]^  %xx11>O1j$$& 

 #(--0D""M&&r;   	moleculeschar_seq_lengthr   c                     U R                   R                  nUSS2SS2SS24   n[        R                  " XCSS9nUSS2SS2SS24   nX#-  n[        R                  " UXs-   SS9n[        R                  " XX/SS9$ )zDRepeats molecules to make them the same length as the char sequence.Nr   r   )repeatsrt   rF   rs   )rJ   r   r6   repeat_interleaverw   )	r]   r  r  ratemolecules_without_extra_clsrepeatedlast_moleculeremainder_lengthremainder_repeateds	            r<   _repeat_moleculesCanineModel._repeat_molecules'  s     {{,,&/12q&9#**+FZ\] "!RS!),*1"44$+	
 yy(7R@@r;   Nrj   r   r~   rD   r   r   rM  rN  c	                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOS n
U(       a  SOS nUb  UOU R                   R                  nUb  Ub  [	        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       S S nO[	        S5      eUu  pUb  UR                  OUR                  nUc  [        R                  " X4US9nUc$  [        R                  " U[        R                  US9nU R                  X,5      nU R                  X R                   R                  S9nU R                  UUUR                  S   45      nU R!                  UUUUS9nU R#                  Ub  UOUU5      nU R%                  UUUUS	9nUR&                  nU R)                  U5      nU R+                  UUUUUS
9nUS   nU R,                  b  U R-                  U5      OS nU R/                  UUS   S9n[        R0                  " UU/SS9nU R3                  U5      nU R5                  UUUUS	9nUR&                  nU(       a7  U(       a  UR6                  OUS   nU
UR6                  -   U-   UR6                  -   n
U(       a7  U(       a  UR8                  OUS   n UUR8                  -   U -   UR8                  -   nU(       d  UU4n!U![;        S X4 5       5      -  n!U!$ [=        UUU
US9$ )Nr0   zDYou cannot specify both input_ids and inputs_embeds at the same timerF   z5You have to specify either input_ids or inputs_embeds)r   r   )r   )rj   rD   r~   r   )r   r   rM  )r   r   rM  rN  r   )r  rs   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r0   rQ  s     r<   rT  &CanineModel.forward.<locals>.<genexpr>  s     a'O!AA'OrV  )r,   r-   r.   r/   )rJ   r   rM  rN  rh   %warn_if_padding_and_no_attention_maskr   r   r6   r  r   r   get_extended_attention_maskr  r   r   r  r  r  r,   r  r  r  r  rw   r  r  r.   r/   r9   r*   )"r]   rj   r   r~   rD   r   r   rM  rN  kwargsrW  rX  r   r   r   r   extended_attention_maskr   extended_molecule_attention_maskinput_char_embeddingsr  init_chars_encoder_outputsinput_char_encodinginit_molecule_encodingencoder_outputsmolecule_sequence_outputrc  repeated_moleculesconcatr|  final_chars_encoder_outputsdeep_encoder_hidden_statesdeep_encoder_self_attentionsr  s"                                     r<   r   CanineModel.forward@  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 #7BD$5b4%0%<k$++BYBY ]%>cdd"66yQ#..*K&',,.s3KTUU!,
%.%:!!@T@T!"ZZ*)A6RN!"[[EJJvVN 150P0PQ_0m"&"A"Akk.K.K #B #
 :>9Y9Y#j2I2O2OPR2S%T:
(
 !% 4 4%)'	 !5 !
 #LL".IM>
 &*%>%>!./!5	 &? &
" 9JJ  "&!8!89L!M ,,";/!5# ' 
 $31#5 AEAX$<=^b "334L^ijl^m3n /1CD"M //&1 '+&=&=2/!5	 '> '
# 6GGJU)F)F[jkl[m&!,::;,- .;;<  IT?+E+EZijlZm(#,778./ .889   %}5Fea(9'OaaaFM+-'+*	
 	
r;   )r  r  rJ   r  r  r  r  r  )T)NNNNNNNN)r1   r2   r3   r4   rI   r  r6   r   r   r  r  r   r   r7   r   r9   r*   r   r:   r   r   s   @r<   r  r    s)    D6'ell '_b '"A5<< A# ARWR^R^ A2  .237260426)-,0#'T
##d*T
 ))D0T
 ((4/	T

 &&-T
 ((4/T
  $;T
 #TkT
 D[T
 
-	-T
 T
r;   r  z
    CANINE Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                   .  ^  \ rS rSrU 4S jr\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	S-  S\	S-  S\	S-  S\
\-  4S jj5       rSrU =r$ )CanineForSequenceClassificationi  c                 0  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r   rH   rI   
num_labelsr  r  r   rV   rW   rX   r   rK   
classifierr  r   s     r<   rI   (CanineForSequenceClassification.__init__  i      ++!&)zz&"<"<=))F$6$68I8IJ 	r;   Nrj   r   r~   rD   r   labelsr   rM  rN  r   c
                 P   U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nU R	                  U5      nSnUGb  U R                   R
                  c  U R                  S:X  a  SU R                   l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R
                  S:X  aI  [        5       nU R                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R
                  S:X  a=  [        5       nU" UR                  SU R                  5      UR                  S5      5      nO,U R                   R
                  S:X  a  [        5       nU" X5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [!        UUUR"                  UR$                  S	9$ )
ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr   r~   rD   r   r   rM  rN  r   
regressionsingle_label_classificationmulti_label_classificationrF   r   losslogitsr.   r/   )rJ   rN  r  rX   r  problem_typer  r   r6   r   r   r   r  r   r   r   r   r.   r/   )r]   rj   r   r~   rD   r   r  r   rM  rN  r  r   rc  r  r  loss_fctr  s                    r<   r   'CanineForSequenceClassification.forward  s   ( &1%<k$++BYBY++))%'/!5#  	
  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r;   r  r  rX   r  	NNNNNNNNN)r1   r2   r3   r4   rI   r   r6   r   r7   r   r9   r   r   r:   r   r   s   @r<   r  r    s    	  .237260426*.)-,0#'D
##d*D
 ))D0D
 ((4/	D

 &&-D
 ((4/D
   4'D
  $;D
 #TkD
 D[D
 
)	)D
 D
r;   r  c                   .  ^  \ rS rSrU 4S jr\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	S-  S\	S-  S\	S-  S\
\-  4S jj5       rSrU =r$ )CanineForMultipleChoicei2  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  S5      U l
        U R                  5         g r7  )rH   rI   r  r  r   rV   rW   rX   r   rK   r  r  r   s     r<   rI    CanineForMultipleChoice.__init__4  sV     !&)zz&"<"<=))F$6$6: 	r;   Nrj   r   r~   rD   r   r  r   rM  rN  r   c
                 X   U	b  U	OU R                   R                  n	Ub  UR                  S   OUR                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUU	S9nUS   nU R                  U5      nU R                  U5      nUR                  SU5      nSnUb  [        5       nU" X5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
Nr   rF   r   r  r   r  )rJ   rN  r   r   r   r  rX   r  r   r   r.   r/   )r]   rj   r   r~   rD   r   r  r   rM  rN  r  num_choicesr   rc  r  reshaped_logitsr  r  r  s                      r<   r   CanineForMultipleChoice.forward>  s   X &1%<k$++BYBY,5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ++))%'/!5#  	
  
]3/ ++b+6')HO4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r;   )r  r  rX   r  )r1   r2   r3   r4   rI   r   r6   r   r7   r   r9   r   r   r:   r   r   s   @r<   r  r  2  s      .237260426*.)-,0#'W
##d*W
 ))D0W
 ((4/	W

 &&-W
 ((4/W
   4'W
  $;W
 #TkW
 D[W
 
*	*W
 W
r;   r  c                   .  ^  \ rS rSrU 4S jr\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	S-  S\	S-  S\	S-  S\
\-  4S jj5       rSrU =r$ )CanineForTokenClassificationi  c                 0  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r   r  r   s     r<   rI   %CanineForTokenClassification.__init__  r  r;   Nrj   r   r~   rD   r   r  r   rM  rN  r   c
                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nU R	                  U5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.

Example:

```python
>>> from transformers import AutoTokenizer, CanineForTokenClassification
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("google/canine-s")
>>> model = CanineForTokenClassification.from_pretrained("google/canine-s")

>>> inputs = tokenizer(
...     "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="pt"
... )

>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> predicted_token_class_ids = logits.argmax(-1)

>>> # Note that tokens are classified rather then input words which means that
>>> # there might be more predicted token classes than words.
>>> # Multiple token classes might account for the same word
>>> predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
>>> predicted_tokens_classes  # doctest: +SKIP
```

```python
>>> labels = predicted_token_class_ids
>>> loss = model(**inputs, labels=labels).loss
>>> round(loss.item(), 2)  # doctest: +SKIP
```Nr  r   rF   r   r  )rJ   rN  r  rX   r  r   r   r  r   r.   r/   )r]   rj   r   r~   rD   r   r  r   rM  rN  r  r   r|  r  r  r  r  s                    r<   r   $CanineForTokenClassification.forward  s    ` &1%<k$++BYBY++))%'/!5#  	
 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
r;   r  r  )r1   r2   r3   r4   rI   r   r6   r   r7   r   r9   r   r   r:   r   r   s   @r<   r  r    s    	  .237260426*.)-,0#'O
##d*O
 ))D0O
 ((4/	O

 &&-O
 ((4/O
   4'O
  $;O
 #TkO
 D[O
 
&	&O
 O
r;   r  c                   N  ^  \ rS rSrU 4S jr\          SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\	S-  S\	S-  S\	S-  S\
\-  4S jj5       rSrU =r$ )CanineForQuestionAnsweringi  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r   )
rH   rI   r  r  r  r   r   rK   
qa_outputsr  r   s     r<   rI   #CanineForQuestionAnswering.__init__  sS      ++!&)))F$6$68I8IJ 	r;   Nrj   r   r~   rD   r   start_positionsend_positionsr   rM  rN  r   c                    U
b  U
OU R                   R                  n
U R                  UUUUUUU	U
S9nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      nUR                  S5      nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5        UR                  SU5        [        US9nU" X5      nU" UU5      nUU-   S-  nU
(       d  UU4USS  -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )	Nr  r   r   rF   rs   )ignore_indexr   )r  start_logits
end_logitsr.   r/   )rJ   rN  r  r  splitr  rf   r   clamp_r   r   r.   r/   )r]   rj   r   r~   rD   r   r  r  r   rM  rN  r  r   r|  r  r  r  
total_lossignored_indexr  
start_lossend_lossr  s                          r<   r   "CanineForQuestionAnswering.forward  s    &1%<k$++BYBY++))%'/!5#  	
 "!*1#)<<r<#: j#++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M""1m4  M2']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r;   )r  r  r  )
NNNNNNNNNN)r1   r2   r3   r4   rI   r   r6   r   r7   r   r9   r   r   r:   r   r   s   @r<   r  r    s     .2372604263715)-,0#'=
##d*=
 ))D0=
 ((4/	=

 &&-=
 ((4/=
 ))D0=
 ''$.=
  $;=
 #Tk=
 D[=
 
-	-=
 =
r;   r  )r  r  r  r  r5  r  r  )=r5   r  r   dataclassesr   r6   r   torch.nnr   r   r    r	   r  activationsr
   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   configuration_caniner   
get_loggerr1   loggerrg   r*   Moduler>   r   r   r   r   r   r#  r/  r5  rF  r]  rf  rm  rx  r  r  r  r  r  r  __all__r0   r;   r<   <module>r     sB      !   A A & ! 9  . 6 , . 
		H	% U  7; 7 7:^ryy ^B)BII )X5RYY 5pN")) Nbryy  xbii xv 299 5, 5p=
BII =
@299 BII "RYY &
!		 
! iO i i }
' }
 }
@ Q
&; Q
Q
h c
3 c
 c
L \
#8 \
 \
~ I
!6 I
 I
Xr;   