o
    f2h20                     @   s0  d dl Z d dlZd dlZd dlmZmZ d dlmZmZ d dl	m
Z
mZmZmZ d dlZi ddddd	d
dddddddddddddddddddddd d!d"d#d$d%d&i d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHi dIdJdKdLdMdNdOdPdQdRdSdTdUdVdWdXdYdZd[d\d]d^d_d`dadbdcdddedfdgdhdidji dkdldmdndodpdqdrdsdtdudvdwdxdydzd{d|d}d~ddddddddddddddi ddddddddddddddddddddddddddddddddddddddddddddddddddZi dd e D dddddddd7d7ddddÜZeG ddń dŃZeddƍddedefdd̄Zeddƍdddd͜dededee dee def
ddӄZdS )    N)	dataclassfield)cached_property	lru_cache)DictListOptionalTupleenenglishzhchinesedegermanesspanishrurussiankokoreanfrfrenchjajapanesept
portuguesetrturkishplpolishcacatalannldutchararabicsvswedishititalianid
indonesianhihindififinnishvi
vietnamesehehebrewuk	ukrainianelgreekmsmalaycsczechroromaniandadanishhu	hungariantatamilno	norwegianththaiururduhrcroatianbg	bulgarianlt
lithuanianlalatinmimaoriml	malayalamcywelshskslovaktetelugufapersianlvlatvianbnbengalisrserbianazazerbaijanisl	slovenianknkannadaetestonianmk
macedonianbrbretoneubasqueis	icelandichyarmeniannenepalimn	mongolianbsbosniankkkazakhsqalbanianswswahiliglgalicianmrmarathipapunjabisisinhalakmkhmersnshonayoyorubasosomaliaf	afrikaansococcitankageorgianbe
belarusiantgtajiksdsindhigugujaratiamamharicyiyiddishlolaouzuzbekfofaroesehtzhaitian creolepspashtotkturkmennnnynorskmtmaltesesanskritluxembourgishmyanmartibetantagalogmalagasyassamesetatarhawaiianlingalahausabashkirjavanese	sundanese	cantonese)salbmybotlmgastthawlnhabajwsuyuec                 C   s   i | ]\}}||qS  r   ).0codelanguager   r   e/var/www/html/alexa/alex_system/speach-to-text/venv/lib/python3.10/site-packages/whisper/tokenizer.py
<dictcomp>s   s    r   r   r   )burmese	valencianflemishhaitianletzeburgeschpushtopanjabi	moldavianmoldovan	sinhalese	castilianmandarinc                   @   s  e Zd ZU dZejed< eed< dZe	e
 ed< dZe	e
 ed< dZee ed< eed	Zee
ef ed
< dd Zdd Zdee de
fddZdee de
fddZedefddZedefddZedefddZedefddZedefddZedefdd Zedefd!d"Zedefd#d$Zedefd%d&Z edefd'd(Z!d)d* Z"edee fd+d,Z#edee
 fd-d.Z$edee fd/d0Z%edee fd1d2Z&d3ee fd4d5Z'd3ee fd6d7Z(d3ee fd8d9Z)dS ):	TokenizerzIA thin wrapper around `tiktoken` providing quick access to special tokensencodingnum_languagesNr   taskr   sot_sequence)default_factoryspecial_tokensc           	      C   s   | j jD ]}| j |}|| j|< q| jd }| jd }| jd }tt d | j }|g}| jd urA|	|d |
| j  | jd urT| jdkrM|n|}|	| t|| _d S )N<|startoftranscript|><|translate|><|transcribe|>   
transcribe)r   special_tokens_setencode_single_tokenr   tuple	LANGUAGESkeysr   r   appendindexr   r   )	selfspecialspecial_tokensot	translater   langsr   
task_tokenr   r   r   __post_init__   s   





zTokenizer.__post_init__c                 K      | j j|fi |S N)r   encode)r   textkwargsr   r   r   r      s   zTokenizer.encode	token_idsreturnc                    s&    fdd|D } j j|fi |S )Nc                    s   g | ]	}| j k r|qS r   )timestamp_begin)r   tr   r   r   
<listcomp>   s    z$Tokenizer.decode.<locals>.<listcomp>r   decoder   r  r  r   r  r   r
     s   zTokenizer.decodec                 K   r   )z
        Timestamp tokens are above other special tokens' id range and are ignored by `decode()`.
        This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
        r	  r  r   r   r   decode_with_timestamps   s   z Tokenizer.decode_with_timestampsc                 C   s   | j jS r   )r   	eot_tokenr  r   r   r   eot   s   zTokenizer.eotc                 C   
   | j d S )Nr   r   r  r   r   r   r         
zTokenizer.transcribec                 C   r  )Nr   r  r  r   r   r   r      r  zTokenizer.translatec                 C   r  )Nr   r  r  r   r   r   r      r  zTokenizer.sotc                 C   r  )N<|startoflm|>r  r  r   r   r   sot_lm   r  zTokenizer.sot_lmc                 C   r  )N<|startofprev|>r  r  r   r   r   sot_prev   r  zTokenizer.sot_prevc                 C   r  )N<|nospeech|>r  r  r   r   r   	no_speech   r  zTokenizer.no_speechc                 C   r  )N<|notimestamps|>r  r  r   r   r   no_timestamps   r  zTokenizer.no_timestampsc                 C   r  )Nz<|0.00|>r  r  r   r   r   r     r  zTokenizer.timestamp_beginc                 C   s   | j du r	td| | j S )zGReturns the token id corresponding to the value of the `language` fieldNz6This tokenizer does not have language token configured)r   
ValueErrorto_language_tokenr  r   r   r   language_token   s   
zTokenizer.language_tokenc                 C   s.   | j d| dd  }r|S td| d)N<||>z	Language z not found in tokenizer.)r   getKeyError)r   r   tokenr   r   r   r     s   zTokenizer.to_language_tokenc                 C   sB   g }| j  D ]\}}|dtv r|| qt|d | j S )N<|>)r   itemsstripr   r   r   r   )r   resultr!  token_idr   r   r   all_language_tokens   s   
zTokenizer.all_language_tokensc                    s   t  fdd jD S )Nc                 3   s"    | ]}  |gd V  qdS )r"  N)r
  r$  )r   _lr  r   r   	<genexpr>   s     z/Tokenizer.all_language_codes.<locals>.<genexpr>)r   r'  r  r   r  r   all_language_codes   s   zTokenizer.all_language_codesc                 C   s   t t| j| jg S r   )r   listr   r  r  r   r   r   #sot_sequence_including_notimestamps   s   z-Tokenizer.sot_sequence_including_notimestampsc                 C   s   t d}|d 7 }td}tdd |D sJ | jdd | jdd h}|t | D ]$}| j|| jd	| fD ]}t|d
ksK||v rR||d  q?q/tt	|S )u  
        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
        annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.

        - ♪♪♪
        - ( SPEAKING FOREIGN LANGUAGE )
        - [DAVID] Hey there,

        keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
        u#   "#()*+/:;<=>@[\]^_`{|}~「」『』uK   << >> <<< >>> -- --- -( -[ (' (" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪u   ♩♪♫♬♭♮♯c                 s   s,    | ]}d t |  kodkn  V  qdS )i@&  i&  N)ord)r   cr   r   r   r)    s   * z.Tokenizer.non_speech_tokens.<locals>.<genexpr>z -r   z ' r   )
r+  splitsetallr   r   lenaddr   sorted)r   symbolsmiscellaneousr%  symboltokensr   r   r   non_speech_tokens   s     
zTokenizer.non_speech_tokensr9  c                 C   s   | j dv r
| |S | |S )N>   r   r   r   rF   r   r   )r   split_tokens_on_unicodesplit_tokens_on_spaces)r   r9  r   r   r   split_to_word_tokens  s   


zTokenizer.split_to_word_tokensc           
      C   s   |  |}d}g }g }g }d}|D ]-}|| |  |}	||	vs,|||	|  |kr>||	 || g }|t|	7 }q||fS )Nu   �r   )r  r   r   r3  )
r   r9  decoded_fullreplacement_charwordsword_tokenscurrent_tokensunicode_offsetr!  decodedr   r   r   r;    s&   




z!Tokenizer.split_tokens_on_unicodec                 C   s   |  |\}}g }g }t||D ]=\}}|d | jk}|d}	| tjv }
|s3|	s3|
s3t|dkr>|| || q|d | |d< |d 	| q||fS )Nr   r/  )
r;  zipr  
startswithr$  stringpunctuationr3  r   extend)r   r9  subwordssubword_tokens_listr@  rA  subwordsubword_tokensr   
with_spacerI  r   r   r   r<  7  s   

z Tokenizer.split_tokens_on_spaces)*__name__
__module____qualname____doc__tiktokenEncoding__annotations__intr   r   strr   r   r	   r   dictr   r   r   r   r   r
  r  r   r  r   r   r   r  r  r  r  r  r  r  r'  r*  r,  r:  r=  r;  r<  r   r   r   r   r      sX   
 
#	r   )maxsizegpt2c   namer   c                 C   s   t jt jtd|  d}dd dd t|D D }t|}i }ddgd	d
 tt	 d | D dddddddd
 t
dD }|D ]
}|||< |d7 }qMtjt j||d||dS )Nassetsz	.tiktokenc                 S   s    i | ]\}}t |t|qS r   )base64	b64decoderW  )r   r!  rankr   r   r   r   M  s    z get_encoding.<locals>.<dictcomp>c                 s   s    | ]	}|r|  V  qd S r   )r0  )r   liner   r   r   r)  O  s    zget_encoding.<locals>.<genexpr>z<|endoftext|>r   c                 S   s   g | ]}d | dqS )r  r  r   )r   langr   r   r   r  W  s    z get_encoding.<locals>.<listcomp>r   r   r  r  r  r  c                 S   s   g | ]}d |d ddqS )r  g{Gz?z.2fr  r   )r   ir   r   r   r  ^  s    i  r   zJ's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+)r]  explicit_n_vocabpat_strmergeable_ranksr   )ospathjoindirname__file__openr3  r+  r   r   rangerT  rU  basename)r]  r   
vocab_pathranksn_vocabr   specialsr!  r   r   r   get_encodingJ  sF   	


rt  )r   r   r   multilingualr   r   r  c                C   sz   |d ur|  }|tvr|tv rt| }ntd| | r)d}|p#d}|p'd}nd}d }d }t||d}t||||dS )NzUnsupported language: ru  r
   r   r[  )r]  r   )r   r   r   r   )lowerr   TO_LANGUAGE_CODEr  rt  r   )ru  r   r   r   encoding_namer   r   r   r   get_tokenizern  s"   

ry  )r[  r\  )r_  rh  rH  dataclassesr   r   	functoolsr   r   typingr   r   r   r	   rT  r   r#  rw  r   rX  rW  rt  boolry  r   r   r   r   <module>   s   	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVh G#