o
    f2hvr                     @   s  d dl Z d dlZd dlZd dlZd dlmZmZmZmZm	Z	 d dl
Zd dlZd dlZddlmZmZmZmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' erpdd	l(m)Z) dd
dddddddddddddde	e*ej+ej,f dee- de	e.ee.df f dee. dee. dee. de-dee* de-d e*d!e*d"e	e*ee. f d#ee. fd$d%Z/d&d' Z0e1d(kre0  dS dS ))    N)TYPE_CHECKINGListOptionalTupleUnion   )FRAMES_PER_SECOND
HOP_LENGTHN_FRAMES	N_SAMPLESSAMPLE_RATElog_mel_spectrogrampad_or_trim)DecodingOptionsDecodingResult)add_word_timestamps)	LANGUAGESTO_LANGUAGE_CODEget_tokenizer)	exact_divformat_timestampget_end
get_writer	make_safeoptional_floatoptional_intstr2bool)Whisper)        皙?g?333333?g?      ?333333@      r    TF   "'“¿([{-   "'.。,，!！?？:：”)]}、0)verbosetemperaturecompression_ratio_thresholdlogprob_thresholdno_speech_thresholdcondition_on_previous_textinitial_promptword_timestampsprepend_punctuationsappend_punctuationsclip_timestampshallucination_silence_thresholdmodelr   audior'   r(   .r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   c          O         s*	   ddr	tjntj}jtdkr+tj rtd |tjkr+td tj}|tjkr4dd< t	|j
jtd}|jd t }t|t t } d	d
d
u rjs\dd	< n3|rbtd t|tj|}|\}}t||j dd	< |d
urtdtd	     d	 } dd}tjj||dt|trdd |r|dng D }dd |D }t|dkr| d t|d dkr| | t!t"|d
d
d |dd
d }d|	r|dkrtd dtj#dt$f fdd}d}|| d t%tj
j&}|t t }g }g }d} |d
ur3'd |(  }!|)|! ng }!d!td"td#tj#d$t$ffd%d&}"t*j*|d'|dud(+}#d)}$|t|k ru|| \}%}&|%k rh|%|&kr|d7 }|t|k r~|| d qTtt t }'tt t t }(t+t| |& })|d
d
|) f }|)t t }*t|tj|}|| d
 d*< ||}+t,|+j-},d
ur|+j.k}-d
ur|+j/krd}-|-r|)7 qT}.g }/d+t0dtfd,d-	dt1t0 dt2f	fd.d/}0d0t3t0 dt1t0 fd1d2}1|,4j5}2|2d3d
 6 ddgk}3t7|2d
d |2dd
 @ d }4|48d t|4dkr|46 }5|3rQ|5 t|, d}6|5D ]0}7|,|6|7 }8|8d 9 j5 }9|8d 9 j5 }:|/ |"|'|9|  |'|:|  |8|+d4 |7}6qU|3r|)7 nK|,|6d  9 j5 };|;| 7 n9|*}<|,|2: ;  }=t|=dkr|=d 9 j5kr|=d 9 j5 };|;| }<|/ |"|'|'|< |,|+d4 |)7 |	rt<|/||)|
||$d5 |3st=|/}>|>d
ur|>|'krt>|>t? |d
ur|}?|3s+t=|/}>|>d
ur+|>|'kr+|(|> }@|@|?kr't>|>t? n|.|) |1|/}A|Ad
urN|0|ArN|Ad! |' }B|B|?krN|.t>|Bt?  qT|$}Ct@t|/D ]}D|/|D }E|Ed6 scqV|0|Er|1|/|Dd d
 }F|Fd
ur|Fd6 d d! }Gn|'|* }G|Ed! |C |?kp|Ed! |?k p|Ed! |' d7k }H|G|Ed"  |?kp|0|Fp|(|Ed"  d7k }I|Hr|Irt>t|'d |Ed! t? ||Ed"  |?k r|g |/|Dd
<  n|Ed" }CqVt=|/}>|>d
ur|>}$|r|/D ]&}E|Ed! |Ed" |Ed8 }J}K}Ld9tA|J d:tA|K d;|L }MttB|M qtC|/D ]#\}N}E|Ed! |Ed" ks/|Ed8 ( d<kr;d<|Ed8< g |Ed#< g |Ed6< q|)d=d tC|/t|d>D  |)d?d |/D  |r`|+jDd@krdt|} |#Et+||.  |t|k s[W d
   n	1 sw   Y  t0F|t|!d
 ||dAS )Bay  
    Transcribe an audio file using Whisper

    Parameters
    ----------
    model: Whisper
        The Whisper model instance

    audio: Union[str, np.ndarray, torch.Tensor]
        The path to the audio file to open, or the audio waveform

    verbose: bool
        Whether to display the text being decoded to the console. If True, displays all the details,
        If False, displays minimal details. If None, does not display anything

    temperature: Union[float, Tuple[float, ...]]
        Temperature for sampling. It can be a tuple of temperatures, which will be successively used
        upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.

    compression_ratio_threshold: float
        If the gzip compression ratio is above this value, treat as failed

    logprob_threshold: float
        If the average log probability over sampled tokens is below this value, treat as failed

    no_speech_threshold: float
        If the no_speech probability is higher than this value AND the average log probability
        over sampled tokens is below `logprob_threshold`, consider the segment as silent

    condition_on_previous_text: bool
        if True, the previous output of the model is provided as a prompt for the next window;
        disabling may make the text inconsistent across windows, but the model becomes less prone to
        getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.

    word_timestamps: bool
        Extract word-level timestamps using the cross-attention pattern and dynamic time warping,
        and include the timestamps for each word in each segment.

    prepend_punctuations: str
        If word_timestamps is True, merge these punctuation symbols with the next word

    append_punctuations: str
        If word_timestamps is True, merge these punctuation symbols with the previous word

    initial_prompt: Optional[str]
        Optional text to provide as a prompt for the first window. This can be used to provide, or
        "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
        to make it more likely to predict those word correctly.

    decode_options: dict
        Keyword arguments to construct `DecodingOptions` instances

    clip_timestamps: Union[str, List[float]]
        Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to process.
        The last end timestamp defaults to the end of the file.

    hallucination_silence_threshold: Optional[float]
        When word_timestamps is True, skip silent periods longer than this threshold (in seconds)
        when a possible hallucination is detected

    Returns
    -------
    A dictionary containing the resulting text ("text") and segment-level details ("segments"), and
    the spoken language ("language"), which is detected when `decode_options["language"]` is None.
    fp16Tcpuz2Performing inference on CPU when CUDA is availablez0FP16 is not supported on CPU; using FP32 insteadF)paddinglanguageNenz]Detecting language using up to the first 30 seconds. Use `--language` to specify the language)keyzDetected language: task
transcribe)num_languagesr9   r<   c                 S   s   g | ]}t |qS  )float.0tsr?   r?   f/var/www/html/alexa/alex_system/speach-to-text/venv/lib/python3.10/site-packages/whisper/transcribe.py
<listcomp>   s    ztranscribe.<locals>.<listcomp>,c                 S   s   g | ]}t |t qS r?   )roundr   rA   r?   r?   rD   rE          r      r   u*   "'“¿([{-"'.。,，!！?？:：”)]}、	translatez:Word-level timestamps on translations may not be reliable.segmentreturnc                    s   t ttfr
gn}d }|D ]V}i }|dkr'|dd  |dd  n|dd  tdi |d|i}| |}d} d urK|j krKd}d urV|jk rVd}d ura|jkrad}|sf |S q|S )	Nr   	beam_sizepatiencebest_ofr(   FTr?   )	
isinstanceintr@   popr   decodecompression_ratioavg_logprobno_speech_prob)rK   temperaturesdecode_resulttkwargsoptionsneeds_fallback)r)   decode_optionsr*   r3   r+   r(   r?   rD   decode_with_fallback   s4   


z(transcribe.<locals>.decode_with_fallback startendtokensresultc              
      s@   |  }fdd|D } | ||||j|j|j|jd	S )Nc                    s   g | ]	}| j k r|qS r?   )eot)rB   token)	tokenizerr?   rD   rE      s    z3transcribe.<locals>.new_segment.<locals>.<listcomp>)	seekr`   ra   textrb   r(   rU   rT   rV   )tolistrS   r(   rU   rT   rV   )r`   ra   rb   rc   text_tokens)rg   rf   r?   rD   new_segment   s   ztranscribe.<locals>.new_segmentframes)totalunitdisabler   promptwordc                 S   s`   |  dd}| d | d  }d}|dk r|d7 }|dk r$|d| d 7 }|d	kr.||d	 7 }|S )
Nprobabilityr   ra   r`   g333333?r!   g/$?          @)get)rq   rr   durationscorer?   r?   rD   word_anomaly_score,  s   z&transcribe.<locals>.word_anomaly_scorec                    sd   | d u s| d s
dS  fdd| d D }|d d }t fdd|D }|dkp1|d	 t|kS )
NwordsFc                    s   g | ]
}|d   vr|qS )rq   r?   rB   w)punctuationr?   rD   rE   ;  s    z:transcribe.<locals>.is_segment_anomaly.<locals>.<listcomp>   c                 3   s    | ]} |V  qd S )Nr?   rz   )rx   r?   rD   	<genexpr>=  s    z9transcribe.<locals>.is_segment_anomaly.<locals>.<genexpr>   g{Gz?)sumlen)rK   ry   rw   )r|   rx   r?   rD   is_segment_anomaly8  s   z&transcribe.<locals>.is_segment_anomalysegmentsc                 S   s   t dd | D d S )Nc                 s   s    | ]	}|d  r|V  qdS )ry   Nr?   )rB   sr?   r?   rD   r~   A  s    z9transcribe.<locals>.next_words_segment.<locals>.<genexpr>)next)r   r?   r?   rD   next_words_segment@  s   z&transcribe.<locals>.next_words_segment)r`   ra   rb   rc   )r   r3   rf   mel
num_framesr/   r0   last_speech_timestampry   rt   rh   [z --> z]  c                 S   s   g | ]
\}}d |i|qS )idr?   )rB   irK   r?   r?   rD   rE     s    
)r`   c                 S   s   g | ]}|d  D ]}|qqS )rb   r?   )rB   rK   re   r?   r?   rD   rE     s    g      ?)rh   r   r9   )Gru   torchfloat16float32devicecudais_availablewarningswarnr   dimsn_melsr   shaper
   r@   r	   r   is_multilingualprintr   todetect_languagemaxr   titler   r>   rP   strsplitr   appendlistzipTensorr   r   n_audio_ctxencodestripextendtqdmmintensorrb   rV   rU   dictr   boolr   getimestamp_beginri   whereadd_itemnonzeroflattenr   r   rG   r   ranger   r   	enumerater(   updaterS   )Or3   r4   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r]   dtyper   content_framescontent_durationmel_segment_probsr9   r<   seek_points
seek_clipsr^   clip_idxinput_stridetime_precision
all_tokensall_segmentsprompt_reset_sinceinitial_prompt_tokensrk   pbarr   seek_clip_startseek_clip_endtime_offsetwindow_end_timesegment_sizesegment_durationrc   rb   should_skipprevious_seekcurrent_segmentsr   r   timestamp_tokenssingle_timestamp_endingconsecutiveslices
last_slicecurrent_slicesliced_tokensstart_timestamp_posend_timestamp_poslast_timestamp_posrv   
timestampslast_word_end	thresholdremaining_durationfirst_segmentgaphal_last_endsirK   next_segmenthal_next_startsilence_beforesilence_afterr`   ra   rh   liner   r?   )
r)   r]   r*   r3   r+   r|   rg   r(   rf   rx   rD   r=   &   s  S








"
"(







"











$  qr=   c                     s  ddl m fdd} tjtjd}|jddtdd	 |jd
d| dd |jdtd dd |jdtj	 r8dnddd |jddtddd |jddtdg ddd |jd t
d!d"d |jd#td$d$d%gd&d |jd'td tt td(d) t D  d*d |jd+td,d-d |jd.td/d0d |jd1td/d2d |jd3td d4d |jd5td d6d |jd7td8d9d |jd:td d;d |jd<t
d!d=d |jd>t
d!d?d |jd@tdAdBd |jdCtdDdEd |jdFtdGdHd |jdItdJdKd |jdLt
dMdNd |jdOtdPdQd |jdRtdSdTd |jdUt
dMdVd |jdWtd dXd |jdYtd dZd |jd[td d\d |jd]td,d^d |jd_td`dad |jdbtdcdd | j  de} df} dg} dh} di}tj|d!dj |dkr dl dmvr dl d urt| dn dl  do dp dl<  dq} dr }d urtt|ds|}n|g} dt }	d,krt|	 ddul m}
 |
|||dv}t||}g dw} dx s|D ]} | r| dy| dz q d{ r d| std}  d~ r d| rtd  fdd|D } dD ]C}zt!||fdq|i }|||fi | W q  t"yc } zt#$  t%d| dt&|j' dt|  W Y d }~q d }~ww d S )Nr   available_modelsc                    s,   |   v st j| r| S td   d)Nzmodel should be one of z or path to a model checkpoint)ospathexists
ValueError)namer   r?   rD   valid_model_name  s
   zcli.<locals>.valid_model_name)formatter_classr4   +zaudio file(s) to transcribe)nargstypehelpz--modelturboz name of the Whisper model to use)defaultr   r   z--model_dirz>the path to save model files; uses ~/.cache/whisper by default)r   r   r   z--devicer   r6   z#device to use for PyTorch inference)r   r   z--output_dirz-o.zdirectory to save the outputsz--output_formatz-fall)txtvttsrttsvjsonr   zSformat of the output file; if not specified, all available formats will be produced)r   r   choicesr   z	--verboseTz4whether to print out the progress and debug messagesz--taskr=   rJ   zawhether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')z
--languagec                 S   s   g | ]}|  qS r?   )r   )rB   kr?   r?   rD   rE   
  s    zcli.<locals>.<listcomp>zHlanguage spoken in the audio, specify None to perform language detectionz--temperaturer   ztemperature to use for samplingz	--best_of   z<number of candidates when sampling with non-zero temperaturez--beam_sizezHnumber of beams in beam search, only applicable when temperature is zeroz
--patiencezoptional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam searchz--length_penaltyzoptional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by defaultz--suppress_tokensz-1zcomma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuationsz--initial_promptz:optional text to provide as a prompt for the first window.z--condition_on_previous_textzif True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loopz--fp16z5whether to perform inference in fp16; True by defaultz#--temperature_increment_on_fallbackr   zhtemperature to increase when falling back when the decoding fails to meet either of the thresholds belowz--compression_ratio_thresholdr"   zUif the gzip compression ratio is higher than this value, treat the decoding as failedz--logprob_thresholdr#   zUif the average log probability is lower than this value, treat the decoding as failedz--no_speech_thresholdr    zif the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silencez--word_timestampsFzQ(experimental) extract word-level timestamps and refine the results based on themz--prepend_punctuationsr$   zNif word_timestamps is True, merge these punctuation symbols with the next wordz--append_punctuationsr%   zRif word_timestamps is True, merge these punctuation symbols with the previous wordz--highlight_wordszT(requires --word_timestamps True) underline each word as it is spoken in srt and vttz--max_line_widthze(requires --word_timestamps True) the maximum number of characters in a line before breaking the linez--max_line_countzJ(requires --word_timestamps True) the maximum number of lines in a segmentz--max_words_per_linezk(requires --word_timestamps True, no effect with --max_line_width) the maximum number of words in a segmentz	--threadsz]number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADSz--clip_timestampsr&   zcomma-separated list start,end,start,end,... timestamps (in seconds) of clips to process, where the last end timestamp defaults to the end of the filez!--hallucination_silence_thresholdz(requires --word_timestamps True) skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected)r   r   r3   	model_dir
output_diroutput_formatr   )exist_okz.enr9   >   r:   Englishz) is an English-only model but receipted 'z'; using English instead.r:   r(   !temperature_increment_on_fallbackgzo ?threads)
load_model)r   download_root)highlight_wordsmax_line_countmax_line_widthmax_words_per_liner.   z--z  requires --word_timestamps Truer  r  z7--max_line_count has no effect without --max_line_widthr  z8--max_words_per_line has no effect with --max_line_widthc                    s   i | ]}|  |qS r?   )rR   )rB   arg)argsr?   rD   
<dictcomp>R  rH   zcli.<locals>.<dictcomp>z	Skipping z due to z: )(r   r   argparseArgumentParserArgumentDefaultsHelpFormatteradd_argumentr   r   r   r   r   sortedr   keysr   r@   r   r   
parse_args__dict__rR   r   makedirsendswithr   r   tuplenparangeset_num_threadsr	  r   errorr=   	Exception	traceback	print_excr   r   __name__)r   parser
model_namer  r  r  r   r(   	incrementr  r	  r3   writerword_optionsoptionwriter_args
audio_pathrc   er?   )r  r   rD   cli  s   2












2r.  __main__)2r  r   r"  r   typingr   r   r   r   r   numpyr  r   r   r4   r   r	   r
   r   r   r   r   decodingr   r   timingr   rf   r   r   r   utilsr   r   r   r   r   r   r   r   r3   r   r   ndarrayr   r   r@   r=   r.  r$  r?   r?   r?   rD   <module>   s~    $	(	

   Rg
