o
    f2hD                     @   s2  d dl Z d dlmZ d dlmZmZ d dlmZmZ d dl	Z
d dlZd dlm  mZ ddlmZ dZdZd	Zd
Zee ZeeeZed ZeeeZeeeZefdedefddZefdddedefddZedddedejfddZ 		 	d!deee
j!ejf dededeeeej"f  fdd Z#dS )"    N)	lru_cache)CalledProcessErrorrun)OptionalUnion   )	exact_divi>  i           filesrc                 C   s   ddddd| dddd	d
ddt |dg}z
t|dddj}W n ty4 } ztd|j  |d}~ww t|tj	
 tjd S )a?  
    Open an audio file and read as mono waveform, resampling as necessary

    Parameters
    ----------
    file: str
        The audio file to open

    sr: int
        The sample rate to resample the audio if necessary

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    ffmpegz-nostdinz-threads0z-iz-fs16lez-ac1z-acodec	pcm_s16lez-ar-T)capture_outputcheckzFailed to load audio: Ng      @)strr   stdoutr   RuntimeErrorstderrdecodenp
frombufferint16flattenastypefloat32)r   r   cmdoute r$   a/var/www/html/alexa/alex_system/speach-to-text/venv/lib/python3.10/site-packages/whisper/audio.py
load_audio   s"   r&   )axislengthr(   c                C   s   t | rC| j| |kr| j|t j|| jdd} | j| |k rAdg| j }d|| j|  f||< t| dd |ddd D } | S | j| |krS| j	t
||d	} | j| |k rqdg| j }d|| j|  f||< t| |} | S )
zO
    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
    )device)dimindex)r   r   r   c                 S   s   g | ]	}|D ]}|qqS r$   r$   ).0sizespadr$   r$   r%   
<listcomp>N   s    zpad_or_trim.<locals>.<listcomp>Nr'   )indicesr(   )torch	is_tensorshapeindex_selectaranger*   ndimFr/   takeranger   )arrayr)   r(   
pad_widthsr$   r$   r%   pad_or_trimA   s"   
 
r=   )maxsizen_melsreturnc                 C   sz   |dv sJ d| t jt jtdd}tj|dd}t|d|  	| W  d   S 1 s6w   Y  dS )	ad  
    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
    Allows decoupling librosa dependency; saved using:

        np.savez_compressed(
            "mel_filters.npz",
            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
            mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
        )
    >   P      zUnsupported n_mels: assetszmel_filters.npzF)allow_picklemel_N)
ospathjoindirname__file__r   loadr2   
from_numpyto)r*   r?   filters_pathfr$   r$   r%   mel_filters[   s
   $rP   rA   audiopaddingr*   c           
      C   s   t | st| trt| } t | } |dur| |} |dkr(t| d|f} t 	t
| j}t j| t
t|dd}|dddf  d }t| j|}|| }t j|dd	 }	t |	|	 d
 }	|	d d }	|	S )ap  
    Compute the log-Mel spectrogram of

    Parameters
    ----------
    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz

    n_mels: int
        The number of Mel-frequency filters, only 80 is supported

    padding: int
        Number of zero samples to pad to the right

    device: Optional[Union[str, torch.device]]
        If given, the audio tensor is moved to this device before STFT

    Returns
    -------
    torch.Tensor, shape = (80, n_frames)
        A Tensor that contains the Mel spectrogram
    Nr   T)windowreturn_complex.r'   r   g|=)ming       @g      @)r2   r3   
isinstancer   r&   rL   rM   r8   r/   hann_windowN_FFTr*   stft
HOP_LENGTHabsrP   clamplog10maximummax)
rQ   r?   rR   r*   rS   rY   
magnitudesfiltersmel_speclog_specr$   r$   r%   log_mel_spectrogramn   s"   



rd   )rA   r   N)$rF   	functoolsr   
subprocessr   r   typingr   r   numpyr   r2   torch.nn.functionalnn
functionalr8   utilsr   SAMPLE_RATErX   rZ   CHUNK_LENGTH	N_SAMPLESN_FRAMESN_SAMPLES_PER_TOKENFRAMES_PER_SECONDTOKENS_PER_SECONDr   intr&   r=   TensorrP   ndarrayr*   rd   r$   r$   r$   r%   <module>   sB    


(