o
    f2h-                  
   @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	m
Z
 d dlZd dlZd dlm  mZ d dlmZmZ ddlmZ ddlmZ dd	lmZ z
d d
lmZ dZW n eeefye   dZdZY nw eG dd dZG dd dej Z G dd dej!Z!G dd dej"Z"d$ddZ#edd Z$G dd dej%Z&G dd dej%Z'G dd dej%Z(G d d! d!ej%Z)G d"d# d#ej%Z*dS )%    N)contextmanager)	dataclass)DictIterableOptionalTuple)Tensornn   )decode)detect_language)
transcribe)scaled_dot_product_attentionTFc                   @   s^   e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< eed	< eed
< dS )ModelDimensionsn_melsn_audio_ctxn_audio_staten_audio_headn_audio_layern_vocab
n_text_ctxn_text_staten_text_headn_text_layerN)__name__
__module____qualname__int__annotations__ r   r   a/var/www/html/alexa/alex_system/speach-to-text/venv/lib/python3.10/site-packages/whisper/model.pyr      s   
 r   c                       s&   e Zd Zdedef fddZ  ZS )	LayerNormxreturnc                    s   t  | |jS N)superforwardfloattypedtypeselfr"   	__class__r   r    r&   (   s   zLayerNorm.forward)r   r   r   r   r&   __classcell__r   r   r,   r    r!   '   s    r!   c                   @   s   e Zd ZdedefddZdS )Linearr"   r#   c                 C   s2   t || j|j| jd u rd S | j|jS r$   )Flinearweighttor)   biasr*   r   r   r    r&   -   s   zLinear.forwardN)r   r   r   r   r&   r   r   r   r    r/   ,   s    r/   c                       s2   e Zd Zdededee def fddZ  ZS )Conv1dr"   r2   r4   r#   c                    s.   t  |||j|d u rd S ||jS r$   )r%   _conv_forwardr3   r)   )r+   r"   r2   r4   r,   r   r    r6   6   s
   
zConv1d._conv_forward)r   r   r   r   r   r6   r.   r   r   r,   r    r5   5   s    r5   '  c                 C   s   |d dksJ t ||d d  }t| t|d  }t| ddt jf |t jddf  }tjt|t|gddS )z*Returns sinusoids for positional embedding   r   r
   Ndim)	nplogtorchexparangenewaxiscatsincos)lengthchannelsmax_timescalelog_timescale_incrementinv_timescalesscaled_timer   r   r    	sinusoids>   s
   *rJ   c                  c   s*    t j} zdt _d V  W | t _d S | t _w )NF)MultiHeadAttentionuse_sdpa)
prev_stater   r   r    disable_sdpaG   s   rN   c                       s   e Zd ZdZdedef fddZ			ddedee d	ee d
ee fddZ		ddededed	ee de
ejeej f f
ddZ  ZS )rK   Tn_staten_headc                    sH   t    || _t||| _t||dd| _t||| _t||| _d S )NF)r4   )r%   __init__rP   r/   querykeyvalueout)r+   rO   rP   r,   r   r    rQ   T   s   
zMultiHeadAttention.__init__Nr"   xamaskkv_cachec           
      C   s   |  |}|d u s|d u s| j|vr)| |d u r|n|}| |d u r%|n|}n
|| j }|| j }| ||||\}}	| ||	fS r$   )rR   rS   rT   qkv_attentionrU   )
r+   r"   rV   rW   rX   qkvwvqkr   r   r    r&   \   s   


zMultiHeadAttention.forwardrZ   r[   r\   r#   c                 C   sv  |j \}}}|| j d }|jg |j d d | jdR  dddd}|jg |j d d | jdR  dddd}|jg |j d d | jdR  dddd}trztjrzt||||d uoe|dkd}	|	ddddjdd}
d }|
|fS || || 	dd	 }|d ur||d |d |f  }|
 }tj|dd
|j}|| ddddjdd}
| }|
|fS )Ng      пr8   r   r
      )	is_causal)	start_dimr9   )shaperP   viewpermuteSDPA_AVAILABLErK   rL   r   flatten	transposer'   r0   softmaxr3   r)   detach)r+   rZ   r[   r\   rW   n_batchn_ctxrO   scalearU   r^   wr   r   r    rY   r   s(   000
z MultiHeadAttention.qkv_attentionNNNr$   )r   r   r   rL   r   rQ   r   r   dictr&   r   r=   rY   r.   r   r   r,   r    rK   Q   s6    
rK   c                
       sZ   e Zd Zddededef fddZ			dded	ee d
ee dee fddZ	  Z
S )ResidualAttentionBlockFrO   rP   cross_attentionc                    s|   t    t||| _t|| _|rt||nd | _|r t|nd | _|d }t	t
||t t
||| _t|| _d S )N   )r%   rQ   rK   attnr!   attn_ln
cross_attncross_attn_lnr	   
Sequentialr/   GELUmlpmlp_ln)r+   rO   rP   rt   n_mlpr,   r   r    rQ      s   

zResidualAttentionBlock.__init__Nr"   rV   rW   rX   c                 C   sZ   || j | |||dd  }| jr!|| j| |||dd  }|| | | }|S )NrW   rX   r   )rX   )rv   rw   rx   ry   r|   r}   )r+   r"   rV   rW   rX   r   r   r    r&      s
   zResidualAttentionBlock.forward)Frq   )r   r   r   r   boolrQ   r   r   rr   r&   r.   r   r   r,   r    rs      s    rs   c                
       s@   e Zd Zdededededef
 fddZdefd	d
Z  ZS )AudioEncoderr   rm   rO   rP   n_layerc                    sp   t    t|ddd| _tdddd| _| dt| t fddt	|D | _
t| _d S )	Nr`   r
   )kernel_sizepaddingr8   )r   strider   positional_embeddingc                    s   g | ]}t  qS r   rs   .0_rP   rO   r   r    
<listcomp>   s    z)AudioEncoder.__init__.<locals>.<listcomp>)r%   rQ   r5   conv1conv2register_bufferrJ   r	   
ModuleListrangeblocksr!   ln_post)r+   r   rm   rO   rP   r   r,   r   r    rQ      s   
zAudioEncoder.__init__r"   c                 C   s   t | |}t | |}|ddd}|jdd | jjks&J d|| j |j}| j	D ]}||}q2| 
|}|S )zt
        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
            the mel spectrogram of the audio
        r   r8   r
   Nzincorrect audio shape)r0   gelur   r   rf   rd   r   r3   r)   r   r   )r+   r"   blockr   r   r    r&      s   


zAudioEncoder.forward)r   r   r   r   rQ   r   r&   r.   r   r   r,   r    r      s    r   c                
       sN   e Zd Zdededededef
 fddZdd	ed
edee fddZ  Z	S )TextDecoderr   rm   rO   rP   r   c                    s   t    t|| _tt|| _t	 fddt
|D | _t| _t||tj d}| jd|dd d S )Nc                    s   g | ]	}t  d dqS )T)rt   r   r   r   r   r    r      s    z(TextDecoder.__init__.<locals>.<listcomp>r
   rW   F
persistent)r%   rQ   r	   	Embeddingtoken_embedding	Parameterr=   emptyr   r   r   r   r!   lnfill_r;   inftriu_r   )r+   r   rm   rO   rP   r   rW   r,   r   r    rQ      s   

zTextDecoder.__init__Nr"   rV   rX   c                 C   s   |rt t| jd nd}| || j|||jd    }||j}| jD ]}|||| j	|d}q)| 
|}|t| jj|jdd  }|S )z
        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
            the text tokens
        xa : torch.Tensor, shape = (batch_size, n_audio_ctx, n_audio_state)
            the encoded audio features to be attended on
        r
   r   r_   r   )nextitervaluesrd   r   r   r3   r)   r   rW   r   r=   ri   r2   r'   )r+   r"   rV   rX   offsetr   logitsr   r   r    r&      s   

zTextDecoder.forwardr$   )
r   r   r   r   rQ   r   r   rr   r&   r.   r   r   r,   r    r      s    $r   c                       s   e Zd Zdef fddZdefddZdejfdd	Z	d
ejdejfddZ
dejd
ejdeeejf fddZedd Zedd Zedd Zddee fddZeZeZeZ  ZS )Whisperdimsc                    s   t    || _t| jj| jj| jj| jj| jj| _	t
| jj| jj| jj| jj| jj| _tj| jj| jjtjd}d|| jjd d < | jd| dd d S )Nr)   Tr8   alignment_headsFr   )r%   rQ   r   r   r   r   r   r   r   encoderr   r   r   r   r   r   decoderr=   zerosr   r   	to_sparse)r+   r   	all_headsr,   r   r    rQ      s*   
	zWhisper.__init__dumpc                 C   sP   t jtt|td }t	|
| jj| jj}| jd| dd d S )Nr   r   Fr   )r;   
frombuffergzip
decompressbase64	b85decoder   copyr=   
from_numpyreshaper   r   r   r   r   )r+   r   arrayrW   r   r   r    set_alignment_heads  s   
zWhisper.set_alignment_headsmelc                 C   s
   |  |S r$   )r   )r+   r   r   r   r    embed_audio  s   
zWhisper.embed_audiotokensaudio_featuresc                 C   s   |  ||S r$   )r   )r+   r   r   r   r   r    r   "  s   zWhisper.logitsr#   c                 C   s   |  || |S r$   )r   r   )r+   r   r   r   r   r    r&   %  s   zWhisper.forwardc                 C   s   t |  jS r$   )r   
parametersdevicer+   r   r   r    r   *  s   zWhisper.devicec                 C   s   | j jdkS )Ni  )r   r   r   r   r   r    is_multilingual.  s   zWhisper.is_multilingualc                 C   s   | j jd t| j S )Ni5  )r   r   r   r   r   r   r   r    num_languages2  s   zWhisper.num_languagesNcachec                    sP    duri  ni  g  fdddt jffdd}j|  fS )a  
        The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value
        tensors calculated for the previous positions. This method returns a dictionary that stores
        all caches, and the necessary hooks for the key and value projection modules that save the
        intermediate tensors to be reused during later calculations.

        Returns
        -------
        cache : Dict[nn.Module, torch.Tensor]
            A dictionary object mapping the key/value projection modules to its cache
        hooks : List[RemovableHandle]
            List of PyTorch RemovableHandle objects to stop the hooks to be called
        Nc                    sP   |  vs|j d jjkr| | <  |  S tj |  |gdd  | <  |  S )Nr
   r9   )rd   r   r   r=   rA   rk   )moduler   output)r   r+   r   r    save_to_cacheG  s
   z5Whisper.install_kv_cache_hooks.<locals>.save_to_cachelayerc                    s6   t | tr | j  | j d S d S r$   )
isinstancerK   appendrS   register_forward_hookrT   )r   )hooksr   r   r    install_hooksO  s   
z5Whisper.install_kv_cache_hooks.<locals>.install_hooks)r	   Moduler   apply)r+   r   r   r   )r   r   r   r+   r    install_kv_cache_hooks6  s   zWhisper.install_kv_cache_hooksr$   )r   r   r   r   rQ   bytesr   r=   r   r   r   r   strr&   propertyr   r   r   r   rr   r   detect_language_functionr   transcribe_functionr   decode_functionr   r.   r   r   r,   r    r      s,    	



!r   )r7   )+r   r   
contextlibr   dataclassesr   typingr   r   r   r   numpyr;   r=   torch.nn.functionalr	   
functionalr0   r   decodingr   r   r   r   r   r   r   rg   ImportErrorRuntimeErrorOSErrorr   r!   r/   r5   rJ   rN   r   rK   rs   r   r   r   r   r   r   r    <module>   s@    	
		
	= !-