o
    D2h%                      @  sh   d Z ddlmZ ddlZddlZddlZG dd dZ	dd ddZ	dd!ddZd"ddZ	dd Z
dS )#zJThis is an educational implementation of the byte pair encoding algorithm.    )annotationsNc                   @  sZ   e Zd Zd!ddZd"d#ddZd$ddZd%ddZd&ddZed'ddZ	edd Z
d S )(SimpleBytePairEncodingpat_strstrmergeable_ranksdict[bytes, int]returnNonec                C  s0   || _ || _dd | D | _t|| _dS )zCreates an Encoding object.c                 S  s   i | ]\}}||qS  r
   ).0token_bytestokenr
   r
   i/var/www/html/alexa/alex_system/speach-to-text/venv/lib/python3.10/site-packages/tiktoken/_educational.py
<dictcomp>       z3SimpleBytePairEncoding.__init__.<locals>.<dictcomp>N)r   r   items_decoderregexcompile_pat)selfr   r   r
   r
   r   __init__   s   zSimpleBytePairEncoding.__init__colourtext	visualise
str | None	list[int]c                 C  sB   | j |}g }|D ]}|d}t| j||d}|| q
|S )z`Encodes a string into tokens.

        >>> enc.encode("hello world")
        [388, 372]
        utf-8)r   )r   findallencode
bpe_encoder   extend)r   r   r   wordstokensword
word_bytesword_tokensr
   r
   r   r      s   
zSimpleBytePairEncoding.encoder#   bytesc                   s   d  fdd|D S )znDecodes a list of tokens into bytes.

        >>> enc.decode_bytes([388, 372])
        b'hello world'
            c                 3  s    | ]} j | V  qd S Nr   r   r   r   r
   r   	<genexpr>-   s    z6SimpleBytePairEncoding.decode_bytes.<locals>.<genexpr>)joinr   r#   r
   r,   r   decode_bytes'   s   z#SimpleBytePairEncoding.decode_bytesc                 C  s   |  |jdddS )u   Decodes a list of tokens into a string.

        Decoded bytes are not guaranteed to be valid UTF-8. In that case, we replace
        the invalid bytes with the replacement character "�".

        >>> enc.decode([388, 372])
        'hello world'
        r   replaceerrors)r0   decoder/   r
   r
   r   r4   /   s   	zSimpleBytePairEncoding.decodelist[bytes]c                   s    fdd|D S )zDecodes a list of tokens into a list of bytes.

        Useful for visualising how a string is tokenised.

        >>> enc.decode_tokens_bytes([388, 372])
        [b'hello', b' world']
        c                   s   g | ]} j | qS r
   r*   r+   r,   r
   r   
<listcomp>B   r   z>SimpleBytePairEncoding.decode_tokens_bytes.<locals>.<listcomp>r
   r/   r
   r,   r   decode_tokens_bytes:   s   z*SimpleBytePairEncoding.decode_tokens_bytestraining_data
vocab_sizeintc                 C  s   t | ||d}t||dS )z#Train a BPE tokeniser on some data!)datar9   r   r   r   )	bpe_trainr   )r8   r9   r   r   r
   r
   r   trainD   s   zSimpleBytePairEncoding.trainc                 C  s$   t | tr
t| } t| j| jdS )Nr<   )
isinstancer   tiktokenget_encodingr   _pat_str_mergeable_ranks)encodingr
   r
   r   from_tiktokenJ   s
   

z$SimpleBytePairEncoding.from_tiktokenN)r   r   r   r   r   r	   r   )r   r   r   r   r   r   )r#   r   r   r'   )r#   r   r   r   )r#   r   r   r5   )r8   r   r9   r:   r   r   )__name__
__module____qualname__r   r   r0   r4   r7   staticmethodr>   rE   r
   r
   r
   r   r      s    





r   r   r   r   inputr'   r   r   r   r   c           
        s  dd |D }	 |r|dv rt | n|dkrt| d }d }tt|d d |dd  D ]\}} |d |d  }|d urM|d u sI||k rM|}|}q.|d u rSn |d usYJ |d | || ||d   g ||d	 d   }q|rxt   fd
d|D }	|	S )Nc                 S     g | ]}t |gqS r
   r'   r   br
   r
   r   r6   V   r   zbpe_encode.<locals>.<listcomp>Tr   colorsimple   r      c                   s   g | ]} | qS r
   r
   )r   partr   r
   r   r6   s   s    )visualise_tokensprint	enumeratezipget)
r   rK   r   partsmin_idxmin_rankipairrankr#   r
   rW   r   r    S   s0   
&2r    r;   r   r9   r:   r   c                   s  |dk rt di }tdD ]	}||t|g< qdd t|| D }t||k rt  |D ]}t|d d |dd  D ]
} |  d7  < q?q0t	  fddd	}	|	d
 |	d  }
t|}|||
< g }|D ]K}g }d
}|t|d k r|| ||d  f|	kr|
|
 |d7 }n|
||  |d7 }|t|d k sw|t|d kr|
||  |
| qi|}|rtd|	d
  d|	d   td|
 dt| d |dv rtd tdd |d d D  n|dkrtd |d d D ]}t| qtd t||k s*|S )N   z;vocab_size must be at least 256, so we can encode all bytesc                 S  s    g | ]}d d | dD qS )c                 S  rL   r
   rM   rN   r
   r
   r   r6      r   z(bpe_train.<locals>.<listcomp>.<listcomp>r   )r   )r   r$   r
   r
   r   r6      s    zbpe_train.<locals>.<listcomp>rS   rT   c                   s    |  S r)   r
   )xstatsr
   r   <lambda>   s    zbpe_train.<locals>.<lambda>)keyr   rU   z The current most common pair is z + zSo we made z our zth tokenrP   z9Now the first fifty words in our training data look like:c                 S  s   g | ]	}|D ]}|qqS r
   r
   )r   r$   r   r
   r
   r   r6          2   rR   z:Now the first twenty words in our training data look like:   
)
ValueErrorranger'   r   r   lencollectionsCounterr[   maxappendrY   rX   )r;   r9   r   r   ranksr`   r"   piecera   most_common_pairr   r   	new_wordsr$   new_wordr
   re   r   r=   w   s\   




-r=   token_valuesr5   r	   c                 C  s   dd dD }dd | D }d}d }|D ].}||t |  }||kr2||d t |  }||ks2J |}|t |7 }t|| dd qtd	 d S )
Nc                 S  s   g | ]}d | dqS )z[48;5;mr
   )r   r`   r
   r
   r   r6      s    z$visualise_tokens.<locals>.<listcomp>)         M   P   D      c                 S  s   g | ]	}|j d ddqS )r   r1   r2   )r4   )r   rd   r
   r
   r   r6      ri   r   rT    )endz[0m)ro   rY   )ry   
backgroundunicode_token_valuesrunning_length
last_colorr   rQ   r
   r
   r   rX      s   rX   c                  C  s   d} t t}| }W d    n1 sw   Y  tj|d| d}td |d}||dks4J ||dks=J |	|ddgksHJ |S )	NzN's|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+iX  )r9   r   zJThis is the sequence of merges performed in order to encode 'hello world':zhello worlds   hello worlds   hellos    world)
open__file__readr   r>   rY   r   r4   r0   r7   )gpt2_patternfr;   encr#   r
   r
   r   train_simple_encoding   s   


r   rF   )r   r   rK   r'   r   r   r   r   )
r;   r   r9   r:   r   r   r   r   r   r   )ry   r5   r   r	   )__doc__
__future__r   rp   r   r@   r   r    r=   rX   r   r
   r
   r
   r   <module>   s    H%
E