o
    "i/                     @  s  U d dl mZ d dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZ d
dlmZmZmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$ eeddtddZ%eedduddZ&eeddvddZ'e(dd e) D Z*de+d< dd e*D Z,de+d< eeddwd!d"Z-eeddud#d$Z.eeddud%d&Z/eeddud'd(Z0eeddud)d*Z1eeddud+d,Z2eeddud-d.Z3eeddud/d0Z4eeddud1d2Z5eeddud3d4Z6eeddud5d6Z7eeddud7d8Z8eeddud9d:Z9eeddud;d<Z:eeddud=d>Z;ee<eddxd@dAZ=eeddudBdCZ>dydzdHdIZ?edJdd{dLdMZ@d|dOdPZAd}dRdSZBd~ddWdXZCdd\d]ZDdd^d_ZEd`ejFdafddedfZG	dddrdsZHdS )    )annotationsN)bisect_right)IncrementalDecoder)aliases)	lru_cache)findall)	Generator)MultibyteIncrementalDecoder   )ENCODING_MARKSIANA_SUPPORTED_SIMILARRE_POSSIBLE_ENCODING_INDICATIONUNICODE_RANGES_COMBINEDUNICODE_SECONDARY_RANGE_KEYWORDUTF8_MAXIMAL_ALLOCATIONCOMMON_CJK_CHARACTERS_LATIN_CJK_HANGUL	_KATAKANA	_HIRAGANA_THAI_ARABIC_ARABIC_ISOLATED_FORM_ACCENT_KEYWORDS_ACCENTUATED)maxsize	characterstrreturnintc                 C  s   zt | }W n
 ty   Y dS w d}d|v r|tO }d|v r$|tO }d|v r,|tO }d|v r4|tO }d|v r<|tO }d|v rD|tO }d|v rT|t	O }d	|v rT|t
O }tD ]}||v rc|tO } |S qV|S )
zRCompute all name-based classification flags with a single unicodedata.name() call.r   LATINCJKHANGULKATAKANAHIRAGANATHAIARABICzISOLATED FORM)unicodedataname
ValueErrorr   r   r   r   r   r   r   r   r   r   )r   descflagskw r.   \/var/www/html/voicebot/backend/venv/lib/python3.10/site-packages/charset_normalizer/utils.py_character_flags&   s:   r0   boolc                 C     t t| t@ S N)r1   r0   r   r   r.   r.   r/   is_accentuatedI      r5   c                 C  s.   t | }|s	| S |d}tt|d dS )N r      )r(   decompositionsplitchrr    )r   
decomposedcodesr.   r.   r/   remove_accentN   s
   

r>   c                 c  s"    | ]\}}|j |j|fV  qd S r3   )startstop).0r)   	ord_ranger.   r.   r/   	<genexpr>[   s
    
rC   zlist[tuple[int, int, str]]_UNICODE_RANGES_SORTEDc                 C  s   g | ]}|d  qS )r   r.   )rA   er.   r.   r/   
<listcomp>_   s    rF   z	list[int]_UNICODE_RANGE_STARTS
str | Nonec                 C  s<   t | }tt|d }|dkrt| \}}}||k r|S dS )zK
    Retrieve the Unicode range official name from a single character.
    r
   r   N)ordr   rG   rD   )r   character_ordidxr?   r@   r)   r.   r.   r/   unicode_rangeb   s   rL   c                 C  r2   r3   )r1   r0   r   r4   r.   r.   r/   is_latins   r6   rM   c                 C  s2   t | }d|v rdS t| }|d u rdS d|v S )NPTFPunctuationr(   categoryrL   r   character_categorycharacter_ranger.   r.   r/   is_punctuationx   s   
rU   c                 C  sB   t | }d|v sd|v rdS t| }|d u rdS d|v o |dkS )NSNTFFormsLorP   rR   r.   r.   r/   	is_symbol   s   
rZ   c                 C  s$   t | }|d u r
dS d|v pd|v S )NF	EmoticonsPictographs)rL   )r   rT   r.   r.   r/   is_emoticon   s   r]   c                 C  s.   |   s| dv r
dS t| }d|v p|dv S )N>      ｜+<>TZ>   PcPdPo)isspacer(   rQ   )r   rS   r.   r.   r/   is_separator   s   
rg   c                 C  s   |   |  kS r3   )islowerisupperr4   r.   r.   r/   is_case_variable   r6   rj   c                 C  r2   r3   )r1   r0   r   r4   r.   r.   r/   is_cjk   r6   rk   c                 C  r2   r3   )r1   r0   r   r4   r.   r.   r/   is_hiragana   r6   rl   c                 C  r2   r3   )r1   r0   r   r4   r.   r.   r/   is_katakana   r6   rm   c                 C  r2   r3   )r1   r0   r   r4   r.   r.   r/   	is_hangul   r6   rn   c                 C  r2   r3   )r1   r0   r   r4   r.   r.   r/   is_thai   r6   ro   c                 C  r2   r3   )r1   r0   r   r4   r.   r.   r/   	is_arabic   r6   rp   c                 C  r2   r3   )r1   r0   r   r4   r.   r.   r/   is_arabic_isolated_form   r6   rq   c                 C  s   | t vS r3   )r   r4   r.   r.   r/   is_cjk_uncommon   s   rr   
range_namec                   s   t  fddtD S )Nc                 3  s    | ]}| v V  qd S r3   r.   )rA   keywordrs   r.   r/   rC      s    z-is_unicode_range_secondary.<locals>.<genexpr>)anyr   ru   r.   ru   r/   is_unicode_range_secondary   s   rw   c                 C  s(   |   du o|  du o| dko| dkS )NFu   ﻿)rf   isprintabler4   r.   r.   r/   is_unprintable   s   
rz       sequencebytessearch_zonec                 C  s   t | ttfs	tt| }tt| dt|| jddd}t|dkr&dS |D ]'}|	 
dd}t D ]\}}||krD|    S ||krN|    S q6q(dS )zW
    Extract using ASCII-only decoder any specified encoding in the first n-bytes.
    Nasciiignoreerrorsr   -_)
isinstancer}   	bytearray	TypeErrorlenr   r   mindecodelowerreplacer   items)r|   r~   seq_lenresultsspecified_encodingencoding_aliasencoding_ianar.   r.   r/   any_specified_encoding   s&   r      r)   c                 C  s    | dv pt td|  jtS )zQ
    Verify is a specific encoding is a multi byte one based on it IANA name
    >	   utf_7utf_8utf_16utf_32	utf_16_be	utf_16_le	utf_32_be	utf_32_le	utf_8_sig
encodings.)
issubclass	importlibimport_moduler   r	   )r)   r.   r.   r/   is_multi_byte_encoding  s   
r   tuple[str | None, bytes]c                 C  sJ   t D ] }t | }t|tr|g}|D ]}| |r!||f    S qqdS )z9
    Identify and extract SIG/BOM in given sequence.
    )N    )r   r   r}   
startswith)r|   iana_encodingmarksmarkr.   r.   r/   identify_sig_or_bom  s   

r   r   c                 C  s   | dvS )N>   r   r   r.   )r   r.   r.   r/   should_strip_sig_or_bom.  s   r   Tcp_namestrictc                 C  sN   |   dd} t D ]\}}| ||fv r|  S q|r%td|  d| S )zIReturns the Python normalized encoding name (Not the IANA official name).r   r   zUnable to retrieve IANA for '')r   r   r   r   r*   )r   r   r   r   r.   r.   r/   	iana_name2  s   r   iana_name_aiana_name_bfloatc           	      C  s   t | st |r
dS td|  j}td| j}|dd}|dd}d}tdD ]}t|g}||||krA|d7 }q,|d S )Ng        r   r   r   r      r
   )r   r   r   r   ranger}   r   )	r   r   	decoder_a	decoder_bid_aid_bcharacter_match_countito_be_decodedr.   r.   r/   cp_similarityC  s   


r   c                 C  s   | t v o	|t |  v S )z
    Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
    the function cp_similarity.
    )r   )r   r   r.   r.   r/   is_cp_similarW  s   
r   charset_normalizerz)%(asctime)s | %(levelname)s | %(message)slevelformat_stringNonec                 C  s:   t | }|| t  }|t | || d S r3   )logging	getLoggersetLevelStreamHandlersetFormatter	Formatter
addHandler)r)   r   r   loggerhandlerr.   r.   r/   set_logging_handlerb  s
   

r   	sequencesr   offsetsr   
chunk_sizebom_or_sig_availablestrip_sig_or_bomsig_payloadis_multi_byte_decoderdecoded_payloadGenerator[str, None, None]c	                 c  s&   |r|du r|D ]}	||	|	|  }
|
s d S |
V  q	d S |D ]p}	|	| }|t | d kr/q | |	|	|  }|rA|du rA|| }|j||rHdndd}
|r|	dkrt|d}|r|
d | |vrt|	|	d d	D ]#}| || }|r{|du r{|| }|j|dd}
|
d | |v r nqi|
V  q d S )
NF   r   r   r   r   r8      )r   r   r   r   )r   r   r   r   r   r   r   r   r   r   chunk	chunk_endcut_sequencechunk_partial_size_chkjr.   r.   r/   cut_sequence_chunkso  sD   

r   )r   r   r   r    )r   r   r   r1   )r   r   r   r   )r   r   r   rH   )rs   r   r   r1   )r{   )r|   r}   r~   r    r   rH   )r)   r   r   r1   )r|   r}   r   r   )r   r   r   r1   )T)r   r   r   r1   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r1   )r)   r   r   r    r   r   r   r   r3   )r   r}   r   r   r   r   r   r    r   r1   r   r1   r   r}   r   r1   r   rH   r   r   )I
__future__r   r   r   r(   bisectr   codecsr   encodings.aliasesr   	functoolsr   rer   typingr   _multibytecodecr	   constantr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r0   r5   r>   sortedr   rD   __annotations__rG   rL   rM   rU   rZ   r]   rg   rj   rk   rl   rm   rn   ro   rp   rq   rr   r   rw   rz   r   r   r   r   r   r   r   INFOr   r   r.   r.   r.   r/   <module>   s    L"		
 



