o
    "iY                     @  s  U d dl mZ d dlZd dlmZ d dlmZ ejdkr#d dlm	Z	 nzd dl
m	Z	 W n ey8   dd Z	Y nw d	d
lmZmZmZmZmZmZmZmZmZmZ d	dlmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& eeB eB eB eB Z'de(d< G dd dZ)e	G dd de)Z*e	G dd de)Z+e	G dd de)Z,e	G dd de)Z-e	G dd de)Z.e	G dd de)Z/e	G dd de)Z0e	G dd de)Z1e	G d d! d!e)Z2ed"d#d9d)d*Z3e4d+d, e)5 D Z6d-e(d.< ed/d#	1d:d;d7d8Z7dS )<    )annotationsN)	lru_cache)	getLogger)      )finalc                 C  s   | S N )clsr	   r	   Y/var/www/html/voicebot/backend/venv/lib/python3.10/site-packages/charset_normalizer/md.pyr         r      )
COMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD_ACCENTUATED_CJK_HANGUL	_HIRAGANA	_KATAKANA_LATIN_THAI)_character_flagsis_accentuated	is_arabicis_arabic_isolated_formis_case_variableis_cjkis_emoticonis_latinis_punctuationis_separator	is_symbolis_unprintableremove_accentunicode_rangeis_cjk_uncommonint_GLYPH_MASKc                   @  s@   e Zd ZdZdZdddZdd
dZdddZedddZ	dS )MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    r	   	characterstrreturnboolc                 C     t )z@
        Determine if given character should be fed in.
        NotImplementedErrorselfr*   r	   r	   r   eligible<      zMessDetectorPlugin.eligibleNonec                 C  r.   )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        r/   r1   r	   r	   r   feedB   s   zMessDetectorPlugin.feedc                 C  r.   )zB
        Permit to reset the plugin to the initial state.
        r/   r2   r	   r	   r   resetI   r4   zMessDetectorPlugin.resetfloatc                 C  r.   )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        r/   r7   r	   r	   r   ratioO   s   zMessDetectorPlugin.ratioNr*   r+   r,   r-   r*   r+   r,   r5   r,   r5   r,   r9   )
__name__
__module____qualname____doc__	__slots__r3   r6   r8   propertyr:   r	   r	   r	   r   r)   4   s    


r)   c                   @  F   e Zd ZdZdddZdd	d
ZdddZdddZedddZ	dS ) TooManySymbolOrPunctuationPlugin_punctuation_count_symbol_count_character_count_last_printable_char_frenzy_symbol_in_wordr,   r5   c                 C  s"   d| _ d| _d| _d | _d| _d S Nr   FrG   r7   r	   r	   r   __init__b   s
   
z)TooManySymbolOrPunctuationPlugin.__init__r*   r+   r-   c                 C     |  S r   isprintabler1   r	   r	   r   r3   j      z)TooManySymbolOrPunctuationPlugin.eligiblec                 C  sh   |  j d7  _ || jkr/|tvr/t|r|  jd7  _n| s/t|r/t|s/|  jd7  _|| _d S )Nr      )	rJ   rK   r   r    rH   isdigitr"   r   rI   r1   r	   r	   r   r6   m   s   

z%TooManySymbolOrPunctuationPlugin.feedc                 C  s   d| _ d| _d| _d S Nr   )rH   rJ   rI   r7   r	   r	   r   r8      s   
z&TooManySymbolOrPunctuationPlugin.resetr9   c                 C  s0   | j dkrdS | j| j | j  }|dkr|S dS )Nr           333333?)rJ   rH   rI   )r2   ratio_of_punctuationr	   r	   r   r:      s   

z&TooManySymbolOrPunctuationPlugin.ratioNr=   r;   r<   r>   
r?   r@   rA   rC   rN   r3   r6   r8   rD   r:   r	   r	   r	   r   rF   X   s    



rF   c                   @  rE   )TooManyAccentuatedPluginrJ   _accentuated_countr,   r5   c                 C     d| _ d| _d S rU   r[   r7   r	   r	   r   rN         
z!TooManyAccentuatedPlugin.__init__r*   r+   r-   c                 C  rO   r   )isalphar1   r	   r	   r   r3      rR   z!TooManyAccentuatedPlugin.eligiblec                 C  ,   |  j d7  _ t|r|  jd7  _d S d S Nr   )rJ   r   r\   r1   r	   r	   r   r6         zTooManyAccentuatedPlugin.feedc                 C  r]   rU   r[   r7   r	   r	   r   r8      r^   zTooManyAccentuatedPlugin.resetr9   c                 C  s*   | j dk rdS | j| j  }|dkr|S dS )Nr   rV   gffffff?r[   )r2   ratio_of_accentuationr	   r	   r   r:      s   
zTooManyAccentuatedPlugin.ratioNr=   r;   r<   r>   rY   r	   r	   r	   r   rZ      s    



rZ   c                   @  rE   )UnprintablePlugin_unprintable_countrJ   r,   r5   c                 C  r]   rU   re   r7   r	   r	   r   rN      r^   zUnprintablePlugin.__init__r*   r+   r-   c                 C     dS NTr	   r1   r	   r	   r   r3      r   zUnprintablePlugin.eligiblec                 C  s(   t |r|  jd7  _|  jd7  _d S ra   )r#   rf   rJ   r1   r	   r	   r   r6      s   zUnprintablePlugin.feedc                 C  s
   d| _ d S rU   )rf   r7   r	   r	   r   r8      s   
zUnprintablePlugin.resetr9   c                 C     | j dkrdS | jd | j  S )Nr   rV   r   )rJ   rf   r7   r	   r	   r   r:         
zUnprintablePlugin.ratioNr=   r;   r<   r>   rY   r	   r	   r	   r   rd      s    



rd   c                   @  rE   )SuspiciousDuplicateAccentPlugin_successive_countrJ   _last_latin_character_last_was_accentuatedr,   r5   c                 C     d| _ d| _d | _d| _d S rM   rl   r7   r	   r	   r   rN      s   
z(SuspiciousDuplicateAccentPlugin.__init__r*   r+   r-   c                 C  s   |  ot|S r   )r_   r   r1   r	   r	   r   r3      s   z(SuspiciousDuplicateAccentPlugin.eligiblec                 C  sz   |  j d7  _ t|}| jd ur5|r5| jr5| r%| j r%|  jd7  _t|t| jkr5|  jd7  _|| _|| _d S ra   )rJ   r   rn   ro   isupperrm   r$   )r2   r*   current_accentuatedr	   r	   r   r6      s   

z$SuspiciousDuplicateAccentPlugin.feedc                 C  rp   rM   rl   r7   r	   r	   r   r8         
z%SuspiciousDuplicateAccentPlugin.resetr9   c                 C  ri   )Nr   rV   rS   )rJ   rm   r7   r	   r	   r   r:      rj   z%SuspiciousDuplicateAccentPlugin.ratioNr=   r;   r<   r>   rY   r	   r	   r	   r   rk      s    



rk   c                   @  rE   )SuspiciousRange"_suspicious_successive_range_countrJ   _last_printable_seen_last_printable_ranger,   r5   c                 C     d| _ d| _d | _d | _d S rU   ru   r7   r	   r	   r   rN     rs   zSuspiciousRange.__init__r*   r+   r-   c                 C  rO   r   rP   r1   r	   r	   r   r3   	  rR   zSuspiciousRange.eligiblec                 C  s   |  j d7  _ | st|s|tv rd | _d | _d S | jd u r*|| _t|| _d S | j}t|}t||r=|  jd7  _|| _|| _d S ra   )	rJ   isspacer    r   rw   rx   r%    is_suspiciously_successive_rangerv   )r2   r*   unicode_range_aunicode_range_br	   r	   r   r6     s&   



zSuspiciousRange.feedc                 C  ry   rU   )rJ   rv   rw   rx   r7   r	   r	   r   r8   &  rs   zSuspiciousRange.resetr9   c                 C  s"   | j dkrdS | jd | j  }|S )N   rV   rS   )rJ   rv   )r2   ratio_of_suspicious_range_usager	   r	   r   r:   ,  s   
zSuspiciousRange.ratioNr=   r;   r<   r>   rY   r	   r	   r	   r   rt      s    



rt   c                   @  rE   )SuperWeirdWordPlugin_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchrJ   _bad_character_count_buffer_length_buffer_last_char_buffer_last_char_accentuated_buffer_accent_count_buffer_glyph_count_buffer_upper_countr,   r5   c                 C  sR   d| _ d| _d| _d| _d| _d| _d| _d| _d | _d| _	d| _
d| _d| _d S rM   r   r7   r	   r	   r   rN   J  s   
zSuperWeirdWordPlugin.__init__r*   r+   r-   c                 C  rg   rh   r	   r1   r	   r	   r   r3   \  r   zSuperWeirdWordPlugin.eligiblec                 C  s0  |  rL|  jd7  _|| _| r|  jd7  _t|}t|t@ }|| _|r/|  j	d7  _	| j
s?|t@ r8|r?|t@ s?d| _
|t@ rJ|  jd7  _d S | jsQd S | s]t|s]t|r| jr|  jd7  _| j}|  j|7  _|dkr| j	| dkrd| _n'| jr| j r| j|kr|  jd7  _d| _n| jdkrd| _|  jd7  _|dkr| j
r| jdko| j| dk}|s|  jd7  _d| _| jr|  jd7  _|  j|7  _d| _d| _
d| _d | _d| _d| _	d| _d| _d S |d	vr| st|rd| _|  jd7  _|| _d| _d S d S d S d S )
Nr   T         ?   r   rW   F>   _-<=>|~)r_   r   r   rq   r   r   r-   r   r   r   r   r   r(   r   rz   r    r!   r   rJ   r   r   r   r   rT   r"   )r2   r*   flagschar_accentuatedbuffer_lengthprobable_camel_casedr	   r	   r   r6   _  s   





zSuperWeirdWordPlugin.feedc                 C  sR   d| _ d | _d| _d| _d| _d| _d| _d| _d| _d| _	d| _
d| _d| _d S rM   )r   r   r   r   r   r   r   rJ   r   r   r   r   r   r7   r	   r	   r   r8     s   
zSuperWeirdWordPlugin.resetr9   c                 C  s$   | j dkr| jdkrdS | j| j S )N
   r   rV   )r   r   r   rJ   r7   r	   r	   r   r:     s   zSuperWeirdWordPlugin.ratioNr=   r;   r<   r>   rY   r	   r	   r	   r   r   8  s    



Pr   c                   @  sJ   e Zd ZdZdZdddZdd
dZdddZdddZe	dddZ
dS )CjkUncommonPluginz<
    Detect messy CJK text that probably means nothing.
    rJ   _uncommon_countr,   r5   c                 C  r]   rU   r   r7   r	   r	   r   rN     r^   zCjkUncommonPlugin.__init__r*   r+   r-   c                 C     t |S r   )r   r1   r	   r	   r   r3     rR   zCjkUncommonPlugin.eligiblec                 C  r`   ra   )rJ   r&   r   r1   r	   r	   r   r6     s
   zCjkUncommonPlugin.feedc                 C  r]   rU   r   r7   r	   r	   r   r8     r^   zCjkUncommonPlugin.resetr9   c                 C  s.   | j dk rdS | j| j  }|dkr|d S dS )Nr   rV   r   r   r   )r2   uncommon_form_usager	   r	   r   r:     s   
zCjkUncommonPlugin.ratioNr=   r;   r<   r>   )r?   r@   rA   rB   rC   rN   r3   r6   r8   rD   r:   r	   r	   r	   r   r     s    



r   c                   @  rE   )ArchaicUpperLowerPlugin_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalrJ   _last_alpha_seen_current_ascii_onlyr,   r5   c                 C  s.   d| _ d| _d| _d| _d| _d | _d| _d S )NFr   Tr   r7   r	   r	   r   rN     s   
z ArchaicUpperLowerPlugin.__init__r*   r+   r-   c                 C  rg   rh   r	   r1   r	   r	   r   r3     r   z ArchaicUpperLowerPlugin.eligiblec                 C  s  |  ot|}| }|r>| jdkr>| jdkr&| s&| js&|  j| j7  _d| _d| _d | _d| _|  j	d7  _	d| _d S | jrH|
 sHd| _| jd urt| rV| j s_| rq| j rq| jrm|  jd7  _d| _nd| _nd| _|  j	d7  _	|  jd7  _|| _d S )Nr   @   Fr   TrS   )r_   r   r   rT   r   r   r   r   r   rJ   isasciirq   islower)r2   r*   is_concerned	chunk_sepr	   r	   r   r6     sD   


zArchaicUpperLowerPlugin.feedc                 C  s.   d| _ d| _d| _d| _d | _d| _d| _d S )Nr   FT)rJ   r   r   r   r   r   r   r7   r	   r	   r   r8   2  s   
zArchaicUpperLowerPlugin.resetr9   c                 C  s   | j dkrdS | j| j  S )Nr   rV   )rJ   r   r7   r	   r	   r   r:   ;  s   
zArchaicUpperLowerPlugin.ratioNr=   r;   r<   r>   rY   r	   r	   r	   r   r     s    




*	r   c                   @  sF   e Zd ZdZdddZdddZdddZdddZedddZ	dS )ArabicIsolatedFormPluginrJ   _isolated_form_countr,   r5   c                 C  r]   rU   r   r7   r	   r	   r   rN   G  r^   z!ArabicIsolatedFormPlugin.__init__c                 C  r]   rU   r   r7   r	   r	   r   r8   K  r^   zArabicIsolatedFormPlugin.resetr*   r+   r-   c                 C  r   r   )r   r1   r	   r	   r   r3   O  rR   z!ArabicIsolatedFormPlugin.eligiblec                 C  r`   ra   )rJ   r   r   r1   r	   r	   r   r6   R  rb   zArabicIsolatedFormPlugin.feedr9   c                 C  s   | j dk rdS | j| j  }|S )Nr   rV   r   )r2   isolated_form_usager	   r	   r   r:   X  s   
zArabicIsolatedFormPlugin.ratioNr=   r;   r<   r>   )
r?   r@   rA   rC   rN   r8   r3   r6   rD   r:   r	   r	   r	   r   r   C  s    



r      )maxsizer|   
str | Noner}   r,   r-   c                 C  sv  | du s|du r
dS | |krdS d| v rd|v rdS d| v s"d|v r$dS d| v s,d|v r6d| v s4d|v r6dS |  d| d}}|D ]}|tv rJqC||v rQ dS qC| dv |dv }}|s_|rid	| v sgd	|v ridS |ro|rodS d
| v swd
|v rd	| v sd	|v rdS | dks|dkrdS d	| v sd	|v s| dv r|dv rd| v sd|v rdS d| v sd|v rdS | dks|dkrdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFLatin	Emoticons	Combining )HiraganaKatakanaCJKHangulzBasic Latin)r   r   PunctuationForms)splitr   )r|   r}   keywords_range_akeywords_range_belrange_a_jp_charsrange_b_jp_charsr	   r	   r   r{   b  sZ   r{   c                 c  s    | ]}|V  qd S r   r	   .0md_classr	   r	   r   	<genexpr>  s    
r   z$tuple[type[MessDetectorPlugin], ...]_DETECTOR_CLASSESi   皙?Fdecoded_sequencer+   maximum_thresholdr9   debugc              	   C  s^  dd t D }t| }|dk rd}n	|dk rd}nd}td||D ]+}| |||  D ]}|D ]}||r;|| q/q+td	d
 |D }	|	|krL nq!|D ]}|dr[|d qOtdd
 |D }	|rtd}
|
td| d|	 d|  |dkr|
td| dd   |
td| dd   |D ]}|
t|j	 d|j
  qt|	dS )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    c                 S  s   g | ]}| qS r	   r	   r   r	   r	   r   
<listcomp>  s    zmess_ratio.<locals>.<listcomp>i      r   r      r   c                 s      | ]}|j V  qd S r   r:   r   dtr	   r	   r   r         zmess_ratio.<locals>.<genexpr>
c                 s  r   r   r   r   r	   r	   r   r     r   charset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=   zStarting with: NzEnding with: iz: r   )r   lenranger3   r6   sumr   logr   	__class__r:   round)r   r   r   	detectorsseq_lenstepblock_startr*   detectormean_mess_ratiologgerr   r	   r	   r   
mess_ratio  sR   




r   )r|   r   r}   r   r,   r-   )r   F)r   r+   r   r9   r   r-   r,   r9   )8
__future__r   sys	functoolsr   loggingr   version_infotypingr   typing_extensionsImportErrorconstantr   r   r   r   r   r   r   r   r   r   utilsr   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r(   __annotations__r)   rF   rZ   rd   rk   rt   r   r   r   r   r{   tuple__subclasses__r   r   r	   r	   r	   r   <module>   sT    
0D$70= %VJ