o
    Mfmp                     @   sn   d Z ddlmZmZ ddlZddlmZmZmZm	Z	m
Z
mZmZ dd Zejje  ddZG d	d
 d
ZdS )zC
Unit tests for nltk.tokenize.
See also nltk/test/tokenize.doctest
    )ListTupleN)LegalitySyllableTokenizerStanfordSegmenterSyllableTokenizerTreebankWordTokenizerTweetTokenizerpunktword_tokenizec                  C   s6   zt  } | d | d W dS  ty   Y dS w )NarzhTF)r   default_configLookupError)seg r   ^/var/www/html/analyze/labelStudio/lib/python3.10/site-packages/nltk/test/unit/test_tokenize.pyload_stanford_segmenter   s   

r   z/NLTK was unable to find stanford-segmenter.jar.)reasonc                   @   s  e Zd Zdd Zejddg dg dffdg dg d	ffd
g dg dffdg dg dffdg dg dffdg dg dffdg dg dffdg dg dffgdedee	e e	e f fddZ
dd Zd d! Zed"d# Zed$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0d1 Zd2d3 Zd4d5 Zd6d7 Zd8d9 Zd:d; Zd<d= Zejd>g d?dEdAdBZdCdD Zd@S )FTestTokenizec                 C   s2   t ddd}d}||}g d}||ksJ dS )zW
        Test TweetTokenizer using words with special and accented characters.
        T)strip_handles
reduce_lenuA   @myke: Let's test these words: resumé España München français)
:zLet'stestthesewordsr   u   resuméu   Españau   Münchenu	   françaisNr   tokenize)self	tokenizers9tokensexpectedr   r   r   test_tweet_tokenizer%   s
   
z!TestTokenize.test_tweet_tokenizerztest_input, expectedsz#My text 0106404243030 is great text)Mytext0106404243030isgreatr$   )r#   r$   0106404243030r'   r(   r$   zMy ticket id is 1234543124123)r#   ticketidr'   1234543124123)r#   r*   r+   r'   1234543124123z<@remy: This is waaaaayyyy too much for you!!!!!! 01064042430)r   Thisr'   waaayyytoomuchforyou!r5   r5   r%   z*My number is 06-46124080, except it's not.)	r#   numberr'   z06-46124080,exceptit'snot.z+My number is 601-984-4813, except it's not.)	r#   r6   r'   z601-984-4813r7   r8   r9   r:   r;   )
r#   r6   r'   z601-984-4813r7   r8   r9   r:   r;   z/My number is (393)  928 -3010, except it's not.)	r#   r6   r'   (393)  928 -3010r7   r8   r9   r:   r;   )r#   r6   r'   (393)928-3010r7   r8   r9   r:   r;   z1The product identification number is 48103284512.)Theproductidentificationr6   r'   
48103284512r;   )rD   rE   rF   r6   r'   48103284512r;   z(My favourite substraction is 240 - 1353.)r#   	favouritesubstractionr'   z
240 - 1353r;   )r#   rJ   rK   r'   240rB   1353r;   
test_input	expectedsc                 C   s@   t ddg|D ]\}}tdd|d}||}||ksJ qdS )a  
        Test `match_phone_numbers` in TweetTokenizer.

        Note that TweetTokenizer is also passed the following for these tests:
            * strip_handles=True
            * reduce_len=True

        :param test_input: The input string to tokenize using TweetTokenizer.
        :type test_input: str
        :param expecteds: A 2-tuple of tokenized sentences. The first of the two
            tokenized is the expected output of tokenization with `match_phone_numbers=True`.
            The second of the two tokenized lists is the expected output of tokenization
            with `match_phone_numbers=False`.
        :type expecteds: Tuple[List[str], List[str]]
        TF)r   r   match_phone_numbersN)zipr   r   )r   rN   rO   rP   r!   r   	predictedr   r   r   test_tweet_tokenizer_expanded;   s    E
z*TestTokenize.test_tweet_tokenizer_expandedc                 C   s$   t  }|d}|g dksJ dS )z3
        Test SyllableTokenizer tokenizer.
        justification)justificationN)r   r   )r   r   r    r   r   r   +test_sonority_sequencing_syllable_tokenizer  s   
z8TestTokenize.test_sonority_sequencing_syllable_tokenizerc                 C   s:   ddl m} d}t| }||}|g dksJ dS )z;
        Test LegalitySyllableTokenizer tokenizer.
        r   )r   	wonderful)wonderfulN)nltk.corpusr   r   r   )r   r   	test_wordr   r    r   r   r   *test_legality_principle_syllable_tokenizer  s
   
z7TestTokenize.test_legality_principle_syllable_tokenizerc                 C   :   t  }|d d}|| }| g dksJ dS )zN
        Test the Stanford Word Segmenter for Arabic (default config)
        r   un   يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات)u   يبحثu   علمu   الحاسوبu   استخدامu   الحوسبةu   بu   جميعu
   اشكالu   هاu   لu   حلu   المشكلاتNr   r   segmentsplitr   r   sentsegmented_sentr   r   r   test_stanford_segmenter_arabic  
   
z+TestTokenize.test_stanford_segmenter_arabicc                 C   rb   )zO
        Test the Stanford Word Segmenter for Chinese (default config)
        r   u$   这是斯坦福中文分词器测试)u   这u   是u	   斯坦福u   中文u	   分词器u   测试Nrc   rf   r   r   r   test_stanford_segmenter_chinese3  rj   z,TestTokenize.test_stanford_segmenter_chinesec                 C   sL   t  }d}dg}||}||ksJ d}g d}||}||ks$J dS )zT
        Test a string that resembles a phone number but contains a newline
        r=   z(393)
928 -3010)r>   r?   r@   z	928 -3010Nr   )r   r   test1r!   resulttest2r   r   r   test_phone_tokenizer>  s   

z!TestTokenize.test_phone_tokenizerc                 C   sl   t  }d}dg}||}||ksJ d}dg}||}||ks#J d}g d}||}||ks4J dS )zX
        Test a string that contains Emoji ZWJ Sequences and skin tone modifier
        u   👨‍👩‍👧‍👧u   👨🏿u   🤔 🙈 me así, se😌 ds 💕👭👙 hello 👩🏾‍🎓 emoji hello 👨‍👩‍👦‍👦 how are 😊 you today🙅🏽🙅🏽)u   🤔u   🙈meu   asír7   seu   😌dsu   💕u   👭u   👙hellou   👩🏾‍🎓emojirs   u   👨‍👩‍👦‍👦howareu   😊r4   today   🙅🏽rx   Nr   r   r   rl   r!   rm   rn   test3r   r   r   test_emoji_tokenizerQ  s   


z!TestTokenize.test_emoji_tokenizerc                 C       d}g d}t ||ksJ dS )zA
        Test padding of asterisk for word tokenization.
        z1This is a, *weird sentence with *asterisks in it.)r/   r'   ar7   *weirdsentencewithr~   	asterisksinitr;   Nr
   r   r$   r!   r   r   r   test_pad_asterisk  s   zTestTokenize.test_pad_asteriskc                 C   r|   )z@
        Test padding of dotdot* for word tokenization.
        zPWhy did dotdot.. not get tokenized but dotdotdot... did? How about manydots.....)Whydiddotdotz..r:   get	tokenizedbut	dotdotdotz...r   ?Howaboutmanydotsz.....Nr   r   r   r   r   test_pad_dotdot  s   zTestTokenize.test_pad_dotdotc                 C   s   t dd}d}g d}||}||ksJ d}g d}||}||ks'J d}g d}||}||ks8J d	}g d
}||}||ksIJ d}g d}||}||ksZJ d}	g d}||	}||kskJ d}
g d}||
}||ks|J dS )zW
        Test remove_handle() from casual.py with specially crafted edge cases
        T)r   z-@twitter hello @twi_tter_. hi @12345 @123news)rs   r;   hiu]   @n`@n~@n(@n)@n-@n=@n+@n\@n|@n[@n]@n{@n}@n;@n:@n'@n"@n/@n?@n.@n,@n<@n>@n @n
@n ñ@n.ü@n.ç@n.)`~r>   r@   rB   =+\|[]{};r   '"/r   r;   r7   <>   ñr;      ür;      çr;   zKa@n j@n z@n A@n L@n Z@n 1@n 4@n 7@n 9@n 0@n _@n !@n @@n #@n $@n %@n &@n *@n)&r}   @njr   zr   Ar   Lr   Zr   1r   4r   7r   9r   0r   _r   r5   r   @r   #r   $r   %r   &r   r~   r   z@n!a @n#a @n$a @n%a @n&a @n*a)r5   r}   r   r}   r   r}   r   r}   r   r}   r~   r}   zD@n!@n @n#@n @n$@n @n%@n @n&@n @n*@n @n@n @@n @n@@n @n_@n @n7@n @nj@n)r5   r   r   r   r   r   r   r   r   r   r~   r   r   r   r   r   r   r   r   z@n_r   z@n7r   z@njr   z^@abcdefghijklmnopqrstuvwxyz @abcdefghijklmno1234 @abcdefghijklmno_ @abcdefghijklmnoendofhandle)pqrstuvwxyz1234r   endofhandlez^@abcdefghijklmnop@abcde @abcdefghijklmno@abcde @abcdefghijklmno_@abcde @abcdefghijklmno5@abcde)p@abcdez@abcdefghijklmnor   r   r   5r   Nr   )r   r   rl   r!   rm   rn   rz   test4test5test6test7r   r   r   test_remove_handle  s:   



(




zTestTokenize.test_remove_handlec                 C   s|   t  }d}g d}t||}||ksJ d}g d}t||}||ks)J d}g d}t||}||ks<J dS )zC
        Test TreebankWordTokenizer.span_tokenize function
        zNGood muffins cost $3.88
in New (York).  Please (buy) me
two of them.
(Thanks).))r      )      )      )      )r      )      )      )       )r   $   )r   %   )r   &   )(   .   )/   0   )r   3   )r   4   )5   7   )8   ;   )<   >   )?   D   )E   F   )r   L   )r   M   )r   N   zmThe DUP is similar to the "religious right" in the United States and takes a hardline stance on social issues)r      r         
      r   r            r   r   r   r   r   *   r   +   ,   r   r   2   r   9   :   @   A   r   r   J   K   r   )r   U   )V   \   )]   _   )`   f   )g   m   zqThe DUP is similar to the "religious right" in the United States and takes a ``hardline'' stance on social issues)r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r	  )r   O   )r  W   )r  Y   )Z   r  )a   c   )d   j   )k   q   N)r   listspan_tokenizery   r   r   r   test_treebank_span_tokenizerI  s   z)TestTokenize.test_treebank_span_tokenizerc                 C   s<   d}g d}t ||ksJ d}g d}t ||ksJ dS )z-
        Test word_tokenize function
        z0The 'v', I've been fooled but I'll seek revenge.)rD   r   vr   r7   Iz'vebeenfooledr   r"  z'llseekrevenger;   z'v' 're')r   r!  r   z'rer   Nr   )r   r   r!   r   r   r   test_word_tokenize  s   zTestTokenize.test_word_tokenizec                 C   sT   dddgfdg dfdg dfg}|D ]\}}dd	 t |D }||ks'J qd S )
N12r   rH   )rH   N123)r)  rH   3)r,  Nr   )r)  r+  )r,  r   )r   Nc                 S   s   g | ]}|qS r   r   ).0xr   r   r   
<listcomp>  s    z5TestTokenize.test_punkt_pair_iter.<locals>.<listcomp>)r	   
_pair_iter)r   
test_casesrN   expected_outputactual_outputr   r   r   test_punkt_pair_iter  s   


z!TestTokenize.test_punkt_pair_iterc                 C   s   t g }t|}t| d S N)iterr	   r0  r  )r   r   genr   r   r   5test_punkt_pair_iter_handles_stop_iteration_exception  s   
zBTestTokenize.test_punkt_pair_iter_handles_stop_iteration_exceptionc                 C   s0   t  }G dd d}| |_t|d d S )Nc                   @   s   e Zd Zdd ZdS )zkTestTokenize.test_punkt_tokenize_words_handles_stop_iteration_exception.<locals>.TestPunktTokenizeWordsMockc                 S   s   t g S r5  )r6  )r   sr   r   r   r
     s   zyTestTokenize.test_punkt_tokenize_words_handles_stop_iteration_exception.<locals>.TestPunktTokenizeWordsMock.word_tokenizeN)__name__
__module____qualname__r
   r   r   r   r   TestPunktTokenizeWordsMock  s    r=  r   )r	   PunktBaseClass
_lang_varsr  _tokenize_words)r   objr=  r   r   r   :test_punkt_tokenize_words_handles_stop_iteration_exception  s   zGTestTokenize.test_punkt_tokenize_words_handles_stop_iteration_exceptionc                 C   sB   G dd dt j}t j| d}d}g d}|||ksJ d S )Nc                   @      e Zd ZdZdS )zNTestTokenize.test_punkt_tokenize_custom_lang_vars.<locals>.BengaliLanguageVars)r;   r   r5   u   ।Nr:  r;  r<  sent_end_charsr   r   r   r   BengaliLanguageVars      rF  )	lang_varsc  উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’  উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।)u  উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন।u+  অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’  উপস্থিত ছিলেন।u/  এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।)r	   PunktLanguageVarsPunktSentenceTokenizerr   )r   rF  rA  	sentencesr!   r   r   r   $test_punkt_tokenize_custom_lang_vars  s
   z1TestTokenize.test_punkt_tokenize_custom_lang_varsc                 C   s(   t  }d}dg}|||ksJ d S )NrI  )r	   rK  r   )r   rA  rL  r!   r   r   r   'test_punkt_tokenize_no_custom_lang_vars  s
   z4TestTokenize.test_punkt_tokenize_no_custom_lang_varsz%input_text,n_sents,n_splits,lang_vars))z4Subject: Some subject. Attachments: Some attachments      )z4Subject: Some subject! Attachments: Some attachmentsrO  rP  )z4This is just a normal sentence, just like any other.rP  r   Nc                 C   sJ   t  }|d kr||_t|||ksJ tt|||ks#J d S r5  )r	   rK  r?  lenr   r  debug_decisions)r   
input_textn_sentsn_splitsrH  r   r   r   r   punkt_debug_decisions  s
   z"TestTokenize.punkt_debug_decisionsc                 C   s*   G dd dt j}| jddd| d d S )Nc                   @   rC  )zGTestTokenize.test_punkt_debug_decisions_custom_end.<locals>.ExtLangVars)r;   r   r5   ^NrD  r   r   r   r   ExtLangVars!  rG  rX  z4Subject: Some subject^ Attachments: Some attachmentsrO  rP  )rT  rU  rH  )r	   rJ  rV  )r   rX  r   r   r   %test_punkt_debug_decisions_custom_end  s   
z2TestTokenize.test_punkt_debug_decisions_custom_endr5  )r:  r;  r<  r"   pytestmarkparametrizestrr   r   rS   rZ   ra   check_stanford_segmenterri   rk   ro   r{   r   r   r   r   r'  r4  r8  rB  rM  rN  rV  rY  r   r   r   r   r   $   s    &#
  3


0 ar   )__doc__typingr   r   rZ  nltk.tokenizer   r   r   r   r   r	   r
   r   r[  skipifr^  r   r   r   r   r   <module>   s    $
