³ò
4ÒÇIc           @   sx   d  Z  d d k Td e f d „  ƒ  YZ d e f d „  ƒ  YZ d e f d „  ƒ  YZ d	 e f d
 „  ƒ  YZ d d „ Z d S(   sÎ  
Tokenizers that divide strings into substrings using the string
C{split()} method.

These tokenizers follow the standard L{TokenizerI} interface, and so
can be used with any code that expects a tokenizer.  For example,
these tokenizers can be used to specify the tokenization conventions
when building a L{CorpusReader<nltk.corpus.reader.api.CorpusReader>}.
But if you are tokenizing a string yourself, consider using string
C{split()} method directly instead.
iÿÿÿÿ(   t   *t   WhitespaceTokenizerc           B   s   e  Z d  Z d „  Z RS(   s¯  
    A tokenizer that divides a string into substrings by treating any
    sequence of whitespace characters as a separator.  Whitespace
    characters are space (C{' '}), tab (C{'\t'}), and newline
    (C{'\n'}).  If you are performing the tokenization yourself
    (rather than building a tokenizer to pass to some other piece of
    code), consider using the string C{split()} method instead:

        >>> words = s.split()
    c         C   s
   | i  ƒ  S(   N(   t   split(   t   selft   s(    (    s*   /p/zhu/06/nlp/nltk/nltk/tokenize/simple.pyt   tokenize#   s    (   t   __name__t
   __module__t   __doc__R   (    (    (    s*   /p/zhu/06/nlp/nltk/nltk/tokenize/simple.pyR      s   
t   SpaceTokenizerc           B   s   e  Z d  Z d „  Z RS(   sR  
    A tokenizer that divides a string into substrings by treating any
    single space character as a separator.  If you are performing the
    tokenization yourself (rather than building a tokenizer to pass to
    some other piece of code), consider using the string C{split()}
    method instead:

        >>> words = s.split(' ')
    c         C   s   | i  d ƒ S(   Nt    (   R   (   R   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/tokenize/simple.pyR   0   s    (   R   R   R   R   (    (    (    s*   /p/zhu/06/nlp/nltk/nltk/tokenize/simple.pyR	   &   s   	t   TabTokenizerc           B   s   e  Z d  Z d „  Z RS(   sQ  
    A tokenizer that divides a string into substrings by treating any
    single tab character as a separator.  If you are performing the
    tokenization yourself (rather than building a tokenizer to pass to
    some other piece of code), consider using the string C{split()}
    method instead:

        >>> words = s.split('\t')
    c         C   s   | i  d ƒ S(   Ns   	(   R   (   R   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/tokenize/simple.pyR   =   s    (   R   R   R   R   (    (    (    s*   /p/zhu/06/nlp/nltk/nltk/tokenize/simple.pyR   3   s   	t   LineTokenizerc           B   s#   e  Z d  Z d d „ Z d „  Z RS(   sÆ   
    A tokenizer that divides a string into substrings by treating any
    single newline character as a separator.  Handling of blank lines
    may be controlled using a constructor parameter.
    t   discardc         C   s=   d } | | j o t  d d i | ƒ ƒ ‚ n | |  _ d S(   s	  
        @param blanklines: Indicates how blank lines should be
        handled.  Valid values are:
        
          - C{'discard'}: strip blank lines out of the token list
            before returning it.  A line is considered blank if
            it contains only whitespace characters.
          - C{'keep'}: leave all blank lines in the token list.
          - C{'discard-eof'}: if the string ends with a newline,
            then do not generate a corresponding token C{''} after
            that newline.
        R   t   keeps   discard-eofs   Blank lines must be one of: %sR
   N(   s   discardR   s   discard-eof(   t
   ValueErrort   joint   _blanklines(   R   t
   blanklinest   valid_blanklines(    (    s*   /p/zhu/06/nlp/nltk/nltk/tokenize/simple.pyt   __init__F   s
    c         C   s”   | i  d ƒ } |  i d j o6 g  } | D] } | i ƒ  o | | q* q* ~ } n< |  i d j o+ | o  | d i ƒ  o | i ƒ  q n | S(   Ns   
R   s   discard-eofiÿÿÿÿ(   R   R   t   rstript   stript   pop(   R   R   t   linest   _[1]t   l(    (    s*   /p/zhu/06/nlp/nltk/nltk/tokenize/simple.pyR   Z   s    6+(   R   R   R   R   R   (    (    (    s*   /p/zhu/06/nlp/nltk/nltk/tokenize/simple.pyR   @   s   R   c         C   s   t  | ƒ i |  ƒ S(   N(   R   R   (   t   textR   (    (    s*   /p/zhu/06/nlp/nltk/nltk/tokenize/simple.pyt   line_tokenizeg   s    N(   R   t   apit
   TokenizerIR   R	   R   R   R   (    (    (    s*   /p/zhu/06/nlp/nltk/nltk/tokenize/simple.pys   <module>   s   
'