"""
Tokenizers that divide strings into substrings using regular
expressions that can match either tokens or separators between tokens.
"""

import re
import sre_constants

from nltk.internals import convert_regexp_to_nongrouping, Deprecated

from api import *


class RegexpTokenizer(TokenizerI):
    """
    A tokenizer that splits a string into substrings using a regular
    expression.  The regular expression can be specified to match
    either tokens or separators between tokens.

    Unlike C{re.findall()} and C{re.split()}, C{RegexpTokenizer} does
    not treat regular expressions that contain grouping parentheses
    specially.
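
    For example (illustrative doctests): grouping parentheses in the
    pattern do not change the tokens that are returned, and with
    C{gaps=True} the pattern matches the separators rather than the
    tokens:

        >>> RegexpTokenizer(r'(ab)+').tokenize('abab cd abab')
        ['abab', 'abab']
        >>> RegexpTokenizer(r'\s+', gaps=True).tokenize("Good muffins cost $3.88.")
        ['Good', 'muffins', 'cost', '$3.88.']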
    """
    def __init__(self, pattern, gaps=False, discard_empty=True,
                 flags=re.UNICODE | re.MULTILINE | re.DOTALL):
        """
        Construct a new tokenizer that splits strings using the given
        regular expression C{pattern}.  By default, C{pattern} will be
        used to find tokens; but if C{gaps} is set to C{True}, then the
        pattern will be used to find separators between tokens instead.

        @type pattern: C{str}
        @param pattern: The pattern used to build this tokenizer.
            This pattern may safely contain grouping parentheses.
        @type gaps: C{bool}
        @param gaps: True if this tokenizer's pattern should be used
            to find separators between tokens; False if this
            tokenizer's pattern should be used to find the tokens
            themselves.
        @type discard_empty: C{bool}
        @param discard_empty: True if any empty tokens (C{''})
            generated by the tokenizer should be discarded.  Empty
            tokens can only be generated if L{_gaps} is true.
        @type flags: C{int}
        @param flags: The regexp flags used to compile this
            tokenizer's pattern.  By default, the following flags are
            used: C{re.UNICODE | re.MULTILINE | re.DOTALL}.
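
        For instance (an illustrative doctest), empty tokens can only
        arise in gap mode, when two separators are adjacent:

            >>> RegexpTokenizer(',', gaps=True).tokenize(',a,,b,')
            ['a', 'b']
            >>> RegexpTokenizer(',', gaps=True, discard_empty=False).tokenize(',a,,b,')
            ['', 'a', '', 'b', '']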
        t   patterns"   Error in regular expression %r: %sN(   t   getattrt   _patternt   _gapst   _discard_emptyt   _flagst   Nonet   _regexpR    t   ret   compilet   errort
   ValueError(   t   selfR   t   gapst   discard_emptyt   flagst   nongrouping_patternt   e(    (    s*   /p/zhu/06/nlp/nltk/nltk/tokenize/regexp.pyt   __init__    s    					c         C   sv   |  i  oX |  i o: g  } |  i i | ƒ D] } | o | | q+ q+ ~ Sqr |  i i | ƒ Sn |  i i | ƒ Sd  S(   N(   R   R   R   t   splitt   findall(   R   t   textt   _[1]t   tok(    (    s*   /p/zhu/06/nlp/nltk/nltk/tokenize/regexp.pyt   tokenize[   s
    

:c         C   s)   d |  i  i |  i |  i |  i |  i f S(   Ns3   %s(pattern=%r, gaps=%r, discard_empty=%r, flags=%r)(   t	   __class__t   __name__R   R   R   R	   (   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/tokenize/regexp.pyt   __repr__g   s    (   R   t
   __module__t   __doc__t   Falset   TrueR   t   UNICODEt	   MULTILINEt   DOTALLR   R   R   (    (    (    s*   /p/zhu/06/nlp/nltk/nltk/tokenize/regexp.pyR      s
   	:	t   BlanklineTokenizerc           B   s   e  Z d  Z d „  Z RS(   sù   
    A tokenizer that divides a string into substrings by treating any
    sequence of blank lines as a separator.  Blank lines are defined
    as lines containing no characters, or containing only space
    (C{' '}) or tab (C{'\t'}) characters.
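
    For example, a run of whitespace containing two newlines acts as
    a single separator (an illustrative doctest; the backslashes are
    doubled so the doctest itself sees C{\n} escapes):

        >>> BlanklineTokenizer().tokenize("One fish.\\n\\nTwo fish.")
        ['One fish.', 'Two fish.']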
    """
    def __init__(self):
        RegexpTokenizer.__init__(self, r'\s*\n\s*\n\s*', gaps=True)


class WordPunctTokenizer(RegexpTokenizer):
    """
    A tokenizer that divides a text into sequences of alphabetic and
    non-alphabetic characters.  E.g.:

        >>> WordPunctTokenizer().tokenize("She said 'hello'.")
        ['She', 'said', "'", 'hello', "'."]
    """
    def __init__(self):
        RegexpTokenizer.__init__(self, r'\w+|[^\w\s]+')


class WordTokenizer(RegexpTokenizer, Deprecated):
    """
    B{If you want to tokenize words, you should probably use
    TreebankWordTokenizer or word_tokenize() instead.}
    
    A tokenizer that divides a text into sequences of alphabetic
    characters.  Any non-alphabetic characters are discarded.  E.g.:

        >>> WordTokenizer().tokenize("She said 'hello'.")
        ['She', 'said', 'hello']
    """
    def __init__(self):
        RegexpTokenizer.__init__(self, r'\w+')

def regexp_tokenize(text, pattern, gaps=False, discard_empty=True,
                    flags=re.UNICODE | re.MULTILINE | re.DOTALL):
    """
    Split the given text string, based on the given regular expression
    pattern.  See the documentation for L{RegexpTokenizer.tokenize()}
    for descriptions of the arguments.
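
    E.g. (illustrative):

        >>> regexp_tokenize("A-B-C", pattern='-', gaps=True)
        ['A', 'B', 'C']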
    (   R   R   (   R   R   R   R   R   t	   tokenizer(    (    s*   /p/zhu/06/nlp/nltk/nltk/tokenize/regexp.pyt   regexp_tokenize“   s    (   R!   R   t   sre_constantst   nltk.internalsR    R   t   apit
   TokenizerIR   R'   R(   R)   R"   R#   R$   R%   R&   R+   R   t   blankline_tokenizet   wordpunct_tokenize(    (    (    s*   /p/zhu/06/nlp/nltk/nltk/tokenize/regexp.pys   <module>   s   