
# Natural Language Toolkit: Punkt sentence tokenizer
#
# Algorithm: Kiss & Strunk (2006)

"""
The Punkt sentence tokenizer.  The algorithm for this tokenizer is
described in Kiss & Strunk (2006)::

  Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence
    Boundary Detection.  Computational Linguistics 32: 485-525.
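
A typical use (an illustrative sketch; ``train_text`` stands for a
training corpus held in a single string)::

    trainer = PunktTrainer(train_text)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    sentences = tokenizer.tokenize('Mr. Smith left.  He did not return.')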
"""

import re
import math

from nltk.compat import defaultdict
from nltk.probability import FreqDist

from api import TokenizerI

######################################################################
#{ Orthographic Context Constants
######################################################################
# The following constants are used to describe the orthographic
# contexts in which a word can occur.  BEG=beginning, MID=middle,
# UNK=unknown, UC=uppercase, LC=lowercase.

_ORTHO_BEG_UC = 1 << 1
"""Orthographic context: beginning of a sentence with upper case."""

_ORTHO_MID_UC = 1 << 2
"""Orthographic context: middle of a sentence with upper case."""

_ORTHO_UNK_UC = 1 << 3
"""Orthographic context: unknown position in a sentence with upper case."""

_ORTHO_BEG_LC = 1 << 4
"""Orthographic context: beginning of a sentence with lower case."""

_ORTHO_MID_LC = 1 << 5
"""Orthographic context: middle of a sentence with lower case."""

_ORTHO_UNK_LC = 1 << 6
"""Orthographic context: unknown position in a sentence with lower case."""

_ORTHO_UC = _ORTHO_BEG_UC + _ORTHO_MID_UC + _ORTHO_UNK_UC
"""Orthographic context: occurs with upper case."""

_ORTHO_LC = _ORTHO_BEG_LC + _ORTHO_MID_LC + _ORTHO_UNK_LC
"""Orthographic context: occurs with lower case."""

_ORTHO_MAP = {
    ('initial',  'upper'): _ORTHO_BEG_UC,
    ('internal', 'upper'): _ORTHO_MID_UC,
    ('unknown',  'upper'): _ORTHO_UNK_UC,
    ('initial',  'lower'): _ORTHO_BEG_LC,
    ('internal', 'lower'): _ORTHO_MID_LC,
    ('unknown',  'lower'): _ORTHO_UNK_LC,
}
"""A map from context position and first-letter case to the
appropriate orthographic context flag."""


######################################################################
#{ Language-dependent variables
######################################################################

class PunktLanguageVars(object):
    """
    Stores variables, mostly regular expressions, which may be
    language-dependent for correct application of the algorithm.
    An extension of this class may modify its properties to suit
    a language other than English; an instance can then be passed
    as an argument to PunktSentenceTokenizer and PunktTrainer
    constructors.
    """

    __slots__ = ('_re_period_context', '_re_word_tokenizer')

    def __getstate__(self):
        # All modifications to the class are performed by inheritance, so
        # non-default parameters to be pickled must be defined in the
        # inheriting class; there is no instance state to save here.
        return 1

    def __setstate__(self, state):
        return 1

    sent_end_chars = ('.', '?', '!')
    """Characters which are candidates for sentence boundaries"""

    _re_sent_end_chars = '[.?!]'

    internal_punctuation = ',:;'
    """Sentence-internal punctuation, which indicates an abbreviation if
    preceded by a period-final token."""

    re_boundary_realignment = re.compile(r'["\')\]}]+?(?:\s+|(?=--)|$)',
                                         re.MULTILINE)
    """Used to realign punctuation that should be included in a sentence
    although it follows the period (or ?, !)."""

    _re_word_start = r"[^\(\"\`{\[:;&\#\*@\)}\]\-,]"
    """Excludes some characters from starting word tokens"""

    _re_non_word_chars = r"(?:[?!)\";}\]\*:@\'\({\[])"
    """Characters that cannot appear within words"""

    _re_multi_char_punct = r"(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)"
    """Hyphen and ellipsis are multi-character punctuation"""

    _word_tokenize_fmt = r'''(
        %(MultiChar)s
        |
        (?=%(WordStart)s)\S+?  # Accept word characters until end is found
        (?= # Sequences marking a word's end
            \s|                                 # White-space
            $|                                  # End-of-string
            %(NonWord)s|%(MultiChar)s|          # Punctuation
            ,(?=$|\s|%(NonWord)s|%(MultiChar)s) # Comma if at end of word
        )
        |
        \S
    )'''
    """Format of a regular expression to split punctuation from words,
    excluding period."""

    def _word_tokenizer_re(self):
        """Compiles and returns a regular expression for word tokenization"""
        try:
            return self._re_word_tokenizer
        except AttributeError:
            self._re_word_tokenizer = re.compile(
                self._word_tokenize_fmt %
                {
                    'NonWord':   self._re_non_word_chars,
                    'MultiChar': self._re_multi_char_punct,
                    'WordStart': self._re_word_start,
                },
                re.UNICODE | re.VERBOSE
            )
            return self._re_word_tokenizer

    def word_tokenize(self, s):
        """Tokenize a string to split off punctuation other than periods"""
        return self._word_tokenizer_re().findall(s)

    _period_context_fmt = r"""
        \S*                          # some word material
        %(SentEndChars)s             # a potential sentence ending
        (?=(?P<after_tok>
            %(NonWord)s              # either other punctuation
            |
            \s+(?P<next_tok>\S+)     # or whitespace and some other token
        ))"""
    """Format of a regular expression to find contexts including possible
    sentence boundaries."""

    def period_context_re(self):
        """Compiles and returns a regular expression to find contexts
        including possible sentence boundaries."""
        try:
            return self._re_period_context
        except AttributeError:
            self._re_period_context = re.compile(
                self._period_context_fmt %
                {
                    'NonWord':      self._re_non_word_chars,
                    'SentEndChars': self._re_sent_end_chars,
                },
                re.UNICODE | re.VERBOSE)
            return self._re_period_context


_re_non_punct = re.compile(r'[^\W\d]', re.UNICODE)
"""Matches token types that are not merely punctuation. (Types for
numeric tokens are changed to ##number## to simplify this.)"""


######################################################################
#{ Punkt Word Tokenizer
######################################################################

class PunktWordTokenizer(TokenizerI):

    def __init__(self, lang_vars=PunktLanguageVars()):
        self._lang_vars = lang_vars

    def tokenize(self, text):
        return self._lang_vars.word_tokenize(text)


######################################################################
#{ Helper Functions
######################################################################

def _pair_iter(it):
    """
    Yields pairs of tokens from the given iterator such that each input
    token will appear as the first element in a yielded tuple. The last
    pair will have None as its second element.
    """
    it = iter(it)
    prev = it.next()
    for el in it:
        yield (prev, el)
        prev = el
    yield (prev, None)
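
# For example (illustrative only): _pair_iter(iter(['a', 'b', 'c']))
# yields ('a', 'b'), ('b', 'c'), ('c', None), giving each pass through
# the tokens one token of lookahead while consuming the input lazily.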

######################################################################
#{ Punkt Parameters
######################################################################

class PunktParameters(object):
    """Stores data used to perform sentence boundary detection with punkt."""

    def __init__(self):
        self.abbrev_types = set()
        """A set of word types for known abbreviations."""

        self.collocations = set()
        """A set of word type tuples for known common collocations
        where the first word ends in a period."""

        self.sent_starters = set()
        """A set of word types for words that often appear at the
        beginning of sentences."""

        self.ortho_context = defaultdict(int)
        """A dictionary mapping word types to the set of orthographic
        contexts that word type appears in, represented as a bit field."""

    def clear_abbrevs(self):
        self.abbrev_types = set()

    def clear_collocations(self):
        self.collocations = set()

    def clear_sent_starters(self):
        self.sent_starters = set()

    def clear_ortho_context(self):
        self.ortho_context = defaultdict(int)

    def add_ortho_context(self, typ, flag):
        self.ortho_context[typ] |= flag
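
# Parameters need not come from training alone.  A minimal sketch of
# seeding them by hand (the attribute names are those defined above;
# abbreviation types are stored lowercased, without the final period):
#
#     params = PunktParameters()
#     params.abbrev_types.update(['dr', 'mr', 'etc'])
#     tokenizer = PunktSentenceTokenizer(params)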

######################################################################
#{ PunktToken
######################################################################

class PunktToken(object):
    """Stores a token of text with annotations produced during
    sentence boundary detection."""

    _properties = ['parastart', 'linestart', 'sentbreak', 'abbr', 'ellipsis']
    __slots__ = ['tok', 'type', 'period_final'] + _properties

    def __init__(self, tok, **params):
        self.tok = tok
        self.type = self._get_type(tok)
        self.period_final = tok.endswith('.')

        for p in self._properties:
            setattr(self, p, None)
        for k, v in params.iteritems():
            setattr(self, k, v)

    #////////////////////////////////////////////////////////////
    #{ Regular expressions for properties
    #////////////////////////////////////////////////////////////

    _RE_ELLIPSIS = re.compile(r'\.\.+$')
    _RE_NUMERIC = re.compile(r'^-?[\.,]?\d[\d,\.-]*\.?$')
    _RE_INITIAL = re.compile(r'[^\W\d]\.$', re.UNICODE)
    _RE_ALPHA = re.compile(r'[^\W\d]+$', re.UNICODE)

    #////////////////////////////////////////////////////////////
    #{ Derived properties
    #////////////////////////////////////////////////////////////

    def _get_type(self, tok):
        """Returns a case-normalized representation of the token."""
        return self._RE_NUMERIC.sub('##number##', tok.lower())

    @property
    def type_no_period(self):
        """
        The type with its final period removed if it has one.
        """
        if len(self.type) > 1 and self.type[-1] == '.':
            return self.type[:-1]
        return self.type

    @property
    def type_no_sentperiod(self):
        """
        The type with its final period removed if it is marked as a
        sentence break.
        """
        if self.sentbreak:
            return self.type_no_period
        return self.type

    @property
    def first_upper(self):
        """True if the token's first character is uppercase."""
        return self.tok[0].isupper()

    @property
    def first_lower(self):
        """True if the token's first character is lowercase."""
        return self.tok[0].islower()

    @property
    def first_case(self):
        if self.first_lower:
            return 'lower'
        if self.first_upper:
            return 'upper'
        return 'none'

    @property
    def is_ellipsis(self):
        """True if the token text is that of an ellipsis."""
        return self._RE_ELLIPSIS.match(self.tok)

    @property
    def is_number(self):
        """True if the token text is that of a number."""
        return self.type.startswith('##number##')

    @property
    def is_initial(self):
        """True if the token text is that of an initial."""
        return self._RE_INITIAL.match(self.tok)

    @property
    def is_alpha(self):
        """True if the token text is all alphabetic."""
        return self._RE_ALPHA.match(self.tok)

    @property
    def is_non_punct(self):
        """True if the token is either a number or is alphabetic."""
        return _re_non_punct.search(self.type)

    #////////////////////////////////////////////////////////////
    #{ String representation
    #////////////////////////////////////////////////////////////

    def __repr__(self):
        """
        A string representation of the token that can reproduce it
        with eval(), which lists all the token's non-default
        annotations.
        """
        if self.type != self.tok:
            typestr = ' type=%s,' % repr(self.type)
        else:
            typestr = ''

        propvals = ', '.join(
            '%s=%s' % (p, repr(getattr(self, p)))
            for p in self._properties
            if getattr(self, p)
        )

        return '%s(%s,%s %s)' % (self.__class__.__name__,
                                 repr(self.tok), typestr, propvals)

    def __str__(self):
        """
        A string representation akin to that used by Kiss and Strunk.
        """
        res = self.tok
        if self.abbr:
            res += '<A>'
        if self.ellipsis:
            res += '<E>'
        if self.sentbreak:
            res += '<S>'
        return res


######################################################################
#{ Punkt base class
######################################################################

class _PunktBaseClass(object):
    """
    Includes common components of PunktTrainer and PunktSentenceTokenizer.
    """

    def __init__(self, lang_vars=PunktLanguageVars(), token_cls=PunktToken,
                 params=PunktParameters()):
        self._params = params
        self._lang_vars = lang_vars
        self._Token = token_cls

    #////////////////////////////////////////////////////////////
    #{ Word tokenization
    #////////////////////////////////////////////////////////////

    def _tokenize_words(self, plaintext):
        """
        Divide the given text into tokens, using the punkt word
        segmentation regular expression, and generate the resulting
        stream of tokens.  Each token is annotated with two boolean
        values indicating whether it occurs at the start of a
        paragraph or at the start of a line, respectively.
        """
        parastart = False
        for line in plaintext.split('\n'):
            if line.strip():
                line_toks = iter(self._lang_vars.word_tokenize(line))
                yield self._Token(line_toks.next(),
                                  parastart=parastart, linestart=True)
                parastart = False
                for t in line_toks:
                    yield self._Token(t)
            else:
                parastart = True

    #////////////////////////////////////////////////////////////
    #{ Annotation Procedures
    #////////////////////////////////////////////////////////////

    def _annotate_first_pass(self, tokens):
        """
        Perform the first pass of annotation, which makes decisions
        based purely on the word type of each word:
        
          - '?', '!', and '.' are marked as sentence breaks.
          - sequences of two or more periods are marked as ellipsis.
          - any word ending in '.' that's a known abbreviation is
            marked as an abbreviation.
          - any other word ending in '.' is marked as a sentence break.

        The annotated tokens are yielded back one at a time.
        """
        for aug_tok in tokens:
            self._first_pass_annotation(aug_tok)
            yield aug_tok

    def _first_pass_annotation(self, aug_tok):
        """
        Performs type-based annotation on a single token.
        """
        tok = aug_tok.tok
        if tok in self._lang_vars.sent_end_chars:
            aug_tok.sentbreak = True
        elif aug_tok.is_ellipsis:
            aug_tok.ellipsis = True
        elif aug_tok.period_final and not tok.endswith('..'):
            if (tok[:-1].lower() in self._params.abbrev_types or
                tok[:-1].lower().split('-')[-1] in self._params.abbrev_types):
                aug_tok.abbr = True
            else:
                aug_tok.sentbreak = True


######################################################################
#{ Punkt Trainer
######################################################################

class PunktTrainer(_PunktBaseClass):
    """Learns parameters used in Punkt sentence boundary detection."""

    def __init__(self, train_text=None, verbose=False,
                 lang_vars=PunktLanguageVars(), token_cls=PunktToken):
        _PunktBaseClass.__init__(self, lang_vars=lang_vars,
                                 token_cls=token_cls)

        self._type_fdist = FreqDist()
        """A frequency distribution giving the frequency of each
        case-normalized token type in the training data."""

        self._num_period_toks = 0
        """The number of words ending in period in the training data."""

        self._collocation_fdist = FreqDist()
        """A frequency distribution giving the frequency of all bigrams
        in the training data where the first word ends in a period."""

        self._sent_starter_fdist = FreqDist()
        """A frequency distribution giving the frequency of all words
        that occur at the beginning of a sentence (after the first
        pass of annotation)."""

        self._sentbreak_count = 0
        """The total number of sentence breaks identified in training,
        used for calculating the frequent sentence starter heuristic."""

        self._finalized = True
        """A flag as to whether the training has been finalized by
        finding collocations and sentence starters, or whether training
        still needs to be finalized."""

        if train_text:
            self.train(train_text, verbose, finalize=True)

    def get_params(self):
        """
        Calculates and returns parameters for sentence boundary detection
        as derived from training.
        """
        if not self._finalized:
            self.finalize_training()
        return self._params

    #////////////////////////////////////////////////////////////
    #{ Customization Variables
    #////////////////////////////////////////////////////////////

    ABBREV = 0.3
    """cut-off value whether a 'token' is an abbreviation"""

    IGNORE_ABBREV_PENALTY = False
    """allows the disabling of the abbreviation penalty heuristic, which
    exponentially disadvantages words that are sometimes found without a
    final period."""

    ABBREV_BACKOFF = 5
    """upper cut-off for Mikheev's (2002) abbreviation detection algorithm"""

    COLLOCATION = 7.88
    """minimal log-likelihood value that two tokens need to be considered
    as a collocation"""

    SENT_STARTER = 30
    """minimal log-likelihood value that a token requires to be considered
    as a frequent sentence starter"""

    INCLUDE_ALL_COLLOCS = False
    """this includes as potential collocations all word pairs where the first
    word ends in a period. It may be useful in corpora where there is a lot
    of variation that makes abbreviations like Mr difficult to identify."""

    INCLUDE_ABBREV_COLLOCS = False
    """this includes as potential collocations all word pairs where the first
    word is an abbreviation. Such collocations override the orthographic
    heuristic, but not the sentence starter heuristic. This is overridden by
    INCLUDE_ALL_COLLOCS, and if both are false, only collocations with
    initials and ordinals are considered."""

    MIN_COLLOC_FREQ = 1
    """this sets a minimum bound on the number of times a bigram needs to
    appear before it can be considered a collocation, in addition to log
    likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is True."""

    #////////////////////////////////////////////////////////////
    #{ Training
    #////////////////////////////////////////////////////////////

    def train(self, text, verbose=False, finalize=True):
        """
        Collects training data from a given text. If finalize is True, it
        will determine all the parameters for sentence boundary detection. If
        not, this will be delayed until get_params() or finalize_training() is
        called. If verbose is True, abbreviations found will be listed.
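
        A sketch of incremental use (``corpus_part1``/``corpus_part2``
        are hypothetical corpus strings)::

            trainer = PunktTrainer()
            trainer.train(corpus_part1, finalize=False)
            trainer.train(corpus_part2)  # finalize defaults to True
            params = trainer.get_params()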
        """
        self._train_tokens(self._tokenize_words(text), verbose)
        if finalize:
            self.finalize_training(verbose)

    def train_tokens(self, tokens, verbose=False, finalize=True):
        """
        Collects training data from a given list of tokens.
        """
        self._train_tokens((self._Token(t) for t in tokens), verbose)
        if finalize:
            self.finalize_training(verbose)

    def _train_tokens(self, tokens, verbose):
        self._finalized = False

        # Ensure tokens are a list.
        tokens = list(tokens)

        # Find the frequency of each case-normalized type.  (Don't
        # strip off final periods.)  Also keep track of the number of
        # tokens that end in periods.
        for aug_tok in tokens:
            self._type_fdist.inc(aug_tok.type)
            if aug_tok.period_final:
                self._num_period_toks += 1

        # Look for new abbreviations, and for types that no longer are.
        unique_types = self._unique_types(tokens)
        for abbr, score, is_add in self._reclassify_abbrev_types(unique_types):
            if score >= self.ABBREV:
                if is_add:
                    self._params.abbrev_types.add(abbr)
                    if verbose:
                        print '  Abbreviation: [%6.4f] %s' % (score, abbr)
            else:
                if not is_add:
                    self._params.abbrev_types.remove(abbr)
                    if verbose:
                        print '  Removed abbreviation: [%6.4f] %s' % \
                              (score, abbr)

        # Make a preliminary pass through the document, marking likely
        # sentence breaks, abbreviations, and ellipsis tokens.
        tokens = list(self._annotate_first_pass(tokens))

        # Check what contexts each word type can appear in, given the
        # case of its first letter.
        self._get_orthography_data(tokens)

        # We need the total number of sentence breaks to find sentence
        # starters.
        self._sentbreak_count += self._get_sentbreak_count(tokens)

        # The remaining heuristics relate to pairs of tokens where the
        # first ends in a period.
        for aug_tok1, aug_tok2 in _pair_iter(tokens):
            if not aug_tok1.period_final or not aug_tok2:
                continue

            # Is the first token a rare abbreviation?
            if self._is_rare_abbrev_type(aug_tok1, aug_tok2):
                self._params.abbrev_types.add(aug_tok1.type_no_period)
                if verbose:
                    print '  Rare Abbrev: %s' % aug_tok1.type

            # Does the second token have a high likelihood of starting
            # a sentence?
            if self._is_potential_sent_starter(aug_tok2, aug_tok1):
                self._sent_starter_fdist.inc(aug_tok2.type)

            # Is this bigram a potential collocation?
            if self._is_potential_collocation(aug_tok1, aug_tok2):
                self._collocation_fdist.inc(
                    (aug_tok1.type_no_period, aug_tok2.type_no_sentperiod))

    def _unique_types(self, tokens):
        return set(aug_tok.type for aug_tok in tokens)

    def finalize_training(self, verbose=False):
        """
        Uses data that has been gathered in training to determine likely
        collocations and sentence starters.
        """
        self._params.clear_sent_starters()
        for typ, ll in self._find_sent_starters():
            self._params.sent_starters.add(typ)
            if verbose:
                print '  Sent Starter: [%6.4f] %r' % (ll, typ)

        self._params.clear_collocations()
        for (typ1, typ2), ll in self._find_collocations():
            self._params.collocations.add((typ1, typ2))
            if verbose:
                print '  Collocation: [%6.4f] %r+%r' % (ll, typ1, typ2)

        self._finalized = True

    #////////////////////////////////////////////////////////////
    #{ Overhead reduction
    #////////////////////////////////////////////////////////////

    def freq_threshold(self, ortho_thresh=2, type_thresh=2, colloc_thres=2,
                       sentstart_thresh=2):
        """
        Allows memory use to be reduced after much training by removing data
        about rare tokens that are unlikely to have a statistical effect with
        further training. Entries occurring above the given thresholds will be
        retained.
        """
        if ortho_thresh > 1:
            old_oc = self._params.ortho_context
            self._params.clear_ortho_context()
            for tok, count in self._type_fdist.iteritems():
                if count >= ortho_thresh:
                    self._params.ortho_context[tok] = old_oc[tok]

        self._type_fdist = self._freq_threshold(self._type_fdist, type_thresh)
        self._collocation_fdist = self._freq_threshold(
            self._collocation_fdist, colloc_thres)
        self._sent_starter_fdist = self._freq_threshold(
            self._sent_starter_fdist, sentstart_thresh)

    def _freq_threshold(self, fdist, threshold):
        """
        Returns a FreqDist containing only data with counts at or above a given
        threshold, as well as a mapping (None -> count_removed).
        """
        res = FreqDist()
        num_removed = 0
        for tok, count in fdist.iteritems():
            if count < threshold:
                num_removed += 1
            else:
                res.inc(tok, count)
        res.inc(None, num_removed)
        return res

    #////////////////////////////////////////////////////////////
    #{ Orthographic data
    #////////////////////////////////////////////////////////////

    def _get_orthography_data(self, tokens):
        """
        Collect information about whether each token type occurs
        with different case patterns (i) overall, (ii) at
        sentence-initial positions, and (iii) at sentence-internal
        positions.
        """
        # 'initial', 'internal', or 'unknown'
        context = 'internal'
        tokens = list(tokens)

        for aug_tok in tokens:
            # If we encounter a paragraph break, then it's a good sign
            # that it's a sentence break.  But err on the side of
            # caution (by not positing a sentence break) if we just
            # saw an abbreviation.
            if aug_tok.parastart and context != 'unknown':
                context = 'initial'

            # If we're at the beginning of a line, then we can't decide
            # between 'internal' and 'initial'.
            if aug_tok.linestart and context == 'internal':
                context = 'unknown'

            # Find the case-normalized type of the token.  If it's a
            # sentence-final token, strip off the period.
            typ = aug_tok.type_no_sentperiod

            # Update the orthographic context table.
            flag = _ORTHO_MAP.get((context, aug_tok.first_case), 0)
            if flag:
                self._params.add_ortho_context(typ, flag)

            # Decide whether the next word is at a sentence boundary.
            if aug_tok.sentbreak:
                if not (aug_tok.is_number or aug_tok.is_initial):
                    context = 'initial'
                else:
                    context = 'unknown'
            elif aug_tok.ellipsis or aug_tok.abbr:
                context = 'unknown'
            else:
                context = 'internal'

    #////////////////////////////////////////////////////////////
    #{ Abbreviations
    #////////////////////////////////////////////////////////////

    def _reclassify_abbrev_types(self, types):
        """
        (Re)classifies each given token if
          - it is period-final and not a known abbreviation; or
          - it is not period-final and is otherwise a known abbreviation
        by checking whether its previous classification still holds according
        to the heuristics of section 3.
        Yields triples (abbr, score, is_add) where abbr is the type in question,
        score is its log-likelihood with penalties applied, and is_add specifies
        whether the present type is a candidate for inclusion or exclusion as an
        abbreviation, such that:
          - (is_add and score >= 0.3)    suggests a new abbreviation; and
          - (not is_add and score < 0.3) suggests excluding an abbreviation.
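
        As an illustrative sketch, a type such as 'fig' that occurs
        almost exclusively as 'fig.' in the training data yields a
        strongly positive score (a candidate abbreviation), while a
        type that often occurs without its period is driven toward
        zero by the penalty factor computed below.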
        """
        for typ in types:
            # Check some basic conditions, to rule out words that are
            # clearly not abbrev_types.
            if not _re_non_punct.search(typ) or typ == '##number##':
                continue

            if typ.endswith('.'):
                if typ in self._params.abbrev_types:
                    continue
                typ = typ[:-1]
                is_add = True
            else:
                if typ not in self._params.abbrev_types:
                    continue
                is_add = False

            # Count how many periods & nonperiods are in the candidate.
            num_periods = typ.count('.') + 1
            num_nonperiods = len(typ) - num_periods + 1

            # Let <a> be the candidate without the period, and <b> be
            # the period.  Find a log likelihood ratio that indicates
            # whether <ab> occurs as a single unit (high value of ll),
            # or as two independent units <a> and <b> (low value of ll).
            count_with_period = self._type_fdist[typ + '.']
            count_without_period = self._type_fdist[typ]
            ll = self._dunning_log_likelihood(
                count_with_period + count_without_period,
                self._num_period_toks, count_with_period,
                self._type_fdist.N())

            # Apply three scaling factors to 'tweak' the basic log
            # likelihood ratio:
            #   F_length: long word => less likely to be an abbrev
            #   F_periods: more periods => more likely to be an abbrev
            #   F_penalty: penalize occurrences without a final period
            f_length = math.exp(-num_nonperiods)
            f_periods = num_periods
            f_penalty = (int(self.IGNORE_ABBREV_PENALTY) or
                         math.pow(num_nonperiods, -count_without_period))
            score = ll * f_length * f_periods * f_penalty

            yield typ, score, is_add

    def find_abbrev_types(self):
        """
        Recalculates abbreviations given type frequencies, despite no prior
        determination of abbreviations.
        This fails to include abbreviations otherwise found as "rare".
        """
        self._params.clear_abbrevs()
        tokens = (typ for typ in self._type_fdist
                  if typ and typ.endswith('.'))
        for abbr, score, is_add in self._reclassify_abbrev_types(tokens):
            if score >= self.ABBREV:
                self._params.abbrev_types.add(abbr)

    def _is_rare_abbrev_type(self, cur_tok, next_tok):
        """
        A word type is counted as a rare abbreviation if...
          - it's not already marked as an abbreviation
          - it occurs fewer than ABBREV_BACKOFF times
          - either it is followed by a sentence-internal punctuation
            mark, *or* it is followed by a lower-case word that
            sometimes appears capitalized at the start of a sentence,
            but never appears capitalized sentence-internally.
        """
        if cur_tok.abbr or not cur_tok.sentbreak:
            return False

        # Find the case-normalized type of the token.  If it's a
        # sentence-final token, strip off the period.
        typ = cur_tok.type_no_sentperiod

        # Proceed only if the type hasn't been categorized as an
        # abbreviation already, and is sufficiently rare.
        count = self._type_fdist[typ] + self._type_fdist[typ[:-1]]
        if typ in self._params.abbrev_types or count >= self.ABBREV_BACKOFF:
            return False

        # Record this token as an abbreviation if the next token is a
        # sentence-internal punctuation mark.
        if next_tok.tok[:1] in self._lang_vars.internal_punctuation:
            return True

        # Record this type as an abbreviation if the next token...
        # (i) starts with a lower case letter, (ii) sometimes occurs
        # with an upper case letter, and (iii) never occurs with an
        # upper case letter sentence-internally.
        elif next_tok.first_lower:
            typ2 = next_tok.type_no_sentperiod
            typ2ortho_context = self._params.ortho_context[typ2]
            if ((typ2ortho_context & _ORTHO_BEG_UC) and
                not (typ2ortho_context & _ORTHO_MID_UC)):
                return True

    #////////////////////////////////////////////////////////////
    #{ Log Likelihoods
    #////////////////////////////////////////////////////////////

    @staticmethod
    def _dunning_log_likelihood(count_a, count_b, count_ab, N):
        """
        A function that calculates the modified Dunning log-likelihood
        ratio scores for abbreviation candidates.  The details of how
        this works are available in the paper.
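
        In outline: the null hypothesis models the period as following
        the candidate at the corpus-wide rate p = count_b / N, while
        the alternative hypothesis fixes p = 0.99; the score returned
        is -2 * (log L(H0) - log L(H1)).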
        """
        p1 = float(count_b) / N
        p2 = 0.99

        null_hypo = (float(count_ab) * math.log(p1) +
                     (count_a - count_ab) * math.log(1.0 - p1))
        alt_hypo = (float(count_ab) * math.log(p2) +
                    (count_a - count_ab) * math.log(1.0 - p2))

        likelihood = null_hypo - alt_hypo

        return (-2.0 * likelihood)

    @staticmethod
    def _col_log_likelihood(count_a, count_b, count_ab, N):
        """
        A function that will just compute log-likelihood estimate, in
        the original paper it is described in algorithms 6 and 7.

        This *should* be the original Dunning log-likelihood values,
        unlike the previous log_l function where it used modified
        Dunning log-likelihood values
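
        Here count_a counts the first word of a candidate pair, count_b
        the second, count_ab their co-occurrence, and N the total token
        count; the first two summands below belong to the null
        hypothesis (a single rate p) and the last two to the
        alternative (separate rates p1 and p2).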
        """
        p = 1.0 * count_b / N
        p1 = 1.0 * count_ab / count_a
        p2 = 1.0 * (count_b - count_ab) / (N - count_a)

        summand1 = (count_ab * math.log(p) +
                    (count_a - count_ab) * math.log(1.0 - p))

        summand2 = ((count_b - count_ab) * math.log(p) +
                    (N - count_a - count_b + count_ab) * math.log(1.0 - p))

        if count_a == count_ab:
            summand3 = 0
        else:
            summand3 = (count_ab * math.log(p1) +
                        (count_a - count_ab) * math.log(1.0 - p1))

        if count_b == count_ab:
            summand4 = 0
        else:
            summand4 = ((count_b - count_ab) * math.log(p2) +
                        (N - count_a - count_b + count_ab) *
                        math.log(1.0 - p2))

        likelihood = summand1 + summand2 - summand3 - summand4

        return (-2.0 * likelihood)

    #////////////////////////////////////////////////////////////
    #{ Collocation Finder
    #////////////////////////////////////////////////////////////

    def _is_potential_collocation(self, aug_tok1, aug_tok2):
        """
        Returns True if the pair of tokens may form a collocation given
        log-likelihood statistics.
        """
        return ((self.INCLUDE_ALL_COLLOCS or
                 (self.INCLUDE_ABBREV_COLLOCS and aug_tok1.abbr) or
                 (aug_tok1.sentbreak and
                  (aug_tok1.is_number or aug_tok1.is_initial))) and
                aug_tok1.is_non_punct and
                aug_tok2.is_non_punct)

    def _find_collocations(self):
        """
        Generates likely collocations and their log-likelihood.
        """
        for types, col_count in self._collocation_fdist.iteritems():
            try:
                typ1, typ2 = types
            except TypeError:
                # types may be None after a freq_threshold()
                continue

            if typ2 in self._params.sent_starters:
                continue

            typ1_count = self._type_fdist[typ1] + self._type_fdist[typ1 + '.']
            typ2_count = self._type_fdist[typ2] + self._type_fdist[typ2 + '.']
            if (typ1_count > 1 and typ2_count > 1 and
                self.MIN_COLLOC_FREQ <
                    col_count <= min(typ1_count, typ2_count)):

                ll = self._col_log_likelihood(typ1_count, typ2_count,
                                              col_count,
                                              self._type_fdist.N())
                # Filter out the not-so-collocative.
                if (ll >= self.COLLOCATION and
                    (float(self._type_fdist.N()) / typ1_count >
                     float(typ2_count) / col_count)):
                    yield (typ1, typ2), ll

    #////////////////////////////////////////////////////////////
    #{ Frequent Sentence Starter Finder
    #////////////////////////////////////////////////////////////

    def _is_potential_sent_starter(self, cur_tok, prev_tok):
        """
        Returns True given a token and the token that precedes it, if it
        seems clear that the token is beginning a sentence.
        """
        # If a token (i) is preceded by a sentence break that is not a
        # potential ordinal number or initial, and (ii) is alphabetic,
        # then it is a sentence-starter candidate.
        return (prev_tok.sentbreak and
                not (prev_tok.is_number or prev_tok.is_initial) and
                cur_tok.is_alpha)

    def _find_sent_starters(self):
        """
        Uses collocation heuristics for each candidate token to
        determine if it frequently starts sentences.
        """
        for typ, typ_at_break_count in self._sent_starter_fdist.iteritems():
            if not typ:
                continue

            typ_count = self._type_fdist[typ] + self._type_fdist[typ + '.']
            if typ_count < typ_at_break_count:
                # Needed after freq_threshold().
                continue

            ll = self._col_log_likelihood(self._sentbreak_count, typ_count,
                                          typ_at_break_count,
                                          self._type_fdist.N())

            if (ll >= self.SENT_STARTER and
                float(self._type_fdist.N()) / self._sentbreak_count >
                float(typ_count) / typ_at_break_count):
                yield typ, ll

    def _get_sentbreak_count(self, tokens):
        """
        Returns the number of sentence breaks marked in a given set of
        augmented tokens.
        """
        return sum(1 for aug_tok in tokens if aug_tok.sentbreak)


######################################################################
#{ Punkt Sentence Tokenizer
######################################################################

class PunktSentenceTokenizer(_PunktBaseClass, TokenizerI):
    """
    A sentence tokenizer which uses an unsupervised algorithm to build
    a model for abbreviation words, collocations, and words that start
    sentences; and then uses that model to find sentence boundaries.
    This approach has been shown to work well for many European
    languages.
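
    Example (illustrative; ``train_text`` and ``document_text`` are
    assumed input strings)::

        tokenizer = PunktSentenceTokenizer(train_text)
        for sentence in tokenizer.tokenize(document_text):
            print sentence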
    """

    def __init__(self, train_text=None, verbose=False,
                 lang_vars=PunktLanguageVars(), token_cls=PunktToken):
        """
        train_text can either be the sole training text for this sentence
        boundary detector, or can be a PunktParameters object.
        """
        _PunktBaseClass.__init__(self, lang_vars=lang_vars,
                                 token_cls=token_cls)
        if train_text:
            self._params = self.train(train_text, verbose)

    def train(self, train_text, verbose=False):
        """
        Derives parameters from a given training text, or uses the parameters
        given. Repeated calls to this method destroy previous parameters. For
        incremental training, instantiate a separate PunktTrainer instance.
        """
        if type(train_text) not in (type(''), type(u'')):
            return train_text
        return PunktTrainer(train_text, lang_vars=self._lang_vars,
                            token_cls=self._Token).get_params()

    #////////////////////////////////////////////////////////////
    #{ Tokenization
    #////////////////////////////////////////////////////////////

    def tokenize(self, text, realign_boundaries=False):
        """
        Given a text, returns a list of the sentences in that text.
        """
        return list(self.sentences_from_text(text, realign_boundaries))

    def sentences_from_text(self, text, realign_boundaries=False):
        """
        Given a text, generates the sentences in that text by only
        testing candidate sentence breaks. If realign_boundaries is
        True, includes in the sentence closing punctuation that
        follows the period.
        """
        sents = self._sentences_from_text(text)
        if realign_boundaries:
            sents = self._realign_boundaries(sents)
        return sents

    def _sentences_from_text(self, text):
        last_break = 0
        for match in self._lang_vars.period_context_re().finditer(text):
            if self.text_contains_sentbreak(match.group() +
                                            match.group('after_tok')):
                yield text[last_break:match.end()]
                if match.group('next_tok'):
                    # next sentence starts after whitespace
                    last_break = match.start('next_tok')
                else:
                    # next sentence starts at the following punctuation
                    last_break = match.end()
        yield text[last_break:]

    def _realign_boundaries(self, sents):
        """
        Attempts to realign punctuation that falls after the period but
        should otherwise be included in the same sentence.

        For example: "(Sent1.) Sent2." will otherwise be split as::
        
            ["(Sent1.", ") Sent1."].
            
        This method will produce::
        
            ["(Sent1.)", "Sent2."].
        """
        realign = 0
        for s1, s2 in _pair_iter(sents):
            s1 = s1[realign:]
            if not s2:
                if s1:
                    yield s1
                continue

            m = self._lang_vars.re_boundary_realignment.match(s2)
            if m:
                yield s1 + m.group(0).strip()
                realign = m.end()
            else:
                realign = 0
                if s1:
                    yield s1

    def text_contains_sentbreak(self, text):
        """
        Returns True if the given text includes a sentence break.
        """
        found = False  # used to ignore the last token
        for t in self._annotate_tokens(self._tokenize_words(text)):
            if found:
                return True
            if t.sentbreak:
                found = True
        return False

    def sentences_from_text_legacy(self, text):
        """
        Given a text, generates the sentences in that text. Annotates all
        tokens, rather than just those with possible sentence breaks. Should
        produce the same results as L{sentences_from_text}.
        """
        tokens = self._annotate_tokens(self._tokenize_words(text))
        return self._build_sentence_list(text, tokens)

    def sentences_from_tokens(self, tokens):
        """
        Given a sequence of tokens, generates lists of tokens, each list
        corresponding to a sentence.
        """
        tokens = iter(self._annotate_tokens(self._Token(t) for t in tokens))
        sentence = []
        for aug_tok in tokens:
            sentence.append(aug_tok.tok)
            if aug_tok.sentbreak:
                yield sentence
                sentence = []
        if sentence:
            yield sentence

    def _annotate_tokens(self, tokens):
        """
        Given a set of tokens augmented with markers for line-start and
        paragraph-start, returns an iterator through those tokens with full
        annotation including predicted sentence breaks.
        """
        # Make a preliminary pass through the document, marking likely
        # sentence breaks, abbreviations, and ellipsis tokens.
        tokens = self._annotate_first_pass(tokens)

        # Make a second pass through the document, using token context
        # information to change our preliminary decisions about where
        # sentence breaks, abbreviations, and ellipsis occur.
        tokens = self._annotate_second_pass(tokens)

        return tokens

    def _build_sentence_list(self, text, tokens):
        """
        Given the original text and the list of augmented word tokens,
        construct and return a tokenized list of sentence strings.
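
        Most of the work is re-inserting the right whitespace between
        tokens; each token is matched back against the text with a
        whitespace-tolerant pattern, since the word tokenizer may have
        stripped spaces out of multi-part tokens.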
        """
        # Our position in the source text, used to keep track of which
        # whitespace to add:
        pos = 0

        # A regular expression that finds pieces of whitespace:
        WS_REGEXP = re.compile(r'\s*')

        sentence = ''
        for aug_tok in tokens:
            tok = aug_tok.tok

            # Find the whitespace before this token, and update pos.
            ws = WS_REGEXP.match(text, pos).group()
            pos += len(ws)

            # Some of the rules used by the punkt word tokenizer strip
            # whitespace out of words, requiring a loose match of the
            # token against the text.
            if text[pos:pos + len(tok)] != tok:
                pat = '\s*'.join(re.escape(c) for c in tok)
                m = re.match(pat, text[pos:])
                if m:
                    tok = m.group()

            # Move our position pointer to the end of the token.
            assert text[pos:pos + len(tok)] == tok
            pos += len(tok)

            # Add this token.  If it's not at the beginning of the
            # sentence, then include any whitespace that separated it
            # from the previous token.
            if sentence:
                sentence += ws
            sentence += tok

            # If we're at a sentence break, then start a new sentence.
            if aug_tok.sentbreak:
                yield sentence
                sentence = ''

        # If the last sentence is non-empty, yield it too.
        if sentence:
            yield sentence

    # [XX] TESTING
    def dump(self, tokens):
        print 'writing to /tmp/punkt.new...'
        out = open('/tmp/punkt.new', 'w')
        for aug_tok in tokens:
            if aug_tok.parastart:
                out.write('\n\n')
            elif aug_tok.linestart:
                out.write('\n')
            else:
                out.write(' ')
            out.write(str(aug_tok))
        out.close()

    #////////////////////////////////////////////////////////////
    #{ Customization Variables
    #////////////////////////////////////////////////////////////

    PUNCTUATION = tuple(';:,.!?')

    #////////////////////////////////////////////////////////////
    #{ Annotation Procedures
    #////////////////////////////////////////////////////////////

    def _annotate_second_pass(self, tokens):
        """
        Performs a token-based classification (section 4) over the given
        tokens, making use of the orthographic heuristic (4.1.1), collocation
        heuristic (4.1.2) and frequent sentence starter heuristic (4.1.3).
        """
        for t1, t2 in _pair_iter(tokens):
            self._second_pass_annotation(t1, t2)
            yield t1

    def _second_pass_annotation(self, aug_tok1, aug_tok2):
        """
        Performs token-based classification over a pair of contiguous tokens
        returning an updated augmented token for the first of them.
        """
        # Is it the last token?  We can't do anything then.
        if not aug_tok2:
            return

        tok = aug_tok1.tok
        if not aug_tok1.period_final:
            # We only care about words ending in periods.
            return

        typ = aug_tok1.type_no_period
        next_typ = aug_tok2.type_no_sentperiod
        tok_is_initial = aug_tok1.is_initial

        # [4.1.2. Collocation Heuristic] If there's a collocation
        # between the word before and after the period, then label tok
        # as an abbreviation and NOT a sentence break.  Note that
        # collocations with frequent sentence starters as their second
        # word are excluded in training.
        if (typ, next_typ) in self._params.collocations:
            aug_tok1.sentbreak = False
            aug_tok1.abbr = True
            return

        # [4.2. Token-Based Reclassification of Abbreviations] If the
        # token is an abbreviation or an ellipsis, then decide whether
        # we should *also* classify it as a sentence break.
        if (aug_tok1.abbr or aug_tok1.ellipsis) and not tok_is_initial:
            # [4.1.1. Orthographic Heuristic] Check if there's
            # orthographic evidence about whether the next word starts
            # a sentence or not.
            is_sent_starter = self._ortho_heuristic(aug_tok2)
            if is_sent_starter == True:
                aug_tok1.sentbreak = True
                return

            # [4.1.3. Frequent Sentence Starter Heuristic] If the next
            # word is capitalized, and is a member of the frequent
            # sentence starters list, then label tok as a sentence break.
            if (aug_tok2.first_upper and
                next_typ in self._params.sent_starters):
                aug_tok1.sentbreak = True
                return

        # [4.3. Token-Based Detection of Initials and Ordinals] Check
        # if any initial or ordinal tokens that are marked as sentence
        # breaks should be reclassified as abbreviations.
        if tok_is_initial or typ == '##number##':
            # [4.1.1. Orthographic Heuristic] Check if there's
            # orthographic evidence about whether the next word starts
            # a sentence or not.
            is_sent_starter = self._ortho_heuristic(aug_tok2)
            if is_sent_starter == False:
                aug_tok1.sentbreak = False
                aug_tok1.abbr = True
                return

            # Special heuristic for initials: if the orthographic
            # heuristic is unknown, and the next word is always
            # capitalized, then mark as abbrev (eg: J. Bach).
            if (is_sent_starter == 'unknown' and tok_is_initial and
                aug_tok2.first_upper and
                not (self._params.ortho_context[next_typ] & _ORTHO_LC)):
                aug_tok1.sentbreak = False
                aug_tok1.abbr = True
                return

    def _ortho_heuristic(self, aug_tok):
        """
        Decide whether the given token is the first token in a sentence.
        """
        # Sentences don't start with punctuation marks:
        if aug_tok.tok in self.PUNCTUATION:
            return False

        ortho_context = self._params.ortho_context[aug_tok.type_no_sentperiod]

        # If the word is capitalized, occurs at least once with a lower
        # case first letter, and never occurs with an upper case first
        # letter sentence-internally, then it's a sentence starter.
        if (aug_tok.first_upper and
            (ortho_context & _ORTHO_LC) and
            not (ortho_context & _ORTHO_MID_UC)):
            return True

        # If the word is lower case, and either (a) we've seen it used
        # with upper case, or (b) we've never seen it used
        # sentence-initially with lower case, then it's not a sentence
        # starter.
        if (aug_tok.first_lower and
            ((ortho_context & _ORTHO_UC) or
             not (ortho_context & _ORTHO_BEG_LC))):
            return False

        # Otherwise, we're not sure.
        return 'unknown'


######################################################################
#{ Demonstration
######################################################################

def main(text, tok_cls=PunktSentenceTokenizer, train_cls=PunktTrainer):
    """Builds a punkt model and applies it to the same text"""
    cleanup = lambda s: re.compile(r'(?:\r|^\s+)', re.MULTILINE) \
                          .sub('', s).replace('\n', ' ')

    trainer = train_cls()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)

    sbd = tok_cls(trainer.get_params())
    for l in sbd.sentences_from_text(text, realign_boundaries=True):
        print cleanup(l)

if __name__ == '__main__':
    import sys
    main(sys.stdin.read())
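
# Command-line sketch (main() above reads a corpus from stdin, trains
# on it, and prints the resulting segmentation):
#
#     python punkt.py < corpus.txt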