
# Natural Language Toolkit: Punkt sentence tokenizer
#
# Algorithm: Kiss & Strunk (2006)

"""
The Punkt sentence tokenizer.  The algorithm for this tokenizer is
described in Kiss & Strunk (2006)::

  Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence
    Boundary Detection.  Computational Linguistics 32: 485-525.
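
A typical use (an illustrative sketch; ``train_text`` stands for a
training corpus held in a single string)::

    trainer = PunktTrainer(train_text)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    sentences = tokenizer.tokenize('Mr. Smith left.  He did not return.')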
"""

import re
import math

from nltk.compat import defaultdict
from nltk.probability import FreqDist

from api import TokenizerI

######################################################################
#{ Orthographic Context Constants
######################################################################
# The following constants are used to describe the orthographic
# contexts in which a word can occur.  BEG=beginning, MID=middle,
# UNK=unknown, UC=uppercase, LC=lowercase.

_ORTHO_BEG_UC = 1 << 1
"""Orthographic context: beginning of a sentence with upper case."""

_ORTHO_MID_UC = 1 << 2
"""Orthographic context: middle of a sentence with upper case."""

_ORTHO_UNK_UC = 1 << 3
"""Orthographic context: unknown position in a sentence with upper case."""

_ORTHO_BEG_LC = 1 << 4
"""Orthographic context: beginning of a sentence with lower case."""

_ORTHO_MID_LC = 1 << 5
"""Orthographic context: middle of a sentence with lower case."""

_ORTHO_UNK_LC = 1 << 6
"""Orthographic context: unknown position in a sentence with lower case."""

_ORTHO_UC = _ORTHO_BEG_UC + _ORTHO_MID_UC + _ORTHO_UNK_UC
"""Orthographic context: occurs with upper case."""

_ORTHO_LC = _ORTHO_BEG_LC + _ORTHO_MID_LC + _ORTHO_UNK_LC
"""Orthographic context: occurs with lower case."""

_ORTHO_MAP = {
    ('initial',  'upper'): _ORTHO_BEG_UC,
    ('internal', 'upper'): _ORTHO_MID_UC,
    ('unknown',  'upper'): _ORTHO_UNK_UC,
    ('initial',  'lower'): _ORTHO_BEG_LC,
    ('internal', 'lower'): _ORTHO_MID_LC,
    ('unknown',  'lower'): _ORTHO_UNK_LC,
}
"""A map from context position and first-letter case to the
appropriate orthographic context flag."""


######################################################################
#{ Language-dependent variables
######################################################################

class PunktLanguageVars(object):
    """
    Stores variables, mostly regular expressions, which may be
    language-dependent for correct application of the algorithm.
    An extension of this class may modify its properties to suit
    a language other than English; an instance can then be passed
    as an argument to PunktSentenceTokenizer and PunktTrainer
    constructors.
    """

    __slots__ = ('_re_period_context', '_re_word_tokenizer')

    def __getstate__(self):
        # All modifications to the class are performed by inheritance, so
        # non-default parameters to be pickled must be defined in the
        # inheriting class; there is no instance state to save here.
        return 1

    def __setstate__(self, state):
        return 1

    sent_end_chars = ('.', '?', '!')
    """Characters which are candidates for sentence boundaries"""

    _re_sent_end_chars = '[.?!]'

    internal_punctuation = ',:;'
    """Sentence-internal punctuation, which indicates an abbreviation if
    preceded by a period-final token."""

    re_boundary_realignment = re.compile(r'["\')\]}]+?(?:\s+|(?=--)|$)',
                                         re.MULTILINE)
    """Used to realign punctuation that should be included in a sentence
    although it follows the period (or ?, !)."""

    _re_word_start = r"[^\(\"\`{\[:;&\#\*@\)}\]\-,]"
    """Excludes some characters from starting word tokens"""

    _re_non_word_chars = r"(?:[?!)\";}\]\*:@\'\({\[])"
    """Characters that cannot appear within words"""

    _re_multi_char_punct = r"(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)"
    """Hyphen and ellipsis are multi-character punctuation"""

    _word_tokenize_fmt = r'''(
        %(MultiChar)s
        |
        (?=%(WordStart)s)\S+?  # Accept word characters until end is found
        (?= # Sequences marking a word's end
            \s|                                 # White-space
            $|                                  # End-of-string
            %(NonWord)s|%(MultiChar)s|          # Punctuation
            ,(?=$|\s|%(NonWord)s|%(MultiChar)s) # Comma if at end of word
        )
        |
        \S
    )'''
    """Format of a regular expression to split punctuation from words,
    excluding period."""

    def _word_tokenizer_re(self):
        """Compiles and returns a regular expression for word tokenization"""
        try:
            return self._re_word_tokenizer
        except AttributeError:
            self._re_word_tokenizer = re.compile(
                self._word_tokenize_fmt %
                {
                    'NonWord':   self._re_non_word_chars,
                    'MultiChar': self._re_multi_char_punct,
                    'WordStart': self._re_word_start,
                },
                re.UNICODE | re.VERBOSE
            )
            return self._re_word_tokenizer

    def word_tokenize(self, s):
        """Tokenize a string to split off punctuation other than periods"""
        return self._word_tokenizer_re().findall(s)

    _period_context_fmt = r"""
        \S*                          # some word material
        %(SentEndChars)s             # a potential sentence ending
        (?=(?P<after_tok>
            %(NonWord)s              # either other punctuation
            |
            \s+(?P<next_tok>\S+)     # or whitespace and some other token
        ))"""
    """Format of a regular expression to find contexts including possible
    sentence boundaries."""

    def period_context_re(self):
        """Compiles and returns a regular expression to find contexts
        including possible sentence boundaries."""
        try:
            return self._re_period_context
        except AttributeError:
            self._re_period_context = re.compile(
                self._period_context_fmt %
                {
                    'NonWord':      self._re_non_word_chars,
                    'SentEndChars': self._re_sent_end_chars,
                },
                re.UNICODE | re.VERBOSE)
            return self._re_period_context


_re_non_punct = re.compile(r'[^\W\d]', re.UNICODE)
"""Matches token types that are not merely punctuation. (Types for
numeric tokens are changed to ##number## to simplify this.)"""


######################################################################
#{ Punkt Word Tokenizer
######################################################################

class PunktWordTokenizer(TokenizerI):

    def __init__(self, lang_vars=PunktLanguageVars()):
        self._lang_vars = lang_vars

    def tokenize(self, text):
        return self._lang_vars.word_tokenize(text)


######################################################################
#{ Helper Functions
######################################################################

def _pair_iter(it):
    """
    Yields pairs of tokens from the given iterator such that each input
    token will appear as the first element in a yielded tuple. The last
    pair will have None as its second element.
    """
    it = iter(it)
    prev = it.next()
    for el in it:
        yield (prev, el)
        prev = el
    yield (prev, None)
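
# For example (illustrative only): _pair_iter(iter(['a', 'b', 'c']))
# yields ('a', 'b'), ('b', 'c'), ('c', None), giving each pass through
# the tokens one token of lookahead while consuming the input lazily.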

######################################################################
#{ Punkt Parameters
######################################################################

class PunktParameters(object):
    """Stores data used to perform sentence boundary detection with punkt."""

    def __init__(self):
        self.abbrev_types = set()
        """A set of word types for known abbreviations."""

        self.collocations = set()
        """A set of word type tuples for known common collocations
        where the first word ends in a period."""

        self.sent_starters = set()
        """A set of word types for words that often appear at the
        beginning of sentences."""

        self.ortho_context = defaultdict(int)
        """A dictionary mapping word types to the set of orthographic
        contexts that word type appears in, represented as a bit field."""

    def clear_abbrevs(self):
        self.abbrev_types = set()

    def clear_collocations(self):
        self.collocations = set()

    def clear_sent_starters(self):
        self.sent_starters = set()

    def clear_ortho_context(self):
        self.ortho_context = defaultdict(int)

    def add_ortho_context(self, typ, flag):
        self.ortho_context[typ] |= flag
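
# Parameters need not come from training alone.  A minimal sketch of
# seeding them by hand (the attribute names are those defined above;
# abbreviation types are stored lowercased, without the final period):
#
#     params = PunktParameters()
#     params.abbrev_types.update(['dr', 'mr', 'etc'])
#     tokenizer = PunktSentenceTokenizer(params)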

######################################################################
#{ PunktToken
######################################################################

class PunktToken(object):
    """Stores a token of text with annotations produced during
    sentence boundary detection."""

    _properties = ['parastart', 'linestart', 'sentbreak', 'abbr', 'ellipsis']
    __slots__ = ['tok', 'type', 'period_final'] + _properties

    def __init__(self, tok, **params):
        self.tok = tok
        self.type = self._get_type(tok)
        self.period_final = tok.endswith('.')

        for p in self._properties:
            setattr(self, p, None)
        for k, v in params.iteritems():
            setattr(self, k, v)

    #////////////////////////////////////////////////////////////
    #{ Regular expressions for properties
    #////////////////////////////////////////////////////////////

    _RE_ELLIPSIS = re.compile(r'\.\.+$')
    _RE_NUMERIC = re.compile(r'^-?[\.,]?\d[\d,\.-]*\.?$')
    _RE_INITIAL = re.compile(r'[^\W\d]\.$', re.UNICODE)
    _RE_ALPHA = re.compile(r'[^\W\d]+$', re.UNICODE)

    #////////////////////////////////////////////////////////////
    #{ Derived properties
    #////////////////////////////////////////////////////////////

    def _get_type(self, tok):
        """Returns a case-normalized representation of the token."""
        return self._RE_NUMERIC.sub('##number##', tok.lower())

    @property
    def type_no_period(self):
        """
        The type with its final period removed if it has one.
        """
        if len(self.type) > 1 and self.type[-1] == '.':
            return self.type[:-1]
        return self.type

    @property
    def type_no_sentperiod(self):
        """
        The type with its final period removed if it is marked as a
        sentence break.
        """
        if self.sentbreak:
            return self.type_no_period
        return self.type

    @property
    def first_upper(self):
        """True if the token's first character is uppercase."""
        return self.tok[0].isupper()

    @property
    def first_lower(self):
        """True if the token's first character is lowercase."""
        return self.tok[0].islower()

    @property
    def first_case(self):
        if self.first_lower:
            return 'lower'
        if self.first_upper:
            return 'upper'
        return 'none'

    @property
    def is_ellipsis(self):
        """True if the token text is that of an ellipsis."""
        return self._RE_ELLIPSIS.match(self.tok)

    @property
    def is_number(self):
        """True if the token text is that of a number."""
        return self.type.startswith('##number##')

    @property
    def is_initial(self):
        """True if the token text is that of an initial."""
        return self._RE_INITIAL.match(self.tok)

    @property
    def is_alpha(self):
        """True if the token text is all alphabetic."""
        return self._RE_ALPHA.match(self.tok)

    @property
    def is_non_punct(self):
        """True if the token is either a number or is alphabetic."""
        return _re_non_punct.search(self.type)

    #////////////////////////////////////////////////////////////
    #{ String representation
    #////////////////////////////////////////////////////////////

    def __repr__(self):
        """
        A string representation of the token that can reproduce it
        with eval(), which lists all the token's non-default
        annotations.
        """
        if self.type != self.tok:
            typestr = ' type=%s,' % repr(self.type)
        else:
            typestr = ''

        propvals = ', '.join(
            '%s=%s' % (p, repr(getattr(self, p)))
            for p in self._properties
            if getattr(self, p)
        )

        return '%s(%s,%s %s)' % (self.__class__.__name__,
                                 repr(self.tok), typestr, propvals)

    def __str__(self):
        """
        A string representation akin to that used by Kiss and Strunk.
        """
        res = self.tok
        if self.abbr:
            res += '<A>'
        if self.ellipsis:
            res += '<E>'
        if self.sentbreak:
            res += '<S>'
        return res


######################################################################
#{ Punkt base class
######################################################################

class _PunktBaseClass(object):
    """
    Includes common components of PunktTrainer and PunktSentenceTokenizer.
    """

    def __init__(self, lang_vars=PunktLanguageVars(), token_cls=PunktToken,
                 params=PunktParameters()):
        self._params = params
        self._lang_vars = lang_vars
        self._Token = token_cls

    #////////////////////////////////////////////////////////////
    #{ Word tokenization
    #////////////////////////////////////////////////////////////

    def _tokenize_words(self, plaintext):
        """
        Divide the given text into tokens, using the punkt word
        segmentation regular expression, and generate the resulting
        stream of tokens.  Each token is annotated with two boolean
        values indicating whether it occurs at the start of a
        paragraph or at the start of a line, respectively.
        """
        parastart = False
        for line in plaintext.split('\n'):
            if line.strip():
                line_toks = iter(self._lang_vars.word_tokenize(line))
                yield self._Token(line_toks.next(),
                                  parastart=parastart, linestart=True)
                parastart = False
                for t in line_toks:
                    yield self._Token(t)
            else:
                parastart = True

    #////////////////////////////////////////////////////////////
    #{ Annotation Procedures
    #////////////////////////////////////////////////////////////

    def _annotate_first_pass(self, tokens):
        """
        Perform the first pass of annotation, which makes decisions
        based purely on the word type of each word:
        
          - '?', '!', and '.' are marked as sentence breaks.
          - sequences of two or more periods are marked as ellipsis.
          - any word ending in '.' that's a known abbreviation is
            marked as an abbreviation.
          - any other word ending in '.' is marked as a sentence break.

        The annotated tokens are yielded back one at a time.
        """
        for aug_tok in tokens:
            self._first_pass_annotation(aug_tok)
            yield aug_tok

    def _first_pass_annotation(self, aug_tok):
        """
        Performs type-based annotation on a single token.
        """
        tok = aug_tok.tok
        if tok in self._lang_vars.sent_end_chars:
            aug_tok.sentbreak = True
        elif aug_tok.is_ellipsis:
            aug_tok.ellipsis = True
        elif aug_tok.period_final and not tok.endswith('..'):
            if (tok[:-1].lower() in self._params.abbrev_types or
                tok[:-1].lower().split('-')[-1] in self._params.abbrev_types):
                aug_tok.abbr = True
            else:
                aug_tok.sentbreak = True


######################################################################
#{ Punkt Trainer
######################################################################

class PunktTrainer(_PunktBaseClass):
    """Learns parameters used in Punkt sentence boundary detection."""

    def __init__(self, train_text=None, verbose=False,
                 lang_vars=PunktLanguageVars(), token_cls=PunktToken):
        _PunktBaseClass.__init__(self, lang_vars=lang_vars,
                                 token_cls=token_cls)

        self._type_fdist = FreqDist()
        """A frequency distribution giving the frequency of each
        case-normalized token type in the training data."""

        self._num_period_toks = 0
        """The number of words ending in period in the training data."""

        self._collocation_fdist = FreqDist()
        """A frequency distribution giving the frequency of all bigrams
        in the training data where the first word ends in a period."""

        self._sent_starter_fdist = FreqDist()
        """A frequency distribution giving the frequency of all words
        that occur at the beginning of a sentence (after the first
        pass of annotation)."""

        self._sentbreak_count = 0
        """The total number of sentence breaks identified in training,
        used for calculating the frequent sentence starter heuristic."""

        self._finalized = True
        """A flag as to whether the training has been finalized by
        finding collocations and sentence starters, or whether training
        still needs to be finalized."""

        if train_text:
            self.train(train_text, verbose, finalize=True)

    def get_params(self):
        """
        Calculates and returns parameters for sentence boundary detection
        as derived from training.
        """
        if not self._finalized:
            self.finalize_training()
        return self._params

    #////////////////////////////////////////////////////////////
    #{ Customization Variables
    #////////////////////////////////////////////////////////////

    ABBREV = 0.3
    """cut-off value whether a 'token' is an abbreviation"""

    IGNORE_ABBREV_PENALTY = False
    """allows the disabling of the abbreviation penalty heuristic, which
    exponentially disadvantages words that are sometimes found without a
    final period."""

    ABBREV_BACKOFF = 5
    """upper cut-off for Mikheev's (2002) abbreviation detection algorithm"""

    COLLOCATION = 7.88
    """minimal log-likelihood value that two tokens need to be considered
    as a collocation"""

    SENT_STARTER = 30
    """minimal log-likelihood value that a token requires to be considered
    as a frequent sentence starter"""

    INCLUDE_ALL_COLLOCS = False
    """this includes as potential collocations all word pairs where the first
    word ends in a period. It may be useful in corpora where there is a lot
    of variation that makes abbreviations like Mr difficult to identify."""

    INCLUDE_ABBREV_COLLOCS = False
    """this includes as potential collocations all word pairs where the first
    word is an abbreviation. Such collocations override the orthographic
    heuristic, but not the sentence starter heuristic. This is overridden by
    INCLUDE_ALL_COLLOCS, and if both are false, only collocations with
    initials and ordinals are considered."""

    MIN_COLLOC_FREQ = 1
    """this sets a minimum bound on the number of times a bigram needs to
    appear before it can be considered a collocation, in addition to log
    likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is True."""

    #////////////////////////////////////////////////////////////
    #{ Training
    #////////////////////////////////////////////////////////////

    def train(self, text, verbose=False, finalize=True):
        """
        Collects training data from a given text. If finalize is True, it
        will determine all the parameters for sentence boundary detection. If
        not, this will be delayed until get_params() or finalize_training() is
        called. If verbose is True, abbreviations found will be listed.
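
        A sketch of incremental use (``corpus_part1``/``corpus_part2``
        are hypothetical corpus strings)::

            trainer = PunktTrainer()
            trainer.train(corpus_part1, finalize=False)
            trainer.train(corpus_part2)  # finalize defaults to True
            params = trainer.get_params()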
        """
        self._train_tokens(self._tokenize_words(text), verbose)
        if finalize:
            self.finalize_training(verbose)

    def train_tokens(self, tokens, verbose=False, finalize=True):
        """
        Collects training data from a given list of tokens.
        """
        self._train_tokens((self._Token(t) for t in tokens), verbose)
        if finalize:
            self.finalize_training(verbose)

    def _train_tokens(self, tokens, verbose):
        self._finalized = False

        # Ensure tokens are a list.
        tokens = list(tokens)

        # Find the frequency of each case-normalized type.  (Don't
        # strip off final periods.)  Also keep track of the number of
        # tokens that end in periods.
        for aug_tok in tokens:
            self._type_fdist.inc(aug_tok.type)
            if aug_tok.period_final:
                self._num_period_toks += 1

        # Look for new abbreviations, and for types that no longer are.
        unique_types = self._unique_types(tokens)
        for abbr, score, is_add in self._reclassify_abbrev_types(unique_types):
            if score >= self.ABBREV:
                if is_add:
                    self._params.abbrev_types.add(abbr)
                    if verbose:
                        print '  Abbreviation: [%6.4f] %s' % (score, abbr)
            else:
                if not is_add:
                    self._params.abbrev_types.remove(abbr)
                    if verbose:
                        print '  Removed abbreviation: [%6.4f] %s' % \
                              (score, abbr)

        # Make a preliminary pass through the document, marking likely
        # sentence breaks, abbreviations, and ellipsis tokens.
        tokens = list(self._annotate_first_pass(tokens))

        # Check what contexts each word type can appear in, given the
        # case of its first letter.
        self._get_orthography_data(tokens)

        # We need the total number of sentence breaks to find sentence
        # starters.
        self._sentbreak_count += self._get_sentbreak_count(tokens)

        # The remaining heuristics relate to pairs of tokens where the
        # first ends in a period.
        for aug_tok1, aug_tok2 in _pair_iter(tokens):
            if not aug_tok1.period_final or not aug_tok2:
                continue

            # Is the first token a rare abbreviation?
            if self._is_rare_abbrev_type(aug_tok1, aug_tok2):
                self._params.abbrev_types.add(aug_tok1.type_no_period)
                if verbose:
                    print '  Rare Abbrev: %s' % aug_tok1.type

            # Does the second token have a high likelihood of starting
            # a sentence?
            if self._is_potential_sent_starter(aug_tok2, aug_tok1):
                self._sent_starter_fdist.inc(aug_tok2.type)

            # Is this bigram a potential collocation?
            if self._is_potential_collocation(aug_tok1, aug_tok2):
                self._collocation_fdist.inc(
                    (aug_tok1.type_no_period, aug_tok2.type_no_sentperiod))

    def _unique_types(self, tokens):
        return set(aug_tok.type for aug_tok in tokens)

    def finalize_training(self, verbose=False):
        """
        Uses data that has been gathered in training to determine likely
        collocations and sentence starters.
        """
        self._params.clear_sent_starters()
        for typ, ll in self._find_sent_starters():
            self._params.sent_starters.add(typ)
            if verbose:
                print '  Sent Starter: [%6.4f] %r' % (ll, typ)

        self._params.clear_collocations()
        for (typ1, typ2), ll in self._find_collocations():
            self._params.collocations.add((typ1, typ2))
            if verbose:
                print '  Collocation: [%6.4f] %r+%r' % (ll, typ1, typ2)

        self._finalized = True

    #////////////////////////////////////////////////////////////
    #{ Overhead reduction
    #////////////////////////////////////////////////////////////

    def freq_threshold(self, ortho_thresh=2, type_thresh=2, colloc_thres=2,
                       sentstart_thresh=2):
        """
        Allows memory use to be reduced after much training by removing data
        about rare tokens that are unlikely to have a statistical effect with
        further training. Entries occurring above the given thresholds will be
        retained.
        """
        if ortho_thresh > 1:
            old_oc = self._params.ortho_context
            self._params.clear_ortho_context()
            for tok, count in self._type_fdist.iteritems():
                if count >= ortho_thresh:
                    self._params.ortho_context[tok] = old_oc[tok]

        self._type_fdist = self._freq_threshold(self._type_fdist, type_thresh)
        self._collocation_fdist = self._freq_threshold(
            self._collocation_fdist, colloc_thres)
        self._sent_starter_fdist = self._freq_threshold(
            self._sent_starter_fdist, sentstart_thresh)

    def _freq_threshold(self, fdist, threshold):
        """
        Returns a FreqDist containing only data with counts at or above a given
        threshold, as well as a mapping (None -> count_removed).
        """
        res = FreqDist()
        num_removed = 0
        for tok, count in fdist.iteritems():
            if count < threshold:
                num_removed += 1
            else:
                res.inc(tok, count)
        res.inc(None, num_removed)
        return res

    #////////////////////////////////////////////////////////////
    #{ Orthographic data
    #////////////////////////////////////////////////////////////

    def _get_orthography_data(self, tokens):
        """
        Collect information about whether each token type occurs
        with different case patterns (i) overall, (ii) at
        sentence-initial positions, and (iii) at sentence-internal
        positions.
        """
        # 'initial', 'internal', or 'unknown'
        context = 'internal'
        tokens = list(tokens)

        for aug_tok in tokens:
            # If we encounter a paragraph break, then it's a good sign
            # that it's a sentence break.  But err on the side of
            # caution (by not positing a sentence break) if we just
            # saw an abbreviation.
            if aug_tok.parastart and context != 'unknown':
                context = 'initial'

            # If we're at the beginning of a line, then we can't decide
            # between 'internal' and 'initial'.
            if aug_tok.linestart and context == 'internal':
                context = 'unknown'

            # Find the case-normalized type of the token.  If it's a
            # sentence-final token, strip off the period.
            typ = aug_tok.type_no_sentperiod

            # Update the orthographic context table.
            flag = _ORTHO_MAP.get((context, aug_tok.first_case), 0)
            if flag:
                self._params.add_ortho_context(typ, flag)

            # Decide whether the next word is at a sentence boundary.
            if aug_tok.sentbreak:
                if not (aug_tok.is_number or aug_tok.is_initial):
                    context = 'initial'
                else:
                    context = 'unknown'
            elif aug_tok.ellipsis or aug_tok.abbr:
                context = 'unknown'
            else:
                context = 'internal'

    #////////////////////////////////////////////////////////////
    #{ Abbreviations
    #////////////////////////////////////////////////////////////

    def _reclassify_abbrev_types(self, types):
        """
        (Re)classifies each given token if
          - it is period-final and not a known abbreviation; or
          - it is not period-final and is otherwise a known abbreviation
        by checking whether its previous classification still holds according
        to the heuristics of section 3.
        Yields triples (abbr, score, is_add) where abbr is the type in question,
        score is its log-likelihood with penalties applied, and is_add specifies
        whether the present type is a candidate for inclusion or exclusion as an
        abbreviation, such that:
          - (is_add and score >= 0.3)    suggests a new abbreviation; and
          - (not is_add and score < 0.3) suggests excluding an abbreviation.
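
        As an illustrative sketch, a type such as 'fig' that occurs
        almost exclusively as 'fig.' in the training data yields a
        strongly positive score (a candidate abbreviation), while a
        type that often occurs without its period is driven toward
        zero by the penalty factor computed below.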
        """
        for typ in types:
            # Check some basic conditions, to rule out words that are
            # clearly not abbrev_types.
            if not _re_non_punct.search(typ) or typ == '##number##':
                continue

            if typ.endswith('.'):
                if typ in self._params.abbrev_types:
                    continue
                typ = typ[:-1]
                is_add = True
            else:
                if typ not in self._params.abbrev_types:
                    continue
                is_add = False

            # Count how many periods & nonperiods are in the candidate.
            num_periods = typ.count('.') + 1
            num_nonperiods = len(typ) - num_periods + 1

            # Let <a> be the candidate without the period, and <b> be
            # the period.  Find a log likelihood ratio that indicates
            # whether <ab> occurs as a single unit (high value of ll),
            # or as two independent units <a> and <b> (low value of ll).
            count_with_period = self._type_fdist[typ + '.']
            count_without_period = self._type_fdist[typ]
            ll = self._dunning_log_likelihood(
                count_with_period + count_without_period,
                self._num_period_toks, count_with_period,
                self._type_fdist.N())

            # Apply three scaling factors to 'tweak' the basic log
            # likelihood ratio:
            #   F_length: long word => less likely to be an abbrev
            #   F_periods: more periods => more likely to be an abbrev
            #   F_penalty: penalize occurrences without a final period
            f_length = math.exp(-num_nonperiods)
            f_periods = num_periods
            f_penalty = (int(self.IGNORE_ABBREV_PENALTY) or
                         math.pow(num_nonperiods, -count_without_period))
            score = ll * f_length * f_periods * f_penalty

            yield typ, score, is_add

    def find_abbrev_types(self):
        """
        Recalculates abbreviations given type frequencies, despite no prior
        determination of abbreviations.
        This fails to include abbreviations otherwise found as "rare".
        """
        self._params.clear_abbrevs()
        tokens = (typ for typ in self._type_fdist
                  if typ and typ.endswith('.'))
        for abbr, score, is_add in self._reclassify_abbrev_types(tokens):
            if score >= self.ABBREV:
                self._params.abbrev_types.add(abbr)

    def _is_rare_abbrev_type(self, cur_tok, next_tok):
        """
        A word type is counted as a rare abbreviation if...
          - it's not already marked as an abbreviation
          - it occurs fewer than ABBREV_BACKOFF times
          - either it is followed by a sentence-internal punctuation
            mark, *or* it is followed by a lower-case word that
            sometimes appears capitalized at the start of a sentence,
            but never appears capitalized sentence-internally.
        """
        if cur_tok.abbr or not cur_tok.sentbreak:
            return False

        # Find the case-normalized type of the token.  If it's a
        # sentence-final token, strip off the period.
        typ = cur_tok.type_no_sentperiod

        # Proceed only if the type hasn't been categorized as an
        # abbreviation already, and is sufficiently rare.
        count = self._type_fdist[typ] + self._type_fdist[typ[:-1]]
        if typ in self._params.abbrev_types or count >= self.ABBREV_BACKOFF:
            return False

        # Record this token as an abbreviation if the next token is a
        # sentence-internal punctuation mark.
        if next_tok.tok[:1] in self._lang_vars.internal_punctuation:
            return True

        # Record this type as an abbreviation if the next token...
        # (i) starts with a lower case letter, (ii) sometimes occurs
        # with an upper case letter, and (iii) never occurs with an
        # upper case letter sentence-internally.
        elif next_tok.first_lower:
            typ2 = next_tok.type_no_sentperiod
            typ2ortho_context = self._params.ortho_context[typ2]
            if ((typ2ortho_context & _ORTHO_BEG_UC) and
                not (typ2ortho_context & _ORTHO_MID_UC)):
                return True

    #////////////////////////////////////////////////////////////
    #{ Log Likelihoods
    #////////////////////////////////////////////////////////////

    @staticmethod
    def _dunning_log_likelihood(count_a, count_b, count_ab, N):
        """
        A function that calculates the modified Dunning log-likelihood
        ratio scores for abbreviation candidates.  The details of how
        this works are available in the paper.
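
        In outline: the null hypothesis models the period as following
        the candidate at the corpus-wide rate p = count_b / N, while
        the alternative hypothesis fixes p = 0.99; the score returned
        is -2 * (log L(H0) - log L(H1)).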
        """
        p1 = float(count_b) / N
        p2 = 0.99

        null_hypo = (float(count_ab) * math.log(p1) +
                     (count_a - count_ab) * math.log(1.0 - p1))
        alt_hypo = (float(count_ab) * math.log(p2) +
                    (count_a - count_ab) * math.log(1.0 - p2))

        likelihood = null_hypo - alt_hypo

        return (-2.0 * likelihood)

    @staticmethod
    def _col_log_likelihood(count_a, count_b, count_ab, N):
        """
        A function that will just compute log-likelihood estimate, in
        the original paper it is described in algorithms 6 and 7.

        This *should* be the original Dunning log-likelihood values,
        unlike the previous log_l function where it used modified
        Dunning log-likelihood values
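
        Here count_a counts the first word of a candidate pair, count_b
        the second, count_ab their co-occurrence, and N the total token
        count; the first two summands below belong to the null
        hypothesis (a single rate p) and the last two to the
        alternative (separate rates p1 and p2).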
        """
        p = 1.0 * count_b / N
        p1 = 1.0 * count_ab / count_a
        p2 = 1.0 * (count_b - count_ab) / (N - count_a)

        summand1 = (count_ab * math.log(p) +
                    (count_a - count_ab) * math.log(1.0 - p))

        summand2 = ((count_b - count_ab) * math.log(p) +
                    (N - count_a - count_b + count_ab) * math.log(1.0 - p))

        if count_a == count_ab:
            summand3 = 0
        else:
            summand3 = (count_ab * math.log(p1) +
                        (count_a - count_ab) * math.log(1.0 - p1))

        if count_b == count_ab:
            summand4 = 0
        else:
            summand4 = ((count_b - count_ab) * math.log(p2) +
                        (N - count_a - count_b + count_ab) *
                        math.log(1.0 - p2))

        likelihood = summand1 + summand2 - summand3 - summand4

        return (-2.0 * likelihood)

    #////////////////////////////////////////////////////////////
    #{ Collocation Finder
    #////////////////////////////////////////////////////////////

    def _is_potential_collocation(self, aug_tok1, aug_tok2):
        """
        Returns True if the pair of tokens may form a collocation given
        log-likelihood statistics.
        """
        return ((self.INCLUDE_ALL_COLLOCS or
                 (self.INCLUDE_ABBREV_COLLOCS and aug_tok1.abbr) or
                 (aug_tok1.sentbreak and
                  (aug_tok1.is_number or aug_tok1.is_initial))) and
                aug_tok1.is_non_punct and
                aug_tok2.is_non_punct)

    def _find_collocations(self):
        """
        Generates likely collocations and their log-likelihood.
        """
        for types, col_count in self._collocation_fdist.iteritems():
            try:
                typ1, typ2 = types
            except TypeError:
                # types may be None after a freq_threshold()
                continue

            if typ2 in self._params.sent_starters:
                continue

            typ1_count = self._type_fdist[typ1] + self._type_fdist[typ1 + '.']
            typ2_count = self._type_fdist[typ2] + self._type_fdist[typ2 + '.']
            if (typ1_count > 1 and typ2_count > 1 and
                self.MIN_COLLOC_FREQ <
                    col_count <= min(typ1_count, typ2_count)):

                ll = self._col_log_likelihood(typ1_count, typ2_count,
                                              col_count,
                                              self._type_fdist.N())
                # Filter out the not-so-collocative.
                if (ll >= self.COLLOCATION and
                    (float(self._type_fdist.N()) / typ1_count >
                     float(typ2_count) / col_count)):
                    yield (typ1, typ2), ll

    #////////////////////////////////////////////////////////////
    #{ Frequent Sentence Starter Finder
    #////////////////////////////////////////////////////////////

    def _is_potential_sent_starter(self, cur_tok, prev_tok):
        """
        Returns True given a token and the token that precedes it, if it
        seems clear that the token is beginning a sentence.
        """
        # If a token (i) is preceded by a sentence break that is not a
        # potential ordinal number or initial, and (ii) is alphabetic,
        # then it is a sentence-starter candidate.
        return (prev_tok.sentbreak and
                not (prev_tok.is_number or prev_tok.is_initial) and
                cur_tok.is_alpha)

    def _find_sent_starters(self):
        """
        Uses collocation heuristics for each candidate token to
        determine if it frequently starts sentences.
        """
        for typ, typ_at_break_count in self._sent_starter_fdist.iteritems():
            if not typ:
                continue

            typ_count = self._type_fdist[typ] + self._type_fdist[typ + '.']
            if typ_count < typ_at_break_count:
                # Needed after freq_threshold().
                continue

            ll = self._col_log_likelihood(self._sentbreak_count, typ_count,
                                          typ_at_break_count,
                                          self._type_fdist.N())

            if (ll >= self.SENT_STARTER and
                float(self._type_fdist.N()) / self._sentbreak_count >
                float(typ_count) / typ_at_break_count):
                yield typ, ll

    def _get_sentbreak_count(self, tokens):
        """
        Returns the number of sentence breaks marked in a given set of
        augmented tokens.
        """
        return sum(1 for aug_tok in tokens if aug_tok.sentbreak)


######################################################################
#{ Punkt Sentence Tokenizer
######################################################################

class PunktSentenceTokenizer(_PunktBaseClass, TokenizerI):
    """
    A sentence tokenizer which uses an unsupervised algorithm to build
    a model for abbreviation words, collocations, and words that start
    sentences; and then uses that model to find sentence boundaries.
    This approach has been shown to work well for many European
    languages.
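
    Example (illustrative; ``train_text`` and ``document_text`` are
    assumed input strings)::

        tokenizer = PunktSentenceTokenizer(train_text)
        for sentence in tokenizer.tokenize(document_text):
            print sentence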
    """

    def __init__(self, train_text=None, verbose=False,
                 lang_vars=PunktLanguageVars(), token_cls=PunktToken):
        """
        train_text can either be the sole training text for this sentence
        boundary detector, or can be a PunktParameters object.
        """
        _PunktBaseClass.__init__(self, lang_vars=lang_vars,
                                 token_cls=token_cls)
        if train_text:
            self._params = self.train(train_text, verbose)

    def train(self, train_text, verbose=False):
        """
        Derives parameters from a given training text, or uses the parameters
        given. Repeated calls to this method destroy previous parameters. For
        incremental training, instantiate a separate PunktTrainer instance.
        """
        if type(train_text) not in (type(''), type(u'')):
            return train_text
        return PunktTrainer(train_text, lang_vars=self._lang_vars,
                            token_cls=self._Token).get_params()

    #////////////////////////////////////////////////////////////
    #{ Tokenization
    #////////////////////////////////////////////////////////////

    def tokenize(self, text, realign_boundaries=False):
        """
        Given a text, returns a list of the sentences in that text.
        """
        return list(self.sentences_from_text(text, realign_boundaries))

    def sentences_from_text(self, text, realign_boundaries=False):
        """
        Given a text, generates the sentences in that text by only
        testing candidate sentence breaks. If realign_boundaries is
        True, includes in the sentence closing punctuation that
        follows the period.
        """
        sents = self._sentences_from_text(text)
        if realign_boundaries:
            sents = self._realign_boundaries(sents)
        return sents

    def _sentences_from_text(self, text):
        last_break = 0
        for match in self._lang_vars.period_context_re().finditer(text):
            if self.text_contains_sentbreak(match.group() +
                                            match.group('after_tok')):
                yield text[last_break:match.end()]
                if match.group('next_tok'):
                    # next sentence starts after whitespace
                    last_break = match.start('next_tok')
                else:
                    # next sentence starts at the following punctuation
                    last_break = match.end()
        yield text[last_break:]

    def _realign_boundaries(self, sents):
        """
        Attempts to realign punctuation that falls after the period but
        should otherwise be included in the same sentence.

        For example: "(Sent1.) Sent2." will otherwise be split as::
        
            ["(Sent1.", ") Sent1."].
            
        This method will produce::
        
            ["(Sent1.)", "Sent2."].
        """
        realign = 0
        for s1, s2 in _pair_iter(sents):
            s1 = s1[realign:]
            if not s2:
                if s1:
                    yield s1
                continue

            m = self._lang_vars.re_boundary_realignment.match(s2)
            if m:
                yield s1 + m.group(0).strip()
                realign = m.end()
            else:
                realign = 0
                if s1:
                    yield s1

    def text_contains_sentbreak(self, text):
        """
        Returns True if the given text includes a sentence break.
        """
        found = False  # used to ignore the last token
        for t in self._annotate_tokens(self._tokenize_words(text)):
            if found:
                return True
            if t.sentbreak:
                found = True
        return False

    def sentences_from_text_legacy(self, text):
        """
        Given a text, generates the sentences in that text. Annotates all
        tokens, rather than just those with possible sentence breaks. Should
        produce the same results as L{sentences_from_text}.
        """
        tokens = self._annotate_tokens(self._tokenize_words(text))
        return self._build_sentence_list(text, tokens)

    def sentences_from_tokens(self, tokens):
        """
        Given a sequence of tokens, generates lists of tokens, each list
        corresponding to a sentence.
        """
        tokens = iter(self._annotate_tokens(self._Token(t) for t in tokens))
        sentence = []
        for aug_tok in tokens:
            sentence.append(aug_tok.tok)
            if aug_tok.sentbreak:
                yield sentence
                sentence = []
        if sentence:
            yield sentence

    def _annotate_tokens(self, tokens):
        """
        Given a set of tokens augmented with markers for line-start and
        paragraph-start, returns an iterator through those tokens with full
        annotation including predicted sentence breaks.
        """
        # Make a preliminary pass through the document, marking likely
        # sentence breaks, abbreviations, and ellipsis tokens.
        tokens = self._annotate_first_pass(tokens)

        # Make a second pass through the document, using token context
        # information to change our preliminary decisions about where
        # sentence breaks, abbreviations, and ellipsis occur.
        tokens = self._annotate_second_pass(tokens)

        return tokens

    def _build_sentence_list(self, text, tokens):
        """
        Given the original text and the list of augmented word tokens,
        construct and return a tokenized list of sentence strings.
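
        Most of the work is re-inserting the right whitespace between
        tokens; each token is matched back against the text with a
        whitespace-tolerant pattern, since the word tokenizer may have
        stripped spaces out of multi-part tokens.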
        """
        # Our position in the source text, used to keep track of which
        # whitespace to add:
        pos = 0

        # A regular expression that finds pieces of whitespace:
        WS_REGEXP = re.compile(r'\s*')

        sentence = ''
        for aug_tok in tokens:
            tok = aug_tok.tok

            # Find the whitespace before this token, and update pos.
            ws = WS_REGEXP.match(text, pos).group()
            pos += len(ws)

            # Some of the rules used by the punkt word tokenizer strip
            # whitespace out of words, requiring a loose match of the
            # token against the text.
            if text[pos:pos + len(tok)] != tok:
                pat = '\s*'.join(re.escape(c) for c in tok)
                m = re.match(pat, text[pos:])
                if m:
                    tok = m.group()

            # Move our position pointer to the end of the token.
            assert text[pos:pos + len(tok)] == tok
            pos += len(tok)

            # Add this token.  If it's not at the beginning of the
            # sentence, then include any whitespace that separated it
            # from the previous token.
            if sentence:
                sentence += ws
            sentence += tok

            # If we're at a sentence break, then start a new sentence.
            if aug_tok.sentbreak:
                yield sentence
                sentence = ''

        # If the last sentence is non-empty, yield it too.
        if sentence:
            yield sentence

    # [XX] TESTING
    def dump(self, tokens):
        print 'writing to /tmp/punkt.new...'
        out = open('/tmp/punkt.new', 'w')
        for aug_tok in tokens:
            if aug_tok.parastart:
                out.write('\n\n')
            elif aug_tok.linestart:
                out.write('\n')
            else:
                out.write(' ')
            out.write(str(aug_tok))
        out.close()

    #////////////////////////////////////////////////////////////
    #{ Customization Variables
    #////////////////////////////////////////////////////////////

    PUNCTUATION = tuple(';:,.!?')

    #////////////////////////////////////////////////////////////
    #{ Annotation Procedures
    #////////////////////////////////////////////////////////////

    def _annotate_second_pass(self, tokens):
        """
        Performs a token-based classification (section 4) over the given
        tokens, making use of the orthographic heuristic (4.1.1), collocation
        heuristic (4.1.2) and frequent sentence starter heuristic (4.1.3).
        """
        for t1, t2 in _pair_iter(tokens):
            self._second_pass_annotation(t1, t2)
            yield t1

    def _second_pass_annotation(self, aug_tok1, aug_tok2):
        """
        Performs token-based classification over a pair of contiguous tokens
        returning an updated augmented token for the first of them.
        """
        # Is it the last token?  We can't do anything then.
        if not aug_tok2:
            return

        tok = aug_tok1.tok
        if not aug_tok1.period_final:
            # We only care about words ending in periods.
            return

        typ = aug_tok1.type_no_period
        next_typ = aug_tok2.type_no_sentperiod
        tok_is_initial = aug_tok1.is_initial

        # [4.1.2. Collocation Heuristic] If there's a collocation
        # between the word before and after the period, then label tok
        # as an abbreviation and NOT a sentence break.  Note that
        # collocations with frequent sentence starters as their second
        # word are excluded in training.
        if (typ, next_typ) in self._params.collocations:
            aug_tok1.sentbreak = False
            aug_tok1.abbr = True
            return

        # [4.2. Token-Based Reclassification of Abbreviations] If the
        # token is an abbreviation or an ellipsis, then decide whether
        # we should *also* classify it as a sentence break.
        if (aug_tok1.abbr or aug_tok1.ellipsis) and not tok_is_initial:
            # [4.1.1. Orthographic Heuristic] Check if there's
            # orthographic evidence about whether the next word starts
            # a sentence or not.
            is_sent_starter = self._ortho_heuristic(aug_tok2)
            if is_sent_starter == True:
                aug_tok1.sentbreak = True
                return

            # [4.1.3. Frequent Sentence Starter Heuristic] If the next
            # word is capitalized, and is a member of the frequent
            # sentence starters list, then label tok as a sentence break.
            if (aug_tok2.first_upper and
                next_typ in self._params.sent_starters):
                aug_tok1.sentbreak = True
                return

        # [4.3. Token-Based Detection of Initials and Ordinals] Check
        # if any initial or ordinal tokens that are marked as sentence
        # breaks should be reclassified as abbreviations.
        if tok_is_initial or typ == '##number##':
            # [4.1.1. Orthographic Heuristic] Check if there's
            # orthographic evidence about whether the next word starts
            # a sentence or not.
            is_sent_starter = self._ortho_heuristic(aug_tok2)
            if is_sent_starter == False:
                aug_tok1.sentbreak = False
                aug_tok1.abbr = True
                return

            # Special heuristic for initials: if the orthographic
            # heuristic is unknown, and the next word is always
            # capitalized, then mark as abbrev (eg: J. Bach).
            if (is_sent_starter == 'unknown' and tok_is_initial and
                aug_tok2.first_upper and
                not (self._params.ortho_context[next_typ] & _ORTHO_LC)):
                aug_tok1.sentbreak = False
                aug_tok1.abbr = True
                return

    def _ortho_heuristic(self, aug_tok):
        """
        Decide whether the given token is the first token in a sentence.
        """
        # Sentences don't start with punctuation marks:
        if aug_tok.tok in self.PUNCTUATION:
            return False

        ortho_context = self._params.ortho_context[aug_tok.type_no_sentperiod]

        # If the word is capitalized, occurs at least once with a lower
        # case first letter, and never occurs with an upper case first
        # letter sentence-internally, then it's a sentence starter.
        if (aug_tok.first_upper and
            (ortho_context & _ORTHO_LC) and
            not (ortho_context & _ORTHO_MID_UC)):
            return True

        # If the word is lower case, and either (a) we've seen it used
        # with upper case, or (b) we've never seen it used
        # sentence-initially with lower case, then it's not a sentence
        # starter.
        if (aug_tok.first_lower and
            ((ortho_context & _ORTHO_UC) or
             not (ortho_context & _ORTHO_BEG_LC))):
            return False

        # Otherwise, we're not sure.
        return 'unknown'


######################################################################
#{ Demonstration
######################################################################

def main(text, tok_cls=PunktSentenceTokenizer, train_cls=PunktTrainer):
    """Builds a punkt model and applies it to the same text"""
    cleanup = lambda s: re.compile(r'(?:\r|^\s+)', re.MULTILINE) \
                          .sub('', s).replace('\n', ' ')

    trainer = train_cls()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)

    sbd = tok_cls(trainer.get_params())
    for l in sbd.sentences_from_text(text, realign_boundaries=True):
        print cleanup(l)

if __name__ == '__main__':
    import sys
    main(sys.stdin.read())
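
# Command-line sketch (main() above reads a corpus from stdin, trains
# on it, and prints the resulting segmentation):
#
#     python punkt.py < corpus.txt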