łň
3ŇÇIc           @   s4  d  d k  Z  d  d k Z d  d k l Z d  d k Z d  d k Td  d k l Z d   Z d e	 f d     YZ
 d   Z d	 d
 d d  Z e  i d  Z d" d
 d  Z d   Z d   Z e  i d e  i  Z e  i d  Z d   Z d d d d d d d d d g	 d
 d  Z d    Z e d! j o e   n d S(#   i˙˙˙˙N(   t   Tree(   t   *(   t   accuracyc         C   s_   g  } g  } xC | D]; } |  i  | i    } | t |  7} | t |  7} q Wt | |  S(   s  
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    @type chunker: C{ChunkParserI}
    @param chunker: The chunker being evaluated.
    @type gold: C{tree}
    @param gold: The chunk structures to score the chunker on.
    @rtype: C{float}
    (   t   parset   flattent   tree2conlltagst	   _accuracy(   t   chunkert   goldt	   gold_tagst	   test_tagst	   gold_treet	   test_tree(    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyR      s     t
   ChunkScorec           B   s   e  Z d  Z d   Z d   Z d   Z d   Z d   Z d d  Z d   Z	 d	   Z
 d
   Z d   Z d   Z d   Z d   Z RS(   sč
  
    A utility class for scoring chunk parsers.  C{ChunkScore} can
    evaluate a chunk parser's output, based on a number of statistics
    (precision, recall, f-measure, misssed chunks, incorrect chunks).
    It can also combine the scores from the parsing of multiple texts;
    this makes it signifigantly easier to evaluate a chunk parser that
    operates one sentence at a time.

    Texts are evaluated with the C{score} method.  The results of
    evaluation can be accessed via a number of accessor methods, such
    as C{precision} and C{f_measure}.  A typical use of the
    C{ChunkScore} class is::

        >>> chunkscore = ChunkScore()
        >>> for correct in correct_sentences:
        ...     guess = chunkparser.parse(correct.leaves())
        ...     chunkscore.score(correct, guess)
        >>> print 'F Measure:', chunkscore.f_measure()
        F Measure: 0.823

    @ivar kwargs: Keyword arguments:

        - max_tp_examples: The maximum number actual examples of true
          positives to record.  This affects the C{correct} member
          function: C{correct} will not return more than this number
          of true positive examples.  This does *not* affect any of
          the numerical metrics (precision, recall, or f-measure)

        - max_fp_examples: The maximum number actual examples of false
          positives to record.  This affects the C{incorrect} member
          function and the C{guessed} member function: C{incorrect}
          will not return more than this number of examples, and
          C{guessed} will not return more than this number of true
          positive examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure)
        
        - max_fn_examples: The maximum number actual examples of false
          negatives to record.  This affects the C{missed} member
          function and the C{correct} member function: C{missed}
          will not return more than this number of examples, and
          C{correct} will not return more than this number of true
          negative examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure)

        - chunk_node: A regular expression indicating which chunks
          should be compared.  Defaults to C{'.*'} (i.e., all chunks).
        
    @type _tp: C{list} of C{Token}
    @ivar _tp: List of true positives
    @type _fp: C{list} of C{Token}
    @ivar _fp: List of false positives
    @type _fn: C{list} of C{Token}
    @ivar _fn: List of false negatives
    
    @type _tp_num: C{int}
    @ivar _tp_num: Number of true positives
    @type _fp_num: C{int}
    @ivar _fp_num: Number of false positives
    @type _fn_num: C{int}
    @ivar _fn_num: Number of false negatives.
    c         K   sÁ   t    |  _ t    |  _ t    |  _ t    |  _ t    |  _ | i d d  |  _ | i d d  |  _ | i d d  |  _	 | i d d  |  _
 d |  _ d |  _ d |  _ d |  _ t |  _ d  S(   Nt   max_tp_examplesid   t   max_fp_examplest   max_fn_examplest
   chunk_nodes   .*i    (   t   sett   _correctt   _guessedt   _tpt   _fpt   _fnt   gett   _max_tpt   _max_fpt   _max_fnt   _chunk_nodet   _tp_numt   _fp_numt   _fn_numt   _countt   Falset   _measuresNeedUpdate(   t   selft   kwargs(    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyt   __init__r   s    				c         C   s   |  i  o| |  i |  i @|  _ |  i |  i |  _ |  i |  i |  _ t |  i  |  _ t |  i  |  _ t |  i  |  _	 t
 |  _  n d  S(   N(   R"   R   R   R   R   R   t   lenR   R   R   R!   (   R#   (    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyt   _updateMeasures   s    
c         C   s^   |  i  t | |  i |  i  O_  |  i t | |  i |  i  O_ |  i d 7_ t |  _ d S(   s]  
        Given a correctly chunked sentence, score another chunked
        version of the same sentence.
        
        @type correct: chunk structure
        @param correct: The known-correct ("gold standard") chunked
            sentence.
        @type guessed: chunk structure
        @param guessed: The chunked sentence to be scored.
        i   N(   R   t
   _chunksetsR    R   R   t   TrueR"   (   R#   t   correctt   guessed(    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyt   score   s    !!c         C   sD   |  i    |  i |  i } | d j o d Sn t |  i  | Sd S(   s   
        @return: the overall precision for all texts that have been
            scored by this C{ChunkScore}.
        @rtype: C{float}
        i    N(   R'   R   R   t   float(   R#   t   div(    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyt	   precision   s    
c         C   sD   |  i    |  i |  i } | d j o d Sn t |  i  | Sd S(   s   
        @return: the overall recall for all texts that have been
            scored by this C{ChunkScore}.
        @rtype: C{float}
        i    N(   R'   R   R   R-   (   R#   R.   (    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyt   recall¨   s    
g      ŕ?c         C   s\   |  i    |  i   } |  i   } | d j p | d j o d Sn d | | d | | S(   sÍ  
        @return: the overall F measure for all texts that have been
            scored by this C{ChunkScore}.
        @rtype: C{float}
        
        @param alpha: the relative weighting of precision and recall.
            Larger alpha biases the score towards the precision value,
            while smaller alpha biases the score towards the recall
            value.  C{alpha} should have a value in the range [0,1].
        @type alpha: C{float}
        i    i   (   R'   R/   R0   (   R#   t   alphat   pt   r(    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyt	   f_measureł   s    
c         C   s<   |  i    t |  i  } g  } | D] } | | d q$ ~ S(   sÔ   
        @rtype: C{list} of chunks
        @return: the chunks which were included in the
            correct chunk structures, but not in the guessed chunk
            structures, listed in input order.
        i   (   R'   t   listR   (   R#   t   chunkst   _[1]t   c(    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyt   missedĆ   s    
c         C   s<   |  i    t |  i  } g  } | D] } | | d q$ ~ S(   sÔ   
        @rtype: C{list} of chunks
        @return: the chunks which were included in the
            guessed chunk structures, but not in the correct chunk
            structures, listed in input order.
        i   (   R'   R5   R   (   R#   R6   R7   R8   (    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyt	   incorrectŃ   s    
c         C   s2   t  |  i  } g  } | D] } | | d q ~ S(   s   
        @rtype: C{list} of chunks
        @return: the chunks which were included in the correct
            chunk structures, listed in input order.
        i   (   R5   R   (   R#   R6   R7   R8   (    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyR*   Ü   s    c         C   s2   t  |  i  } g  } | D] } | | d q ~ S(   s   
        @rtype: C{list} of chunks
        @return: the chunks which were included in the guessed
            chunk structures, listed in input order.
        i   (   R5   R   (   R#   R6   R7   R8   (    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyR+   ĺ   s    c         C   s   |  i    |  i |  i S(   N(   R'   R   R   (   R#   (    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyt   __len__î   s    
c         C   s   d t  |   d S(   sf   
        @rtype: C{String}
        @return: a concise representation of this C{ChunkScoring}.
        s   <ChunkScoring of s    chunks>(   R&   (   R#   (    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyt   __repr__ň   s    c         C   s:   d d |  i    d d |  i   d d |  i   d S(   sJ  
        @rtype: C{String}
        @return: a verbose representation of this C{ChunkScoring}.
            This representation includes the precision, recall, and
            f-measure scores.  For other information about the score,
            use the accessor methods (e.g., C{missed()} and
            C{incorrect()}). 
        s   ChunkParse score:
s       Precision: %5.1f%%
id   s       Recall:    %5.1f%%
s       F-Measure: %5.1f%%(   R/   R0   R4   (   R#   (    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyt   __str__ů   s    '(   t   __name__t
   __module__t   __doc__R%   R'   R,   R/   R0   R4   R9   R:   R*   R+   R;   R<   R=   (    (    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyR   4   s   =		
											c         C   s   d } g  } x |  D]y } t  | t  oY t i | | i  o) | i | | f t | i    f  n | t | i	    7} q | d 7} q Wt
 |  S(   Ni    i   (   t
   isinstanceR    t   ret   matcht   nodet   appendt   tuplet   freezeR&   t   leavesR   (   t   tt   countR   t   posR6   t   child(    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyR(   	  s     )t   NPt   St   /c   	      C   s|  t  i d  } t | g   g } x#| i |   D]} | i   } | d d j o^ t |  d j o t d | i     n t | g   } | d i |  | i |  q1 | d d j o; t |  d j o t d	 | i     n | i	   q1 | t
 j o | d i |  q1 | d i t i i i | |   q1 Wt |  d j o t d
 t |     n | d S(   s  
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a C{Tree}.
    Chunks are marked by square brackets (C{[...]}).  Words are
    delimited by whitespace, and each word should have the form
    C{I{text}/I{tag}}.  Words that do not contain a slash are
    assigned a C{tag} of C{None}.

    @return: A tree corresponding to the string representation.
    @rtype: C{tree}
    @param s: The string to be converted
    @type s: C{string}
    @param chunk_node: The label to use for chunk nodes
    @type chunk_node: C{string}
    @param top_node: The label to use for the root of the tree
    @type top_node: C{string}
    s   \[|\]|[^\[\]\s]+i    t   [i   s   Unexpected [ at char %di˙˙˙˙t   ]i   s   Unexpected ] at char %ds   Expected ] at char %d(   RB   t   compileR    t   finditert   groupR&   t
   ValueErrort   startRE   t   popt   Nonet   nltkt   tagt   utilt	   str2tuple(	   t   sR   t   top_nodet   sept   WORD_OR_BRACKETt   stackRC   t   textt   chunk(    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyt   tagstr2tree  s*     's   (\S+)\s+(\S+)\s+([IOB])-?(\S+)?t   PPt   VPc         C   sr  t  | g   g } xUt |  i d   D]>\ } } | i   p q( n t i |  } | t j o t d |  n | i   \ } } }	 }
 | t j	 o |
 | j o
 d }	 n |	 d j o |
 | d i	 j } |	 d j p | o% t
 |  d j o | i   q
n |	 d j p | o1 t  |
 g   } | d i |  | i |  n | d i | | f  q( W| d	 S(
   s)  
    Convert a CoNLL IOB string into a tree.  Uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    @param s: The CoNLL string to be converted.
    @type s: C{string}
    @param chunk_types: The chunk types to be converted.
    @type chunk_types: C{tuple}
    @param top_node: The node label to use for the root.
    @type top_node: C{string}
    @return: A chunk structure for a single sentence
        encoded in the given CONLL 2000 style string.
    @rtype: L{Tree}
    s   
s   Error on line %dt   Ot   Ii˙˙˙˙t   BOi   t   Bi    (   R    t	   enumeratet   splitt   stript   _LINE_RERC   RX   RU   t   groupsRD   R&   RW   RE   (   R]   t   chunk_typesR^   Ra   t   linenot   lineRC   t   wordRZ   t   statet
   chunk_typet
   mismatch_IRc   (    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyt   conllstr2treeE  s(     
 %c      
   C   s¸   g  } xŤ |  D]Ł } yi | i  } d } xS | D]K } t | t  o t d  n | i | d | d | | f  d } q, WWq t j
 o% | i | d | d d f  q Xq W| S(   sÔ   
    Convert a tree to the CoNLL IOB tag format

    @param t: The tree to be converted.
    @type t: C{Tree}
    @return: A list of 3-tuples containing word, tag and IOB tag.
    @rtype: C{list} of C{tuple}
    s   B-s7   Tree is too deeply nested to be printed in CoNLL formati    i   s   I-Rg   (   RD   RA   R    RU   RE   t   AttributeError(   RI   t   tagsRL   t   categoryt   prefixt   contents(    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyR   w  s    
 	 "'c         C   s=   g  } t  |   D] } | t i |  q ~ } d i |  S(   sÝ   
    Convert a tree to the CoNLL IOB string format

    @param t: The tree to be converted.
    @type t: C{Tree}
    @return: A multiline string where each line contains a word, tag and IOB tag.
    @rtype: C{string}
    s   
(   R   t   stringt   join(   RI   R7   t   tokent   lines(    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyt   tree2conllstr  s    	0s   <DOC>\s*(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?<BODY>\s*(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?<TEXT>(?P<text>.*?)</TEXT>\s*</BODY>\s*</DOC>\s*s#   <b_\w+\s+[^>]*?type="(?P<type>\w+)"c         C   sS  t  | g   g } |  d  j o g  Sn xţ t i d |   D]ę } | i   } yŚ | i d  oc t i |  } | d  j o d G| GHn t  | i d  g   } | d i |  | i |  n0 | i d  o | i	   n | d i |  Wq: t
 t f j
 o t d | i     q: Xq: Wt |  d j o t d	   n | d
 S(   Ns   <[^>]+>|[^\s<]+s   <b_t   XXXXt   typei˙˙˙˙s   <e_s'   Bad IEER string (error at character %d)i   s   Bad IEER stringi    (   R    RX   RB   RS   RT   t
   startswitht   _IEER_TYPE_RERC   RE   RW   t
   IndexErrorRU   RV   R&   (   R]   R^   Ra   t   piece_mt   piecet   mRc   (    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyt   _ieer_read_text¨  s,     t   LOCATIONt   ORGANIZATIONt   PERSONt   DURATIONt   DATEt   CARDINALt   PERCENTt   MONEYt   MEASUREc         C   s   t  i |   } | ot h  t | i d  |  d <| i d  d <| i d  d <| i d  d <t | i d  |  d <Sn t |  |  Sd S(   sy  
    Convert a string of chunked tagged text in the IEER named
    entity format into a chunk structure.  Chunks are of several
    types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
    PERCENT, MONEY, and MEASURE.

    @return: A chunk structure containing the chunked tagged text that is
        encoded in the given IEER style string.
    @rtype: L{Tree}
    Rb   t   docnot   doctypet	   date_timet   headlineN(   t   _IEER_DOC_RERC   R   RT   (   R]   Rp   R^   R   (    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyt   ieerstr2treeĹ  s    c          C   sr   d }  d d k  l } | i |  d d } | i   GHHd }  t |  d d
 } | i   GHd	 GH| i |  GHHd  S(   Nsd   [ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./.i˙˙˙˙(   Rc   R   RM   sv  
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
Rp   Re   s   CoNLL output:(   s   NPs   PP(   RY   Rc   Rd   t   pprintRw   R   (   R]   Rc   RI   t
   conll_tree(    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pyt   demoă  s    t   __main__(   s   NPs   PPs   VP(   RB   R}   RY   R    t   nltk.tag.utilt   apit   nltk.metricsR   R   t   objectR   R(   Rd   RR   Rn   Rw   R   R   t   DOTALLR   R   R   R   R   R>   (    (    (    s%   /p/zhu/06/nlp/nltk/nltk/chunk/util.pys   <module>	   s,   
	Ő	.2					/