
4Ic           @   s  d  Z  d d k Z d d k Z d d k Z d d k Z d d k l Z d d k l Z d d k	 Td e
 e i f d     YZ d e i f d	     YZ d
 e f d     YZ d e f d     YZ d e f d     YZ d e f d     YZ d e f d     YZ d e f d     YZ d e f d     YZ d e f d     YZ d d  Z d d d d d  e d! d d"  Z e d# j o e   n d S($   s-   
Brill's transformational rule-based tagger.
iN(   t   defaultdict(   t   untag(   t   *t   BrillTaggerc           B   s/   e  Z d  Z d Z d   Z d   Z d   Z RS(   sq  
    Brill's transformational rule-based tagger.  Brill taggers use an
    X{initial tagger} (such as L{tag.DefaultTagger}) to assign an intial
    tag sequence to a text; and then apply an ordered list of
    transformational rules to correct the tags of individual tokens.
    These transformation rules are specified by the L{BrillRule}
    interface.

    Brill taggers can be created directly, from an initial tagger and
    a list of transformational rules; but more often, Brill taggers
    are created by learning rules from a training corpus, using either
    L{BrillTaggerTrainer} or L{FastBrillTaggerTrainer}.
    s   !nltk.BrillTaggerc         C   s   | |  _  t |  |  _ d S(   s  
        @param initial_tagger: The initial tagger
        @type initial_tagger: L{TaggerI}
        @param rules: An ordered list of transformation rules that
            should be used to correct the initial tagging.
        @type rules: C{list} of L{BrillRule}
        N(   t   _initial_taggert   tuplet   _rules(   t   selft   initial_taggert   rules(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyt   __init__,   s    	c         C   s   |  i  S(   N(   R   (   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR	   7   s    c   
      C   s   |  i  i |  } t t  } x1 t |  D]# \ } \ } } | | i |  q+ Wxq |  i D]f } | i | i g   } | i	 | |  }	 x6 |	 D]. } | | i i
 |  | | i i |  q Wq\ W| S(   N(   R   t   tagR    t   sett	   enumeratet   addR   t   gett   original_tagt   applyt   removet   replacement_tag(
   R   t   tokenst   tagged_tokenst   tag_to_positionst   it   tokenR   t   rulet	   positionst   changed(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR   :   s     
  (   t   __name__t
   __module__t   __doc__t   yaml_tagR
   R	   R   (    (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR      s
   		t	   BrillRulec           B   sG   e  Z d  Z d   Z e d  Z d   Z d   Z d   Z d   Z	 RS(   s*  
    An interface for tag transformations on a tagged corpus, as
    performed by brill taggers.  Each transformation finds all tokens
    in the corpus that are tagged with a specific X{original tag} and
    satisfy a specific X{condition}, and replaces their tags with a
    X{replacement tag}.  For any given transformation, the original
    tag, replacement tag, and condition are fixed.  Conditions may
    depend on the token under consideration, as well as any other
    tokens in the corpus.

    Brill rules must be comparable and hashable.
    c         C   s0   |  i  t j p
 t d  | |  _ | |  _ d  S(   Ns#   BrillRule is an abstract base class(   t	   __class__R    t   AssertionErrorR   R   (   R   R   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR
   f   s
    		c         C   s   | t  j o t t |   } n g  } | D]$ } |  i | |  o | | q. q. ~ } x) | D]! } | | d |  i f | | <qb W| S(   s  
        Apply this rule at every position in C{positions} where it
        applies to the given sentence.  I.e., for each position M{p}
        in C{positions}, if C{tokens[M{p}]} is tagged with this rule's
        original tag, and satisfies this rule's condition, then set
        its tag to be this rule's replacement tag.

        @param tokens: The tagged sentence
        @type tokens: list of Token
        @type positions: C{list} of C{int}
        @param positions: The positions where the transformation is to
            be tried.  If not specified, try it at all positions.
        @return: The indices of tokens whose tags were changed by this
            rule.
        @rtype: C{int}
        i    (   t   Nonet   ranget   lent   appliesR   (   R   R   R   t   _[1]R   t   change(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR   p   s    8 c         C   s   t  p
 t d  d S(   s  
        @return: True if the rule would change the tag of 
            C{tokens[index]}, False otherwise
        @rtype: Boolean

        @param tokens: A tagged sentence
        @type tokens: list of Token
        @param index: The index to check
        @type index: int
        s!   Brill rules must define applies()N(   t   FalseR"   (   R   R   t   index(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR&      s    c         C   s   t  p
 t d  d  S(   Ns   Brill rules must be comparable(   R)   R"   (   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyt   __eq__   s    c         C   s   t  p
 t d  d  S(   Ns   Brill rules must be comparable(   R)   R"   (   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyt   __ne__   s    c         C   s   t  p
 t d  d  S(   Ns   Brill rules must be hashable(   R)   R"   (   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyt   __hash__   s    (
   R   R   R   R
   R#   R   R&   R+   R,   R-   (    (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR    Y   s   	
			t   ProximateTokensRulec           B   s   e  Z d  Z d   Z e d    Z e d    Z e d    Z d   Z	 d   Z
 d   Z d   Z d	   Z d
   Z d   Z d   Z RS(   sO  
    An abstract base class for brill rules whose condition checks for
    the presence of tokens with given properties at given ranges of
    positions, relative to the token.

    Each subclass of proximate tokens brill rule defines a method
    M{extract_property}, which extracts a specific property from the
    the token, such as its text or tag.  Each instance is
    parameterized by a set of tuples, specifying ranges of positions
    and property values to check for in those ranges:
    
      - (M{start}, M{end}, M{value})

    The brill rule is then applicable to the M{n}th token iff:
    
      - The M{n}th token is tagged with the rule's original tag; and
      - For each (M{start}, M{end}, M{value}) triple:
        - The property value of at least one token between
          M{n+start} and M{n+end} (inclusive) is M{value}.

    For example, a proximate token brill template with M{start=end=-1}
    generates rules that check just the property of the preceding
    token.  Note that multiple properties may be included in a single
    rule; the rule applies if they all hold.
    c         G   s   |  i  t j p
 t d  t i |  | |  | |  _ xD | D]< \ } } } | | j o  t d | | | f f   q= q= Wd S(   s  
        Construct a new brill rule that changes a token's tag from
        C{original_tag} to C{replacement_tag} if all of the properties
        specified in C{conditions} hold.

        @type conditions: C{tuple} of C{(int, int, *)}
        @param conditions: A list of 3-tuples C{(start, end, value)},
            each of which specifies that the property of at least one
            token between M{n}+C{start} and M{n}+C{end} (inclusive) is
            C{value}.
        @raise ValueError: If C{start}>C{end} for any condition.
        s-   ProximateTokensRule is an abstract base classs!   Condition %s has an invalid rangeN(   R!   R.   R"   R    R
   t   _conditionst
   ValueError(   R   R   R   t
   conditionst   st   et   v(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR
      s    	 c         C   sS   | i  |  i t d t |  d t d   | i D  d | i d | i   } | S(   Nt   descriptionR1   c         s   s   x |  ] } t  |  Vq Wd  S(   N(   t   list(   t   .0t   x(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pys	   <genexpr>   s    t   originalt   replacement(   t   represent_mappingR   t   dictt   strR6   R/   R   R   (   t   clst   dumpert   datat   node(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyt   to_yaml   s    	c         C   s;   | i  | d t } |  | d | d d   | d D  S(   Nt   deepR9   R:   c         s   s   x |  ] } t  |  Vq Wd  S(   N(   R   (   R7   R8   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pys	   <genexpr>   s    R1   (   t   construct_mappingt   True(   R>   t   loaderRA   t   map(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyt	   from_yaml   s    c         C   s   t  p
 t d  d S(   s  
        Returns some property characterizing this token, such as its
        base lexical item or its tag.

        Each implentation of this method should correspond to an
        implementation of the method with the same name in a subclass
        of L{ProximateTokensTemplate}.

        @param token: The token
        @type token: Token
        @return: The property
        @rtype: any
        s2   ProximateTokenRules must define extract_property()N(   R)   R"   (   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyt   extract_property   s    c   	      C   s   | | d |  i  j o t Sn x |  i D]| \ } } } t d | |  } t | | d t |   } x: t | |  D]% } |  i | |  | j o Pqy qy Wt Sq* Wt S(   Ni   i    (	   R   R)   R/   t   maxt   minR%   R$   RI   RE   (	   R   R   R*   t   startt   endt   valR2   R3   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR&      s    
  	c         C   sc   |  | j pV | d  j	 oI | i |  i j o6 |  i | i j o# |  i | i j o |  i | i j S(   N(   R#   R!   R   R   R/   (   R   t   other(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR+     s    c         C   s   |  | j S(   N(    (   R   RO   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR,     s    c         C   sJ   y |  i  SWn8 t |  i |  i |  i |  i i f  |  _  |  i  Sn Xd  S(   N(   t   _ProximateTokensRule__hasht   hashR   R   R/   R!   R   (   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR-     s    c         C   s   y |  i  SWnv d i g  } |  i D]# \ } } } | d | | | f q% ~  } d |  i i |  i |  i | f |  _  |  i  Sn Xd  S(   Ns    and s   %s in %d...%ds   <%s: %s->%s if %s>(   t   _ProximateTokensRule__reprt   joinR/   R!   R   R   R   (   R   R'   R2   R3   R4   R1   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyt   __repr__"  s    6c         C   sx   d |  i  |  i f } t |  i  d j o
 d } n; d d i g  } |  i D] } | |  i |  qM ~  } | | S(   Ns   %s -> %si    t    s    if s   , and (   R   R   R%   R/   RS   t   _condition_to_str(   R   R:   R1   R'   t   c(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyt   __str__0  s    	
*c         C   s/   | \ } } } d |  i  |  i | |  | f S(   sz   
        Return a string representation of the given condition.
        This helper method is used by L{__str__}.
        s   the %s of %s is %r(   t   PROPERTY_NAMEt   _range_to_str(   R   t	   conditionRL   RM   t   value(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyRV   :  s    c         C   s  | | j o
 d j n o d Sn | | j o
 d j n o d Sn | | j o
 d j n o d Sn | | j o | d j  o d | Snk | | j o | d j o d | SnE | d j o d	 | } n | d j o d	 | } n d
 | | f Sd S(   sx   
        Return a string representation for the given range.  This
        helper method is used by L{__str__}.
        i    s	   this wordis   the preceding wordi   s   the following words	   word i-%ds	   word i+%ds   +%ds   words i%s...i%sN(    (   R   RL   RM   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyRZ   C  s    (   R   R   R   R
   t   classmethodRB   RH   t   staticmethodRI   R&   R+   R,   R-   RT   RX   RV   RZ   (    (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR.      s   								
		t   ProximateTagsRulec           B   s)   e  Z d  Z d Z d Z e d    Z RS(   s   
    A rule which examines the tags of nearby tokens.
    @see: superclass L{ProximateTokensRule} for details.
    @see: L{SymmetricProximateTokensTemplate}, which generates these rules.
    R   s   !ProximateTagsRulec         C   s   |  d S(   s   @return: The given token's tag.i   (    (   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyRI   _  s    (   R   R   R   RY   R   R^   RI   (    (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR_   W  s   t   ProximateWordsRulec           B   s)   e  Z d  Z d Z d Z e d    Z RS(   s   
    A rule which examines the base types of nearby tokens.
    @see: L{ProximateTokensRule} for details.
    @see: L{SymmetricProximateTokensTemplate}, which generates these rules.
    t   texts   !ProximateWordsRulec         C   s   |  d S(   s    @return: The given token's text.i    (    (   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyRI   l  s    (   R   R   R   RY   R   R^   RI   (    (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR`   d  s   t   BrillTemplateIc           B   s)   e  Z d  Z d   Z d   Z d   Z RS(   s   
    An interface for generating lists of transformational rules that
    apply at given sentence positions.  C{BrillTemplateI} is used by
    C{Brill} training algorithms to generate candidate rules.
    c         C   s   t  d  d  S(   Ns'   BrillTemplateI is an abstract interface(   R"   (   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR
   {  s    c         C   s   t  d  d S(   s/  
        Return a list of the transformational rules that would correct
        the C{i}th subtoken's tag in the given token.  In particular,
        return a list of zero or more rules that would change
        C{tagged_tokens[i][1]} to C{correctTag}, if applied
        to C{token}.

        If the C{i}th subtoken already has the correct tag (i.e., if
        C{tagged_tokens[i][1]} == C{correctTag}), then
        C{applicable_rules} should return the empty list.
        
        @param tokens: The tagged tokens being tagged.
        @type tokens: C{list} of C{tuple}
        @param i: The index of the token whose tag should be corrected.
        @type i: C{int}
        @param correctTag: The correct tag for the C{i}th token.
        @type correctTag: (any)
        @rtype: C{list} of L{BrillRule}
        s'   BrillTemplateI is an abstract interfaceN(   R"   (   R   R   R   t
   correctTag(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyt   applicable_rules~  s    c         C   s   t  d  d S(   s  
        Returns the set of indices C{i} such that
        C{applicable_rules(token, i, ...)} depends on the value of
        the C{index}th subtoken of C{token}.

        This method is used by the "fast" Brill tagger trainer.

        @param token: The tokens being tagged.
        @type token: C{list} of C{tuple}
        @param index: The index whose neighborhood should be returned.
        @type index: C{int}
        @rtype: C{Set}
        s'   BrillTemplateI is an abstract interfaceN(   R"   (   R   R   R*   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyt   get_neighborhood  s    (   R   R   R   R
   Rd   Re   (    (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyRb   u  s   		t   ProximateTokensTemplatec           B   s2   e  Z d  Z d   Z d   Z d   Z d   Z RS(   s  
    An brill templates that generates a list of
    L{ProximateTokensRule}s that apply at a given sentence
    position.  In particular, each C{ProximateTokensTemplate} is
    parameterized by a proximate token brill rule class and a list of
    boundaries, and generates all rules that:
    
      - use the given brill rule class
      - use the given list of boundaries as the C{start} and C{end}
        points for their conditions
      - are applicable to the given token.
    c         G   sW   | |  _  | |  _ x> | D]6 \ } } | | j o t d | | f f   q q Wd S(   sR  
        Construct a template for generating proximate token brill
        rules.

        @type rule_class: C{class}
        @param rule_class: The proximate token brill rule class that
        should be used to generate new rules.  This class must be a
        subclass of L{ProximateTokensRule}.
        @type boundaries: C{tuple} of C{(int, int)}
        @param boundaries: A list of tuples C{(start, end)}, each of
            which specifies a range for which a condition should be
            created by each rule.
        @raise ValueError: If C{start}>C{end} for any boundary.
        s    Boundary %s has an invalid rangeN(   t   _rule_classt   _boundariesR0   (   R   t
   rule_classt
   boundariesR2   R3   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR
     s    		 c         C   s   | | d | j o g  Sn g  } |  i  D]% \ } } | |  i | | | |  q+ ~ } g  g } xC | D]; }	 g  }
 | D]! } |	 D] } |
 | | g q qz ~
 } qi Wg  } | D]$ } | |  i | | d | |  q ~ S(   Ni   (   Rh   t   _applicable_conditionsRg   (   R   R   R*   t   correct_tagR'   RL   RM   t   applicable_conditionst   condition_combosR1   t   _[2]t   old_conditionst   new_conditiont   _[3]t   conds(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyRd     s    5	 
(c   
      C   s   g  } t  d | |  } t | | d t |   } xC t | |  D]2 } |  i i | |  }	 | i | | |	 f  qF W| S(   so  
        @return: A set of all conditions for proximate token rules
        that are applicable to C{tokens[index]}, given boundaries of
        C{(start, end)}.  I.e., return a list of all tuples C{(start,
        end, M{value})}, such the property value of at least one token
        between M{index+start} and M{index+end} (inclusive) is
        M{value}.
        i    i   (   RJ   RK   R%   R$   Rg   RI   t   append(
   R   R   R*   RL   RM   R1   R2   R3   R   R\   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyRk     s    	 c   	      C   s   t  | g  } xp |  i D]e \ } } t d | |  } t | | d t |   } x$ t | |  D] } | i |  qg Wq W| S(   Ni    i   (   R   Rh   RJ   RK   R%   R$   R   (	   R   R   R*   t   neighborhoodRL   RM   R2   R3   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyRe     s    
  (   R   R   R   R
   Rd   Rk   Re   (    (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyRf     s
   			t    SymmetricProximateTokensTemplatec           B   s)   e  Z d  Z d   Z d   Z d   Z RS(   s]  
    Simulates two L{ProximateTokensTemplate}s which are symmetric
    across the location of the token.  For rules of the form "If the
    M{n}th token is tagged C{A}, and any tag preceding B{or} following
    the M{n}th token by a distance between M{x} and M{y} is C{B}, and
    ... , then change the tag of the nth token from C{A} to C{C}."

    One C{ProximateTokensTemplate} is formed by passing in the
    same arguments given to this class's constructor: tuples
    representing intervals in which a tag may be found.  The other
    C{ProximateTokensTemplate} is constructed with the negative
    of all the arguments in reversed order.  For example, a
    C{SymmetricProximateTokensTemplate} using the pair (-2,-1) and the
    constructor C{SymmetricProximateTokensTemplate} generates the same rules as a
    C{SymmetricProximateTokensTemplate} using (-2,-1) plus a second
    C{SymmetricProximateTokensTemplate} using (1,2).

    This is useful because we typically don't want templates to
    specify only "following" or only "preceding"; we'd like our
    rules to be able to look in either direction.
    c         G   sW   t  | |  |  _ g  } | D] \ } } | | | f q ~ } t  | |  |  _ d S(   sZ  
        Construct a template for generating proximate token brill
        rules.
        
        @type rule_class: C{class}
        @param rule_class: The proximate token brill rule class that
        should be used to generate new rules.  This class must be a
        subclass of L{ProximateTokensRule}.
        @type boundaries: C{tuple} of C{(int, int)}
        @param boundaries: A list of tuples C{(start, end)}, each of
            which specifies a range for which a condition should be
            created by each rule.
        @raise ValueError: If C{start}>C{end} for any boundary.
        N(   Rf   t   _ptt1t   _ptt2(   R   Ri   Rj   R'   R2   R3   t   reversed(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR
     s    /c         C   s,   |  i  i | | |  |  i i | | |  S(   sm   
        See L{BrillTemplateI} for full specifications.

        @rtype: list of ProximateTokensRule
        (   Rw   Rd   Rx   (   R   R   R*   Rc   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyRd   )  s    c         C   s7   |  i  i | |  } |  i i | |  } | i |  S(   N(   Rw   Re   Rx   t   union(   R   R   R*   t   n1t   n2(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyRe   2  s    (   R   R   R   R
   Rd   Re   (    (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyRv     s   			t   BrillTaggerTrainerc           B   sY   e  Z d  Z d e d  Z d d d  Z d   Z d   Z d   Z d	   Z	 d
   Z
 RS(   s&   
    A trainer for brill taggers.
    i    c         C   sE   | d j o | d j } n | |  _ | |  _ | |  _ | |  _ d S(   s  
        @param deterministic: If true, then choose between rules that
            have the same score by picking the one whose __repr__
            is lexicographically smaller.  If false, then just pick the
            first rule we find with a given score -- this will depend
            on the order in which keys are returned from dictionaries,
            and so may not be the same from one run to the next.  If
            not specified, treat as true iff trace > 0.
        i    N(   R#   R   t
   _templatest   _tracet   _deterministic(   R   R   t	   templatest   tracet   deterministic(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR
   @  s
    			i   i   c         C   s  |  i  d j o d t |  GHn g  } | D] } | |  i i t |   q. ~ } |  i  d j o |  i   n g  } y x t |  | j  o |  i | |  \ } }	 }
 | t j p |	 | j  o |  i  d j o	 d GHn Pq | i |  d } x' | D] } | t | i	 |   7} q W|  i  d j o |  i
 | |	 |
 |  q q WWn" t j
 o d t |  GHn Xt |  i |  S(   su  
        Trains the Brill tagger on the corpus C{train_token},
        producing at most C{max_rules} transformations, each of which
        reduces the net number of errors in the corpus by at least
        C{min_score}.
        
        @type train_sents: C{list} of C{list} of L{tuple}
        @param train_sents: The corpus of tagged tokens
        @type max_rules: C{int}
        @param max_rules: The maximum number of transformations to be created
        @type min_score: C{int}
        @param min_score: The minimum acceptable net error reduction
            that each transformation must produce in the corpus.
        i    s(   Training Brill tagger on %d sentences...i   i   s"   Insufficient improvement; stoppings+   Training stopped manually -- %d rules found(   R   R%   R   R   R   t   _trace_headert
   _best_ruleR#   Rt   R   t   _trace_rulet   KeyboardInterruptR   (   R   t   train_sentst	   max_rulest	   min_scoreR'   t   sentt
   test_sentsR	   R   t   scoret   fixscoret   k(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyt   trainU  s0    ,		 #c         C   s  t  t  } xy t |  D]k \ } } x\ t |  D]N \ } } | d | | | d j o% | d } | | i | | f  q2 q2 Wq W|  i | |  }	 d \ }
 } } x&|	 D]\ } } | | j p | | j o |  i o |
 | | f Sn | } | i | j oq xn | | i D][ \ } } | i | | |  o8 | d 8} | | j  p | | j o |  i o PqqqqWn | | j p0 | | j o; |  i o1 t	 |  t	 |
  j  o | | | }
 } } q q W|
 | | f S(   Ni   i    (   Ni    i    (
   R    R6   R   Rt   t   _find_rulesR#   R   R   R&   t   repr(   R   R   R   t   correct_indicest   sentnumR   t   wordnumt   tagged_wordR   R	   t	   best_rulet
   best_scoret   best_fixscoreR   R   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR     s:      
#  

c         C   s   g  } xk t  |  D]] \ } } xN t  |  D]@ \ } } | d | | | d j o | i | | f  q, q, Wq Wt t  } xX | D]P \ } } | | }	 | | }
 x- |  i |	 |
 |  D] } | | c d 7<q Wq Wt | i   d d   S(   s  
        Find all rules that correct at least one token's tag in
        C{test_sents}.

        @return: A list of tuples C{(rule, fixscore)}, where C{rule}
            is a brill rule and C{fixscore} is the number of tokens
            whose tag the rule corrects.  Note that C{fixscore} does
            I{not} include the number of tokens whose tags are changed
            to incorrect values.        
        i   t   keyc         S   s   |  \ } } | S(    (    (   R7   R   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyt   <lambda>  s    (   R   Rt   R    t   intt   _find_rules_att   sortedt   items(   R   R   R   t   error_indicesR   R   R   R   t   rule_score_dictt	   test_sentt
   train_sentR   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR     s"       

 c         C   sr   t    } | | d | | d j oH | | d } x7 |  i D]( } | i | | |  } | i |  q> Wn | S(   s   
        @rtype: C{Set}
        @return: the set of all rules (based on the templates) that
        correct token C{i}'s tag in C{test_sent}.
        i   (   R   R~   Rd   t   update(   R   R   R   R   Rd   Rl   t   templatet	   new_rules(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR     s    	
 	c         C   s   d i    GHd  S(   Ns  
           B      |     
   S   F   r   O  |        Score = Fixed - Broken
   c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
   o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
   r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
   e   d   n   r  |  e
------------------+-------------------------------------------------------
        (   t   rstrip(   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR     s    	c      	   C   su   |  i  d j o\ d | | | | | | d | f Gd Gt i t |  d d d d d	 d d i   GHn | GHd  S(   Ni   s   %4d%4d%4d%4d t   |t   initial_indentt    i   t   widthiO   t   subsequent_indenti   s   |   s                       s                     (   R   t   textwrapt   fillR=   t   strip(   R   R   R   R   t
   numchanges(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR     s    (   R   R   R   R#   R
   R   R   R   R   R   R   (    (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR}   <  s   <	6	!		t   FastBrillTaggerTrainerc           B   s   e  Z d  Z d e d  Z d d d  Z d   Z d   Z d   Z d	   Z	 d
   Z
 d   Z d   Z d   Z d   Z d   Z d   Z d   Z d   Z RS(   s-   
    A faster trainer for brill taggers.
    i    c         C   s{   | d  j o | d j } n | |  _ | |  _ | |  _ | |  _ d  |  _ d  |  _ d  |  _ d  |  _ d  |  _	 d  |  _
 d  S(   Ni    (   R#   R   R~   R   R   t   _tag_positionst   _rules_by_positiont   _positions_by_rulet   _rules_by_scoret   _rule_scorest   _first_unknown_position(   R   R   R   R   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR
     s    											i   i   c   	   	   C   s  |  i  d j o d t |  GHn g  } | D] } | |  i i t |   q. ~ } |  i  d j o	 d GHn |  i | |  |  i  d j o d t |  i  GHn |  i  d j o |  i   n |  i  d j o	 d GHn g  } y x t |  | j  o |  i | | |  } | o | i	 |  n P|  i  d j o |  i
 |  n |  i | |  |  i |  |  i | | |  q WWn" t j
 o d t |  GHn X|  i   t |  i |  S(	   Ni    s(   Training Brill tagger on %d sentences...s   Finding initial useful rules...s       Found %d useful rules.i   i   s   Selecting rules...s+   Training stopped manually -- %d rules found(   R   R%   R   R   R   t   _init_mappingsR   R   R   Rt   R   t   _apply_rulet   _update_tag_positionst   _update_rulesR   t   _cleanR   (	   R   R   R   R   R'   R   R   R	   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR   H  s2    ,!
c   
      C   s  t  t  |  _ t  t  |  _ t  t  |  _ t  t  |  _ t  t  |  _	 t  t  |  _
 x t |  D] \ } } x t |  D] \ } \ } } |  i | i | | f  | | | d } | | j o: x7 |  i | | |  D] }	 |  i |	 | | |  q Wq q Wqg Wd S(   s   
        Initialize the tag position mapping & the rule related
        mappings.  For each error in test_sents, find new rules that
        would correct them, and add them to the rule mappings.
        i   N(   R    R6   R   R   R   R<   R   R   R   R   R   R   Rt   R   t   _update_rule_applies(
   R   R   R   R   R   R   t   wordR   Rl   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR     s"       c         C   s:   d  |  _ d  |  _ d  |  _ d  |  _ d  |  _ d  |  _ d  S(   N(   R#   R   R   R   R   R   R   (   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR     s    					c         c   s=   x6 |  i  D]+ } x" | i | | |  D] } | Vq& Wq
 Wd S(   s   
        Use the templates to find rules that apply at index C{wordnum}
        in the sentence C{sent} and generate the tag C{new_tag}.
        N(   R~   Rd   (   R   R   R   t   new_tagR   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR     s
    
  c         C   s  | | f } | |  i  | j o d Sn | | | d } | i | j o d |  i  | | <n7 | i | j o d |  i  | | <n d |  i  | | <|  i | i |  |  i | } |  i | c |  i  | | 7<|  i | i |  |  i |  i | i |  d S(   s   
        Update the rule data tables to reflect the fact that
        C{rule} applies at the position C{(sentnum, wordnum)}.
        Ni   ii    (   R   R   R   R   R   R   R   t   discard(   R   R   R   R   R   t   posRl   t	   old_score(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR     s    c         C   s   | | f } |  i  | } |  i  | c |  i | | 8<|  i | i |  |  i |  i  | i |  |  i | | =|  i | i |  d S(   s   
        Update the rule data tables to reflect the fact that C{rule}
        does not apply at the position C{(sentnum, wordnum)}.
        N(   R   R   R   R   R   R   R   (   R   R   R   R   R   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyt   _update_rule_not_applies  s    c         C   s  |  i  h  j o t Sn t |  i   } x| | j ot |  i  |  } |  i o | i d t  n x| D]} |  i | i } |  i	 i
 | d  } t i | |  }	 x t |	 t |   D]w }
 | |
 \ } } | i | | |  oJ |  i | | | |  |  i | | j  o | | d f |  i	 | <Pq9q q W|  i | | j o% t |  d d f |  i	 | <| Sqo qo W|  i  | p t  |  i  | =t |  i   d j o t Sn t |  i   } q* Wt S(   s  
        Find the next best rule.  This is done by repeatedly taking a
        rule with the highest score and stepping through the corpus to
        see where it applies.  When it makes an error (decreasing its
        score) it's bumped down, and we try a new rule with the
        highest score.  When we find a rule which has the highest
        score AND which has been tested against the entire corpus, we
        can conclude that it's the next best rule.
        R   i    ii   (   i    i(   R   R#   RJ   R6   R   t   sortR   R   R   R   R   t   bisectt   bisect_leftR$   R%   R&   R   R   R"   (   R   R   R   R   t	   max_scoret
   best_rulesR   R   t   unkRL   R   R   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR     s<    

  
c   	      C   s   t  |  i |  } | i } | i } |  i d j o |  i t |   n x: | D]2 \ } } | | | d } | | f | | | <qS Wd S(   sl   
        Update C{test_sents} by applying C{rule} everywhere where its
        conditions are meet.
        i   i    N(   R   R   R   R   R   t   _trace_applyR%   (	   R   R   R   t   update_positionst   old_tagR   R   R   Ra   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR     s    		' c         C   se   x^ |  i  | D]O } |  i | i } t i | |  } | | =|  i | i } t i | |  q Wd S(   sh   
        Update _tag_positions to reflect the changes to tags that are
        made by C{rule}.
        N(   R   R   R   R   R   R   t   insort_left(   R   R   R   t   old_tag_positionst	   old_indext   new_tag_positions(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR   &  s     c         C   sa  t    } xs |  i | D]d \ } } xU |  i D]J } | i | | |  } | i g  }	 | D] }
 |	 | |
 f qZ ~	  q- Wq Wd } } } x| D]\ } } | | } | | | d } t  |  i | | f  } xB | D]: } | i | |  p! | d 7} |  i | | |  q q Wt    } x |  i D] } xy | i | | |  D]b } | | j oO | d 7} | |  i	 j o | d 7} n | i
 |  |  i | | | |  qIqIWq-Wx| |  i i   D]k \ } } | | | f j oL | | j o; | d 7} | i | |  o |  i | | | |  q*q.qqWq W|  i d j o |  i | | |  n d S(   s{   
        Check if we should add or remove any rules from consideration,
        given the changes made by C{rule}.
        i    i   i   N(   R   R   R~   Re   R   R   R&   R   Rd   R   R   R   R   R   R   t   _trace_update_rules(   R   R   R   R   t	   neighborsR   R   R   t   nR'   R   t   num_obsoletet   num_newt
   num_unseenR   Rl   t	   old_rulest   old_rulet
   site_rulest   new_ruleR   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR   5  sR    	 
 6 
 
	
  
 
c         C   s   d i    GHd  S(   Ns  
           B      |     
   S   F   r   O  |        Score = Fixed - Broken
   c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
   o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
   r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
   e   d   n   r  |  e
------------------+-------------------------------------------------------
        (   R   (   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR   r  s    	c         C   s]  |  i  | t |  i | i    j p t  |  i | i   } t |  } t g  } | D] } | d j o | | q[ q[ ~  } t g  } | D] } | d j o | | q q ~  } t g  }	 | D] } | d j o |	 | q q ~	  }
 |  i  | } |  i d j oB d | | | |
 f Gt i t	 |  d d d	 d d i
   GHn | GHd  S(   Ni   ii    i   s   %4d%4d%4d%4d  |R   R   i   R   i   s   |   s                       s                     (   R   t   sumR   t   valuesR"   R%   R   R   R   R=   R   (   R   R   t   changest   num_changedR'   RW   t	   num_fixedRo   t
   num_brokenRr   t	   num_otherR   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR   }  s    
$888c         C   s   d } | GH| Gd | GHd  S(   NR   i   R   s   Applying rule to %d positions.s                     s                     |(    (   R   t   num_updatest   prefix(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR     s    c         C   s8   d } | Gd GH| Gd | GH| Gd | | f GH| GHd  S(	   NR   i   R   s   Updated rule tables:s      - %d rule applications removeds)     - %d rule applications added (%d novel)s                     s                     |(    (   R   R   R   R   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR     s    	(   R   R   R   R#   R
   R   R   R   R   R   R   R   R   R   R   R   R   R   R   (    (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyR     s"   /<					 		/			=			i   c         C   s'  d d d d d d d d d d d i  d  d	 f } | g } x t |  |  D] \ } } x t |  D] \ } \ } }	 | | d
 }
 |	 |
 j o d i d   | |  D  } d i d   | | d
 D  } d | |
 |	 f } | i d | d | i  d  | d  f  qn qn WqU W| S(   s1  
    Returns a list of human-readable strings indicating the errors in the
    given tagging of the corpus.

    @param train_sents: The correct tagging of the corpus
    @type train_sents: C{list} of C{tuple}
    @param test_sents: The tagged corpus
    @type test_sents: C{list} of C{tuple}
    @param radius: How many tokens on either side of a wrongly-tagged token
        to include in the error string.  For example, if C{radius}=2,
        each error string will show the incorrect token plus two
        tokens on either side.
    @type radius: int
    s   %25s | %s | %s
t   -i   t   +i   s   left contexts   word/test->goldi   s   right contexti   R   c         s   s   x |  ] } d  | Vq Wd S(   s   %s/%sN(    (   R7   t   w(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pys	   <genexpr>  s    c         s   s   x |  ] } d  | Vq Wd S(   s   %s/%sN(    (   R7   R   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pys	   <genexpr>  s    s	   %s/%s->%ss   %25s | %s | %sii   (   t   centert   zipR   RS   Rt   (   R   R   t   radiust   hdrt   errorsR   R   R   R   t	   train_post   test_post   leftt   rightt   mid(    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyt
   error_list  s    #	  !	.id   i   i   s
   errors.outs
   rules.yamlg?c         C   sC  d d k  l } d d k l }	 d d k l }
 |	 i d d g  } d	 GH| i   } | o$ t i	 t
 t   t i t  n t |  |  } | |  } | | |  !} g  } | D], } | g  } | D] } | | d
 q ~ q ~ } d GHd GH|	 i | d | } | o d |	 i | |  GHn d GH|	 i | d | } | o d |	 i | |  GHn |
 i |
 i d  |
 i |
 i d  |
 i |
 i d  |
 i |
 i d  |
 i |
 i d   |
 i |
 i d!  |
 i |
 i d"  |
 i |
 i d#  |
 i |
 i d$ d%  |
 i |
 i d& d'  g
 } |
 i | | |  } | i | | |  } | o d |	 i | |  GHn | d j o+ d GHx# | i   D] } t |  GHqWn t | d  } t i | |  | i   | i |  } t | d  } | i d |  x( t | |  D] } | i | d  qW| i   d | | f GHd S((   sp  
    Brill Tagger Demonstration

    @param num_sents: how many sentences of training and testing data to use
    @type num_sents: L{int}
    @param max_rules: maximum number of rule instances to create
    @type max_rules: L{int}
    @param min_score: the minimum score for a rule in order for it to
        be considered
    @type min_score: L{int}
    @param error_output: the file where errors will be saved
    @type error_output: C{string}
    @param rule_output: the file where rules will be saved
    @type rule_output: C{string}
    @param randomize: whether the training data should be a random subset
        of the corpus
    @type randomize: boolean
    @param train: the fraction of the the corpus to be used for training
        (1=all)
    @type train: L{float}
    @param trace: the level of diagnostic tracing output to produce (0-4)
    @type trace: L{int}
    i(   t   treebank(   R   (   t   brills   ^-?[0-9]+(.[0-9]+)?$t   CDs   .*t   NNs   Loading tagged data... i    s   Done loading.s   Training unigram tagger:t   backoffs       [accuracy: %f]s   Training bigram tagger:i   i   i   s   
Brill accuracy: %fs   
Rules: R   s   Errors for Brill Tagger %r

s   
s*   Done; rules and errors saved to %s and %s.N(   s   ^-?[0-9]+(.[0-9]+)?$s   CD(   s   .*s   NN(   i   i   (   i   i   (   i   i   (   i   i   (   i   i   (   i   i   (   i   i   (   i   i   (   ii(   i   i   (   ii(   i   i   (    t   nltk.corpusR   t   nltkR   t   nltk.tagR   t   RegexpTaggert   tagged_sentst   randomt   seedR%   t   sentst   shuffleR   t   UnigramTaggert   accuracyt   BigramTaggerRv   R_   R`   Rf   R   R   R	   R=   t   filet   yamlt   dumpt   closet	   batch_tagt   writeR   (   t	   num_sentsR   R   t   error_outputt   rule_outputt	   randomizeR   R   R   R   R   t   nn_cd_taggert   tagged_datat   cutofft   training_datat	   gold_dataR'   R   Ro   t   tt   testing_datat   unigram_taggert   bigram_taggerR   t   trainert   brill_taggerR   t   print_rulest
   error_fileR3   (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pyt   demo  sp    	
@		 
 
t   __main__(   R   R   R   R  R   R   R    t   utilR   t   apit   TaggerIt
   YAMLObjectR   R    R.   R_   R`   t   objectRb   Rf   Rv   R}   R   R   R)   R  R   (    (    (    s$   /p/zhu/06/nlp/nltk/nltk/tag/brill.pys   <module>   s0   
=L/[= "	a