³ò
ŽB_Kc        	   @   si  d  Z  d d k Z d d k Z d d k l Z l Z d d k l Z d d k Td d k	 Td e
 f d „  ƒ  YZ d e f d	 „  ƒ  YZ d
 e e i f d „  ƒ  YZ d e e i f d „  ƒ  YZ d e f d „  ƒ  YZ d e f d „  ƒ  YZ d e f d „  ƒ  YZ d e e i f d „  ƒ  YZ d e e i f d „  ƒ  YZ d e e f d „  ƒ  YZ d e f d „  ƒ  YZ d S(   s5  
Classes for tagging sentences sequentially, left to right.  The
abstract base class L{SequentialBackoffTagger} serves as the base
class for all the taggers in this module.  Tagging of individual words
is performed by the method L{choose_tag()
<SequentialBackoffTagger.choose_tag>}, which is defined by
subclasses of L{SequentialBackoffTagger}.  If a tagger is unable to
determine a tag for the specified token, then its I{backoff tagger} is
consulted instead.  Any C{SequentialBackoffTagger} may serve as a
backoff tagger for any other C{SequentialBackoffTagger}.
iÿÿÿÿN(   t   FreqDistt   ConditionalFreqDist(   t   NaiveBayesClassifier(   t   *t   SequentialBackoffTaggerc           B   sP   e  Z d  Z e d „ Z d „  Z e e d d ƒZ d „  Z d „  Z	 d „  Z
 RS(   sÆ  
    An abstract base class for taggers that tags words sequentially,
    left to right.  Tagging of individual words is performed by the
    method L{choose_tag()}, which should be defined by subclasses.  If
    a tagger is unable to determine a tag for the specified token,
    then its backoff tagger is consulted.

    @ivar _taggers: A list of all the taggers that should be tried to
        tag a token (i.e., C{self} and its backoff taggers).
    c         C   s4   | d  j o |  g |  _ n |  g | i |  _ d  S(   N(   t   Nonet   _taggers(   t   selft   backoff(    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyt   __init__,   s    c         C   s-   t  |  i ƒ d j  o d  Sn |  i d Sd  S(   Ni   i   (   t   lenR   R   (   R   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyt   _get_backoff2   s    t   docs,   
        The backoff tagger for this tagger.c         C   sL   g  } x6 t  t | ƒ ƒ D]" } | i |  i | | | ƒ ƒ q Wt | | ƒ S(   N(   t   rangeR
   t   appendt   tag_onet   zip(   R   t   tokenst   tagst   i(    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyt   tag8   s
      c         C   sE   t  } x8 |  i D]- } | i | | | ƒ } | t  j	 o Pq q W| S(   sG  
        Determine an appropriate tag for the specified token, and
        return that tag.  If this tagger is unable to determine a tag
        for the specified token, then its backoff tagger is consulted.
        
        @rtype: C{str}
        @type tokens: C{list}
        @param tokens: The list of words that are being tagged.
        @type index: C{int}
        @param index: The index of the word whose tag should be
            returned.
        @type history: C{list} of C{str}
        @param history: A list of the tags for all words before
            C{index}.
        (   R   R   t
   choose_tag(   R   R   t   indext   historyR   t   tagger(    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR   ?   s    
 c         C   s   t  d ƒ ‚ d S(   s³  
        Decide which tag should be used for the specified token, and
        return that tag.  If this tagger is unable to determine a tag
        for the specified token, return C{None} -- do I{not} consult
        the backoff tagger.  This method should be overridden by
        subclasses of C{SequentialBackoffTagger}.

        @rtype: C{str}
        @type tokens: C{list}
        @param tokens: The list of words that are being tagged.
        @type index: C{int}
        @param index: The index of the word whose tag should be
            returned.
        @type history: C{list} of C{str}
        @param history: A list of the tags for all words before
            C{index}.
        s,   SequentialBackoffTagger is an abstract classN(   t   AssertionError(   R   R   R   R   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR   U   s    (   t   __name__t
   __module__t   __doc__R   R	   R   t   propertyR   R   R   R   (    (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR   !   s   
					t   ContextTaggerc           B   sM   e  Z d  Z e d „ Z d „  Z d „  Z d „  Z d „  Z d e	 d „ Z
 RS(   s2  
    An abstract base class for sequential backoff taggers that choose
    a tag for a token based on the value of its "context".  Different
    subclasses are used to define different contexts.

    A C{ContextTagger} chooses the tag for a token by calculating the
    token's context, and looking up the corresponding tag in a table.
    This table can be constructed manually; or it can be automatically
    constructed based on a training corpus, using the L{_train()}
    factory method.

    @ivar _context_to_tag: Dictionary mapping contexts to tags.
    c         C   s1   t  i |  | ƒ | o | |  _ n
 h  |  _ d S(   sŸ   
        @param context_to_tag: A dictionary mapping contexts to tags.
        @param backoff: The backoff tagger that should be used for this tagger.
        N(   R   R	   t   _context_to_tag(   R   t   context_to_tagR   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR	   x   s    c         C   s   t  d ƒ ‚ d S(   sá   
        @return: the context that should be used to look up the tag
            for the specified token; or C{None} if the specified token
            should not be handled by this tagger.
        @rtype: (hashable)
        s   Abstract base classN(   R   (   R   R   R   R   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyt   contextƒ   s    c         C   s%   |  i  | | | ƒ } |  i i | ƒ S(   N(   R!   R   t   get(   R   R   R   R   R!   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR   Œ   s    c         C   s   t  |  i ƒ S(   sw   
        @return: The number of entries in the table used by this
        tagger to map from contexts to tags.
        (   R
   R   (   R   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyt   size   s    c         C   s   d |  i  i |  i ƒ  f S(   Ns   <%s: size=%d>(   t	   __class__R   R#   (   R   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyt   __repr__—   s    i    c      	   C   s©  d } } t  ƒ  } t ƒ  } xÏ | D]Ç } t | Œ  \ }	 }
 x¬ t | ƒ D]ž \ } \ } } | d 7} |  i |	 | |
 |  ƒ } | d j o qH n | | i | ƒ |  i d j p# | |  i i |	 | |
 |  ƒ j o | i	 | ƒ qH qH Wq# WxT | D]L } | | i
 ƒ  } | | | } | | j o | |  i | <| | 7} qõ qõ W| oY t |  i ƒ } d | d | } d | d t | i ƒ  ƒ } d Gd | | | f GHn d S(   sð  
        Initialize this C{ContextTagger}'s L{_context_to_tag} table
        based on the given training data.  In particular, for each
        context C{I{c}} in the training data, set
        C{_context_to_tag[I{c}]} to the most frequent tag for that
        context.  However, exclude any contexts that are already
        tagged perfectly by the backoff tagger(s).

        The old value of C{self._context_to_tag} (if any) is discarded.

        @param tagged_corpus: A tagged corpus.  Each item should be
            a C{list} of C{(word, tag)} tuples.
        @param cutoff: If the most likely tag for a context occurs
            fewer than C{cutoff} times, then exclude it from the
            context-to-tag table for the new tagger.
        i    i   id   g      Y@s   [Trained Unigram tagger:s(   size=%d, backoff=%.2f%%, pruning=%.2f%%]N(   t   setR   R   t	   enumerateR!   R   t   incR   R   t   addt   maxR   R
   t
   conditions(   R   t   tagged_corpust   cutofft   verboset   token_countt	   hit_countt   useful_contextst   fdt   sentenceR   R   R   t   tokenR   R!   t   best_tagt   hitsR#   R   t   pruning(    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyt   _trainš   s:    
		  
# (   R   R   R   R   R	   R!   R   R#   R%   t   FalseR8   (    (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR   j   s   					t   DefaultTaggerc           B   s/   e  Z d  Z d Z d „  Z d „  Z d „  Z RS(   s<   
    A tagger that assigns the same tag to every token.
    s   !nltk.DefaultTaggerc         C   s   | |  _  t i |  d ƒ d S(   sK   
        Construct a new tagger that assigns C{tag} to all tokens.
        N(   t   _tagR   R	   R   (   R   R   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR	   Þ   s    	c         C   s   |  i  S(   N(   R;   (   R   R   R   R   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR   å   s    c         C   s   d |  i  S(   Ns   <DefaultTagger: tag=%s>(   R;   (   R   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR%   è   s    (   R   R   R   t   yaml_tagR	   R   R%   (    (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR:   Ø   s
   		t   NgramTaggerc           B   s5   e  Z d  Z d Z e e e d e d „ Z d „  Z RS(   s-  
    A tagger that chooses a token's tag based on its word string and
    on the preceeding I{n} word's tags.  In particular, a tuple
    C{(tags[i-n:i-1], words[i])} is looked up in a table, and the
    corresponding tag is returned.  N-gram taggers are typically
    trained on a tagged corpus.
    s   !nltk.NgramTaggeri    c         C   sN   | |  _  |  i | | ƒ t i |  | | ƒ | o |  i | | | ƒ n d S(   s@  
        Train a new C{NgramTagger} using the given training data or
        the supplied model.  In particular, construct a new tagger
        whose table maps from each context C{(tag[i-n:i-1], word[i])}
        to the most frequent tag for that context.  But exclude any
        contexts that are already tagged perfectly by the backoff
        tagger.
        
        @param train: A tagged corpus consisting of a C{list} of tagged
            sentences, where each sentence is a C{list} of C{(word, tag)} tuples.
        @param backoff: A backoff tagger, to be used by the new
            tagger if it encounters an unknown context.
        @param cutoff: If the most likely tag for a context occurs
            fewer than C{cutoff} times, then exclude it from the
            context-to-tag table for the new tagger.
        N(   t   _nt   _check_paramsR   R	   R8   (   R   t   nt   traint   modelR   R-   R.   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR	   ö   s
    	c         C   s5   t  | t d | |  i d ƒ | !ƒ } | | | f S(   Ni    i   (   t   tupleR*   R>   (   R   R   R   R   t   tag_context(    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR!     s    '(   R   R   R   R<   R   R9   R	   R!   (    (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR=   ì   s
   t   UnigramTaggerc           B   s5   e  Z d  Z d Z e e e d e d „ Z d „  Z RS(   s‚   
    A tagger that chooses a token's tag based its word string.
    Unigram taggers are typically trained on a tagged corpus.
    s   !nltk.UnigramTaggeri    c         C   s#   t  i |  d | | | | | ƒ d  S(   Ni   (   R=   R	   (   R   RA   RB   R   R-   R.   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR	     s    c         C   s   | | S(   N(    (   R   R   R   R   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR!   !  s    (   R   R   R   R<   R   R9   R	   R!   (    (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyRE     s
   t   BigramTaggerc           B   s)   e  Z d  Z d Z e e d e d „ Z RS(   s3  
    A tagger that chooses a token's tag based its word string and on
    the preceeding words' tag.  In particular, a tuple consisting
    of the previous tag and the word is looked up in a table, and
    the corresponding tag is returned.  Bigram taggers are typically
    trained on a tagged corpus.
    s   !nltk.BigramTaggeri    c         C   s#   t  i |  d | | | | | ƒ d  S(   Ni   (   R=   R	   (   R   RA   RB   R   R-   R.   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR	   /  s    (   R   R   R   R<   R   R9   R	   (    (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyRF   %  s   t   TrigramTaggerc           B   s,   e  Z d  Z d Z e e e d e d „ Z RS(   sC  
    A tagger that chooses a token's tag based its word string and on
    the preceeding two words' tags.  In particular, a tuple consisting
    of the previous two tags and the word is looked up in a table, and
    the corresponding tag is returned.  Trigram taggers are typically
    trained them on a tagged corpus.
    s   !nltk.TrigramTaggeri    c         C   s#   t  i |  d | | | | | ƒ d  S(   Ni   (   R=   R	   (   R   RA   RB   R   R-   R.   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR	   ?  s    (   R   R   R   R<   R   R9   R	   (    (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyRG   5  s   t   AffixTaggerc           B   s;   e  Z d  Z d Z e e d d e d e d „ Z d „  Z RS(   sË  
    A tagger that chooses a token's tag based on a leading or trailing
    substring of its word string.  (It is important to note that these
    substrings are not necessarily "true" morphological affixes).  In
    particular, a fixed-length substring of the word is looked up in a
    table, and the corresponding tag is returned.  Affix taggers are
    typically constructed by training them on a tagged corpus; see
    L{the constructor <__init__>}.
    s   !nltk.AffixTaggeriýÿÿÿi   i    c         C   sa   |  i  | | ƒ t i |  | | ƒ | |  _ | t | ƒ |  _ | o |  i | | | ƒ n d S(   s•  
        Construct a new affix tagger.
        
        @param affix_length: The length of the affixes that should be
            considered during training and tagging.  Use negative
            numbers for suffixes.
        @param min_stem_length: Any words whose length is less than
            C{min_stem_length+abs(affix_length)} will be assigned a
            tag of C{None} by this tagger.
        N(   R?   R   R	   t   _affix_lengtht   abst   _min_word_lengthR8   (   R   RA   RB   t   affix_lengtht   min_stem_lengthR   R-   R.   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR	   Q  s    	c         C   sV   | | } t  | ƒ |  i j  o d  Sn+ |  i d j o | |  i  Sn | |  i Sd  S(   Ni    (   R
   RK   R   RI   (   R   R   R   R   R4   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR!   g  s    
(   R   R   R   R<   R   R9   R	   R!   (    (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyRH   E  s
   		t   RegexpTaggerc           B   s2   e  Z d  Z d Z e d „ Z d „  Z d „  Z RS(   sa   
    A tagger that assigns tags to words based on regular expressions
    over word strings.
    s   !nltk.RegexpTaggerc         C   s   | |  _  t i |  | ƒ d S(   sÌ  
        Construct a new regexp tagger.

        @type regexps: C{list} of C{(str, str)}
        @param regexps: A list of C{(regexp, tag)} pairs, each of
            which indicates that a word matching C{regexp} should
            be tagged with C{tag}.  The pairs will be evalutated in
            order.  If none of the regexps match a word, then the
            optional backoff tagger is invoked, else it is
            assigned the tag C{None}.
        N(   t   _regexpsR   R	   (   R   t   regexpsR   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR	   w  s    	c         C   s=   x6 |  i  D]+ \ } } t i | | | ƒ o | Sq
 q
 Wd  S(   N(   RO   t   ret   matchR   (   R   R   R   R   t   regexpR   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR   †  s
    
 c         C   s   d t  |  i ƒ S(   Ns   <Regexp Tagger: size=%d>(   R
   RO   (   R   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR%   Œ  s    (   R   R   R   R<   R   R	   R   R%   (    (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyRN   q  s
   	t   ClassifierBasedTaggerc           B   s\   e  Z d  Z e e e i e e e e d „ Z d „  Z d „  Z	 d „  Z
 d „  Z d „  Z RS(   sì  
    A sequential tagger that uses a classifier to choose the tag for
    each token in a sentence.  The featureset input for the classifier
    is generated by a feature detector function::

        feature_detector(tokens, index, history) -> featureset

    Where C{tokens} is the list of unlabeled tokens in the sentence;
    C{index} is the index of the token for which feature detection
    should be performed; and C{history} is list of the tags for all
    tokens before C{index}.
    c         C   sœ   |  i  | | ƒ t i |  | ƒ | o | p | o | o t d ƒ ‚ n | d j	 o | |  _ n | |  _ | |  _ | o |  i | | | ƒ n d S(   sb  
        Construct a new classifier-based sequential tagger.

        @param feature_detector: A function used to generate the
            featureset input for the classifier::
                feature_detector(tokens, index, history) -> featureset

        @param train: A tagged corpus consisting of a C{list} of tagged
            sentences, where each sentence is a C{list} of C{(word, tag)} tuples.
            
        @param backoff: A backoff tagger, to be used by the new tagger
            if it encounters an unknown context.
            
        @param classifier_builder: A function used to train a new
            classifier based on the data in C{train}.  It should take
            one argument, a list of labeled featuresets (i.e.,
            C{(featureset, label)} tuples).
            
        @param classifier: The classifier that should be used by the
            tagger.  This is only useful if you want to manually
            construct the classifier; normally, you would use C{train}
            instead.
            
        @param backoff: A backoff tagger, used if this tagger is
            unable to determine a tag for a given token.
            
        @param cutoff_prob: If specified, then this tagger will fall
            back on its backoff tagger if the probability of the most
            likely tag is less than C{cutoff_prob}.
        s8   Must specify either training data or trained classifier.N(	   R?   R   R	   t
   ValueErrorR   t   _feature_detectort   _cutoff_probt   _classifierR8   (   R   t   feature_detectorRA   t   classifier_buildert
   classifierR   t   cutoff_probR.   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR	   œ  s    "		c         C   s€   |  i  | | | ƒ } |  i d  j o |  i i | ƒ SnD |  i i | ƒ } | i ƒ  } | i | ƒ |  i j o | Sn d  Sd  S(   N(   RY   RW   R   RX   t   classifyt   prob_classifyR*   t   prob(   R   R   R   R   t
   featuresett   pdistR   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR   Õ  s    c         C   sÆ   g  } | o	 d GHn x€ | D]x } g  } t  | Œ  \ } } xW t t | ƒ ƒ D]C }	 |  i | |	 | ƒ }
 | i |
 | |	 f ƒ | i | |	 ƒ qN Wq W| o d t | ƒ GHn | | ƒ |  _ d S(   sf   
        Build a new classifier, based on the given training data
        (C{tagged_corpus}).
        s,   Constructing training corpus for classifier.s"   Training classifier (%d instances)N(   R   R   R
   RY   R   RX   (   R   R,   RZ   R.   t   classifier_corpusR3   R   t   untagged_sentenceR   R   R`   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR8   æ  s     	  	c         C   s   d |  i  S(   Ns   <ClassifierBasedTagger: %r>(   RX   (   R   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR%   ý  s    c         C   s   |  i  | | | ƒ S(   s  
        Return the feature detector that this tagger uses to generate
        featuresets for its classifier.  The feature detector is a
        function with the signature::

          feature_detector(tokens, index, history) -> featureset

        @see: L{classifier()}
        (   RV   (   R   R   R   R   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyRY      s    
c         C   s   |  i  S(   sñ   
        Return the classifier that this tagger uses to choose a tag
        for each word in a sentence.  The input for this classifier is
        generated using this tagger's feature detector.

        @see: L{feature_detector()}
        (   RX   (   R   (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyR[     s    (   R   R   R   R   R   RA   R9   R	   R   R8   R%   RY   R[   (    (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyRT     s   6				t   ClassifierBasedPOSTaggerc           B   s   e  Z d  Z d „  Z RS(   s3   
    A classifier based part of speech tagger.
    c         C   s  | | } | d j o d  } } d  } } n„ | d j o2 | | d i ƒ  } d  } | | d } d  } nE | | d i ƒ  } | | d i ƒ  } | | d } | | d } t i d | ƒ o
 d }	 n{ t i d | ƒ o
 d }	 n^ t i d | ƒ o
 d	 }	 nA t i d
 | ƒ o
 d }	 n$ t i d | ƒ o
 d }	 n d }	 h  | d <| d <| d <| i ƒ  d <| i ƒ  d d <| i ƒ  d d <| i ƒ  d d <| d <| d <d | | i ƒ  f d <d | | i ƒ  f d <d | | i ƒ  f d <|	 d <}
 |
 S(    Ni    i   i   s!   [0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$t   numbers   \W+$t   puncts   [A-Z][a-z]+$t   upcases   [a-z]+$t   downcases   \w+$t	   mixedcaset   othert   prevtagt   prevprevtagt   words
   word.loweriýÿÿÿt   suffix3iþÿÿÿt   suffix2iÿÿÿÿt   suffix1t   prevprevwordt   prevwords   %s+%ss   prevtag+words   prevprevtag+words   prevword+wordt   shape(   R   t   lowerRQ   RR   (   R   R   R   R   Rm   Rr   Rq   Rk   Rl   Rs   t   features(    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyRY     sN    







					(   R   R   R   RY   (    (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pyRd     s   (   R   RQ   t   yamlt   nltk.probabilityR    R   t   nltk.classify.naivebayesR   t   apit   utilt   TaggerIR   R   t
   YAMLObjectR:   R=   RE   RF   RG   RH   RN   t   FeaturesetTaggerIRT   Rd   (    (    (    s)   /p/zhu/06/nlp/nltk/nltk/tag/sequential.pys   <module>   s    

In),‡