³ò
4ÒÇIc           @   sŸ   d  Z  d d k Z d d k Z d d k l Z d d „ Z d „  Z d „  Z	 d „  Z
 d e f d	 „  ƒ  YZ d
 „  Z d „  Z e d „ Z h  a d d „ Z d S(   s0   
Utility functions and classes for classifiers.
iÿÿÿÿN(   t   LazyMappedListc            si   | d j o$ | o t | d t t f ƒ } n | o  ‡  f d †  } t | | ƒ Sn t | ˆ  ƒ Sd S(   sÓ  
    Use the L{LazyMappedList} class to construct a lazy list-like
    object that is analogous to C{map(feature_func, toks)}.  In
    particular, if C{labeled=False}, then the returned list-like
    object's values are equal to::

        [feature_func(tok) for tok in toks]

    If C{labeled=True}, then the returned list-like object's values
    are equal to::

        [(feature_func(tok), label) for (tok, label) in toks]

    The primary purpose of this function is to avoid the memory
    overhead involved in storing all the featuresets for every token
    in a corpus.  Instead, these featuresets are constructed lazily,
    as-needed.  The reduction in memory overhead can be especially
    significant when the underlying list of tokens is itself lazy (as
    is the case with many corpus readers).

    @param feature_func: The function that will be applied to each
        token.  It should return a featureset -- i.e., a C{dict}
        mapping feature names to feature values.
    @param toks: The list of tokens to which C{feature_func} should be
        applied.  If C{labeled=True}, then the list elements will be
        passed directly to C{feature_func()}.  If C{labeled=False},
        then the list elements should be tuples C{(tok,label)}, and
        C{tok} will be passed to C{feature_func()}.
    @param labeled: If true, then C{toks} contains labeled tokens --
        i.e., tuples of the form C{(tok, label)}.  (Default:
        auto-detect based on types.)
    i    c            s   ˆ  |  d ƒ |  d f S(   Ni    i   (    (   t   labeled_token(   t   feature_func(    s(   /p/zhu/06/nlp/nltk/nltk/classify/util.pyt	   lazy_func>   s    N(   t   Nonet
   isinstancet   tuplet   listR    (   R   t   tokst   labeledR   (    (   R   s(   /p/zhu/06/nlp/nltk/nltk/classify/util.pyt   apply_features   s    !$c         C   s1   t  t g  } |  D] \ } } | | q ~ ƒ ƒ S(   s&  
    @return: A list of all labels that are attested in the given list
        of tokens.
    @rtype: C{list} of (immutable)
    @param tokens: The list of classified tokens from which to extract
        labels.  A classified token has the form C{(token, label)}.
    @type tokens: C{list}
    (   R   t   set(   t   tokenst   _[1]t   tokt   label(    (    s(   /p/zhu/06/nlp/nltk/nltk/classify/util.pyt   attested_labelsD   s    	c   	      C   s’   |  i  g  } | D] \ } } | | q ~ ƒ } g  } t | | ƒ D]" \ \ } } } | | i | ƒ qD ~ } t i t t | ƒ ƒ t | ƒ ƒ S(   N(   t   batch_prob_classifyt   zipt   probt   matht   logt   floatt   sumt   len(	   t
   classifiert   goldR   t   fst   lt   resultst   _[2]t   pdistt   ll(    (    s(   /p/zhu/06/nlp/nltk/nltk/classify/util.pyt   log_likelihoodO   s    0?c   	      C   s†   |  i  g  } | D] \ } } | | q ~ ƒ } g  } t | | ƒ D] \ \ } } } | | | j qD ~ } t t | ƒ ƒ t | ƒ S(   N(   t   batch_classifyR   R   R   R   (	   R   R   R   R   R   R   R   t   rt   correct(    (    s(   /p/zhu/06/nlp/nltk/nltk/classify/util.pyt   accuracyT   s    0<t   CutoffCheckerc           B   s    e  Z d  Z d „  Z d „  Z RS(   sÉ   
    A helper class that implements cutoff checks based on number of
    iterations and log likelihood.

    Accuracy cutoffs are also implemented, but they're almost never
    a good idea to use.
    c         C   sy   | i  ƒ  |  _ d | j o t | d ƒ | d <n d | j o t | d ƒ | d <n d  |  _ d  |  _ d |  _ d  S(   Nt   min_llt   min_lldeltai   (   t   copyt   cutoffst   absR   R    t   acct   iter(   t   selfR*   (    (    s(   /p/zhu/06/nlp/nltk/nltk/classify/util.pyt   __init__a   s    		c         C   s  |  i  } |  i d 7_ d | j o |  i | d j o t Sn d | j p d | j oˆ t i i i | | ƒ } d | j o | | d j o t Sn d | j o0 |  i o& | |  i t | d ƒ j o t Sn | |  _ n d | j p d | j oŒ t i i i | | ƒ } d | j o | | d j o t Sn d | j o0 |  i	 o& | |  i	 t | d ƒ j o t Sn | |  _	 t
 Sn d  S(   Ni   t   max_iterR'   R(   t   max_acct   min_accdelta(   R*   R-   t   Truet   nltkt   classifyt   utilR!   R    R+   R,   t   False(   R.   R   t
   train_toksR*   t   new_llt   new_acc(    (    s(   /p/zhu/06/nlp/nltk/nltk/classify/util.pyt   checkk   s,    	!	(   t   __name__t
   __module__t   __doc__R/   R;   (    (    (    s(   /p/zhu/06/nlp/nltk/nltk/classify/util.pyR&   Y   s   	
c         C   s„   h  } t  | d <|  d i ƒ  | d <|  d i ƒ  | d <xE d D]= } |  i ƒ  i | ƒ | d | <| |  i ƒ  j | d | <q? W| S(	   Nt   alwaysoni    t
   startswithiÿÿÿÿt   endswitht   abcdefghijklmnopqrstuvwxyzs	   count(%s)s   has(%s)(   R3   t   lowert   count(   t   namet   featurest   letter(    (    s(   /p/zhu/06/nlp/nltk/nltk/classify/util.pyt   names_demo_featuresŠ   s    
 c         C   sÌ   h  } t  | d <|  d i ƒ  d j | d <|  d i ƒ  d j | d <x d D]y } |  i ƒ  i | ƒ | d | <| |  i ƒ  j | d	 | <| |  d i ƒ  j | d
 | <| |  d i ƒ  j | d | <qK W| S(   NR?   i    t   aeiouys   startswith(vowel)iÿÿÿÿs   endswith(vowel)RB   s	   count(%s)s   has(%s)s   startswith(%s)s   endswith(%s)(   R3   RC   RD   (   RE   RF   RG   (    (    s(   /p/zhu/06/nlp/nltk/nltk/classify/util.pyt   binary_names_demo_features”   s    
 "c         C   s^  d d k  l } d d  k } g  } | i d ƒ D] } | | d f q0 ~ g  } | i d ƒ D] } | | d f q] ~ } | i d ƒ | i | ƒ | d  } | d d	 !}	 d
 GH|  g  }
 | D] \ } } |
 | | ƒ | f q¾ ~
 ƒ } d GHt | g  } |	 D] \ } } | | | ƒ | f qÿ ~ ƒ } d | GHyg  } |	 D] \ } } | | | ƒ qA~ } | i | ƒ } g  } t |	 | ƒ D]" \ \ } } } | | i	 | ƒ q†~ } d t
 | ƒ t |	 ƒ GHHd d d GHxh t |	 | ƒ d  D]S \ \ } } } | d j o
 d } n d } | | | i d ƒ | i d ƒ f GHqìWWn t j
 o n X| S(   Niÿÿÿÿ(   t   namess   male.txtt   males
   female.txtt   femalei@â iˆ  i|  s   Training classifier...s   Testing classifier...s   Accuracy: %6.4fs   Avg. log likelihood: %6.4fs%   Unseen Names      P(Male)  P(Female)
t   -i(   i   s     %-15s *%6.4f   %6.4fs     %-15s  %6.4f  *%6.4f(   t   nltk.corpusRK   t   randomt   wordst   seedt   shuffleR%   R   R   t   logprobR   R   R   t   NotImplementedError(   t   trainerRF   RK   RP   R   RE   R   t   namelistt   traint   testt   _[3]t   nt   gR   t   _[4]R,   t   _[5]t   test_featuresetst   pdistst   _[6]R   R   R    t   gendert   fmt(    (    s(   /p/zhu/06/nlp/nltk/nltk/classify/util.pyt
   names_demo    s<    -1
9<	-8 
,iè  c         C   sM  d d k  l } d d  k } d GH| t j o? g  } | i | ƒ D] } | | | i d f qB ~ t | <n t | } | t | ƒ j o t | ƒ } n t t d „  | Dƒ ƒ ƒ }	 d d i	 |	 ƒ GHd GH| i
 d	 ƒ | i | ƒ | t d
 | ƒ  }
 | t d
 | ƒ | !} d GH|  g  } |
 D] \ } } | | | ƒ | f q&~ ƒ } d GHt | g  } | D] \ } } | | | ƒ | f qg~ ƒ } d | GHy˜ g  } | D] \ } } | | | ƒ q©~ } | i | ƒ } g  } t | | ƒ D]" \ \ } } } | | i | ƒ qî~ } d t | ƒ t | ƒ GHWn t j
 o n X| S(   Niÿÿÿÿ(   t   sensevals   Reading data...i    c         s   s   x |  ] \ } } | Vq Wd  S(   N(    (   t   .0t   iR   (    (    s(   /p/zhu/06/nlp/nltk/nltk/classify/util.pys	   <genexpr>Ù   s    s
     Senses: t    s   Splitting into test & train...i@â gš™™™™™é?s   Training classifier...s   Testing classifier...s   Accuracy: %6.4fs   Avg. log likelihood: %6.4f(   RO   Re   RP   t   _inst_cachet	   instancest   sensesR   R   R   t   joinRR   RS   t   intR%   R   R   RT   R   RU   (   RV   t   wordRF   R[   Re   RP   R   Rg   Rj   Rk   RX   RY   R   R   R   RZ   R,   R]   R_   R`   R^   RE   R   R   R    (    (    s(   /p/zhu/06/nlp/nltk/nltk/classify/util.pyt   wsd_demoÎ   s8    ?#9<	-8(   R>   R   t   nltk.classify.utilR4   t	   nltk.utilR    R   R
   R   R!   R%   t   objectR&   RH   RJ   Rd   Ri   Ro   (    (    (    s(   /p/zhu/06/nlp/nltk/nltk/classify/util.pys   <module>   s   
*			1	
	-