³ò
’B_Kc           @   se   d  Z  d d k l Z d d k Td d k Td e f d „  ƒ  YZ d „  Z e d j o e ƒ  n d S(	   s  
A classifier based on the Naive Bayes algorithm.  In order to find the
probability for a label, this algorithm first uses the Bayes rule to
express P(label|features) in terms of P(label) and P(features|label)::

                      P(label) * P(features|label)
 P(label|features) = ------------------------------
                             P(features)

The algorithm then makes the 'naive' assumption that all features are
independent, given the label::
                             
                      P(label) * P(f1|label) * ... * P(fn|label)
 P(label|features) = --------------------------------------------
                                        P(features)

Rather than computing P(featues) explicitly, the algorithm just
calculates the denominator for each label, and normalizes them so they
sum to one::
                             
                      P(label) * P(f1|label) * ... * P(fn|label)
 P(label|features) = --------------------------------------------
                       SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
iÿÿÿÿ(   t   defaultdict(   t   *t   NaiveBayesClassifierc           B   s\   e  Z d  Z d „  Z d „  Z d „  Z d „  Z d d „ Z d d „ Z e	 e
 d	 „ ƒ Z RS(
   s  
    A Naive Bayes classifier.  Naive Bayes classifiers are
    paramaterized by two probability distributions:

      - P(label) gives the probability that an input will receive each
        label, given no information about the input's features.
        
      - P(fname=fval|label) gives the probability that a given feature
        (fname) will receive a given value (fval), given that the
        label (label).

    If the classifier encounters an input with a feature that has
    never been seen with any label, then rather than assigning a
    probability of 0 to all labels, it will ignore that feature.

    The feature value 'None' is reserved for unseen feature values;
    you generally should not use 'None' as a feature value for one of
    your own features.
    c         C   s%   | |  _  | |  _ | i ƒ  |  _ d S(   s5  
        @param label_probdist: P(label), the probability distribution
            over labels.  It is expressed as a L{ProbDistI} whose
            samples are labels.  I.e., P(label) =
            C{label_probdist.prob(label)}.
        
        @param feature_probdist: P(fname=fval|label), the probability
            distribution for feature values, given labels.  It is
            expressed as a dictionary whose keys are C{(label,fname)}
            pairs and whose values are L{ProbDistI}s over feature
            values.  I.e., P(fname=fval|label) =
            C{feature_probdist[label,fname].prob(fval)}.  If a given
            C{(label,fname)} is not a key in C{feature_probdist}, then
            it is assumed that the corresponding P(fname=fval|label)
            is 0 for all values of C{fval}.
        N(   t   _label_probdistt   _feature_probdistt   samplest   _labels(   t   selft   label_probdistt   feature_probdist(    (    s.   /p/zhu/06/nlp/nltk/nltk/classify/naivebayes.pyt   __init__@   s    		c         C   s   |  i  S(   N(   R   (   R   (    (    s.   /p/zhu/06/nlp/nltk/nltk/classify/naivebayes.pyt   labelsU   s    c         C   s   |  i  | ƒ i ƒ  S(   N(   t   prob_classifyt   max(   R   t
   featureset(    (    s.   /p/zhu/06/nlp/nltk/nltk/classify/naivebayes.pyt   classifyX   s    c         C   s,  | i  ƒ  } xJ | i ƒ  D]< } x3 |  i D]! } | | f |  i j o Pq) q) W| | =q Wh  } x' |  i D] } |  i i | ƒ | | <qi WxŠ |  i D] } xv | i ƒ  D]h \ } } | | f |  i j o0 |  i | | f } | | c | i | ƒ 7<q¦ | | c t g  ƒ 7<q¦ Wq“ Wt | d t	 d t	 ƒS(   Nt	   normalizet   log(
   t   copyt   keysR   R   R   t   logprobt   itemst   sum_logst   DictionaryProbDistt   True(   R   R   t   fnamet   labelR   t   fvalt   feature_probs(    (    s.   /p/zhu/06/nlp/nltk/nltk/classify/naivebayes.pyR   [   s*     
 	
 
  i
   c   	         sD  |  i  ‰  d GHx/|  i | ƒ D]\ ‰ ‰ ‡ ‡ ‡  f d †  } t g  } |  i D]. } ˆ ˆ  | ˆ f i ƒ  j o | | qP qP ~ d | ƒ} t | ƒ d j o q n | d } | d } ˆ  | ˆ f i ˆ ƒ d j o
 d } n5 d ˆ  | ˆ f i ˆ ƒ ˆ  | ˆ f i ˆ ƒ } d	 ˆ ˆ | d
  | d
  | f GHq Wd  S(   Ns   Most Informative Featuresc            s   ˆ |  ˆ f i  ˆ  ƒ S(   N(   t   prob(   t   l(   R   R   t   cpdist(    s.   /p/zhu/06/nlp/nltk/nltk/classify/naivebayes.pyt	   labelprob‚   s    t   keyi   i    iÿÿÿÿt   INFs   %8.1fs"   %24s = %-14r %6s : %-6s = %s : 1.0i   (   R   t   most_informative_featurest   sortedR   R   t   lenR   (	   R   t   nR    t   _[1]R   R   t   l0t   l1t   ratio(    (   R   R   R   s.   /p/zhu/06/nlp/nltk/nltk/classify/naivebayes.pyt   show_most_informative_features|   s"    	 1	

 
id   c   	         s  t  ƒ  } t d „  ƒ ‰  t d „  ƒ ‰ x² |  i i ƒ  D]¡ \ \ } } } xŒ | i ƒ  D]~ } | | f } | i | ƒ | i | ƒ } t | ˆ  | ƒ ˆ  | <t | ˆ | ƒ ˆ | <ˆ | d j o | i	 | ƒ qV qV Wq7 Wt
 | d ‡  ‡ f d †  ƒ} | |  S(   s–  
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature C{(fname,fval)} is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label::

          max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        c           S   s   d  S(   g        (    (    (    (    s.   /p/zhu/06/nlp/nltk/nltk/classify/naivebayes.pyt   <lambda>    s    c           S   s   d  S(   g      ð?(    (    (    (    s.   /p/zhu/06/nlp/nltk/nltk/classify/naivebayes.pyR,   ¡   s    i    R!   c            s   ˆ |  ˆ  |  S(    (    (   t   feature(   t   maxprobt   minprob(    s.   /p/zhu/06/nlp/nltk/nltk/classify/naivebayes.pyR,   °   s    (   t   setR    R   R   R   t   addR   R   t   mint   discardR$   (	   R   R&   t   featuresR   R   t   probdistR   R-   t   p(    (   R.   R/   s.   /p/zhu/06/nlp/nltk/nltk/classify/naivebayes.pyR#   ’   s"    	  	c         C   s  t  ƒ  } t t  ƒ } t t ƒ } t ƒ  } xs |  D]k \ } } | i | ƒ xO | i ƒ  D]A \ } }	 | | | f i |	 ƒ | | i |	 ƒ | i | ƒ qW Wq1 Wxn | D]f } | | }
 xS | D]K } | | | f i ƒ  } | | | f i t |
 | ƒ | | i t ƒ q¾ Wq§ W| | ƒ } h  } xL | i ƒ  D]> \ \ } } } | | d t | | ƒ ƒ} | | | | f <q0Wt	 | | ƒ S(   sŠ   
        @param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples C{(featureset, label)}.
        t   bins(
   t   FreqDistR    R0   t   incR   R1   t   Nt   NoneR%   R   (   t   labeled_featuresetst	   estimatort   label_freqdistt   feature_freqdistt   feature_valuest   fnamesR   R   R   R   t   num_samplest   countR   R	   t   freqdistR5   (    (    s.   /p/zhu/06/nlp/nltk/nltk/classify/naivebayes.pyt   train³   s6    		   
  (   t   __name__t
   __module__t   __doc__R
   R   R   R   R+   R#   t   staticmethodt   ELEProbDistRE   (    (    (    s.   /p/zhu/06/nlp/nltk/nltk/classify/naivebayes.pyR   ,   s   				!!c          C   s-   d d k  l }  |  t i ƒ } | i ƒ  d  S(   Niÿÿÿÿ(   t
   names_demo(   t   nltk.classify.utilRK   R   RE   R+   (   RK   t
   classifier(    (    s.   /p/zhu/06/nlp/nltk/nltk/classify/naivebayes.pyt   demoå   s    t   __main__N(	   RH   t   nltk.compatR    t   nltk.probabilityt   apit   ClassifierIR   RN   RF   (    (    (    s.   /p/zhu/06/nlp/nltk/nltk/classify/naivebayes.pys   <module>!   s   

¹	