³ò
4ÒÇIc           @   s¡   d  Z  d d k Z d „  Z e i Z d „  Z d Z d Z d Z	 d Z
 d e f d	 „  ƒ  YZ d
 e f d „  ƒ  YZ d e f d „  ƒ  YZ d e f d „  ƒ  YZ d S(   sÉ   
Provides scoring functions for a number of association measures through a
generic, abstract implementation in L{NgramAssocMeasures}, and n-specific
L{BigramAssocMeasures} and L{TrigramAssocMeasures}.
iÿÿÿÿNc         C   s   t  i |  d  ƒ S(   g       @(   t   _matht   log(   t   x(    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyt   <lambda>   s    c         C   s   t  d  „  |  ƒ S(   c         S   s   |  | S(    (    (   R   t   y(    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyR      s    (   t   reduce(   t   s(    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyR      s    g#B’¡œÇ;i    iþÿÿÿt   NgramAssocMeasuresc           B   s³   e  Z d  Z e d „  ƒ Z e d „  ƒ Z e d „  ƒ Z e d „  ƒ Z e d „  ƒ Z	 e d „  ƒ Z
 e d „  ƒ Z e d „  ƒ Z e d	 „  ƒ Z e d
 „  ƒ Z e d „  ƒ Z RS(   sº  
    An abstract class defining a collection of generic association measures.
    Each public method returns a score, taking the following arguments:
        score_fn(count_of_ngram,
                 (count_of_n-1gram_1, ..., count_of_n-1gram_j),
                 (count_of_n-2gram_1, ..., count_of_n-2gram_k),
                 ...,
                 (count_of_1gram_1, ..., count_of_1gram_n),
                 count_of_total_words)
    See L{BigramAssocMeasures} and L{TrigramAssocMeasures}

    Inheriting classes should define a property _n, and a method _contingency
    which calculates contingency values from marginals in order for all
    association measures defined here to be usable.
    c          G   s   t  d ‚ d S(   s>   Calculates values of a contingency table from marginal values.s?   The contingency table is not availablein the general ngram caseN(   t   NotImplementedError(   t	   marginals(    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyt   _contingency4   s    c          G   s   t  d ‚ d S(   sA   Calculates values of contingency table marginals from its values.s?   The contingency table is not availablein the general ngram caseN(   R   (   t   contingency(    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyt
   _marginals:   s    c         #   s‡   t  ˆ  ƒ } g  } t |  i ƒ D] ‰ | d ˆ >q  ~ } xF t t ˆ  ƒ ƒ D]2 ‰ t ‡ ‡  f d †  | Dƒ ƒ t | d ƒ VqM Wd S(   s3   Calculates expected values for a contingency table.i   c         3   s)   x" |  ] } ˆ ˆ  ˆ ˆ  | AVq Wd  S(   N(    (   t   .0t   j(   t   it   cont(    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pys	   <genexpr>I   s    i   N(   t   sumt   ranget   _nt   lent   _productt   float(   t   clsR   t   n_allt   _[1]t   bits(    (   R   R   s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyt   _expected_values@   s    . c          G   s   t  |  t ƒ |  t S(   s    Scores ngrams by their frequency(   R   t   NGRAMt   TOTAL(   R	   (    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyt   raw_freqL   s    c         G   sA   | t  | t t | t ƒ | t |  i d | t  t d S(   s   Scores ngrams using Student's t test with independence hypothesis
        for unigrams, as in Manning and Schutze 5.3.2.
        i   g      à?(   R   R   R   t   UNIGRAMSR   t   _SMALL(   R   R	   (    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyt	   student_tQ   s    c         G   s;   |  i  | Œ  } |  i | ƒ } t d „  t | | ƒ Dƒ ƒ S(   sZ   Scores ngrams using Pearson's chi-square as in Manning and Schutze
        5.3.3.
        c         s   s/   x( |  ]! \ } } | | d  | t  Vq Wd S(   i   N(   R    (   R   t   obst   exp(    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pys	   <genexpr>b   s   	(   R
   R   R   t   zip(   R   R	   R   t   exps(    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyt   chi_sq[   s    	c          O   s,   |  t  | i d d ƒ t t |  t ƒ ƒ S(   sÂ   Scores ngrams using a variant of mutual information. The keyword
        argument power sets an exponent (default 3) for the numerator. No
        logarithm of the result is calculated.
        t   poweri   (   R   t   getR   R   R   (   R	   t   kwargs(    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyt   mi_likee   s    c         G   s5   t  | t | t |  i d ƒ t  t | t ƒ ƒ S(   s^   Scores ngrams by pointwise mutual information, as in Manning and
        Schutze 5.4.
        i   (   t   _log2R   R   R   R   R   (   R   R	   (    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyt   pmin   s     c         G   sD   |  i  | Œ  } d |  i d t d „  t | |  i | ƒ ƒ Dƒ ƒ S(   sO   Scores ngrams using likelihood ratios as in Manning and Schutze 5.3.4.
        iÿÿÿÿi   c         s   s;   x4 |  ]- \ } } | t  t | ƒ | t t ƒ Vq Wd  S(   N(   t   _lnR   R    (   R   R"   R#   (    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pys	   <genexpr>}   s   	(   R
   R   R   R$   R   (   R   R	   R   (    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyt   likelihood_ratiov   s    	c         G   sG   t  | t ƒ t | t |  i d ƒ } | t t | t | ƒ d S(   s1   Scores ngrams using the Poisson-Stirling measure.i   (   R   R   R   R   R   R   R+   (   R   R	   R#   (    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyt   poisson_stirling€   s    c         G   s+   |  i  | Œ  } t | d ƒ t | d  ƒ S(   s&   Scores ngrams using the Jaccard index.i    iÿÿÿÿ(   R
   R   R   (   R   R	   R   (    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyt   jaccard‡   s    (   t   __name__t
   __module__t   __doc__t   staticmethodR
   R   t   classmethodR   R   R!   R&   R*   R,   R.   R/   R0   (    (    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyR   #   s   

	
t   BigramAssocMeasuresc           B   sn   e  Z d  Z d Z e d „  ƒ Z e d „  ƒ Z e d „  ƒ Z e d „  ƒ Z	 e d „  ƒ Z
 e d „  ƒ Z RS(   sk  
    A collection of trigram association measures. Each association measure
    is provided as a function with three arguments:
        bigram_score_fn(n_ii, (n_ix, n_xi), n_xx)
    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:
    n_ii counts (w1, w2), i.e. the bigram being scored
    n_ix counts (w1, *)
    n_xi counts (*, w2)
    n_xx counts (*, *), i.e. any bigram

    This may be shown with respect to a contingency table:
                w1    ~w1
             ------ ------
         w2 | n_ii | n_oi | = n_xi
             ------ ------
        ~w2 | n_io | n_oo |
             ------ ------
             = n_ix        TOTAL = n_xx
    i   c         C   s<   | \ } } | |  } | |  } |  | | | |  | | f S(   sE   Calculates values of a bigram contingency table from marginal values.(    (   t   n_iit   .1t   n_xxt   n_ixt   n_xit   n_oit   n_io(    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyR
   ¨   s   	

c         C   s'   |  | |  | |  f | | | |  f S(   sA   Calculates values of contingency table marginals from its values.(    (   R7   R<   R=   t   n_oo(    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyR   ¯   s    c         c   sZ   t  |  ƒ } xG t d ƒ D]9 } |  | |  | d A|  | |  | d At | ƒ Vq Wd S(   s3   Calculates expected values for a contingency table.i   i   i   N(   R   R   R   (   R   R9   R   (    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyR   ´   s     c         G   sU   |  i  | Œ  \ } } } } t | | | | d ƒ | | | | | | | | S(   sd   Scores bigrams using phi-square, the square of the Pearson correlation
        coefficient.
        i   (   R
   R   (   R   R	   R7   R=   R<   R>   (    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyt   phi_sq¼   s    c         C   s)   | \ } } | |  i  | | | f | ƒ S(   sƒ   Scores bigrams using chi-square, i.e. phi-sq multiplied by the number
        of bigrams, as in Manning and Schutze 5.3.3.
        (   R?   (   R   R7   t   .2R9   R:   R;   (    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyR&   Æ   s   	c         C   s"   | \ } } d t  |  ƒ | | S(   s(   Scores bigrams using Dice's coefficient.i   (   R   (   R7   R8   R9   R:   R;   (    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyt   diceÍ   s   	(   R1   R2   R3   R   R4   R
   R   R   R5   R?   R&   RA   (    (    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyR6   Ž   s   
t   TrigramAssocMeasuresc           B   s2   e  Z d  Z d Z e d „  ƒ Z e d „  ƒ Z RS(   sÁ  
    A collection of trigram association measures. Each association measure
    is provided as a function with four arguments:
        trigram_score_fn(n_iii,
                         (n_iix, n_ixi, n_xii),
                         (n_ixx, n_xix, n_xxi),
                         n_xxx)
    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:
    n_iii counts (w1, w2, w3), i.e. the trigram being scored
    n_ixx counts (w1, *, *)
    n_xxx counts (*, *, *), i.e. any trigram
    i   c         C   s°   | \ } } } | \ } } }	 | |  }
 | |  } | |  } |	 |  |
 | } | |  |
 | } | |  | | } | |  |
 | | | | | } |  |
 | | | | | | f S(   sa   Calculates values of a trigram contingency table (or cube) from marginal
        values.
        (    (   t   n_iiiR8   R@   t   n_xxxt   n_iixt   n_ixit   n_xiit   n_ixxt   n_xixt   n_xxit   n_oiit   n_ioit   n_iiot   n_ooit   n_oiot   n_ioot   n_ooo(    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyR
   æ   s   


"c    	      G   sv   |  \ } } } } } } } } | | | | | | | f | | | | | | | | | | | | f t  |  ƒ f S(   sA   Calculates values of contingency table marginals from its values.(   R   (	   R   RC   RK   RL   RN   RM   RO   RP   RQ   (    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyR   ù   s    (   R1   R2   R3   R   R4   R
   R   (    (    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyRB   Ó   s   t   ContingencyMeasuresc           B   s&   e  Z d  Z d „  Z e d „  ƒ Z RS(   s   Wraps NgramAssocMeasures classes such that the arguments of association
    measures are contingency table values rather than marginals.
    c         C   s   d | i  i |  i  _ xp t | ƒ D]b } | i d ƒ o q# n t | | ƒ } | i d ƒ p |  i | | ƒ } n t |  | | ƒ q# Wd S(   sA   Constructs a ContingencyMeasures given a NgramAssocMeasures classt   Contingencyt   __t   _N(   t	   __class__R1   t   dirt
   startswitht   getattrt   _make_contingency_fnt   setattr(   t   selft   measurest   kt   v(    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyt   __init__
  s     c            s.   ‡  ‡ f d †  } ˆ  i  | _  ˆ  i | _ | S(   s‡   From an association measure function, produces a new function which
        accepts contingency table values as its arguments.
        c             s   ˆ  ˆ i  |  Œ  Œ  S(   N(   R   (   R   (   t   old_fnR]   (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyt   res  s    (   R3   R1   (   R]   Ra   Rb   (    (   Ra   R]   s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyRZ     s    (   R1   R2   R3   R`   R4   RZ   (    (    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pyRR     s   	(   R3   t   mathR    R+   R   R-   R   R    R   R   R   t   objectR   R6   RB   RR   (    (    (    s.   /p/zhu/06/nlp/nltk/nltk/metrics/association.pys   <module>   s   			kE2