³ò
3ÒÇIc           @   s‰  d  Z  d d k Z d d k l Z d d k l Z d d k l	 Z	 d d k
 l Z l Z l Z d d k Td e f d	 „  ƒ  YZ d
 e f d „  ƒ  YZ d e f d „  ƒ  YZ d d d „ Z e ƒ  Z e ƒ  Z e d j o› d d k Z d d k
 l Z y e d e i d ƒ Z Wn e j
 o d Z n Xy e d e i d ƒ Z Wn e j
 o d Z n Xe e e ƒ n d
 d d d g Z d S(   sì  
Tools to identify X{collocation}s --- words that often appear consecutively
--- within corpora. They may also be used to find other X{association}s between
word occurrences.
See Manning and Schutze ch. 5 at http://nlp.stanford.edu/fsnlp/promo/colloc.pdf
and the Text::NSP Perl package at http://ngram.sourceforge.net

Finding collocations requires first calculating the frequencies of words and
their appearance in the context of other words. Often the collection of words
will then requiring filtering to only retain useful content terms. Each ngram
of words may then be scored according to some X{association measure}, in order
to determine the relative likelihood of each ngram being a collocation.

The L{BigramCollocationFinder} and L{TrigramCollocationFinder} classes provide
these functionalities, dependent on being provided a function which scores a
ngram given appropriate frequency counts. A number of standard association
measures are provided in L{bigram_measures} and L{trigram_measures}.
iÿÿÿÿN(   t
   itemgetter(   t   FreqDist(   t   ingrams(   t   ContingencyMeasurest   BigramAssocMeasurest   TrigramAssocMeasures(   t   *t   AbstractCollocationFinderc           B   sƒ   e  Z d  Z d „  Z e d „  ƒ Z e d „  ƒ Z d „  d „ Z d „  Z	 d „  Z
 d „  Z d	 „  Z d
 „  Z d „  Z d „  Z RS(   s”   
    An abstract base class for X{collocation finder}s whose purpose is to
    collect collocation candidate frequencies, filter and rank them.
    c         C   s   | |  _  | |  _ d S(   s  As a minimum, collocation finders require the frequencies of each
        word in a corpus, and the joint frequency of word tuples. This data
        should be provided through L{nltk.probability.FreqDist} objects or an
        identical interface.
        N(   t   word_fdt   ngram_fd(   t   selfR   R	   (    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyt   __init__0   s    	c         C   s   |  i  t i | Œ  ƒ S(   s‚   Constructs a collocation finder given a collection of documents,
        each of which is a list (or iterable) of tokens.
        (   t
   from_wordst
   _itertoolst   chain(   t   clst	   documents(    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyt   from_documents9   s    c            s-   t  ‡ ‡  f d †  t t ˆ  ƒ d ƒ Dƒ ƒ S(   Nc         3   s*   x# |  ] } t  ˆ | | ˆ  !ƒ Vq Wd  S(   N(   t   tuple(   t   .0t   i(   t   nt   words(    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pys	   <genexpr>B   s    i   (   R   t   ranget   len(   R   R   (    (   R   R   s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyt   _ngram_freqdist@   s    c         C   s   t  S(    (   t   False(   t   ngramt   freq(    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyt   <lambda>D   s    c         C   s\   xU |  i  i ƒ  D]D \ } } | | | ƒ o( y |  i  | =WqT t j
 o qT Xq q Wd S(   s‡   Generic filter removes ngrams from the frequency distribution
        if the function returns True when passed an ngram tuple.
        N(   R	   t   itemst   KeyError(   R
   t   fnR   R   (    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyt   _apply_filterD   s     c            s   |  i  ‡  f d †  ƒ d S(   sA   Removes candidate ngrams which have frequency less than min_freq.c            s
   | ˆ  j  S(    (    (   t   ngR   (   t   min_freq(    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyR   Q   s    N(   R!   (   R
   R#   (    (   R#   s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyt   apply_freq_filterO   s    c            s   |  i  ‡  f d †  ƒ d S(   s`   Removes candidate ngrams (w1, w2, ...) where fn(w1, w2, ...)
        evaluates to True.
        c            s
   ˆ  |  Œ  S(    (    (   R"   t   f(   R    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyR   W   s    N(   R!   (   R
   R    (    (   R    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyt   apply_ngram_filterS   s    c            s   |  i  ‡  f d †  ƒ d S(   sm   Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2),
        ...) evaluates to True.
        c            s   t  ‡  f d  †  |  Dƒ ƒ S(   c         3   s   x |  ] } ˆ  | ƒ Vq Wd  S(   N(    (   R   t   w(   R    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pys	   <genexpr>]   s    (   t   any(   R"   R%   (   R    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyR   ]   s    N(   R!   (   R
   R    (    (   R    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyt   apply_word_filterY   s    c         c   sF   x? |  i  D]4 } |  i | | Œ } | d j	 o | | f Vq
 q
 Wd S(   sb   Generates of (ngram, score) pairs as determined by the scoring
        function provided.
        N(   R	   t   score_ngramt   None(   R
   t   score_fnt   tupt   score(    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyt   _score_ngrams_   s
    
 c         C   s%   t  |  i | ƒ d t d ƒ d t ƒS(   s‘   Returns a sequence of (ngram, score) pairs ordered from highest to
        lowest score, as determined by the scoring function provided.
        t   keyi   t   reverse(   t   sortedR/   t   _itemgettert   True(   R
   R,   (    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyt   score_ngramsh   s    c         C   s2   g  } |  i  | ƒ |  D] \ } } | | q ~ S(   s;   Returns the top n ngrams when scored by the given function.(   R5   (   R
   R,   R   t   _[1]t   pt   s(    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyt   nbesto   s    c         c   s;   x4 |  i  | ƒ D]# \ } } | | j o	 | Vq Pq Wd S(   s}   Returns a sequence of ngrams, ordered by decreasing score, whose
        scores each exceed the given minimum score.
        N(   R5   (   R
   R,   t	   min_scoreR   R.   (    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyt   above_scores   s
     	(   t   __name__t
   __module__t   __doc__R   t   classmethodR   t   staticmethodR   R!   R$   R&   R)   R/   R5   R9   R;   (    (    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyR   *   s   									t   BigramCollocationFinderc           B   s&   e  Z d  Z e d „  ƒ Z d „  Z RS(   s»   A tool for the finding and ranking of bigram collocations or other
    association measures. It is often useful to use from_words() rather than
    constructing an instance directly.
    c         C   sv   t  ƒ  } t  ƒ  } xT t | d d t ƒD]= \ } } | i | ƒ | t j	 o | i | | f ƒ q( q( W|  | | ƒ S(   s[   Construct a BigramCollocationFinder for all bigrams in the given
        sequence.
        i   t	   pad_right(   R   R   R4   t   incR+   (   R   R   t   wfdt   bfdt   w1t   w2(    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyR   „   s    		 c         C   sa   |  i  i ƒ  } |  i | | f } | p d Sn |  i  | } |  i  | } | | | | f | ƒ S(   sW   Returns the score for a given bigram using the given scoring
        function.
        N(   R   t   NR	   (   R
   R,   RF   RG   t   n_allt   n_iit   n_ixt   n_xi(    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyR*   ’   s    (   R<   R=   R>   R?   R   R*   (    (    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyRA   ~   s   t   TrigramCollocationFinderc           B   s8   e  Z d  Z d „  Z e d „  ƒ Z d „  Z d „  Z RS(   s»   A tool for the finding and ranking of bigram collocations or other
    association measures. It is often useful to use from_words() rather than
    constructing an instance directly.
    c         C   s)   t  i |  | | ƒ | |  _ | |  _ d S(   s¥   Construct a TrigramCollocationFinder, given FreqDists for
        appearances of words, bigrams, two words with any word between them,
        and trigrams.
        N(   R   R   t   wildcard_fdt	   bigram_fd(   R
   R   RO   RN   t
   trigram_fd(    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyR   ¥   s    	c   	      C   sÑ   t  ƒ  } t  ƒ  } t  ƒ  } t  ƒ  } x— t | d d t ƒD]€ \ } } } | i | ƒ | t j o q: n | i | | f ƒ | t j o q: n | i | | f ƒ | i | | | f ƒ q: W|  | | | | ƒ S(   s]   Construct a TrigramCollocationFinder for all trigrams in the given
        sequence.
        i   RB   (   R   R   R4   RC   R+   (	   R   R   RD   t   wildfdRE   t   tfdRF   RG   t   w3(    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyR   ®   s    				 c         C   s   t  |  i |  i ƒ S(   s¸   Constructs a bigram collocation finder with the bigram and unigram
        data from this finder. Note that this does not include any filtering
        applied to this finder.
        (   RA   R   RO   (   R
   (    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyt   bigram_finderÃ   s    c         C   s¹   |  i  i ƒ  } |  i | | | f } | p d Sn |  i | | f } |  i | | f } |  i | | f }	 |  i  | }
 |  i  | } |  i  | } | | | | |	 f |
 | | f | ƒ S(   sX   Returns the score for a given trigram using the given scoring
        function.
        N(   R   RH   R	   RO   RN   (   R
   R,   RF   RG   RS   RI   t   n_iiit   n_iixt   n_ixit   n_xiit   n_ixxt   n_xixt   n_xxi(    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyR*   Ê   s    (   R<   R=   R>   R   R?   R   RT   R*   (    (    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyRM   Ÿ   s
   			c            sr  d d k  l } l } l } |  d j o | i }  n | d j o | i } n d d k l } | i	 i
 d ƒ ‰  ‡  f d †  } xê | i i ƒ  D]Ù } g  } | i i
 | ƒ D] }	 | |	 i ƒ  q® ~ }
 t i |
 ƒ } | i d ƒ | i | ƒ | GHd Gg  } | i |  d ƒ D] } | d	 i | ƒ q~ GHd
 | i | | | i |  ƒ ƒ | | i | ƒ ƒ ƒ f GHq‘ Wd S(   s>   Finds trigram collocations in the files of the WebText corpus.iÿÿÿÿ(   R   t   spearman_correlationt   ranks_from_scores(   t   corpust   englishc            s#   t  |  ƒ d  j  p |  i ƒ  ˆ  j S(   i   (   R   t   lower(   R'   (   t   ignored_words(    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyR   ê   s    i   s   	i   t    s   	 Correlation to %s: %0.4fN(   t   nltk.metricsR   R\   R]   R+   t   likelihood_ratiot   raw_freqt   nltkR^   t	   stopwordsR   t   webtextt   filesR`   RA   R   R$   R)   R9   t   joinR<   R5   (   t   scorert   compare_scorerR   R\   R]   R^   t   word_filtert   fileR6   t   wordR   t   cft   _[2]R-   (    (   Ra   s'   /p/zhu/06/nlp/nltk/nltk/collocations.pyt   demoÞ   s*     ,9	t   __main__(   R   s   BigramAssocMeasures.i   i   t   bigram_measurest   trigram_measures(   R>   t	   itertoolsR   t   operatorR    R3   t   nltk.probabilityR   t	   nltk.utilR   Rc   R   R   R   t   nltk.metrics.spearmant   objectR   RA   RM   R+   Rr   Rt   Ru   R<   t   syst   evalt   argvRk   t
   IndexErrorRl   t   __all__(    (    (    s'   /p/zhu/06/nlp/nltk/nltk/collocations.pys   <module>   s4   
T!?		