import random
from itertools import chain
from math import log

from nltk.probability import (ConditionalProbDist, ConditionalFreqDist,
                              MLEProbDist)
from nltk.util import ingrams

from api import *


class NgramModel(ModelI):
    """
    A processing interface for assigning a probability to the next word.
    """

    def __init__(self, n, train, estimator=None):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string}
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{ConditionalFreqDist} and
              returns a C{ConditionalProbDist}
        """
        self._n = n

        # with no estimator supplied, fall back to maximum likelihood
        if estimator is None:
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._prefix = ('',) * (n - 1)

        # count every ngram in the (prefix-padded) training text,
        # keyed on its (n-1)-word context
        for ngram in ingrams(chain(self._prefix, train), n):
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context].inc(token)

        self._model = ConditionalProbDist(cfd, estimator, len(cfd))

        # recursively construct the lower-order model used for backoff
        if n > 1:
            self._backoff = NgramModel(n - 1, train, estimator)
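
    # A minimal usage sketch (illustrative toy corpus, not from the original
    # module): with the default MLE estimator, each continuation gets its
    # relative frequency in the observed context.
    #
    #   >>> lm = NgramModel(3, ['a', 'b', 'c', 'a', 'b', 'd'])
    #   >>> lm.prob('c', ('a', 'b'))   # ('a', 'b') is followed by 'c' once, 'd' once
    #   0.5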

    def prob(self, word, context):
        """
        Evaluate the probability of this word in this context.
        """
        context = tuple(context)
        if context + (word,) in self._ngrams:
            return self[context].prob(word)
        elif self._n > 1:
            return self._alpha(context) * self._backoff.prob(word, context[1:])
        else:
            raise RuntimeError('No probability mass assigned to word %s in '
                               'context %s' % (word, ' '.join(context)))

    def _alpha(self, tokens):
        return self._beta(tokens) / self._backoff._beta(tokens[1:])

    def _beta(self, tokens):
        if tokens in self:
            return self[tokens].discount()
        else:
            return 1
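
    # Backoff arithmetic (a Katz-style scheme, spelled out for clarity): an
    # unseen ngram is charged alpha(context) * P_backoff(word, context[1:]),
    # where alpha(context) = _beta(context) / _backoff._beta(context[1:]) and
    # _beta(context) is the probability mass the estimator discounted for that
    # context (1 if the context itself is unseen). The default MLE estimator
    # discounts nothing, so backoff only has mass to redistribute when a
    # smoothing estimator such as LidstoneProbDist is supplied.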

    def logprob(self, word, context):
        """
        Evaluate the (negative) log probability of this word in this context.
        """
        return -log(self.prob(word, context), 2)

    def generate(self, num_words, context=()):
        """Generate random text based on the language model."""
        text = list(context)
        for i in range(num_words):
            text.append(self._generate_one(text))
        return text

    def _generate_one(self, context):
        # condition on the most recent n-1 tokens, padding with the prefix
        context = (self._prefix + tuple(context))[-self._n + 1:]
        if context in self:
            return self[context].generate()
        elif self._n > 1:
            return self._backoff._generate_one(context[1:])
        else:
            return '.'
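
    # Generation note: _generate_one samples from the estimator's distribution
    # for the current context, backing off to progressively shorter contexts
    # whenever the current one was never observed; a unigram model that still
    # finds nothing emits '.' as a last resort.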

    def entropy(self, text):
        """
        Evaluate the total entropy of a text with respect to the model.
        This is the sum of the log probability of each word in the message.
        """
        e = 0.0
        for i in range(self._n - 1, len(text)):
            context = tuple(text[i - self._n + 1:i])
            token = text[i]
            e += self.logprob(token, context)
        return e

    def __contains__(self, item):
        return tuple(item) in self._model

    def __getitem__(self, item):
        return self._model[tuple(item)]

    def __repr__(self):
        return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)
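
# Entropy note: entropy() returns a total in bits (logprob is base 2), not an
# average. Dividing by the number of scored words, len(text) - n + 1, gives the
# per-word cross-entropy, and 2 ** (per-word entropy) is the perplexity of the
# text under the model.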

def demo():
    from nltk.corpus import brown
    from nltk.probability import LidstoneProbDist, WittenBellProbDist

    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    lm = NgramModel(3, brown.words(categories='news'), estimator)
    print lm
    text = lm.generate(100)
    import textwrap
    print '\n'.join(textwrap.wrap(' '.join(text)))


if __name__ == '__main__':
    demo()