# Natural Language Toolkit: ngram language models
# nltk/model/ngram.py

import random
from itertools import chain
from math import log

from nltk.probability import (ConditionalProbDist, ConditionalFreqDist,
                              MLEProbDist)
from nltk.util import ingrams

from api import *

class NgramModel(ModelI):
    """
    A processing interface for assigning a probability to the next word.
    """
    def __init__(self, n, train, estimator=None):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{list} of C{string}
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{ConditionalFreqDist} and returns
              a C{ConditionalProbDist}
        """
        self._n = n

        # default to an unsmoothed maximum-likelihood estimate
        if estimator is None:
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._prefix = ('',) * (n - 1)

        # count each token against its (n-1)-word context
        for ngram in ingrams(chain(self._prefix, train), n):
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context].inc(token)

        self._model = ConditionalProbDist(cfd, estimator, False, len(cfd))

        # recursively construct the lower-order models used for backoff
        if n > 1:
            self._backoff = NgramModel(n - 1, train, estimator)
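
    # Illustrative gloss (a standard reading of the backoff code below, not
    # wording from this module): a discounting estimator keeps probability
    # mass 1 - beta(c) for the words seen after context c, and the leftover
    # beta(c) is routed to the (n-1)-gram model, renormalised by the mass
    # that model has left over for the shortened context:
    #
    #     alpha(c) = beta(c) / beta'(c[1:])
    #     P(w | c) = P_est(w | c)             if (c, w) occurred in training
    #              = alpha(c) * P(w | c[1:])  otherwise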

    # Katz backoff probability
    def prob(self, word, context):
        """Evaluate the probability of this word in this context."""
        context = tuple(context)
        if (context + (word,)) in self._ngrams:
            return self[context].prob(word)
        elif self._n > 1:
            return self._alpha(context) * self._backoff.prob(word, context[1:])
        else:
            raise RuntimeError('No probability mass assigned to word %s in '
                               'context %s' % (word, ' '.join(context)))

    def _alpha(self, tokens):
        return self._beta(tokens) / self._backoff._beta(tokens[1:])

    def _beta(self, tokens):
        # the estimator's leftover (discounted) mass for this context,
        # or 1 if the context was never observed
        if tokens in self:
            return self[tokens].discount()
        else:
            return 1

    def logprob(self, word, context):
        """Evaluate the (negative) log probability of this word in this context."""
        return -log(self.prob(word, context), 2)

    def generate(self, num_words, context=()):
        """Generate random text based on the language model."""
        text = list(context)
        for i in range(num_words):
            text.append(self._generate_one(text))
        return text

    def _generate_one(self, context):
        # pad with the start-of-text prefix and keep the last n-1 tokens
        context = (self._prefix + tuple(context))[-self._n + 1:]
        if context in self:
            return self[context].generate()
        elif self._n > 1:
            return self._backoff._generate_one(context[1:])
        else:
            return '.'

    def entropy(self, text):
        """Evaluate the total entropy of a text with respect to the model.
        This is the sum of the log probability of each word in the message."""
        e = 0.0
        for i in range(self._n - 1, len(text)):
            context = tuple(text[i - self._n + 1: i])
            token = text[i]
            e += self.logprob(token, context)
        return e
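
    # Illustrative sketch: entropy() returns a total over the whole text, not
    # a per-word average, so a perplexity-style figure is left to the caller.
    # Assuming a trained model `lm` and a token list `text`:
    #
    #     n_scored = len(text) - (lm._n - 1)   # words the loop above scores
    #     perplexity = 2 ** (lm.entropy(text) / n_scored)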

    def __contains__(self, item):
        return tuple(item) in self._model

    def __getitem__(self, item):
        return self._model[tuple(item)]

    def __repr__(self):
        return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)
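
# Indexing sketch (illustrative): contexts are tuples of length n - 1, and
# indexing a model returns the estimator-smoothed distribution over next
# words for that context.  For some trained trigram model `lm`:
#
#     ('in', 'the') in lm              # was this context observed?
#     lm[('in', 'the')].prob('city')   # smoothed P(city | in, the)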

def demo():
    from nltk.corpus import brown
    from nltk.probability import LidstoneProbDist, WittenBellProbDist

    # Lidstone smoothing: add 0.2 to every observed count
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

    lm = NgramModel(3, brown.words(categories='news'), estimator)
    print lm

    text = lm.generate(100)

    import textwrap
    print '\n'.join(textwrap.wrap(' '.join(text)))

if __name__ == '__main__':
    demo()
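
# Usage sketch (illustrative; like demo() above it assumes an NLTK
# installation providing nltk.corpus.brown, and Python 2 like the rest of
# this module).  A smoothing estimator is used so that backoff has
# discounted mass to redistribute:
#
#     >>> from nltk.corpus import brown
#     >>> from nltk.probability import LidstoneProbDist
#     >>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
#     >>> lm = NgramModel(2, brown.words(categories='news'), est)
#     >>> lm.prob('jury', ('The',))    # P(jury | The); backs off when unseen
#     >>> lm.generate(20)              # twenty words of random text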