
4Ic           @   s(  d  Z  d Z d d k Z d d k Z d d k Z d d k Z d d k Z d d k l Z d d k	 l
 Z
 d d k Td d k Z d d k Td d k l Z l Z d d k l Z l Z l Z d d	 k l Z l Z l Z d
 e f d     YZ e Z e d  d    Z d e f d     YZ d e f d     YZ  d e f d     YZ! d e! f d     YZ" d e! f d     YZ# d e$ e$ d  Z% d   Z& d   Z' d e$ e$ d  Z( d   Z) d   Z* d e$ e$ d e+ d  d!  Z, d e$ e$ d  d"  Z- d# e f d$     YZ. d%   Z/ e0 d& j o e/   n d S('   s  
A classifier model based on maximum entropy modeling framework.  This
framework considers all of the probability distributions that are
empirically consistant with the training data; and chooses the
distribution with the highest entropy.  A probability distribution is
X{empirically consistant} with a set of training data if its estimated
frequency with which a class and a feature vector value co-occur is
equal to the actual frequency in the data.

Terminology: 'feature'
======================
The term I{feature} is usually used to refer to some property of an
unlabeled token.  For example, when performing word sense
disambiguation, we might define a C{'prevword'} feature whose value is
the word preceeding the target word.  However, in the context of
maxent modeling, the term I{feature} is typically used to refer to a
property of a X{labeled} token.  In order to prevent confusion, we
will introduce two distinct terms to disambiguate these two different
concepts:

  - An X{input-feature} is a property of an unlabeled token.
  - A X{joint-feature} is a property of a labeled token.

In the rest of the C{nltk.classify} module, the term X{features} is
used to refer to what we will call X{input-features} in this module.

In literature that describes and discusses maximum entropy models,
input-features are typically called X{contexts}, and joint-features
are simply referred to as X{features}.

Converting Input-Features to Joint-Features
-------------------------------------------
In maximum entropy models, joint-features are required to have numeric
values.  Typically, each input-feature C{input_feat} is mapped to a
set of joint-features of the form::

    joint_feat(token, label) = { 1 if input_feat(token) == feat_val
                               {      and label == some_label
                               {
                               { 0 otherwise

For all values of C{feat_val} and C{some_label}.  This mapping is
performed by classes that implement the L{MaxentFeatureEncodingI}
interface.
s
   epytext eniN(   t   defaultdict(   t   OrderedDict(   t   *(   t   attested_labelst   CutoffChecker(   t
   call_megamt   write_megam_filet   parse_megam_weights(   t	   call_tadmt   write_tadm_filet   parse_tadm_weightst   MaxentClassifierc        	   B   s   e  Z d  Z e d  Z d   Z d   Z d   Z d   Z d   Z	 d d  Z
 d	 d
 d  Z d   Z d d d d d d d d d g	 Z e e d e e e d d   Z h  d d <d d <d d <d d <d d <Z RS(   s  
    A maximum entropy classifier (also known as a X{conditional
    exponential classifier}).  This classifier is parameterized by a
    set of X{weights}, which are used to combine the joint-features
    that are generated from a featureset by an X{encoding}.  In
    particular, the encoding maps each C{(featureset, label)} pair to
    a vector.  The probability of each label is then computed using
    the following equation::

                                dotprod(weights, encode(fs,label))
      prob(fs|label) = ---------------------------------------------------
                       sum(dotprod(weights, encode(fs,l)) for l in labels)
    
    Where C{dotprod} is the dot product::

      dotprod(a,b) = sum(x*y for (x,y) in zip(a,b))
    c         C   s?   | |  _  | |  _ | |  _ | i   t |  j p t  d S(   s  
        Construct a new maxent classifier model.  Typically, new
        classifier models are created using the L{train()} method.

        @type encoding: L{MaxentFeatureEncodingI}
        @param encoding: An encoding that is used to convert the
            featuresets that are given to the C{classify} method into
            joint-feature vectors, which are used by the maxent
            classifier model.

        @type weights: C{list} of C{float}
        @param weights:  The feature weight vector for this classifier.

        @type logarithmic: C{bool}
        @param logarithmic: If false, then use non-logarithmic weights.
        N(   t	   _encodingt   _weightst   _logarithmict   lengtht   lent   AssertionError(   t   selft   encodingt   weightst   logarithmic(    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyt   __init___   s    			c         C   s   |  i  i   S(   N(   R   t   labels(   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR   v   s    c         C   s0   | |  _  |  i i   t |  j p t  d S(   s   
        Set the feature weight vector for this classifier.  
        @param new_weights: The new feature weight vector.
        @type new_weights: C{list} of C{float}
        N(   R   R   R   R   R   (   R   t   new_weights(    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyt   set_weightsy   s    	c         C   s   |  i  S(   sm   
        @return: The feature weight vector for this classifier.
        @rtype: C{list} of C{float}
        (   R   (   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR      s    c         C   s   |  i  |  i   S(   N(   t   prob_classifyt   max(   R   t
   featureset(    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyt   classify   s    c   	      C   s   h  } x |  i  i   D] } |  i  i | |  } |  i o@ d } x) | D]! \ } } | |  i | | 7} qH W| | | <q d } x) | D]! \ } } | |  i | | 9} q W| | | <q Wt | d |  i d t S(   Ng        g      ?t   logt	   normalize(   R   R   t   encodeR   R   t   DictionaryProbDistt   True(	   R   R   t	   prob_dictt   labelt   feature_vectort   totalt   f_idt   f_valt   prod(    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR      s"     
  i   c      	      s-  d } d t  | d  d }   i |   t  i   d  i d t } | |  } d i |  d i d	   | D  GHd
 d | d d t |  GHt	 t
   x	t |  D] \ } }   i i | |  } | i d   f d   d t  x | D] \ }	 }
   i o   i |	 |
 } n   i t |
 }   i i |	  } | i d  d } t |  d j o | d  d } n | | | d d | f GH | c | 7<q Wq Wd
 d | d d t |  GHd i |  d i  f d   | D  GHd i |  d i  f d   | D  GHd S(   s   
        Print a table showing the effect of each of the features in
        the given feature set, and how they combine to determine the
        probabilities of each label for that featureset.
        i2   s     %-i   s   s%s%8.3ft   keyt   reverses	     Featuret    c         s   s!   x |  ] } d  | d  Vq Wd S(   s   %8si   N(    (   t   .0t   l(    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pys	   <genexpr>   s    s     t   -i   c            s   |  \ } } t    i |  S(    (   t   absR   (   R-   t   fidt   _(   R   (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyt   <lambda>   s    s    and label is i    i/   i,   s   ...t    i   s     TOTAL:c         3   s!   x |  ] } d    | Vq Wd S(   s   %8.3fN(    (   R-   R.   (   t   sums(    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pys	   <genexpr>   s    s     PROBS:c         3   s&   x |  ] } d    i  |  Vq Wd S(   s   %8.3fN(   t   prob(   R-   R.   (   t   pdist(    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pys	   <genexpr>   s    N(   t   strR   t   sortedt   samplesR6   R"   t   ljustt   joinR   R    t   intt	   enumerateR   R    t   sortR   R   R1   t   describet   split(   R   R   t   columnst   descr_widtht   TEMPLATER   t   iR$   R%   R'   R(   t   scoret   descr(    (   R   R7   R5   s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyt   explain   s8    !
  %i
   t   allc      	      s   t  t t   i   d   f d   d t } | d j o= g  } | D]% }   i | d j o | | qH qH ~ } nK | d j o= g  } | D]% }   i | d j  o | | q q ~ } n x4 | |  D]( } d   i |   i i |  f GHq Wd S(	   sT   
        @param show: all, neg, or pos (for negative-only or positive-only)
        R*   c            s   t    i |   S(    (   R0   R   (   R1   (   R   (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR3      s    R+   t   posi    t   negs   %8.3f %sN(   R9   t   rangeR   R   R"   R   R@   (   R   t   nt   showt   fidst   _[1]R1   t   _[2](    (   R   s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyt   show_most_informative_features   s    	== c         C   s&   d t  |  i i    |  i i   f S(   Ns:   <ConditionalExponentialClassifier: %d labels, %d features>(   R   R   R   R   (   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyt   __repr__   s    t   GISt   IISt   CGt   BFGSt   Powellt   LBFGSBs   Nelder-Meadt   MEGAMt   TADMi   i    c         K   s  | d j o6 y d d k }	 d } WqC t j
 o d } qC Xn x/ | D]' }
 |
 d j o t d |
   qJ qJ W| i   } | d j o t | | | | |  Sn | d j o t | | | | |  Sn | |  i j o* t | | | | |  i | | | |  Sn | d j o t	 | | | | | |  Sn` | d j oB | } | | d <| | d <| | d <| | d <t
 i | |  Sn t d |   d S(   s  
        Train a new maxent classifier based on the given corpus of
        training samples.  This classifier will have its weights
        chosen to maximize entropy while remaining empirically
        consistent with the training corpus.

        @rtype: L{MaxentClassifier}
        @return: The new maxent classifier

        @type train_toks: C{list}
        @param train_toks: Training data, represented as a list of
            pairs, the first member of which is a featureset,
            and the second of which is a classification label.

        @type algorithm: C{str}
        @param algorithm: A case-insensitive string, specifying which
            algorithm should be used to train the classifier.  The
            following algorithms are currently available.
            
              - Iterative Scaling Methods
                - C{'GIS'}: Generalized Iterative Scaling
                - C{'IIS'}: Improved Iterative Scaling
                
              - Optimization Methods (require C{scipy})
                - C{'CG'}: Conjugate gradient
                - C{'BFGS'}: Broyden-Fletcher-Goldfarb-Shanno algorithm
                - C{'Powell'}: Powell agorithm
                - C{'LBFGSB'}: A limited-memory variant of the BFGS algorithm
                - C{'Nelder-Mead'}: The Nelder-Mead algorithm

              - External Libraries
                - C{'megam'}: LM-BFGS algorithm, with training performed
                  by an U{megam <http://www.cs.utah.edu/~hal/megam/>}.
                  (requires that C{megam} be installed.)

            The default algorithm is C{'CG'} if C{'scipy'} is
            installed; and C{'iis'} otherwise.

        @type trace: C{int}
        @param trace: The level of diagnostic tracing output to produce.
            Higher values produce more verbose output.

        @type encoding: L{MaxentFeatureEncodingI}
        @param encoding: A feature encoding, used to convert featuresets
            into feature vectors.  If none is specified, then a
            L{BinaryMaxentFeatureEncoding} will be built based on the
            features that are attested in the training corpus.

        @type labels: C{list} of C{str}
        @param labels: The set of possible labels.  If none is given, then
            the set of all labels attested in the training data will be
            used instead.

        @param sparse: If true, then use sparse matrices instead of
            dense matrices.  Currently, this is only supported by
            the scipy (optimization method) algorithms.  For other
            algorithms, its value is ignored.
        
        @param gaussian_prior_sigma: The sigma value for a gaussian
            prior on model weights.  Currently, this is supported by
            the scipy (optimization method) algorithms and C{megam}.
            For other algorithms, its value is ignored.
            
        @param cutoffs: Arguments specifying various conditions under
            which the training should be halted.  (Some of the cutoff
            conditions are not supported by some algorithms.)
            
              - C{max_iter=v}: Terminate after C{v} iterations.
              - C{min_ll=v}: Terminate after the negative average
                log-likelihood drops under C{v}.
              - C{min_lldelta=v}: Terminate if a single iteration improves
                log likelihood by less than C{v}.
              - C{tolerance=v}: Terminate a scipy optimization method when
                improvement drops below a tolerance level C{v}.  The
                exact meaning of this tolerance depends on the scipy
                algorithm used.  See C{scipy} documentation for more
                info.  Default values: 1e-3 for CG, 1e-5 for LBFGSB,
                and 1e-4 for other algorithms.  I{(C{scipy} only)}
        iNt   cgt   iist   max_itert   min_llt   min_lldeltat	   tolerancet   max_acct   min_accdeltat   count_cutofft   norms   Unexpected keyword arg %rt   gist   megamt   tadmt   traceR   R   t   gaussian_prior_sigmas   Unknown algorithm %s(   s   max_iters   min_lls   min_lldeltas	   tolerances   max_accs   min_accdeltas   count_cutoffs   norm(   t   Nonet   scipyt   ImportErrort	   TypeErrort   lowert    train_maxent_classifier_with_iist    train_maxent_classifier_with_gist   _SCIPY_ALGSt"   train_maxent_classifier_with_scipyt"   train_maxent_classifier_with_megamt   TadmMaxentClassifiert   traint
   ValueError(   t   clst
   train_tokst	   algorithmRi   R   R   t   sparseRj   t   cutoffsRl   R*   t   kwargs(    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyRv      sH    R
  




R\   t   bfgst   powellt   lbfgsbs   nelder-mead(   t   __name__t
   __module__t   __doc__R"   R   R   R   R   R   R   RH   RR   RS   t
   ALGORITHMSt   classmethodRk   Rv   Rr   (    (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR   M   s"   						"		us$   Use MaxentClassifier.train() insteadc          O   s   t  i |  |   S(   N(   R   Rv   (   t   argsR}   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyt   train_maxent_classifierY  s    t   MaxentFeatureEncodingIc           B   s;   e  Z d  Z d   Z d   Z d   Z d   Z d   Z RS(   s  
    A mapping that converts a set of input-feature values to a vector
    of joint-feature values, given a label.  This conversion is
    necessary to translate featuresets into a format that can be used
    by maximum entropy models.

    The set of joint-features used by a given encoding is fixed, and
    each index in the generated joint-feature vectors corresponds to a
    single joint-feature.  The length of the generated joint-feature
    vectors is therefore constant (for a given encoding).

    Because the joint-feature vectors generated by
    C{MaxentFeatureEncodingI} are typically very sparse, they are
    represented as a list of C{(index, value)} tuples, specifying the
    value of each non-zero joint-feature.

    Feature encodings are generally created using the L{train()}
    method, which generates an appropriate encoding based on the
    input-feature values and labels that are present in a given
    corpus.
    c         C   s   t  d   d S(   sS  
        Given a (featureset, label) pair, return the corresponding
        vector of joint-feature values.  This vector is represented as
        a list of C{(index, value)} tuples, specifying the value of
        each non-zero joint-feature.
        
        @type featureset: C{dict}
        @rtype: C{list} of C{(int, number)}
        s   Not implementedN(   R   (   R   R   R$   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR    w  s    
c         C   s   t  d   d S(   s   
        @return: The size of the fixed-length joint-feature vectors
            that are generated by this encoding.
        @rtype: C{int}
        s   Not implementedN(   R   (   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR     s    c         C   s   t  d   d S(   s   
        @return: A list of the "known labels" -- i.e., all labels
            C{l} such that C{self.encode(fs,l)} can be a nonzero
            joint-feature vector for some value of C{fs}.
        @rtype: C{list}
        s   Not implementedN(   R   (   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR     s    c         C   s   t  d   d S(   s   
        @return: A string describing the value of the joint-feature
            whose index in the generated feature vectors is C{fid}.
        @rtype: C{str}
        s   Not implementedN(   R   (   R   R1   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR@     s    c         C   s   t  d   d S(   s  
        Construct and return new feature encoding, based on a given
        training corpus C{train_toks}.

        @type train_toks: C{list} of C{tuples} of (C{dict}, C{str})
        @param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.
        s   Not implementedN(   R   (   Rx   Ry   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyRv     s    
(   R   R   R   R    R   R   R@   Rv   (    (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR   a  s   					t#   FunctionBackedMaxentFeatureEncodingc           B   s;   e  Z d  Z d   Z d   Z d   Z d   Z d   Z RS(   s   
    A feature encoding that calls a user-supplied function to map a
    given featureset/label pair to a sparse joint-feature vector.
    c         C   s   | |  _  | |  _ | |  _ d S(   so  
        Construct a new feature encoding based on the given function.

        @type func: (callable)
        @param func: A function that takes two arguments, a featureset
             and a label, and returns the sparse joint feature vector
             that encodes them:

             >>> func(featureset, label) -> feature_vector
        
             This sparse joint feature vector (C{feature_vector}) is a
             list of C{(index,value)} tuples.

        @type length: C{int}
        @param length: The size of the fixed-length joint-feature
            vectors that are generated by this encoding.

        @type labels: C{list}
        @param labels: A list of the "known labels" for this
            encoding -- i.e., all labels C{l} such that
            C{self.encode(fs,l)} can be a nonzero joint-feature vector
            for some value of C{fs}.
        N(   t   _lengtht   _funct   _labels(   R   t   funcR   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR     s    		c         C   s   |  i  | |  S(   N(   R   (   R   R   R$   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR      s    c         C   s   |  i  S(   N(   R   (   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR     s    c         C   s   |  i  S(   N(   R   (   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR     s    c         C   s   d S(   Ns   no description available(    (   R   R1   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR@     s    (   R   R   R   R   R    R   R   R@   (    (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR     s   				t   BinaryMaxentFeatureEncodingc           B   sV   e  Z d  Z e e d  Z d   Z d   Z d   Z d   Z e	 d e
 d   Z RS(   s  
    A feature encoding that generates vectors containing a binary
    joint-features of the form::

      joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
                          {
                          { 0 otherwise

    Where C{fname} is the name of an input-feature, C{fval} is a value
    for that input-feature, and C{label} is a label.

    Typically, these features are constructed based on a training
    corpus, using the L{train()} method.  This method will create one
    feature for each combination of C{fname}, C{fval}, and C{label}
    that occurs at least once in the training corpus.  

    The C{unseen_features} parameter can be used to add X{unseen-value
    features}, which are used whenever an input feature has a value
    that was not encountered in the training corpus.  These features
    have the form::

      joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
                          {      and l == label
                          {
                          { 0 otherwise

    Where C{is_unseen(fname, fval)} is true if the encoding does not
    contain any joint features that are true when C{fs[fname]==fval}.

    The C{alwayson_features} parameter can be used to add X{always-on
    features}, which have the form::

      joint_feat(fs, l) = { 1 if (l == label)
                          {
                          { 0 otherwise

    These always-on features allow the maxent model to directly model
    the prior probabilities of each label.
    c      	   C   sW  t  | i    t  t t |    j o t d   n t |  |  _ | |  _ t |  |  _ d |  _
 d |  _ | o_ t g  } t |  D]  \ } } | | | |  i f q ~  |  _
 |  i t |  i
  7_ n | or t  d   | D  } t g  }	 t |  D]  \ } }
 |	 |
 | |  i f q~	  |  _ |  i t |  7_ n d S(   s  
        @param labels: A list of the "known labels" for this encoding.
        
        @param mapping: A dictionary mapping from C{(fname,fval,label)}
            tuples to corresponding joint-feature indexes.  These
            indexes must be the set of integers from 0...len(mapping).
            If C{mapping[fname,fval,label]=id}, then
            C{self.encode({..., fname:fval, ...}, label)[id]} is 1;
            otherwise, it is 0.
            
        @param unseen_features: If true, then include unseen value
           features in the generated joint-feature vectors.
           
        @param alwayson_features: If true, then include always-on
           features in the generated joint-feature vectors.
        sH   Mapping values must be exactly the set of integers from 0...len(mapping)c         s   s"   x |  ] \ } } } | Vq Wd  S(   N(    (   R-   t   fnamet   fvalR$   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pys	   <genexpr>(  s    N(   t   sett   valuesRL   R   Rw   t   listR   t   _mappingR   Rk   t	   _alwaysont   _unseent   dictR>   (   R   R   t   mappingt   unseen_featurest   alwayson_featuresRP   RE   R$   t   fnamesRQ   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR     s     +			
9
9c         C   s  g  } x | i    D] \ } } | | | f |  i j o' | i |  i | | | f d f  q |  i od xa |  i D]$ } | | | f |  i j o Pqs qs W| |  i j o | i |  i | d f  q q q W|  i o. | |  i j o | i |  i | d f  n | S(   Ni   (   t   itemsR   t   appendR   R   R   (   R   R   R$   R   R   R   t   label2(    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR    -  s     '

 	&c         C   s  t  | t t f  p t d   n y |  i WnY t j
 oM d g t |  i  |  _ x/ |  i i   D] \ } } | |  i | <qk Wn X| t |  i  j  o+ |  i | \ } } } d | | | f Sn |  i	 oS | |  i	 i
   j o= x |  i	 i   D]% \ } } | | j o d | Sq q Wnj |  i oS | |  i i
   j o= xF |  i i   D]% \ } } | | j o d | Sq[q[Wn t d   d  S(   Ns   describe() expected an intis   %s==%r and label is %rs   label is %rs   %s is unseens   Bad feature id(   t
   isinstanceR=   t   longRn   t   _inv_mappingt   AttributeErrorR   R   R   R   R   R   Rw   (   R   R'   t   infoRE   R   R   R$   t   f_id2(    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR@   H  s*       !  !c         C   s   |  i  S(   N(   R   (   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR   _  s    c         C   s   |  i  S(   N(   R   (   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR   c  s    i    c         K   s  h  } t    } t t  } x | D] \ } }	 | o! |	 | j o t d |	   n | i |	  x~ | i   D]p \ }
 } | |
 | f c d 7<| |
 | f | j o7 |
 | |	 f | j o t |  | |
 | |	 f <q qp qp Wq" W| t j o
 | } n |  | | |  S(   s  
        Construct and return new feature encoding, based on a given
        training corpus C{train_toks}.  See the L{class description
        <BinaryMaxentFeatureEncoding>} for a description of the
        joint-features that will be included in this encoding.

        @type train_toks: C{list} of C{tuples} of (C{dict}, C{str})
        @param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.

        @type count_cutoff: C{int}
        @param count_cutoff: A cutoff value that is used to discard
            rare joint-features.  If a joint-feature's value is 1
            fewer than C{count_cutoff} times in the training corpus,
            then that joint-feature is not included in the generated
            encoding.

        @type labels: C{list}
        @param labels: A list of labels that should be used by the
            classifier.  If not specified, then the set of labels
            attested in C{train_toks} will be used.

        @param options: Extra parameters for the constructor, such as
            C{unseen_features} and C{alwayson_features}.
        s   Unexpected label %si   (   R   R    R=   Rw   t   addR   R   Rk   (   Rx   Ry   Rd   R   t   optionsR   t   seen_labelst   countt   tokR$   R   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyRv   g  s     	  )(   R   R   R   t   FalseR   R    R@   R   R   R   Rk   Rv   (    (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR     s   '/				t   GISEncodingc           B   sP   e  Z d  Z e e e d  Z e d   d d Z d   Z d   Z	 d   Z
 RS(   s  
    A binary feature encoding which adds one new joint-feature to the
    joint-features defined by L{BinaryMaxentFeatureEncoding}: a
    correction feature, whose value is chosen to ensure that the
    sparse vector always sums to a constant non-negative number.  This
    new feature is used to ensure two preconditions for the GIS
    training algorithm:
      - At least one feature vector index must be nonzero for every
        token.
      - The feature vector must sum to a constant non-negative number
        for every token.
    c   
      C   sq   t  i |  | | | |  | d j o> t t g  } | D] \ } } }	 | | q7 ~   d } n | |  _ d S(   s  
        @param C: The correction constant.  The value of the correction
            feature is based on this value.  In particular, its value is
            C{C - sum([v for (f,v) in encoding])}.
        @seealso: L{BinaryMaxentFeatureEncoding.__init__}
        i   N(   R   R   Rk   R   R   t   _C(
   R   R   R   R   R   t   CRP   R   R   R$   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR     s    >	c         C   s   |  i  S(    (   R   (   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR3     s    t   docsX   
        The non-negative constant that all encoded feature vectors
        will sum to.c   	      C   s   t  i |  | |  } t  i |   } t g  } | D] \ } } | | q2 ~  } | |  i j o t d   n | i | |  i | f  | S(   Ns&   Correction feature is not high enough!(   R   R    R   t   sumR   Rw   R   (	   R   R   R$   R   t   base_lengthRP   t   ft   vR&   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR      s    -c         C   s   t  i |   d S(   Ni   (   R   R   (   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR     s    c         C   s9   | t  i |   j o d |  i Sn t  i |  |  Sd  S(   Ns   Correction feature (%s)(   R   R   R   R@   (   R   R'   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR@     s    (   R   R   R   R   Rk   R   t   propertyR   R    R   R@   (    (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR     s   			t   TadmEventMaxentFeatureEncodingc           B   sP   e  Z e e d   Z d   Z d   Z d   Z d   Z e d e	 d   Z
 RS(   c         C   s;   t  |  |  _ t    |  _ t i |  | |  i | |  d  S(   N(   R   R   t   _label_mappingR   R   (   R   R   R   R   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR     s
    c         C   s   g  } x | i    D] \ } } | | f |  i j o  t |  i  |  i | | f <n | |  i j o; t | t  p t |  i  |  i | <q | |  i | <n | i |  i | | f |  i | f  q W| S(   N(   R   R   R   R   R   R=   R   (   R   R   R$   R   t   featuret   value(    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR      s      c         C   s   |  i  S(   N(   R   (   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR     s    c         C   sF   x? |  i  D]4 \ } } |  i  | | f | j o | | f Sq
 q
 Wd  S(   N(   R   (   R   R1   R   R$   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR@     s    
 c         C   s   t  |  i  S(   N(   R   R   (   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR     s    i    c   	   	   K   s   t    } | p
 g  } n t |  } x2 | D]* \ } } | | j o | i |  q- q- Wxc | D][ \ } } xL | D]D } x; | D]3 } | | f | j o t |  | | | f <q q Wqu Wqb W|  | | |  S(   N(   R   R   R   R   (	   Rx   Ry   Rd   R   R   R   R   R$   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyRv     s"    	
    &(   R   R   R   R   R    R   R@   R   R   Rk   Rv   (    (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR     s   				i   c      
   K   s  | i  d d  t |  } | t j o t i |  d | } n t | d  p t d   n d | i } t |  |  } t	 t
 i | d j  d  } t
 i t |  d  }	 x | D] }
 t
 i |	 |
 <q Wt | |	  } t
 i |  } ~ t } t } | d j o d	 | d GHn | d
 j o Hd GHd GHn yxt o | d
 j oc | i p t i i i | |   } | i p t i i i | |   } | i } d | | | f GHn t | |  |  } x | D] }
 | |
 c d 7<qWt
 i |  } ~ | i   }	 |	 | | | 7}	 | i |	  | i | |   o Pq@q@WWn! t j
 o d GHn   n X| d
 j oC t i i i | |   } t i i i | |   } d | | f GHn | S(   s  
    Train a new C{ConditionalExponentialClassifier}, using the given
    training samples, using the Generalized Iterative Scaling
    algorithm.  This C{ConditionalExponentialClassifier} will encode
    the model that maximizes entropy from all the models that are
    empirically consistent with C{train_toks}.

    @see: L{train_maxent_classifier()} for parameter descriptions.
    R^   id   R   R   sJ   The GIS algorithm requires an encoding that defines C (e.g., GISEncoding).g      ?i    t   ds     ==> Training (%d iterations)i   s-         Iteration    Log Likelihood    Accuracys-         ---------------------------------------s        %9d    %14.5f    %9.3fi   s*         Training stopped: keyboard interrupts!            Final    %14.5f    %9.3f(   t
   setdefaultR   Rk   R   Rv   t   hasattrRn   R   t   calculate_empirical_fcountR   t   numpyt   nonzerot   zerosR   t   NINFt    ConditionalExponentialClassifiert   log2R"   t   llt   nltkR   t   utilt   log_likelihoodt   acct   accuracyt   itert   calculate_estimated_fcountR   R   t   checkt   KeyboardInterrupt(   Ry   Ri   R   R   R|   t   cutoffcheckert   Cinvt   empirical_fcountt
   unattestedR   R1   t
   classifiert   log_empirical_fcountt   ll_oldt   acc_oldR   R   t   iternumt   estimated_fcountt   log_estimated_fcount(    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyRq     sf     	
	 	c         C   sf   t  i | i   d  } xG |  D]? \ } } x0 | i | |  D] \ } } | | c | 7<q> Wq W| S(   NR   (   R   R   R   R    (   Ry   R   t   fcountR   R$   t   indext   val(    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR   p  s      c   
      C   s   t  i | i   d  } x | D]x \ } } |  i |  } xZ | i   D]L } | i |  } x4 | i | |  D]  \ } }	 | | c | |	 7<qo WqG Wq W| S(   NR   (   R   R   R   R   R:   R6   R    (
   R   Ry   R   R   R   R$   R7   R6   R1   R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR   y  s        c         K   s  | i  d d  t |  } | t j o t i |  d | } n t |  |  t |   } t |  |  } t i	 t
 | d | i d  } t i | t |  d f  }	 t t i | d j  d  }
 t i t |  d  } x |
 D] } t i | | <q Wt | |  } | d j o d | d GHn | d	 j o Hd
 GHd GHn t } t } y x t o | d	 j oc | i p t i i i | |   } | i p t i i i | |   } | i } d | | | f GHn t |  | |
 | | | |	 |  } | i   } | | 7} | i |  | i | |   o PqYqYWWn! t j
 o d GHn   n X| d	 j oC t i i i | |   } t i i i | |   } d | | f GHn | S(   s~  
    Train a new C{ConditionalExponentialClassifier}, using the given
    training samples, using the Improved Iterative Scaling algorithm.
    This C{ConditionalExponentialClassifier} will encode the model
    that maximizes entropy from all the models that are empirically
    consistent with C{train_toks}.

    @see: L{train_maxent_classifier()} for parameter descriptions.
    R^   id   R   R*   R   i   i    s     ==> Training (%d iterations)i   s-         Iteration    Log Likelihood    Accuracys-         ---------------------------------------s        %9d    %14.5f    %9.3fs*         Training stopped: keyboard interrupts!            Final    %14.5f    %9.3f(    R   R   Rk   R   Rv   R   R   t   calculate_nfmapR   t   arrayR9   t   __getitem__t   reshapeR   R   R   R   R   R"   R   R   R   R   R   R   R   R   t   calculate_deltasR   R   R   R   (   Ry   Ri   R   R   R|   R   t   empirical_ffreqt   nfmapt   nfarrayt   nftransposeR   R   R1   R   R   R   R   R   R   t   deltas(    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyRp     s^    ! 	
	
	c         C   s   t    } xk |  D]c \ } } xT | i   D]F } | i t g  } | i | |  D] \ } } | | qO ~   q) Wq Wt g  }	 t |  D] \ }
 } |	 | |
 f q ~	  S(   s  
    Construct a map that can be used to compress C{nf} (which is
    typically sparse).

    M{nf(feature_vector)} is the sum of the feature values for
    M{feature_vector}.

    This represents the number of features that are active for a
    given labeled text.  This method finds all values of M{nf(t)}
    that are attested for at least one token in the given list of
    training tokens; and constructs a dictionary mapping these
    attested values to a continuous range M{0...N}.  For example,
    if the only values of M{nf()} that were attested were 3, 5,
    and 7, then C{_nfmap} might return the dictionary {3:0, 5:1,
    7:2}.

    @return: A map that can be used to compress C{nf} to a dense
        vector.
    @rtype: C{dictionary} from C{int} to C{int}
    (   R   R   R   R   R    R   R>   (   Ry   R   t   nfsetR   R2   R$   RP   t   idR   RQ   RE   t   nf(    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR     s    	  Hc      
   C   s  d } d }	 t  i | i   d  }
 t  i t |  | i   f d  } x |  D] \ } } | i |  } x | i   D] } | i | |  } t g  } | D] \ } } | | q ~  } x; | D]3 \ } } | | | | f c | i	 |  | 7<q Wqw WqO W| t |   :} x t
 |	  D] } t  i | |
  } d | } | | } t  i | | d d } t  i | | d d } x | D] } | | c d 7<qW|
 | | | 8}
 t  i t | |   t  i t |
   } | | j  o |
 SqqW|
 S(   s
  
    Calculate the update values for the classifier weights for
    this iteration of IIS.  These update weights are the value of
    C{delta} that solves the equation::
    
      ffreq_empirical[i]
             =
      SUM[fs,l] (classifier.prob_classify(fs).prob(l) *
                 feature_vector(fs,l)[i] *
                 exp(delta[i] * nf(feature_vector(fs,l))))

    Where:
        - M{(fs,l)} is a (featureset, label) tuple from C{train_toks}
        - M{feature_vector(fs,l)} = C{encoding.encode(fs,l)}
        - M{nf(vector)} = C{sum([val for (id,val) in vector])}

    This method uses Newton's method to solve this equation for
    M{delta[i]}.  In particular, it starts with a guess of
    C{delta[i]}=1; and iteratively updates C{delta} with::

        delta[i] -= (ffreq_empirical[i] - sum1[i])/(-sum2[i])

    until convergence, where M{sum1} and M{sum2} are defined as::

        sum1[i](delta) = SUM[fs,l] f[i](fs,l,delta)
        
        sum2[i](delta) = SUM[fs,l] (f[i](fs,l,delta) *
                                    nf(feature_vector(fs,l)))
        
      f[i](fs,l,delta) = (classifier.prob_classify(fs).prob(l) *
                          feature_vector(fs,l)[i] *
                          exp(delta[i] * nf(feature_vector(fs,l))))

    Note that M{sum1} and M{sum2} depend on C{delta}; so they need
    to be re-computed each iteration.
    
    The variables C{nfmap}, C{nfarray}, and C{nftranspose} are
    used to generate a dense encoding for M{nf(ltext)}.  This
    allows C{_deltas} to calculate M{sum1} and M{sum2} using
    matrices, which yields a signifigant performance improvement. 

    @param train_toks: The set of training tokens.
    @type train_toks: C{list} of C{tuples} of (C{dict}, C{str})
    @param classifier: The current classifier.
    @type classifier: C{ClassifierI}
    @param ffreq_empirical: An array containing the empirical
        frequency for each feature.  The M{i}th element of this
        array is the empirical frequency for feature M{i}.
    @type ffreq_empirical: C{sequence} of C{float}
    @param unattested: An array that is 1 for features that are
        not attested in the training data; and 0 for features that
        are attested.  In other words, C{unattested[i]==0} iff
        C{ffreq_empirical[i]==0}. 
    @type unattested: C{sequence} of C{int}
    @param nfmap: A map that can be used to compress C{nf} to a dense
        vector.
    @type nfmap: C{dictionary} from C{int} to C{int}
    @param nfarray: An array that can be used to uncompress C{nf}
        from a dense vector.
    @type nfarray: C{array} of C{float}
    @param nftranspose: C{array} of C{float}
    @type nftranspose: The transpose of C{nfarray}
    g-q=i,  R   i   t   axisi    i   (   R   t   onesR   R   R   R   R   R    R   R6   RL   t   outerR0   (   Ry   R   R   t   ffreq_empiricalR   R   R   R   t   NEWTON_CONVERGEt
   MAX_NEWTONR   t   AR   R$   t   distR%   RP   R   R   R   t   rangenumt   nf_deltat   exp_nf_deltat   nf_exp_nf_deltat   sum1t   sum2R1   t   n_error(    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyR      s<    D$  - 3
 

 RV   i    c         K   s  y d d k  } d d k } Wn% t j
 o }	 t d |   n X| d j o t i |  d | } n | d j	 o t d   n | i   } t g  }
 t	 |  D] \ } } |
 | | f q ~
  } | i
   } t |   } t |  } | o | i i } n
 t i } | | | | f  } | d | | f  } x t	 |   D] \ } \ } } | d | t |  | | f c d 7<xS | D]K } xB | i | |  D]. \ } } | | | | t |  | | f <qWqWqIW| i i | | |  } | o | d | _ n | d	 j o d | _ n | d
 j o t | _ n d | j o | d | _ n d | j oM | d j o | d | _ q| d	 j o | d | _ q| d | _ n | i d |  | i t i t i  } t | |  S(   s  
    Train a new C{ConditionalExponentialClassifier}, using the given
    training samples, using the specified C{scipy} optimization
    algorithm.  This C{ConditionalExponentialClassifier} will encode
    the model that maximizes entropy from all the models that are
    empirically consistent with C{train_toks}.

    @see: L{train_maxent_classifier()} for parameter descriptions.
    @require: The C{scipy} package must be installed.
    iNsi   The maxent training algorithm %r requires that the scipy package be installed.  See http://www.scipy.org/R   s$   Specify encoding or labels, not bothi   i    i   RY   i   R^   Ra   RV   Rz   (    t   scipy.sparset   scipy.maxentropyRm   Rw   Rk   R   Rv   R   R   R>   R   R   R{   t
   lil_matrixR   R   R    t
   maxentropyt   conditionalmodelt   sigma2R   R"   t   verboset   maxitert   avegtolt   maxgtolt   tolt   fitt   paramsR   t   eR   (   Ry   Ri   R   R   Rz   R{   Rj   R|   Rl   R  RP   RE   R$   t   labelnumt   num_featurest   num_tokst
   num_labelsR   t   Ft   Nt   toknumR   R   R1   R   t   modelR   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyRs   ~  sT    9	 (  .c      	   K   sW  t  } | t j o4 | i d d  } t i |  | d | d t  } n | t j	 o t d   n yT t i d d d d	  \ } }	 t i	 |	 d
  }
 t
 |  | |
 d | |
 i   Wn. t t t f j
 o } t d |   n Xg  } | d d d g 7} | o | d g 7} n | o d | d } n d } | d d | d g 7} | d j  o | d g 7} n d | j o | d d | d g 7} n d | j o" | d d t | d  g 7} n | d |	 g 7} | GHt |  } y t i |	  Wn* t t f j
 o } d |	 | f GHn Xt | |  } | t i t i  9} t | |  S(   s  
    Train a new C{ConditionalExponentialClassifier}, using the given
    training samples, using the external C{megam} library.  This
    C{ConditionalExponentialClassifier} will encode the model that
    maximizes entropy from all the models that are empirically
    consistent with C{train_toks}.

    @see: L{train_maxent_classifier()} for parameter descriptions.
    @see: L{nltk.classify.megam}
    Rd   i    R   R   s$   Specify encoding or labels, not botht   prefixs   nltk-t   suffixs   .gzt   wbt   explicits,   Error while creating megam training file: %ss   -nobiass   -repeatt   10s	   -explicitg      ?i   s   -lambdas   %.2fs   -tunei   s   -quietR^   s   -maxis   %st   ll_deltas   -dppt
   multiclasss    Warning: unable to delete %s: %s(   R"   Rk   t   getR   Rv   Rw   t   tempfilet   mkstempt   gzipt   openR   t   closet   OSErrort   IOErrorR0   R   t   ost   removeR   R   R   R  R   (   Ry   Ri   R   R   Rj   R}   R  Rd   t   fdt   trainfile_namet	   trainfileR  R   t   inv_variancet   stdoutR   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyRt     sL    "Ru   c           B   s   e  Z e d     Z RS(   c         K   sk  | i  d d  } | i  d d  } | i  d d   } | i  d d   } | i  d d  } | i  d	 d  } | i  d
  }	 | i  d  }
 | p t i | | d | } n t i d d d d  \ } } t i d d  \ } } t i | d  } t | | |  | i	   g  } | i
 d g  | i
 d | g  | o | i
 d d | d g  n |	 o | i
 d d |	 g  n |
 o! | i
 d d t |
  g  n | i
 d | g  | i
 d | g  | d j  o | i
 d g  n | i
 d g  t |  t | d  } t |  } | i	   t i |  t i |  | t i t i  9} |  | |  S(   NRz   t   tao_lmvmRi   i   R   R   Rj   i    Rd   R^   R`   R  s   nltk-tadm-events-R  s   .gzs   nltk-tadm-weights-R  s   -monitors   -methods   -l2s   %.6fi   s   -max_its   %ds   -fatols
   -events_ins   -params_outs   2>&1s   -summaryt   rb(   R  Rk   R   Rv   R  R  R  R  R	   R  t   extendR0   R   R
   R  R  R   R   R  (   Rx   Ry   R}   Rz   Ri   R   R   t   sigmaRd   R^   R  t   trainfile_fdR  t   weightfile_fdt   weightfile_nameR  R   t
   weightfileR   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyRv   +  sN    	
!

(   R   R   R   Rv   (    (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyRu   *  s   c          C   s#   d d k  l }  |  t i  } d  S(   Ni(   t
   names_demo(   t   nltk.classify.utilR*  R   Rv   (   R*  R   (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pyt   demog  s    t   __main__(1   R   t   __docformat__R   t   timeR  R  R  R   R    t	   nltk.utilR   t   nltk.probabilityR+  t   apiR   R   R   Rg   R   R   R   Rh   R   R	   R
   t   ClassifierIR   R   t
   deprecatedR   t   objectR   R   R   R   R   Rk   Rq   R   R   Rp   R   R   R"   Rs   Rt   Ru   R,  R   (    (    (    s*   /p/zhu/06/nlp/nltk/nltk/classify/maxent.pys   <module>6   sP   

 G-:;`			Y		~\M=	