³ò
’B_Kc           @   s¯   d  Z  d Z d d k Z d d k Z d d k Z d d k l Z y d d k Z Wn e j
 o d Z n Xd a
 d d „ Z e e d „ Z e d „ Z d „  Z d	 „  Z d S(
   s™  
A set of functions used to interface with the external U{megam
<http://www.cs.utah.edu/~hal/megam/>} maxent optimization package.
Before C{megam} can be used, you should tell NLTK where it can find
the C{megam} binary, using the L{config_megam()} function.  Typical
usage:

    >>> import nltk
    >>> nltk.config_megam('.../path/to/megam')
    >>> classifier = nltk.MaxentClassifier.train(corpus, 'megam')

s
   epytext eniÿÿÿÿN(   t   find_binaryc      
   C   s7   t  d |  d d d g d d d d d g d	 d
 ƒa d S(   sC  
    Configure NLTK's interface to the C{megam} maxent optimization
    package.

    @param bin: The full path to the C{megam} binary.  If not specified,
        then nltk will search the system for a C{megam} binary; and if
        one is not found, it will raise a C{LookupError} exception.
    @type bin: C{string}
    t   megamt   env_varst   MEGAMt	   MEGAMHOMEt   binary_namess	   megam.optt
   megam_686os   megam_i686.optot   urls"   http://www.cs.utah.edu/~hal/megam/N(   R    t
   _megam_bin(   t   bin(    (    s)   /p/zhu/06/nlp/nltk/nltk/classify/megam.pyt   config_megam'   s
    	c         C   sã   | i  ƒ  } t g  } t | ƒ D] \ } } | | | f q  ~ ƒ }	 x— |  D] \ }
 } | i d |	 | ƒ | p  t | i |
 | ƒ | | ƒ n; x7 | D]/ } | i d ƒ t | i |
 | ƒ | | ƒ q› W| i d ƒ qL Wd S(   s  
    Generate an input file for C{megam} based on the given corpus of
    classified tokens.

    @type train_toks: C{list} of C{tuples} of (C{dict}, C{str})
    @param train_toks: Training data, represented as a list of
        pairs, the first member of which is a feature dictionary,
        and the second of which is a classification label.

    @type encoding: L{MaxentFeatureEncodingI}
    @param encoding: A feature encoding, used to convert featuresets
        into feature vectors.

    @type stream: C{stream}
    @param stream: The stream to which the megam input file should be
        written.

    @param bernoulli: If true, then use the 'bernoulli' format.  I.e.,
        all joint features have binary values, and are listed iff they
        are true.  Otherwise, list feature values explicitly.  If
        C{bernoulli=False}, then you must call C{megam} with the
        C{-fvals} option.

    @param explicit: If true, then use the 'explicit' format.  I.e.,
        list the features that would fire for any of the possible
        labels, for each token.  If C{explicit=True}, then you must
        call C{megam} with the C{-explicit} option.
    s   %ds    #s   
N(   t   labelst   dictt	   enumeratet   writet   _write_megam_featurest   encode(   t
   train_tokst   encodingt   streamt	   bernoullit   explicitR   t   _[1]t   it   labelt   labelnumt
   featuresett   l(    (    s)   /p/zhu/06/nlp/nltk/nltk/classify/megam.pyt   write_megam_file<   s    9  c         C   s©   t  t j o t d ƒ ‚ n | p
 t d ‚ |  i ƒ  i d ƒ } t  i t | ƒ d ƒ } xG | D]? } | i ƒ  o, | i ƒ  \ } } t | ƒ | t	 | ƒ <qb qb W| S(   sÒ   
    Given the stdout output generated by C{megam} when training a
    model, return a C{numpy} array containing the corresponding weight
    vector.  This function does not currently handle bias features.
    s.   This function requires that numpy be installeds   non-explicit not supported yets   
t   d(
   t   numpyt   Nonet
   ValueErrort   AssertionErrort   stript   splitt   zerost   lent   floatt   int(   t   sR   t   linest   weightst   linet   fidt   weight(    (    s)   /p/zhu/06/nlp/nltk/nltk/classify/megam.pyt   parse_megam_weightst   s     c         C   s“   |  p t  d ƒ ‚ n xu |  D]m \ } } | oC | d j o | i d | ƒ q‹ | d j o t  d ƒ ‚ q‹ q | i d | | f ƒ q Wd  S(   Ns:   MEGAM classifier requires the use of an always-on feature.i   s    %si    s3   If bernoulli=True, then allfeatures must be binary.s    %s %s(   R    R   (   t   vectorR   R   R,   t   fval(    (    s)   /p/zhu/06/nlp/nltk/nltk/classify/megam.pyR   …   s     c         C   s™   t  |  t ƒ o t d ƒ ‚ n t t j o t ƒ  n t g |  } t i | d t i ƒ} | i	 ƒ  \ } } | i
 d j o H| GHt d ƒ ‚ n | S(   s<   
    Call the C{megam} binary with the given arguments.
    s    args should be a list of stringst   stdouti    s   megam command failed!(   t
   isinstancet
   basestringt	   TypeErrorR   R   R
   t
   subprocesst   Popent   PIPEt   communicatet
   returncodet   OSError(   t   argst   cmdt   pR1   t   stderr(    (    s)   /p/zhu/06/nlp/nltk/nltk/classify/megam.pyt
   call_megam“   s    (   t   __doc__t   __docformat__t   ost   os.pathR5   t   nltk.internalsR    R   t   ImportErrorR   R   R
   t   TrueR   R.   R   R?   (    (    (    s)   /p/zhu/06/nlp/nltk/nltk/classify/megam.pys   <module>   s   7	