³ò
’B_Kc           @   s¤   d  Z  d d k Z d d k l Z d „  Z d „  Z d e f d „  ƒ  YZ d „  Z e d	 „ Z	 d
 „  Z
 d „  Z d „  Z e d j o e
 ƒ  e ƒ  e ƒ  n d S(   sˆ  
Simple classifier for RTE corpus.

It calculates the overlap in words and named entities between text and
hypothesis, and also whether there are words / named entities in the
hypothesis which fail to occur in the text, since this is an indicator that
the hypothesis is more informative than (i.e not entailed by) the text.

TO DO: better Named Entity classification
TO DO: add lemmatization
iÿÿÿÿN(   t   accuracyc         C   s&   |  i  ƒ  p |  i ƒ  o t Sn t S(   sr   
    This just assumes that words in all caps or titles are 
    named entities.
    
    @type token: C{str}
    (   t   istitlet   isuppert   Truet   False(   t   token(    (    s0   /p/zhu/06/nlp/nltk/nltk/classify/rte_classify.pyt   ne   s    c         C   s4   t  i i i |  d d ƒ} | t j	 o | Sn |  S(   sA   
    Use morphy from WordNet to find the base form of verbs.
    t   post   verb(   t   nltkt   corpust   wordnett   morphyt   None(   t   wordt   lemma(    (    s0   /p/zhu/06/nlp/nltk/nltk/classify/rte_classify.pyt	   lemmatize#   s    t   RTEFeatureExtractorc           B   s5   e  Z d  Z e e d „ Z e d „ Z e d „ Z RS(   s™   
    This builds a bag of words for both the text and the hypothesis after
    throwing away some stopwords, then calculates overlap and difference.
    c   	      C   s¯  | |  _  t d d d d d d d d d	 d
 d d d d d g ƒ |  _ t d d d d d g ƒ |  _ d d k l } | d ƒ } | i | i ƒ |  _ | i | i	 ƒ |  _
 t |  i ƒ |  _ t |  i
 ƒ |  _ | oj t g  } |  i D] } | | | ƒ qå ~ ƒ |  _ t g  } |  i
 D] } | | | ƒ q~ ƒ |  _ n |  i  o* |  i |  i |  _ |  i |  i |  _ n |  i |  i @|  _ |  i |  i |  _ |  i |  i |  _ d S(   s®   
        @param rtepair: a L{RTEPair} from which features should be extracted
        @param stop: if C{True}, stopwords are thrown away.
        @type stop: C{bool}
        t   at   thet   itt   theyt   oft   int   tot   havet   ist   aret   weret   andt   veryt   .t   ,t   not   nott   nevert   failedrejectedt   deniediÿÿÿÿ(   t   RegexpTokenizers   ([A-Z]\.)+|\w+|\$[\d\.]+N(   t   stopt   sett	   stopwordst   negwordst   nltk.tokenizeR&   t   tokenizet   textt   text_tokenst   hypt
   hyp_tokenst
   text_wordst	   hyp_wordst   _overlapt
   _hyp_extrat
   _txt_extra(	   t   selft   rtepairR'   R   R&   t	   tokenizert   _[1]R   t   _[2](    (    s0   /p/zhu/06/nlp/nltk/nltk/classify/rte_classify.pyt   __init__1   s&    	$37
c         C   s¯   t  g  } |  i D] } t | ƒ o | | q q ~ ƒ } | d j o | o d G| GHn | SnH | d j o* | o d G|  i | GHn |  i | Sn t d | ƒ ‚ d S(   s¸   
        Compute the overlap between text and hypothesis.
        
        @param toktype: distinguish Named Entities from ordinary words
        @type toktype: 'ne' or 'word'
        R   s
   ne overlapR   s   word overlaps   Type not recognized:'%s'N(   R(   R3   R   t
   ValueError(   R6   t   toktypet   debugR9   R   t
   ne_overlap(    (    s0   /p/zhu/06/nlp/nltk/nltk/classify/rte_classify.pyt   overlapT   s    ;c         C   s€   t  g  } |  i D] } t | ƒ o | | q q ~ ƒ } | d j o | Sn- | d j o |  i | Sn t d | ƒ ‚ d S(   sº   
        Compute the extraneous material in the hypothesis.
        
        @param toktype: distinguish Named Entities from ordinary words
        @type toktype: 'ne' or 'word'
        R   R   s   Type not recognized: '%s'N(   R(   R4   R   R<   (   R6   R=   R>   R9   R   t   ne_extra(    (    s0   /p/zhu/06/nlp/nltk/nltk/classify/rte_classify.pyt	   hyp_extrae   s    ;(   t   __name__t
   __module__t   __doc__R   R   R;   R@   RB   (    (    (    s0   /p/zhu/06/nlp/nltk/nltk/classify/rte_classify.pyR   ,   s   #c         C   s¸   t  |  ƒ } h  } t | d <t | i d ƒ ƒ | d <t | i d ƒ ƒ | d <t | i d ƒ ƒ | d <t | i d ƒ ƒ | d <t | i | i @ƒ | d <t | i | i @ƒ | d	 <| S(
   Nt   alwaysonR   t   word_overlapt   word_hyp_extraR   R?   t   ne_hyp_extrat   neg_txtt   neg_hyp(   R   R   t   lenR@   RB   R*   R1   R2   (   R7   t	   extractort   features(    (    s0   /p/zhu/06/nlp/nltk/nltk/classify/rte_classify.pyt   rte_featuresu   s    
c         C   s  g  } t  i i i d d d g ƒ D] } | | | i f q# ~ } g  } t  i i i d d d g ƒ D] } | | | i f qe ~ } d GH|  g  } | D] \ } } | | | ƒ | f q— ~ ƒ }	 d GHt |	 g  }
 | D] \ } } |
 | | ƒ | f qØ ~
 ƒ } d	 | GH|	 S(
   s   
    Classify RTEPairs
    s   rte1_dev.xmls   rte2_dev.xmls   rte3_dev.xmls   rte1_test.xmls   rte2_test.xmls   rte3_test.xmls   Training classifier...s   Testing classifier...s   Accuracy: %6.4f(   R	   R
   t   rtet   pairst   valueR    (   t   trainerRN   R9   t   pairt   trainR:   t   testt   _[3]t   labelt
   classifiert   _[4]t   acc(    (    s0   /p/zhu/06/nlp/nltk/nltk/classify/rte_classify.pyt   rte_classifier‚   s    BB9<	c          C   sh   t  i i i d g ƒ d  }  xE |  D]= } Hx3 t t | ƒ ƒ D] } d | t | ƒ | f GHq= Wq# Wd  S(   Ns   rte1_dev.xmli   s   %-15s => %s(   R	   R
   RP   RQ   t   sortedRO   (   RQ   RT   t   key(    (    s0   /p/zhu/06/nlp/nltk/nltk/classify/rte_classify.pyt   demo_features–   s      c          C   s^   t  i i i d g ƒ d }  t |  ƒ } | i GH| i d ƒ GH| i d ƒ GH| i d ƒ GHd  S(   Ns   rte3_dev.xmli!   R   R   (   R	   R
   RP   RQ   R   R2   R@   RB   (   R7   RM   (    (    s0   /p/zhu/06/nlp/nltk/nltk/classify/rte_classify.pyt   demo_feature_extractorž   s    c             s‹   d d  k  ‰  y  ˆ  i d ƒ ‡  f d †  }  WnH t j
 o< y ‡  f d †  }  Wqw t j
 o ˆ  i i }  qw Xn Xˆ  i i |  ƒ d  S(   Niÿÿÿÿs   /usr/local/bin/megamc            s   ˆ  i  i |  d  ƒ S(   t   megam(   t   MaxentClassifierRU   (   t   x(   R	   (    s0   /p/zhu/06/nlp/nltk/nltk/classify/rte_classify.pyt   <lambda>«   s    c            s   ˆ  i  i |  d  ƒ S(   t   BFGS(   Rb   RU   (   Rc   (   R	   (    s0   /p/zhu/06/nlp/nltk/nltk/classify/rte_classify.pyRd   ®   s    (   R	   t   config_megamR<   Rb   RU   t   classifyR\   (   RS   (    (   R	   s0   /p/zhu/06/nlp/nltk/nltk/classify/rte_classify.pyt   demo§   s    t   __main__(   RE   R	   t   utilR    R   R   t   objectR   RO   R\   R_   R`   Rh   RC   (    (    (    s0   /p/zhu/06/nlp/nltk/nltk/classify/rte_classify.pys   <module>   s   			I					