łň
4ŇÇIc           @   s¸  d  Z  d d k l Z d d k l Z d d k Z d d k Z d d k l Z h  d d d d	 d
 d d d d g	 d <d d d g d <Z	 e
 d d d d d d  Z e
 d d d d d d  Z d   Z d   Z d e d  Z e i d  Z d   Z d   Z d d d  Z d d d d  Z e e d   Z d!   Z d e d"  Z d d#  Z d$   Z d% d&  Z d'   Z e  d( j o3 e d) d  e d) d  e   e   e   n d S(*   sÂ  
Code for extracting relational triples from the ieer and conll2002 corpora.

Relations are stored internally as dictionaries ('reldicts'). 

The two serialization outputs are I{rtuple} and I{clause}. 
   - An I{rtuple} is a tuple of the form C{(subj, filler, obj)}, 
     where C{subj} and C{obj} are pairs of Named Entity mentions, and C{filler} is the string of words   
     occurring between C{sub} and C{obj} (with no intervening NEs). Strings are printed via C{repr()} to
     circumvent locale variations in rendering utf-8 encoded strings.
   - A I{clause} is an atom of the form C{relsym(subjsym, objsym)}, 
     where the relation, subject and object have been canonicalized to single strings.

i˙˙˙˙(   t   defaultdict(   t   joinN(   t   ifiltert   LOCATIONt   ORGANIZATIONt   PERSONt   DURATIONt   DATEt   CARDINALt   PERCENTt   MONEYt   MEASUREt   ieert   LOCt   PERt   ORGt	   conll2002c         C   s*   y t  |  SWn t j
 o |  Sn Xd S(   sL   
    Expand an NE class name.
    @type type: C{str}
    @rtype: C{str}
    N(   t
   short2longt   KeyError(   t   type(    (    s)   /p/zhu/06/nlp/nltk/nltk/sem/relextract.pyt   _expand,   s    c         C   s*   y t  |  SWn t j
 o |  Sn Xd S(   sP   
    Abbreviate an NE class name.
    @type type: C{str}
    @rtype: C{str}
    N(   t
   long2shortR   (   R   (    (    s)   /p/zhu/06/nlp/nltk/nltk/sem/relextract.pyt   class_abbrev7   s    t    c      	   C   sŠ   y t  |  d | SWn t j
 o | o3 t  g  } |  D] } | | d q: ~ d | Sn d d k l } t  g  } |  D] } | | |  q} ~ d | Sn Xd S(   sČ   
    Join a list into a string, turning tags tuples into tag strings or just words.
    @param untag: if C{True}, omit the tag from tagged input strings.
    @type lst: C{list}
    @rtype: C{str}
    t   sepi    i˙˙˙˙(   t	   tuple2strN(   R   t	   TypeErrort   nltk.tagR   (   t   lstR   t   untagt   _[1]t   tupR   t   _[2](    (    s)   /p/zhu/06/nlp/nltk/nltk/sem/relextract.pyt   _joinC   s    3c         C   s<   y | |  i  d  SWn  t j
 o |  i  d  Sn Xd S(   sd   
    Translate one entity to its ISO Latin value.
    Inspired by example from effbot.org
    

    i   i    N(   t   groupR   (   t   mt   defs(    (    s)   /p/zhu/06/nlp/nltk/nltk/sem/relextract.pyt   descape_entityR   s    c         C   sX   t  |  d d t } | i   } t i d  } | i t |  } | i d d  } | S(   s   
    Convert a list of strings into a canonical symbol.
    @type lst: C{list}
    @return: a Unicode string without whitespace
    @rtype: C{unicode}
    t   _R   s   &(\w+?);t   .t    (   R!   t   Truet   lowert   ret   compilet   subR%   t   replace(   R   t   symt   ENT(    (    s)   /p/zhu/06/nlp/nltk/nltk/sem/relextract.pyt   list2syme   s    c         C   s   d d k  l } g  } g  t g } xV |  D]N } t | |  p | d i |  q) | | d <| i |  g  t g } q) W| S(   sĐ  
    Group a chunk structure into a list of pairs of the form (list(str), L{Tree})
    
    In order to facilitate the construction of (L{Tree}, string, L{Tree}) triples, this
    identifies pairs whose first member is a list (possibly empty) of terminal
    strings, and whose second member is a L{Tree} of the form (NE_label, terminals).
    
    @param tree: a chunk tree
    @return: a list of pairs (list(C{str}), L{Tree})
    @rtype: C{list} of C{tuple}
    i˙˙˙˙(   t   Treei    i   (   t   nltkR2   t   Nonet
   isinstancet   append(   t   treeR2   t   pairst   pairt   dtr(    (    s)   /p/zhu/06/nlp/nltk/nltk/sem/relextract.pyt   mk_pairss   s     
i   i    c         C   s]  g  } xPt  |   d j o<t t  } t |  d d |  | d <|  d d i | d <t |  d d i    | d <t |  d d i    | d <t |  d d  | d <|  d d i | d	 <t |  d d i    | d
 <t |  d d i    | d <t |  d d |   | d <| o d | d | d	 f GHn | i |  |  d }  q	 W| S(   s  
    Converts the pairs generated by L{mk_pairs} into a 'reldict': a dictionary which
    stores information about the subject and object NEs plus the filler between them.
    Additionally, a left and right context of length =< window are captured (within 
    a given input sentence).
    
    @param pairs: a pair of list(str) and L{Tree}, as generated by 
    @param window: a threshold for the number of items to include in the left and right context
    @type window: C{int}
    @return: 'relation' dictionaries whose keys are 'lcon', 'subjclass', 'subjtext', 'subjsym', 'filler', objclass', objtext', 'objsym' and 'rcon'
    @rtype: C{list} of C{defaultdict}
    i   i    t   lconi   t	   subjclasst   subjtextt   subjsymt   fillert   objclasst   objtextt   objsymt   rcons   (rel(%s, %s)(   t   lenR    t   strR!   t   nodet   leavesR1   R6   (   R8   t   windowt   tracet   resultt   reldict(    (    s)   /p/zhu/06/nlp/nltk/nltk/sem/relextract.pyt   mk_reldicts   s"    i
   c   	         s$   oI  t  | j o8 t   t  | j o t    qP t d   n  oI  t  | j o8 t   t  | j o t    q  t d   n | d j o  t | i  t | i  } n' | d j o t |  } n
 t d  t |  }      f d   } t | |  S(   s  
    Filter the output of L{mk_reldicts} according to specified NE classes and a filler pattern.
    
    The parameters C{subjclass} and C{objclass} can be used to restrict the 
    Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION', 
    'PERSON', 'DURATION', 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE').

    @param subjclass: the class of the subject Named Entity.
    @type subjclass: C{string}
    @param objclass: the class of the object Named Entity.
    @type objclass: C{string}
    @param doc: input document
    @type doc: C{ieer} document or a list of chunk trees
    @param corpus: name of the corpus to take as input; possible values are         
    'ieer' and 'conll2002'
    @type corpus: C{string}
    @param pattern: a regular expression for filtering the fillers of
    retrieved triples.
    @type pattern: C{SRE_Pattern}
    @param window: filters out fillers which exceed this threshold
    @type window: C{int}
    @return: see L{mk_reldicts}
    @rtype: C{list} of C{defaultdict}
    s;   your value for the subject type has not been recognized: %ss:   your value for the object type has not been recognized: %sR   R   s   corpus type not recognizedc            sP   |  d   j o? t  |  d i     j o"   i |  d  o |  d  j S(   R=   R@   RA   (   RE   t   splitt   match(   t   x(   t   patternRA   RI   R=   (    s)   /p/zhu/06/nlp/nltk/nltk/sem/relextract.pyt   <lambda>Ý   s   (   t
   NE_CLASSESR   t
   ValueErrorR;   t   textt   headlineRM   t   filter(	   R=   RA   t   doct   corpusRQ   RI   R8   t   reldictst	   relfilter(    (   RQ   RA   RI   R=   s)   /p/zhu/06/nlp/nltk/nltk/sem/relextract.pyt   extract_relsŻ   s      	c         C   s   t  |  d  |  d |  d t  |  d  |  d g } d } | o |  d g | } d | } n | o | i |  d	  | d
 } n t |  } | | S(   s|   
    Pretty print the reldict as an rtuple.
    @param reldict: a relation dictionary
    @type reldict: C{defaultdict}
    R=   R>   R@   RA   RB   s   [%s: %r] %r [%s: %r]R<   s   ...%r)RD   s   (%r...(   R   R6   t   tuple(   RL   R<   RD   t   itemst   formatt	   printargs(    (    s)   /p/zhu/06/nlp/nltk/nltk/sem/relextract.pyt   show_raw_rtupleĺ   s    5c         C   s   | |  d |  d f } d | S(   sž   
    Print the relation in clausal form.
    @param reldict: a relation dictionary
    @type reldict: C{defaultdict}
    @param relsym: a label for the relation
    @type relsym: C{str}
    R?   RC   s
   %s(%r, %r)(    (   RL   t   relsymR^   (    (    s)   /p/zhu/06/nlp/nltk/nltk/sem/relextract.pyt   show_clauseö   s    c      
   C   sr  d d k  l } | oD d d  k } | i d  } | i | _ | i   } | i d  n t i	 d  } Hd GHd d GHxš | i
   D]Ť } x˘ | i |  D] } |  o | i GHd GHn xp t d
 d | d | D]V }	 t |	 d d GH| o8 |	 d |	 d | i f }
 | i d |
  | i   qÓ qÓ Wq Wq W| o2 | i d  Hd GHd GHx | D] } | GHq[Wn d  S(   Ni˙˙˙˙(   R   s   :memory:sL   create table Locations
        (OrgName text, LocationName text, DocID text)s   .*\bin\b(?!\b.+ing\b)s'   IEER: in(ORG, LOC) -- just the clauses:t   =i-   i   R   R   RQ   Rb   t   INR>   RB   sG   insert into Locations 
                                values (?, ?, ?)sP   select OrgName from Locations
                    where LocationName = 'Atlanta's,   Extract data from SQL table: ORGs in Atlantat   -s   ===============s   ---------------(   t   nltk.corpusR   t   sqlite3t   connectt   OptimizedUnicodet   text_factoryt   cursort   executeR+   R,   t   filest   parsed_docst   docnoR\   Rc   t   commit(   RJ   t   sqlR   Rh   t
   connectiont   curRe   t   fileRX   t   relt   rtuplet   row(    (    s)   /p/zhu/06/nlp/nltk/nltk/sem/relextract.pyt   in_demo	  sH    	  	 
 c   	   
   C   sŐ   d d k  l } d } t i | t i  } Hd GHd d GHx | i   D] } x} | i |  D]l } t } } |  o | i GHd GHt	 } } n x7 t
 d d	 | d
 | D] } t | d | d | GHq¨ Wq] WqG Wd  S(   Ni˙˙˙˙(   R   s  
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|         
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s*  # "X, of (the) Y"
    s(   IEER: has_role(PER, ORG) -- raw rtuples:Rd   i-   i   R   R   RQ   R<   RD   s   ===============(   Rg   R   R+   R,   t   VERBOSERn   Ro   t   FalseRp   R)   R\   Ra   (	   RJ   R   t   rolest   ROLESRu   RX   R<   RD   Rv   (    (    s)   /p/zhu/06/nlp/nltk/nltk/sem/relextract.pyt
   roles_demo5  s$    	  
 c       	   C   s   d d k  l }  d d k l } d GHd d GHg  } |  i   D]& } |  i |  D] } | | i qR q? ~ } x% | d  D] } Hd | i | f GHqy Wd  S(	   Ni˙˙˙˙(   R   (   R2   s   IEER: First 20 HeadlinesRd   i-   i   s   %s:
%s(   Rg   R   R3   R2   Rn   Ro   RV   Rp   (   R   R2   R   Ru   RX   t   treesR7   (    (    s)   /p/zhu/06/nlp/nltk/nltk/sem/relextract.pyt   ieer_headlinesg  s    	@ i   c      
   C   sˇ   d d k  l } d } t i | t i  } Hd GHd d GHxv | i d  D]e } t } } |  o t } } n x= t d d	 | d
 d d | D] } t	 | d | d | GHq WqJ Wd S(   sh   
    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
    from CoNLL 2002.
    i˙˙˙˙(   R   sR   
    (
    is/V|
    was/V|
    werd/V|
    wordt/V
    )
    .*
    van/Prep
    s;   Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:Rd   i-   s	   ned.trainR   R   RY   R   RQ   R<   RD   N(
   Rg   R   R+   R,   Rz   t   chunked_sentsR{   R)   R\   Ra   (   RJ   R   t   vnvt   VANRX   R<   RD   Rv   (    (    s)   /p/zhu/06/nlp/nltk/nltk/sem/relextract.pyt   conllnedz  s    	 
 c          C   s´   d d k  l }  d } t i | t i  } Hd GHd d GHg  } |  i d  D]2 } t d d	 | d
 d d | D] } | | qp qN ~ } x# | d  D] } t | d d GHq WHd  S(   Ni˙˙˙˙(   R   s.   
    .*
    (
    de/SP|
    del/SP
    )
    s=   Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:Rd   i-   s	   esp.trainR   R   RY   R   RQ   i
   Rb   t   DE(   Rg   R   R+   R,   Rz   R   R\   Rc   (   R   t   deR   R   RX   Rv   t   relst   r(    (    s)   /p/zhu/06/nlp/nltk/nltk/sem/relextract.pyt   conllesp  s    	5 t   __main__RJ   (!   t   __doc__t   nltk.compatR    t   stringR   R+   t   htmlentitydefst	   itertoolsR   RS   t   dictR   R   R   R   R{   R!   t
   entitydefsR%   R1   R;   RM   R4   R\   Ra   Rc   R)   Ry   R~   R   R   R   t   __name__(    (    (    s)   /p/zhu/06/nlp/nltk/nltk/sem/relextract.pys   <module>   s@   				6	,2	"	