³ò
B_Kc           @   sÐ   d  Z  d d k Z d d k Z d d k Z d d k l Z d d k l Z d d k l	 Z	 l
 Z
 d d k Td d k Td e f d „  ƒ  YZ d	 e f d
 „  ƒ  YZ d e f d „  ƒ  YZ d e f d „  ƒ  YZ d S(   s!   
Read CoNLL-style chunk fileids.
iÿÿÿÿN(   t
   deprecated(   t   Tree(   t   LazyMapt   LazyConcatenation(   t   *t   ConllCorpusReaderc           B   sÖ  e  Z d  Z d Z d Z d Z d Z d Z d Z d Z	 e e e e e e e	 f Z
 e d e e e e d	 „ Z e d
 „ Z e d „ Z e d „ Z e d „ Z e d „ Z e e d „ Z e e d „ Z e e d „ Z e d „ Z e e e d „ Z e d „ Z e d „ Z e d „ Z d „  Z d „  Z d „  Z d „  Z  d „  Z! d „  Z" d „  Z# d „  Z$ d „  Z% e& d  „  ƒ Z' e( d! ƒ d" e d# „ ƒ Z) e( d$ ƒ e d% „ ƒ Z* e( d& ƒ d' „  ƒ Z+ e( d( ƒ d) „  ƒ Z, RS(*   s@  
    A corpus reader for CoNLL-style files.  These files consist of a
    series of sentences, separated by blank lines.  Each sentence is
    encoded using a table (or I{grid}) of values, where each line
    corresponds to a single word, and each column corresponds to an
    annotation type.  The set of columns used by CoNLL-style files can
    vary from corpus to corpus; the C{ConllCorpusReader} constructor
    therefore takes an argument, C{columntypes}, which is used to
    specify the columns that are used by a given corpus.

    @todo: Add support for reading from corpora where different
        parallel files contain different columns.
    @todo: Possibly add caching of the grid corpus view?  This would
        allow the same grid view to be used by different data access
        methods (eg words() and parsed_sents() could both share the
        same grid corpus view object).
    @todo: Better support for -DOCSTART-.  Currently, we just ignore
        it, but it could be used to define methods that retrieve a
        document at a time (eg parsed_documents()).
    t   wordst   post   treet   chunkt   net   srlt   ignoret   Sc
         C   s¸   x2 | D]* }
 |
 |  i  j o t d |
 ƒ ‚ q q Wt | t ƒ o | g } n | |  _ t d „  t | ƒ Dƒ ƒ |  _ | |  _ | |  _	 | |  _
 |	 |  _ t i |  | | | ƒ d  S(   Ns   Bad column type %rc         s   s%   x |  ] \ } } | | f Vq Wd  S(   N(    (   t   .0t   it   c(    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pys	   <genexpr>K   s    (   t   COLUMN_TYPESt
   ValueErrort
   isinstancet
   basestringt   _chunk_typest   dictt	   enumeratet   _colmapt   _pos_in_treet	   _top_nodet   _srl_includes_rolesett   _tree_classt   CorpusReadert   __init__(   t   selft   roott   fileidst   columntypest   chunk_typest   top_nodet   pos_in_treet   srl_includes_rolesett   encodingt
   tree_classt
   columntype(    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyR   A   s     					c         C   sk   | d  j o |  i } n t | t ƒ o | g } n t g  } | D] } | |  i | ƒ i ƒ  qE ~ ƒ S(   N(   t   Nonet   _fileidsR   R   t   concatt   opent   read(   R   R!   t   _[1]t   f(    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyt   rawV   s    c         C   s/   |  i  |  i ƒ t t |  i |  i | ƒ ƒ ƒ S(   N(   t   _requiret   WORDSR   R   t
   _get_wordst   _grids(   R   R!   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyR   [   s    c         C   s)   |  i  |  i ƒ t |  i |  i | ƒ ƒ S(   N(   R2   R3   R   R4   R5   (   R   R!   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyt   sents_   s    c         C   s5   |  i  |  i |  i ƒ t t |  i |  i | ƒ ƒ ƒ S(   N(   R2   R3   t   POSR   R   t   _get_tagged_wordsR5   (   R   R!   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyt   tagged_wordsc   s    c         C   s/   |  i  |  i |  i ƒ t |  i |  i | ƒ ƒ S(   N(   R2   R3   R7   R   R8   R5   (   R   R!   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyt   tagged_sentsh   s    c            sd   ˆ  i  ˆ  i ˆ  i ˆ  i ƒ ˆ d  j o ˆ  i ‰ n ‡  ‡ f d †  } t t | ˆ  i | ƒ ƒ ƒ S(   Nc            s   ˆ  i  |  ˆ ƒ S(   N(   t   _get_chunked_words(   t   grid(   R   R#   (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyt   get_chunked_wordso   s    (	   R2   R3   R7   t   CHUNKR*   R   R   R   R5   (   R   R!   R#   R=   (    (   R   R#   s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyt   chunked_wordsl   s
    	c            s^   ˆ  i  ˆ  i ˆ  i ˆ  i ƒ ˆ d  j o ˆ  i ‰ n ‡  ‡ f d †  } t | ˆ  i | ƒ ƒ S(   Nc            s   ˆ  i  |  ˆ ƒ S(   N(   R;   (   R<   (   R   R#   (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyR=   w   s    (   R2   R3   R7   R>   R*   R   R   R5   (   R   R!   R#   R=   (    (   R   R#   s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyt   chunked_sentst   s    c            s^   ˆ  i  ˆ  i ˆ  i ˆ  i ƒ ˆ d  j o ˆ  i ‰ n ‡ ‡  f d †  } t | ˆ  i | ƒ ƒ S(   Nc            s   ˆ i  |  ˆ  ƒ S(   N(   t   _get_parsed_sent(   R<   (   R%   R   (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyt   get_parsed_sent~   s    (   R2   R3   R7   t   TREER*   R   R   R5   (   R   R!   R%   RB   (    (   R   R%   s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyt   parsed_sents{   s    c         C   s)   |  i  |  i ƒ t |  i |  i | ƒ ƒ S(   N(   R2   t   SRLR   t   _get_srl_spansR5   (   R   R!   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyt	   srl_spans‚   s    c            s   ˆ  i  ˆ  i ˆ  i ˆ  i ˆ  i ƒ ˆ d  j o ˆ  i ‰ n ‡ ‡  f d †  } t | ˆ  i | ƒ ƒ } | o t	 | ƒ } n | S(   Nc            s   ˆ i  |  ˆ  ƒ S(   N(   t   _get_srl_instances(   R<   (   R%   R   (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyt   get_srl_instances‰   s    (
   R2   R3   R7   RC   RE   R*   R   R   R5   R   (   R   R!   R%   t   flattenRI   t   result(    (   R   R%   s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyt   srl_instances†   s    "c         C   s;   |  i  |  i |  i |  i ƒ t t |  i |  i | ƒ ƒ ƒ S(   s×   
        @return: a list of word/tag/IOB tuples 
        @rtype: C{list} of C{tuple}
        @param fileids: the list of fileids that make up this corpus 
        @type fileids: C{None} or C{str} or C{list}
        (   R2   R3   R7   R>   R   R   t   _get_iob_wordsR5   (   R   R!   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyt	   iob_words   s    c         C   s5   |  i  |  i |  i |  i ƒ t |  i |  i | ƒ ƒ S(   sß   
        @return: a list of lists of word/tag/IOB tuples 
        @rtype: C{list} of C{list}
        @param fileids: the list of fileids that make up this corpus 
        @type fileids: C{None} or C{str} or C{list}
        (   R2   R3   R7   R>   R   RM   R5   (   R   R!   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyt	   iob_sentsš   s    c      	   C   sI   t  g  } |  i | t ƒ D]% \ } } | t | |  i d | ƒq ~ ƒ S(   NR'   (   R,   t   abspathst   Truet   StreamBackedCorpusViewt   _read_grid_block(   R   R!   R/   t   fileidt   enc(    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyR5   ¨   s    
c         C   sé   g  } xÜ t  | ƒ D]Î } | i ƒ  } | p q n g  } | i d ƒ D] } | | i ƒ  qG ~ } | d |  i i d d ƒ d j o | d =n x? | D]7 } t | ƒ t | d ƒ j o t d | ƒ ‚ q™ q™ W| i | ƒ q W| S(   Ns   
i    R   s
   -DOCSTART-s"   Inconsistent number of columns:
%s(   t   read_blankline_blockt   stript   splitR   t   gett   lenR   t   append(   R   t   streamt   gridst   blockR/   t   lineR<   t   row(    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyRS   °   s     0$ c         C   s   |  i  | |  i d ƒ S(   NR   (   t   _get_columnR   (   R   R<   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyR4   Ë   s    c         C   s3   t  |  i | |  i d ƒ |  i | |  i d ƒ ƒ S(   NR   R   (   t   zipRa   R   (   R   R<   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyR8   Î   s    c         C   sI   t  |  i | |  i d ƒ |  i | |  i d ƒ |  i | |  i d ƒ ƒ S(   NR   R   R	   (   Rb   Ra   R   (   R   R<   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyRM   Ò   s    c         C   s’  |  i  | |  i d ƒ } |  i  | |  i d ƒ } |  i  | |  i d ƒ } t |  i g  ƒ g } x't | | | ƒ D]\ } } }	 |	 d j o d \ }
 } n |	 i d ƒ \ }
 } | d  j	 o | | j o
 d }
 n |
 d j o | | d i j o
 d	 }
 n |
 d
 j o! t | ƒ d j o | i	 ƒ  n |
 d	 j o1 t | g  ƒ } | d i
 | ƒ | i
 | ƒ n | d i
 | | f ƒ qs W| d S(   NR   R   R	   t   Ot    t   -t   Iiÿÿÿÿt   Bt   BOi   i    (   Rc   Rd   (   Ra   R   R   R   Rb   RX   R*   t   nodeRZ   t   popR[   (   R   R<   R#   R   t   pos_tagst
   chunk_tagst   stackt   wordt   pos_tagt	   chunk_tagt   statet
   chunk_typet	   new_chunk(    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyR;   ×   s*     
!
 c      
   C   sþ  |  i  | |  i d ƒ } |  i  | |  i d ƒ } |  i  | |  i d ƒ } d } xÁ t | | | ƒ D]­ \ } } }	 | d j o
 d } n | d j o
 d } n | d j o
 d } n | d j o
 d } n |	 i d	 ƒ \ }
 } | i d ƒ d } | d
 |
 | | | f 7} qd Wy |  i i | ƒ } Wn8 t t f j
 o& |  i i d |  i	 | f ƒ } n X| p xŠ | i
 ƒ  D]x } xo t | ƒ D]a \ } } t | t i ƒ oB t | ƒ d j o/ t | d t ƒ o | d | i f | | <qqWqzWn | S(   NR   R   R   Rd   t   (s   -LRB-t   )s   -RRB-R   s   %s (%s %s) %ss   (%s %s)i   i    (   Ra   R   Rb   RX   t   countR   t   parseR   t
   IndexErrorR   t   subtreesR   R   t   nltkR   RZ   R   Ri   (   R   R<   R%   R   Rk   t
   parse_tagst   treestrRn   Ro   t	   parse_tagt   leftt   rightR   t   subtreeR   t   child(    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyRA   ÷   s6       &'c         C   s§  |  i  o2 |  i | |  i d d ƒ } |  i d d } n+ |  i | |  i d ƒ } |  i d d } t g  } | D] } | d j o | | qt qt ~ ƒ } g  } xü t | ƒ D]î } |  i | | | ƒ }	 g  }
 g  } x¶ t |	 ƒ D]¨ \ } } | i d ƒ \ } } x5 | i d ƒ D]$ } | o | i | | f ƒ qqWxL t | i d ƒ ƒ D]5 } | i	 ƒ  \ } } |
 i | | d f | f ƒ qUWqæ W| i |
 ƒ q± W| S(   s;   
        list of list of (start, end), tag) tuples
        R   i   i   Re   R   Rt   Ru   (
   R   Ra   R   RZ   t   rangeR   RX   R[   Rv   Rj   (   R   R<   t
   predicatest	   start_colR/   t   pt	   num_predst	   spanlistsR   t   colt   spanlistRm   t   wordnumt   srl_tagR~   R   t   tagt   start(    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyRF     s2    
8    %c      
   C   s_  |  i  | | ƒ } |  i | ƒ } |  i o: |  i | |  i d d ƒ } |  i | |  i d ƒ } n- |  i | |  i d ƒ } d  g t | ƒ } t | ƒ } x» t | ƒ D]­ \ } }	 |	 d j o qª n xg | D]O }
 xE |
 D]: \ \ } } } | t	 | | ƒ j o | d j o PqÞ qÞ WqÑ PqÑ Wt
 d |	 ƒ ‚ | i t | | |	 | | |
 ƒ ƒ qª W| S(   NR   i   Re   t   Vs   C-Vs   No srl column found for %r(   RŽ   s   C-V(   RA   RF   R   Ra   R   R*   RZ   t   ConllSRLInstanceListR   R‚   R   R[   t   ConllSRLInstance(   R   R<   R%   R   R‡   Rƒ   t   rolesetst	   instancesRŠ   t	   predicateR‰   R   t   endRŒ   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyRH   4  s.    
   #	c         G   s9   x2 | D]* } | |  i  j o t d | ƒ ‚ q q Wd  S(   Ns)   This corpus does not contain a %s column.(   R   R   (   R   R"   R)   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyR2   U  s
     c         C   s3   g  } t  t |  ƒ ƒ D] } | |  | | q ~ S(   N(   R‚   RZ   (   R<   t   column_indexR/   R   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyRa   [  s    sF   Use .raw() or .words() or .tagged_words() or .chunked_sents() instead.t   chunkedc         C   s   | d j o |  i  | | ƒ Sn | d j o |  i | ƒ Sn | d j o |  i | ƒ Sn | d j o |  i | ƒ Sn t d | ƒ ‚ d  S(   NR–   R1   t	   tokenizedt   taggeds   bad format %r(   R@   R1   R   R9   R   (   R   t   itemst   formatR#   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyR.   c  s
    !s   Use .chunked_sents() instead.c         C   s   |  i  | | ƒ S(   N(   R@   (   R   R™   R#   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyR–   k  s    s   Use .words() instead.c         C   s   |  i  | ƒ S(   N(   R   (   R   R™   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyR—   n  s    s   Use .tagged_words() instead.c         C   s   |  i  | ƒ S(   N(   R9   (   R   R™   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyR˜   q  s    (-   t   __name__t
   __module__t   __doc__R3   R7   RC   R>   t   NERE   t   IGNORER   R*   t   FalseRQ   R   R   R1   R   R6   R9   R:   R?   R@   RD   RG   RL   RN   RO   R5   RS   R4   R8   RM   R;   RA   RF   RH   R2   t   staticmethodRa   R    R.   R–   R—   R˜   (    (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyR      sR   							 		 	!			R   c           B   s)   e  Z d  Z d „  Z d „  Z d „  Z RS(   s|   
    An SRL instance from a CoNLL corpus, which identifies and
    providing labels for the arguments of a single verb.
    c   	      C   s´   g  |  _  | |  _ | |  _ | |  _ g  |  _ | |  _ | |  _ | i ƒ  |  _ x_ | D]W \ \ } } } | d j o |  i  t	 | | ƒ 7_  qU |  i i
 | | f | f ƒ qU Wd  S(   NRŽ   s   C-V(   RŽ   s   C-V(   t   verbt	   verb_headt	   verb_stemt   rolesett	   argumentst   tagged_spansR   t   leavesR   R‚   R[   (	   R   R   R£   R¤   R¥   R§   R   R”   RŒ   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyR   }  s    							 c         C   s@   t  |  i ƒ d j o d p d } d |  i t  |  i ƒ | f S(   Ni   t   sRd   s,   <ConllSRLInstance for %r with %d argument%s>(   RZ   R¦   R¤   (   R   t   plural(    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyt   __repr__§  s    #c   	   
      s(  d i  ‡  f d †  ˆ  i Dƒ ƒ } d | ˆ  i f } d } xÁ t ˆ  i ƒ D]° \ } } t | t ƒ o | d } n xW ˆ  i D]L \ \ } } } | | j o | d | 7} n | | j o | d 7} q q W| ˆ  i j o d | } n | | d 7} qK W| t i	 | i
 d	 d
 ƒ d d d d ƒS(   Nt    c         3   s$   x |  ] } ˆ  i  | d  Vq Wd S(   i    N(   R   (   R   R   (   R   (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pys	   <genexpr>­  s    s   SRL for %r (stem=%r):
Rd   i    s   [%s s   ] s   <<%s>>s    ]t   ]t   initial_indents       t   subsequent_indent(   t   joinR¢   R¤   R   R   R   t   tupleR¦   t   textwrapt   fillt   replace(	   R   t   verbstrt   hdrR©   R   Rn   R   R”   t   argid(    (   R   s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyt   pprint¬  s    " 
 (   R›   Rœ   R   R   R«   R¸   (    (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyR   v  s   	*	R   c           B   s8   e  Z d  Z d d „ Z d „  Z e d „ Z d „  Z RS(   s0   
    Set of instances for a single sentence
    c         C   s   | |  _  t i |  | ƒ d  S(   N(   R   t   listR   (   R   R   R’   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyR   ¿  s    	c         C   s
   |  i  ƒ  S(   N(   R¸   (   R   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyt   __str__Ã  s    c         C   sÛ  x1 |  D]) } | i  |  i  j o t d ƒ ‚ q q W| oU |  i  i ƒ  } d  g t | ƒ } d g t | ƒ } |  i |  i  d | | | ƒ n d } x>t t | ƒ ƒ D]*} | oI | d | | 7} | d | | 7} | d t | | i d ƒ ƒ 7} n xB |  D], } | | i	 j o | d | i
 7} PqqW| d d 7} x‚ |  D]z } d } x] | i D]R \ \ }	 }
 } | |	 j o d	 | | f } n | |
 d
 j o | d 7} qaqaW| d | 7} qKW| d 7} q© W| S(   Ns   Tree mismatch!R   i    Rd   s   %-20s s   %-8s s
   %15s*%-8s Re   s   (%s%si   Ru   s   %-12s s   
(   R   R   R¨   R*   RZ   t   _tree2conllR‚   R±   RX   R£   R¤   R§   (   R   t   include_treet   instR   R   t   syntR©   R   t   argstrR   R”   R·   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyR¸   Æ  s@       % 	 
 !#c         C   s?  t  | t ƒ p t ‚ t | ƒ d j oI t  | d t ƒ o5 | i | | <| | | d j p t ‚ | d SnÉ t | ƒ d j oV t  | d t ƒ oB t | d ƒ d j p t ‚ | d \ | | <| | <| d Sn` d | i | | f | | <x) | D]! } |  i | | | | | ƒ } qþ W| | d c d 7<| Sd  S(   Ni   i    i   s   (%s%sRu   (   R   R   t   AssertionErrorRZ   R   Ri   R±   R»   (   R   R   RŠ   R   R   R¾   R   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyR»   ì  s     '' (    (   R›   Rœ   R   R   Rº   R    R¸   R»   (    (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyR   »  s
   	&t   ConllChunkCorpusReaderc           B   s   e  Z d  Z e d „ Z RS(   s`   
    A ConllCorpusReader whose data file contains three columns: words,
    pos, and chunk.
    c      	   C   s&   t  i |  | | d d | d | ƒd  S(   NR   R   R	   R#   R'   (   s   wordss   poss   chunk(   R   R   (   R   R    R!   R#   R'   (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyR     s    (   R›   Rœ   R   R*   R   (    (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pyRÁ   þ  s   (   R   t   ost   codecsR²   t   nltk.internalsR    t	   nltk.treeR   t	   nltk.utilR   R   t   utilt   apiR   R   t   objectR   R¹   R   RÁ   (    (    (    s.   /p/zhu/06/nlp/nltk/nltk/corpus/reader/conll.pys   <module>   s   

ÿ _EC