³ò
B_Kc           @   s˜   d  Z  d d k Z d d k Z d d k Z d d k l Z d d k l Z d d k	 Td d k
 Td d k Td e f d „  ƒ  YZ d e f d	 „  ƒ  YZ d S(
   sN   
A reader for corpora that contain chunked (and optionally tagged)
documents.
iÿÿÿÿN(   t   BracketParseCorpusReader(   t   Tree(   t   *t   ChunkedCorpusReaderc           B   s¹   e  Z d  Z d e i i e d d e ƒe e	 d „ Z
 e	 d „ Z e	 d „ Z e	 d „ Z e	 d „ Z e	 d	 „ Z e	 d
 „ Z e	 d „ Z e	 d „ Z e	 d „ Z e	 d „ Z d „  Z RS(   s%  
    Reader for chunked (and optionally tagged) corpora.  Paragraphs
    are split using a block reader.  They are then tokenized into
    sentences using a sentence tokenizer.  Finally, these sentences
    are parsed into chunk trees using a string-to-chunktree conversion
    function.  Each of these steps can be performed using a default
    function or a custom function.  By default, paragraphs are split
    on blank lines; sentences are listed one per line; and sentences
    are parsed into chunk trees using L{nltk.chunk.tagstr2tree}.
    t    s   
t   gapsc         C   s,   t  i |  | | | ƒ | | | f |  _ d S(   s’   
        @param root: The root directory for this corpus.
        @param fileids: A list or regexp specifying the fileids in this corpus.
        N(   t   CorpusReadert   __init__t   _cv_args(   t   selft   roott   fileidst	   extensiont   str2chunktreet   sent_tokenizert   para_block_readert   encoding(    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyR   "   s    	c         C   sk   | t  j o |  i } n t | t ƒ o | g } n t g  } | D] } | |  i | ƒ i ƒ  qE ~ ƒ S(   sW   
        @return: the given file(s) as a single string.
        @rtype: C{str}
        (   t   Nonet   _fileidst
   isinstancet
   basestringt   concatt   opent   read(   R	   R   t   _[1]t   f(    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   raw1   s    c         C   sR   t  g  } |  i | t ƒ D]. \ } } | t | | d d d d |  i Œ q ~ ƒ S(   s†   
        @return: the given file(s) as a list of words
            and punctuation symbols.
        @rtype: C{list} of C{str}
        i    (   R   t   abspathst   Truet   ChunkedCorpusViewR   (   R	   R   R   R   t   enc(    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   words:   s    
c         C   sR   t  g  } |  i | t ƒ D]. \ } } | t | | d d d d |  i Œ q ~ ƒ S(   sÁ   
        @return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        @rtype: C{list} of (C{list} of C{str})
        i    i   (   R   R   R   R   R   (   R	   R   R   R   R   (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   sentsC   s    
c         C   sR   t  g  } |  i | t ƒ D]. \ } } | t | | d d d d |  i Œ q ~ ƒ S(   sò   
        @return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        @rtype: C{list} of (C{list} of (C{list} of C{str}))
        i    i   (   R   R   R   R   R   (   R	   R   R   R   R   (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   parasM   s    
c         C   sR   t  g  } |  i | t ƒ D]. \ } } | t | | d d d d |  i Œ q ~ ƒ S(   sÀ   
        @return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            C{(word,tag)}.
        @rtype: C{list} of C{(str,str)}
        i   i    (   R   R   R   R   R   (   R	   R   R   R   R   (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   tagged_wordsW   s    
c         C   sR   t  g  } |  i | t ƒ D]. \ } } | t | | d d d d |  i Œ q ~ ƒ S(   sÂ   
        @return: the given file(s) as a list of
            sentences, each encoded as a list of C{(word,tag)} tuples.
            
        @rtype: C{list} of (C{list} of C{(str,str)})
        i   i    (   R   R   R   R   R   (   R	   R   R   R   R   (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   tagged_sentsa   s    
c         C   sR   t  g  } |  i | t ƒ D]. \ } } | t | | d d d d |  i Œ q ~ ƒ S(   s   
        @return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of C{(word,tag)} tuples.
        @rtype: C{list} of (C{list} of (C{list} of C{(str,str)}))
        i   i    (   R   R   R   R   R   (   R	   R   R   R   R   (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   tagged_parask   s    
c         C   sR   t  g  } |  i | t ƒ D]. \ } } | t | | d d d d |  i Œ q ~ ƒ S(   s|  
        @return: the given file(s) as a list of tagged
            words and chunks.  Words are encoded as C{(word, tag)}
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags).  Chunks are encoded as depth-one
            trees over C{(word,tag)} tuples or word strings.
        @rtype: C{list} of (C{(str,str)} and L{Tree})
        i   i    (   R   R   R   R   R   (   R	   R   R   R   R   (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   chunked_wordsu   s    	
c         C   sR   t  g  } |  i | t ƒ D]. \ } } | t | | d d d d |  i Œ q ~ ƒ S(   s@  
        @return: the given file(s) as a list of
            sentences, each encoded as a shallow C{Tree}.  The leaves
            of these trees are encoded as C{(word, tag)} tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        @rtype: C{list} of L{Tree}
        i   i    (   R   R   R   R   R   (   R	   R   R   R   R   (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   chunked_sents   s    	
c         C   sR   t  g  } |  i | t ƒ D]. \ } } | t | | d d d d |  i Œ q ~ ƒ S(   s€  
        @return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow C{Tree}.  The leaves of these
            trees are encoded as C{(word, tag)} tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        @rtype: C{list} of (C{list} of L{Tree})
        i   (   R   R   R   R   R   (   R	   R   R   R   R   (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   chunked_paras   s    	
c         C   s1   g  } t  | ƒ D] } | t i i | ƒ q ~ S(   N(   t   read_blankline_blockt   nltkt   chunkt   tagstr2tree(   R	   t   streamR   t   t(    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   _read_block™   s    (   t   __name__t
   __module__t   __doc__R)   R*   R+   t   RegexpTokenizerR   R(   R   R   R   R   R    R!   R"   R#   R$   R%   R&   R'   R.   (    (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyR      s"   
			




R   c           B   s#   e  Z d  „  Z d „  Z d „  Z RS(   c
   
      C   sY   t  i |  | d | ƒ| |  _ | |  _ | |  _ | |  _ | |  _ | |  _ |	 |  _ d  S(   NR   (	   t   StreamBackedCorpusViewR   t   _taggedt   _group_by_sentt   _group_by_parat   _chunkedt   _str2chunktreet   _sent_tokenizert   _para_block_reader(
   R	   t   fileidR   t   taggedt   group_by_sentt   group_by_parat   chunkedR   R   R   (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyR   ž   s    						c         C   sÝ   g  } xÐ |  i  | ƒ D]¿ } g  } xˆ |  i i | ƒ D]t } |  i | ƒ } |  i p |  i | ƒ } n |  i p | i ƒ  } n |  i o | i	 | ƒ q5 | i
 | ƒ q5 W|  i o | i	 | ƒ q | i
 | ƒ q W| S(   N(   R:   R9   t   tokenizeR8   R4   t   _untagR7   t   leavesR5   t   appendt   extendR6   (   R	   R,   t   blockt   para_strt   parat   sent_strt   sent(    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt
   read_blockª   s$      



c         C   sp   xi t  | ƒ D][ \ } } t | t ƒ o |  i | ƒ q t | t ƒ o | d | | <q t d ƒ ‚ q W| S(   Ni    s"   expected child to be Tree or tuple(   t	   enumerateR   R   RA   t   tuplet
   ValueError(   R	   t   treet   it   child(    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyRA   È   s     (   R/   R0   R   RJ   RA   (    (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyR      s   		(   R1   t   os.patht   ost   codecsR)   t    nltk.corpus.reader.bracket_parseR    t	   nltk.treeR   t   nltk.tokenizet   utilt   apiR   R   R3   R   (    (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pys   <module>   s   


†