³ò
4ÒÇIc           @   sœ   d  Z  d d k Z d d k Z d d k l Z d d k l Z d d k Td d k	 l
 Z
 d d k Td d k Td e f d „  ƒ  YZ d	 e f d
 „  ƒ  YZ d S(   sN   
A reader for corpora that contain chunked (and optionally tagged)
documents.
iÿÿÿÿN(   t   BracketParseCorpusReader(   t   Tree(   t   *(   t   chunkt   ChunkedCorpusReaderc           B   s¶   e  Z d  Z d e i e d d e ƒe e d „ Z	 e d „ Z
 e d „ Z e d „ Z e d „ Z e d	 „ Z e d
 „ Z e d „ Z e d „ Z e d „ Z e d „ Z d „  Z RS(   s   
    Reader for chunked (and optionally tagged) corpora.  Paragraphs
    are split using a block reader.  They are then tokenized into
    sentences using a sentence tokenizer.  Finally, these sentences
    are parsed into chunk trees using a string-to-chunktree conversion
    function.  Each of these steps can be performed using a default
    function or a custom function.  By default, paragraphs are split
    on blank lines; sentences are listed one per line; and sentences
    are parsed into chunk trees using L{chunk.tagstr2tree}.
    t    s   
t   gapsc         C   s,   t  i |  | | | ƒ | | | f |  _ d S(   s’   
        @param root: The root directory for this corpus.
        @param fileids: A list or regexp specifying the fileids in this corpus.
        N(   t   CorpusReadert   __init__t   _cv_args(   t   selft   roott   fileidst	   extensiont   str2chunktreet   sent_tokenizert   para_block_readert   encoding(    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyR   #   s    	c         C   sk   | t  j o |  i } n t | t ƒ o | g } n t g  } | D] } | |  i | ƒ i ƒ  qE ~ ƒ S(   sW   
        @return: the given file(s) as a single string.
        @rtype: C{str}
        (   t   Nonet   _fileidst
   isinstancet
   basestringt   concatt   opent   read(   R
   R   t   _[1]t   f(    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   raw2   s    c         C   sR   t  g  } |  i | t ƒ D]. \ } } | t | | d d d d |  i Œ q ~ ƒ S(   s†   
        @return: the given file(s) as a list of words
            and punctuation symbols.
        @rtype: C{list} of C{str}
        i    (   R   t   abspathst   Truet   ChunkedCorpusViewR	   (   R
   R   R   R   t   enc(    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   words;   s    
c         C   sR   t  g  } |  i | t ƒ D]. \ } } | t | | d d d d |  i Œ q ~ ƒ S(   sÁ   
        @return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        @rtype: C{list} of (C{list} of C{str})
        i    i   (   R   R   R   R   R	   (   R
   R   R   R   R   (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   sentsD   s    
c         C   sR   t  g  } |  i | t ƒ D]. \ } } | t | | d d d d |  i Œ q ~ ƒ S(   sò   
        @return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        @rtype: C{list} of (C{list} of (C{list} of C{str}))
        i    i   (   R   R   R   R   R	   (   R
   R   R   R   R   (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   parasN   s    
c         C   sR   t  g  } |  i | t ƒ D]. \ } } | t | | d d d d |  i Œ q ~ ƒ S(   sÀ   
        @return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            C{(word,tag)}.
        @rtype: C{list} of C{(str,str)}
        i   i    (   R   R   R   R   R	   (   R
   R   R   R   R   (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   tagged_wordsX   s    
c         C   sR   t  g  } |  i | t ƒ D]. \ } } | t | | d d d d |  i Œ q ~ ƒ S(   sÂ   
        @return: the given file(s) as a list of
            sentences, each encoded as a list of C{(word,tag)} tuples.
            
        @rtype: C{list} of (C{list} of C{(str,str)})
        i   i    (   R   R   R   R   R	   (   R
   R   R   R   R   (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   tagged_sentsb   s    
c         C   sR   t  g  } |  i | t ƒ D]. \ } } | t | | d d d d |  i Œ q ~ ƒ S(   s   
        @return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of C{(word,tag)} tuples.
        @rtype: C{list} of (C{list} of (C{list} of C{(str,str)}))
        i   i    (   R   R   R   R   R	   (   R
   R   R   R   R   (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   tagged_parasl   s    
c         C   sR   t  g  } |  i | t ƒ D]. \ } } | t | | d d d d |  i Œ q ~ ƒ S(   s|  
        @return: the given file(s) as a list of tagged
            words and chunks.  Words are encoded as C{(word, tag)}
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags).  Chunks are encoded as depth-one
            trees over C{(word,tag)} tuples or word strings.
        @rtype: C{list} of (C{(str,str)} and L{Tree})
        i   i    (   R   R   R   R   R	   (   R
   R   R   R   R   (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   chunked_wordsv   s    	
c         C   sR   t  g  } |  i | t ƒ D]. \ } } | t | | d d d d |  i Œ q ~ ƒ S(   s@  
        @return: the given file(s) as a list of
            sentences, each encoded as a shallow C{Tree}.  The leaves
            of these trees are encoded as C{(word, tag)} tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        @rtype: C{list} of L{Tree}
        i   i    (   R   R   R   R   R	   (   R
   R   R   R   R   (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   chunked_sents‚   s    	
c         C   sR   t  g  } |  i | t ƒ D]. \ } } | t | | d d d d |  i Œ q ~ ƒ S(   s€  
        @return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow C{Tree}.  The leaves of these
            trees are encoded as C{(word, tag)} tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        @rtype: C{list} of (C{list} of L{Tree})
        i   (   R   R   R   R   R	   (   R
   R   R   R   R   (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   chunked_parasŽ   s    	
c         C   s.   g  } t  | ƒ D] } | t i | ƒ q ~ S(   N(   t   read_blankline_blockR   t   tagstr2tree(   R
   t   streamR   t   t(    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt   _read_blockš   s    (   t   __name__t
   __module__t   __doc__R   R*   t   RegexpTokenizerR   R)   R   R   R   R    R!   R"   R#   R$   R%   R&   R'   R(   R-   (    (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyR      s"   
		




R   c           B   s#   e  Z d  „  Z d „  Z d „  Z RS(   c
   
      C   sY   t  i |  | d | ƒ| |  _ | |  _ | |  _ | |  _ | |  _ | |  _ |	 |  _ d  S(   NR   (	   t   StreamBackedCorpusViewR   t   _taggedt   _group_by_sentt   _group_by_parat   _chunkedt   _str2chunktreet   _sent_tokenizert   _para_block_reader(
   R
   t   fileidR   t   taggedt   group_by_sentt   group_by_parat   chunkedR   R   R   (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyR   Ÿ   s    						c         C   sÝ   g  } xÐ |  i  | ƒ D]¿ } g  } xˆ |  i i | ƒ D]t } |  i | ƒ } |  i p |  i | ƒ } n |  i p | i ƒ  } n |  i o | i	 | ƒ q5 | i
 | ƒ q5 W|  i o | i	 | ƒ q | i
 | ƒ q W| S(   N(   R9   R8   t   tokenizeR7   R3   t   _untagR6   t   leavesR4   t   appendt   extendR5   (   R
   R+   t   blockt   para_strt   parat   sent_strt   sent(    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyt
   read_block«   s$      



c         C   sp   xi t  | ƒ D][ \ } } t | t ƒ o |  i | ƒ q t | t ƒ o | d | | <q t d ƒ ‚ q W| S(   Ni    s"   expected child to be Tree or tuple(   t	   enumerateR   R   R@   t   tuplet
   ValueError(   R
   t   treet   it   child(    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyR@   É   s     (   R.   R/   R   RI   R@   (    (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pyR   ž   s   		(   R0   t   os.patht   ost   codecst    nltk.corpus.reader.bracket_parseR    t	   nltk.treeR   t   nltk.tokenizet   nltkR   t   utilt   apiR   R   R2   R   (    (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/chunked.pys   <module>   s   


†