³ò
B_Kc           @   sÌ   d  Z  d d k Z y d d k l Z Wn# e j
 o d d k l Z n Xd d k l Z d d k	 l
 Z
 d d k l Z l Z d d k l Z d d	 k Td
 e f d „  ƒ  YZ d e f d „  ƒ  YZ d S(   s‚   
Corpus reader for corpora whose documents are xml files.

(note -- not named 'xml' to avoid conflicting w/ standard xml package)
iÿÿÿÿN(   t   cElementTree(   t   ElementTree(   t   SeekableUnicodeStreamReader(   t   WordPunctTokenizer(   t
   deprecatedt   ElementWrapper(   t   CorpusReader(   t   *t   XMLCorpusReaderc           B   sY   e  Z d  Z e d „ Z e d „ Z e d „ Z e d „ Z e	 d ƒ e d d „ ƒ Z
 RS(   s  
    Corpus reader for corpora whose documents are xml files.

    Note that the C{XMLCorpusReader} constructor does not take an
    C{encoding} argument, because the unicode encoding is specified by
    the XML files themselves.  See the XML specs for more info.
    c         C   s    | |  _  t i |  | | ƒ d  S(   N(   t   _wrap_etreeR   t   __init__(   t   selft   roott   fileidst
   wrap_etree(    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/xmldocs.pyR
   #   s    	c         C   s–   | d  j o' t |  i ƒ d j o |  i d } n t | t ƒ p t d ƒ ‚ n t i |  i | ƒ i	 ƒ  ƒ i
 ƒ  } |  i o t | ƒ } n | S(   Ni   i    s(   Expected a single file identifier string(   t   Nonet   lent   _fileidst
   isinstancet
   basestringt	   TypeErrorR   t   parset   abspatht   opent   getrootR	   R   (   R   t   fileidt   elt(    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/xmldocs.pyt   xml'   s    #$
c   	      C   su   |  i  | ƒ } t ƒ  } | i ƒ  } g  } xD | D]< } | i } | t j	 o  | i | ƒ } | i | ƒ q1 q1 W| S(   sM  
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored. Like the xml() method,
        fileid can only specify one file.

        @return: the given file's text nodes as a list of words and punctuation symbols
        @rtype: C{list} of C{str}
        (   R   R   t   getiteratort   textR   t   tokenizet   extend(	   R   R   R   t   word_tokenizert   iteratort   outt   nodeR   t   toks(    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/xmldocs.pyt   words5   s    
	 	c         C   sk   | d  j o |  i } n t | t ƒ o | g } n t g  } | D] } | |  i | ƒ i ƒ  qE ~ ƒ S(   N(   R   R   R   R   t   concatR   t   read(   R   R   t   _[1]t   f(    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/xmldocs.pyt   rawK   s    s   Use .raw() or .xml() instead.R   c         C   sP   | d j o |  i  | ƒ Sn | d j o |  i | ƒ Sn t d | ƒ ‚ d  S(   NR*   R   s   bad format %r(   R*   R   t
   ValueError(   R   t   itemst   format(    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/xmldocs.pyR'   Q   s    (   t   __name__t
   __module__t   __doc__t   FalseR
   R   R   R%   R*   R   R'   (    (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/xmldocs.pyR      s   	t   XMLCorpusViewc           B   s—   e  Z d  Z e Z d Z e d „ Z d „  Z d „  Z	 e
 i d e
 i e
 i Bƒ Z e
 i d ƒ Z e
 i d e
 i e
 i Bƒ Z d „  Z e e d	 „ Z RS(
   sg  
    A corpus view that selects out specified elements from an XML
    file, and provides a flat list-like interface for accessing them.
    (Note: C{XMLCorpusView} is not used by L{XMLCorpusReader} itself,
    but may be used by subclasses of L{XMLCorpusReader}.)
    
    Every XML corpus view has a X{tag specification}, indicating what
    XML elements should be included in the view; and each (non-nested)
    element that matches this specification corresponds to one item in
    the view.  Tag specifications are regular expressions over tag
    paths, where a tag path is a list of element tag names, separated
    by '/', indicating the ancestry of the element.  Some examples:

      - C{'foo'}: A top-level element whose tag is C{foo}.
      - C{'foo/bar'}: An element whose tag is C{bar} and whose parent
        is a top-level element whose tag is C{foo}.
      - C{'.*/foo'}: An element whose tag is C{foo}, appearing anywhere
        in the xml tree.
      - C{'.*/(foo|bar)'}: An wlement whose tag is C{foo} or C{bar},
        appearing anywhere in the xml tree.
    
    The view items are generated from the selected XML elements via
    the method L{handle_elt()}.  By default, this method returns the
    element as-is (i.e., as an ElementTree object); but it can be
    overridden, either via subclassing or via the C{elt_handler}
    constructor parameter.
    i   c         C   se   | o | |  _  n t i | d ƒ |  _ h  d d <|  _ |  i | ƒ } t i |  | d | ƒd S(   s[  
        Create a new corpus view based on a specified XML file.

        Note that the C{XMLCorpusView} constructor does not take an
        C{encoding} argument, because the unicode encoding is
        specified by the XML files themselves.
    
        @type tagspec: C{str}
        @param tagspec: A tag specification, indicating what XML
            elements should be included in the view.  Each non-nested
            element that matches this specification corresponds to one
            item in the view.

        @param elt_handler: A function used to transform each element
            to a value for the view.  If no handler is specified, then
            L{self.handle_elt()} is called, which returns the element
            as an ElementTree object.  The signature of elt_handler is::

                elt_handler(elt, tagspec) -> value
        s   \Zi    t   encodingN(    (   t
   handle_eltt   ret   compilet   _tagspect   _tag_contextt   _detect_encodingt   StreamBackedCorpusViewR
   (   R   R   t   tagspect   elt_handlerR3   (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/xmldocs.pyR
   |   s
    c         C   s  t  | t ƒ o | i ƒ  i ƒ  } n t | d ƒ i ƒ  } | i t i ƒ o d Sn | i t i ƒ o d Sn | i t i ƒ o d Sn | i t i	 ƒ o d Sn | i t i
 ƒ o d Sn t i d | ƒ } | o | i d ƒ Sn t i d	 | ƒ } | o | i d ƒ Sn d S(
   Nt   rbs	   utf-16-bes	   utf-16-les	   utf-32-bes	   utf-32-les   utf-8s    \s*<?xml\b.*\bencoding="([^"]+)"i   s    \s*<?xml\b.*\bencoding='([^']+)'(   R   t   PathPointerR   t   readlinet
   startswitht   codecst   BOM_UTF16_BEt   BOM_UTF16_LEt   BOM_UTF32_BEt   BOM_UTF32_LEt   BOM_UTF8R5   t   matcht   group(   R   R   t   st   m(    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/xmldocs.pyR9   Ÿ   s$    c         C   s   | S(   s  
        Convert an element into an appropriate value for inclusion in
        the view.  Unless overridden by a subclass or by the
        C{elt_handler} constructor argument, this method simply
        returns C{elt}.

        @return: The view value corresponding to C{elt}.

        @type elt: C{ElementTree}
        @param elt: The element that should be converted.

        @type context: C{str}
        @param context: A string composed of element tags separated by
            forward slashes, indicating the XML context of the given
            element.  For example, the string C{'foo/bar/baz'}
            indicates that the element is a C{baz} element whose
            parent is a C{bar} element and whose grandparent is a
            top-level C{foo} element.
        (    (   R   R   t   context(    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/xmldocs.pyR4   µ   s    s;  
        [^<]*
        (
          ((<!--.*?-->)                         |  # comment
           (<![CDATA[.*?]])                     |  # raw character data
           (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) |  # doctype decl
           (<[^>]*>))                              # tag or PI
          [^<]*)*
        \Zs   <\s*/?\s*([^\s>]+)s6  
        # Include these so we can skip them:
        (?P<COMMENT>        <!--.*?-->                          )|
        (?P<CDATA>          <![CDATA[.*?]]>                     )|
        (?P<PI>             <\?.*?\?>                           )|
        (?P<DOCTYPE>        <!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>  )|
        # These are the ones we actually care about:
        (?P<EMPTY_ELT_TAG>  <\s*[^>/\?!\s][^>]*/\s*>            )|
        (?P<START_TAG>      <\s*[^>/\?!\s][^>]*>                )|
        (?P<END_TAG>        <\s*/[^>/\?!\s][^>]*>               )c         C   su  d } xht  o`t | t ƒ o | i ƒ  } n | i |  i ƒ } | | 7} |  i i | ƒ o | Sn t i	 d | ƒ i
 d ƒ d j o@ | i ƒ  t | ƒ t i	 d | ƒ i ƒ  } t d | ƒ ‚ n | p t d ƒ ‚ n | i d ƒ } | d j op |  i i | |  ƒ oU t | t ƒ o | i | ƒ | i | ƒ n | i t | ƒ | d ƒ | |  Sqlq	 q	 Wd	 S(
   sz  
        Read a string from the given stream that does not contain any
        un-closed tags.  In particular, this function first reads a
        block from the stream of size L{self._BLOCK_SIZE}.  It then
        checks if that block contains an un-closed tag.  If it does,
        then this function either backtracks to the last '<', or reads
        another block.
        t    s   [<>]i    t   >s   Unexpected ">" near char %ss&   Unexpected end of file: tag not closedt   <i   N(   t   TrueR   R   t   tellR'   t   _BLOCK_SIZEt   _VALID_XML_RERG   R5   t   searchRH   R   t   endR+   t   rfindt   seekt   char_seek_forward(   R   t   streamt   fragmentt   startpost	   xml_blockt   post   last_open_bracket(    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/xmldocs.pyt   _read_xml_fragmentì   s,    	

"	#c         C   s§  | t  j o |  i } n | t  j o |  i } n t |  i i | i ƒ  ƒ ƒ } | t  j	 p t ‚ g  } t  } t  } d } x| g  j p | t  j	 ott | t	 ƒ o | i ƒ  }	 n |  i
 | ƒ }
 |
 p" | t  j o Pqó t d ƒ ‚ n x]|  i i |
 ƒ D]I} |  i o& d d i | ƒ d | i ƒ  f GHn | i d ƒ o{ |  i i | i ƒ  ƒ i d ƒ } | i | ƒ | t  j o< t i | d i | ƒ ƒ o | i ƒ  } t | ƒ } qÃqOq| i d ƒ oà |  i i | i ƒ  ƒ i d ƒ } | p t d	 | ƒ ‚ n | | d
 j o t d | d
 | f ƒ ‚ n | t  j	 oZ | t | ƒ j oG | |
 | | i ƒ  !7} | i | d i | ƒ f ƒ t  } } d } n | i ƒ  q| i d ƒ oˆ |  i i | i ƒ  ƒ i d ƒ } | t  j oV t i | d i | ƒ d | ƒ o. | i | i ƒ  d i | ƒ d | f ƒ qKqOqqW| t  j	 o« | g  j o | |
 | 7} d } q|  i o d d d GHn t | t	 ƒ o | i |	 ƒ | i | ƒ n | i t |
 ƒ | d ƒ | | d  } t  } } d } q q W| i ƒ  } | |  i j o% t | ƒ |  i | j p t ‚ n t | ƒ |  i | <g  } | D]1 \ } } | | t i | i d d ƒ ƒ | ƒ qo~ S(   s¹   
        Read from C{stream} until we find at least one element that
        matches C{tagspec}, and return the result of applying
        C{elt_handler} to each element found.
        RL   s   Unexpected end of files   %25s %st   /iìÿÿÿt	   START_TAGi   t   END_TAGs   Unmatched tag </%s>iÿÿÿÿs   Unmatched tag <%s>...</%s>t   EMPTY_ELT_TAGi    t    i$   s   (backtrack)t   asciit   xmlcharrefreplace(   R   R7   R4   t   listR8   t   getRP   t   AssertionErrorR   R   R^   R+   t
   _XML_PIECEt   finditert   _DEBUGt   joinRH   t   _XML_TAG_NAMERG   t   appendR5   t   startR   RT   t   popRV   RW   t   tupleR   t
   fromstringt   encode(   R   RX   R;   R<   RK   t   eltst	   elt_startt	   elt_deptht   elt_textRZ   t   xml_fragmentt   piecet   nameR\   R(   R   (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/xmldocs.pyt
   read_block  s€     
&!! 

!$+


%(   R.   R/   R0   R1   Rk   RQ   R   R
   R9   R4   R5   R6   t   DOTALLt   VERBOSERR   Rm   Ri   R^   R{   (    (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/xmldocs.pyR2   X   s   #				0(   R0   RA   t	   xml.etreeR    R   t   ImportErrort
   nltk.etreet	   nltk.dataR   t   nltk.tokenizeR   t   nltk.internalsR   R   t   nltk.corpus.reader.apiR   t   nltk.corpus.reader.utilR   R:   R2   (    (    (    s0   /p/zhu/06/nlp/nltk/nltk/corpus/reader/xmldocs.pys   <module>   s   #
=