³ò
B_Kc           @   sš   d  Z  d Z d d k Z d d k i i Z d d k Td d k Td d k	 Td e
 f d „  ƒ  YZ d d „ Z d e f d	 „  ƒ  YZ d
 e f d „  ƒ  YZ d S(   sC   
Corpus reader for the XML version of the British National Corpus.
s
   epytext eniÿÿÿÿN(   t   *t   BNCCorpusReaderc           B   sq   e  Z d  Z e d „ Z e e e d „ Z e e e e d „ Z e e e d „ Z	 e e e e d „ Z
 d „  Z RS(   s  
    Corpus reader for the XML version of the British National Corpus.
    For access to the complete XML data structure, use the L{xml()}
    method.  For access to simple word lists and tagged word lists, use
    L{words()}, L{sents()}, L{tagged_words()}, and L{tagged_sents()}.
    c         C   s    t  i |  | | ƒ | |  _ d  S(   N(   t   XMLCorpusReadert   __init__t   _lazy(   t   selft   roott   fileidst   lazy(    (    s,   /p/zhu/06/nlp/nltk/nltk/corpus/reader/bnc.pyR      s    c      
   C   s•   |  i  oD t g  } |  i | ƒ D] } | t | t d | | ƒ q! ~ ƒ SnD t g  } |  i | ƒ D]" } | |  i | t d | | ƒ qe ~ ƒ Sd S(   sd  
        @return: the given file(s) as a list of words
            and punctuation symbols.
        @rtype: C{list} of C{str}
        
        @param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        @param stem: If true, then use word stems instead of word strings.
        N(   R   t   concatt   abspathst   BNCWordViewt   Falset   Nonet   _words(   R   R   t   strip_spacet   stemt   _[1]t   fileidt   _[2](    (    s,   /p/zhu/06/nlp/nltk/nltk/corpus/reader/bnc.pyt   words    s
    


:
c   	   
   C   s¬   | o
 d } n d } |  i  oD t g  } |  i | ƒ D] } | t | t | | | ƒ q8 ~ ƒ SnD t g  } |  i | ƒ D]" } | |  i | t | | | ƒ q| ~ ƒ Sd S(   s*  
        @return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            C{(word,tag)}.
        @rtype: C{list} of C{(str,str)}
        
        @param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        @param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        @param stem: If true, then use word stems instead of word strings.
        t   c5t   posN(   R   R	   R
   R   R   R   (	   R   R   R   R   R   t   tagR   R   R   (    (    s,   /p/zhu/06/nlp/nltk/nltk/corpus/reader/bnc.pyt   tagged_words3   s    

:
c      
   C   s•   |  i  oD t g  } |  i | ƒ D] } | t | t d | | ƒ q! ~ ƒ SnD t g  } |  i | ƒ D]" } | |  i | t d | | ƒ qe ~ ƒ Sd S(   sŸ  
        @return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        @rtype: C{list} of (C{list} of C{str})
        
        @param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        @param stem: If true, then use word stems instead of word strings.
        N(   R   R	   R
   R   t   TrueR   R   (   R   R   R   R   R   R   R   (    (    s,   /p/zhu/06/nlp/nltk/nltk/corpus/reader/bnc.pyt   sentsI   s
    

:
c   	   
   C   s¬   | o
 d } n d } |  i  oD t g  } |  i | ƒ D] } | t | t | | | ƒ q8 ~ ƒ SnD t g  } |  i | ƒ D]" } | |  i | t | | | ƒ q| ~ ƒ Sd S(   s#  
        @return: the given file(s) as a list of
            sentences, each encoded as a list of C{(word,tag)} tuples.
        @rtype: C{list} of (C{list} of C{(str,str)})
            
        @param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        @param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        @param stem: If true, then use word stems instead of word strings.
        R   R   N(   R   R	   R
   R   R   R   (	   R   R   R   R   R   R   R   R   R   (    (    s,   /p/zhu/06/nlp/nltk/nltk/corpus/reader/bnc.pyt   tagged_sents[   s    

:
c         C   sY  g  } t  i | ƒ i ƒ  } x#| i d ƒ D]} g  }	 xÎ t | ƒ D]À }
 |
 i } | p
 d } n | p | o | i ƒ  } n | o |
 i d | ƒ } n | d j o | |
 i d ƒ f } n3 | d j o% | |
 i d |
 i d ƒ ƒ f } n |	 i | ƒ qD W| o! | i t	 | i
 d |	 ƒ ƒ q+ | i |	 ƒ q+ Wt | j p t ‚ | S(   sØ  
        Helper used to implement the view methods -- returns a list of
        words or a list of sentences, optionally tagged.
        
        @param fileid: The name of the underlying file.
        @param bracket_sent: If true, include sentence bracketing.
        @param tag: The name of the tagset to use, or None for no tags.
        @param strip_space: If true, strip spaces from word tokens.
        @param stem: If true, then substitute stems for words.
        s   .//st    t   hwR   R   t   n(   t   ElementTreet   parset   getroott   findallt   _all_xmlwords_int   textt   stript   gett   appendt   BNCSentencet   attribt   extendR   t   AssertionError(   R   R   t   bracket_sentR   R   R   t   resultt   xmldoct   xmlsentt   sentt   xmlwordt   word(    (    s,   /p/zhu/06/nlp/nltk/nltk/corpus/reader/bnc.pyR   q   s,      	
%!(   t   __name__t
   __module__t   __doc__R   R   R   R   R   R   R   R   R   (    (    (    s,   /p/zhu/06/nlp/nltk/nltk/corpus/reader/bnc.pyR      s   	c         C   sZ   | d  j o
 g  } n x< |  D]4 } | i d j o | i | ƒ q t | | ƒ q W| S(   Nt   ct   w(   R6   R7   (   R   R   R'   R#   (   t   eltR-   t   child(    (    s,   /p/zhu/06/nlp/nltk/nltk/corpus/reader/bnc.pyR#   ”   s     !R(   c           B   s   e  Z d  Z d „  Z RS(   s‰   
    A list of words, augmented by an attribute C{num} used to record
    the sentence identifier (the C{n} attribute from the XML).
    c         C   s   | |  _  t i |  | ƒ d  S(   N(   t   numt   listR   (   R   R:   t   items(    (    s,   /p/zhu/06/nlp/nltk/nltk/corpus/reader/bnc.pyR       s    	(   R3   R4   R5   R   (    (    (    s,   /p/zhu/06/nlp/nltk/nltk/corpus/reader/bnc.pyR(   ›   s   R   c           B   sS   e  Z d  Z d „  Z e Z e Z e Z e Z d „  Z	 d „  Z
 d „  Z d „  Z RS(   sN   
    A stream backed corpus view specialized for use with the BNC corpus.
    c         C   s‘   | o
 d } n d } | |  _  | |  _ | |  _ | |  _ t i |  | | ƒ |  i ƒ  |  i |  i d |  i	 ƒ |  i
 ƒ  h  d d <|  _ d S(   sG  
        @param fileid: The name of the underlying file.
        @param sent: If true, include sentence bracketing.
        @param tag: The name of the tagset to use, or None for no tags.
        @param strip_space: If true, strip spaces from word tokens.
        @param stem: If true, then substitute stems for words.
        s   .*/ss   .*/s/(.*/)?(c|w)s   .*/teiHeader$i    N(    (   t   _sentt   _tagt   _strip_spacet   _stemt   XMLCorpusViewR   t   _opent
   read_blockt   _streamt   handle_headert   closet   _tag_context(   R   R   R0   R   R   R   t   tagspec(    (    s,   /p/zhu/06/nlp/nltk/nltk/corpus/reader/bnc.pyR   ¨   s    				

c         C   sh  | i  d ƒ } | o: d i g  } | D] } | | i i ƒ  q' ~ ƒ |  _ n | i  d ƒ } | o: d i g  } | D] } | | i i ƒ  qw ~ ƒ |  _ n | i  d ƒ }	 |	 o: d i g  }
 |	 D] } |
 | i i ƒ  qÇ ~
 ƒ |  _ n | i  d ƒ } | o^ d i g  } | D]: } | d i g  } | D] } | | i i ƒ  q1~ ƒ q~ ƒ |  _ n d  S(   Ns   titleStmt/titles   
s   titleStmt/authors   titleStmt/editors   titleStmt/respStmts   

(   R"   t   joinR$   R%   t   titlet   authort   editort   resps(   R   R8   t   contextt   titlesR   RJ   t   authorsR   RK   t   editorst   _[3]RL   RM   t   _[4]t   respt   _[5]t   resp_elt(    (    s,   /p/zhu/06/nlp/nltk/nltk/corpus/reader/bnc.pyRE   Ç   s    444c         C   s,   |  i  o |  i | ƒ Sn |  i | ƒ Sd  S(   N(   R=   t   handle_sentt   handle_word(   R   R8   RN   (    (    s,   /p/zhu/06/nlp/nltk/nltk/corpus/reader/bnc.pyt
   handle_eltÚ   s    c         C   sÀ   | i  } | p
 d } n |  i p
 |  i o | i ƒ  } n |  i o | i d | ƒ } n |  i d j o | | i d ƒ f } n6 |  i d j o% | | i d | i d ƒ ƒ f } n | S(   NR   R   R   R   (   R$   R?   R@   R%   R&   R>   (   R   R8   R2   (    (    s,   /p/zhu/06/nlp/nltk/nltk/corpus/reader/bnc.pyRX   Þ   s    	

%c         C   sª   g  } x | D]… } | i  d j o2 | g  } | D] } | |  i | ƒ q1 ~ 7} q | i  d j o | i |  i | ƒ ƒ q t d | i  ƒ ‚ q Wt | i d | ƒ S(   Nt   mwR7   R6   s   Unexpected element %sR   (   R7   R6   (   R   RX   R'   t
   ValueErrorR(   R)   (   R   R8   R0   R9   R   R7   (    (    s,   /p/zhu/06/nlp/nltk/nltk/corpus/reader/bnc.pyRW   ì   s     2(   R3   R4   R5   R   R   RJ   RK   RL   RM   RE   RY   RX   RW   (    (    (    s,   /p/zhu/06/nlp/nltk/nltk/corpus/reader/bnc.pyR   ¤   s   				(   R5   t   __docformat__t   ret   nltk.etree.ElementTreet   etreeR   t   ETt   apit   utilt   xmldocsR   R   R   R#   R;   R(   RA   R   (    (    (    s,   /p/zhu/06/nlp/nltk/nltk/corpus/reader/bnc.pys   <module>
   s   


	