³ò
B_Kc        
   @   sþ   d  d k  Z  d  d k Z d  d k l Z l Z d  d k l Z d  d k Td  d k Td  d k	 l
 Z
 e i d ƒ Z e i d ƒ Z e i d ƒ Z e i d	 ƒ Z e i d
 ƒ Z e i d ƒ Z e i d ƒ Z d e f d „  ƒ  YZ d e e
 f d „  ƒ  YZ d S(   iÿÿÿÿN(   t   tokenizet   tree(   t
   deprecated(   t   *(   t   XMLCorpusReaders   <p(?: [^>]*){0,1}>(.*?)</p>s   <s(?: [^>]*){0,1}>(.*?)</s>s#   <([wc](?: [^>]*){0,1}>)(.*?)</[wc]>s!   <[wc](?: [^>]*){0,1}>(.*?)</[wc]>s   type="(.*?)"s   ana="(.*?)"s   text id="(.*?)"t   TEICorpusViewc           B   s2   e  Z e d  e d „ Z d Z d „  Z d „  Z RS(   i    c         C   s>   | |  _  | |  _ | |  _ | |  _ t i |  | d | ƒd  S(   Nt   startpos(   t   _taggedt   _textidst   _group_by_sentt   _group_by_parat   StreamBackedCorpusViewt   __init__(   t   selft   corpus_filet   taggedt   group_by_sentt   group_by_parat   tag_mapping_functiont   headLent   textids(    (    s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pyR   &   s
    				i   c         C   sï  | i  |  i ƒ } t | ƒ } xh | i d ƒ | i d ƒ j p | i d ƒ d j o2 | i ƒ  } t | ƒ d j o Pn | | 7} q! W| i d d ƒ } t i | ƒ } |  i	 oo xl | D]` } | |  i	 j oJ | i
 | ƒ d } | | i
 d ƒ t d ƒ } | |  | | | } q» q» Wn g  } x¿ t i | ƒ D]® }	 g  }
 xw t i |	 ƒ D]f } |  i p t i | ƒ } n t |  i t i | ƒ ƒ } |  i o |
 i | ƒ qU|
 i | ƒ qUW|  i o | i |
 ƒ q9| i |
 ƒ q9W| S(   Ns   <text ids   </text>i    s   
t    i   (   t	   readlinest	   _pagesizet   concatt   countt   readlinet   lent   replacet   TEXTIDt   findallR   t   findt   PARAt   SENTR   t   WORDt   mapt
   _parse_tagt
   TAGGEDWORDR	   t   appendt   extendR
   (   R   t   streamt   blockt   tmpR   t   tidt   begt   endt   outputt   para_strt   parat   sent_strt   sent(    (    s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pyt
   read_block4   sB    "
 "  


c         C   sZ   | \ } } | i  d ƒ o t i | ƒ i d ƒ } n t i | ƒ i d ƒ } | | f S(   Nt   wi   (   t
   startswitht   ANAt   searcht   groupt   TYPE(   R   t   .1t   tagt   word(    (    s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pyR$   Z   s   (   t   __name__t
   __module__t   NoneR   R   R3   R$   (    (    (    s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pyR   %   s
   		&t   Pl196xCorpusReaderc           B   s×   e  Z d  Z d „  Z d „  Z d „  Z e d „ Z d „  Z e e d „ Z	 e e e d „ Z
 e e e d „ Z e e e d	 „ Z e e e d
 „ Z e e e d „ Z e e e d „ Z e e d „ Z e e d „ Z RS(   iÒ
  c         O   sU   d | j o | d |  _  n
 d  |  _  t i |  | Œ t i |  | ƒ |  i ƒ  d  S(   Nt   textid_file(   R   R?   R   R   t   CategorizedCorpusReadert   _init_textids(   R   t   argst   kwargs(    (    s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pyR   f   s
    	c         C   s×   t  t ƒ |  _ t  t ƒ |  _ |  i d  j	 o¥ x¢ |  i |  i ƒ i ƒ  D]„ } | i ƒ  } | i	 d d ƒ \ } } | |  i
 ƒ  j o t d t | f ƒ ‚ n x* | i	 |  i ƒ D] } |  i | | ƒ q± WqG Wn d  S(   Nt    i   s(   In text_id mapping file %s: %s not found(   t   defaultdictt   listt   _f2tt   _t2fR   R?   t   openR   t   stript   splitt   fileidst
   ValueErrort   catfilet
   _delimitert   _add_textids(   R   t   linet   file_idt   text_idst   text_id(    (    s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pyRC   o   s      c         C   s,   |  i  | i | ƒ |  i | i | ƒ d  S(   N(   RI   R&   RJ   (   R   RT   RV   (    (    s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pyRR   |   s    c            s&  d  } | d  j	 o' | p | d  f } q: t d ƒ ‚ n | d  j	 o0 | p ˆ  i | ƒ d  f } qw t d ƒ ‚ n | d  j	 ož | p‡ t | t ƒ o | g } n t ‡  f d †  | Dƒ g  ƒ } t ƒ  } x/ | D]' } t ˆ  i | ƒ t | ƒ @| | <q× W| | f } q"t d ƒ ‚ n d S(   Ns+   Specify only fileids, categories or textidsc         3   s    x |  ] } ˆ  i  | Vq Wd  S(   N(   RJ   (   t   .0t   t(   R   (    s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pys	   <genexpr>   s    (   NN(	   R?   RO   RN   t
   isinstancet
   basestringt   sumt   dictt   setRI   (   R   RN   t
   categoriesR   R*   t   filest   tdictt   f(    (   R   s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pyt   _resolve€   s(    	 %c         C   s   | S(   N(    (   R   R;   (    (    s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pyt
   decode_tag˜   s    c            sv   ˆ  i  | | ƒ \ } } | t j o t ˆ  i ƒ Sn t | t ƒ o | g } n t t ‡  f d †  | Dƒ g  ƒ ƒ S(   sM  
		In the pl196x corpus each category is stored in single
		file and thus both methods provide identical functionality. In order 
		to accommodate finer granularity, a non-standard textids() method was 
		implemented. All the main functions can be supplied with a list 
		of required chunks---giving much more control to the user.
		c         3   s    x |  ] } ˆ  i  | Vq Wd  S(   N(   RI   (   RW   t   d(   R   (    s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pys	   <genexpr>©   s    (   Rb   R?   t   sortedRJ   RY   RZ   R[   (   R   RN   R^   t   _(    (   R   s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pyR   œ   s
    c         C   s÷   |  i  | | | ƒ \ } } | d  j o |  i } n t | t ƒ o | g } n | oT t g  } | D]8 } | t |  i | ƒ t t t d |  i	 d | | ƒqg ~ ƒ SnG t g  } | D]. } | t |  i | ƒ t t t d |  i	 ƒq» ~ ƒ Sd  S(   NR   R   (
   Rb   R?   t   _fileidsRY   RZ   R   R   t   abspatht   FalseR   (   R   RN   R^   R   t   _[1]t   fileidt   _[2](    (    s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pyt   words«   s    
J
c         C   s÷   |  i  | | | ƒ \ } } | d  j o |  i } n t | t ƒ o | g } n | oT t g  } | D]8 } | t |  i | ƒ t t	 t d |  i
 d | | ƒqg ~ ƒ SnG t g  } | D]. } | t |  i | ƒ t t	 t d |  i
 ƒq» ~ ƒ Sd  S(   NR   R   (   Rb   R?   Rg   RY   RZ   R   R   Rh   Ri   t   TrueR   (   R   RN   R^   R   Rj   Rk   Rl   (    (    s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pyt   sents¼   s    
J
c         C   s÷   |  i  | | | ƒ \ } } | d  j o |  i } n t | t ƒ o | g } n | oT t g  } | D]8 } | t |  i | ƒ t t	 t	 d |  i
 d | | ƒqg ~ ƒ SnG t g  } | D]. } | t |  i | ƒ t t	 t	 d |  i
 ƒq» ~ ƒ Sd  S(   NR   R   (   Rb   R?   Rg   RY   RZ   R   R   Rh   Ri   Rn   R   (   R   RN   R^   R   Rj   Rk   Rl   (    (    s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pyt   parasÍ   s    
J
c         C   s÷   |  i  | | | ƒ \ } } | d  j o |  i } n t | t ƒ o | g } n | oT t g  } | D]8 } | t |  i | ƒ t t	 t	 d |  i
 d | | ƒqg ~ ƒ SnG t g  } | D]. } | t |  i | ƒ t t	 t	 d |  i
 ƒq» ~ ƒ Sd  S(   NR   R   (   Rb   R?   Rg   RY   RZ   R   R   Rh   Rn   Ri   R   (   R   RN   R^   R   Rj   Rk   Rl   (    (    s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pyt   tagged_wordsÞ   s    
J
c         C   s÷   |  i  | | | ƒ \ } } | d  j o |  i } n t | t ƒ o | g } n | oT t g  } | D]8 } | t |  i | ƒ t t t	 d |  i
 d | | ƒqg ~ ƒ SnG t g  } | D]. } | t |  i | ƒ t t t	 d |  i
 ƒq» ~ ƒ Sd  S(   NR   R   (   Rb   R?   Rg   RY   RZ   R   R   Rh   Rn   Ri   R   (   R   RN   R^   R   Rj   Rk   Rl   (    (    s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pyt   tagged_sentsï   s    
J
c         C   s÷   |  i  | | | ƒ \ } } | d  j o |  i } n t | t ƒ o | g } n | oT t g  } | D]8 } | t |  i | ƒ t t t d |  i	 d | | ƒqg ~ ƒ SnG t g  } | D]. } | t |  i | ƒ t t t d |  i	 ƒq» ~ ƒ Sd  S(   NR   R   (
   Rb   R?   Rg   RY   RZ   R   R   Rh   Rn   R   (   R   RN   R^   R   Rj   Rk   Rl   (    (    s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pyt   tagged_paras   s    
J
c         C   sS   |  i  | | ƒ \ } } t | ƒ d j o t i |  | d ƒ Sn t d ƒ ‚ d  S(   Ni   i    s   Expected a single file(   Rb   R   R   t   xmlt	   TypeError(   R   RN   R^   Rf   (    (    s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pyRt     s    +c         C   sƒ   |  i  | | ƒ \ } } | d  j o |  i } n t | t ƒ o | g } n t g  } | D] } | |  i | ƒ i ƒ  q] ~ ƒ S(   N(   Rb   R?   Rg   RY   RZ   R   RK   t   read(   R   RN   R^   Rf   Rj   Ra   (    (    s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pyt   raw  s    (   R=   R>   R   R   RC   RR   R?   Rb   Rc   R   Rm   Ro   Rp   Rq   Rr   Rs   Rt   Rw   (    (    (    s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pyR@   b   s   					(   t   ost   ret   nltkR    R   t   nltk.internalsR   t   utilt   apit   xmldocsR   t   compileR    R!   R%   R"   R9   R6   R   R   R   RB   R@   (    (    (    s/   /p/zhu/06/nlp/nltk/nltk/corpus/reader/pl196x.pys   <module>   s   

=