³ò
B_Kc           @   sÇ   d  Z  d d k Z d d k Z d d k Z d d k l Z d d k Td d k l	 Z	 d d k
 l Z d d k Td d k Td e f d „  ƒ  YZ d	 e f d
 „  ƒ  YZ d e f d „  ƒ  YZ d „  Z d S(   s  
Read from the Senseval 2 Corpus.

SENSEVAL [http://www.senseval.org/]
Evaluation exercises for Word Sense Disambiguation.
Organized by ACL-SIGLEX [http://www.siglex.org/]

Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
http://www.d.umn.edu/~tpederse/data.html
Distributed with permission.

The NLTK version of the Senseval 2 files uses well-formed XML.
Each instance of the ambiguous words "hard", "interest", "line", and "serve"
is tagged with a sense identifier, and supplied with context.
iÿÿÿÿN(   t   XMLCorpusReader(   t   *(   t   ElementTree(   t
   deprecatedt   SensevalInstancec           B   s   e  Z d  „  Z d „  Z RS(   c         C   s.   | |  _  t | ƒ |  _ | |  _ | |  _ d  S(   N(   t   wordt   tuplet   sensest   positiont   context(   t   selfR   R   R	   R   (    (    s1   /p/zhu/06/nlp/nltk/nltk/corpus/reader/senseval.pyt   __init__&   s    		c         C   s    d |  i  |  i |  i |  i f S(   Ns=   SensevalInstance(word=%r, position=%r, context=%r, senses=%r)(   R   R   R	   R   (   R
   (    (    s1   /p/zhu/06/nlp/nltk/nltk/corpus/reader/senseval.pyt   __repr__+   s    (   t   __name__t
   __module__R   R   (    (    (    s1   /p/zhu/06/nlp/nltk/nltk/corpus/reader/senseval.pyR   %   s   	t   SensevalCorpusReaderc           B   sV   e  Z e d  „ Z e d „ Z d „  Z e d ƒ d d „ ƒ Z e d ƒ d „  ƒ Z RS(   c         C   s@   t  g  } |  i | t ƒ D] \ } } | t | | ƒ q ~ ƒ S(   N(   t   concatt   abspathst   Truet   SensevalCorpusView(   R
   t   fileidst   _[1]t   fileidt   enc(    (    s1   /p/zhu/06/nlp/nltk/nltk/corpus/reader/senseval.pyt	   instances1   s    
c         C   sk   | t  j o |  i } n t | t ƒ o | g } n t g  } | D] } | |  i | ƒ i ƒ  qE ~ ƒ S(   sV   
        @return: the text contents of the given fileids, as a single string.
        (   t   Nonet   _fileidst
   isinstancet
   basestringR   t   opent   read(   R
   R   R   t   f(    (    s1   /p/zhu/06/nlp/nltk/nltk/corpus/reader/senseval.pyt   raw5   s    c   	      C   s—   g  } xŠ | i  d ƒ D]y } xp | i  d ƒ D]_ } | d i d } g  } | d D] } | | i | i d f qR ~ } | i | | f ƒ q, Wq W| S(   Nt   lexeltt   instancei    t   senseidi   t   pos(   t   findallt   attribt   textt   append(	   R
   t   treet   eltsR!   t   instt   senseR   t   wR	   (    (    s1   /p/zhu/06/nlp/nltk/nltk/corpus/reader/senseval.pyt   _entry=   s      .s#   Use .instances() or .raw() instead.t   listedc         C   sP   | d j o |  i  | ƒ Sn | d j o |  i | ƒ Sn t d | ƒ ‚ d  S(   NR/   R    s   bad format %r(   R   R    t
   ValueError(   R
   t   itemst   format(    (    s1   /p/zhu/06/nlp/nltk/nltk/corpus/reader/senseval.pyR   H   s    s   Use .instances() instead.c         C   s   |  i  | ƒ S(   N(   R   (   R
   R1   (    (    s1   /p/zhu/06/nlp/nltk/nltk/corpus/reader/senseval.pyR/   M   s    (	   R   R   R   R   R    R.   R   R   R/   (    (    (    s1   /p/zhu/06/nlp/nltk/nltk/corpus/reader/senseval.pyR   0   s   		R   c           B   s#   e  Z d  „  Z d „  Z d „  Z RS(   c         C   s>   t  i |  | d | ƒt ƒ  |  _ d g |  _ d  g |  _ d  S(   Nt   encodingi    (   t   StreamBackedCorpusViewR   t   WhitespaceTokenizert   _word_tokenizert   _lexelt_startsR   t   _lexelts(   R
   R   R3   (    (    s1   /p/zhu/06/nlp/nltk/nltk/corpus/reader/senseval.pyR   S   s    c   
      C   sŞ  t  i |  i | i ƒ  ƒ d } |  i | } g  } t } xŸt o—| i ƒ  } | d j o | g  j p t ‚ g  Sn | i	 ƒ  i
 d ƒ o¥ | d 7} t i d | ƒ } | d  j	 p t ‚ | i d ƒ d d !} | t |  i ƒ j  o | |  i | j p t ‚ q2|  i i | ƒ |  i i | i ƒ  ƒ n | i	 ƒ  i
 d ƒ o | g  j p t ‚ t } n | o | i | ƒ n | i	 ƒ  i
 d ƒ oA d i | ƒ } t | ƒ } t i | ƒ }	 |  i |	 | ƒ g Sq; q; Wd  S(	   Ni   t    s   <lexelts   item=("[^"]+"|'[^']+')iÿÿÿÿs	   <instances
   </instances   
(   t   bisectt   bisect_rightR7   t   tellR8   t   FalseR   t   readlinet   AssertionErrort   lstript
   startswitht   ret   searchR   t   groupt   lenR(   t   joint   _fixXMLR   t
   fromstringt   _parse_instance(
   R
   t   streamt
   lexelt_numR!   t   instance_linest   in_instancet   linet   mt	   xml_blockR+   (    (    s1   /p/zhu/06/nlp/nltk/nltk/corpus/reader/senseval.pyt
   read_blockZ   s8    


c         C   s  g  } g  } d  } xf| D]^} | i d j o | i | i d ƒ q | i d j o| |  i i | i ƒ 7} x| D]à} | i d j o | d } n | i d j o#| d  j p
 t d ‚ | i i ƒ  p t	 | ƒ d j p t ‚ | i i ƒ  o t	 | ƒ d j p t ‚ t	 | ƒ } | i i ƒ  o | i | i i ƒ  ƒ q0| d i d	 j oX | i | d i | d i d
 f ƒ | d i
 o! | |  i i | d i
 ƒ 7} qÊq0t p
 t d ‚ nc | i d	 j o! | i | i | i d
 f ƒ n2 | i d j o n d G| i GHt p
 t d ‚ | i
 o | |  i i | i
 ƒ 7} qw qw Wq t p t d | i ‚ q Wt | | | | ƒ S(   Nt   answerR#   R	   t   compoundi    t   heads   head specified twicei   t   wfR$   s   expected CDATA or wf in <head>t   st   ACKs    expected CDATA or <wf> or <head>s   unexpected tag %s(   R   t   tagR(   R&   R6   t   tokenizeR'   R?   t   stripRE   t   tailR=   R   (   R
   R"   R!   R   R	   R   t   childt   cword(    (    s1   /p/zhu/06/nlp/nltk/nltk/corpus/reader/senseval.pyRI   ƒ   sJ      *+!
%(   R   R   R   RQ   RI   (    (    (    s1   /p/zhu/06/nlp/nltk/nltk/corpus/reader/senseval.pyR   R   s   		)c         C   s?  t  i d d |  ƒ }  t  i d d |  ƒ }  t  i d d |  ƒ }  t  i d d |  ƒ }  t  i d	 d
 |  ƒ }  t  i d d |  ƒ }  t  i d d |  ƒ }  t  i d d |  ƒ }  t  i d d |  ƒ }  t  i d d |  ƒ }  t  i d d |  ƒ }  t  i d d |  ƒ }  t  i d d |  ƒ }  t  i d d |  ƒ }  t  i d d |  ƒ }  |  S(   s:   
    Fix the various issues with Senseval pseudo-XML.
    s	   <([~\^])>s   \1s   (\s+)\&(\s+)s	   \1&amp;\2s   """s   '"'s   (<[^<]*snum=)([^">]+)>s   \1"\2"/>s   <\&frasl>\s*<p[^>]*>t   FRASLs
   <\&I[^>]*>R9   s   <{([^}]+)}>s	   <(@|/?p)>s	   <&\w+ \.>s   <!DOCTYPE[^>]*>s   <\[\/?[^>]+\]*>s
   <(\&\w+;)>s   &(?!amp|gt|lt|apos|quot)s'   [ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>s    <wf pos="\2">\1</wf>s   \s*"\s*<p=\'"\'/>s    <wf pos='"'>"</wf>(   RB   t   sub(   R'   (    (    s1   /p/zhu/06/nlp/nltk/nltk/corpus/reader/senseval.pyRG   °   s"    	(   t   __doc__t   osRB   t   xml.saxt   xmlt   xmldocsR    t   nltk.tokenizet
   nltk.etreeR   t   nltk.internalsR   t   utilt   apit   objectR   t   CorpusReaderR   R4   R   RG   (    (    (    s1   /p/zhu/06/nlp/nltk/nltk/corpus/reader/senseval.pys   <module>   s   


"^