³ò
4ÒÇIc        	   @   s®   d  Z  d d k Z d d k Z d d k l Z l Z d d k l Z d d k Td d k	 Te i
 d ƒ Z e i
 d ƒ Z e i
 d ƒ Z e i
 d	 ƒ Z d
 e f d „  ƒ  YZ d S(   sÌ  
Sinica Treebank Corpus Sample

http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm

10,000 parsed sentences, drawn from the Academia Sinica Balanced
Corpus of Modern Chinese.  Parse tree notation is based on
Information-based Case Grammar.  Tagset documentation is available
at http://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html

Language and Knowledge Processing Group, Institute of Information
Science, Academia Sinica

It is distributed with the Natural Language Toolkit under the terms of
the Creative Commons Attribution-NonCommercial-ShareAlike License
[http://creativecommons.org/licenses/by-nc-sa/2.5/].

References:

Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999)
The Construction of Sinica Treebank. Computational Linguistics and
Chinese Language Processing, 4, pp 87-104.

Huang Chu-Ren, Keh-Jiann Chen, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming
Gao, and Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria,
Annotation Guidelines, and On-line Interface. Proceedings of 2nd
Chinese Language Processing Workshop, Association for Computational
Linguistics.

Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar
Extraction, Proceedings of IJCNLP-04, pp560-565.
iÿÿÿÿN(   t   tokenizet   tree(   t
   deprecated(   t   *s   ^#\S+\ss   (?<=\))#.*$s   :([^:()|]+):([^:()|]+)s   :[^:()|]+:([^:()|]+)t   SinicaTreebankCorpusReaderc           B   s5   e  Z d  Z d „  Z d „  Z e d „ Z d „  Z RS(   s)   
    Reader for the sinica treebank.
    c         C   s7   | i  ƒ  } t i d | ƒ } t i d | ƒ } | g S(   Nt    (   t   readlinet
   IDENTIFIERt   subt   APPENDIX(   t   selft   streamt   sent(    (    s8   /p/zhu/06/nlp/nltk/nltk/corpus/reader/sinica_treebank.pyt   _read_block;   s    c         C   s   t  i | ƒ S(   N(   R   t   sinica_parse(   R
   R   (    (    s8   /p/zhu/06/nlp/nltk/nltk/corpus/reader/sinica_treebank.pyt   _parseA   s    c         C   s{   g  } t  i | ƒ D] \ } } | | | f q ~ } | o: g  } | D]" \ } } | | |  i | ƒ f qH ~ } n | S(   N(   t   TAGWORDt   findallt   _tag_mapping_function(   R
   R   t   simplify_tagst   _[1]t   tt   wt   tagged_sentt   _[2](    (    s8   /p/zhu/06/nlp/nltk/nltk/corpus/reader/sinica_treebank.pyt   _tagD   s
    63c         C   s   t  i | ƒ S(   N(   t   WORDR   (   R
   R   (    (    s8   /p/zhu/06/nlp/nltk/nltk/corpus/reader/sinica_treebank.pyt   _wordK   s    (   t   __name__t
   __module__t   __doc__R   R   t   NoneR   R   (    (    (    s8   /p/zhu/06/nlp/nltk/nltk/corpus/reader/sinica_treebank.pyR   7   s
   		(   R   t   ost   ret   nltkR    R   t   nltk.internalsR   t   utilt   apit   compileR   R	   R   R   t   SyntaxCorpusReaderR   (    (    (    s8   /p/zhu/06/nlp/nltk/nltk/corpus/reader/sinica_treebank.pys   <module>'   s   

