³ò
¶ÒÇIc        !   @   sÙ  d  Z  d d k Z d d k Z d d k Z d d k Z d d k Z d d k Z d d k Z d d k Z d d k	 Z	 d d k
 Z
 d d k Z y d d k Z Wn d d k Z n Xy d d k l Z Wn d d k l Z n Xd d k l Z l Z g  Z e g  Z e i i d d ƒ i e i ƒ D] Z e o e e q#q#[ 7Z e g  Z e i i d d ƒ i e i ƒ D] Z e o e e qnqn[ 7Z e i i d ƒ d j o e e i i d	 ƒ g 7Z n e i i d
 ƒ oD e d d d e i i e i  d ƒ e i i e i  d d ƒ g 7Z n e d d d d d g 7Z d e! f d „  ƒ  YZ" d e" e# f d „  ƒ  YZ$ d e$ f d „  ƒ  YZ% d e" f d „  ƒ  YZ& e i' ƒ  Z( d „  Z) d e+ d „ Z, h  d d  <d! d" <d# d$ <d% d& <d' d( <d) d* <d+ d, <d- d. <d/ d0 <Z- h  d  d  <d" d" <d$ d$ <d& d& <d( d( <d* d* <d, d, <d. d. <Z. d1 e+ e/ d d d2 „ Z0 d3 d4 „ Z1 d5 „  Z2 d6 „  Z3 d7 e! f d8 „  ƒ  YZ4 d9 e	 i5 f d: „  ƒ  YZ6 d; e! f d< „  ƒ  YZ7 d S(=   sç  
Functions to find and load NLTK X{resource files}, such as corpora,
grammars, and saved processing objects.  Resource files are identified
using URLs, such as"C{nltk:corpora/abc/rural.txt}" or
"C{http://nltk.org/sample/toy.cfg}".  The following URL protocols are
supported:

  - "C{file:I{path}}": Specifies the file whose path is C{I{path}}.
    Both relative and absolute paths may be used.
    
  - "C{http://I{host}/{path}}": Specifies the file stored on the web
    server C{I{host}} at path C{I{path}}.
    
  - "C{nltk:I{path}}": Specifies the file stored in the NLTK data
    package at C{I{path}}.  NLTK will search for these files in the
    directories specified by L{nltk.data.path}.

If no protocol is specified, then the default protocol "C{nltk:}" will
be used.
 
This module provides to functions that can be used to access a
resource file, given its URL: L{load()} loads a given resource, and
adds it to a resource cache; and L{retrieve()} copies a given resource
to a local file.
iÿÿÿÿN(   t   StringIO(   t   grammart   semt   NLTK_CORPORAt    t	   NLTK_DATAs   ~/s   ~/nltk_datat   wins   C:\nltk_datas   D:\nltk_datas   E:\nltk_datat	   nltk_datat   libs   /usr/share/nltk_datas   /usr/local/share/nltk_datas   /usr/lib/nltk_datas   /usr/local/lib/nltk_datas   /p/zhu/06/nlp/nltk/datat   PathPointerc           B   s,   e  Z d  Z e d „ Z d „  Z d „  Z RS(   so  
    An abstract base class for 'path pointers,' used by NLTK's data
    package to identify specific paths.  Two subclasses exist:
    L{FileSystemPathPointer} identifies a file that can be accessed
    directly via a given absolute path.  L{ZipFilePathPointer}
    identifies a file contained within a zipfile, that can be accessed
    by reading that zipfile.
    c         C   s   t  d ƒ ‚ d S(   sü   
        Return a seekable read-only stream that can be used to read
        the contents of the file identified by this path pointer.

        @raise IOError: If the path specified by this pointer does
            not contain a readable file.
        s   abstract base classN(   t   NotImplementedError(   t   selft   encoding(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyt   opene   s    c         C   s   t  d ƒ ‚ d S(   sÍ   
        Return the size of the file pointed to by this path pointer,
        in bytes.

        @raise IOError: If the path specified by this pointer does
            not contain a readable file.
        s   abstract base classN(   R
   (   R   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyt	   file_sizeo   s    c         C   s   t  d ƒ ‚ d S(   sU  
        Return a new path pointer formed by starting at the path
        identified by this pointer, and then following the relative
        path given by C{fileid}.  The path components of C{fileid}
        should be seperated by forward slashes (C{/}), regardless of
        the underlying file system's path seperator character.
        s   abstract base classN(   R
   (   R   t   fileid(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyt   joiny   s    (   t   __name__t
   __module__t   __doc__t   NoneR   R   R   (    (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR	   \   s   
	
t   FileSystemPathPointerc           B   s\   e  Z d  Z d „  Z e d „  d d ƒZ e d „ Z d „  Z d „  Z	 d „  Z
 d	 „  Z RS(
   sd  
    A path pointer that identifies a file which can be accessed
    directly via a given absolute path.  C{FileSystemPathPointer} is a
    subclass of C{str} for backwards compatibility purposes --
    this allows old code that expected C{nltk.data.find()} to expect a
    string to usually work (assuming the resource is not found in a
    zipfile).
    c         C   sV   t  i i | ƒ } t  i i | ƒ p t d | ƒ ‚ n | |  _ t i |  | ƒ d S(   sƒ   
        Create a new path pointer for the given absolute path.

        @raise IOError: If the given path does not exist.
        s   No such file or directory: %rN(   t   ost   patht   abspatht   existst   IOErrort   _patht   strt   __init__(   R   R   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR      s
    	c         C   s   |  i  S(    (   R   (   R   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyt   <lambda>™   s    t   docs;   
        The absolute path identified by this path pointer.c         C   s6   t  |  i d ƒ } | d  j	 o t | | ƒ } n | S(   Nt   rb(   R   R   R   t   SeekableUnicodeStreamReader(   R   R   t   stream(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR   œ   s    c         C   s   t  i |  i ƒ i S(   N(   R   t   statR   t   st_size(   R   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR   ¢   s    c         C   s+   t  i i |  i | i d ƒ Œ } t | ƒ S(   Nt   /(   R   R   R   R   t   splitR   (   R   R   R   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR   ¥   s    !c         C   s   d |  i  S(   Ns   FileSystemPathPointer(%r)(   R   (   R   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyt   __repr__©   s    c         C   s   |  i  S(   N(   R   (   R   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyt   __str__¬   s    (   R   R   R   R   t   propertyR   R   R   R   R   R'   R(   (    (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR   „   s   					t   GzipFileSystemPathPointerc           B   s$   e  Z d  Z d d Z e d „ Z RS(   sì   
    A subclass of C{FileSystemPathPointer} that identifies a gzip-compressed
    file located at a given absolute path.  C{GzipFileSystemPathPointer} is
    appropriate for loading large gzip-compressed pickle objects efficiently.
    i   i   c            s   t  ƒ  } t i ˆ  i d ƒ ‰ x0 t ‡  ‡ f d †  d ƒ D] } | i | ƒ q: Wt  | i ƒ  ƒ } | o t | | ƒ } n | S(   NR    c              s   ˆ i  ˆ  i ƒ S(    (   t   readt
   BLOCK_SIZE(    (   R   t   file(    s   /p/zhu/06/nlp/nltk/nltk/data.pyR   ¿   s    R   (   R    t   gzipR   R   t   itert   writet   getvalueR!   (   R   R   R"   t   line(    (   R   R-   s   /p/zhu/06/nlp/nltk/nltk/data.pyR   ¹   s    	 i   (   R   R   R   R,   R   R   (    (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR*   °   s   
t   ZipFilePathPointerc           B   sk   e  Z d  Z d d „ Z e d „  d d ƒZ e d „  d d ƒZ e d „ Z d	 „  Z	 d
 „  Z
 d „  Z RS(   s~   
    A path pointer that identifies a file contained within a zipfile,
    which can be accessed by reading that zipfile.
    R   c         C   sæ   t  | t ƒ o t t i i | ƒ ƒ } n t i d d | ƒ } | oˆ y | i | ƒ WqÐ | i	 d ƒ o@ g  } | i
 ƒ  D]! } | i | ƒ o | | q€ q€ ~ o qÌ t d | i | f ƒ ‚ qÐ Xn | |  _ | |  _ d S(   sÞ   
        Create a new path pointer pointing at the specified entry
        in the given zipfile.

        @raise IOError: If the given zipfile does not exist, or if it
        does not contain the specified entry.
        s   (^|/)/+s   \1R%   s   Zipfile %r does not contain %rN(   t
   isinstancet
   basestringt   OpenOnDemandZipFileR   R   R   t   ret   subt   getinfot   endswitht   namelistt
   startswithR   t   filenamet   _zipfilet   _entry(   R   t   zipfilet   entryt   _[1]t   n(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR   Ì   s    <	c         C   s   |  i  S(    (   R>   (   R   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR   ì   s    R   s€   
        The C{zipfile.ZipFile} object used to access the zip file
        containing the entry identified by this path pointer.c         C   s   |  i  S(    (   R?   (   R   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR   ï   s    sY   
        The name of the file within C{zipfile} that this path
        pointer points to.c         C   sE   |  i  i |  i ƒ } t | ƒ } | d  j	 o t | | ƒ } n | S(   N(   R>   R+   R?   R    R   R!   (   R   R   t   dataR"   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR   ó   s
    c         C   s   |  i  i |  i ƒ i S(   N(   R>   R9   R?   R   (   R   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR   ú   s    c         C   s#   d |  i  | f } t |  i | ƒ S(   Ns   %s/%s(   R?   R3   R>   (   R   R   RA   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR   ý   s    c         C   s   d |  i  i |  i f S(   Ns   ZipFilePathPointer(%r, %r)(   R>   R=   R?   (   R   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR'     s    (   R   R   R   R   R)   R@   RA   R   R   R   R   R'   (    (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR3   Ç   s    				c         C   sk  t  i d |  ƒ } | i ƒ  \ } } xCt D];} t i i | ƒ o> | i d ƒ o. y t | |  ƒ SWqft j
 o
 q+ qfXq+ t i i	 | ƒ oÑ | d j oa t i i | |  i d ƒ Œ } t i i | ƒ o, | i d ƒ o t | ƒ Sqÿ t | ƒ Sqbqft i i | | i d ƒ Œ } t i i | ƒ o. y t | | ƒ SWqbt j
 o
 q+ qbXqfq+ q+ W| d j o~ |  i d ƒ } xl t t | ƒ ƒ D]T } d i | |  | | d g | | ƒ } y t | ƒ SWq™t j
 o q™Xq™Wn t i d |  f d d d d d	 d
 ƒ}	 |	 d d i d „  t Dƒ ƒ 7}	 d d }
 d |
 |	 |
 f } t | ƒ ‚ d S(   s¦  
    Find the given resource by searching through the directories and
    zip files in L{nltk.data.path}, and return a corresponding path
    name.  If the given resource is not found, raise a C{LookupError},
    whose message gives a pointer to the installation instructions for
    the NLTK downloader.

    Zip File Handling:

      - If C{resource_name} contains a component with a C{.zip}
        extension, then it is assumed to be a zipfile; and the
        remaining path components are used to look inside the zipfile.
        
      - If any element of C{nltk.data.path} has a C{.zip} extension,
        then it is assumed to be a zipfile.

      - If a given resource name that does not contain any zipfile
        component is not found initially, then C{find()} will make a
        second attempt to find that resource, by replacing each
        component I{p} in the path with I{p.zip/p}.  For example, this
        allows C{find()} to map the resource name
        C{corpora/chat80/cities.pl} to a zip file path pointer to
        C{corpora/chat80.zip/chat80/cities.pl}.

      - When using C{find()} to locate a directory contained in a
        zipfile, the resource name I{must} end with the C{'/'}
        character.  Otherwise, C{find()} will not locate the
        directory.

    @type resource_name: C{str}
    @param resource_name: The name of the resource to search for.
        Resource names are posix-style relative path names, such as
        C{'corpora/brown'}.  In particular, directory names should
        always be separated by the C{'/'} character, which will be
        automatically converted to a platform-appropriate path
        separator.
    @rtype: C{str}
    s   (.*\.zip)/?(.*)$|s   .zipR%   s   .gzsc   Resource %r not found.  Please use the NLTK Downloader to obtain the resource: >>> nltk.download().t   initial_indents     t   subsequent_indentt   widthiB   s   
  Searched in:R   c         s   s   x |  ] } d  | Vq Wd S(   s	   
    - %rN(    (   t   .0t   d(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pys	   <genexpr>_  s    t   *iF   s	   
%s
%s
%sN(   R7   t   matcht   groupsR   R   t   isfileR:   R3   R   t   isdirR   R   R&   R   R*   R   t   ranget   lent   findt   LookupErrort   textwrapt   fill(   t   resource_namet   mR@   t   zipentryt	   path_itemt   pt   piecest   it   modified_namet   msgt   sept   resource_not_found(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyRQ     sB    ( #" *	!
c         C   s
  | d
 j oC |  i d ƒ o t i i | ƒ d } qP t i d d |  ƒ } n t i i | ƒ o# t i i | ƒ } t	 d | ‚ n | o d |  | f GHn t
 |  ƒ } t | d ƒ } x4 t o, | i d ƒ } | i | ƒ | p Pq¾ q¾ W| i ƒ  | i ƒ  d
 S(   s§  
    Copy the given resource to a local file.  If no filename is
    specified, then use the URL's filename.  If there is already a
    file named C{filename}, then raise a C{ValueError}.
    
    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the the NLTK data package.
    s   file:iÿÿÿÿs   (^\w+:)?.*/R   s   File %r already exists!s   Retrieving %r, saving to %rt   wbi   i@   Ni   (   R   R<   R   R   R&   R7   R8   R   R   t
   ValueErrort   _openR   t   TrueR+   R0   t   close(   t   resource_urlR=   t   verboset   infilet   outfilet   s(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyt   retrieved  s"    

s;   A serialized python object, stored using the pickle module.t   pickles9   A serialized python object, stored using the yaml module.t   yamls3   A context free grammar, parsed by nltk.parse_cfg().t   cfgs1   A probabilistic CFG, parsed by nltk.parse_pcfg().t   pcfgs+   A feature CFG, parsed by nltk.parse_fcfg().t   fcfgsi   A list of first order logic expressions, parsed by nltk.sem.parse_fol() using nltk.sem.logic.LogicParser.t   folsy   A list of first order logic expressions, parsed by nltk.sem.parse_logic().  Requires an additional logic_parser parametert   logics;   A semantic valuation, parsed by nltk.sem.parse_valuation().t   vals)   The raw (byte string) contents of a file.t   rawt   autoc   	      C   s½  | o? t  i |  ƒ } | t j	 o | o d |  f GHn | SqF n | o d |  f GHn | d j oh |  i d ƒ } | d } | d j o | d } n t i | ƒ } | t j o t d |  ƒ ‚ qÒ n | d	 j o t i t |  ƒ ƒ } n“| d
 j o t	 i t |  ƒ ƒ } nm| d j o t
 i t |  ƒ i ƒ  ƒ } nA| d j o t
 i t |  ƒ i ƒ  ƒ } n| d j o+ t
 i t |  ƒ i ƒ  d | d | ƒ} nÝ | d j o. t i t |  ƒ i ƒ  d t i i ƒ  ƒ} n¢ | d j o% t i t |  ƒ i ƒ  d | ƒ} np | d j o t i t |  ƒ i ƒ  ƒ } nD | d j o t |  ƒ i ƒ  } n! | t j p t ‚ t d ƒ ‚ | o( y | t  |  <Wq¹t j
 o q¹Xn | S(   s·  
    Load a given resource from the NLTK data package.  The following
    resource formats are currently supported:
      - C{'pickle'}
      - C{'yaml'}
      - C{'cfg'} (context free grammars)
      - C{'pcfg'} (probabilistic CFGs)
      - C{'fcfg'} (feature-based CFGs)
      - C{'fol'} (formulas of First Order Logic)
      - C{'logic'} (Logical formulas to be parsed by the given logic_parser)
      - C{'val'} (valuation of First Order Logic model)
      - C{'raw'}

    If no format is specified, C{load()} will attempt to determine a
    format based on the resource name's file extension.  If that
    fails, C{load()} will raise a C{ValueError} exception.

    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the the NLTK data package.
    @type cache: C{bool}
    @param cache: If true, add this resource to a cache.  If C{load}
        finds a resource in its cache, then it will return it from the
        cache rather than loading it.  The cache uses weak references,
        so a resource wil automatically be expunged from the cache
        when no more objects are using it.
        
    @type verbose: C{bool}
    @param verbose: If true, print a message when loading a resource.
        Messages are not displayed when a resource is retrieved from
        the cache.
    
    @type logic_parser: C{LogicParser}
    @param logic_parser: The parser that will be used to parse logical 
    expressions.
    @type fstruct_parser: C{FeatStructParser}
    @param fstruct_parser: The parser that will be used to parse the
    feature structure of an fcfg.
    s   <<Using cached copy of %s>>s   <<Loading %s>>Rt   t   .iÿÿÿÿt   gziþÿÿÿsz   Could not determine format for %s based on its file
extension; use the "format" argument to specify the format explicitly.Rk   Rl   Rm   Rn   Ro   t   logic_parsert   fstruct_parserRp   Rq   Rr   Rs   s   Unknown format type!(   t   _resource_cachet   getR   R&   t   AUTO_FORMATSRa   Rk   t   loadRb   Rl   Rm   t	   parse_cfgR+   t
   parse_pcfgt
   parse_fcfgR   t   parse_logicRq   t   LogicParsert   parse_valuationt   FORMATSt   AssertionErrort	   TypeError(	   Re   t   formatt   cacheRf   Rw   Rx   t   resource_valt   resource_url_partst   ext(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR|   ¨  s^    +
	s   ##c         C   so   t  |  d d d t ƒ} | i ƒ  } xD | D]< } | i | ƒ o q+ n t i d | ƒ o q+ n | GHq+ Wd S(   s„  
    Write out a grammar file, ignoring escaped and empty lines
    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the the NLTK data package.
    @type escape: C{str}
    @param escape: Prepended string that signals lines to be ignored
    R†   Rs   R‡   s   ^$N(   R|   t   Falset
   splitlinesR<   R7   RK   (   Re   t   escapeRˆ   t   linest   l(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyt   show_cfg  s    
 c           C   s   t  i ƒ  d S(   sI   
    Remove all objects from the resource cache.
    @see: L{load()}
    N(   Ry   t   clear(    (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyt   clear_cache#  s    c         C   s‡   t  i d |  ƒ i ƒ  \ } } | d j p | i ƒ  d j o t | ƒ i ƒ  Sn2 | i ƒ  d j o t | d ƒ Sn t i |  ƒ Sd S(   sv  
    Helper function that returns an open file object for a resource,
    given its resource URL.  If the given resource URL uses the 'ntlk'
    protocol, or uses no protocol, then use L{nltk.data.find} to find
    its path, and open it with the given mode; if the resource URL
    uses the 'file' protocol, then open the file with the given mode;
    otherwise, delegate to C{urllib2.urlopen}.
    
    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the the NLTK data package.
    s   (?:(\w+):)?(.*)t   nltkR-   R    N(	   R7   RK   RL   R   t   lowerRQ   R   t   urllib2t   urlopen(   Re   t   protocolR   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyRb   *  s     t
   LazyLoaderc           B   s,   e  Z d  „  Z d „  Z d „  Z d „  Z RS(   c         C   s   | |  _  d  S(   N(   t   _LazyLoader__path(   R   R   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR   H  s    c         C   s+   t  |  i ƒ } | i |  _ | i |  _ d  S(   N(   R|   R™   t   __dict__t	   __class__(   R   t   resource(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyt   __loadK  s    c         C   s   |  i  ƒ  t |  | ƒ S(   N(   t   _LazyLoader__loadt   getattr(   R   t   attr(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyt   __getattr__S  s    
c         C   s   |  i  ƒ  d |  S(   Ns   %r(   Rž   (   R   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR'   Y  s    
(   R   R   R   Rž   R¡   R'   (    (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR˜   G  s   			R6   c           B   s;   e  Z d  Z d „  Z d „  Z d „  Z d „  Z d „  Z RS(   sã  
    A subclass of C{zipfile.ZipFile} that closes its file pointer
    whenever it is not using it; and re-opens it when it needs to read
    data from the zipfile.  This is useful for reducing the number of
    open file handles when many zip files are being accessed at once.
    C{OpenOnDemandZipFile} must be constructed from a filename, not a
    file-like object (to allow re-opening).  C{OpenOnDemandZipFile} is
    read-only (i.e., C{write} and C{writestr} are disabled.
    c         C   sX   t  | t ƒ p t d ƒ ‚ n t i i |  | ƒ |  i | j p t ‚ |  i ƒ  d  S(   Ns+   ReopenableZipFile filename must be a string(	   R4   R5   R…   R@   t   ZipFileR   R=   R„   Rd   (   R   R=   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR   m  s
    c         C   sO   |  i  d  j p t ‚ t |  i d ƒ |  _  t i i |  | ƒ } |  i ƒ  | S(   NR    (	   t   fpR   R„   R   R=   R@   R¢   R+   Rd   (   R   t   namet   value(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR+   t  s
    
c         O   s   t  d ƒ ‚ d S(   s<   @raise NotImplementedError: OpenOnDemandZipfile is read-onlys    OpenOnDemandZipfile is read-onlyN(   R
   (   R   t   argst   kwargs(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR0   {  s    c         O   s   t  d ƒ ‚ d S(   s<   @raise NotImplementedError: OpenOnDemandZipfile is read-onlys    OpenOnDemandZipfile is read-onlyN(   R
   (   R   R¦   R§   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyt   writestr  s    c         C   s   d |  i  S(   Ns   OpenOnDemandZipFile(%r)(   R=   (   R   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR'   ƒ  s    (   R   R   R   R   R+   R0   R¨   R'   (    (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR6   c  s   					R!   c           B   s£  e  Z d  Z e Z d d „ Z e d „ Z e d „ Z e e d „ Z	 d „  Z
 d „  Z d „  Z e d	 „  d
 d ƒZ e d „  d
 d ƒZ e d „  d
 d ƒZ d „  Z d d „ Z d „  Z e d „ Z d „  Z e d „ Z d „  Z h  e i e f g d <e i d f e i d f g d <e i e f g d <e i e f g d <e i d f e i d f g d  <e i e f g d! <e i e f g d" <Z d# „  Z RS($   s„  
    A stream reader that automatically encodes the source byte stream
    into unicode (like C{codecs.StreamReader}); but still supports the
    C{seek()} and C{tell()} operations correctly.  This is in contrast
    to C{codecs.StreamReader}, which provide *broken* C{seek()} and
    C{tell()} methods.

    This class was motivated by L{StreamBackedCorpusView}, which
    makes extensive use of C{seek()} and C{tell()}, and needs to be
    able to handle unicode-encoded files.
    
    Note: this class requires stateless decoders.  To my knowledge,
    this shouldn't cause a problem with any of python's builtin
    unicode encodings.
    t   strictc         C   sq   | i  d ƒ | |  _ | |  _ | |  _ t i | ƒ |  _ d |  _ d  |  _	 d |  _
 d  |  _ |  i ƒ  |  _ d  S(   Ni    R   (   t   seekR"   R   t   errorst   codecst
   getdecodert   decodet
   bytebufferR   t
   linebuffert   _rewind_checkpointt   _rewind_numcharst
   _check_bomt   _bom(   R   R"   R   R«   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR   œ  s    							c         C   sI   |  i  | ƒ } |  i o, d i |  i ƒ | } t |  _ t |  _ n | S(   s!  
        Read up to C{size} bytes, decode them using this reader's
        encoding, and return the resulting unicode string.

        @param size: The maximum number of bytes to read.  If not
            specified, then read as many bytes as possible.

        @rtype: C{unicode}
        R   (   t   _readR°   R   R   R²   (   R   t   sizet   chars(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR+   Õ  s    

	c   
      C   s÷  |  i  oE t |  i  ƒ d j o/ |  i  i d ƒ } |  i t | ƒ 7_ | Sn | p d } d } |  i  o  | |  i  i ƒ  7} t |  _  n xdt o\|  i i ƒ  t |  i ƒ } |  i	 | ƒ } | o' | i
 d ƒ o | |  i	 d ƒ 7} n | | 7} | i t ƒ } t | ƒ d j oH | d } | d |  _  t | ƒ t | ƒ t | ƒ |  _ | |  _ PnQ t | ƒ d j o= | d } | d i t ƒ d }	 | |	 j o | } Pq³n | p | t j	 o | } Pn | d j  o | d 9} q q W| S(   s]  
        Read a line of text, decode it using this reader's encoding,
        and return the resulting unicode string.

        @param size: The maximum number of bytes to read.  If no
            newline is encountered before C{size} bytes have been
            read, then the returned value may not be a complete line
            of text.
        i   i    iH   R   s   i@  i   (   R°   RP   t   popR²   R   Rc   R"   t   tellR¯   Rµ   R:   RŒ   R±   R‹   (
   R   R¶   R2   t   readsizeR·   t   startpost	   new_charsRŽ   t   line0withendt   line0withoutend(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyt   readlineé  sD     



#	
	c         C   s   |  i  ƒ  i | ƒ S(   s	  
        Read this file's contents, decode them using this reader's
        encoding, and return it as a list of unicode lines.

        @rtype: C{list} of C{unicode}
        @param sizehint: Ignored.
        @param keepends: If false, then strip newlines.
        (   R+   RŒ   (   R   t   sizehintt   keepends(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyt	   readlines%  s    	c         C   s%   |  i  ƒ  } | o | Sn t ‚ d S(   s8   Return the next decoded line from the underlying stream.N(   R¿   t   StopIteration(   R   R2   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyt   next0  s    c         C   s   |  S(   s   Return self(    (   R   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyt   __iter__6  s    c         C   s   |  S(   s   Return self(    (   R   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyt
   xreadlines:  s    c         C   s
   |  i  i S(    (   R"   t   closed(   R   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR   B  s    R   s1   
        True if the underlying stream is closed.c         C   s
   |  i  i S(    (   R"   R¤   (   R   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR   E  s    s+   
        The name of the underlying stream.c         C   s
   |  i  i S(    (   R"   t   mode(   R   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR   H  s    s+   
        The mode of the underlying stream.c         C   s   |  i  i ƒ  d S(   s.   
        Close the underlying stream.
        N(   R"   Rd   (   R   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyRd   K  s    i    c         C   sa   | d j o t  d ƒ ‚ n |  i i | | ƒ d |  _ d |  _ d |  _ |  i i ƒ  |  _ d S(   s)  
        Move the stream to a new file position.  If the reader is
        maintaining any buffers, tehn they will be cleared.

        @param offset: A byte count offset.
        @param whence: If C{whence} is 0, then the offset is from the
            start of the file (offset should be positive).  If
            C{whence} is 1, then the offset is from the current
            position (offset may be positive or negative); and if 2,
            then the offset is from the end of the file (offset should
            typically be negative).
        i   sm   Relative seek is not supported for SeekableUnicodeStreamReader -- consider using char_seek_forward() instead.R   N(	   Ra   R"   Rª   R   R°   R¯   R²   R¹   R±   (   R   t   offsett   whence(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyRª   U  s    			c         C   sA   | d j  o t  d ƒ ‚ n |  i |  i ƒ  ƒ |  i | ƒ d S(   sH   
        Move the read pointer forward by C{offset} characters.
        i    s"   Negative offsets are not supportedN(   Ra   Rª   R¹   t   _char_seek_forward(   R   RÉ   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyt   char_seek_forwardl  s    c         C   s9  | d j o
 | } n d } xt o|  i i | t | ƒ ƒ } | | 7} |  i | ƒ \ } } t | ƒ | j o& |  i i t | ƒ | d ƒ d Sn t | ƒ | j on xE t | ƒ | j o1 | | t | ƒ 7} |  i | |  ƒ \ } } q± W|  i i t | ƒ | d ƒ d Sn | | t | ƒ 7} q  Wd S(   s  
        Move the file position forward by C{offset} characters,
        ignoring all buffers.

        @param est_bytes: A hint, giving an estimate of the number of
            bytes that will be neded to move foward by C{offset} chars.
            Defaults to C{offset}.
        R   i   N(   R   Rc   R"   R+   RP   t   _incr_decodeRª   (   R   RÉ   t	   est_bytest   bytest   newbytesR·   t   bytes_decoded(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyRË   w  s     	

c   
      C   s^  |  i  t j o |  i i ƒ  t |  i ƒ Sn |  i i ƒ  } | t |  i ƒ |  i } t g  } |  i  D] } | t | ƒ qh ~ ƒ } | |  i |  i | } |  i i	 |  i ƒ |  i
 |  i | ƒ |  i i ƒ  } |  i ol |  i i	 | ƒ |  i |  i i d ƒ ƒ d } d i |  i  ƒ }	 | i |	 ƒ p |	 i | ƒ p t ‚ n |  i i	 | ƒ | S(   sí   
        Return the current file position on the underlying byte
        stream.  If this reader is maintaining any buffers, then the
        returned file position will be the position of the beginning
        of those buffers.
        i2   i    R   (   R°   R   R"   R¹   RP   R¯   R±   t   sumR²   Rª   RË   t   DEBUGRÍ   R+   R   R<   R„   (
   R   t   orig_filepost
   bytes_readRB   R2   t   buf_sizeRÎ   t   filepost   check1t   check2(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR¹   ž  s$    
0

+c         C   s)  | d j o d Sn |  i  o- |  i i ƒ  d j o |  i i |  i  ƒ n | t j o |  i i ƒ  } n |  i i | ƒ } |  i | } |  i | ƒ \ } } | t j	 ok | oc t | ƒ d j oP xM | pA |  i i d ƒ } | p Pn | | 7} |  i | ƒ \ } } qË Wn | | |  _ | S(   sá   
        Read up to C{size} bytes from the underlying stream, decode
        them using this reader's encoding, and return the resulting
        unicode string.  C{linebuffer} is *not* included in the
        result.
        i    u    i   (   R´   R"   R¹   R+   R   R¯   RÍ   RP   (   R   R¶   t	   new_bytesRÏ   R·   RÑ   (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyRµ   Ì  s      (

c         C   s   x– t  oŽ y |  i | d ƒ SWq t j
 og } | i t | ƒ j o |  i | | i  |  i ƒ Sq” |  i d j o ‚  q” |  i | |  i ƒ Sq Xq Wd S(   sá  
        Decode the given byte string into a unicode string, using this
        reader's encoding.  If an exception is encountered that
        appears to be caused by a truncation error, then just decode
        the byte string without the bytes that cause the trunctaion
        error.

        @return: A tuple C{(chars, num_consumed)}, where C{chars} is
            the decoded unicode string, and C{num_consumed} is the
            number of bytes that were consumed.
        R©   N(   Rc   R®   t   UnicodeDecodeErrort   endRP   t   startR«   (   R   RÏ   t   exc(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyRÍ   ñ  s    
t   utf8s   utf16-les   utf16-bet   utf16t   utf16let   utf16bes   utf32-les   utf32-bet   utf32t   utf32let   utf32bec         C   sª   t  i d d |  i i ƒ  ƒ } |  i i | ƒ } | oo |  i i d ƒ } |  i i d ƒ xJ | D]> \ } } | i	 | ƒ o" | o | |  _ n t
 | ƒ Sq` q` Wn d  S(   Ns   [ -]R   i   i    (   R7   R8   R   R”   t
   _BOM_TABLERz   R"   R+   Rª   R<   RP   R   (   R   t   enct   bom_infoRÏ   t   bomt   new_encoding(    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR³     s     (    R   R   R   Rc   RÓ   R   R   R+   R¿   RÂ   RÄ   RÅ   RÆ   R)   RÇ   R¤   RÈ   Rd   Rª   RÌ   RË   R¹   Rµ   RÍ   R¬   t   BOM_UTF8t   BOM_UTF16_LEt   BOM_UTF16_BEt   BOM_UTF32_LEt   BOM_UTF32_BERæ   R³   (    (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pyR!   Š  sB   9<							
	'	.%	(8   R   t   sysR   t   os.pathRS   t   weakrefRl   R7   R•   R@   R¬   R.   t   cPickleRk   t	   cStringIOR    R“   R   Rm   R   R   RB   t   environRz   R&   t   pathsepRI   t   _[2]t
   expandusert   platformR<   R   t   prefixt   objectR	   R   R   R*   R3   t   WeakValueDictionaryRy   RQ   R   Rc   Rj   Rƒ   R{   R‹   R|   R   R’   Rb   R˜   R¢   R6   R!   (    (    (    s   /p/zhu/06/nlp/nltk/nltk/data.pys   <module>    sˆ   KK	#
(,B	W(																h		'