³ò
èAc           @   s;   d  Z  d Z d d k Z d d k Z d d d „  ƒ  YZ d S(   s   Hugo Liu <hugo@media.mit.edu>s   2.0iÿÿÿÿNt   MontyTokenizerc        ‡   B   s†  e  Z d  „  Z d „  Z d „  Z d d „ Z h  d d <d d <d	 d
 <d d <d d <d d <d d <d d <d d <d d <d d <d d <d d <d d  <d! d" <d# d$ <d% d& <d' d( <d) d* <d+ d, <d- d. <d/ d0 <d1 d2 <d3 d4 <d5 d6 <d7 d8 <d9 d: <d; d< <d= d> <d? d@ <dA dB <dC dD <dE dF <dG dH <dI dJ <dK dL <dM dN <dO dP <dQ dR <dS dT <dU dV <dW dX <dY dZ <d[ d\ <d] d^ <d_ d` <da db <Z h  dc d <Z dd de df dg dh di dj dk dl dm dn do dp dq dr ds dt du dv dw dx dy dz d{ d| d} d~ d d€ d d‚ dƒ d„ d… d† d‡ dˆ d‰ dŠ d‹ dŒ d dŽ d d d‘ d’ d“ d” d• d– d— d˜ d™ dš d› dœ d dž dŸ d  d¡ d¢ d£ d¤ d¥ d¦ d§ d¨ d© dª d« d¬ d­ d® d¯ d° d± d² d³ d´ dµ d¶ d· d¸ d¹ dº d» d¼ d½ d¾ d¿ dÀ dÁ dÂ dÃ dÄ dÅ dÆ dÇ dÈ dÉ dÊ dË dÌ dÍ dÎ dÏ dÐ dÑ dÒ dÓ dÔ dÕ dÖ d× dØ dÙ dÚ dÛ dÜ dÝ dÞ dß dà dá dâ dã dä då dæ dç dè dé dê g‡ Z RS(ë   c         C   s   d  S(   N(    (   t   self(    (    sF   /afs/cs.wisc.edu/p/zhu/06/nlp/montylingua-2.1/python/MontyTokenizer.pyt   __init__   s    c         C   s   t  i d | ƒ } | S(   Ns   [
]+(   t   ret   split(   R   t   textt   info1(    (    sF   /afs/cs.wisc.edu/p/zhu/06/nlp/montylingua-2.1/python/MontyTokenizer.pyt   split_paragraphs
   s    c         C   s=  |  i  } | i ƒ  } d } t i d ƒ } t i d ƒ } x» | t | ƒ j  o§ | i | | ƒ } | o% | i | d d ƒ | d 7} q< n | i | | ƒ } | o@ | | i ƒ  | j o | i | d d ƒ n | d 7} q< n | d 7} q< Wd i | ƒ } t i	 d d | ƒ } t i	 d	 d
 | ƒ } | i d ƒ S(   Ni    s   ([?!]+|[.][.]+)$s   ([.])$i   s   <sentence_break/>t    s   (\<sentence_break\/\> ?){2,20}s   <sentence_break/> s&   (\<(sentence|paragraph)_break\/\> *)+$t    (
   t   common_abbrev_and_acroR   R   t   compilet   lent   searcht   insertt   lowert   joint   sub(   R   R   t   argsst   inputt   dirnamet   contents_arrt   more1t   info_cleaned(    (    sF   /afs/cs.wisc.edu/p/zhu/06/nlp/montylingua-2.1/python/MontyTokenizer.pyt   split_sentences   s,    	

i    c         C   s¯  |  i  } t i } t i } d | d } d d d d d d d d	 d
 d d d d d d d d d d d d d d g } d d d d g } | i d d ƒ } x( | D]  } | i | d | d ƒ } qŸ W| i ƒ  }	 x4t t |	 ƒ ƒ D] }
 |	 |
 } d | j o4 | | i	 d ƒ i
 ƒ  | | i	 d ƒ j o qâ n | i ƒ  | j o qâ n | d | ƒ } | o qâ n | d | ƒ } | o; | i d ƒ d j o |	 |
 d   d d |	 |
 <qâ qâ n x0 | D]( } |	 |
 i | d | d ƒ |	 |
 <qÂW|	 |
 i ƒ  |	 |
 <qâ Wd i |	 ƒ } | o |  i } n
 |  i } d! } x" | i ƒ  D] } | | d 7} qEW| d   } | d" 7} d# } xÁ | o¹ d$ } | | | i ƒ  ƒ } | o“ | | i d% ƒ } | | i d& ƒ | j o | d$ i
 ƒ  | d# } n | d$ i ƒ  | d# } | | i d& ƒ  | | | i d' ƒ } d# } qzqzW| i d( d) ƒ } | i d* d+ ƒ } | o | i d, d- ƒ } n | i d, d. ƒ } | o | i d/ d0 ƒ } n | S(1   NR   t   `t   ^t   *t   =t   +t   |s   \t   [t   ]t   }t   {t   ,t   !t   ?t   #t   &t   (t   )t   "t   >t   <t   ~t   ;t   .t   @t   /t   :s   ^([A-Z][.])+$s,   ^[$][0-9]{1,3}[.][0-9][0-9](?P<period>[.]?)$t   periodiÿÿÿÿs    (?P<begin>)(?P<word>s   )(?P<end>) i   i    t   wordt   begint   ends   's s    's s   'd s    'd s   'll s    will s    'll s    i s    I (   R
   R   R   t   stringt	   uppercaset   replaceR   t   rangeR   t   indext   upperR   t   groupt   stripR   t   contractions_unwoundt   contractions_separatedt   keyst   startR6   (   R   t   sentencet   expand_contractions_pt   built_in_strt   cdt   the_tokenizer_pt   j_arrt   b_dictt
   hostnamesst
   buffer_strR   t   bsR   t   filename_strt   domain_cleanedt   _montylinguat   _montylingua_arrt   id_arr(    (    sF   /afs/cs.wisc.edu/p/zhu/06/nlp/montylingua-2.1/python/MontyTokenizer.pyt   tokenize*   sv    			K  
: &	 


(s   ai n'ts   ain'ts   are n'ts   aren'ts   is n'ts   isn'ts   was n'ts   wasn'ts   were n'ts   weren'ts   did n'ts   didn'ts   does n'ts   doesn'ts   do n'ts   don'ts   had n'ts   hadn'ts   has n'ts   hasn'ts   have n'ts   haven'ts   ca n'ts   can'ts	   could n'ts   couldn'ts   need n'ts   needn'ts
   should n'ts	   shouldn'ts   sha n'ts   shan'ts   wo n'ts   won'ts	   would n'ts   wouldn'ts   i 'ms   i'ms   you 'res   you'res   he 'ss   he'ss   she 'ss   she'ss   it 'ss   it'ss   we 'res   we'res   they 'res   they'res   i 'ves   i'ves   you 'ves   you'ves   we 'ves   we'ves   they 'ves   they'ves   who 'ves   who'ves   what 'ves   what'ves   when 'ves   when'ves	   where 'ves   where'ves   why 'ves   why'ves   how 'ves   how'ves   i 'ds   i'ds   you 'ds   you'ds   he 'ds   he'ds   she 'ds   she'ds   we 'ds   we'ds   they 'ds   they'ds   i 'lls   i'lls   you 'lls   you'lls   he 'lls   he'lls   she 'lls   she'lls   we 'lls   we'lls   they 'lls   they'lls   ai nots   mr.s   mrs.s   ms.s   sr.s   esq.s   jr.s   dr.s   s.b.s   ph.d.s   m.d.s   m.eng.s   m.f.a.s   d.d.s.s   sc.d.s   b.s.s   b.sc.s   b.a.s   a.b.s   m.a.s   c.p.a.s   prof.s   capt.s   col.s   gen.s   sgt.s   lt.s   priv.s   ft.s   nav.s   a.f.s   u.s.a.f.s
   a.f.b.i.e.s   etc.s   e.g.s   c.f.s   p.s.s   q.e.d.s   i.s   ii.s   iii.s   iv.s   v.s   vi.s   vii.s   viii.s   ix.s   x.s   a.m.s   p.m.s   morn.s   eve.s   corp.s   inc.s   co.s   ltd.s   reg.s   u.p.s.s   u.s.p.s.s   fedex.s   i.b.m.s   a.o.l.s   jan.s   feb.s   febr.s   mar.s   apr.s   may.s   jun.s   jul.s   aug.s   sep.s   sept.s   oct.s   nov.s   dec.s   ala.s   ariz.s   ark.s   calif.s   colo.s   conn.s   del.s   d.c.s   fla.s   ga.s   ill.s   ind.s   kans.s   ky.s   la.s   md.s   mass.s   mich.s   minn.s   miss.s   mo.s   nebr.s   nev.s   n.h.s   n.j.s   n.m.s   n.y.s   n.c.s   n.d.t   oklas   ore.s   pa.s   p.r.s   r.i.s   s.c.s   s.d.s   tenn.s   tex.s   vt.s   va.s   v.i.s   wash.s   w.va.s   wis.s   wyo.s   v.c.r.s   v.h.s.s   d.v.d.s   v.c.d.s   c.d.s   tele.s   tv.s   t.v.s   p.c.s   d.s.l.s   a.s.a.p.s   r.s.v.p.s   n.y.c.s   c.o.d.s   s.u.v.(	   t   __name__t
   __module__R   R   R   RR   R@   R?   R
   (    (    (    sF   /afs/cs.wisc.edu/p/zhu/06/nlp/montylingua-2.1/python/MontyTokenizer.pyR       sz  			J																																														(    (   t
   __author__t   __version__R7   R   R    (    (    (    sF   /afs/cs.wisc.edu/p/zhu/06/nlp/montylingua-2.1/python/MontyTokenizer.pys   <module>   s   