
4Ic           @   sW   d  Z  d d k Z d d k Td e f d     YZ d   Z e d j o e   n d S(   s   
A word stemmer based on the Lancaster stemming algorithm.
Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
iN(   t   *t   LancasterStemmerc        s   B   sV   e  Z d{ Z ds   Z dt   Z du   Z dv   Z dw   Z dx   Z dy   Z	 dz   Z
 RS(|   s   ai*2.s   a*1.s   bb1.s   city3s.s   ci2>s   cn1t>s   dd1.s   dei3y>s   deec2ss.s   dee1.s   de2>s   dooh4>s   e1>s   feil1v.s   fi2>s   gni3>s   gai3y.s   ga2>s   gg1.s   ht*2.s	   hsiug5ct.s   hsi3>s   i*1.s   i1y>s   ji1d.s   juf1s.s   ju1d.s   jo1d.s   jeh1r.s   jrev1t.s   jsim2t.s   jn1d.s   j1s.s   lbaifi6.s   lbai4y.s   lba3>s   lbi3.s   lib2l>s   lc1.s   lufi4y.s   luf3>s   lu2.s   lai3>s   lau3>s   la2>s   ll1.s   mui3.s   mu*2.s   msi3>s   mm1.s   nois4j>s   noix4ct.s   noi3>s   nai3>s   na2>s   nee0.s   ne2>s   nn1.s   pihs4>s   pp1.s   re2>s   rae0.s   ra2.s   ro2>s   ru2>s   rr1.s   rt1>s   rei3y>s   sei3y>s   sis2.s   si2>s   ssen4>s   ss0.s   suo3>s   su*2.s   s*1>s   s0.s	   tacilp4y.s   ta2>s   tnem4>s   tne3>s   tna3>s   tpir2b.s   tpro2b.s   tcud1.s   tpmus2.s   tpec2iv.s   tulo2v.s   tsis0.s   tsi3>s   tt1.s   uqi3.s   ugo1.s   vis3j>s   vie0.s   vi2>s   ylb1>s   yli3y>s   ylp0.s   yl2>s   ygo1.s   yhp1.s   ymo1.s   ypo1.s   yti3>s   yte3>s   ytl2.s   yrtsi5.s   yra3>s   yro3>s   yfi3.s   ycn2t>s   yca3>s   zi2>s   zy1s.c         C   s   h  |  _  d S(   s5   Create an instance of the Lancaster stemmer.
        N(   t   rule_dictionary(   t   self(    (    s)   /p/zhu/06/nlp/nltk/nltk/stem/lancaster.pyt   __init__   s    c         C   s   t  i d  } h  |  _ xt | D]l } | i |  p t d |  n | d d !} | |  i j o |  i | i |  q | g |  i | <q Wd S(   s8   Validate the set of rules used in this stemmer.
        s   ^[a-z]+\*?\d[a-z]*[>\.]?$s   The rule %s is invalidi    i   N(   t   ret   compileR   t   matcht
   ValueErrort   append(   R   t
   rule_tuplet
   valid_rulet   rulet   first_letter(    (    s)   /p/zhu/06/nlp/nltk/nltk/stem/lancaster.pyt
   parseRules   s    	 c         C   sL   | i    } | } t |  i  d j o |  i t i  n |  i | |  S(   s1   Stem a word using the Lancaster stemmer.
        i    (   t   lowert   lenR   R   R   R
   t   _LancasterStemmer__doStemming(   R   t   wordt   intact_word(    (    s)   /p/zhu/06/nlp/nltk/nltk/stem/lancaster.pyt   stem   s
    c         C   s  t  i d  } t } x| o|  i |  } | d j  p | | |  i j o
 t } q t } x'|  i | | D]} | i |  } | o | i   \ }	 }
 } } } t |  } | i	 |	 d d d   o |
 o[ | | j oJ |  i
 | |  o7 |  i | | |  } t } | d j o
 t } n Pq}q|  i
 | |  o7 |  i | | |  } t } | d j o
 t } n Pqqqq qq W| t j o
 t } q q W| S(   s)   Perform the actual word stemming
        s#   ^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$i    Nit   .(   R   R   t   Truet    _LancasterStemmer__getLastLetterR   t   FalseR   t   groupst   intt   endswitht   _LancasterStemmer__isAcceptablet   _LancasterStemmer__applyRule(   R   R   R   R   t   proceedt   last_letter_positiont   rule_was_appliedR   t
   rule_matcht   ending_stringt   intact_flagt   remove_totalt   append_stringt	   cont_flag(    (    s)   /p/zhu/06/nlp/nltk/nltk/stem/lancaster.pyt   __doStemming   sF    
!
 		
			
c         C   sC   d } x6 t  t |   D]" } | | i   o
 | } q Pq W| S(   sQ   Get the zero-based index of the last alphabetic character in this string
        i(   t   rangeR   t   isalpha(   R   R   t   last_lettert   position(    (    s)   /p/zhu/06/nlp/nltk/nltk/stem/lancaster.pyt   __getLastLetter   s     
c         C   s   t  } | d d j o% t |  | d j o
 t } q nR t |  | d j o: | d d j o
 t } q | d d j o
 t } q n | S(   s:   Determine if the word is acceptable for stemming.
        i    t   aeiouyi   i   i   (   R   R   R   (   R   R   R$   t   word_is_acceptable(    (    s)   /p/zhu/06/nlp/nltk/nltk/stem/lancaster.pyt   __isAcceptable   s    
c         C   s6   t  |  | } | d | !} | o | | 7} n | S(   s,   Apply the stemming rule to the word
        i    (   R   (   R   R   R$   R%   t   new_word_length(    (    s)   /p/zhu/06/nlp/nltk/nltk/stem/lancaster.pyt   __applyRule  s
    c         C   s   d S(   Ns   <LancasterStemmer>(    (   R   (    (    s)   /p/zhu/06/nlp/nltk/nltk/stem/lancaster.pyt   __repr__  s    (s   s   ai*2.s   a*1.s   bb1.s   city3s.s   ci2>s   cn1t>s   dd1.s   dei3y>s   deec2ss.s   dee1.s   de2>s   dooh4>s   e1>s   feil1v.s   fi2>s   gni3>s   gai3y.s   ga2>s   gg1.s   ht*2.s	   hsiug5ct.s   hsi3>s   i*1.s   i1y>s   ji1d.s   juf1s.s   ju1d.s   jo1d.s   jeh1r.s   jrev1t.s   jsim2t.s   jn1d.s   j1s.s   lbaifi6.s   lbai4y.s   lba3>s   lbi3.s   lib2l>s   lc1.s   lufi4y.s   luf3>s   lu2.s   lai3>s   lau3>s   la2>s   ll1.s   mui3.s   mu*2.s   msi3>s   mm1.s   nois4j>s   noix4ct.s   noi3>s   nai3>s   na2>s   nee0.s   ne2>s   nn1.s   pihs4>s   pp1.s   re2>s   rae0.s   ra2.s   ro2>s   ru2>s   rr1.s   rt1>s   rei3y>s   sei3y>s   sis2.s   si2>s   ssen4>s   ss0.s   suo3>s   su*2.s   s*1>s   s0.s	   tacilp4y.s   ta2>s   tnem4>s   tne3>s   tna3>s   tpir2b.s   tpro2b.s   tcud1.s   tpmus2.s   tpec2iv.s   tulo2v.s   tsis0.s   tsi3>s   tt1.s   uqi3.s   ugo1.s   vis3j>s   vie0.s   vi2>s   ylb1>s   yli3y>s   ylp0.s   yl2>s   ygo1.s   yhp1.s   ymo1.s   ypo1.s   yti3>s   yte3>s   ytl2.s   yrtsi5.s   yra3>s   yro3>s   yfi3.s   ycn2t>s   yca3>s   zi2>s   zy1s.(   t   __name__t
   __module__R
   R   R   R   R   R   R   R   R2   (    (    (    s)   /p/zhu/06/nlp/nltk/nltk/stem/lancaster.pyR      s                                                                                                                     				;			c          C   sa   d d k  l }  |  i   } d d GHd d GHx, d D]$ } | i |  } d | | f GHq5 Wd S(   s   A demonstration of the lancaster stemmer on a samples described in
    Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
    i(   R   s
   %-20s%-20ss   Original Words   Stemmed WordR    i(   t   maximumt
   presumablyt   multiplyt	   provisiont   owedt   eart   sayingt   cryingt   stringt   meantt   cementN(   s   Original Words   Stemmed Word(   s   maximumR6   s   multiplyR8   R9   R:   R;   R<   s   stringR>   R?   (   t   nltkR   R   (   R   t   stemmerR   t   stemmed_word(    (    s)   /p/zhu/06/nlp/nltk/nltk/stem/lancaster.pyt   demo  s&    		           t   __main__(   t   __doc__R   t   apit   StemmerIR   RC   R3   (    (    (    s)   /p/zhu/06/nlp/nltk/nltk/stem/lancaster.pys   <module>   s   
 	