# Natural Language Toolkit: TextTiling
# A tokenizer that splits a text into topically coherent sections
# (nltk/tokenize/texttiling.py).

import re
import math

import numpy

from api import TokenizerI

BLOCK_COMPARISON, VOCABULARY_INTRODUCTION = range(2)
LC, HC = range(2)
DEFAULT_SMOOTHING = range(1)


class TextTilingTokenizer(TokenizerI):
    """A section tokenizer based on the TextTiling algorithm. The
    algorithm detects subtopic shifts based on the analysis of lexical
    co-occurrence patterns.

    The process starts by tokenizing the text into pseudosentences of
    a fixed size w. Then, depending on the method used, similarity
    scores are assigned at sentence gaps. The algorithm proceeds by
    detecting the peak differences between these scores and marking
    them as boundaries. The boundaries are normalized to the closest
    paragraph break and the segmented text is returned.

    @type w: number
    @param w: Pseudosentence size
    @type k: number
    @param k: Size (in sentences) of the block used in the block comparison
              method
    @type similarity_method: constant
    @param similarity_method: The method used for determining similarity scores
    @type stopwords: list
    @param stopwords: A list of stopwords that are filtered out
    @type smoothing_method: constant
    @param smoothing_method: The method used for smoothing the score plot
    @type smoothing_width: number
    @param smoothing_width: The width of the window used by the smoothing
           method
    @type smoothing_rounds: number
    @param smoothing_rounds: The number of smoothing passes
    @type cutoff_policy: constant
    @param cutoff_policy: The policy used to determine the number of boundaries
    """

    def __init__(self,
                 w=20,
                 k=10,
                 similarity_method=BLOCK_COMPARISON,
                 stopwords=None,
                 smoothing_method=DEFAULT_SMOOTHING,
                 smoothing_width=2,
                 smoothing_rounds=1,
                 cutoff_policy=HC,
                 demo_mode=False):
        if stopwords is None:
            from nltk.corpus import stopwords
            stopwords = stopwords.words('english')
        self.__dict__.update(locals())
        del self.__dict__['self']
    def tokenize(self, text):
        """The main function. Follows a pipeline structure."""

        lowercase_text = text.lower()
        paragraph_breaks = self._mark_paragraph_breaks(text)
        text_length = len(lowercase_text)

        # Tokenization step: strip punctuation, cut the text into
        # pseudosentences and filter out stopwords.
        nopunct_text = ''.join(c for c in lowercase_text
                               if re.match("[a-z\-' \n\t]", c))
        nopunct_par_breaks = self._mark_paragraph_breaks(nopunct_text)

        tokseqs = self._divide_to_tokensequences(nopunct_text)

        for ts in tokseqs:
            ts.wrdindex_list = filter(lambda wi: wi[0] not in self.stopwords,
                                      ts.wrdindex_list)

        token_table = self._create_token_table(tokseqs, nopunct_par_breaks)

        # Lexical score determination
        if self.similarity_method == BLOCK_COMPARISON:
            gap_scores = self._block_comparison(tokseqs, token_table)
        elif self.similarity_method == VOCABULARY_INTRODUCTION:
            raise NotImplementedError("Vocabulary introduction not implemented")

        if self.smoothing_method == DEFAULT_SMOOTHING:
            smooth_scores = self._smooth_scores(gap_scores)
        else:
            smooth_scores = gap_scores

        # Boundary identification
        depth_scores = self._depth_scores(smooth_scores)
        segment_boundaries = self._identify_boundaries(depth_scores)
        normalized_boundaries = self._normalize_boundaries(text,
                                                           segment_boundaries,
                                                           paragraph_breaks)

        segmented_text = []
        prevb = 0
        for b in normalized_boundaries:
            if b == 0:
                continue
            segmented_text.append(text[prevb:b])
            prevb = b
        if prevb < text_length:  # append any trailing text
            segmented_text.append(text[prevb:])

        if self.demo_mode:
            return gap_scores, smooth_scores, depth_scores, segment_boundaries
        return segmented_text
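    # The block-comparison score computed by _block_comparison below is, in
    # effect, a cosine-style similarity between the two blocks of
    # pseudosentences adjacent to a gap:
    #
    #   score(gap) = sum_t f(t, b1) * f(t, b2)
    #                / sqrt(sum_t f(t, b1)**2 * sum_t f(t, b2)**2)
    #
    # where f(t, b) is the frequency of token t in block b.  For example, with
    # hypothetical counts b1 = {cat: 2, dog: 1} and b2 = {cat: 1, fish: 3},
    # the score is (2*1) / sqrt((4+1) * (1+9)) = 2 / sqrt(50) ~= 0.28.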
    def _block_comparison(self, tokseqs, token_table):
        """Implements the block comparison method"""
        def blk_frq(tok, block):
            ts_occs = filter(lambda o: o[0] in block,
                             token_table[tok].ts_occurences)
            freq = sum([tsocc[1] for tsocc in ts_occs])
            return freq

        gap_scores = []
        numgaps = len(tokseqs) - 1

        for curr_gap in range(numgaps):
            score_dividend, score_divisor_b1, score_divisor_b2 = 0.0, 0.0, 0.0
            score = 0.0
            # Adjust the window size near the ends of the text.
            if curr_gap < self.k - 1:
                window_size = curr_gap + 1
            elif curr_gap > numgaps - self.k:
                window_size = numgaps - curr_gap
            else:
                window_size = self.k

            b1 = [ts.index
                  for ts in tokseqs[curr_gap - window_size + 1:curr_gap + 1]]
            b2 = [ts.index
                  for ts in tokseqs[curr_gap + 1:curr_gap + window_size + 1]]

            for t in token_table.keys():
                score_dividend += blk_frq(t, b1) * blk_frq(t, b2)
                score_divisor_b1 += blk_frq(t, b1) ** 2
                score_divisor_b2 += blk_frq(t, b2) ** 2
            try:
                score = score_dividend / math.sqrt(score_divisor_b1 *
                                                   score_divisor_b2)
            except ZeroDivisionError:
                pass  # score stays 0.0

            gap_scores.append(score)

        return gap_scores

    def _smooth_scores(self, gap_scores):
        """Wraps the smooth function from the SciPy Cookbook"""
        return list(smooth(numpy.array(gap_scores[:]),
                           window_len=self.smoothing_width + 1))

    def _mark_paragraph_breaks(self, text):
        """Identifies indented text or line breaks as the beginning of
        paragraphs"""
        MIN_PARAGRAPH = 100
        pattern = re.compile("[ \t]*\n[ \t]*\n[ \t]*")
        matches = pattern.finditer(text)

        last_break = 0
        pbreaks = [0]
        for pb in matches:
            if pb.start() - last_break < MIN_PARAGRAPH:
                continue
            else:
                pbreaks.append(pb.start())
                last_break = pb.start()

        return pbreaks

    def _divide_to_tokensequences(self, text):
        """Divides the text into pseudosentences of fixed size"""
        w = self.w
        wrdindex_list = []
        matches = re.finditer("\w+", text)
        for match in matches:
            wrdindex_list.append((match.group(), match.start()))
        return [TokenSequence(i / w, wrdindex_list[i:i + w])
                for i in range(0, len(wrdindex_list), w)]

    def _create_token_table(self, token_sequences, par_breaks):
        """Creates a table of TokenTableFields"""
        token_table = {}
        current_par = 0
        current_tok_seq = 0
        pb_iter = par_breaks.__iter__()
        current_par_break = pb_iter.next()
        if current_par_break == 0:
            try:
                current_par_break = pb_iter.next()  # skip break at 0
            except StopIteration:
                raise ValueError(
                    "No paragraph breaks were found(text too short perhaps?)")

        for ts in token_sequences:
            for word, index in ts.wrdindex_list:
                try:
                    while index > current_par_break:
                        current_par_break = pb_iter.next()
                        current_par += 1
                except StopIteration:
                    # Passed the last break; stay in the final paragraph.
                    pass

                if word in token_table:
                    token_table[word].total_count += 1

                    if token_table[word].last_par != current_par:
                        token_table[word].last_par = current_par
                        token_table[word].par_count += 1

                    if token_table[word].last_tok_seq != current_tok_seq:
                        token_table[word].last_tok_seq = current_tok_seq
                        token_table[word].ts_occurences.append(
                            [current_tok_seq, 1])
                    else:
                        token_table[word].ts_occurences[-1][1] += 1
                else:  # new word
                    token_table[word] = TokenTableField(
                        first_pos=index,
                        ts_occurences=[[current_tok_seq, 1]],
                        total_count=1,
                        par_count=1,
                        last_par=current_par,
                        last_tok_seq=current_tok_seq)

            current_tok_seq += 1

        return token_table
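    # A gap's depth score (computed in _depth_scores below) is
    #
    #   depth = lpeak + rpeak - 2 * gapscore
    #
    # i.e. how far the smoothed similarity curve dips at the gap relative to
    # the nearest peak on each side.  With hypothetical values lpeak = 0.8,
    # rpeak = 0.7 and gapscore = 0.3, depth = 0.8 + 0.7 - 0.6 = 0.9.
    # _identify_boundaries then keeps only gaps whose depth exceeds a cutoff
    # derived from the mean and standard deviation of all depth scores.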
    def _identify_boundaries(self, depth_scores):
        """Identifies boundaries at the peaks of similarity score
        differences"""

        boundaries = [0 for x in depth_scores]

        avg = sum(depth_scores) / len(depth_scores)
        stdev = numpy.std(depth_scores)

        if self.cutoff_policy == LC:
            cutoff = avg - stdev
        else:
            cutoff = avg - stdev / 2.0

        depth_tuples = zip(depth_scores, range(len(depth_scores)))
        depth_tuples.sort()
        depth_tuples.reverse()
        hp = filter(lambda x: x[0] > cutoff, depth_tuples)

        for dt in hp:
            boundaries[dt[1]] = 1
            for dt2 in hp:  # undo if there is a boundary close by already
                if dt[1] != dt2[1] and abs(dt2[1] - dt[1]) < 4 \
                        and boundaries[dt2[1]] == 1:
                    boundaries[dt[1]] = 0
        return boundaries

    def _depth_scores(self, scores):
        """Calculates the depth of each gap, i.e. the average difference
        between the left and right peaks and the gap's score"""

        depth_scores = [0 for x in scores]
        # Clip the boundaries: a section should not be smaller than a couple
        # of pseudosentences for short texts, or about five for longer ones.
        clip = min(max(len(scores) / 10, 2), 5)
        index = clip

        for gapscore in scores[clip:-clip]:
            lpeak = gapscore
            for score in scores[index::-1]:
                if score >= lpeak:
                    lpeak = score
                else:
                    break
            rpeak = gapscore
            for score in scores[index:]:
                if score >= rpeak:
                    rpeak = score
                else:
                    break
            depth_scores[index] = lpeak + rpeak - 2 * gapscore
            index += 1

        return depth_scores

    def _normalize_boundaries(self, text, boundaries, paragraph_breaks):
        """Normalize the boundaries identified to the original text's
        paragraph breaks"""

        norm_boundaries = []
        char_count, word_count, gaps_seen = 0, 0, 0
        seen_word = False

        for char in text:
            char_count += 1
            if char in " \t\n" and seen_word:
                seen_word = False
                word_count += 1
            if char not in " \t\n" and not seen_word:
                seen_word = True
            if gaps_seen < len(boundaries) and word_count > \
                    (max(gaps_seen * self.w, self.w)):
                if boundaries[gaps_seen] == 1:
                    # Find the closest paragraph break.
                    best_fit = len(text)
                    for br in paragraph_breaks:
                        if best_fit > abs(br - char_count):
                            best_fit = abs(br - char_count)
                            bestbr = br
                        else:
                            break
                    if bestbr not in norm_boundaries:  # avoid duplicates
                        norm_boundaries.append(bestbr)
                gaps_seen += 1

        return norm_boundaries


class TokenTableField(object):
    """A field in the token table holding parameters for each token,
    used later in the process"""

    def __init__(self,
                 first_pos,
                 ts_occurences,
                 total_count=1,
                 par_count=1,
                 last_par=0,
                 last_tok_seq=None):
        self.__dict__.update(locals())
        del self.__dict__['self']


class TokenSequence(object):
    """A token list with its original length and its index"""

    def __init__(self, index, wrdindex_list, original_length=None):
        original_length = original_length or len(wrdindex_list)
        self.__dict__.update(locals())
        del self.__dict__['self']


# Adapted from the SciPy Cookbook (see _smooth_scores above).
def smooth(x, window_len=11, window='flat'):
    """smooth the data using a window with requested size.

    This method is based on the convolution of a scaled window with the signal.
    The signal is prepared by introducing reflected copies of the signal 
    (with the window size) in both ends so that transient parts are minimized
    in the beginning and end part of the output signal.

    input:
        x: the input signal 
        window_len: the dimension of the smoothing window; should be an odd integer
        window: the type of window from 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'
            flat window will produce a moving average smoothing.

    output:
        the smoothed signal

    example:

    t=arange(-2,2,0.1)
    x=sin(t)+randn(len(t))*0.1
    y=smooth(x)

    see also: 

    numpy.hanning, numpy.hamming, numpy.bartlett, numpy.blackman, numpy.convolve
    scipy.signal.lfilter

    TODO: the window parameter could be the window itself (an array) instead of a string
    """

    if x.ndim != 1:
        raise ValueError("smooth only accepts 1 dimension arrays.")
    if x.size < window_len:
        raise ValueError("Input vector needs to be bigger than window size.")
    if window_len < 3:
        return x
    if window not in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']:
        raise ValueError("Window is one of 'flat', 'hanning', 'hamming', "
                         "'bartlett', 'blackman'")

    # Pad both ends with mirrored copies of the signal to limit transients.
    s = numpy.r_[2 * x[0] - x[window_len:1:-1],
                 x,
                 2 * x[-1] - x[-1:-window_len:-1]]

    if window == 'flat':  # moving average
        w = numpy.ones(window_len, 'd')
    else:
        w = eval('numpy.' + window + '(window_len)')

    y = numpy.convolve(w / w.sum(), s, mode='same')
    return y[window_len - 1:-window_len + 1]


def demo(text=None):
    from nltk.corpus import brown
    import pylab

    tt = TextTilingTokenizer(demo_mode=True)
    if text is None:
        text = brown.raw()[:10000]
    s, ss, d, b = tt.tokenize(text)
    pylab.xlabel("Sentence Gap index")
    pylab.ylabel("Gap Scores")
    pylab.plot(range(len(s)), s, label="Gap Scores")
    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
    pylab.plot(range(len(d)), d, label="Depth scores")
    pylab.stem(range(len(b)), b)
    pylab.legend()
    pylab.show()


if __name__ == '__main__':
    demo()
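# Illustrative interactive use (hypothetical session; assumes the NLTK Brown
# corpus and a pylab/matplotlib installation are available):
#
#   >>> from nltk.tokenize.texttiling import demo
#   >>> demo()    # plots gap, smoothed and depth scores for a Brown sample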