³ò
ŽB_Kc           @   su   d  Z  d d k Z d d k Td e f d „  ƒ  YZ e d „ Z d „  Z d „  Z d	 „  Z	 e
 d
 j o e ƒ  n d S(   s~   
Implementation of 'TnT - A Statisical Part of Speech Tagger'
by Thorsten Brants

http://acl.ldc.upenn.edu/A/A00/A00-1031.pdf
iÿÿÿÿN(   t   *t   TnTc           B   sb   e  Z d  Z e e d e d „ Z d „  Z d „  Z d „  Z d „  Z	 d „  Z
 d „  Z d	 „  Z RS(
   sŸ  
    TnT - Statistical POS tagger

    IMPORTANT NOTES:

    * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS
      It is possible to provide an untrained POS tagger to
      create tags for unknown words, see __init__ function

    * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT
      - Due to the nature of this tagger, it works best when
       trained over sentence delimited input.
     - However it still produces good results if the training
       data and testing data are separated on all punctuation eg: [,.?!]
     - Input for training is expected to be a list of sentences
       where each sentence is a list of (word, tag) tuples
     - Input for tag function is a single sentence
       Input for tagdata function is a list of sentences
       Output is of a similar form

    * Function provided to process text that is unsegmented
      - Please see basic_sent_chop()


    TnT uses a second order Markov model to produce tags for
    a sequence of input, specifically:

      argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T)

    IE: the maximum projection of a set of probabilities

    The set of possible tags for a given word is derived
    from the training data. It is the set of all tags
    that exact word has been assigned.

    The probability of a tag for a given word is the linear
    interpolation of 3 markov models; a zero-order, first-order,
    and a second order model.

      P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) +
                             l3*P(t_i| t_i-1, t_i-2)

    A beam search is used to limit the memory usage of the algorithm.
    The degree of the beam can be changed using N in the initialization.
    N represents the maximum number of possible solutions to maintain
    while tagging.
   
    It is possible to differentiate the tags which are assigned to
    capitalized words. However this does not result in a significant
    gain in the accuracy of the results. 
    iè  c         C   s¯   t  i i ƒ  |  _ t  i i ƒ  |  _ t  i i ƒ  |  _ t  i i ƒ  |  _ t  i i ƒ  |  _ d |  _	 d |  _
 d |  _ | |  _ | |  _ | |  _ | |  _ d |  _ d |  _ d S(   sø  
        Construct a TnT statistical tagger. Tagger must be trained
        before being used to tag input.

        @param unk: instance of a POS tagger, conforms to TaggerI
        @type  unk:(TaggerI)
        @param Trained: Indication that the POS tagger is trained or not
        @type  Trained: boolean
        @param N: Beam search degree (see above)
        @type  N:(int)
        @param C: Capitalization flag 
        @type  C: boolean

        Initializer, creates frequency distributions to be used
        for tagging

        _lx values represent the portion of the tri/bi/uni taggers
        to be used to calculate the probability
      
        N value is the number of possible solutions to maintain
        while tagging. A good value for this is 1000

        C is a boolean value which specifies to use or
        not use the Capitalization of the word as additional
        information for tagging.
        NOTE: using capitalization may not increase the accuracy
        of the tagger
        g        i    N(   t   nltkt   probabilityt   FreqDistt   _unit   ConditionalFreqDistt   _bit   _trit   _wdt   _eost   _l1t   _l2t   _l3t   _Nt   _Ct   _Tt   _unkt   unknownt   known(   t   selft   unkt   Trainedt   Nt   C(    (    s"   /p/zhu/06/nlp/nltk/nltk/tag/tnt.pyt   __init__I   s    								c         C   sC  t  } |  i d j o$ |  i t  j o |  i i | ƒ n xø | D]ð } d d g } xÇ | D]¿ \ } } |  i o | d i ƒ  o
 t } n |  i | i	 | ƒ |  i
 i	 | | f ƒ |  i | d i	 | | f ƒ |  i t | ƒ i	 | | f ƒ | i | | f ƒ | i d ƒ t  } qZ W|  i | i	 d ƒ qA W|  i ƒ  d S(   sü   
        Uses a set of tagged data to train the tagger.
        If an unknown word tagger is specified,
        it is trained on the same data.   

        @param data: List of lists of (word, tag) tuples
        @type data: L{tuple} of L{str}
        t   BOSi    i   t   EOSN(   t   FalseR   t   NoneR   t   trainR   t   isuppert   TrueR	   t   incR   R   R   t   tuplet   appendt   popR
   t   _compute_lambda(   R   t   dataR   t   sentt   historyt   wt   t(    (    s"   /p/zhu/06/nlp/nltk/nltk/tag/tnt.pyR   y   s$       % 
c         C   s“  d } d } d } x;|  i  i ƒ  D]*} | \ } } x|  i  | i ƒ  D] } |  i | d j o qH n |  i |  i  | | d |  i  | i ƒ  d ƒ } |  i |  i | | d |  i | i ƒ  d ƒ }	 |  i |  i | d |  i i ƒ  d ƒ }
 |
 | j o& |
 |	 j o | |  i  | | 7} qH |	 | j o& |	 |
 j o | |  i  | | 7} qH | |	 j o& | |
 j o | |  i  | | 7} qH | |	 j oO | |
 j oB | t |  i  | | ƒ d 7} | t |  i  | | ƒ d 7} qH |	 |
 j oO |
 | j oB | t |  i  | | ƒ d 7} | t |  i  | | ƒ d 7} qH qH Wq" W| | | | |  _ | | | | |  _	 | | | | |  _
 d S(   s  
        creates lambda values based upon training data

        NOTE: no need to explicitly reference C,
        it is contained within the tag variable :: tag == (tag,C)

        for each tag trigram (t1, t2, t3)
        depending on the maximum value of
        - f(t1,t2,t3)-1 / f(t1,t2)-1
        - f(t2,t3)-1 / f(t2)-1
        - f(t3)-1 / N-1

        increment l3,l2, or l1 by f(t1,t2,t3)

        ISSUES -- Resolutions:
        if 2 values are equal, increment both lambda values
        by (f(t1,t2,t3) / 2)
        g        i   g       @N(   R   t
   conditionst   samplesR   t	   _safe_divR   R   t   floatR   R   R   (   R   t   tl1t   tl2t   tl3R(   t   h1t   h2t   tagt   c3t   c2t   c1(    (    s"   /p/zhu/06/nlp/nltk/nltk/tag/tnt.pyR%   ¨   s:      22*##c         C   s-   | d j o d Sn t  | ƒ t  | ƒ Sd S(   s€   
        Safe floating point division function, does not allow division by 0
        returns -1 if the denominator is 0
        i    iÿÿÿÿN(   R.   (   R   t   v1t   v2(    (    s"   /p/zhu/06/nlp/nltk/nltk/tag/tnt.pyR-   û   s    c         C   s7   g  } x* | D]" } |  i  | ƒ } | i | ƒ q W| S(   sk  
        Tags each sentence in a list of sentences

        @param data:list of list of words
        @type data: [[string,],]
        @return: list of list of (word, tag) tuples

        Invokes tag(sent) function for each sentence
        compiles the results into a list of tagged sentences
        each tagged sentence is a list of (word, tag) tuples
        (   R4   R#   (   R   R&   t   resR'   t   res1(    (    s"   /p/zhu/06/nlp/nltk/nltk/tag/tnt.pyt   tagdata  s     c   	      C   s…   d d g d f g } t  | ƒ } |  i | | ƒ } g  } xE t t | ƒ ƒ D]1 } | | d \ } } | i | | | f ƒ qL W| S(   so  
        Tags a single sentence

        @param data: list of words
        @type data: [string,]

        @return: [(word, tag),]

        Calls recursive function '_tagword'
        to produce a list of tags

        Associates the sequence of returned tags
        with the correct words in the input sequence

        returns a list of (word, tag) tuples
        R   g      ð?i   (   t   listt   _tagwordt   ranget   lenR#   (	   R   R&   t   current_stateR'   t   tagsR:   t   iR*   R   (    (    s"   /p/zhu/06/nlp/nltk/nltk/tag/tnt.pyR4     s     c      	   C   s«  | g  j o | d \ } } | Sn | d } | d } g  } t  } |  i o | d i ƒ  o
 t } n | |  i i ƒ  j oU|  i d 7_ xÒ| D]7\ } }	 g  }
 xí |  i | i ƒ  D]Ø } |  i i	 | | f ƒ } |  i
 | d i	 | | f ƒ } |  i t | d ƒ i	 | | f ƒ } t |  i | | ƒ t |  i | | f ƒ } |  i | |  i | |  i | } | | } |
 i | | f | f ƒ q¼ Wx2 |
 D]* \ } } | i | | g |	 | f ƒ qŸWq– Wn |  i d 7_ d } |  i t j o d | f } n1 t |  i i | g ƒ ƒ \ \ } } | | f } x! | D] \ } } | i | ƒ qAW| } | i |  i ƒ t | ƒ |  i j o | |  i  } n |  i | | ƒ S(   s.  
        @param sent : List of words remaining in the sentence
        @type sent  : [word,]
        @param current_states : List of possible tag combinations for
                                the sentence so far, and the probability
                                associated with each tag combination
        @type current_states  : [([tag, ],prob), ]

        Tags the first word in the sentence and
        recursively tags the reminder of sentence

        Uses formula specified above to calculate the probability
        of a particular tag
        i    i   iÿÿÿÿiþÿÿÿt   Unk(   R   R   R   R    R	   R+   R   R,   R   t   freqR   R   R"   R.   R   R   R   R#   R   R   R   R=   R4   t   sortt   _cmp_tupR@   R   R>   (   R   R'   t   current_statest   ht   pt   wordt
   new_statesR   R(   t   curr_sent_probt   probsR*   t   p_unit   p_bit   p_trit   p_wdt   p2R4   t   probt   _w(    (    s"   /p/zhu/06/nlp/nltk/nltk/tag/tnt.pyR>   9  sN    

%   &.#
 *$ 	c         C   s9   | \ } } | \ } } | | d j o d Sn d Sd S(   s´   
        comparison function

        @params : (_, prob)
        @types  : (_, int) tuple
      
        used to sort a list of these tuples
        into descending order
        i    i   iÿÿÿÿN(    (   R   t   .1t   .2t   _hqt   p1t   _h2RS   (    (    s"   /p/zhu/06/nlp/nltk/nltk/tag/tnt.pyRG   ¢  s   
(   t   __name__t
   __module__t   __doc__R   R   R   R   R%   R-   R<   R4   R>   RG   (    (    (    s"   /p/zhu/06/nlp/nltk/nltk/tag/tnt.pyR      s   30	/	S	
		!	ic         C   sÝ   g  } g  } d d d d g } | oS x± |  D]D } | | j o$ | i  | ƒ | i  | ƒ g  } q, | i  | ƒ q, Wnb x^ |  D]V \ } } | | j o* | i  | | f ƒ | i  | ƒ g  } q | i  | | f ƒ q W| S(   s’  
    Basic method for tokenizing input into sentences
    for this tagger:

    @param data: list of tokens
                 tokens can be either
                 words or (word, tag) tuples
    @type data: [string,]
                or [(string, string),]

    @param raw: boolean flag marking the input data
                as a list of words or a list of tagged words
    @type raw: Boolean

    @ret : list of sentences
           sentences are a list of tokens
           tokens are the same as the input

    Function takes a list of tokens and separates the tokens into lists
    where each list represents a sentence fragment
    This function can separate both tagged and raw sequences into
    basic sentences.

    Sentence markers are the set of [,.!?]

    This is a simple method which enhances the performance of the TnT
    tagger. Better sentence tokenization will further enhance the results.
    t   ,t   .t   ?t   !(   R#   (   R&   t   rawt   new_datat	   curr_sentt	   sent_markRK   R4   (    (    s"   /p/zhu/06/nlp/nltk/nltk/tag/tnt.pyt   basic_sent_chop¶  s&     
 
c    
      C   sæ   d d k  l }  d d k l } t | i ƒ  ƒ } t | i ƒ  ƒ } |  i ƒ  } | i | d d !ƒ | i	 | d d !ƒ } xe t
 t | ƒ ƒ D]Q } | | } | | d } x/ t
 t | ƒ ƒ D] }	 | |	 Gd G| |	 GHq¾ WHq Wd  S(	   Niÿÿÿÿ(   t   tnt(   t   browniÈ   iè  id   ix   s   --(   t   nltk.tagRg   t   nltk.corpusRh   R=   t   tagged_sentst   sentsR   R   R<   R?   R@   (
   Rg   Rh   Rl   t   testt   taggert   tagged_datat   jt   sR*   RC   (    (    s"   /p/zhu/06/nlp/nltk/nltk/tag/tnt.pyt   demoî  s     
 c          C   s  d d k  l }  d d k l } d d k l } t | i ƒ  ƒ } | i d d d t	 ƒ } | i d d d t
 ƒ } | i | d ƒ | i | d ƒ xht d
 ƒ D]Z} |  i | | | d	 | d d	 !ƒ } t | i ƒ t | i | i ƒ } t | i ƒ t | i | i ƒ }	 d | _ d | _ d GHd G| GHd G|	 GHd G| GHd G| |	 GH|  i | | | d	 | d d	 !ƒ }
 t | i ƒ t | i | i ƒ } t | i ƒ t | i | i ƒ } d | _ d | _ d GHd G|
 GHd G| GHd G| GHd G|
 | GHq¡ Wd  S(   Niÿÿÿÿ(   R4   (   Rg   (   t   treebankR   iè  R   i   id   i
   i   i    s   Capitalization off:s	   Accuracy:s   Percentage known:s   Percentage unknown:s   Accuracy over known words:s   Capitalization on:iL  iL  (   R   R4   Ri   Rg   Rj   Rs   R=   Rk   R   R   R    R   R?   t   accuracyR.   R   R   (   R4   Rg   Rs   t   dR*   Rq   RC   t   tacct   tp_unt   tp_knt   sacct   sp_unt   sp_kn(    (    s"   /p/zhu/06/nlp/nltk/nltk/tag/tnt.pyt   demo2  s<     %##					%##					c          C   s÷  d d k  l }  d d k l } l } d d k l } t | i ƒ  ƒ } t | i ƒ  ƒ } | d  } | d  } t	 t
 | ƒ d ƒ } t	 t
 | ƒ d ƒ } d } d }	 d }
 d } d } d } xät d ƒ D]Ö} | i d	 d d
 t ƒ } | i d	 d d
 t ƒ } | | | | d | !} | | | | d | !} | | |  | | d | } | | |  | | d | } | i | ƒ | i | ƒ |  i | | ƒ } t | i ƒ t | i | i ƒ } t | i ƒ t | i | i ƒ } | | 7} d | _ d | _ |  i | | ƒ } t | i ƒ t | i | i ƒ } t | i ƒ t | i | i ƒ } | | 7} d | _ d | _ | | | 7} |	 | | 7}	 |
 | 7}
 | | 7} qË Wd Gd | GHd Gd |
 GHd Gd | GHd Gd |	 GHd Gd | GHd Gd | GHd  S(   Niÿÿÿÿ(   R4   (   Rs   Rh   (   Rg   iè  gš™™™™™¹?i    i
   R   R   i   s   brown: acc over words known:s        : overall accuracy:s        : words known:s   treebank: acc over words known:s           : overall accuracy:s           : words known:(   R   R4   Rj   Rs   Rh   Ri   Rg   R=   Rk   t   intR@   R?   R   R   R   Rt   R.   R   R   (   R4   Rs   Rh   Rg   Ru   t   et   d10t   e10t   tknacct   sknacct   tallacct   sallacct   tknownt   sknownRC   R*   Rq   t   dtestt   etestt   dtraint   etrainRv   Rw   Rx   Ry   Rz   R{   (    (    s"   /p/zhu/06/nlp/nltk/nltk/tag/tnt.pyt   demo3)  s^    

 ##
		##
		
t   __main__(   R]   R   t   apit   TaggerIR   R    Rf   Rr   R|   R‹   R[   (    (    (    s"   /p/zhu/06/nlp/nltk/nltk/tag/tnt.pys   <module>   s   
ÿ £8		%	A