Ticket #3726: sage-3726-part3.patch

File sage-3726-part3.patch, 10.7 KB (added by was, 11 years ago)
  • new file sage/stats/hmm/__init__.py

    # HG changeset patch
    # User William Stein <wstein@gmail.com>
    # Date 1217065659 -7200
    # Node ID 93c0bb9f75e840f79d45f2f3acf3e218fda02829
    # Parent  95f9a6bbe30a24f765d116835e2857a6269f7682
    trac #3726 -- HMM -- part 3
    
    diff -r 95f9a6bbe30a -r 93c0bb9f75e8 sage/stats/hmm/__init__.py
    - +  
     1# Hidden Markov Models
  • sage/stats/hmm/hmm.pyx

    diff -r 95f9a6bbe30a -r 93c0bb9f75e8 sage/stats/hmm/hmm.pyx
    a b cdef extern from "ghmm/model.h": 
    133133        ghmm_alphabet* alphabet
    134134
    135135
    136     int ghmm_dmodel_normalize(ghmm_dmodel *m)
    137     int ghmm_dmodel_free(ghmm_dmodel **m)
    138     int ghmm_dmodel_logp(ghmm_dmodel *m, int *O, int len, double *log_p)
    139 
    140 
    141136    # Discrete sequences
    142137    ctypedef struct ghmm_dseq:
    143138        # sequence array. sequence[i] [j] = j-th symbol of i-th seq
    cdef extern from "ghmm/model.h": 
    171166    ghmm_dseq *ghmm_dmodel_generate_sequences(ghmm_dmodel* mo, int seed, int global_len,
    172167                                          long seq_number, int Tmax)
    173168
     169
     170    int ghmm_dmodel_normalize(ghmm_dmodel *m)
     171    int ghmm_dmodel_free(ghmm_dmodel **m)
     172    int ghmm_dmodel_logp(ghmm_dmodel *m, int *O, int len, double *log_p)
     173    int *ghmm_dmodel_viterbi (ghmm_dmodel *m, int *O, int len, int *pathlen, double *log_p)
     174    int ghmm_dmodel_baum_welch (ghmm_dmodel *m, ghmm_dseq *sq)
     175    int ghmm_dmodel_baum_welch_nstep (ghmm_dmodel * m, ghmm_dseq *sq, int max_step,
     176                                      double likelihood_delta)
     177   
     178
    174179cdef class HiddenMarkovModel:
    175180    pass
    176181
    cdef class DiscreteHiddenMarkovModel: 
    178183cdef class DiscreteHiddenMarkovModel:
    179184    cdef ghmm_dmodel* m
    180185    cdef bint initialized
    181     cdef object _emission_symbols
    182     cdef object _emission_symbols_special   
     186    cdef object _emission_symbols, _emission_symbols_dict
    183187    cdef Matrix_real_double_dense A, B
    184188   
    185189    def __init__(self, A, B, pi, emission_symbols=None, name=None):
    cdef class DiscreteHiddenMarkovModel: 
    328332            ghmm_dmodel_free(&self.m)
    329333
    330334    def __repr__(self):
     335        """
     336        Return string representation of this HMM.
     337
     338        OUTPUT:
     339            string
     340
     341        EXAMPLES:
     342            sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[0.9,0.1],[0.1,0.9]], [0.5,0.5], [3/4, 'abc'])
     343            sage: a.__repr__()
     344            "Discrete Hidden Markov Model (2 states, 2 outputs)\n\nInitial probabilities: [0.5, 0.5]\nTransition matrix:\n[0.1 0.9]\n[0.1 0.9]\nEmission matrix:\n[0.9 0.1]\n[0.1 0.9]\nEmission symbols: [3/4, 'abc']"
     345        """
    331346        s = "Discrete Hidden Markov Model%s (%s states, %s outputs)\n"%(
    332347            ' ' + self.m.name if self.m.name else '',
    333348            self.m.N, self.m.M)
    334         s += 'Initial probabilities: %s\n'%self.initial_probabilities()
    335         s += 'Transition matrix:\n%s\n'%self.transition_matrix()
    336         s += 'Emission matrix:\n%s\n'%self.emission_matrix()
     349        s += '\nInitial probabilities: %s'%self.initial_probabilities()
     350        s += '\nTransition matrix:\n%s'%self.transition_matrix()
     351        s += '\nEmission matrix:\n%s'%self.emission_matrix()
     352        if self._emission_symbols_dict:
     353            s += '\nEmission symbols: %s'%self._emission_symbols
    337354        return s
    338355
    339356    def initial_probabilities(self):
    cdef class DiscreteHiddenMarkovModel: 
    422439        cdef ghmm_dseq *d = ghmm_dmodel_generate_sequences(self.m, seed, length, 1, length)
    423440        cdef Py_ssize_t i
    424441        v = [d.seq[0][i] for i in range(length)]
    425         if self._emission_symbols_special:
    426             return v
    427         else:
     442        if self._emission_symbols_dict:
    428443            w = self._emission_symbols
    429444            return [w[i] for i in v]
     445        else:
     446            return v
    430447
    431448    def sample(self, long length, long number):
    432449        """
    cdef class DiscreteHiddenMarkovModel: 
    442459        cdef ghmm_dseq *d = ghmm_dmodel_generate_sequences(self.m, seed, length, number, length)
    443460        cdef Py_ssize_t i, j
    444461        v = [[d.seq[j][i] for i in range(length)] for j in range(number)]
    445         if self._emission_symbols_special:
    446             return v
    447         else:
     462        if self._emission_symbols_dict:
    448463            w = self._emission_symbols
    449464            return [[w[i] for i in z] for z in v]
     465        else:
     466            return v
    450467
    451468    def emission_symbols(self):
    452469        """
    cdef class DiscreteHiddenMarkovModel: 
    479496        """
    480497        if emission_symbols is None:
    481498            self._emission_symbols = range(self.B.ncols())
     499            self._emission_symbols_dict = None
    482500        else:
    483501            self._emission_symbols = list(emission_symbols)
    484         self._emission_symbols_special = (self._emission_symbols == range(self.B.ncols()))
     502            if self._emission_symbols != range(self.B.ncols()):
     503                self._emission_symbols_dict = dict([(x,i) for i, x in enumerate(emission_symbols)])
    485504
    486     cpdef double log_likelihood(self, seq):
     505
     506    ####################################################################
     507    # HMM Problem 1 -- Computing likelihood: Given the parameter set
     508    # lambda of an HMM model and an observation sequence O, determine
     509    # the likelihood P(O | lambda).
     510    ####################################################################
     511    def log_likelihood(self, seq):
    487512        r"""
    488         Return $\log ( P[seq | model] )$, the log of the probability of seeing
    489         the given sequence given this model, using the forward algorithm and
    490         assuming independance of the sequence seq.
     513        HMM Problem 1: Return $\log ( P[seq | model] )$, the log of
     514        the probability of seeing the given sequence given this model,
     515        using the forward algorithm and assuming independence of the
     516        sequence seq.
    491517
    492         WARNING: By convention we return 1.0 for 0 probability events.
     518        INPUT:
     519            seq -- a list; sequence of observed emissions of the HMM
     520
     521        OUTPUT:
     522            float -- the log of the probability of seeing this sequence
     523                     given the model
     524
     525        WARNING: By convention we return -inf for 0 probability events.
    493526       
    494527        EXAMPLES:
    495528            sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[1,0],[0,1]], [0,1])
    cdef class DiscreteHiddenMarkovModel: 
    500533
    501534        Notice that any sequence starting with 0 can't occur, since
    502535        the above model always starts in a state that produces 1 with
    503         probability 1.  By convention log(probability 0) is 1.
     536        probability 1.  By convention log(probability 0) is -inf.
    504537            sage: a.log_likelihood([0,0])
    505             1.0
     538            -inf
    506539
    507540        Here's a special case where each sequence is equally probable.
    508541            sage: a = hmm.DiscreteHiddenMarkovModel([[0.5,0.5],[0.5,0.5]], [[1,0],[0,1]], [0.5,0.5])
    cdef class DiscreteHiddenMarkovModel: 
    526559            return -float('Inf')
    527560        return log_p
    528561
     562    ####################################################################
     563    # HMM Problem 2 -- Decoding: Given the complete parameter set that
     564    # defines the model and an observation sequence seq, determine the
     565    # best hidden sequence Q.  Use the Viterbi algorithm.
     566    ####################################################################   
     567    def viterbi(self, seq):
     568        """
     569        HMM Problem 2: Determine a hidden sequence of states that is
     570        most likely to produce the given sequence seq of observations.
     571
     572        INPUT:
     573            seq -- sequence of emitted symbols
     574           
     575        OUTPUT:
     576            list -- a most probable sequence of hidden states, i.e., the
     577                    Viterbi path.
     578            float -- log of the probability that the sequence of hidden
     579                     states actually produced the given sequence seq.
     580                     [[TODO: I do not understand precisely what this means.]]
     581
     582        EXAMPLES:
     583            sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[0.9,0.1],[0.1,0.9]], [0.5,0.5])
     584            sage: a.viterbi([1,0,0,1,0,0,1,1])
     585            ([1, 0, 0, 1, 1, 0, 1, 1], -11.062453224772216)
     586           
     587        We predict the state sequence when the emissions are 3/4 and 'abc'.
     588            sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[0.9,0.1],[0.1,0.9]], [0.5,0.5], [3/4, 'abc'])
     589
     590        Note that state 0 is common below, despite the model trying hard to
     591        switch to state 1:
     592            sage: a.viterbi([3/4, 'abc', 'abc'] + [3/4]*10)
     593            ([0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], -25.299405845367794)
     594        """
     595        if self._emission_symbols_dict:
     596            seq = [self._emission_symbols_dict[z] for z in seq]
     597        cdef int* path
     598        cdef int* O = to_int_array(seq)
     599        cdef int pathlen
     600        cdef double log_p
     601
     602        path = ghmm_dmodel_viterbi(self.m, O, len(seq), &pathlen, &log_p)
     603        sage_free(O)
     604        p = [path[i] for i in range(pathlen)]
     605        sage_free(path)
     606
     607        return p, log_p
     608
     609    ####################################################################
     610    # HMM Problem 3 -- Learning: Given an observation sequence O and
     611    # the set of states in the HMM, improve the HMM to increase the
     612    # probability of observing O.
     613    ####################################################################
     614    def baum_welch(self, training_seqs, nsteps=None, log_likelihood_cutoff=None):
     615        pass
     616## /** Baum-Welch-Algorithm for parameter reestimation (training) in
     617##     a discrete (discrete output functions) HMM. Scaled version
     618##     for multiple sequences, alpha and beta matrices are allocated with
     619##      ighmm_cmatrix_stat_alloc
     620##     New parameters set directly in hmm (no storage of previous values!).
     621##     For reference see: 
     622##     Rabiner, L.R.: "`A Tutorial on Hidden {Markov} Models and Selected
     623##                 Applications in Speech Recognition"', Proceedings of the IEEE,
     624##      77, no 2, 1989, pp 257--285   
     625##   @return            0/-1 success/error
     626##   @param mo          initial model
     627##   @param sq          training sequences
     628##   */
     629## /** Just like reestimate_baum_welch, but you can limit
     630##     the maximum number of steps
     631##   @return            0/-1 success/error
     632##   @param mo          initial model
     633##   @param sq          training sequences
     634##   @param max_step    maximal number of Baum-Welch steps
     635##   @param likelihood_delta minimal improvement in likelihood required for carrying on. Relative value
     636##   to log likelihood
     637##   */
     638
     639       
     640       
     641
     642   
    529643   
    530644
    531645##################################################################################