Ticket #3726: sage-3726-part2.patch

File sage-3726-part2.patch, 12.0 KB (added by was, 11 years ago)
  • sage/stats/hmm/hmm.pyx

    # HG changeset patch
    # User William Stein <wstein@gmail.com>
    # Date 1217002823 -7200
    # Node ID eb9a65315a153f3b18dce3b2b0629e410be64ba4
    # Parent  3c4e0a77979f7fad530001d3fd9116ec9196dfff
    trac #3726 --  Hidden Markov Model's patch.
    
    diff -r 3c4e0a77979f -r eb9a65315a15 sage/stats/hmm/hmm.pyx
    a b import math 
    1111
    1212from sage.matrix.all import is_Matrix, matrix
    1313from sage.rings.all  import RDF
     14from sage.misc.randstate import random
    1415
    1516from sage.matrix.matrix_real_double_dense cimport Matrix_real_double_dense
    1617
    cdef extern from "ghmm/model.h": 
    5758    ctypedef struct ghmm_alphabet:
    5859        pass
    5960   
     61    # Discrete HMM's.
    6062    ctypedef struct ghmm_dmodel:
    6163        # Number of states
    6264        int N   
    cdef extern from "ghmm/model.h": 
    133135
    134136    int ghmm_dmodel_normalize(ghmm_dmodel *m)
    135137    int ghmm_dmodel_free(ghmm_dmodel **m)
     138    int ghmm_dmodel_logp(ghmm_dmodel *m, int *O, int len, double *log_p)
    136139
     140
     141    # Discrete sequences
     142    ctypedef struct ghmm_dseq:
     143        # sequence array. sequence[i] [j] = j-th symbol of i-th seq
     144        int **seq
     145        # matrix of state ids, can be used to save the viterbi path during sequence generation.
     146        # ATTENTION: is NOT allocated by ghmm_dseq_calloc
     147        int **states
     148        # array of sequence length
     149        int *seq_len
     150        # array of state path lengths
     151        int *states_len
     152        # array of sequence IDs
     153        double *seq_id
     154        # positive sequence weights.  default is 1 = no weight
     155        double *seq_w
     156        # total number of sequences
     157        long seq_number
     158        # reserved space for sequences is always >= seq_number
     159        long capacity
     160        # sum of sequence weights
     161        double total_w
     162        # matrix of state labels corresponding to seq
     163        int **state_labels
     164        # number of labels for each sequence
     165        int *state_labels_len
     166        # internal flags
     167        unsigned int flags
     168
     169    ghmm_dseq *ghmm_dmodel_label_generate_sequences(
     170        ghmm_dmodel * mo, int seed, int global_len, long seq_number, int Tmax)
     171    ghmm_dseq *ghmm_dmodel_generate_sequences(ghmm_dmodel* mo, int seed, int global_len,
     172                                          long seq_number, int Tmax)
    137173
    138174cdef class HiddenMarkovModel:
    139175    pass
    cdef class DiscreteHiddenMarkovModel: 
    142178cdef class DiscreteHiddenMarkovModel:
    143179    cdef ghmm_dmodel* m
    144180    cdef bint initialized
    145     cdef object emission_symbols
     181    cdef object _emission_symbols
     182    cdef object _emission_symbols_special   
    146183    cdef Matrix_real_double_dense A, B
    147184   
    148185    def __init__(self, A, B, pi, emission_symbols=None, name=None):
    cdef class DiscreteHiddenMarkovModel: 
    178215
    179216        cdef Py_ssize_t i, j, k
    180217
    181         if emission_symbols is None:
    182             self.emission_symbols = range(B.ncols())
    183         else:
    184             self.emission_symbols = list(emission_symbols)
     218        self.A = A
     219        self.B = B
     220        self.set_emission_symbols(emission_symbols)
    185221
    186         self.m = <ghmm_dmodel*> sage_malloc(sizeof(ghmm_dmodel))
    187         if not self.m: raise MemoryError
     222        self.m = <ghmm_dmodel*> safe_malloc(sizeof(ghmm_dmodel))
     223
     224        self.m.label = to_int_array(range(len(self._emission_symbols)))
    188225       
    189226        # Set all pointers to NULL
    190227        self.m.s = NULL; self.m.name = NULL; self.m.silent = NULL
    191228        self.m.tied_to = NULL; self.m.order = NULL; self.m.background_id = NULL
    192229        self.m.bp = NULL; self.m.topo_order = NULL; self.m.pow_lookup = NULL;
    193         self.m.label = NULL; self.m.label_alphabet = NULL; self.m.alphabet = NULL
     230        self.m.label_alphabet = NULL; self.m.alphabet = NULL
    194231
    195232        # Set number of states and number of outputs
    196233        self.m.N = A.nrows()
    197         self.m.M = len(self.emission_symbols)
     234        self.m.M = len(self._emission_symbols)
    198235        # Set the model type to discrete
    199236        self.m.model_type = GHMM_kDiscreteHMM
    200 
    201         self.A = A
    202         self.B = B
    203237
    204238        # Set that no a prior model probabilities are set.
    205239        self.m.prior = -1
    cdef class DiscreteHiddenMarkovModel: 
    289323
    290324        self.initialized = True
    291325       
     326    def __dealloc__(self):
     327        if self.initialized:
     328            ghmm_dmodel_free(&self.m)
    292329
    293330    def __repr__(self):
    294331        s = "Discrete Hidden Markov Model%s (%s states, %s outputs)\n"%(
    cdef class DiscreteHiddenMarkovModel: 
    300337        return s
    301338
    302339    def initial_probabilities(self):
     340        """
     341        Return the list of initial state probabilities.
     342       
     343        EXAMPLES:
     344            sage: a = hmm.DiscreteHiddenMarkovModel([[0.9,0.1],[0.9,0.1]], [[0.5,0.5,0,0],[0,0,.5,.5]], [0.5,0.5], [1,-1,3,-3])
     345            sage: a.initial_probabilities()
     346            [0.5, 0.5]
     347        """
    303348        cdef Py_ssize_t i
    304349        return [self.m.s[i].pi for i in range(self.m.N)]
    305350
    306351    def transition_matrix(self, list_only=True):
     352        """
     353        Return the hidden state transition matrix.
     354       
     355        EXAMPLES:
     356            sage: a = hmm.DiscreteHiddenMarkovModel([[0.9,0.1],[0.9,0.1]], [[0.5,0.5,0,0],[0,0,.5,.5]], [0.5,0.5], [1,-1,3,-3])
     357            sage: a.transition_matrix()
     358            [0.9 0.1]
     359            [0.9 0.1]
     360        """
    307361        cdef Py_ssize_t i, j
    308362        for i from 0 <= i < self.m.N:
    309363            for j from 0 <= j < self.m.s[i].out_states:
    cdef class DiscreteHiddenMarkovModel: 
    311365        return self.A
    312366
    313367    def emission_matrix(self, list_only=True):
     368        """
     369        Return the emission probability matrix.
     370       
     371        EXAMPLES:
     372            sage: a = hmm.DiscreteHiddenMarkovModel([[0.9,0.1],[0.9,0.1]], [[0.5,0.5,0,0],[0,0,.5,.5]], [0.5,0.5], [1,-1,3,-3])
     373            sage: a.emission_matrix()
     374            [0.5 0.5 0.0 0.0]
     375            [0.0 0.0 0.5 0.5]
     376        """
    314377        cdef Py_ssize_t i, j
    315378        for i from 0 <= i < self.m.N:
    316379            for j from 0 <= j < self.B._ncols:
    cdef class DiscreteHiddenMarkovModel: 
    320383    def normalize(self):
    321384        """
    322385        Normalize the transition and emission probabilities, if applicable.
     386
     387        EXAMPLES:
     388            sage: a = hmm.DiscreteHiddenMarkovModel([[0.5,1],[1.2,0.9]], [[1,0.5],[0.5,1]], [0.1,1.2])
     389            sage: a.normalize()
     390            sage: a
     391            Discrete Hidden Markov Model (2 states, 2 outputs)
     392            Initial probabilities: [0.076923076923076927, 0.92307692307692302]
     393            Transition matrix:
     394            [0.333333333333 0.666666666667]
     395            [0.571428571429 0.428571428571]
     396            Emission matrix:
     397            [0.666666666667 0.333333333333]
     398            [0.333333333333 0.666666666667]
    323399        """
    324400        ghmm_dmodel_normalize(self.m)
    325401
    326     def __dealloc__(self):
    327         if self.initialized:
    328             ghmm_dmodel_free(&self.m)
     402    def sample_single(self, long length):
     403        """
     404        Return a single sample computed using this Hidden Markov Model.
     405       
     406        EXAMPLE:
     407            sage: set_random_seed(0)
     408            sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[1,0],[0,1]], [0,1])
     409            sage: a.sample_single(20)
     410            [1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
     411            sage: a.sample_single(1000).count(0)
     412            113
     413
     414        If the emission symbols are set
     415            sage: set_random_seed(0)
     416            sage: a = hmm.DiscreteHiddenMarkovModel([[0.5,0.5],[0.1,0.9]], [[1,0],[0,1]], [0,1], ['up', 'down'])
     417            sage: print a.sample_single(10)
     418            ['down', 'up', 'down', 'down', 'up', 'down', 'down', 'up', 'down', 'up']
     419           
     420        """
     421        seed = random()
     422        cdef ghmm_dseq *d = ghmm_dmodel_generate_sequences(self.m, seed, length, 1, length)
     423        cdef Py_ssize_t i
     424        v = [d.seq[0][i] for i in range(length)]
     425        if self._emission_symbols_special:
     426            return v
     427        else:
     428            w = self._emission_symbols
     429            return [w[i] for i in v]
     430
     431    def sample(self, long length, long number):
     432        """
     433        Return number samples from this HMM of given length.
     434
     435        EXAMPLES:
     436            sage: set_random_seed(0)
     437            sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[1,0],[0,1]], [0,1])
     438            sage: print a.sample(10, 3)
     439            [[1, 0, 1, 1, 0, 1, 1, 0, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
     440        """
     441        seed = random()
     442        cdef ghmm_dseq *d = ghmm_dmodel_generate_sequences(self.m, seed, length, number, length)
     443        cdef Py_ssize_t i, j
     444        v = [[d.seq[j][i] for i in range(length)] for j in range(number)]
     445        if self._emission_symbols_special:
     446            return v
     447        else:
     448            w = self._emission_symbols
     449            return [[w[i] for i in z] for z in v]
     450
     451    def emission_symbols(self):
     452        """
     453        Return a copy of the list of emission symbols of self.
     454
     455        Use set_emission_symbols to set the list of emission symbols.
     456
     457        EXAMPLES:
     458            sage: a = hmm.DiscreteHiddenMarkovModel([[0.5,0.5],[0.1,0.9]], [[1,0],[0,1]], [0,1], ['up', -3/179])
     459            sage: a.emission_symbols()
     460            ['up', -3/179]
     461        """
     462        return list(self._emission_symbols)
     463
     464    def set_emission_symbols(self, emission_symbols):
     465        """
     466        Set the list of emission symbols.
     467
     468        EXAMPLES:
     469            sage: set_random_seed(0)
     470            sage: a = hmm.DiscreteHiddenMarkovModel([[0.5,0.5],[0.1,0.9]], [[1,0],[0,1]], [0,1], ['up', 'down'])
     471            sage: a.set_emission_symbols([3,5])
     472            sage: a.emission_symbols()
     473            [3, 5]
     474            sage: a.sample_single(10)
     475            [5, 3, 5, 5, 3, 5, 5, 3, 5, 3]
     476            sage: a.set_emission_symbols([pi,5/9+e])
     477            sage: a.sample_single(10)
     478            [e + 5/9, e + 5/9, e + 5/9, e + 5/9, e + 5/9, e + 5/9, pi, pi, e + 5/9, pi]
     479        """
     480        if emission_symbols is None:
     481            self._emission_symbols = range(self.B.ncols())
     482        else:
     483            self._emission_symbols = list(emission_symbols)
     484        self._emission_symbols_special = (self._emission_symbols == range(self.B.ncols()))
     485
     486    cpdef double log_likelihood(self, seq):
     487        r"""
     488        Return $\log ( P[seq | model] )$, the log of the probability of seeing
     489        the given sequence given this model, using the forward algorithm and
     490        assuming independance of the sequence seq.
     491
     492        WARNING: By convention we return 1.0 for 0 probability events.
     493       
     494        EXAMPLES:
     495            sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[1,0],[0,1]], [0,1])
     496            sage: a.log_likelihood([1,1])
     497            -0.10536051565782635
     498            sage: a.log_likelihood([1,0])
     499            -2.3025850929940459
     500
     501        Notice that any sequence starting with 0 can't occur, since
     502        the above model always starts in a state that produces 1 with
     503        probability 1.  By convention log(probability 0) is 1.
     504            sage: a.log_likelihood([0,0])
     505            1.0
     506
     507        Here's a special case where each sequence is equally probable.
     508            sage: a = hmm.DiscreteHiddenMarkovModel([[0.5,0.5],[0.5,0.5]], [[1,0],[0,1]], [0.5,0.5])
     509            sage: a.log_likelihood([0,0])
     510            -1.3862943611198906
     511            sage: log(0.25)
     512            -1.38629436111989
     513            sage: a.log_likelihood([0,1])
     514            -1.3862943611198906
     515            sage: a.log_likelihood([1,0])
     516            -1.3862943611198906
     517            sage: a.log_likelihood([1,1])
     518            -1.3862943611198906
     519        """
     520        cdef double log_p
     521        cdef int* O = to_int_array(seq)
     522        cdef int ret = ghmm_dmodel_logp(self.m, O, len(seq), &log_p)
     523        sage_free(O)
     524        if ret == -1:
     525            # forward returned -1: sequence can't be built
     526            return -float('Inf')
     527        return log_p
     528
    329529   
    330 
    331 
    332530
    333531##################################################################################
    334532# Helper Functions