Ticket #3726: sage-3726-part4.patch

File sage-3726-part4.patch, 41.8 KB (added by was, 11 years ago)
  • sage/stats/hmm/all.py

    # HG changeset patch
    # User William Stein <wstein@gmail.com>
    # Date 1217381449 25200
    # Node ID e36529a649ee515528d196c8f4d71974764ef337
    # Parent  93c0bb9f75e840f79d45f2f3acf3e218fda02829
    trac #3726 -- start adding support for continuous hidden markov models
    
    diff -r 93c0bb9f75e8 -r e36529a649ee sage/stats/hmm/all.py
    a b from hmm import DiscreteHiddenMarkovMode 
    1 from hmm import DiscreteHiddenMarkovModel
     1from hmm  import DiscreteHiddenMarkovModel
     2from chmm import NormalHiddenMarkovModel
  • new file sage/stats/hmm/chmm.pxd

    diff -r 93c0bb9f75e8 -r e36529a649ee sage/stats/hmm/chmm.pxd
    - +  
     1
     2from hmm cimport HiddenMarkovModel
     3
     4cdef extern from "ghmm/ghmm.h":
     5    cdef int GHMM_kContinuousHMM
     6
     7cdef extern from "ghmm/smodel.h":
     8    #############################################################
     9    # Continuous emission density types
     10    #############################################################
     11    cdef enum ghmm_density_t:
     12        normal,
     13        normal_right,   # right tail
     14        normal_approx,
     15        normal_left,    # left tail
     16        uniform,
     17        binormal,
     18        multinormal,
     19        density_number
     20
     21    #############################################################
     22    # Continuous emission
     23    #############################################################
     24    # Mean and variance types
     25    cdef union mean_t:
     26        double val
     27        double *vec
     28       
     29    cdef union variance_t:
     30        double val
     31        double *mat
     32
     33    cdef struct ghmm_c_emission:
     34        # Flag for density function for each component of the mixture
     35        #   0: normal density
     36        #   1: truncated normal (right side) density
     37        #   2: approximated normal density
     38        #   3: truncated normal (left side)
     39        #   4: uniform distribution
     40        #   6: multivariate normal
     41        int type
     42        # dimension > 1 for multivariate normals
     43        int dimension
     44        # mean for output functions (pointer to mean vector for multivariate)
     45        mean_t mean
     46        variance_t variance
     47        # pointer to inverse of covariance matrix if multivariate normal;
     48        # else NULL
     49        double *sigmainv
     50        # determinant of covariance matrix for multivariate normal
     51        double det
     52        # Cholesky decomposition of covariance matrix A,
     53        #   if A = G*G' sigmacd only holds G
     54        double *sigmacd
     55        # minimum of uniform distribution or left boundary for
     56        # right-tail gaussians
     57        double min
     58        # maximum of uniform distribution or right boundary for
     59        # left-tail gaussians
     60        double max
     61        # if fixed != 0 the parameters of the density are fixed
     62        int fixed
     63       
     64    #############################################################
     65    # Continous Emission States
     66    #############################################################
     67    ctypedef struct ghmm_cstate:
     68        # Number of output densities per state
     69        int M
     70        # initial prob.
     71        double pi
     72        # IDs of successor states
     73        int *out_id
     74        # IDs of predecessor states
     75        int *in_id
     76        # transition probs to successor states. It is a
     77        # matrix in case of mult. transition matrices (COS > 1)
     78        double **out_a
     79        # transition probs from predecessor states. It is a
     80        # matrix in case of mult. transition matrices (COS > 1)
     81        double **in_a
     82        # number of  successor states
     83        int out_states
     84        # number of  predecessor states
     85        int in_states
     86        # weight vector for output function components
     87        double *c
     88        # flag for fixation of parameter. If fix = 1 do not change parameters of
     89        # output functions, and if fix = 0 do normal training. Default is 0.
     90        int fix
     91        # vector of ghmm_c_emission
     92        # (type and parameters of output function components)
     93        ghmm_c_emission *e
     94        # contains a description of the state (null terminated utf-8)
     95        char *desc
     96        # x coordinate position for graph representation plotting
     97        int xPosition
     98        # y coordinate position for graph representation plotting
     99        int yPosition
     100
     101    #############################################################
     102    # Continous Hidden Markov Model
     103    #############################################################
     104    ctypedef struct ghmm_cmodel
     105       
     106    ctypedef struct ghmm_cmodel_class_change_context:
     107        # Names of class change module/function (for python callback)
     108        char *python_module
     109        char *python_function
     110        # index of current sequence
     111        int k
     112        # pointer to class function
     113        int (*get_class) (ghmm_cmodel*, double*, int, int)
     114        # space for any data necessary for class switch, USER is RESPONSIBLE
     115        void *user_data
     116
     117    ctypedef struct ghmm_cmodel:
     118        # Number of states
     119        int N
     120        # Maximun number of components in the states
     121        int M
     122        # Number of dimensions of the emission components.
     123        # NOTE: All emissions must have the same number of dimensions.
     124        int dim
     125        # The ghmm_cmodel includes two continuous models
     126        #   cos=1: one transition matrix
     127        #   cos>1: (integer) extension for models with several matrices.
     128        int cos
     129        # Prior for a priori probability of the model.
     130        # -1 means no prior specified (all models have equal prob. a priori.)
     131        double prior
     132        # Contains a arbitrary name for the model (null terminated utf-8)
     133        char * name
     134        # Contains bit flags for various model extensions such as
     135        # kSilentStates (see ghmm.h for a complete list).
     136        int model_type
     137        # All states of the model. Transition probabilities are part of the states.
     138        ghmm_cstate *s
     139        # pointer to a ghmm_cmodel_class_change_context struct
     140        # necessary for multiple transition classes
     141        ghmm_cmodel_class_change_context *class_change
     142
     143    int ghmm_cmodel_class_change_alloc(ghmm_cmodel * smo)
     144
     145    #############################################################
     146    # Continous Sequence of states
     147    #
     148    # Sequence structure for double sequences.  Contains an array of
     149    # sequences and corresponding data like sequence labels, sequence
     150    # weights, etc. Sequences may have different length.
     151    # multi-dimension sequences are linearized
     152    #############################################################
     153    ctypedef struct ghmm_cseq:
     154        # sequence array. sequence[i][j] = j-th symbol of i-th seq.
     155        # sequence[i][D * j] = first dimension of j-th observation of i-th sequence
     156        double **seq
     157        # array of sequence length
     158        int *seq_len
     159        # array of sequence IDs
     160        double *seq_id
     161        # positive! sequence weights.  default is 1 = no weight
     162        double *seq_w
     163        # total number of sequences
     164        long seq_number
     165        # reserved space for sequences is always >= seq_number
     166        long capacity
     167        # sum of sequence weights
     168        double total_w
     169        # total number of dimensions
     170        int dim
     171        # flags (internal)
     172        unsigned int flags
     173
     174
     175    #############################################################
     176    # Continous Baum-Welch algorithm context
     177    #############################################################
     178    ctypedef struct ghmm_cmodel_baum_welch_context:
     179        # pointer of continuous model
     180        ghmm_cmodel *smo
     181        # sequence pointer
     182        ghmm_cseq *sqd
     183        # calculated log likelihood
     184        double *logp
     185        # leave reestimation loop if diff. between successive logp values
     186        # is smaller than eps
     187        double eps
     188        # max. no of iterations
     189        int max_iter
     190
     191
     192    int ghmm_cmodel_free (ghmm_cmodel ** smo)
     193   
     194       
     195cdef class ContinuousHiddenMarkovModel(HiddenMarkovModel):
     196    cdef ghmm_cmodel* m
     197    cdef bint initialized
     198
     199cdef class NormalHiddenMarkovModel(ContinuousHiddenMarkovModel):
     200    pass
     201
     202
  • new file sage/stats/hmm/chmm.pyx

    diff -r 93c0bb9f75e8 -r e36529a649ee sage/stats/hmm/chmm.pyx
    - +  
     1
     2
     3include "../../ext/stdsage.pxi"
     4
     5include "misc.pxi"
     6
     7cdef class ContinuousHiddenMarkovModel(HiddenMarkovModel):
     8    def __init__(self, A, B, pi, name=None):
     9        self.initialized = False
     10        HiddenMarkovModel.__init__(self, A, B, pi)
     11        self.m = <ghmm_cmodel*> safe_malloc(sizeof(ghmm_cmodel))
     12        # Set number of states
     13        self.m.N = self.A.nrows()
     14
     15cdef class NormalHiddenMarkovModel(ContinuousHiddenMarkovModel):
     16    """
     17    EXAMPLES:
     18    The transition matrix:
     19        sage: A = [[0,1,0],[0.5,0,0.5],[0.3,0.3,0.4]]
     20
     21    Parameters of the normal emission distributions in pairs of (mu, sigma):
     22        sage: B = [(0,1), (-1,0.5), (1,0.2)]
     23
     24    The initial probabilities per state:
     25        sage: pi = [1,0,0]
     26
     27    Create the continuous HMM:
     28        sage: m = hmm.NormalHiddenMarkovModel(A, B, pi); m
     29    """
     30    def __init__(self, A, B, pi, name=None):
     31        ContinuousHiddenMarkovModel.__init__(self, A, B, pi)
     32
     33        # Set number of outputs
     34        self.m.M = 1
     35
     36        # Set the model type to continuous
     37        self.m.model_type = GHMM_kContinuousHMM
     38
     39        # Assign model identifier if specified
     40        if name is not None:
     41            name = str(name)
     42            self.m.name = name
     43        else:
     44            self.m.name = NULL
     45           
     46        # 1 transition matrix
     47        self.m.cos   =  1
     48        # Set that no a prior model probabilities are set.
     49        self.m.prior = -1
     50        # Dimension is 1
     51        self.m.dim   =  1
     52
     53        # Allocate and initialize states
     54        cdef ghmm_cstate* states = <ghmm_cstate*> safe_malloc(sizeof(ghmm_cstate) * self.m.N)
     55        cdef ghmm_cstate* state
     56        cdef ghmm_c_emission* e
     57
     58        for i in range(self.m.N):
     59            # Parameters of normal distribution
     60            mu, sigma = self.B[i]
     61            # Get a reference to the i-th state for convenience of the notation below.
     62            state = &(states[i])
     63            state.desc = NULL
     64            state.M = 1
     65            e = <ghmm_c_emission*> safe_malloc(sizeof(ghmm_c_emission))
     66            e.type = 0  # normal
     67            e.dimension = 1
     68            e.mean.val = mu
     69            e.variance.val = sigma
     70            # fixing of emissions is deactivated by default           
     71            e.fixed = 0
     72            e.sigmacd = NULL
     73            e.sigmainv = NULL
     74            state.e = e
     75            state.c = to_double_array([1.0])
     76
     77        # Set states
     78        self.m.s = states
     79
     80        self.initialized = True
     81
     82    def __dealloc__(self):
     83        return
     84        if self.initialized:
     85            ghmm_cmodel_free(&self.m)
     86
     87    def __repr__(self):
     88        """
     89        Return string representation of this Continuous HMM.
     90
     91        OUTPUT:
     92            string
     93
     94        EXAMPLES:
     95            sage: m = hmm.NormalHiddenMarkovModel([[0.0,1.0,0],[0.5,0.0,0.5],[0.3,0.3,0.4]], [(0.0,1.0), (-1.0,0.5), (1.0,0.2)], [1,0,0])
     96            sage: a.__repr__()
     97            "Discrete Hidden Markov Model (2 states, 2 outputs)\nInitial probabilities: [0.5, 0.5]\nTransition matrix:\n[0.1 0.9]\n[0.1 0.9]\nEmission matrix:\n[0.9 0.1]\n[0.1 0.9]\nEmission symbols: [3/4, 'abc']"
     98        """
     99        s = "Normal Hidden Markov Model%s (%s states, %s outputs)"%(
     100            ' ' + self.m.name if self.m.name else '',
     101            self.m.N, self.m.M)
     102        #s += '\nInitial probabilities: %s'%self.initial_probabilities()
     103        #s += '\nTransition matrix:\n%s'%self.transition_matrix()
     104        #s += '\nEmission matrix:\n%s'%self.emission_matrix()
     105        return s
     106
  • new file sage/stats/hmm/hmm.pxd

    diff -r 93c0bb9f75e8 -r e36529a649ee sage/stats/hmm/hmm.pxd
    - +  
     1from sage.matrix.matrix_real_double_dense cimport Matrix_real_double_dense
     2
     3cdef extern from "ghmm/ghmm.h":
     4    cdef int GHMM_kSilentStates
     5    cdef int GHMM_kHigherOrderEmissions
     6    cdef int GHMM_kDiscreteHMM
     7
     8cdef extern from "ghmm/model.h":
     9    ctypedef struct ghmm_dstate:
     10        # Initial probability
     11        double pi
     12        # Output probability
     13        double *b
     14
     15        # ID's of the following states
     16        int *out_id
     17        # ID's of the previous states
     18        int *in_id
     19
     20        # Transition probabilities to successor states.
     21        double *out_a
     22        # Transition probabilities from predecessor states.
     23        double *in_a
     24
     25        # Number of successor states
     26        int out_states
     27        # Number of precursor states
     28        int in_states
     29        # if fix == 1 then output probabilities b stay fixed during the training
     30        int fix
     31        # Contains a description of the state (null terminated utf-8)
     32        char * desc
     33        # x coordinate position for graph representation plotting
     34        int xPosition
     35        # y coordinate position for graph representation plotting
     36        int yPosition
     37
     38    ctypedef struct ghmm_dbackground:
     39        pass
     40
     41    ctypedef struct ghmm_alphabet:
     42        pass
     43   
     44    # Discrete HMM's.
     45    ctypedef struct ghmm_dmodel:
     46        # Number of states
     47        int N   
     48        # Number of outputs
     49        int M   
     50        # Vector of states
     51        ghmm_dstate *s 
     52        # Contains bit flags for varios model extensions such as
     53        # kSilentStates, kTiedEmissions (see ghmm.h for a complete list)
     54        int model_type
     55        # The a priori probability for the model.
     56        # A value of -1 indicates that no prior is defined.
     57        # Note: this is not to be confused with priors on emission distributions.
     58        double prior
     59        # An arbitrary name for the model (null terminated utf-8)
     60        char * name
     61
     62        # Flag variables for each state indicating whether it is emitting or not.
     63        # Note: silent != NULL iff (model_type & kSilentStates) == 1
     64        int *silent
     65       
     66        # Int variable for the maximum level of higher order emissions.
     67        int maxorder
     68       
     69        # saves the history of emissions as int,
     70        # the nth-last emission is (emission_history * |alphabet|^n+1) % |alphabet|
     71        int emission_history
     72
     73        # Flag variables for each state indicating whether the states emissions
     74        # are tied to another state. Groups of tied states are represented
     75        # by their tie group leader (the lowest numbered member of the group).
     76        # tied_to[s] == kUntied  : s is not a tied state
     77        # tied_to[s] == s        : s is a tie group leader
     78        # tied_to[t] == s        : t is tied to state s (t>s)
     79        # Note: tied_to != NULL iff (model_type & kTiedEmissions) != 0  */
     80        int *tied_to
     81
     82        # Note: State store order information of the emissions.
     83        # Classical HMMS have emission order 0; that is, the emission probability
     84        # is conditioned only on the state emitting the symbol.
     85        # For higher order emissions, the emission are conditioned on the state s
     86        # as well as the previous emission_order observed symbols.
     87        # The emissions are stored in the state's usual double* b.
     88        # Note: order != NULL iff (model_type & kHigherOrderEmissions) != 0  */
     89        int * order
     90       
     91        # ghmm_dbackground is a pointer to a
     92        # ghmm_dbackground structure, which holds (essentially) an
     93        # array of background distributions (which are just vectors of floating
     94        # point numbers like state.b).
     95        # For each state the array background_id indicates which of the background
     96        # distributions to use in parameter estimation. A value of kNoBackgroundDistribution
     97        # indicates that none should be used.
     98        # Note: background_id != NULL iff (model_type & kHasBackgroundDistributions) != 0
     99        int *background_id
     100        ghmm_dbackground *bp
     101
     102        # Topological ordering of silent states
     103        #  Condition: topo_order != NULL iff (model_type & kSilentStates) != 0
     104        int *topo_order
     105        int topo_order_length
     106
     107        # pow_lookup is a array of precomputed powers
     108        # It contains in the i-th entry M (alphabet size) to the power of i
     109        # The last entry is maxorder+1.
     110        int *pow_lookup
     111
     112        # Store for each state a class label. Limits the possibly state sequence
     113        # Note: label != NULL iff (model_type & kLabeledStates) != 0
     114        int* label
     115        ghmm_alphabet* label_alphabet
     116        ghmm_alphabet* alphabet
     117
     118
     119    # Discrete sequences
     120    ctypedef struct ghmm_dseq:
     121        # sequence array. sequence[i] [j] = j-th symbol of i-th seq
     122        int **seq
     123        # matrix of state ids, can be used to save the viterbi path during sequence generation.
     124        # ATTENTION: is NOT allocated by ghmm_dseq_calloc
     125        int **states
     126        # array of sequence length
     127        int *seq_len
     128        # array of state path lengths
     129        int *states_len
     130        # array of sequence IDs
     131        double *seq_id
     132        # positive sequence weights.  default is 1 = no weight
     133        double *seq_w
     134        # total number of sequences
     135        long seq_number
     136        # reserved space for sequences is always >= seq_number
     137        long capacity
     138        # sum of sequence weights
     139        double total_w
     140        # matrix of state labels corresponding to seq
     141        int **state_labels
     142        # number of labels for each sequence
     143        int *state_labels_len
     144        # internal flags
     145        unsigned int flags
     146
     147    ghmm_dseq *ghmm_dmodel_label_generate_sequences(
     148        ghmm_dmodel * mo, int seed, int global_len, long seq_number, int Tmax)
     149    ghmm_dseq *ghmm_dmodel_generate_sequences(ghmm_dmodel* mo, int seed, int global_len,
     150                                          long seq_number, int Tmax)
     151    int ghmm_dseq_free (ghmm_dseq ** sq)
     152    ghmm_dseq *ghmm_dseq_calloc (long seq_number)
     153
     154    int ghmm_dmodel_normalize(ghmm_dmodel *m)
     155    int ghmm_dmodel_free(ghmm_dmodel **m)
     156
     157    # Problem 1: Likelihood
     158    int ghmm_dmodel_logp(ghmm_dmodel *m, int *O, int len, double *log_p)
     159
     160    # Problem 2: Decoding
     161    int *ghmm_dmodel_viterbi (ghmm_dmodel *m, int *O, int len, int *pathlen, double *log_p)
     162
     163    # Problem 3: Learning
     164    int ghmm_dmodel_baum_welch (ghmm_dmodel *m, ghmm_dseq *sq)
     165   
     166
     167cdef class HiddenMarkovModel:
     168    cdef Matrix_real_double_dense A, B
     169    cdef list pi
     170
     171cdef class DiscreteHiddenMarkovModel(HiddenMarkovModel):
     172    cdef ghmm_dmodel* m
     173    cdef bint initialized
     174    cdef object _emission_symbols, _emission_symbols_dict
  • sage/stats/hmm/hmm.pyx

    diff -r 93c0bb9f75e8 -r e36529a649ee sage/stats/hmm/hmm.pyx
    a b AUTHOR: William Stein 
    55
    66EXAMPLES:
    77
     8TODO:
     9   * make models pickleable (i.e., all parameters should be obtainable
     10     using functions to make this easy).
     11   * continuous hmm's
    812"""
    913
    1014import math
    from sage.rings.all import RDF 
    1317from sage.rings.all  import RDF
    1418from sage.misc.randstate import random
    1519
    16 from sage.matrix.matrix_real_double_dense cimport Matrix_real_double_dense
    17 
    1820include "../../ext/stdsage.pxi"
    1921
    20 cdef extern from "ghmm/ghmm.h":
    21     cdef int GHMM_kSilentStates
    22     cdef int GHMM_kHigherOrderEmissions
    23     cdef int GHMM_kDiscreteHMM
    24 
    25 cdef extern from "ghmm/model.h":
    26     ctypedef struct ghmm_dstate:
    27         # Initial probability
    28         double pi
    29         # Output probability
    30         double *b
    31 
    32         # ID's of the following states
    33         int *out_id
    34         # ID's of the previous states
    35         int *in_id
    36 
    37         # Transition probabilities to successor states.
    38         double *out_a
    39         # Transition probabilities from predecessor states.
    40         double *in_a
    41 
    42         # Number of successor states
    43         int out_states
    44         # Number of precursor states
    45         int in_states
    46         # if fix == 1 then output probabilities b stay fixed during the training
    47         int fix
    48         # Contains a description of the state (null terminated utf-8)
    49         char * desc
    50         # x coordinate position for graph representation plotting
    51         int xPosition
    52         # y coordinate position for graph representation plotting
    53         int yPosition
    54 
    55     ctypedef struct ghmm_dbackground:
    56         pass
    57 
    58     ctypedef struct ghmm_alphabet:
    59         pass
    60    
    61     # Discrete HMM's.
    62     ctypedef struct ghmm_dmodel:
    63         # Number of states
    64         int N   
    65         # Number of outputs
    66         int M   
    67         # Vector of states
    68         ghmm_dstate *s 
    69         # Contains bit flags for varios model extensions such as
    70         # kSilentStates, kTiedEmissions (see ghmm.h for a complete list)
    71         int model_type
    72         # The a priori probability for the model.
    73         # A value of -1 indicates that no prior is defined.
    74         # Note: this is not to be confused with priors on emission distributions.
    75         double prior
    76         # An arbitrary name for the model (null terminated utf-8)
    77         char * name
    78 
    79         # Flag variables for each state indicating whether it is emitting or not.
    80         # Note: silent != NULL iff (model_type & kSilentStates) == 1
    81         int *silent
    82        
    83         # Int variable for the maximum level of higher order emissions.
    84         int maxorder
    85        
    86         # saves the history of emissions as int,
    87         # the nth-last emission is (emission_history * |alphabet|^n+1) % |alphabet|
    88         int emission_history
    89 
    90         # Flag variables for each state indicating whether the states emissions
    91         # are tied to another state. Groups of tied states are represented
    92         # by their tie group leader (the lowest numbered member of the group).
    93         # tied_to[s] == kUntied  : s is not a tied state
    94         # tied_to[s] == s        : s is a tie group leader
    95         # tied_to[t] == s        : t is tied to state s (t>s)
    96         # Note: tied_to != NULL iff (model_type & kTiedEmissions) != 0  */
    97         int *tied_to
    98 
    99         # Note: State store order information of the emissions.
    100         # Classical HMMS have emission order 0; that is, the emission probability
    101         # is conditioned only on the state emitting the symbol.
    102         # For higher order emissions, the emission are conditioned on the state s
    103         # as well as the previous emission_order observed symbols.
    104         # The emissions are stored in the state's usual double* b.
    105         # Note: order != NULL iff (model_type & kHigherOrderEmissions) != 0  */
    106         int * order
    107        
    108         # ghmm_dbackground is a pointer to a
    109         # ghmm_dbackground structure, which holds (essentially) an
    110         # array of background distributions (which are just vectors of floating
    111         # point numbers like state.b).
    112         # For each state the array background_id indicates which of the background
    113         # distributions to use in parameter estimation. A value of kNoBackgroundDistribution
    114         # indicates that none should be used.
    115         # Note: background_id != NULL iff (model_type & kHasBackgroundDistributions) != 0
    116         int *background_id
    117         ghmm_dbackground *bp
    118 
    119         # Topological ordering of silent states
    120         #  Condition: topo_order != NULL iff (model_type & kSilentStates) != 0
    121         int *topo_order
    122         int topo_order_length
    123 
    124         # pow_lookup is a array of precomputed powers
    125         # It contains in the i-th entry M (alphabet size) to the power of i
    126         # The last entry is maxorder+1.
    127         int *pow_lookup
    128 
    129         # Store for each state a class label. Limits the possibly state sequence
    130         # Note: label != NULL iff (model_type & kLabeledStates) != 0
    131         int* label
    132         ghmm_alphabet* label_alphabet
    133         ghmm_alphabet* alphabet
    134 
    135 
    136     # Discrete sequences
    137     ctypedef struct ghmm_dseq:
    138         # sequence array. sequence[i] [j] = j-th symbol of i-th seq
    139         int **seq
    140         # matrix of state ids, can be used to save the viterbi path during sequence generation.
    141         # ATTENTION: is NOT allocated by ghmm_dseq_calloc
    142         int **states
    143         # array of sequence length
    144         int *seq_len
    145         # array of state path lengths
    146         int *states_len
    147         # array of sequence IDs
    148         double *seq_id
    149         # positive sequence weights.  default is 1 = no weight
    150         double *seq_w
    151         # total number of sequences
    152         long seq_number
    153         # reserved space for sequences is always >= seq_number
    154         long capacity
    155         # sum of sequence weights
    156         double total_w
    157         # matrix of state labels corresponding to seq
    158         int **state_labels
    159         # number of labels for each sequence
    160         int *state_labels_len
    161         # internal flags
    162         unsigned int flags
    163 
    164     ghmm_dseq *ghmm_dmodel_label_generate_sequences(
    165         ghmm_dmodel * mo, int seed, int global_len, long seq_number, int Tmax)
    166     ghmm_dseq *ghmm_dmodel_generate_sequences(ghmm_dmodel* mo, int seed, int global_len,
    167                                           long seq_number, int Tmax)
    168 
    169 
    170     int ghmm_dmodel_normalize(ghmm_dmodel *m)
    171     int ghmm_dmodel_free(ghmm_dmodel **m)
    172     int ghmm_dmodel_logp(ghmm_dmodel *m, int *O, int len, double *log_p)
    173     int *ghmm_dmodel_viterbi (ghmm_dmodel *m, int *O, int len, int *pathlen, double *log_p)
    174     int ghmm_dmodel_baum_welch (ghmm_dmodel *m, ghmm_dseq *sq)
    175     int ghmm_dmodel_baum_welch_nstep (ghmm_dmodel * m, ghmm_dseq *sq, int max_step,
    176                                       double likelihood_delta)
    177    
     22include "misc.pxi"
    17823
    17924cdef class HiddenMarkovModel:
    180     pass
    181 
    182 
    183 cdef class DiscreteHiddenMarkovModel:
    184     cdef ghmm_dmodel* m
    185     cdef bint initialized
    186     cdef object _emission_symbols, _emission_symbols_dict
    187     cdef Matrix_real_double_dense A, B
    188    
    189     def __init__(self, A, B, pi, emission_symbols=None, name=None):
    190         """
    191         INPUTS:
    192             A  -- square matrix of doubles; the state change probabilities
    193             B  -- matrix of doubles; emission probabilities
    194             pi -- list of doubles; probabilities for each initial state
    195             emission_state -- list of B.ncols() symbols (just used for printing)
    196             name -- (optional) name of the model
    197 
    198         EXAMPLES:
    199         We create a discrete HMM with 2 internal states on an alphabet of
    200         size 2.
    201        
    202             sage: a = hmm.DiscreteHiddenMarkovModel([[0.2,0.8],[0.5,0.5]], [[1,0],[0,1]], [0,1])
    203            
    204         """
     25    def __init__(self, A, B, pi):
    20526        if not is_Matrix(A):
    20627            A = matrix(RDF, len(A), len(A[0]), A)
    20728        if not is_Matrix(B):
    cdef class DiscreteHiddenMarkovModel: 
    21435            A = A.change_ring(RDF)
    21536        if B.base_ring() != RDF:
    21637            B = B.change_ring(RDF)
    217         if len(pi) != A.nrows():
     38
     39        self.pi = [float(x) for x in pi]
     40        if len(self.pi) != A.nrows():
    21841            raise ValueError, "length of pi must equal number of rows of A"
    219 
    220         cdef Py_ssize_t i, j, k
    22142
    22243        self.A = A
    22344        self.B = B
     45
     46
     47cdef class DiscreteHiddenMarkovModel(HiddenMarkovModel):
     48   
     49    def __init__(self, A, B, pi, emission_symbols=None, name=None):
     50        """
     51        INPUTS:
     52            A  -- square matrix of doubles; the state change probabilities
     53            B  -- matrix of doubles; emission probabilities
     54            pi -- list of floats; probabilities for each initial state
     55            emission_state -- list of B.ncols() symbols (just used for printing)
     56            name -- (optional) name of the model
     57
     58        EXAMPLES:
     59        We create a discrete HMM with 2 internal states on an alphabet of
     60        size 2.
     61       
     62            sage: a = hmm.DiscreteHiddenMarkovModel([[0.2,0.8],[0.5,0.5]], [[1,0],[0,1]], [0,1])
     63           
     64        """
     65        self.initialized = False
     66        HiddenMarkovModel.__init__(self, A, B, pi)
     67
     68        cdef Py_ssize_t i, j, k
    22469        self.set_emission_symbols(emission_symbols)
    22570
    22671        self.m = <ghmm_dmodel*> safe_malloc(sizeof(ghmm_dmodel))
    cdef class DiscreteHiddenMarkovModel: 
    23479        self.m.label_alphabet = NULL; self.m.alphabet = NULL
    23580
    23681        # Set number of states and number of outputs
    237         self.m.N = A.nrows()
     82        self.m.N = self.A.nrows()
    23883        self.m.M = len(self._emission_symbols)
    23984        # Set the model type to discrete
    24085        self.m.model_type = GHMM_kDiscreteHMM
    24186
    24287        # Set that no a prior model probabilities are set.
    24388        self.m.prior = -1
     89
    24490        # Assign model identifier if specified
    24591        if name is not None:
    24692            name = str(name)
    cdef class DiscreteHiddenMarkovModel: 
    256102        tmp_order     = []
    257103       
    258104        for i in range(self.m.N):
    259             v = B[i]
     105            v = self.B[i]
    260106
    261107            # Get a reference to the i-th state for convenience of the notation below.
    262108            state = &(states[i])
    cdef class DiscreteHiddenMarkovModel: 
    274120                tmp_order.append(order)
    275121            else:
    276122                raise ValueError, "number of columns (= %s) of B must be a power of the number of emission symbols (= %s)"%(
    277                     B.ncols(), len(emission_symbols))
     123                    self.B.ncols(), len(emission_symbols))
    278124
    279125            state.b = to_double_array(v)
    280             state.pi = pi[i]
     126            state.pi = self.pi[i]
    281127
    282128            silent_states.append( 1 if sum(v) == 0 else 0)
    283129
    284130            # Set out probabilities, i.e., the probabilities that each
    285131            # symbol will be emitted from this state.
    286             v = A[i]
     132            v = self.A[i]
    287133            nz = v.nonzero_positions()
    288134            k = len(nz)
    289135            state.out_states = k
    cdef class DiscreteHiddenMarkovModel: 
    294140                state.out_a[j]  = v[nz[j]]
    295141
    296142            # Set "in" probabilities
    297             v = A.column(i)
     143            v = self.A.column(i)
    298144            nz = v.nonzero_positions()
    299145            k = len(nz)
    300146            state.in_states = k
    cdef class DiscreteHiddenMarkovModel: 
    307153            state.fix = 0
    308154               
    309155        self.m.s = states
    310         self.initialized = True; return
     156        self.initialized = True
    311157
    312         if sum(silent_states) > 0:
    313             self.m.model_type |= GHMM_kSilentStates
    314             self.m.silent = to_int_array(silent_states)
    315 
    316         self.m.maxorder = max(tmp_order)
    317         if self.m.maxorder > 0:
    318             self.m.model_type |= GHMM_kHigherOrderEmissions
    319             self.m.order = to_int_array(tmp_order)
    320 
    321         # Initialize lookup table for powers of the alphabet size,
    322         # which speeds up models with higher order states.
    323         powLookUp = [1] * (self.m.maxorder+2)
    324         for i in range(1,len(powLookUp)):
    325             powLookUp[i] = powLookUp[i-1] * self.m.M
    326         self.m.pow_lookup = to_int_array(powLookUp)
    327 
    328         self.initialized = True
     158##         if sum(silent_states) > 0:
     159##             self.m.model_type |= GHMM_kSilentStates
     160##             self.m.silent = to_int_array(silent_states)
     161##         self.m.maxorder = max(tmp_order)
     162##         if self.m.maxorder > 0:
     163##             self.m.model_type |= GHMM_kHigherOrderEmissions
     164##             self.m.order = to_int_array(tmp_order)
     165##         # Initialize lookup table for powers of the alphabet size,
     166##         # which speeds up models with higher order states.
     167##         powLookUp = [1] * (self.m.maxorder+2)
     168##         for i in range(1,len(powLookUp)):
     169##             powLookUp[i] = powLookUp[i-1] * self.m.M
     170##         self.m.pow_lookup = to_int_array(powLookUp)
     171##         self.initialized = True
    329172       
    330173    def __dealloc__(self):
    331174        if self.initialized:
    cdef class DiscreteHiddenMarkovModel: 
    341184        EXAMPLES:
    342185            sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[0.9,0.1],[0.1,0.9]], [0.5,0.5], [3/4, 'abc'])
    343186            sage: a.__repr__()
    344             "Discrete Hidden Markov Model (2 states, 2 outputs)\n\nInitial probabilities: [0.5, 0.5]\nTransition matrix:\n[0.1 0.9]\n[0.1 0.9]\nEmission matrix:\n[0.9 0.1]\n[0.1 0.9]\nEmission symbols: [3/4, 'abc']"
     187            "Discrete Hidden Markov Model (2 states, 2 outputs)\nInitial probabilities: [0.5, 0.5]\nTransition matrix:\n[0.1 0.9]\n[0.1 0.9]\nEmission matrix:\n[0.9 0.1]\n[0.1 0.9]\nEmission symbols: [3/4, 'abc']"
    345188        """
    346         s = "Discrete Hidden Markov Model%s (%s states, %s outputs)\n"%(
     189        s = "Discrete Hidden Markov Model%s (%s states, %s outputs)"%(
    347190            ' ' + self.m.name if self.m.name else '',
    348191            self.m.N, self.m.M)
    349192        s += '\nInitial probabilities: %s'%self.initial_probabilities()
    cdef class DiscreteHiddenMarkovModel: 
    439282        cdef ghmm_dseq *d = ghmm_dmodel_generate_sequences(self.m, seed, length, 1, length)
    440283        cdef Py_ssize_t i
    441284        v = [d.seq[0][i] for i in range(length)]
     285        ghmm_dseq_free(&d)
    442286        if self._emission_symbols_dict:
    443287            w = self._emission_symbols
    444288            return [w[i] for i in v]
    cdef class DiscreteHiddenMarkovModel: 
    510354    ####################################################################
    511355    def log_likelihood(self, seq):
    512356        r"""
    513         HMM Problem 1: Return $\log ( P[seq | model] )$, the log of
    514         the probability of seeing the given sequence given this model,
    515         using the forward algorithm and assuming independance of the
    516         sequence seq.
     357        HMM Problem 1: Likelihood. Return $\log ( P[seq | model] )$,
     358        the log of the probability of seeing the given sequence given
     359        this model, using the forward algorithm and assuming
     360        independance of the sequence seq.
    517361
    518362        INPUT:
    519363            seq -- a list; sequence of observed emissions of the HMM
    cdef class DiscreteHiddenMarkovModel: 
    566410    ####################################################################   
    567411    def viterbi(self, seq):
    568412        """
    569         HMM Problem 2: Determine a hidden sequence of states that is
    570         most likely to produce the given sequence seq of obserations.
     413        HMM Problem 2: Decoding.  Determine a hidden sequence of
     414        states that is most likely to produce the given sequence seq
     415        of obserations.
    571416
    572417        INPUT:
    573418            seq -- sequence of emitted symbols
    cdef class DiscreteHiddenMarkovModel: 
    612457    # probability of observing O.
    613458    ####################################################################
    614459    def baum_welch(self, training_seqs, nsteps=None, log_likelihood_cutoff=None):
    615         pass
    616 ## /** Baum-Welch-Algorithm for parameter reestimation (training) in
    617 ##     a discrete (discrete output functions) HMM. Scaled version
    618 ##     for multiple sequences, alpha and beta matrices are allocated with
    619 ##      ighmm_cmatrix_stat_alloc
    620 ##     New parameters set directly in hmm (no storage of previous values!).
    621 ##     For reference see: 
    622 ##     Rabiner, L.R.: "`A Tutorial on Hidden {Markov} Models and Selected
    623 ##                 Applications in Speech Recognition"', Proceedings of the IEEE,
    624 ##      77, no 2, 1989, pp 257--285   
    625 ##   @return            0/-1 success/error
    626 ##   @param mo          initial model
    627 ##   @param sq          training sequences
    628 ##   */
    629 ## /** Just like reestimate_baum_welch, but you can limit
    630 ##     the maximum number of steps
    631 ##   @return            0/-1 success/error
    632 ##   @param mo          initial model
    633 ##   @param sq          training sequences
    634 ##   @param max_step    maximal number of Baum-Welch steps
    635 ##   @param likelihood_delta minimal improvement in likelihood required for carrying on. Relative value
    636 ##   to log likelihood
    637 ##   */
     460        """
     461        HMM Problem 3: Learning.  Given an observation sequence O and
     462        the set of states in the HMM, improve the HMM using the
     463        Baum-Welch algorithm to increase the probability of observing O.
    638464
     465        INPUT:
     466            training_seqs -- a list of lists of emission symbols
     467            nsteps -- integer or None (default: None) maximum number
     468                      of Baum-Welch steps to take
     469            log_likehood_cutoff -- positive float or None (default:
     470                      None); the minimal improvement in likelihood
     471                      with respect to the last iteration required to
     472                      continue. Relative value to log likelihood
     473
     474        OUTPUT:
     475            changes the model in places, or raises a RuntimError
     476            exception on error
     477
     478        EXAMPLES:
     479        We make a model that has two states and is equally likely to output
     480        0 or 1 in either state and transitions back and forth with equal
     481        probability.
     482            sage: a = hmm.DiscreteHiddenMarkovModel([[0.5,0.5],[0.5,0.5]], [[0.5,0.5],[0.5,0.5]], [0.5,0.5])
     483
     484        We give the model some training data this much more likely to
     485        be 1 than 0.
     486            sage: a.baum_welch([[1,1,1,1,0,1], [1,0,1,1,1,1]])
     487
     488        Now the model's emission matrix changes since it is much
     489        more likely to emit 1 than 0. 
     490            sage: a
     491            Discrete Hidden Markov Model (2 states, 2 outputs)
     492            Initial probabilities: [0.5, 0.5]
     493            Transition matrix:
     494            [0.5 0.5]
     495            [0.5 0.5]
     496            Emission matrix:
     497            [0.166666666667 0.833333333333]
     498            [0.166666666667 0.833333333333]
     499
     500        Note that 1/6 = 1.666...:
     501            sage: 1.0/6
     502            0.166666666666667
     503
     504        TESTS:
     505        We test training with non-default string symbols:
     506            sage: a = hmm.DiscreteHiddenMarkovModel([[0.5,0.5],[0.5,0.5]], [[0.5,0.5],[0.5,0.5]], [0.5,0.5], ['up','down'])
     507            sage: a.baum_welch([['up','up'], ['down','up']])
     508            sage: a
     509            Discrete Hidden Markov Model (2 states, 2 outputs)
     510            Initial probabilities: [0.5, 0.5]
     511            Transition matrix:
     512            [0.5 0.5]
     513            [0.5 0.5]
     514            Emission matrix:
     515            [0.75 0.25]
     516            [0.75 0.25]
     517            Emission symbols: ['up', 'down']       
     518
     519        NOTE: Training for models including silent states is not yet supported.
     520
     521        REFERENCES:
     522            Rabiner, L.R.: "`A Tutorial on Hidden Markov Models and Selected
     523            Applications in Speech Recognition"', Proceedings of the IEEE,
     524            77, no 2, 1989, pp 257--285.
     525        """
     526        if self._emission_symbols_dict:
     527            seqs = [[self._emission_symbols_dict[z] for z in x] for x in training_seqs]
     528        else:
     529            seqs = training_seqs
     530           
     531        cdef ghmm_dseq* d = malloc_ghmm_dseq(seqs)
    639532       
     533        if ghmm_dmodel_baum_welch(self.m, d):
     534            raise RuntimeError, "error running Baum-Welch algorithm"
    640535       
     536        ghmm_dseq_free(&d)
    641537
    642    
    643    
    644 
     538           
    645539##################################################################################
    646540# Helper Functions
    647541##################################################################################
    648542
    649 cdef double* to_double_array(v) except NULL:
    650     cdef double x
    651     cdef double* w = <double*> safe_malloc(sizeof(double)*len(v))
    652     cdef Py_ssize_t i = 0
    653     for x in v:
    654         w[i] = x
    655         i += 1
    656     return w
    657 
    658 cdef int* to_int_array(v) except NULL:
    659     cdef int x
    660     cdef int* w = <int*> safe_malloc(sizeof(int)*len(v))
    661     cdef Py_ssize_t i = 0
    662     for x in v:
    663         w[i] = x
    664         i += 1
    665     return w
    666 
    667 cdef void* safe_malloc(int bytes) except NULL:
    668     """
    669     malloc the given bytes of memory and check that the malloc
    670     succeeds -- if not raise a MemoryError.
    671     """
    672     cdef void* t = sage_malloc(bytes)
    673     if not t:
    674         raise MemoryError, "error allocating memory for Hidden Markov Model"
    675     return t
    676        
     543cdef ghmm_dseq* malloc_ghmm_dseq(seqs) except NULL:
     544    cdef ghmm_dseq* d = ghmm_dseq_calloc(len(seqs))
     545    if d == NULL:
     546        raise MemoryError
     547    cdef int i, j, m, n
     548    m = len(seqs)
     549    d.seq_number = m
     550    d.capacity = m
     551    d.total_w = m
     552    for i from 0 <= i < m:
     553        v = seqs[i]
     554        n = len(v)
     555        d.seq[i] = <int*> safe_malloc(sizeof(int) * n)
     556        for j from 0 <= j < n:
     557            d.seq[i][j] = v[j]
     558        d.seq_len[i] = n
     559        d.seq_id[i] = i
     560        d.seq_w[i] = 1
     561    d.flags = 0
     562    return d
  • new file sage/stats/hmm/misc.pxi

    diff -r 93c0bb9f75e8 -r e36529a649ee sage/stats/hmm/misc.pxi
    - +  
     1
     2cdef double* to_double_array(v) except NULL:
     3    cdef double x
     4    cdef double* w = <double*> safe_malloc(sizeof(double)*len(v))
     5    cdef Py_ssize_t i = 0
     6    for x in v:
     7        w[i] = x
     8        i += 1
     9    return w
     10
     11cdef int* to_int_array(v) except NULL:
     12    cdef int x
     13    cdef int* w = <int*> safe_malloc(sizeof(int)*len(v))
     14    cdef Py_ssize_t i = 0
     15    for x in v:
     16        w[i] = x
     17        i += 1
     18    return w
     19
     20cdef void* safe_malloc(int bytes) except NULL:
     21    """
     22    malloc the given bytes of memory and check that the malloc
     23    succeeds -- if not raise a MemoryError.
     24    """
     25    cdef void* t = sage_malloc(bytes)
     26    if not t:
     27        raise MemoryError, "error allocating memory for Hidden Markov Model"
     28    return t
     29       
  • setup.py

    diff -r 93c0bb9f75e8 -r e36529a649ee setup.py
    a b hmm = Extension('sage.stats.hmm.hmm', [' 
    484484hmm = Extension('sage.stats.hmm.hmm', ['sage/stats/hmm/hmm.pyx'],
    485485                libraries = ['ghmm'])
    486486
     487chmm = Extension('sage.stats.hmm.chmm', ['sage/stats/hmm/chmm.pyx'],
     488                libraries = ['ghmm'])
     489
    487490#####################################################
    488491
    489492ext_modules = [ \
    ext_modules = [ \ 
    601604    markov_multifractal,
    602605    finance_fractal,
    603606
    604     hmm,
     607    hmm, chmm,
    605608
    606609    Extension('sage.media.channels',
    607610              sources = ['sage/media/channels.pyx']), \