Ticket #3726: sage-3726-part1.patch

File sage-3726-part1.patch, 15.7 KB (added by was, 11 years ago)
  • sage/matrix/matrix_real_double_dense.pxd

    # HG changeset patch
    # User William Stein <wstein@gmail.com>
    # Date 1216943572 -7200
    # Node ID 3c4e0a77979f7fad530001d3fd9116ec9196dfff
    # Parent  366912c69770ff1e1c218ec3428c5cf405e81909
    Cython Hidden Markov Models -- initial checkin
    
    diff -r 366912c69770 -r 3c4e0a77979f sage/matrix/matrix_real_double_dense.pxd
    a b cdef class Matrix_real_double_dense(matr 
    1414    cdef int _signum
    1515    cdef int _LU_valid
    1616    cdef _c_compute_LU(self)
    17 #    cdef ModuleElement _add_c_impl(self, ModuleElement right )
     17    cdef set_unsafe_double(self, Py_ssize_t i, Py_ssize_t j, double value)
  • sage/matrix/matrix_real_double_dense.pyx

    diff -r 366912c69770 -r 3c4e0a77979f sage/matrix/matrix_real_double_dense.pyx
    a b cdef class Matrix_real_double_dense(matr 
    224224
    225225    cdef get_unsafe(self, Py_ssize_t i, Py_ssize_t j):
    226226        return RealDoubleElement(gsl_matrix_get(self._matrix,i,j))
     227
     228    cdef set_unsafe_double(self, Py_ssize_t i, Py_ssize_t j, double value):
     229        gsl_matrix_set(self._matrix,i,j,value)
    227230   
    228231
    229232    ########################################################################
  • sage/stats/all.py

    diff -r 366912c69770 -r 3c4e0a77979f sage/stats/all.py
    a b from r import (ttest) 
    11from r import (ttest)
     2
     3import hmm.all as hmm
  • new file sage/stats/hmm/all.py

    diff -r 366912c69770 -r 3c4e0a77979f sage/stats/hmm/all.py
    - +  
     1from hmm import DiscreteHiddenMarkovModel
  • new file sage/stats/hmm/hmm.pyx

    diff -r 366912c69770 -r 3c4e0a77979f sage/stats/hmm/hmm.pyx
    - +  
     1r"""
     2Hidden Markov Models
     3
     4AUTHOR: William Stein
     5
     6EXAMPLES:
     7
     8"""
     9
     10import math
     11
     12from sage.matrix.all import is_Matrix, matrix
     13from sage.rings.all  import RDF
     14
     15from sage.matrix.matrix_real_double_dense cimport Matrix_real_double_dense
     16
     17include "../../ext/stdsage.pxi"
     18
     19cdef extern from "ghmm/ghmm.h":
     20    cdef int GHMM_kSilentStates
     21    cdef int GHMM_kHigherOrderEmissions
     22    cdef int GHMM_kDiscreteHMM
     23
     24cdef extern from "ghmm/model.h":
     25    ctypedef struct ghmm_dstate:
     26        # Initial probability
     27        double pi
     28        # Output probability
     29        double *b
     30
     31        # ID's of the following states
     32        int *out_id
     33        # ID's of the previous states
     34        int *in_id
     35
     36        # Transition probabilities to successor states.
     37        double *out_a
     38        # Transition probabilities from predecessor states.
     39        double *in_a
     40
     41        # Number of successor states
     42        int out_states
     43        # Number of precursor states
     44        int in_states
     45        # if fix == 1 then output probabilities b stay fixed during the training
     46        int fix
     47        # Contains a description of the state (null terminated utf-8)
     48        char * desc
     49        # x coordinate position for graph representation plotting
     50        int xPosition
     51        # y coordinate position for graph representation plotting
     52        int yPosition
     53
     54    ctypedef struct ghmm_dbackground:
     55        pass
     56
     57    ctypedef struct ghmm_alphabet:
     58        pass
     59   
     60    ctypedef struct ghmm_dmodel:
     61        # Number of states
     62        int N   
     63        # Number of outputs
     64        int M   
     65        # Vector of states
     66        ghmm_dstate *s 
     67        # Contains bit flags for varios model extensions such as
     68        # kSilentStates, kTiedEmissions (see ghmm.h for a complete list)
     69        int model_type
     70        # The a priori probability for the model.
     71        # A value of -1 indicates that no prior is defined.
     72        # Note: this is not to be confused with priors on emission distributions.
     73        double prior
     74        # An arbitrary name for the model (null terminated utf-8)
     75        char * name
     76
     77        # Flag variables for each state indicating whether it is emitting or not.
     78        # Note: silent != NULL iff (model_type & kSilentStates) == 1
     79        int *silent
     80       
     81        # Int variable for the maximum level of higher order emissions.
     82        int maxorder
     83       
     84        # saves the history of emissions as int,
     85        # the nth-last emission is (emission_history * |alphabet|^n+1) % |alphabet|
     86        int emission_history
     87
     88        # Flag variables for each state indicating whether the states emissions
     89        # are tied to another state. Groups of tied states are represented
     90        # by their tie group leader (the lowest numbered member of the group).
     91        # tied_to[s] == kUntied  : s is not a tied state
     92        # tied_to[s] == s        : s is a tie group leader
     93        # tied_to[t] == s        : t is tied to state s (t>s)
     94        # Note: tied_to != NULL iff (model_type & kTiedEmissions) != 0  */
     95        int *tied_to
     96
     97        # Note: State store order information of the emissions.
     98        # Classical HMMS have emission order 0; that is, the emission probability
     99        # is conditioned only on the state emitting the symbol.
     100        # For higher order emissions, the emission are conditioned on the state s
     101        # as well as the previous emission_order observed symbols.
     102        # The emissions are stored in the state's usual double* b.
     103        # Note: order != NULL iff (model_type & kHigherOrderEmissions) != 0  */
     104        int * order
     105       
     106        # ghmm_dbackground is a pointer to a
     107        # ghmm_dbackground structure, which holds (essentially) an
     108        # array of background distributions (which are just vectors of floating
     109        # point numbers like state.b).
     110        # For each state the array background_id indicates which of the background
     111        # distributions to use in parameter estimation. A value of kNoBackgroundDistribution
     112        # indicates that none should be used.
     113        # Note: background_id != NULL iff (model_type & kHasBackgroundDistributions) != 0
     114        int *background_id
     115        ghmm_dbackground *bp
     116
     117        # Topological ordering of silent states
     118        #  Condition: topo_order != NULL iff (model_type & kSilentStates) != 0
     119        int *topo_order
     120        int topo_order_length
     121
     122        # pow_lookup is a array of precomputed powers
     123        # It contains in the i-th entry M (alphabet size) to the power of i
     124        # The last entry is maxorder+1.
     125        int *pow_lookup
     126
     127        # Store for each state a class label. Limits the possibly state sequence
     128        # Note: label != NULL iff (model_type & kLabeledStates) != 0
     129        int* label
     130        ghmm_alphabet* label_alphabet
     131        ghmm_alphabet* alphabet
     132
     133
     134    int ghmm_dmodel_normalize(ghmm_dmodel *m)
     135    int ghmm_dmodel_free(ghmm_dmodel **m)
     136
     137
     138cdef class HiddenMarkovModel:
     139    pass
     140
     141
     142cdef class DiscreteHiddenMarkovModel:
     143    cdef ghmm_dmodel* m
     144    cdef bint initialized
     145    cdef object emission_symbols
     146    cdef Matrix_real_double_dense A, B
     147   
     148    def __init__(self, A, B, pi, emission_symbols=None, name=None):
     149        """
     150        INPUTS:
     151            A  -- square matrix of doubles; the state change probabilities
     152            B  -- matrix of doubles; emission probabilities
     153            pi -- list of doubles; probabilities for each initial state
     154            emission_state -- list of B.ncols() symbols (just used for printing)
     155            name -- (optional) name of the model
     156
     157        EXAMPLES:
     158        We create a discrete HMM with 2 internal states on an alphabet of
     159        size 2.
     160       
     161            sage: a = hmm.DiscreteHiddenMarkovModel([[0.2,0.8],[0.5,0.5]], [[1,0],[0,1]], [0,1])
     162           
     163        """
     164        if not is_Matrix(A):
     165            A = matrix(RDF, len(A), len(A[0]), A)
     166        if not is_Matrix(B):
     167            B = matrix(RDF, len(B), len(B[0]), B)
     168        if not A.is_square():
     169            raise ValueError, "A must be square"
     170        if A.nrows() != B.nrows():
     171            raise ValueError, "number of rows of A and B must be the same"
     172        if A.base_ring() != RDF:
     173            A = A.change_ring(RDF)
     174        if B.base_ring() != RDF:
     175            B = B.change_ring(RDF)
     176        if len(pi) != A.nrows():
     177            raise ValueError, "length of pi must equal number of rows of A"
     178
     179        cdef Py_ssize_t i, j, k
     180
     181        if emission_symbols is None:
     182            self.emission_symbols = range(B.ncols())
     183        else:
     184            self.emission_symbols = list(emission_symbols)
     185
     186        self.m = <ghmm_dmodel*> sage_malloc(sizeof(ghmm_dmodel))
     187        if not self.m: raise MemoryError
     188       
     189        # Set all pointers to NULL
     190        self.m.s = NULL; self.m.name = NULL; self.m.silent = NULL
     191        self.m.tied_to = NULL; self.m.order = NULL; self.m.background_id = NULL
     192        self.m.bp = NULL; self.m.topo_order = NULL; self.m.pow_lookup = NULL;
     193        self.m.label = NULL; self.m.label_alphabet = NULL; self.m.alphabet = NULL
     194
     195        # Set number of states and number of outputs
     196        self.m.N = A.nrows()
     197        self.m.M = len(self.emission_symbols)
     198        # Set the model type to discrete
     199        self.m.model_type = GHMM_kDiscreteHMM
     200
     201        self.A = A
     202        self.B = B
     203
     204        # Set that no a prior model probabilities are set.
     205        self.m.prior = -1
     206        # Assign model identifier if specified
     207        if name is not None:
     208            name = str(name)
     209            self.m.name = name
     210        else:
     211            self.m.name = NULL
     212
     213        # Allocate and initialize states
     214        cdef ghmm_dstate* states = <ghmm_dstate*> safe_malloc(sizeof(ghmm_dstate) * self.m.N)
     215        cdef ghmm_dstate* state
     216
     217        silent_states = []
     218        tmp_order     = []
     219       
     220        for i in range(self.m.N):
     221            v = B[i]
     222
     223            # Get a reference to the i-th state for convenience of the notation below.
     224            state = &(states[i])
     225            state.desc = NULL
     226
     227            # Compute state order
     228            if self.m.M > 1:
     229                order = math.log( len(v), self.m.M ) - 1
     230            else:
     231                order = len(v) - 1
     232
     233            # Check for valid number of emission parameters
     234            order = int(order)
     235            if self.m.M**(order+1) == len(v):
     236                tmp_order.append(order)
     237            else:
     238                raise ValueError, "number of columns (= %s) of B must be a power of the number of emission symbols (= %s)"%(
     239                    B.ncols(), len(emission_symbols))
     240
     241            state.b = to_double_array(v)
     242            state.pi = pi[i]
     243
     244            silent_states.append( 1 if sum(v) == 0 else 0)
     245
     246            # Set out probabilities, i.e., the probabilities that each
     247            # symbol will be emitted from this state.
     248            v = A[i]
     249            nz = v.nonzero_positions()
     250            k = len(nz)
     251            state.out_states = k
     252            state.out_id = <int*> safe_malloc(sizeof(int)*k)
     253            state.out_a  = <double*> safe_malloc(sizeof(double)*k)
     254            for j in range(k):
     255                state.out_id[j] = nz[j]
     256                state.out_a[j]  = v[nz[j]]
     257
     258            # Set "in" probabilities
     259            v = A.column(i)
     260            nz = v.nonzero_positions()
     261            k = len(nz)
     262            state.in_states = k
     263            state.in_id = <int*> safe_malloc(sizeof(int)*k)
     264            state.in_a  = <double*> safe_malloc(sizeof(double)*k)
     265            for j in range(k):
     266                state.in_id[j] = nz[j]
     267                state.in_a[j]  = v[nz[j]]
     268               
     269            state.fix = 0
     270               
     271        self.m.s = states
     272        self.initialized = True; return
     273
     274        if sum(silent_states) > 0:
     275            self.m.model_type |= GHMM_kSilentStates
     276            self.m.silent = to_int_array(silent_states)
     277
     278        self.m.maxorder = max(tmp_order)
     279        if self.m.maxorder > 0:
     280            self.m.model_type |= GHMM_kHigherOrderEmissions
     281            self.m.order = to_int_array(tmp_order)
     282
     283        # Initialize lookup table for powers of the alphabet size,
     284        # which speeds up models with higher order states.
     285        powLookUp = [1] * (self.m.maxorder+2)
     286        for i in range(1,len(powLookUp)):
     287            powLookUp[i] = powLookUp[i-1] * self.m.M
     288        self.m.pow_lookup = to_int_array(powLookUp)
     289
     290        self.initialized = True
     291       
     292
     293    def __repr__(self):
     294        s = "Discrete Hidden Markov Model%s (%s states, %s outputs)\n"%(
     295            ' ' + self.m.name if self.m.name else '',
     296            self.m.N, self.m.M)
     297        s += 'Initial probabilities: %s\n'%self.initial_probabilities()
     298        s += 'Transition matrix:\n%s\n'%self.transition_matrix()
     299        s += 'Emission matrix:\n%s\n'%self.emission_matrix()
     300        return s
     301
     302    def initial_probabilities(self):
     303        cdef Py_ssize_t i
     304        return [self.m.s[i].pi for i in range(self.m.N)]
     305
     306    def transition_matrix(self, list_only=True):
     307        cdef Py_ssize_t i, j
     308        for i from 0 <= i < self.m.N:
     309            for j from 0 <= j < self.m.s[i].out_states:
     310                self.A.set_unsafe_double(i,j,self.m.s[i].out_a[j])
     311        return self.A
     312
     313    def emission_matrix(self, list_only=True):
     314        cdef Py_ssize_t i, j
     315        for i from 0 <= i < self.m.N:
     316            for j from 0 <= j < self.B._ncols:
     317                self.B.set_unsafe_double(i,j,self.m.s[i].b[j])
     318        return self.B
     319
     320    def normalize(self):
     321        """
     322        Normalize the transition and emission probabilities, if applicable.
     323        """
     324        ghmm_dmodel_normalize(self.m)
     325
     326    def __dealloc__(self):
     327        if self.initialized:
     328            ghmm_dmodel_free(&self.m)
     329   
     330
     331
     332
     333##################################################################################
     334# Helper Functions
     335##################################################################################
     336
     337cdef double* to_double_array(v) except NULL:
     338    cdef double x
     339    cdef double* w = <double*> safe_malloc(sizeof(double)*len(v))
     340    cdef Py_ssize_t i = 0
     341    for x in v:
     342        w[i] = x
     343        i += 1
     344    return w
     345
     346cdef int* to_int_array(v) except NULL:
     347    cdef int x
     348    cdef int* w = <int*> safe_malloc(sizeof(int)*len(v))
     349    cdef Py_ssize_t i = 0
     350    for x in v:
     351        w[i] = x
     352        i += 1
     353    return w
     354
     355cdef void* safe_malloc(int bytes) except NULL:
     356    """
     357    malloc the given bytes of memory and check that the malloc
     358    succeeds -- if not raise a MemoryError.
     359    """
     360    cdef void* t = sage_malloc(bytes)
     361    if not t:
     362        raise MemoryError, "error allocating memory for Hidden Markov Model"
     363    return t
     364       
  • setup.py

    diff -r 366912c69770 -r 3c4e0a77979f setup.py
    a b markov_multifractal = Extension('sage.fi 
    481481
    482482finance_fractal = Extension('sage.finance.fractal', ['sage/finance/fractal.pyx'])
    483483
     484hmm = Extension('sage.stats.hmm.hmm', ['sage/stats/hmm/hmm.pyx'],
     485                libraries = ['ghmm'])
     486
    484487#####################################################
    485488
    486489ext_modules = [ \
    ext_modules = [ \ 
    597600   
    598601    markov_multifractal,
    599602    finance_fractal,
     603
     604    hmm,
    600605
    601606    Extension('sage.media.channels',
    602607              sources = ['sage/media/channels.pyx']), \
    code = setup(name = 'sage', 
    13551360
    13561361                     'sage.finance',
    13571362
     1363                     
    13581364                     'sage.functions',
    13591365
    13601366                     'sage.geometry',
    code = setup(name = 'sage', 
    14271433                     
    14281434                     'sage.stats',
    14291435
     1436                     'sage.stats.hmm',
     1437
    14301438                     'sage.parallel',
    14311439                     
    14321440                     'sage.schemes',