Ticket #3726: sage3726part2.patch
File sage3726part2.patch, 12.0 KB (added by , 11 years ago) 


sage/stats/hmm/hmm.pyx
# HG changeset patch # User William Stein <wstein@gmail.com> # Date 1217002823 7200 # Node ID eb9a65315a153f3b18dce3b2b0629e410be64ba4 # Parent 3c4e0a77979f7fad530001d3fd9116ec9196dfff trac #3726  Hidden Markov Model's patch. diff r 3c4e0a77979f r eb9a65315a15 sage/stats/hmm/hmm.pyx
a b import math 11 11 12 12 from sage.matrix.all import is_Matrix, matrix 13 13 from sage.rings.all import RDF 14 from sage.misc.randstate import random 14 15 15 16 from sage.matrix.matrix_real_double_dense cimport Matrix_real_double_dense 16 17 … … cdef extern from "ghmm/model.h": 57 58 ctypedef struct ghmm_alphabet: 58 59 pass 59 60 61 # Discrete HMM's. 60 62 ctypedef struct ghmm_dmodel: 61 63 # Number of states 62 64 int N … … cdef extern from "ghmm/model.h": 133 135 134 136 int ghmm_dmodel_normalize(ghmm_dmodel *m) 135 137 int ghmm_dmodel_free(ghmm_dmodel **m) 138 int ghmm_dmodel_logp(ghmm_dmodel *m, int *O, int len, double *log_p) 136 139 140 141 # Discrete sequences 142 ctypedef struct ghmm_dseq: 143 # sequence array. sequence[i] [j] = jth symbol of ith seq 144 int **seq 145 # matrix of state ids, can be used to save the viterbi path during sequence generation. 146 # ATTENTION: is NOT allocated by ghmm_dseq_calloc 147 int **states 148 # array of sequence length 149 int *seq_len 150 # array of state path lengths 151 int *states_len 152 # array of sequence IDs 153 double *seq_id 154 # positive sequence weights. default is 1 = no weight 155 double *seq_w 156 # total number of sequences 157 long seq_number 158 # reserved space for sequences is always >= seq_number 159 long capacity 160 # sum of sequence weights 161 double total_w 162 # matrix of state labels corresponding to seq 163 int **state_labels 164 # number of labels for each sequence 165 int *state_labels_len 166 # internal flags 167 unsigned int flags 168 169 ghmm_dseq *ghmm_dmodel_label_generate_sequences( 170 ghmm_dmodel * mo, int seed, int global_len, long seq_number, int Tmax) 171 ghmm_dseq *ghmm_dmodel_generate_sequences(ghmm_dmodel* mo, int seed, int global_len, 172 long seq_number, int Tmax) 137 173 138 174 cdef class HiddenMarkovModel: 139 175 pass … … cdef class DiscreteHiddenMarkovModel: 142 178 cdef class DiscreteHiddenMarkovModel: 143 179 cdef ghmm_dmodel* m 144 180 cdef bint initialized 145 cdef object emission_symbols 181 cdef object _emission_symbols 182 cdef object _emission_symbols_special 146 183 cdef Matrix_real_double_dense A, B 147 184 148 185 def __init__(self, A, B, pi, emission_symbols=None, name=None): … … cdef class DiscreteHiddenMarkovModel: 178 215 179 216 cdef Py_ssize_t i, j, k 180 217 181 if emission_symbols is None: 182 self.emission_symbols = range(B.ncols()) 183 else: 184 self.emission_symbols = list(emission_symbols) 218 self.A = A 219 self.B = B 220 self.set_emission_symbols(emission_symbols) 185 221 186 self.m = <ghmm_dmodel*> sage_malloc(sizeof(ghmm_dmodel)) 187 if not self.m: raise MemoryError 222 self.m = <ghmm_dmodel*> safe_malloc(sizeof(ghmm_dmodel)) 223 224 self.m.label = to_int_array(range(len(self._emission_symbols))) 188 225 189 226 # Set all pointers to NULL 190 227 self.m.s = NULL; self.m.name = NULL; self.m.silent = NULL 191 228 self.m.tied_to = NULL; self.m.order = NULL; self.m.background_id = NULL 192 229 self.m.bp = NULL; self.m.topo_order = NULL; self.m.pow_lookup = NULL; 193 self.m.label = NULL; self.m.label_alphabet = NULL; self.m.alphabet = NULL230 self.m.label_alphabet = NULL; self.m.alphabet = NULL 194 231 195 232 # Set number of states and number of outputs 196 233 self.m.N = A.nrows() 197 self.m.M = len(self. emission_symbols)234 self.m.M = len(self._emission_symbols) 198 235 # Set the model type to discrete 199 236 self.m.model_type = GHMM_kDiscreteHMM 200 201 self.A = A202 self.B = B203 237 204 238 # Set that no a prior model probabilities are set. 205 239 self.m.prior = 1 … … cdef class DiscreteHiddenMarkovModel: 289 323 290 324 self.initialized = True 291 325 326 def __dealloc__(self): 327 if self.initialized: 328 ghmm_dmodel_free(&self.m) 292 329 293 330 def __repr__(self): 294 331 s = "Discrete Hidden Markov Model%s (%s states, %s outputs)\n"%( … … cdef class DiscreteHiddenMarkovModel: 300 337 return s 301 338 302 339 def initial_probabilities(self): 340 """ 341 Return the list of initial state probabilities. 342 343 EXAMPLES: 344 sage: a = hmm.DiscreteHiddenMarkovModel([[0.9,0.1],[0.9,0.1]], [[0.5,0.5,0,0],[0,0,.5,.5]], [0.5,0.5], [1,1,3,3]) 345 sage: a.initial_probabilities() 346 [0.5, 0.5] 347 """ 303 348 cdef Py_ssize_t i 304 349 return [self.m.s[i].pi for i in range(self.m.N)] 305 350 306 351 def transition_matrix(self, list_only=True): 352 """ 353 Return the hidden state transition matrix. 354 355 EXAMPLES: 356 sage: a = hmm.DiscreteHiddenMarkovModel([[0.9,0.1],[0.9,0.1]], [[0.5,0.5,0,0],[0,0,.5,.5]], [0.5,0.5], [1,1,3,3]) 357 sage: a.transition_matrix() 358 [0.9 0.1] 359 [0.9 0.1] 360 """ 307 361 cdef Py_ssize_t i, j 308 362 for i from 0 <= i < self.m.N: 309 363 for j from 0 <= j < self.m.s[i].out_states: … … cdef class DiscreteHiddenMarkovModel: 311 365 return self.A 312 366 313 367 def emission_matrix(self, list_only=True): 368 """ 369 Return the emission probability matrix. 370 371 EXAMPLES: 372 sage: a = hmm.DiscreteHiddenMarkovModel([[0.9,0.1],[0.9,0.1]], [[0.5,0.5,0,0],[0,0,.5,.5]], [0.5,0.5], [1,1,3,3]) 373 sage: a.emission_matrix() 374 [0.5 0.5 0.0 0.0] 375 [0.0 0.0 0.5 0.5] 376 """ 314 377 cdef Py_ssize_t i, j 315 378 for i from 0 <= i < self.m.N: 316 379 for j from 0 <= j < self.B._ncols: … … cdef class DiscreteHiddenMarkovModel: 320 383 def normalize(self): 321 384 """ 322 385 Normalize the transition and emission probabilities, if applicable. 386 387 EXAMPLES: 388 sage: a = hmm.DiscreteHiddenMarkovModel([[0.5,1],[1.2,0.9]], [[1,0.5],[0.5,1]], [0.1,1.2]) 389 sage: a.normalize() 390 sage: a 391 Discrete Hidden Markov Model (2 states, 2 outputs) 392 Initial probabilities: [0.076923076923076927, 0.92307692307692302] 393 Transition matrix: 394 [0.333333333333 0.666666666667] 395 [0.571428571429 0.428571428571] 396 Emission matrix: 397 [0.666666666667 0.333333333333] 398 [0.333333333333 0.666666666667] 323 399 """ 324 400 ghmm_dmodel_normalize(self.m) 325 401 326 def __dealloc__(self): 327 if self.initialized: 328 ghmm_dmodel_free(&self.m) 402 def sample_single(self, long length): 403 """ 404 Return a single sample computed using this Hidden Markov Model. 405 406 EXAMPLE: 407 sage: set_random_seed(0) 408 sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[1,0],[0,1]], [0,1]) 409 sage: a.sample_single(20) 410 [1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 411 sage: a.sample_single(1000).count(0) 412 113 413 414 If the emission symbols are set 415 sage: set_random_seed(0) 416 sage: a = hmm.DiscreteHiddenMarkovModel([[0.5,0.5],[0.1,0.9]], [[1,0],[0,1]], [0,1], ['up', 'down']) 417 sage: print a.sample_single(10) 418 ['down', 'up', 'down', 'down', 'up', 'down', 'down', 'up', 'down', 'up'] 419 420 """ 421 seed = random() 422 cdef ghmm_dseq *d = ghmm_dmodel_generate_sequences(self.m, seed, length, 1, length) 423 cdef Py_ssize_t i 424 v = [d.seq[0][i] for i in range(length)] 425 if self._emission_symbols_special: 426 return v 427 else: 428 w = self._emission_symbols 429 return [w[i] for i in v] 430 431 def sample(self, long length, long number): 432 """ 433 Return number samples from this HMM of given length. 434 435 EXAMPLES: 436 sage: set_random_seed(0) 437 sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[1,0],[0,1]], [0,1]) 438 sage: print a.sample(10, 3) 439 [[1, 0, 1, 1, 0, 1, 1, 0, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]] 440 """ 441 seed = random() 442 cdef ghmm_dseq *d = ghmm_dmodel_generate_sequences(self.m, seed, length, number, length) 443 cdef Py_ssize_t i, j 444 v = [[d.seq[j][i] for i in range(length)] for j in range(number)] 445 if self._emission_symbols_special: 446 return v 447 else: 448 w = self._emission_symbols 449 return [[w[i] for i in z] for z in v] 450 451 def emission_symbols(self): 452 """ 453 Return a copy of the list of emission symbols of self. 454 455 Use set_emission_symbols to set the list of emission symbols. 456 457 EXAMPLES: 458 sage: a = hmm.DiscreteHiddenMarkovModel([[0.5,0.5],[0.1,0.9]], [[1,0],[0,1]], [0,1], ['up', 3/179]) 459 sage: a.emission_symbols() 460 ['up', 3/179] 461 """ 462 return list(self._emission_symbols) 463 464 def set_emission_symbols(self, emission_symbols): 465 """ 466 Set the list of emission symbols. 467 468 EXAMPLES: 469 sage: set_random_seed(0) 470 sage: a = hmm.DiscreteHiddenMarkovModel([[0.5,0.5],[0.1,0.9]], [[1,0],[0,1]], [0,1], ['up', 'down']) 471 sage: a.set_emission_symbols([3,5]) 472 sage: a.emission_symbols() 473 [3, 5] 474 sage: a.sample_single(10) 475 [5, 3, 5, 5, 3, 5, 5, 3, 5, 3] 476 sage: a.set_emission_symbols([pi,5/9+e]) 477 sage: a.sample_single(10) 478 [e + 5/9, e + 5/9, e + 5/9, e + 5/9, e + 5/9, e + 5/9, pi, pi, e + 5/9, pi] 479 """ 480 if emission_symbols is None: 481 self._emission_symbols = range(self.B.ncols()) 482 else: 483 self._emission_symbols = list(emission_symbols) 484 self._emission_symbols_special = (self._emission_symbols == range(self.B.ncols())) 485 486 cpdef double log_likelihood(self, seq): 487 r""" 488 Return $\log ( P[seq  model] )$, the log of the probability of seeing 489 the given sequence given this model, using the forward algorithm and 490 assuming independance of the sequence seq. 491 492 WARNING: By convention we return 1.0 for 0 probability events. 493 494 EXAMPLES: 495 sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[1,0],[0,1]], [0,1]) 496 sage: a.log_likelihood([1,1]) 497 0.10536051565782635 498 sage: a.log_likelihood([1,0]) 499 2.3025850929940459 500 501 Notice that any sequence starting with 0 can't occur, since 502 the above model always starts in a state that produces 1 with 503 probability 1. By convention log(probability 0) is 1. 504 sage: a.log_likelihood([0,0]) 505 1.0 506 507 Here's a special case where each sequence is equally probable. 508 sage: a = hmm.DiscreteHiddenMarkovModel([[0.5,0.5],[0.5,0.5]], [[1,0],[0,1]], [0.5,0.5]) 509 sage: a.log_likelihood([0,0]) 510 1.3862943611198906 511 sage: log(0.25) 512 1.38629436111989 513 sage: a.log_likelihood([0,1]) 514 1.3862943611198906 515 sage: a.log_likelihood([1,0]) 516 1.3862943611198906 517 sage: a.log_likelihood([1,1]) 518 1.3862943611198906 519 """ 520 cdef double log_p 521 cdef int* O = to_int_array(seq) 522 cdef int ret = ghmm_dmodel_logp(self.m, O, len(seq), &log_p) 523 sage_free(O) 524 if ret == 1: 525 # forward returned 1: sequence can't be built 526 return float('Inf') 527 return log_p 528 329 529 330 331 332 530 333 531 ################################################################################## 334 532 # Helper Functions