Ticket #8547: trac_8547-take2-part2.patch

File trac_8547-take2-part2.patch, 18.4 KB (added by was, 12 years ago)

part 2; apply this and the previous patch

  • sage/stats/hmm/chmm.pyx

    # HG changeset patch
    # User William Stein <wstein@gmail.com>
    # Date 1270763713 25200
    # Node ID 3b663ff7aae85cc171f09923eecdc15c5bde9c75
    # Parent  add8a97d36c81c53eaecdab7004d62c548ac22b9
    trac 8547 -- part 2 -- implement hidden Markov models in Cython from scratch (so the GHMM standard package can be removed from Sage)
    
    diff -r add8a97d36c8 -r 3b663ff7aae8 sage/stats/hmm/chmm.pyx
    a b  
    4444from sage.misc.randstate cimport current_randstate, randstate
    4545
    4646
    47 # DELETE THIS FUNCTION WHEN MOVE Gaussian stuff to distributions.pyx!!!
     47# TODO: DELETE THIS FUNCTION WHEN MOVING Gaussian stuff to distributions.pyx!!! (next version)
    4848cdef double random_normal(double mean, double std, randstate rstate):
     49    """
     50    Return a number chosen randomly with given mean and standard deviation.
     51
     52    INPUT:
     53
     54        - ``mean`` -- double
     55        - ``std`` -- double, standard deviation
     56        - ``rstate`` -- a randstate object
     57       
     58    OUTPUT:
     59
     60        - a double
     61    """
    4962    # Ported from http://users.tkk.fi/~nbeijar/soft/terrain/source_o2/boxmuller.c
    5063    # This is the Box-Muller algorithm.
    5164    # Client code can get the current random state from:
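
    The Box-Muller method mentioned in the comments above can be sketched in a few lines
    of plain Python. This is an illustrative sketch only; it uses the standard-library
    random module in place of Sage's randstate, which the patched Cython version uses.

        # Illustrative Box-Muller sketch in plain Python (not the Cython code in the patch).
        import math, random

        def box_muller(mean, std):
            u1 = 1.0 - random.random()   # uniform in (0, 1]; avoids log(0)
            u2 = random.random()
            # Transform two uniforms into one standard normal deviate.
            z = math.sqrt(-2.0 * math.log(u1)) * math.cos(2.0 * math.pi * u2)
            return mean + std * z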
     
    260273        if i < 0 or i >= self.N:
    261274            raise IndexError, 'index out of range'
    262275
    263         # TODO: change to be a normal distribution class
     276        # TODO: change to be a normal distribution class (next version)
    264277        return self.B[2*i], self.B[2*i+1]
    265278
    266279    def __reduce__(self):
     
    388401    cdef probability_init(self):
    389402        """
    390403        Used internally to compute caching information that makes
    391         certain computations in the Baum-Welch algorithm faster.
     404        certain computations in the Baum-Welch algorithm faster.  This
     405        function has no input or output.
    392406        """
    393407        self.prob = TimeSeries(2*self.N)
    394408        cdef int i
     
    732746        # Termination
    733747        return alpha, scale, log_probability
    734748
    735     cdef TimeSeries _baum_welch_gamma(self, TimeSeries alpha, TimeSeries beta):
    736         # TODO: factor out to hmm.pyx
    737         cdef int j, N = self.N
    738         cdef Py_ssize_t t, T = alpha._length//N
    739         cdef TimeSeries gamma = TimeSeries(alpha._length, initialize=False)
    740         cdef double denominator
    741         for t in range(T):
    742             denominator = 0
    743             for j in range(N):
    744                 gamma._values[t*N + j] = alpha._values[t*N + j] * beta._values[t*N + j]
    745                 denominator += gamma._values[t*N + j]
    746             for j in range(N):
    747                 gamma._values[t*N + j] /= denominator
    748         return gamma
     749    cdef TimeSeries _baum_welch_xi(self, TimeSeries alpha, TimeSeries beta, TimeSeries obs):
     750        """
     751        Used internally to compute the scaled quantity xi_t(i,j)
     752        appearing in the Baum-Welch reestimation algorithm.
    749753
    750     cdef TimeSeries _baum_welch_xi(self, TimeSeries alpha, TimeSeries beta, TimeSeries obs):
    751         # TODO: factor out to hmm.pyx
    752         """
    753         Return 3-dimensional array of xi values.
    754         We have x[t,i,j] = x[t*N*N + i*N + j].
     754        INPUT:
     755
     756            - ``alpha`` -- TimeSeries as output by the scaled forward algorithm
     757            - ``beta`` -- TimeSeries as output by the scaled backward algorithm
     758            - ``obs`` -- TimeSeries of observations
     759
     760        OUTPUT:
     761
     762            - TimeSeries xi such that xi[t*N*N + i*N + j] = xi_t(i,j).
    755763        """
    756764        cdef int i, j, N = self.N
    757765        cdef double sum
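
    The xi quantity documented above follows the standard Baum-Welch reestimation
    formula xi_t(i,j) proportional to alpha_t(i) * A[i][j] * b_j(obs[t+1]) * beta_{t+1}(j),
    stored flat as xi[t*N*N + i*N + j]. A hedged plain-Python sketch, where ``A`` (the
    transition matrix as a list of lists) and ``b(j, o)`` (the emission density) are
    hypothetical stand-ins for the model's attributes:

        # Plain-Python sketch of the standard xi_t(i,j) computation, flattened as in
        # the docstring above.  `A` and `b(j, o)` are hypothetical stand-ins.
        def baum_welch_xi_sketch(alpha, beta, obs, A, b, N):
            T = len(alpha) // N
            xi = [0.0] * ((T - 1) * N * N)
            for t in range(T - 1):
                total = 0.0
                for i in range(N):
                    for j in range(N):
                        v = alpha[t*N + i] * A[i][j] * b(j, obs[t + 1]) * beta[(t+1)*N + j]
                        xi[t*N*N + i*N + j] = v
                        total += v
                for k in range(N * N):          # normalize so xi_t sums to 1
                    xi[t*N*N + k] /= total
            return xi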
     
    943951    Gaussian mixture Hidden Markov Model.
    944952
    945953    INPUT:
    946         - A  -- matrix; the N x N transition matrix
    947         - B -- list of pairs (mu,sigma) that define the distributions
    948         - pi -- initial state probabilities
    949 
     954   
     955        - ``A``  -- matrix; the N x N transition matrix
     956        - ``B`` -- list of pairs (mu,sigma) that define the distributions
     957        - ``pi`` -- initial state probabilities
     958        - ``normalize`` -- bool (default: True); if given, input is
     959          normalized to define valid probability distributions,
     960          e.g., the entries of A are made nonnegative and the rows
     961          sum to 1, and the probabilities in pi are normalized.
    950962
    951963    EXAMPLES::
    952964
     
    9971009
    9981010    def __init__(self, A, B, pi=None, bint normalize=True):
    9991011        """
     1012        Initialize a Gaussian mixture hidden Markov model.
     1013       
    10001014        EXAMPLES::
    10011015
    10021016            sage: hmm.GaussianMixtureHiddenMarkovModel([[.9,.1],[.4,.6]], [[(.4,(0,1)), (.6,(1,0.1))],[(1,(0,1))]], [.7,.3])
     
    10381052        """
    10391053        Used in pickling.
    10401054
    1041         EXAMPLES:
     1055        EXAMPLES::
     1056       
    10421057            sage: m = hmm.GaussianMixtureHiddenMarkovModel([[1]], [[(.4,(0,1)), (.6,(1,0.1))]], [1])
    10431058            sage: loads(dumps(m)) == m
    10441059            True
     
    11691184        cdef GaussianMixtureDistribution G = self.mixture[state]
    11701185        return G.prob(observation)
    11711186
    1172     # TODO: factor out to hmm.pyx
    1173     cdef TimeSeries _baum_welch_gamma(self, TimeSeries alpha, TimeSeries beta):
    1174         """
    1175         Returns gamma and gamma_all, where gamma_all contains the
    1176         (double-precision float) numbers gamma_t(j,m), and gamma
    1177         contains just the gamma_t(j).
    1178         """
    1179         cdef int j, N = self.N
    1180         cdef Py_ssize_t t, T = alpha._length//N
    1181         cdef TimeSeries gamma = TimeSeries(alpha._length, initialize=False)
    1182         cdef double denominator
    1183         for t in range(T):
    1184             denominator = 0
    1185             for j in range(N):
    1186                 gamma._values[t*N + j] = alpha._values[t*N + j] * beta._values[t*N + j]
    1187                 denominator += gamma._values[t*N + j]
    1188             for j in range(N):
    1189                 gamma._values[t*N + j] /= denominator
    1190 
    1191         return gamma
    1192 
    11931187    cdef TimeSeries _baum_welch_mixed_gamma(self, TimeSeries alpha, TimeSeries beta,
    11941188                                            TimeSeries obs, int j):
    11951189        """
  • sage/stats/hmm/hmm.pxd

    diff -r add8a97d36c8 -r 3b663ff7aae8 sage/stats/hmm/hmm.pxd
    a b  
    77
    88
    99from sage.finance.time_series cimport TimeSeries
     10from sage.stats.intlist cimport IntList
    1011
    1112cdef class HiddenMarkovModel:
    1213    cdef int N
    1314    cdef TimeSeries A, pi
    1415
     16    cdef TimeSeries _baum_welch_gamma(self, TimeSeries alpha, TimeSeries beta)
     17
  • sage/stats/hmm/hmm.pyx

    diff -r add8a97d36c8 -r 3b663ff7aae8 sage/stats/hmm/hmm.pyx
    a b  
    3636    double log(double)
    3737
    3838from sage.finance.time_series cimport TimeSeries
    39 from sage.stats.intlist cimport IntList
    4039from sage.matrix.all import is_Matrix, matrix
    4140from sage.misc.randstate cimport current_randstate, randstate
    4241
     
    151150        Return ``number`` samples from this HMM, each of the given length.
    152151
    153152        INPUT:
    154             length -- positive integer
    155             number -- (default: None) if given, compute list of this many sample sequences
     153       
     154            - ``length`` -- positive integer
     155            - ``number`` -- (default: None) if given, compute a list of this many sample sequences
    156156
    157157        OUTPUT:
    158             if number is not given, return a single TimeSeries.
    159             if number is given, return a list of TimeSeries.
     158       
     159            - if number is not given, return a single TimeSeries.
     160            - if number is given, return a list of TimeSeries.
    160161
    161         EXAMPLES:
     162        EXAMPLES::
     163       
    162164            sage: set_random_seed(0)
    163165            sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[1,0],[0,1]], [0,1])
    164166            sage: print a.sample(10, 3)
     
    170172            sage: list(a.sample(1000)).count(0)
    171173            88
    172174
    173         If the emission symbols are set
     175        If the emission symbols are set::
     176       
    174177            sage: set_random_seed(0)
    175178            sage: a = hmm.DiscreteHiddenMarkovModel([[0.5,0.5],[0.1,0.9]], [[1,0],[0,1]], [0,1], ['up', 'down'])
    176179            sage: a.sample(10)
     
    182185        cdef Py_ssize_t i
    183186        return [self.generate_sequence(length)[0] for i in range(number)]
    184187
     188
     189    #########################################################
     190    # Some internal functions used for various general
     191    # HMM algorithms.
     192    #########################################################
     193    cdef TimeSeries _baum_welch_gamma(self, TimeSeries alpha, TimeSeries beta):
     194        """
     195        Used internally to compute the scaled quantity gamma_t(j)
     196        appearing in the Baum-Welch reestimation algorithm.
     197
     198        The quantity gamma_t(j) is the (scaled!) probability of being
     199        in state j at time t, given the observation sequence.
     200
     201        INPUT:
     202
     203            - ``alpha`` -- TimeSeries as output by the scaled forward algorithm
     204            - ``beta`` -- TimeSeries as output by the scaled backward algorithm
     205
     206        OUTPUT:
     207
     208            - TimeSeries gamma such that gamma[t*N+j] is gamma_t(j).
     209        """
     210        cdef int j, N = self.N
     211        cdef Py_ssize_t t, T = alpha._length//N
     212        cdef TimeSeries gamma = TimeSeries(alpha._length, initialize=False)
     213        cdef double denominator
     214        _sig_on
     215        for t in range(T):
     216            denominator = 0
     217            for j in range(N):
     218                gamma._values[t*N + j] = alpha._values[t*N + j] * beta._values[t*N + j]
     219                denominator += gamma._values[t*N + j]
     220            for j in range(N):
     221                gamma._values[t*N + j] /= denominator
     222        _sig_off
     223        return gamma
     224
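
    The flat layout gamma[t*N + j] used above corresponds to entry (t, j) of a T x N
    matrix. For orientation only, a hedged numpy formulation of the same computation,
    assuming alpha and beta are given as flat sequences of length T*N:

        # Equivalent numpy sketch of the loop above (illustrative, not the patched code).
        import numpy as np

        def baum_welch_gamma_sketch(alpha, beta, N):
            a = np.asarray(alpha, dtype=float).reshape(-1, N)   # shape (T, N)
            b = np.asarray(beta, dtype=float).reshape(-1, N)
            g = a * b
            g /= g.sum(axis=1, keepdims=True)                   # normalize each row over j
            return g.ravel()                                    # back to the flat layout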
     225
    185226cdef class DiscreteHiddenMarkovModel(HiddenMarkovModel):
    186227    """
    187228    A discrete Hidden Markov model implemented using double precision
     
    230271
    231272        INPUT::
    232273
    233            - A -- a list of lists or a square N x N matrix, whose
     274           - ``A`` -- a list of lists or a square N x N matrix, whose
    234275             (i,j) entry gives the probability of transitioning from
    235276             state i to state j.
    236277
    237            - B -- a list of N lists or a matrix with N rows, such that
     278           - ``B`` -- a list of N lists or a matrix with N rows, such that
    238279             B[i,k] gives the probability of emitting symbol k while
    239280             in state i.
    240281
    241            - pi -- the probabilities of starting in each initial
     282           - ``pi`` -- the probabilities of starting in each initial
    242283             state, i.e., pi[i] is the probability of starting in
    243284             state i.
    244285
    245            - emission_symbols -- None or list (default: None); if
     286           - ``emission_symbols`` -- None or list (default: None); if
    246287             None, the emission_symbols are the ints [0..N-1], where N
    247288             is the number of states.  Otherwise, they are the entries
    247288             of the list emission_symbols, which must all be hashable.
    249290
    250            - normalize --bool (default: True); if given, input is
     291           - ``normalize`` -- bool (default: True); if given, input is
    251292             normalized to define valid probability distributions,
    252293             e.g., the entries of A are made nonnegative and the rows
    253              sum to 1.
     294             sum to 1, and the probabilities in pi are normalized.
    254295
    255296        EXAMPLES::
    256297
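
    As a quick orientation sketch (not a doctest from the patch), construction mirrors
    the doctests shown elsewhere in this attachment:

        # Hedged construction sketch; argument shapes follow the INPUT block above.
        A  = [[0.5, 0.5], [0.1, 0.9]]     # N x N transition matrix
        B  = [[1, 0], [0, 1]]             # emission probabilities, one row per state
        pi = [0.4, 0.6]                   # initial state probabilities
        m  = hmm.DiscreteHiddenMarkovModel(A, B, pi, ['up', 'down'])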
     
    274315        if self._emission_symbols is not None:
    275316            self._emission_symbols_dict = dict([(y,x) for x,y in enumerate(emission_symbols)])
    276317
    277         # TODO : normalization
    278318        if not is_Matrix(B):
    279319            B = matrix(B)
    280320        if B.nrows() != self.N:
     
    292332        """
    293333        Used in pickling.
    294334
    295         EXAMPLES:
     335        EXAMPLES::
     336       
    296337            sage: m = hmm.DiscreteHiddenMarkovModel([[0.4,0.6],[0.1,0.9]], [[0.0,1.0],[1,1]], [0,1], ['a','b'])
    297338            sage: loads(dumps(m)) == m
    298339            True
     
    419460
    420461        INPUT:
    421462
    422             - obs -- sequence of observations
     463            - ``obs`` -- sequence of observations
    423464
    424             - scale -- boolean (default: True); if True, use rescaling
     465            - ``scale`` -- boolean (default: True); if True, use rescaling
    425466              to avoid loss of precision due to the very limited
    426467              dynamic range of floats.  You should leave this as True
    427468              unless the obs sequence is very small.
     
    467508
    468509        INPUT:
    469510
    470             - obs -- an integer list of observation states.
     511            - ``obs`` -- an integer list of observation states.
    471512
    472513        OUTPUT:
    473514
    474             - float -- the log of the probability that the model produced this sequence
     515            - ``float`` -- the log of the probability that the model produced this sequence
    475516
    476517        EXAMPLES::
    477518
     
    522563
    523564        INPUT:
    524565
    525             - obs -- an integer list of observation states.
     566            - ``obs`` -- an integer list of observation states.
    526567
    527568        OUTPUT:
    528569
    529             - float -- the log of the probability that the model produced this sequence
     570            - ``float`` -- the log of the probability that the model produced this sequence
    530571
    531572        EXAMPLES::
    532573
     
    611652
    612653        INPUT:
    613654
    614             - length -- positive integer
     655            - ``length`` -- positive integer
    615656
    616657        OUTPUT:
    617658
     
    712753
    713754        INPUT:
    714755
    715             - q -- a nonnegative integer, which specifies a state
    716 
    717             - r -- a real number between 0 and 1
     756            - ``q`` -- a nonnegative integer, which specifies a state
     757            - ``r`` -- a real number between 0 and 1
    718758
    719759        OUTPUT:
    720760
     
    726766            sage: set_random_seed(0)
    727767            sage: m.generate_sequence(10)  # indirect test
    728768            ([0, 1, 0, 0, 0, 0, 1, 0, 1, 1], [1, 0, 1, 1, 1, 1, 0, 0, 0, 0])
    729 
    730769        """
    731770        cdef Py_ssize_t j
    732771        cdef double a, accum = 0
     
    752791
    753792        INPUT:
    754793
    755             - seq -- sequence of emitted ints or symbols
     794            - ``seq`` -- sequence of emitted ints or symbols
    756795
    757             - log_scale -- bool (default: True) whether to scale the
     796            - ``log_scale`` -- bool (default: True) whether to scale the
    758797              sequence in order to avoid numerical overflow.
    759798
    760799        OUTPUT:
    761800
    762             - list -- "the" most probable sequence of hidden states, i.e.,
     801            - ``list`` -- "the" most probable sequence of hidden states, i.e.,
    763802              the Viterbi path.
    764803
    765             - float -- log of probability that the observed sequence
     804            - ``float`` -- log of probability that the observed sequence
    766805              was produced by the Viterbi sequence of states.
    767806
    768807        EXAMPLES::
     
    797836
    798837        INPUT:
    799838
    800             - obs -- IntList
     839            - ``obs`` -- IntList
    801840
    802841        OUTPUT:
    803842
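
    The viterbi method documented above returns the most probable hidden-state path
    together with its log probability. A hedged log-space sketch in plain Python, where
    A, B and pi are assumed to be plain lists standing in for the model's parameters
    and obs is a list of integer emission symbols:

        # Log-space Viterbi sketch (illustrative only, not the patched Cython code).
        from math import log

        NEG_INF = float('-inf')

        def viterbi_sketch(A, B, pi, obs):
            ll = lambda x: log(x) if x > 0 else NEG_INF
            N, T = len(pi), len(obs)
            delta = [ll(pi[i]) + ll(B[i][obs[0]]) for i in range(N)]
            back = []
            for t in range(1, T):
                prev, delta, ptr = delta, [], []
                for j in range(N):
                    best = max(range(N), key=lambda i: prev[i] + ll(A[i][j]))
                    ptr.append(best)
                    delta.append(prev[best] + ll(A[best][j]) + ll(B[j][obs[t]]))
                back.append(ptr)
            state = max(range(N), key=lambda i: delta[i])
            log_prob = delta[state]
            path = [state]
            for ptr in reversed(back):       # backtrack through the stored pointers
                state = ptr[state]
                path.append(state)
            path.reverse()
            return path, log_prob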
     
    9701009
    9711010        INPUT:
    9721011
    973             - obs -- IntList
    974             - scale -- series that is *changed* in place, so that
     1012            - ``obs`` -- IntList
     1013            - ``scale`` -- series that is *changed* in place, so that
    9751014              after calling this function, scale[t] is the value that is
    9761015              used to scale each of the `\beta_t(i)`.
    9771016
     
    10081047
    10091048        INPUT:
    10101049
    1011             - obs -- IntList
     1050            - ``obs`` -- IntList
    10121051
    10131052        OUTPUT:
    10141053
     
    10621101        # Termination
    10631102        return alpha, scale, log_probability
    10641103
    1065     cdef TimeSeries _baum_welch_gamma(self, TimeSeries alpha, TimeSeries beta):
    1066         """
    1067         Used internally to compute the scaled quantity gamma_t(j)
    1068         appearing in the Baum-Welch reestimation algorithm.
    1069 
    1070         The quantity gamma_t(j) is the (scaled!) probability of being
    1071         in state j at time t, given the observation sequence.
    1072 
    1073         INPUT:
    1074 
    1075             - alpha -- TimeSeries as output by the scaled forward algorithm
    1076             - beta -- TimeSeries as output by the scaled backward algorithm
    1077 
    1078         OUTPUT:
    1079 
    1080             - TimeSeries gamma such that gamma[t*N+j] is gamma_t(j).
    1081         """
    1082         cdef int j, N = self.N
    1083         cdef Py_ssize_t t, T = alpha._length//N
    1084         cdef TimeSeries gamma = TimeSeries(alpha._length, initialize=False)
    1085         cdef double denominator
    1086         _sig_on
    1087         for t in range(T):
    1088             denominator = 0
    1089             for j in range(N):
    1090                 gamma._values[t*N + j] = alpha._values[t*N + j] * beta._values[t*N + j]
    1091                 denominator += gamma._values[t*N + j]
    1092             for j in range(N):
    1093                 gamma._values[t*N + j] /= denominator
    1094         _sig_off
    1095         return gamma
    1096 
    10971104    cdef TimeSeries _baum_welch_xi(self, TimeSeries alpha, TimeSeries beta, IntList obs):
    10981105        """
    10991106        Used internally to compute the scaled quantity xi_t(i,j)
     
    11011108
    11021109        INPUT:
    11031110
    1104             - alpha -- TimeSeries as output by the scaled forward algorithm
    1105             - beta -- TimeSeries as output by the scaled backward algorithm
    1106             - obs -- IntList of observations
     1111            - ``alpha`` -- TimeSeries as output by the scaled forward algorithm
     1112            - ``beta`` -- TimeSeries as output by the scaled backward algorithm
     1113            - ``obs`` -- IntList of observations
    11071114
    11081115        OUTPUT:
    11091116
     
    11341141
    11351142        INPUT:
    11361143
    1137             - obs -- list of emissions
     1144            - ``obs`` -- list of emissions
    11381145
    1139             - max_iter -- integer (default: 100) maximum number
     1146            - ``max_iter`` -- integer (default: 100) maximum number
    11401147              of Baum-Welch steps to take
    11411148
    1142             - log_likehood_cutoff -- positive float (default: 1e-4);
     1149            - ``log_likehood_cutoff`` -- positive float (default: 1e-4);
    11431150              the minimal improvement in likelihood with respect to
    11441151              the last iteration required to continue, measured relative
    11451152              to the log likelihood.
    11461153
    1147             - fix_emissions -- bool (default: False); if True, do not
     1154            - ``fix_emissions`` -- bool (default: False); if True, do not
    11481155              change emissions when updating
    11491156
    11501157        OUTPUT:
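
    The parameters documented above control the outer training loop. A hedged sketch of
    that loop's structure, with e_step and m_step passed in as hypothetical placeholders
    for the reestimation formulas (the gamma/xi pass and the updates to A, pi and the
    emissions):

        # Hedged sketch of the Baum-Welch outer loop; e_step and m_step are
        # hypothetical callables, not functions from this patch.
        def baum_welch_sketch(e_step, m_step, model, obs,
                              max_iter=100, cutoff=1e-4, fix_emissions=False):
            prev_ll = None
            ll = float('-inf')
            for _ in range(max_iter):
                gamma, xi, ll = e_step(model, obs)             # scaled forward/backward pass
                m_step(model, obs, gamma, xi, fix_emissions)   # update A, pi (and emissions unless fixed)
                # Stop when the relative improvement in log likelihood drops below the cutoff.
                if prev_ll is not None and abs(ll - prev_ll) < cutoff * abs(prev_ll):
                    break
                prev_ll = ll
            return ll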
  • sage/stats/hmm/util.pyx

    diff -r add8a97d36c8 -r 3b663ff7aae8 sage/stats/hmm/util.pyx
    a b  
    5252            sage: u.normalize_probability_TimeSeries(T,0,4)
    5353            sage: T
    5454            [0.0606, 0.1818, 0.4242, 0.3333]
    55             sage: T.sum()
    56             1.0                   # 32-bit
    57             0.99999999999999978   # 64-bit
     55            sage: abs(T.sum()-1) < 1e-8    # might not exactly equal 1 due to rounding
     56            True
    5857        """
    5958        # One single bounds check only
    6059        if i < 0 or j < 0 or i > T._length or j > T._length:
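
    For comparison, the behaviour exercised by the doctest above (dividing the entries
    of T[i:j] by their sum) can be sketched in plain Python; the bounds and edge-case
    handling of the Cython routine are not reproduced here:

        # Plain-Python sketch: normalize values[i:j] in place so the slice sums to 1.
        def normalize_slice(values, i, j):
            total = sum(values[i:j])
            if total > 0:
                for k in range(i, j):
                    values[k] /= total

        v = [0.2, 0.6, 1.4, 1.1]
        normalize_slice(v, 0, 4)
        # v is now [0.0606..., 0.1818..., 0.4242..., 0.3333...], matching the doctest.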