Ticket #7197: trac_7197.patch

File trac_7197.patch, 12.6 KB (added by mhansen, 12 years ago)

All of the above patches folded together. Apply only this patch.

  • sage/all.py

    # HG changeset patch
    # User Andrew Hou <hou.andrew@gmail.com>
    # Date 1256536506 25200
    # Node ID 7cde940a968b734ace640fb8d66f939d78b3da8a
    # Parent  120119a6d8c15a7dbc3ea8d14a72563c13d100c0
    basic_stats.py now works!
    * * *
    basic_stats.py works now!
    * * *
    copyright header block added
    numpy imported in functions where they're used
    * * *
    Moving average function added
    * * *
    More tests in docstring, better description. Fixed floor division.
    * * *
    Fixed documentation.
    * * *
    Fixed documentation
    * * *
    Mode function now returns lowest value if values occur equally. However, if all values only occur once, the mode is still null.
    * * *
    trac 7197 -- basic statistics functions ; part 10 -- referee partch
    
    diff --git a/sage/all.py b/sage/all.py
    a b  
    134134from sage.numerical.all  import *
    135135
    136136from sage.stats.all      import *
     137import sage.stats.all as stats
    137138
    138139import sage.finance.all  as finance
    139140
  • sage/stats/all.py

    diff --git a/sage/stats/all.py b/sage/stats/all.py
    a b  
    11from r import (ttest)
     2from basic_stats import (mean, mode, std, variance, median, moving_average)
    23
    34import hmm.all as hmm
     5
  • new file sage/stats/basic_stats.py

    diff --git a/sage/stats/basic_stats.py b/sage/stats/basic_stats.py
    new file mode 100644
    - +  
     1"""
     2Basic Statistics
     3
     4This file contains basic descriptive functions. Included are the mean,
     5median, mode, moving average, standard deviation, and the variance.
     6When calling a function on data, there are checks for functions already
     7defined for that data type.
     8
     9The ``mean`` function returns the arithmetic mean (the sum of all the members
     10of a list, divided by the number of members). Further revisions may include
     11the geometric and harmonic mean. The ``median`` function returns the number
     12separating the higher half of a sample from the lower half. The ``mode``
     13returns the most common occuring member of a sample, plus the number of times
     14it occurs. If entries occur equally common, the smallest of a list of the most
     15common  entries is returned. The ``moving_average`` is a finite impulse
     16response filter, creating a series of averages using a user-defined number of
     17subsets of the full data set. The ``std`` and the ``variance`` return a
     18measurement of how far data points tend to be from the arithmetic mean.
     19
     20Functions are available in the namespace ``stats``, i.e. you can use them by
     21typing ``stats.mean``, ``stats.median``, etc.
     22
     23REMARK: If all the data you are working with are floating point
     24numbers, you may find ``finance.TimeSeries`` helpful, since it is
     25extremely fast and offers many of the same descriptive statistics as
     26in the module. 
     27
     28AUTHOR:
     29
     30    - Andrew Hou (11/06/2009)
     31 
     32"""
     33######################################################################
     34#          Copyright (C) 2009, Andrew Hou <amhou@uw.edu>
     35#
     36#  Distributed under the terms of the GNU General Public License (GPL)
     37#
     38#            The full text of the GPL is available at:
     39#                  http://www.gnu.org/licenses/
     40######################################################################
     41
     42from sage.rings.integer_ring import ZZ
     43from sage.symbolic.constants import NaN
     44from sage.functions.other import sqrt
     45
     46def mean(v):
     47    """
     48    Return the mean of the elements of `v`.
     49
     50    We define the mean of the empty list to be the (symbolic) NaN,
     51    following the convention of MATLAB, Scipy, and R.
     52
     53    INPUT:
     54
     55        - `v` -- a list of numbers
     56
     57    OUTPUT:
     58
     59        - a number
     60
     61    EXAMPLES::
     62
     63        sage: mean([pi, e])
     64        1/2*pi + 1/2*e
     65        sage: mean([])
     66        NaN
     67        sage: mean([I, sqrt(2), 3/5])
     68        1/3*sqrt(2) + 1/3*I + 1/5
     69        sage: mean([RIF(1.0103,1.0103), RIF(2)])
     70        1.5051500000000000?
     71        sage: mean(range(4))
     72        3/2
     73        sage: v = finance.TimeSeries([1..100])
     74        sage: mean(v)
     75        50.5
     76    """
     77    if hasattr(v, 'mean'): return v.mean()
     78    if len(v) == 0:
     79        return NaN
     80    s = sum(v)
     81    if isinstance(s, (int,long)):
     82        # python integers are stupid.
     83        return s/ZZ(len(v))
     84    return s/len(v)
     85
     86def mode(v):
     87    """
     88    Return the mode of `v`.  The mode is the sorted list of the most
     89    frequently occuring elements in `v`.  If `n` is the most times
     90    that any element occurs in `v`, then the mode is the sorted list
     91    of elements of `v` that occur `n` times.
     92   
     93    NOTE: The elements of `v` must be hashable and comparable.
     94
     95    INPUT:
     96
     97        - `v` -- a list
     98
     99    OUTPUT:
     100
     101        - a list
     102
     103    EXAMPLES::
     104
     105        sage: v = [1,2,4,1,6,2,6,7,1]
     106        sage: mode(v)
     107        [1]
     108        sage: v.count(1)
     109        3
     110        sage: mode([])
     111        []
     112        sage: mode([1,2,3,4,5])
     113        [1, 2, 3, 4, 5]
     114        sage: mode([3,1,2,1,2,3])
     115        [1, 2, 3]
     116        sage: mode(['sage', 4, I, 3/5, 'sage', pi])
     117        ['sage']
     118        sage: class MyClass:
     119        ...     def mode(self):
     120        ...         return [1]
     121        sage: stats.mode(MyClass())
     122        [1]
     123    """
     124    if hasattr(v, 'mode'): return v.mode()
     125    from operator import itemgetter
     126   
     127    freq = {}
     128    for i in v:
     129        if freq.has_key(i):
     130            freq[i] += 1
     131        else:
     132            freq[i] = 1
     133
     134    s = sorted(freq.items(), key=itemgetter(1), reverse=True)
     135    return [i[0] for i in s if i[1]==s[0][1]]
     136
     137def std(v, bias=False):
     138    """
     139    Returns the standard deviation of the elements of `v`
     140
     141    We define the standard deviation of the empty list to be NaN,
     142    following the convention of MATLAB, Scipy, and R.
     143
     144    INPUT:
     145
     146        - `v` -- a list of numbers
     147       
     148        - ``bias`` -- bool (default: False); if False, divide by
     149                      len(v) - 1 instead of len(v)
     150                      to give a less biased estimator (sample) for the
     151                      standard deviation.
     152
     153    OUTPUT:
     154       
     155        - a number
     156
     157    EXAMPLES::
     158
     159        sage: std([1..6], bias=True)
     160        1/2*sqrt(35/3)
     161        sage: std([1..6], bias=False)
     162        sqrt(7/2)
     163        sage: std([e, pi])
     164        sqrt(1/2)*sqrt((pi - e)^2)
     165        sage: std([])
     166        NaN
     167        sage: std([I, sqrt(2), 3/5])
     168        sqrt(1/450*(5*sqrt(2) + 5*I - 6)^2 + 1/450*(5*sqrt(2) - 10*I + 3)^2 + 1/450*(10*sqrt(2) - 5*I - 3)^2)
     169        sage: std([RIF(1.0103, 1.0103), RIF(2)])
     170        0.6998235813403261?
     171        sage: import numpy
     172        sage: x = numpy.array([1,2,3,4,5])
     173        sage: std(x, bias=False)
     174        1.5811388300841898
     175        sage: x = finance.TimeSeries([1..100])
     176        sage: std(x)
     177        29.011491975882016
     178    """
     179
     180    # NOTE: in R bias = False by default, and in Scipy bias=True by
     181    # default, and R is more popular.
     182   
     183    if hasattr(v, 'standard_deviation'): return v.standard_deviation(bias=bias)
     184
     185    import numpy
     186
     187    x = 0
     188    if isinstance(v, numpy.ndarray):
     189        # accounts for numpy arrays
     190        if bias == True:
     191            return v.std()
     192        elif bias == False:
     193            return v.std(ddof=1)
     194
     195    if len(v) == 0:
     196        # standard deviation of empty set defined as NaN
     197        return NaN
     198    for i in range(len(v)):
     199        x += (v[i] - mean(v))**2
     200    if bias:
     201        # population standard deviation
     202        if isinstance(x, (int,long)):
     203            return sqrt(x/ZZ(len(v)))
     204        return sqrt(x/len(v))
     205    else:
     206        # sample standard deviation
     207        if isinstance(x, (int,long)):
     208            return sqrt(x/ZZ(len(v)))
     209        return sqrt(x/(len(v)-1))
     210
     211
     212
     213
     214def variance(v, bias=False):
     215    """
     216    Returns the variance of the elements of `v`
     217
     218    We define the variance of the empty list to be NaN,
     219    following the convention of MATLAB, Scipy, and R.
     220
     221    INPUT:
     222
     223        - `v` -- a list of numbers
     224       
     225        - ``bias`` -- bool (default: False); if False, divide by
     226                      len(v) - 1 instead of len(v)
     227                      to give a less biased estimator (sample) for the
     228                      standard deviation.
     229
     230    OUTPUT:
     231       
     232        - a number
     233
     234
     235    EXAMPLES::
     236
     237        sage: variance([1..6])
     238        7/2
     239        sage: variance([1..6], bias=True)
     240        35/12
     241        sage: variance([e, pi])                 
     242        1/2*(pi - e)^2
     243        sage: variance([])                     
     244        NaN
     245        sage: variance([I, sqrt(2), 3/5])       
     246        1/450*(5*sqrt(2) + 5*I - 6)^2 + 1/450*(5*sqrt(2) - 10*I + 3)^2 + 1/450*(10*sqrt(2) - 5*I - 3)^2
     247        sage: variance([RIF(1.0103, 1.0103), RIF(2)])
     248        0.4897530450000000?
     249        sage: import numpy
     250        sage: x = numpy.array([1,2,3,4,5])
     251        sage: variance(x, bias=False)
     252        2.5
     253        sage: x = finance.TimeSeries([1..100])
     254        sage: variance(x)
     255        841.66666666666663
     256        sage: variance(x, bias=True)
     257        833.25
     258        sage: class MyClass:
     259        ...     def variance(self, bias = False):
     260        ...        return 1   
     261        sage: stats.variance(MyClass())
     262        1
     263    """
     264    if hasattr(v, 'variance'): return v.variance(bias=bias)
     265    import numpy
     266
     267    x = 0
     268    if isinstance(v, numpy.ndarray):
     269        # accounts for numpy arrays
     270        if bias == True:
     271            return v.var()
     272        elif bias == False:
     273            return v.var(ddof=1)
     274    if len(v) == 0:
     275        # variance of empty set defined as NaN
     276        return NaN
     277    for i in range(len(v)):
     278        x += (v[i] - mean(v))**2
     279    if bias:
     280        # population variance
     281        if isinstance(x, (int,long)):
     282            return x/ZZ(len(v))
     283        return x/len(v)
     284    else:
     285        # sample variance
     286        if isinstance(x, (int,long)):
     287            return x/ZZ(len(v))
     288        return x/(len(v)-1)
     289
     290
     291def median(v):
     292    """
     293    Return the median (middle value) of the elements of `v`
     294
     295    If `v` is empty, we define the median to be NaN, which is
     296    consistent with NumPy (note that R returns NULL).
     297    If `v` is comprised of strings, TypeError occurs.
     298    For elements other than numbers, the median is a result of ``sorted()``.
     299
     300    INPUT:
     301
     302        - `v` -- a list
     303
     304    OUTPUT:
     305
     306        - median element of `v`
     307
     308    EXAMPLES::
     309
     310        sage: median([1,2,3,4,5])
     311        3
     312        sage: median([e, pi])
     313        1/2*pi + 1/2*e
     314        sage: median(['sage', 'linux', 'python'])
     315        'python'
     316        sage: median([])
     317        NaN
     318        sage: class MyClass:
     319        ...      def median(self):
     320        ...         return 1
     321        sage: stats.median(MyClass())
     322        1
     323    """
     324    if hasattr(v, 'median'): return v.median()
     325
     326    if len(v) == 0:
     327        # Median of empty set defined as NaN
     328        return NaN
     329    values = sorted(v)
     330    if len(values) % 2 == 1:
     331        return values[((len(values))+1)/2-1]
     332    else:
     333        lower = values[(len(values)+1)/2-1]
     334        upper = values[len(values)/2]
     335        return (lower + upper)/ZZ(2)
     336
     337def moving_average(v, n):
     338    """
     339    Provides the moving average of a list `v`
     340
     341    The moving average of a list is often used to smooth out noisy data.
     342
     343    If `v` is empty, we define the entries of the moving average to be NaN.
     344
     345    INPUT:
     346 
     347        - `v` -- a list
     348
     349        - `n` -- the number of values used in computing each average.
     350
     351    OUTPUT:
     352
     353        - a list of length ``len(v)-n+1``, since we do not fabric any values
     354 
     355    EXAMPLES::
     356
     357        sage: moving_average([1..10], 1)
     358        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
     359        sage: moving_average([1..10], 4)
     360        [5/2, 7/2, 9/2, 11/2, 13/2, 15/2, 17/2]
     361        sage: moving_average([], 1)
     362        []
     363        sage: moving_average([pi, e, I, sqrt(2), 3/5], 2)
     364        [1/2*pi + 1/2*e, 1/2*e + 1/2*I, 1/2*sqrt(2) + 1/2*I, 1/2*sqrt(2) + 3/10]
     365
     366    We check if the input is a time series, and if so use the
     367    optimized ``simple_moving_average`` method, but with (slightly
     368    different) meaning as defined above (the point is that the
     369    ``simple_moving_average`` on time series returns `n` values::
     370   
     371        sage: a = finance.TimeSeries([1..10])
     372        sage: stats.moving_average(a, 3)
     373        [2.0000, 3.0000, 4.0000, 5.0000, 6.0000, 7.0000, 8.0000, 9.0000]
     374        sage: stats.moving_average(list(a), 3)
     375        [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
     376       
     377    """
     378    if len(v) == 0:
     379        return v
     380    from sage.finance.time_series import TimeSeries
     381    if isinstance(v, TimeSeries):
     382        return v.simple_moving_average(n)[n-1:]
     383    n = int(n)
     384    if n <= 0:
     385        raise ValueError, "n must be positive"
     386    nn = ZZ(n)
     387    s = sum(v[:n])
     388    ans = [s/nn]
     389    for i in range(n,len(v)):
     390        # add in the i-th value in v to our running sum,
     391        # and remove the value n places back.
     392        s += v[i] - v[i-n]
     393        ans.append(s/nn)
     394    return ans