Ticket #7197: trac_7197_part10.patch

File trac_7197_part10.patch, 7.7 KB (added by was, 12 years ago)
  • sage/stats/basic_stats.py

    # HG changeset patch
    # User William Stein <wstein@gmail.com>
    # Date 1259024580 28800
    # Node ID f112ccff635aed6344d17140b06d49777d62a37a
    # Parent  9239204dda36e49912adadbd689556c1987cb7aa
    trac 7197 -- basic statistics functions ; part 10 -- referee partch
    
    diff -r 9239204dda36 -r f112ccff635a sage/stats/basic_stats.py
    a b  
    2020Functions are available in the namespace ``stats``, i.e. you can use them by
    2121typing ``stats.mean``, ``stats.median``, etc.
    2222
     23REMARK: If all the data you are working with are floating point
     24numbers, you may find ``finance.TimeSeries`` helpful, since it is
     25extremely fast and offers many of the same descriptive statistics as
     26in the module. 
    2327
    2428AUTHOR:
    2529
     
    4347    """
    4448    Return the mean of the elements of `v`.
    4549
    46     We define the mean of the empty list to be NaN, following the
    47     convention of MATLAB, Scipy, and R.
     50    We define the mean of the empty list to be the (symbolic) NaN,
     51    following the convention of MATLAB, Scipy, and R.
    4852
    4953    INPUT:
    5054
     
    6973        sage: v = finance.TimeSeries([1..100])
    7074        sage: mean(v)
    7175        50.5
    72 
    7376    """
    7477    if hasattr(v, 'mean'): return v.mean()
    7578    if len(v) == 0:
     
    8083        return s/ZZ(len(v))
    8184    return s/len(v)
    8285
    83 
    84 
    8586def mode(v):
    8687    """
    87     Return the mode (most common) of the elements of `v`
     88    Return the mode of `v`.  The mode is the sorted list of the most
     89    frequently occuring elements in `v`.  If `n` is the most times
     90    that any element occurs in `v`, then the mode is the sorted list
     91    of elements of `v` that occur `n` times.
    8892   
    89     If `v` is empty, we define the mode to be null.
    90     If all elements occur only once, we define the mode to be null.
    91     If multiple elements occur at the same frequency, the lowest element
    92     will be displayed.
     93    NOTE: The elements of `v` must be hashable and comparable.
    9394
    94    
    9595    INPUT:
    9696
    9797        - `v` -- a list
    9898
    9999    OUTPUT:
    100100
    101         - [(element, number_of_occurences)]
     101        - a list
    102102
    103103    EXAMPLES::
    104104
    105         sage: mode([1,2,4,1,6,2,6,7,1])
    106         [(1, 3)]
     105        sage: v = [1,2,4,1,6,2,6,7,1]
     106        sage: mode(v)
     107        [1]
     108        sage: v.count(1)
     109        3
    107110        sage: mode([])
    108111        []
    109112        sage: mode([1,2,3,4,5])
    110         []
    111         sage: mode([1,2,1,2,3,3])
    112         [(1, 2)]
     113        [1, 2, 3, 4, 5]
     114        sage: mode([3,1,2,1,2,3])
     115        [1, 2, 3]
    113116        sage: mode(['sage', 4, I, 3/5, 'sage', pi])
    114         [('sage', 2)]
     117        ['sage']
    115118        sage: class MyClass:
    116119        ...     def mode(self):
    117         ...         return 1
     120        ...         return [1]
    118121        sage: stats.mode(MyClass())
    119         1
     122        [1]
    120123    """
    121124    if hasattr(v, 'mode'): return v.mode()
    122125    from operator import itemgetter
    123126   
    124127    freq = {}
    125128    for i in v:
    126         try:
     129        if freq.has_key(i):
    127130            freq[i] += 1
    128         except KeyError:
     131        else:
    129132            freq[i] = 1
    130133
    131134    s = sorted(freq.items(), key=itemgetter(1), reverse=True)
    132     if not s or s[0][1]==1:
    133         # no mode if all occur equally often
    134         return []
    135     else:
    136         # returns lowest value of equally common values
    137         return [min([i for i in s if i[1]==s[0][1]])]
    138 
    139 
     135    return [i[0] for i in s if i[1]==s[0][1]]
    140136
    141137def std(v, bias=False):
    142138    """
     
    179175        sage: x = finance.TimeSeries([1..100])
    180176        sage: std(x)
    181177        29.011491975882016
     178    """
    182179
    183 
    184     """
     180    # NOTE: in R bias = False by default, and in Scipy bias=True by
     181    # default, and R is more popular.
     182   
    185183    if hasattr(v, 'standard_deviation'): return v.standard_deviation(bias=bias)
    186184
    187185    import numpy
     
    262260        ...        return 1   
    263261        sage: stats.variance(MyClass())
    264262        1
    265 
    266 
    267 
    268263    """
    269264    if hasattr(v, 'variance'): return v.variance(bias=bias)
    270265    import numpy
     
    293288        return x/(len(v)-1)
    294289
    295290
    296 
    297291def median(v):
    298292    """
    299293    Return the median (middle value) of the elements of `v`
    300294
    301     If `v` is empty, we define the median to be null.
     295    If `v` is empty, we define the median to be NaN, which is
     296    consistent with NumPy (note that R returns NULL).
    302297    If `v` is comprised of strings, TypeError occurs.
    303     For elements other than numbers, the median is a result of ``sorted()``
     298    For elements other than numbers, the median is a result of ``sorted()``.
    304299
    305300    INPUT:
    306301
     
    319314        sage: median(['sage', 'linux', 'python'])
    320315        'python'
    321316        sage: median([])
    322         []
     317        NaN
    323318        sage: class MyClass:
    324319        ...      def median(self):
    325320        ...         return 1
     
    329324    if hasattr(v, 'median'): return v.median()
    330325
    331326    if len(v) == 0:
    332         #median of empty set defined as null
    333         return []
     327        # Median of empty set defined as NaN
     328        return NaN
    334329    values = sorted(v)
    335330    if len(values) % 2 == 1:
    336331        return values[((len(values))+1)/2-1]
     
    339334        upper = values[len(values)/2]
    340335        return (lower + upper)/ZZ(2)
    341336
    342 def moving_average(v, bins=1):
     337def moving_average(v, n):
    343338    """
    344339    Provides the moving average of a list `v`
    345340
    346     The moving average of a list is often used to smooth out noisy
    347     data. Given a selected number of bins, the original list will be
    348     cut up into that number of bins. Then, the mean of each bin is
    349     calculated, and appended into a new list.
     341    The moving average of a list is often used to smooth out noisy data.
    350342
    351343    If `v` is empty, we define the entries of the moving average to be NaN.
    352344
     
    354346 
    355347        - `v` -- a list
    356348
    357         - ``bins`` -- number of bins, default set to 1
     349        - `n` -- the number of values used in computing each average.
    358350
    359351    OUTPUT:
    360352
    361         - a list
     353        - a list of length ``len(v)-n+1``, since we do not fabric any values
    362354 
    363355    EXAMPLES::
    364356
    365         sage: moving_average([1..10],4)
    366         [3/2, 7/2, 11/2, 15/2]
    367         sage: moving_average([])
    368         [NaN]
     357        sage: moving_average([1..10], 1)
     358        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
     359        sage: moving_average([1..10], 4)
     360        [5/2, 7/2, 9/2, 11/2, 13/2, 15/2, 17/2]
     361        sage: moving_average([], 1)
     362        []
    369363        sage: moving_average([pi, e, I, sqrt(2), 3/5], 2)
    370         [1/2*pi + 1/2*e, 1/2*sqrt(2) + 1/2*I]
     364        [1/2*pi + 1/2*e, 1/2*e + 1/2*I, 1/2*sqrt(2) + 1/2*I, 1/2*sqrt(2) + 3/10]
    371365
     366    We check if the input is a time series, and if so use the
     367    optimized ``simple_moving_average`` method, but with (slightly
     368    different) meaning as defined above (the point is that the
     369    ``simple_moving_average`` on time series returns `n` values::
     370   
     371        sage: a = finance.TimeSeries([1..10])
     372        sage: stats.moving_average(a, 3)
     373        [2.0000, 3.0000, 4.0000, 5.0000, 6.0000, 7.0000, 8.0000, 9.0000]
     374        sage: stats.moving_average(list(a), 3)
     375        [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
     376       
    372377    """
    373     x = []   
    374     j = 0
    375     bin_size = int(len(v)//bins)
    376 
    377     for i in range(bins):
    378         tmp = []
    379         for k in range(j,j+bin_size):
    380             tmp.append(v[k])
    381         x.append(mean(tmp))
    382         j += bin_size
    383     return x
     378    if len(v) == 0:
     379        return v
     380    from sage.finance.time_series import TimeSeries
     381    if isinstance(v, TimeSeries):
     382        return v.simple_moving_average(n)[n-1:]
     383    n = int(n)
     384    if n <= 0:
     385        raise ValueError, "n must be positive"
     386    nn = ZZ(n)
     387    s = sum(v[:n])
     388    ans = [s/nn]
     389    for i in range(n,len(v)):
     390        # add in the i-th value in v to our running sum,
     391        # and remove the value n places back.
     392        s += v[i] - v[i-n]
     393        ans.append(s/nn)
     394    return ans