# HG changeset patch
# User William Stein <wstein@gmail.com>
# Date 1259024580 28800
# Node ID f112ccff635aed6344d17140b06d49777d62a37a
# Parent 9239204dda36e49912adadbd689556c1987cb7aa
trac 7197  basic statistics functions ; part 10  referee partch
diff r 9239204dda36 r f112ccff635a sage/stats/basic_stats.py
a

b


20  20  Functions are available in the namespace ``stats``, i.e. you can use them by 
21  21  typing ``stats.mean``, ``stats.median``, etc. 
22  22  
 23  REMARK: If all the data you are working with are floating point 
 24  numbers, you may find ``finance.TimeSeries`` helpful, since it is 
 25  extremely fast and offers many of the same descriptive statistics as 
 26  in the module. 
23  27  
24  28  AUTHOR: 
25  29  
… 
… 

43  47  """ 
44  48  Return the mean of the elements of `v`. 
45  49  
46   We define the mean of the empty list to be NaN, following the 
47   convention of MATLAB, Scipy, and R. 
 50  We define the mean of the empty list to be the (symbolic) NaN, 
 51  following the convention of MATLAB, Scipy, and R. 
48  52  
49  53  INPUT: 
50  54  
… 
… 

69  73  sage: v = finance.TimeSeries([1..100]) 
70  74  sage: mean(v) 
71  75  50.5 
72   
73  76  """ 
74  77  if hasattr(v, 'mean'): return v.mean() 
75  78  if len(v) == 0: 
… 
… 

80  83  return s/ZZ(len(v)) 
81  84  return s/len(v) 
82  85  
83   
84   
85  86  def mode(v): 
86  87  """ 
87   Return the mode (most common) of the elements of `v` 
 88  Return the mode of `v`. The mode is the sorted list of the most 
 89  frequently occuring elements in `v`. If `n` is the most times 
 90  that any element occurs in `v`, then the mode is the sorted list 
 91  of elements of `v` that occur `n` times. 
88  92  
89   If `v` is empty, we define the mode to be null. 
90   If all elements occur only once, we define the mode to be null. 
91   If multiple elements occur at the same frequency, the lowest element 
92   will be displayed. 
 93  NOTE: The elements of `v` must be hashable and comparable. 
93  94  
94   
95  95  INPUT: 
96  96  
97  97   `v`  a list 
98  98  
99  99  OUTPUT: 
100  100  
101    [(element, number_of_occurences)] 
 101   a list 
102  102  
103  103  EXAMPLES:: 
104  104  
105   sage: mode([1,2,4,1,6,2,6,7,1]) 
106   [(1, 3)] 
 105  sage: v = [1,2,4,1,6,2,6,7,1] 
 106  sage: mode(v) 
 107  [1] 
 108  sage: v.count(1) 
 109  3 
107  110  sage: mode([]) 
108  111  [] 
109  112  sage: mode([1,2,3,4,5]) 
110   [] 
111   sage: mode([1,2,1,2,3,3]) 
112   [(1, 2)] 
 113  [1, 2, 3, 4, 5] 
 114  sage: mode([3,1,2,1,2,3]) 
 115  [1, 2, 3] 
113  116  sage: mode(['sage', 4, I, 3/5, 'sage', pi]) 
114   [('sage', 2)] 
 117  ['sage'] 
115  118  sage: class MyClass: 
116  119  ... def mode(self): 
117   ... return 1 
 120  ... return [1] 
118  121  sage: stats.mode(MyClass()) 
119   1 
 122  [1] 
120  123  """ 
121  124  if hasattr(v, 'mode'): return v.mode() 
122  125  from operator import itemgetter 
123  126  
124  127  freq = {} 
125  128  for i in v: 
126   try: 
 129  if freq.has_key(i): 
127  130  freq[i] += 1 
128   except KeyError: 
 131  else: 
129  132  freq[i] = 1 
130  133  
131  134  s = sorted(freq.items(), key=itemgetter(1), reverse=True) 
132   if not s or s[0][1]==1: 
133   # no mode if all occur equally often 
134   return [] 
135   else: 
136   # returns lowest value of equally common values 
137   return [min([i for i in s if i[1]==s[0][1]])] 
138   
139   
 135  return [i[0] for i in s if i[1]==s[0][1]] 
140  136  
141  137  def std(v, bias=False): 
142  138  """ 
… 
… 

179  175  sage: x = finance.TimeSeries([1..100]) 
180  176  sage: std(x) 
181  177  29.011491975882016 
 178  """ 
182  179  
183   
184   """ 
 180  # NOTE: in R bias = False by default, and in Scipy bias=True by 
 181  # default, and R is more popular. 
 182  
185  183  if hasattr(v, 'standard_deviation'): return v.standard_deviation(bias=bias) 
186  184  
187  185  import numpy 
… 
… 

262  260  ... return 1 
263  261  sage: stats.variance(MyClass()) 
264  262  1 
265   
266   
267   
268  263  """ 
269  264  if hasattr(v, 'variance'): return v.variance(bias=bias) 
270  265  import numpy 
… 
… 

293  288  return x/(len(v)1) 
294  289  
295  290  
296   
297  291  def median(v): 
298  292  """ 
299  293  Return the median (middle value) of the elements of `v` 
300  294  
301   If `v` is empty, we define the median to be null. 
 295  If `v` is empty, we define the median to be NaN, which is 
 296  consistent with NumPy (note that R returns NULL). 
302  297  If `v` is comprised of strings, TypeError occurs. 
303   For elements other than numbers, the median is a result of ``sorted()`` 
 298  For elements other than numbers, the median is a result of ``sorted()``. 
304  299  
305  300  INPUT: 
306  301  
… 
… 

319  314  sage: median(['sage', 'linux', 'python']) 
320  315  'python' 
321  316  sage: median([]) 
322   [] 
 317  NaN 
323  318  sage: class MyClass: 
324  319  ... def median(self): 
325  320  ... return 1 
… 
… 

329  324  if hasattr(v, 'median'): return v.median() 
330  325  
331  326  if len(v) == 0: 
332   #median of empty set defined as null 
333   return [] 
 327  # Median of empty set defined as NaN 
 328  return NaN 
334  329  values = sorted(v) 
335  330  if len(values) % 2 == 1: 
336  331  return values[((len(values))+1)/21] 
… 
… 

339  334  upper = values[len(values)/2] 
340  335  return (lower + upper)/ZZ(2) 
341  336  
342   def moving_average(v, bins=1): 
 337  def moving_average(v, n): 
343  338  """ 
344  339  Provides the moving average of a list `v` 
345  340  
346   The moving average of a list is often used to smooth out noisy 
347   data. Given a selected number of bins, the original list will be 
348   cut up into that number of bins. Then, the mean of each bin is 
349   calculated, and appended into a new list. 
 341  The moving average of a list is often used to smooth out noisy data. 
350  342  
351  343  If `v` is empty, we define the entries of the moving average to be NaN. 
352  344  
… 
… 

354  346  
355  347   `v`  a list 
356  348  
357    ``bins``  number of bins, default set to 1 
 349   `n`  the number of values used in computing each average. 
358  350  
359  351  OUTPUT: 
360  352  
361    a list 
 353   a list of length ``len(v)n+1``, since we do not fabric any values 
362  354  
363  355  EXAMPLES:: 
364  356  
365   sage: moving_average([1..10],4) 
366   [3/2, 7/2, 11/2, 15/2] 
367   sage: moving_average([]) 
368   [NaN] 
 357  sage: moving_average([1..10], 1) 
 358  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 
 359  sage: moving_average([1..10], 4) 
 360  [5/2, 7/2, 9/2, 11/2, 13/2, 15/2, 17/2] 
 361  sage: moving_average([], 1) 
 362  [] 
369  363  sage: moving_average([pi, e, I, sqrt(2), 3/5], 2) 
370   [1/2*pi + 1/2*e, 1/2*sqrt(2) + 1/2*I] 
 364  [1/2*pi + 1/2*e, 1/2*e + 1/2*I, 1/2*sqrt(2) + 1/2*I, 1/2*sqrt(2) + 3/10] 
371  365  
 366  We check if the input is a time series, and if so use the 
 367  optimized ``simple_moving_average`` method, but with (slightly 
 368  different) meaning as defined above (the point is that the 
 369  ``simple_moving_average`` on time series returns `n` values:: 
 370  
 371  sage: a = finance.TimeSeries([1..10]) 
 372  sage: stats.moving_average(a, 3) 
 373  [2.0000, 3.0000, 4.0000, 5.0000, 6.0000, 7.0000, 8.0000, 9.0000] 
 374  sage: stats.moving_average(list(a), 3) 
 375  [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0] 
 376  
372  377  """ 
373   x = [] 
374   j = 0 
375   bin_size = int(len(v)//bins) 
376   
377   for i in range(bins): 
378   tmp = [] 
379   for k in range(j,j+bin_size): 
380   tmp.append(v[k]) 
381   x.append(mean(tmp)) 
382   j += bin_size 
383   return x 
 378  if len(v) == 0: 
 379  return v 
 380  from sage.finance.time_series import TimeSeries 
 381  if isinstance(v, TimeSeries): 
 382  return v.simple_moving_average(n)[n1:] 
 383  n = int(n) 
 384  if n <= 0: 
 385  raise ValueError, "n must be positive" 
 386  nn = ZZ(n) 
 387  s = sum(v[:n]) 
 388  ans = [s/nn] 
 389  for i in range(n,len(v)): 
 390  # add in the ith value in v to our running sum, 
 391  # and remove the value n places back. 
 392  s += v[i]  v[in] 
 393  ans.append(s/nn) 
 394  return ans 