1 """module for performing statistical calculations.
2
3 (c) 2007-2012 Matt Hilton
4
5 (c) 2013-2014 Matt Hilton & Steven Boada
6
7 U{http://astlib.sourceforge.net}
8
9 This module (as you may notice) provides very few statistical routines. It does, however, provide
10 biweight (robust) estimators of location and scale, as described in Beers et al. 1990 (AJ, 100,
11 32), in addition to a robust least squares fitting routine that uses the biweight transform.
12
13 Some routines may fail if they are passed lists with few items and encounter a `divide by zero'
14 error. Where this occurs, the function will return None. An error message will be printed to the
15 console when this happens if astStats.REPORT_ERRORS=True (the default). Testing if an
16 astStats function returns None can be used to handle errors in scripts.
17
18 For extensive statistics modules, the Python bindings for GNU R (U{http://rpy.sourceforge.net}), or
19 SciPy (U{http://www.scipy.org}) are suggested.
20
21 """
22
23 import math
24 import numpy
25 import sys
26
27 REPORT_ERRORS=True
28
29
def mean(dataList):
    """Calculates the mean average of a list of numbers.

    Thin wrapper around numpy.mean.

    @type dataList: list or numpy array
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: mean average

    """
    return numpy.mean(dataList)
40
41
def weightedMean(dataList):
    """Calculates the weighted mean average of a two dimensional list (value, weight) of
    numbers.

    @type dataList: list
    @param dataList: input data, must be a two dimensional list in format [value, weight]
    @rtype: float
    @return: weighted mean average; 0 if dataList is empty

    @note: Returns None if the weights sum to zero.

    """
    weightedSum = 0
    weightSum = 0
    for item in dataList:
        weightedSum = weightedSum + float(item[0] * item[1])
        weightSum = weightSum + item[1]

    # Historical behaviour: an empty list has a mean of 0.
    if len(dataList) == 0:
        return 0

    # Weights that cancel (or are all zero) would otherwise divide by zero.
    if weightSum == 0:
        if REPORT_ERRORS:
            print("""ERROR: astStats.weightedMean() : weights sum to zero.""")
        return None

    return weightedSum / weightSum
62
63
def stdev(dataList):
    """Calculates the standard deviation of a list of numbers.

    This is the population standard deviation: it calls numpy.std with its default
    ddof=0, so the sum of squared deviations is divided by N, not N-1.

    @type dataList: list or numpy array
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: standard deviation

    """
    return numpy.std(dataList)
74
75
def rms(dataList):
    """Calculates the root mean square of a list of numbers.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: root mean square

    """
    squares = [item * item for item in dataList]

    # numpy.mean is what this module's mean() wraps; calling it directly keeps
    # this routine self-contained.
    return math.sqrt(numpy.mean(squares))
92
93
def weightedStdev(dataList):
    """Calculates the weighted (sample) standard deviation of a list of numbers.

    The value returned is sqrt( N/(N-1) * sum(((x-mean)/w)**2) / sum((1/w)**2) ), where
    mean is weightedMean(dataList), N is len(dataList), and the sums run only over
    entries whose second column w is greater than zero.

    NOTE(review): the second column is divided by here, i.e. it acts like an uncertainty
    rather than a multiplicative weight - confirm that this is intended.

    @type dataList: list
    @param dataList: input data, must be a two dimensional list in format [value, weight]
    @rtype: float
    @return: weighted standard deviation

    @note: Returns None if an error occurs.

    """
    # Check the length first so that short input is reported rather than
    # failing inside weightedMean().
    if len(dataList) < 2:
        if REPORT_ERRORS:
            print("""ERROR: astStats.weightedStdev() : dataList contains < 2 items.""")
        return None

    listMean = weightedMean(dataList)
    if listMean is None:
        return None

    residualSum = 0
    wSum = 0
    for item in dataList:
        if item[1] > 0.0:
            scaledResidual = float((item[0] - listMean) / item[1])
            inverseWeight = float(1.0 / item[1])
            residualSum = residualSum + scaledResidual * scaledResidual
            wSum = wSum + inverseWeight * inverseWeight

    # No entry had a positive second column: nothing to normalise by.
    if wSum == 0:
        if REPORT_ERRORS:
            print("""ERROR: astStats.weightedStdev() : divide by zero error.""")
        return None

    nFactor = float(len(dataList)) / float(len(dataList) - 1)
    return math.sqrt(nFactor * (residualSum / wSum))
122
123
134
135
def modeEstimate(dataList):
    """Returns an estimate of the mode of a set of values by mode=(3*median)-(2*mean).

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: estimate of mode average

    """
    return (3 * median(dataList)) - (2 * mean(dataList))
148
149
def MAD(dataList):
    """Calculates the Median Absolute Deviation of a list of numbers.

    This is the median of the absolute differences between each value and the median
    of the list.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: median absolute deviation

    """
    listMedian = median(dataList)
    diffModuli = [math.fabs(item - listMedian) for item in dataList]

    return median(diffModuli)
169
170
def biweightLocation(dataList, tuningConstant):
    """Calculates the biweight location estimator (like a robust average) of a list of
    numbers.

    Each value is scaled to u=(value-median)/(tuningConstant*MAD); only values with
    |u|<=1 contribute, each weighted by (1-u**2)**2.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @type tuningConstant: float
    @param tuningConstant: 6.0 is recommended.
    @rtype: float
    @return: biweight location

    @note: Returns None if an error occurs.

    """
    C = tuningConstant
    listMedian = median(dataList)
    listMAD = MAD(dataList)

    if listMAD == 0:
        if REPORT_ERRORS:
            print("""ERROR: astStats: biweightLocation() : MAD() returned 0.""")
        return None

    top = 0
    bottom = 0
    for item in dataList:
        u = (item - listMedian) / (C * listMAD)
        if math.fabs(u) <= 1.0:
            u2Term = 1.0 - (u * u)
            top = top + ((item - listMedian) * u2Term * u2Term)
            bottom = bottom + (u2Term * u2Term)

    # Happens when no value falls inside |u|<=1 (e.g. a very small tuning constant).
    if bottom == 0:
        if REPORT_ERRORS:
            print("""ERROR: astStats: biweightLocation() : divide by zero error.""")
        return None

    return listMedian + (top / bottom)
212
213
def biweightScale(dataList, tuningConstant):
    """Calculates the biweight scale estimator (like a robust standard deviation) of a list
    of numbers.

    Each value is scaled to u=(value-median)/(tuningConstant*MAD); only values with
    |u|<=1 contribute to the estimate.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @type tuningConstant: float
    @param tuningConstant: 9.0 is recommended.
    @rtype: float
    @return: biweight scale

    @note: Returns None if an error occurs.

    """
    C = tuningConstant
    listMedian = median(dataList)
    listMAD = MAD(dataList)

    # Test the denominator explicitly: NumPy scalars return inf/nan on division
    # by zero instead of raising ZeroDivisionError, so a try/except is not enough.
    if C * listMAD == 0:
        if REPORT_ERRORS:
            print("""ERROR: astStats.biweightScale() : divide by zero error.""")
        return None

    top = 0
    bottom = 0
    valCount = 0
    for item in dataList:
        u = (item - listMedian) / (C * listMAD)
        if math.fabs(u) <= 1.0:
            diffModulus = math.fabs(item - listMedian)
            u2Term = 1.0 - (u * u)
            u4Term = math.pow(u2Term, 4)
            top = top + ((diffModulus * diffModulus) * u4Term)
            bottom = bottom + (u2Term * (1.0 - (5.0 * (u * u))))
            valCount = valCount + 1

    top = math.sqrt(top)
    bottom = math.fabs(bottom)

    if bottom == 0:
        if REPORT_ERRORS:
            print("""ERROR: astStats.biweightScale() : divide by zero error.""")
        return None

    return math.pow(float(valCount), 0.5) * (top / bottom)
263
264
def biweightClipped(dataList, tuningConstant, sigmaCut):
    """Iteratively calculates biweight location and scale, using sigma clipping, for a list
    of values. The calculation is performed on the first column of a multi-dimensional
    list; other columns are ignored.

    At most 11 iterations are performed. Each iteration re-selects, from the full input,
    the rows lying strictly within sigmaCut biweight scales of the biweight location;
    iteration stops early once 5 or fewer values survive. At least 6 values are needed.

    @type dataList: list
    @param dataList: input data
    @type tuningConstant: float
    @param tuningConstant: 6.0 is recommended for location estimates, 9.0 is recommended for
    scale estimates
    @type sigmaCut: float
    @param sigmaCut: sigma clipping to apply
    @rtype: dictionary
    @return: estimate of biweight location, scale, and list of non-clipped data, in the format
    {'biweightLocation', 'biweightScale', 'dataList'}

    @note: Returns None if an error occurs.

    """
    def firstColumn(row):
        # Rows may be plain numbers or sequences whose first entry is the value.
        if isinstance(row, (list, tuple, numpy.ndarray)):
            return row[0]
        return row

    clippedValues = [firstColumn(row) for row in dataList]

    # With too few values the loop below would never run and there would be
    # no estimate to return.
    if len(clippedValues) <= 5:
        if REPORT_ERRORS:
            print("""ERROR: astStats : biweightClipped() : dataList contains < 6 items.""")
        return None

    iterations = 0
    while iterations < 11 and len(clippedValues) > 5:

        cbi = biweightLocation(clippedValues, tuningConstant)
        sbi = biweightScale(clippedValues, tuningConstant)

        if cbi is None or sbi is None:
            if REPORT_ERRORS:
                print("""ERROR: astStats : biweightClipped() :
                divide by zero error.""")
            return None

        lowerLimit = cbi - (sigmaCut * sbi)
        upperLimit = cbi + (sigmaCut * sbi)

        clippedValues = []
        clippedData = []
        for row in dataList:
            value = firstColumn(row)
            if value > lowerLimit and value < upperLimit:
                clippedValues.append(value)
                clippedData.append(row)

        iterations = iterations + 1

    return {'biweightLocation': cbi, 'biweightScale': sbi, 'dataList': clippedData}
328
329
358
359
def OLSFit(dataList):
    """Performs an ordinary least squares fit on a two dimensional list of numbers.
    Minimum number of data points is 3.

    @type dataList: list
    @param dataList: input data, must be a two dimensional list in format [x, y]
    @rtype: dictionary
    @return: slope and intercept on y-axis, with associated errors, in the format
    {'slope', 'intercept', 'slopeError', 'interceptError'}

    @note: Returns None if an error occurs (fewer than 3 points, or all x values equal).

    """
    n = float(len(dataList))
    if not n > 2:
        if REPORT_ERRORS:
            print("""ERROR: astStats.OLSFit() : dataList contains < 3 items.""")
        return None

    sumX = 0
    sumY = 0
    sumXY = 0
    sumXX = 0
    for item in dataList:
        sumX = sumX + item[0]
        sumY = sumY + item[1]
        sumXY = sumXY + (item[0] * item[1])
        sumXX = sumXX + (item[0] * item[0])

    # Zero when every x value is the same: the slope is then undefined.
    denominator = (n * sumXX) - (sumX * sumX)
    if denominator == 0:
        if REPORT_ERRORS:
            print("""ERROR: astStats.OLSFit() : divide by zero error.""")
        return None

    m = ((n * sumXY) - (sumX * sumY)) / denominator
    c = ((sumXX * sumY) - (sumX * sumXY)) / denominator

    sumRes = 0
    for item in dataList:
        residual = item[1] - (m * item[0]) - c
        sumRes = sumRes + (residual * residual)

    sigma = math.sqrt((1.0 / (n - 2)) * sumRes)

    # A square root of a (rounding-induced) negative number yields an
    # undefined error estimate rather than a failed fit.
    try:
        mSigma = (sigma * math.sqrt(n)) / math.sqrt(denominator)
    except (ValueError, ZeroDivisionError):
        mSigma = numpy.nan
    try:
        cSigma = (sigma * math.sqrt(sumXX)) / math.sqrt(denominator)
    except (ValueError, ZeroDivisionError):
        cSigma = numpy.nan

    return {'slope': m,
            'intercept': c,
            'slopeError': mSigma,
            'interceptError': cSigma}
413
414
def clippedMeanStdev(dataList, sigmaCut=3.0, maxIterations=10.0):
    """Calculates the clipped mean and stdev of a list of numbers.

    On each iteration the mean and (population) standard deviation of the surviving
    values are measured, and values further than sigmaCut standard deviations from that
    mean are discarded. Iteration stops when nothing more is clipped, when maxIterations
    is reached, or when 4 or fewer values remain. The mean and stdev returned are those
    measured at the start of the last iteration performed (or of the whole input if no
    iteration could be performed).

    @type dataList: list
    @param dataList: input data, one dimensional list of numbers
    @type sigmaCut: float
    @param sigmaCut: clipping in Gaussian sigma to apply
    @type maxIterations: int
    @param maxIterations: maximum number of iterations
    @rtype: dictionary
    @return: format {'clippedMean', 'clippedStdev', 'numPoints'}

    """
    values = numpy.array(list(dataList))

    # Measured up front so that short input (for which the loop never runs)
    # still yields a result.
    m = values.mean()
    s = values.std()

    iterations = 0
    while iterations < maxIterations and len(values) > 4:

        m = values.mean()
        s = values.std()

        # Clip symmetrically about the mean, so low outliers are removed too.
        keep = numpy.abs(values - m) <= sigmaCut * s
        if keep.all():
            # Converged: further iterations would give the same m and s.
            break
        values = values[keep]

        iterations = iterations + 1

    return {'clippedMean': m, 'clippedStdev': s, 'numPoints': values.shape[0]}
445
446
def clippedWeightedLSFit(dataList, sigmaCut):
    """Performs a weighted least squares fit on a list of numbers with sigma clipping. Minimum number of data
    points is 5.

    At most 11 iterations are performed. On each one, weightedLSFit (in "errors" mode) is
    run on the currently selected rows, and then the selection is rebuilt from the full
    input as those rows whose residual from the fit, divided by the row's y error, is
    less than sigmaCut.

    @type dataList: list
    @param dataList: input data, must be a three dimensional list in format [x, y, y error]
    @type sigmaCut: float
    @param sigmaCut: sigma clipping to apply, in units of the y error of each point
    @rtype: dictionary
    @return: slope and intercept on y-axis, with associated errors, plus the number of points
    that survived clipping, in the format
    {'slope', 'intercept', 'slopeError', 'interceptError', 'numDataPoints'}

    @note: Returns None if an error occurs.

    """
    clippedValues = list(dataList)

    # With too few rows the loop below would never run and there would be no
    # fit to return.
    if len(clippedValues) <= 4:
        if REPORT_ERRORS:
            print("""ERROR: astStats : clippedWeightedLSFit() : dataList contains < 5 items.""")
        return None

    iterations = 0
    while iterations < 11 and len(clippedValues) > 4:

        fitResults = weightedLSFit(clippedValues, "errors")

        # weightedLSFit signals failure by returning None.
        if fitResults is None or fitResults['slope'] is None:
            if REPORT_ERRORS:
                print("""ERROR: astStats : clippedWeightedLSFit() :
                divide by zero error.""")
            return None

        clippedValues = []
        for row in dataList:
            fit = fitResults['slope'] * row[0] + fitResults['intercept']
            res = row[1] - fit
            if abs(res) / row[2] < sigmaCut:
                clippedValues.append(row)

        iterations = iterations + 1

    fitResults['numDataPoints'] = len(clippedValues)

    return fitResults
495
496
def weightedLSFit(dataList, weightType):
    """Performs a weighted least squares fit on a three dimensional list of numbers [x, y, y error].

    @type dataList: list
    @param dataList: input data, must be a three dimensional list in format [x, y, y error]
    @type weightType: string
    @param weightType: if "errors", weights are calculated assuming the input data is in the
    format [x, y, error on y]; if "weights", the weights are assumed to be already calculated and
    stored in a fourth column [x, y, error on y, weight] (as used by e.g. L{astStats.biweightLSFit})
    @rtype: dictionary
    @return: slope and intercept on y-axis, with associated errors, in the format
    {'slope', 'intercept', 'slopeError', 'interceptError'}

    @note: Returns None if an error occurs (too few points in "weights" mode, a zero error
    in "errors" mode, or a degenerate fit). Raises ValueError if weightType is not
    recognised.

    """
    if weightType == "weights":
        n = float(len(dataList))
        if not n > 4:
            if REPORT_ERRORS:
                print("""ERROR: astStats.weightedLSFit() :
                dataList contains < 5 items.""")
            return None

        sumW = 0
        sumWX = 0
        sumWY = 0
        sumWXY = 0
        sumWXX = 0
        for item in dataList:
            W = item[3]
            sumWX = sumWX + (W * item[0])
            sumWY = sumWY + (W * item[1])
            sumWXY = sumWXY + (W * item[0] * item[1])
            sumWXX = sumWXX + (W * item[0] * item[0])
            sumW = sumW + W

        # Tested explicitly because NumPy scalars do not raise ZeroDivisionError.
        denominator = (sumW * sumWXX) - (sumWX * sumWX)
        if denominator == 0:
            if REPORT_ERRORS:
                print("ERROR: astStats.weightedLSFit() : divide by zero error.")
            return None

        m = ((sumW * sumWXY) - (sumWX * sumWY)) / denominator
        c = ((sumWXX * sumWY) - (sumWX * sumWXY)) / denominator

        # The scatter about the fit is measured from unweighted residuals.
        sumRes = 0
        for item in dataList:
            residual = item[1] - (m * item[0]) - c
            sumRes = sumRes + (residual * residual)

        sigma = math.sqrt((1.0 / (n - 2)) * sumRes)

        errorDenominator = (n * sumWXX) - (sumWX * sumWX)
        if not errorDenominator > 0.0:
            if REPORT_ERRORS:
                print("""ERROR: astStats.weightedLSFit()
                : divide by zero error.""")
            return None

        mSigma = (sigma * math.sqrt(n)) / math.sqrt(errorDenominator)
        cSigma = (sigma * math.sqrt(sumWXX)) / math.sqrt(errorDenominator)

    elif weightType == "errors":
        sumX = 0
        sumY = 0
        sumXY = 0
        sumXX = 0
        sumSigma = 0
        for item in dataList:
            variance = item[2] * item[2]
            # A point with zero error would carry infinite weight.
            if variance == 0:
                if REPORT_ERRORS:
                    print("ERROR: astStats.weightedLSFit() : divide by zero error.")
                return None
            sumX = sumX + (item[0] / variance)
            sumY = sumY + (item[1] / variance)
            sumXY = sumXY + ((item[0] * item[1]) / variance)
            sumXX = sumXX + ((item[0] * item[0]) / variance)
            sumSigma = sumSigma + (1.0 / variance)

        # delta is zero for empty input or when all x values are equal.
        delta = (sumSigma * sumXX) - (sumX * sumX)
        if not delta > 0:
            if REPORT_ERRORS:
                print("ERROR: astStats.weightedLSFit() : divide by zero error.")
            return None

        m = ((sumSigma * sumXY) - (sumX * sumY)) / delta
        c = ((sumXX * sumY) - (sumX * sumXY)) / delta
        mSigma = math.sqrt(sumSigma / delta)
        cSigma = math.sqrt(sumXX / delta)

    else:
        raise ValueError("astStats.weightedLSFit() : weightType must be \"errors\" or "
                         "\"weights\", not %r" % (weightType,))

    return {'slope': m,
            'intercept': c,
            'slopeError': mSigma,
            'interceptError': cSigma}
600
601
def biweightLSFit(dataList, tuningConstant, sigmaCut=None):
    """Performs a weighted least squares fit, where the weights used are the biweight
    transforms of the residuals to the previous best fit .i.e. the procedure is iterative,
    and converges very quickly (iterations is set to 10 by default). More than 5 data
    points are needed for any re-weighting to take place; with fewer, the ordinary least
    squares fit is returned.

    This seems to give slightly different results to the equivalent R routine, so use at your
    own risk!

    @type dataList: list
    @param dataList: input data, must be a three dimensional list in format [x, y, y error]
    @type tuningConstant: float
    @param tuningConstant: 6.0 is recommended for location estimates, 9.0 is recommended for
    scale estimates
    @type sigmaCut: float
    @param sigmaCut: sigma clipping to apply (set to None if not required)
    @rtype: dictionary
    @return: slope and intercept on y-axis, with associated errors, in the format
    {'slope', 'intercept', 'slopeError', 'interceptError'}

    @note: Returns None if an error occurs.

    """
    dataCopy = list(dataList)

    # Start from an unweighted fit; each pass below re-weights by the residuals.
    results = OLSFit(dataCopy)
    for k in range(10):

        # Either fitting routine signals failure by returning None.
        if results is None:
            return None

        m = results['slope']
        c = results['intercept']
        res = [(m * item[0] + c) - item[1] for item in dataCopy]

        # Too few points to re-weight: later passes could not change anything.
        if len(res) <= 5:
            break

        if sigmaCut is not None:
            # The clipping threshold is the stdev of the absolute residuals.
            sigma = stdev([abs(item) for item in res])
            keptData = []
            keptRes = []
            remaining = len(dataCopy)
            for row, residual in zip(dataCopy, res):
                # Never clip below two remaining points.
                if abs(residual) > (sigmaCut * sigma) and remaining > 2:
                    remaining = remaining - 1
                else:
                    keptData.append(row)
                    keptRes.append(residual)
            dataCopy = keptData
            res = keptRes

        # assumes biweightTransform returns one [.., weight] pair per residual
        # (only index 1 is used here) - TODO confirm against its definition
        weights = biweightTransform(res, tuningConstant)

        wData = []
        for i, row in enumerate(dataCopy):
            wData.append([row[0], row[1], row[2], weights[i][1]])

        results = weightedLSFit(wData, "weights")

    return results
674
675
def cumulativeBinner(data, binMin, binMax, binTotal):
    """Bins the input data cumulatively.

    The frequency recorded for each bin is the fraction of all items in data whose value
    is greater than the lower edge of that bin.

    @param data: input data, must be a one dimensional list
    @type binMin: float
    @param binMin: minimum value from which to bin data
    @type binMax: float
    @param binMax: maximum value from which to bin data
    @type binTotal: int
    @param binTotal: number of bins
    @rtype: list
    @return: binned data, in format [bin centre, frequency]

    """
    binStep = float(binMax - binMin) / binTotal
    totalItems = len(data)

    bins = []
    for i in range(binTotal):
        lowerEdge = binMin + (i * binStep)
        fraction = 0
        for item in data:
            if item > lowerEdge:
                fraction = fraction + 1.0 / totalItems
        bins.append(fraction)

    coords = []
    for i in range(binTotal):
        coords.append([binMin + (float(i + 0.5) * binStep), bins[i]])

    return coords
706
707
def binner(data, binMin, binMax, binTotal):
    """Bins the input data.

    Each bin is open at its lower edge and closed at its upper edge, so a value exactly
    equal to binMin is not counted in any bin.

    @param data: input data, must be a one dimensional list
    @type binMin: float
    @param binMin: minimum value from which to bin data
    @type binMax: float
    @param binMax: maximum value from which to bin data
    @type binTotal: int
    @param binTotal: number of bins
    @rtype: list
    @return: binned data, in format [bin centre, frequency]

    """
    binStep = float(binMax - binMin) / binTotal

    bins = []
    for i in range(binTotal):
        lowerEdge = binMin + (i * binStep)
        upperEdge = binMin + ((i + 1) * binStep)
        count = 0
        for item in data:
            if item > lowerEdge and item <= upperEdge:
                count = count + 1
        bins.append(count)

    coords = []
    for i in range(binTotal):
        coords.append([binMin + (float(i + 0.5) * binStep), bins[i]])

    return coords
738
739
def weightedBinner(data, weights, binMin, binMax, binTotal):
    """Bins the input data, recorded frequency is sum of weights in bin.

    Each bin is open at its lower edge and closed at its upper edge, so a value exactly
    equal to binMin is not counted in any bin.

    @param data: input data, must be a one dimensional list
    @param weights: weight of each item in data, paired with data item by item
    @type binMin: float
    @param binMin: minimum value from which to bin data
    @type binMax: float
    @param binMax: maximum value from which to bin data
    @type binTotal: int
    @param binTotal: number of bins
    @rtype: list
    @return: binned data, in format [bin centre, frequency]

    """
    binStep = float(binMax - binMin) / binTotal

    bins = []
    for i in range(binTotal):
        lowerEdge = binMin + (i * binStep)
        upperEdge = binMin + ((i + 1) * binStep)
        weightSum = 0.0
        for item, weight in zip(data, weights):
            if item > lowerEdge and item <= upperEdge:
                weightSum = weightSum + weight
        bins.append(weightSum)

    coords = []
    for i in range(binTotal):
        coords.append([binMin + (float(i + 0.5) * binStep), bins[i]])

    return coords
770
771
772