Mercurial > hg > medcouple
changeset 65:364a0c9df823
show_mc.py: copy from medcouple.py
author | Jordi Gutiérrez Hermoso <jordigh@octave.org> |
---|---|
date | Wed, 18 May 2016 08:49:01 -0400 |
parents | d5ec0659b26b |
children | 916349600c4b |
files | talk/code/show_mc.py |
diffstat | 1 files changed, 232 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
new file mode 100755 --- /dev/null +++ b/talk/code/show_mc.py @@ -0,0 +1,232 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# medcouple.py --- + +# Copyright © 2015 Jordi Gutiérrez Hermoso <jordigh@octave.org> + +# Author: Jordi Gutiérrez Hermoso + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 3 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + + +import random + +from itertools import tee, izip + +def wmedian(A, W): + """This computes the weighted median of array A with corresponding + weights W. + """ + + AW = zip(A, W) + n = len(AW) + + wtot = sum(W) + + beg = 0 + end = n - 1 + + while True: + mid = (beg + end)//2 + + AW = sorted(AW, key = lambda x: x[0]) # A partial sort would suffice here + + trial = AW[mid][0] + + wleft = wright = 0 + for (a, w) in AW: + if a < trial: + wleft += w + else: + # This also includes a == trial, i.e. the "middle" + # weight. + wright += w + + if 2*wleft > wtot: + end = mid + elif 2*wright < wtot: + beg = mid + else: + return trial + + +def medcouple_1d(X, eps1 = 2**-52, eps2 = 2**-1022): + """Calculates the medcouple robust measure of skewness. + + Parameters + ---------- + y : array-like, 1-d + + Returns + ------- + mc : float + The medcouple statistic + + .. [1] G. Brys, M. Hubert, and A. Struyf "A Robust Measure of + Skewness." Journal of Computational and Graphical Statistics, Vol. + 13, No. 4 (Dec., 2004), pp. 996- 1017 + + .. [2] D. B. Johnson and T. Mizoguchi "Selecting the Kth Element + in $X + Y$ and $X_1 + X_2 + \cdots X_m$". SIAM Journal of + Computing, Vol. 7, No. 2 (May 1978), pp. 147-53. + + """ + # FIXME: Figure out what to do about NaNs. + + n = len(X) + n2 = (n - 1)//2 + + if n < 3: + return 0 + + Z = sorted([float(x) for x in X], reverse=True) + + if n % 2 == 1: + Zmed = Z[n2] + else: + Zmed = (Z[n2] + Z[n2 + 1])/2 + + #Check if the median is at the edges up to relative epsilon + if abs(Z[0] - Zmed) < eps1*(eps1 + abs(Zmed)): + return -1.0 + if abs(Z[-1] - Zmed) < eps1*(eps1 + abs(Zmed)): + return 1.0 + + # Centre Z wrt median, so that median(Z) = 0. + Z = [z - Zmed for z in Z] + + # Scale inside [-0.5, 0.5], for greater numerical stability. + Zden = 2*max(Z[0], -Z[-1]) + Z = [z/Zden for z in Z] + Zmed /= Zden + + Zeps = eps1*(eps1 + abs(Zmed)) + + # These overlap on the entries that are tied with the median + Zplus = [z for z in Z if z >= -Zeps] + Zminus = [z for z in Z if Zeps >= z] + + n_plus = len(Zplus) + n_minus = len(Zminus) + + + def h_kern(i, j): + """Kernel function h for the medcouple, closing over the values of + Zplus and Zminus just defined above. + + In case a and be are within epsilon of the median, the kernel + is the signum of their position. + + """ + a = Zplus[i] + b = Zminus[j] + + if abs(a - b) <= 2*eps2: + h = signum(n_plus - 1 - i - j) + else: + h = (a + b)/(a - b) + + return h + + # Init left and right borders + L = [0]*n_plus + R = [n_minus - 1]*n_plus + + Ltot = 0 + Rtot = n_minus*n_plus + medc_idx = Rtot//2 + + # kth pair algorithm (Johnson & Mizoguchi) + while Rtot - Ltot > n_plus: + + # First, compute the median inside the given bounds + # (Be stingy, reuse same generator) + [I1, I2] = tee(i for i in xrange(0, n_plus) if L[i] <= R[i]) + + A = [h_kern(i, (L[i] + R[i])//2) for i in I1] + W = [R[i] - L[i] + 1 for i in I2] + Am = wmedian(A, W) + + Am_eps = eps1*(eps1 + abs(Am)) + + # Compute new left and right boundaries, based on the weighted + # median + P = [] + Q = [] + + j = 0 + for i in xrange(n_plus - 1, -1, -1): + while j < n_minus and h_kern(i, j) - Am > Am_eps: + j += 1 + P.append(j - 1) + + P.reverse() + + j = n_minus - 1 + for i in xrange(0, n_plus): + while j >= 0 and h_kern(i, j) - Am < -Am_eps: + j -= 1 + Q.append(j + 1) + + # Check on which side of those bounds the desired median of + # the whole matrix may be. + sumP = sum(P) + len(P) + sumQ = sum(Q) + + if medc_idx <= sumP - 1: + R = P + Rtot = sumP + else: + if medc_idx > sumQ - 1: + L = Q + Ltot = sumQ + else: + return Am + + # Didn't find the median, but now we have a very small search + # space to find it in, just between the left and right boundaries. + # This space is of size Rtot - Ltot which is <= n_plus + A = [] + for (i, (l, r)) in enumerate(izip(L, R)): + for j in xrange(l, r + 1): + A.append(h_kern(i, j)) + + A.sort() # A partial sort would suffice here + A.reverse() + + Am = A[medc_idx - Ltot] + + return Am + + +def signum(x): + if x > 0: + return 1 + if x < 0: + return -1 + else: + return 0 + + +def main(): + import sys + fname = sys.argv[1] + with open(fname) as f: + data = [float(x) for x in f.readlines() if x.strip() != ""] + + print "%.16g" % medcouple_1d(data) + +if __name__ == "__main__": + main()