# HG changeset patch # User Jordi GutiƩrrez Hermoso # Date 1421186148 18000 # Node ID 74d0d08dbc958baede29813c0a7d51e469f901f2 # Parent bb3a5c43fc658cc65a38921f7bf96d0db7ce636e Add Python implementation diff --git a/medcouple.py b/medcouple.py new file mode 100644 --- /dev/null +++ b/medcouple.py @@ -0,0 +1,205 @@ +import random +import numpy as np + +from itertools import tee + +def partsort(L, n): + """This is a partial sort of the weighted list L = [(a,w) for a in A, + w in W], so that the element at position n is in the correct spot. + Also known as quickselect. The elements of W are ignored for the + purposes of this partial sort. + """ + + beg = 0 + end = len(L) + + while True: + pivot = random.randint(beg,end-1) + pivotval = L[pivot][0] + L[pivot], L[end-1] = L[end-1], L[pivot] + + idx = beg + for i in xrange(beg, end): + if L[i][0] < pivotval: + L[idx], L[i] = L[i], L[idx] + idx += 1 + + L[idx], L[end-1] = L[end-1], L[idx] + + if idx == n: + return L[idx] + elif idx < n: + beg = idx+1 + else: + end = idx + +def wmedian(A, W): + """This computes the weighted median of array A with corresponding + weights W. + """ + + AW = zip(A,W) + n = len(AW) + + wtot = sum(W) + + beg = 0 + end = n + + while True: + mid = (beg + end)//2 + + partsort(AW, mid) + trial = AW[mid][0] + + wleft = wright = 0 + for (a,w) in AW: + if a < trial: + wleft += w + else: + # This also includes a == trial, i.e. the "middle" + # weight. + wright += w + + if 2*wleft > wtot: + end = mid + elif 2*wright <= wtot: + beg = mid + else: + return trial + +def medcouple_1d(X, eps1 = 2**-52, eps2 = 2**-1022 ): + """Calculates the medcouple robust measure of skewness. + + Parameters + ---------- + y : array-like, 1-d + + Returns + ------- + mc : float + The medcouple statistic + + .. [1] G. Brys, M. Hubert, and A. Struyf "A Robust Measure of + Skewness." Journal of Computational and Graphical Statistics, Vol. + 13, No. 4 (Dec., 2004), pp. 996- 1017 + + .. [2] D. B. Johnson and T. Mizoguchi "Selecting the Kth Element + in $X + Y$ and $X_1 + X_2 + \cdots X_m$". SIAM Journal of + Computing, 7, 147-53. + + """ + # FIXME: Figure out what to do about NaNs. + + n = len(X) + n2 = (n-1)//2 + + if n < 3: + return 0 + + Z = sorted(X, reverse=True) + + if n % 2 == 1: + Zmed = Z[n2 + 1] + else: + Zmed = (Z[n2] + Z[n2 + 1])/2 + + #Check if the median is at the edges up to relative epsilon + if abs(Z[0] - Zmed) < eps1*(eps1 + abs(Zmed)): + return -1.0 + if abs(Z[-1] - Zmed) < eps1*(eps1 + abs(Zmed)): + return 1.0 + + # Centre Z wrt median, so that median(Z) = 0. + Z = [z - Zmed for z in Z] + + # Scale inside [-0.5, 0.5], for greater numerical stability. + Zden = -2*max(-Z[0], Z[-1]) + Z = [z/zden for z in Zden] + Zmed /= Zden + + Zeps = eps1*(eps1 + abs(Zmed)) + + # These overlap on the entries that are tied with the median + Zplus = [z for z in Z if z >= -Zeps] + Zminus = [z for z in Z if Zeps >= z] + + n_plus = len(Zplus) + n_minus = len(Zminus) + + def h_kern(i, j): + """Kernel function h for the medcouple, closing over the values of + Zplus and Zminus just defined above. + + In case a and be are within epsilon of the median, the kernel + is the signum of their position. + + """ + a = Zplus[i] + b = Zminus[j] + + if abs(a - b) <= 2*eps2: + return signum(n_plus - 1 - i - j) + + return (a+b)/(a-b) + + # Init left and right borders + L = [0]*n_plus + R = [n_plus-1]*n_plus + + Ltot = 0 + Rtot = n_plus*n_minus + + mid_idx = Rtot//2 + + # kth pair algorithm (Johnson & Mizoguchi) + while Rtot - Ltot > n_plus: + + # First, compute the median inside the given bounds + I1 = (i for i in xrange(0, n_plus) if L[i] <= R[i]) + I2 = tee(I1) # Be stingy, reuse same generator + A = [h_kern(i, (L[i] + R[i])//2) for i in I1] + W = [R[i] - L[i] for i in I2] + Am = wmedian(A,W) + + # Compute new left and right boundaries, based on the weighted + # median + P = Q = [] + + j = -1 + for i in xrange(n_plus - 1, -1, -1): + while j < n_plus - 1 and h_kern(i, j) > Am: + j += 1 + P.append(j) + P.reverse() + + j = n_plus + for i in xrange(0, n_plus): + while j > -1 and h_kern(i, j) < Am: + j -= 1 + Q.append(j) + + # Check on which side of those bounds the desired median of + # the whole matrix may be. + sumP = sum(P) + sumQ = sum(Q) + if mid_idx <= sumP: + R = P + Rtot = sumP + elif mid_idx > sumQ: + L = Q + Ltot = sumQ + else: + return Am + + # Uh, implement this part... + import ipdb; ipdb.set_trace() + + +def signum(x): + if x > 0: + return 1 + if x < 0: + return -1 + else: + return 0