Mercurial > hg > medcouple
view pymedcouple @ 9:e6bcaf38edaa
Fix citation
author | Jordi Gutiérrez Hermoso <jordigh@octave.org> |
---|---|
date | Thu, 15 Jan 2015 16:26:16 -0500 |
parents | b3e878bb793d |
children | 3d145bcc8694 |
line wrap: on
line source
#!/usr/bin/python import random import numpy as np from itertools import tee, izip def partsort(L, n): """This is a partial sort of the weighted list L = [(a,w) for a in A, w in W], so that the element at position n is in the correct spot. Also known as quickselect. The elements of W are ignored for the purposes of this partial sort. """ beg = 0 end = len(L) while True: pivot = random.randint(beg,end-1) pivotval = L[pivot][0] L[pivot], L[end-1] = L[end-1], L[pivot] idx = beg for i in xrange(beg, end): if L[i][0] < pivotval: L[idx], L[i] = L[i], L[idx] idx += 1 L[idx], L[end-1] = L[end-1], L[idx] if idx == n: return L[idx] elif idx < n: beg = idx+1 else: end = idx def wmedian(A, W): """This computes the weighted median of array A with corresponding weights W. """ AW = zip(A,W) n = len(AW) wtot = sum(W) beg = 0 end = n-1 while True: mid = (beg + end)//2 partsort(AW, mid) trial = AW[mid][0] wleft = wright = 0 for (a,w) in AW: if a < trial: wleft += w else: # This also includes a == trial, i.e. the "middle" # weight. wright += w if 2*wleft > wtot: end = mid elif 2*wright <= wtot: beg = mid else: return trial def medcouple_1d(X, eps1 = 2**-52, eps2 = 2**-1022, debug=False ): """Calculates the medcouple robust measure of skewness. Parameters ---------- y : array-like, 1-d Returns ------- mc : float The medcouple statistic .. [1] G. Brys, M. Hubert, and A. Struyf "A Robust Measure of Skewness." Journal of Computational and Graphical Statistics, Vol. 13, No. 4 (Dec., 2004), pp. 996- 1017 .. [2] D. B. Johnson and T. Mizoguchi "Selecting the Kth Element in $X + Y$ and $X_1 + X_2 + \cdots X_m$". SIAM Journal of Computing, Vol. 7, No. 2 (May 1978), pp. 147-53. """ # FIXME: Figure out what to do about NaNs. n = len(X) n2 = (n-1)//2 if n < 3: return 0 Z = sorted(X, reverse=True) if n % 2 == 1: Zmed = Z[n2] else: Zmed = (Z[n2] + Z[n2+1])/2 #Check if the median is at the edges up to relative epsilon if abs(Z[0] - Zmed) < eps1*(eps1 + abs(Zmed)): return -1.0 if abs(Z[-1] - Zmed) < eps1*(eps1 + abs(Zmed)): return 1.0 # Centre Z wrt median, so that median(Z) = 0. Z = [z - Zmed for z in Z] # Scale inside [-0.5, 0.5], for greater numerical stability. Zden = 2*max(Z[0], -Z[-1]) Z = [z/Zden for z in Z] Zmed /= Zden Zeps = eps1*(eps1 + abs(Zmed)) # These overlap on the entries that are tied with the median Zplus = [z for z in Z if z >= -Zeps] Zminus = [z for z in Z if Zeps >= z] n_plus = len(Zplus) n_minus = len(Zminus) def h_kern(i, j, debug=False): """Kernel function h for the medcouple, closing over the values of Zplus and Zminus just defined above. In case a and be are within epsilon of the median, the kernel is the signum of their position. """ a = Zplus[i] b = Zminus[j] if abs(a - b) <= 2*eps2: h = signum(n_plus - 1 - i - j) else: h = (a+b)/(a-b) return h # Init left and right borders L = [0]*n_plus R = [n_minus - 1]*n_plus Ltot = 0 Rtot = n_minus*n_plus mid_idx = (Rtot-1)//2 # kth pair algorithm (Johnson & Mizoguchi) while Rtot - Ltot > n_plus: # First, compute the median inside the given bounds # (Be stingy, reuse same generator) [I1, I2] = tee(i for i in xrange(0, n_plus) if L[i] <= R[i]) A = [h_kern(i, (L[i] + R[i])//2) for i in I1] W = [R[i] - L[i] + 1 for i in I2] Am = wmedian(A,W) Am_eps = eps1*(eps1 + abs(Am)) # Compute new left and right boundaries, based on the weighted # median P = [] Q = [] j = 0 for i in xrange(n_plus - 1, -1, -1): while j < n_minus - 1 and h_kern(i, j) - Am > Am_eps: j += 1 P.append(j-1) P.reverse() j = n_minus - 1 for i in xrange(0, n_plus): while j >= 0 and h_kern(i, j) - Am < -Am_eps: j -= 1 Q.append(j+1) # Check on which side of those bounds the desired median of # the whole matrix may be. sumP = sum(P) + len(P) sumQ = sum(Q) if mid_idx <= sumP - 1: R = P Rtot = sumP else: if mid_idx > sumQ - 1: L = Q Ltot = sumQ else: return Am # Didn't find the median, but now we have a very small search # space to find it in, just between the left and right boundaries. # This space is of size Rtot - Ltot which is <= n_plus A = [] for (i, (l, r)) in enumerate(izip(L, R)): for j in xrange(l, r + 1): A.append(h_kern(i, j, debug=debug)) A.sort() A.reverse() Am = A[mid_idx - Ltot] return Am def signum(x): if x > 0: return 1 if x < 0: return -1 else: return 0 def main(): import sys fname = sys.argv[1] with open(fname) as f: data = [float(x) for x in f.readlines() if x.strip() != ""] print "%.16g" % medcouple_1d(data) if __name__ == "__main__": main()