Mercurial > hg > medcouple
view pymedcouple @ 22:f5b4a2ab6204
medcouple_1d: modify the definition of medc_idx to match R's definition
This uses the right (not left) middle in case of an even number of
things and the middle one in case of an odd number.
author | Jordi Gutiérrez Hermoso <jordigh@octave.org> |
---|---|
date | Sat, 17 Jan 2015 19:56:07 -0500 |
parents | d63e763f8ac1 |
children | 29a178b23219 |
line wrap: on
line source
#!/usr/bin/python import random import numpy as np from itertools import tee, izip def partsort(L, n): """This is a partial sort of the weighted list L = [(a,w) for a in A, w in W], so that the element at position n is in the correct spot. Also known as quickselect. The elements of W are ignored for the purposes of this partial sort. """ beg = 0 end = len(L) while True: pivot = random.randint(beg,end-1) pivotval = L[pivot][0] L[pivot], L[end-1] = L[end-1], L[pivot] idx = beg for i in xrange(beg, end): if L[i][0] < pivotval: L[idx], L[i] = L[i], L[idx] idx += 1 L[idx], L[end-1] = L[end-1], L[idx] if idx == n: return L[idx] elif idx < n: beg = idx+1 else: end = idx def wmedian(A, W): """This computes the weighted median of array A with corresponding weights W. """ AW = zip(A,W) n = len(AW) wtot = sum(W) beg = 0 end = n-1 while True: mid = (beg + end)//2 partsort(AW, mid) trial = AW[mid][0] wleft = wright = 0 for (a,w) in AW: if a < trial: wleft += w else: # This also includes a == trial, i.e. the "middle" # weight. wright += w if 2*wleft > wtot: end = mid elif 2*wright < wtot: beg = mid else: return trial def medcouple_1d(X, eps1 = 2**-52, eps2 = 2**-1022): """Calculates the medcouple robust measure of skewness. Parameters ---------- y : array-like, 1-d Returns ------- mc : float The medcouple statistic .. [1] G. Brys, M. Hubert, and A. Struyf "A Robust Measure of Skewness." Journal of Computational and Graphical Statistics, Vol. 13, No. 4 (Dec., 2004), pp. 996- 1017 .. [2] D. B. Johnson and T. Mizoguchi "Selecting the Kth Element in $X + Y$ and $X_1 + X_2 + \cdots X_m$". SIAM Journal of Computing, Vol. 7, No. 2 (May 1978), pp. 147-53. """ # FIXME: Figure out what to do about NaNs. n = len(X) n2 = (n-1)//2 if n < 3: return 0 Z = sorted(X, reverse=True) if n % 2 == 1: Zmed = Z[n2] else: Zmed = (Z[n2] + Z[n2+1])/2 #Check if the median is at the edges up to relative epsilon if abs(Z[0] - Zmed) < eps1*(eps1 + abs(Zmed)): return -1.0 if abs(Z[-1] - Zmed) < eps1*(eps1 + abs(Zmed)): return 1.0 # Centre Z wrt median, so that median(Z) = 0. Z = [z - Zmed for z in Z] # Scale inside [-0.5, 0.5], for greater numerical stability. Zden = 2*max(Z[0], -Z[-1]) Z = [z/Zden for z in Z] Zmed /= Zden Zeps = eps1*(eps1 + abs(Zmed)) # These overlap on the entries that are tied with the median Zplus = [z for z in Z if z >= -Zeps] Zminus = [z for z in Z if Zeps >= z] n_plus = len(Zplus) n_minus = len(Zminus) def h_kern(i, j): """Kernel function h for the medcouple, closing over the values of Zplus and Zminus just defined above. In case a and be are within epsilon of the median, the kernel is the signum of their position. """ a = Zplus[i] b = Zminus[j] if abs(a - b) <= 2*eps2: h = signum(n_plus - 1 - i - j) else: h = (a+b)/(a-b) return h # Init left and right borders L = [0]*n_plus R = [n_minus - 1]*n_plus Ltot = 0 Rtot = n_minus*n_plus medc_idx = Rtot//2 # kth pair algorithm (Johnson & Mizoguchi) while Rtot - Ltot > n_plus: # First, compute the median inside the given bounds # (Be stingy, reuse same generator) [I1, I2] = tee(i for i in xrange(0, n_plus) if L[i] <= R[i]) A = [h_kern(i, (L[i] + R[i])//2) for i in I1] W = [R[i] - L[i] + 1 for i in I2] Am = wmedian(A,W) Am_eps = eps1*(eps1 + abs(Am)) # Compute new left and right boundaries, based on the weighted # median P = [] Q = [] j = 0 for i in xrange(n_plus - 1, -1, -1): while j < n_minus and h_kern(i, j) - Am > Am_eps: j += 1 P.append(j - 1) P.reverse() j = n_minus - 1 for i in xrange(0, n_plus): while j >= 0 and h_kern(i, j) - Am < -Am_eps: j -= 1 Q.append(j + 1) # Check on which side of those bounds the desired median of # the whole matrix may be. sumP = sum(P) + len(P) sumQ = sum(Q) if medc_idx <= sumP - 1: R = P Rtot = sumP else: if medc_idx > sumQ - 1: L = Q Ltot = sumQ else: return Am # Didn't find the median, but now we have a very small search # space to find it in, just between the left and right boundaries. # This space is of size Rtot - Ltot which is <= n_plus A = [] for (i, (l, r)) in enumerate(izip(L, R)): for j in xrange(l, r + 1): A.append(h_kern(i, j)) A.sort() A.reverse() Am = A[medc_idx - Ltot] return Am def signum(x): if x > 0: return 1 if x < 0: return -1 else: return 0 def main(): import sys fname = sys.argv[1] with open(fname) as f: data = [float(x) for x in f.readlines() if x.strip() != ""] print "%.16g" % medcouple_1d(data) if __name__ == "__main__": main()