diff medcouple.py @ 4:74d0d08dbc95

Add Python implementation
author Jordi Gutiérrez Hermoso <jordigh@octave.org>
date Tue, 13 Jan 2015 16:55:48 -0500
parents
children
line wrap: on
line diff
new file mode 100644
--- /dev/null
+++ b/medcouple.py
@@ -0,0 +1,205 @@
+import random
+import numpy as np
+
+from itertools import tee
+
+def partsort(L, n):
+    """This is a partial sort of the weighted list L = [(a,w) for a in A,
+    w in W], so that the element at position n is in the correct spot.
+    Also known as quickselect. The elements of W are ignored for the
+    purposes of this partial sort.
+    """
+    
+    beg = 0
+    end = len(L)
+    
+    while True:
+        pivot = random.randint(beg,end-1)
+        pivotval = L[pivot][0]
+        L[pivot], L[end-1] = L[end-1], L[pivot]
+        
+        idx = beg
+        for i in xrange(beg, end):
+            if L[i][0] < pivotval:
+                L[idx], L[i] = L[i], L[idx]
+                idx += 1
+
+        L[idx], L[end-1] = L[end-1], L[idx]
+
+        if idx == n:
+            return L[idx]
+        elif idx < n:
+            beg = idx+1
+        else:
+            end = idx
+
+def wmedian(A, W):
+    """This computes the weighted median of array A with corresponding
+    weights W.
+    """
+    
+    AW = zip(A,W)
+    n = len(AW)
+
+    wtot = sum(W)
+
+    beg = 0
+    end = n
+
+    while True:
+        mid = (beg + end)//2
+        
+        partsort(AW, mid)
+        trial = AW[mid][0]
+
+        wleft = wright = 0
+        for (a,w) in AW:
+            if a < trial:
+                wleft += w
+            else:
+                # This also includes a == trial, i.e. the "middle"
+                # weight.
+                wright += w
+
+        if 2*wleft > wtot:
+            end = mid
+        elif 2*wright <= wtot:
+            beg = mid
+        else:
+            return trial
+
+def medcouple_1d(X, eps1 = 2**-52, eps2 = 2**-1022 ):
+    """Calculates the medcouple robust measure of skewness.
+
+    Parameters
+    ----------
+    y : array-like, 1-d
+
+    Returns
+    -------
+    mc : float
+        The medcouple statistic
+   
+    .. [1] G. Brys, M. Hubert, and A. Struyf "A Robust Measure of
+    Skewness." Journal of Computational and Graphical Statistics, Vol.
+    13, No. 4 (Dec., 2004), pp. 996- 1017
+
+    .. [2] D. B. Johnson and T. Mizoguchi "Selecting the Kth Element
+    in $X + Y$ and $X_1 + X_2 + \cdots X_m$". SIAM Journal of
+    Computing, 7, 147-53.
+
+    """
+    # FIXME: Figure out what to do about NaNs.
+    
+    n = len(X)
+    n2 = (n-1)//2
+
+    if n < 3:
+        return 0
+
+    Z = sorted(X, reverse=True)
+
+    if n % 2 == 1:
+        Zmed = Z[n2 + 1]
+    else:
+        Zmed = (Z[n2] + Z[n2 + 1])/2
+    
+    #Check if the median is at the edges up to relative epsilon
+    if abs(Z[0] - Zmed) < eps1*(eps1 + abs(Zmed)):
+        return -1.0
+    if abs(Z[-1] - Zmed) < eps1*(eps1 + abs(Zmed)):
+        return 1.0
+    
+    # Centre Z wrt median, so that median(Z) = 0.
+    Z = [z - Zmed for z in Z]
+
+    # Scale inside [-0.5, 0.5], for greater numerical stability.
+    Zden = -2*max(-Z[0], Z[-1])
+    Z = [z/zden for z in Zden]
+    Zmed /= Zden
+    
+    Zeps = eps1*(eps1 + abs(Zmed))
+
+    # These overlap on the entries that are tied with the median
+    Zplus   = [z for z in Z if z >= -Zeps]
+    Zminus  = [z for z in Z if Zeps >= z]
+
+    n_plus = len(Zplus)
+    n_minus = len(Zminus)
+
+    def h_kern(i, j):
+        """Kernel function h for the medcouple, closing over the values of
+        Zplus and Zminus just defined above.
+
+        In case a and be are within epsilon of the median, the kernel
+        is the signum of their position.
+
+        """
+        a = Zplus[i]
+        b = Zminus[j]
+
+        if abs(a - b) <= 2*eps2:
+            return signum(n_plus - 1 -  i - j)
+
+        return (a+b)/(a-b)
+
+    # Init left and right borders
+    L = [0]*n_plus
+    R = [n_plus-1]*n_plus
+
+    Ltot = 0
+    Rtot = n_plus*n_minus
+
+    mid_idx = Rtot//2
+
+    # kth pair algorithm (Johnson & Mizoguchi)
+    while Rtot - Ltot > n_plus:
+        
+        # First, compute the median inside the given bounds
+        I1 = (i for i in xrange(0, n_plus) if L[i] <= R[i])
+        I2 = tee(I1) # Be stingy, reuse same generator
+        A = [h_kern(i, (L[i] + R[i])//2) for i in I1]
+        W = [R[i] - L[i] for i in I2]
+        Am = wmedian(A,W)
+
+        # Compute new left and right boundaries, based on the weighted
+        # median
+        P = Q = []
+
+        j = -1
+        for i in xrange(n_plus - 1, -1, -1):
+            while j < n_plus - 1 and h_kern(i, j) > Am:
+                j += 1
+            P.append(j)
+        P.reverse()
+
+        j = n_plus
+        for i in xrange(0, n_plus):
+            while j > -1 and h_kern(i, j) < Am:
+                j -= 1
+            Q.append(j)
+
+        # Check on which side of those bounds the desired median of
+        # the whole matrix may be.
+        sumP = sum(P)
+        sumQ = sum(Q)
+        if mid_idx <= sumP:
+            R = P
+            Rtot = sumP
+        elif mid_idx > sumQ:
+            L = Q
+            Ltot = sumQ
+        else:
+            return Am
+
+    # Uh, implement this part...
+    import ipdb; ipdb.set_trace()
+    
+
+def signum(x):
+    if x > 0:
+        return 1
+    if x < 0:
+        return -1
+    else:
+        return 0