diff pymedcouple @ 7:6c7c7dc9d8ef

hairball
author Jordi Gutiérrez Hermoso <jordigh@octave.org>
date Thu, 15 Jan 2015 15:53:19 -0500
parents e3b1dcc51e6a
children b3e878bb793d
line wrap: on
line diff
--- a/pymedcouple
+++ b/pymedcouple
@@ -3,7 +3,7 @@
 import random
 import numpy as np
 
-from itertools import tee
+from itertools import tee, izip
 
 def partsort(L, n):
     """This is a partial sort of the weighted list L = [(a,w) for a in A,
@@ -46,7 +46,7 @@
     wtot = sum(W)
 
     beg = 0
-    end = n
+    end = n-1
 
     while True:
         mid = (beg + end)//2
@@ -70,7 +70,7 @@
         else:
             return trial
 
-def medcouple_1d(X, eps1 = 2**-52, eps2 = 2**-1022 ):
+def medcouple_1d(X, eps1 = 2**-52, eps2 = 2**-1022, debug=False ):
     """Calculates the medcouple robust measure of skewness.
 
     Parameters
@@ -102,21 +102,27 @@
     Z = sorted(X, reverse=True)
 
     if n % 2 == 1:
-        Zmed = Z[n2 + 1]
+        Zmed = Z[n2]
     else:
-        Zmed = (Z[n2] + Z[n2 + 1])/2
+        Zmed = (Z[n2] + Z[n2+1])/2
     
     #Check if the median is at the edges up to relative epsilon
     if abs(Z[0] - Zmed) < eps1*(eps1 + abs(Zmed)):
         return -1.0
     if abs(Z[-1] - Zmed) < eps1*(eps1 + abs(Zmed)):
         return 1.0
-    
+
+    if debug:
+        print "Zmed = ", Zmed
+        
     # Centre Z wrt median, so that median(Z) = 0.
     Z = [z - Zmed for z in Z]
 
     # Scale inside [-0.5, 0.5], for greater numerical stability.
-    Zden = -2*max(-Z[0], Z[-1])
+    Zden = 2*max(Z[0], -Z[-1])
+    if debug:
+        print "Zden =", Zden
+    
     Z = [z/Zden for z in Z]
     Zmed /= Zden
     
@@ -126,10 +132,20 @@
     Zplus   = [z for z in Z if z >= -Zeps]
     Zminus  = [z for z in Z if Zeps >= z]
 
+    if debug:
+        for zp in Zplus:
+            print "Zplus : %g" % zp
+        for zm in Zminus:
+            print "Zminus: %g" % zm
+
     n_plus = len(Zplus)
     n_minus = len(Zminus)
 
-    def h_kern(i, j):
+    if debug:
+        print "n_plus =", n_plus
+        print "n_minus = ", n_minus
+
+    def h_kern(i, j, debug=False):
         """Kernel function h for the medcouple, closing over the values of
         Zplus and Zminus just defined above.
 
@@ -141,62 +157,99 @@
         b = Zminus[j]
 
         if abs(a - b) <= 2*eps2:
-            return signum(n_plus - 1 -  i - j)
+            h = signum(n_plus - 1 -  i - j)
+        else:
+            h = (a+b)/(a-b)
 
-        return (a+b)/(a-b)
+        if False:
+            print "a = {a}, b = {b}, h({i},{j}) = {h}".format(**locals())
+
+        return h
 
     # Init left and right borders
     L = [0]*n_plus
-    R = [n_plus-1]*n_plus
+    R = [n_minus - 1]*n_plus
 
     Ltot = 0
-    Rtot = n_plus*n_minus
+    Rtot = n_minus*n_plus
+    mid_idx = (Rtot-1)//2
 
-    mid_idx = Rtot//2
+    if debug:
+        print "mid_idx = ", mid_idx
 
     # kth pair algorithm (Johnson & Mizoguchi)
     while Rtot - Ltot > n_plus:
+        if debug:
+            print "L = ", L
+            print "R = ", R
         
         # First, compute the median inside the given bounds
         # (Be stingy, reuse same generator)
         [I1, I2] = tee(i for i in xrange(0, n_plus) if L[i] <= R[i])
 
         A = [h_kern(i, (L[i] + R[i])//2) for i in I1]
-        W = [R[i] - L[i] for i in I2]
+        W = [R[i] - L[i] + 1 for i in I2]
         Am = wmedian(A,W)
+        if debug:
+            print "Am = ", Am
+            
+        Am_eps = eps1*(eps1 + abs(Am))
 
         # Compute new left and right boundaries, based on the weighted
         # median
-        P = Q = []
+        P = []
+        Q = []
 
-        j = -1
+        j = 0
         for i in xrange(n_plus - 1, -1, -1):
-            while j < n_plus - 1 and h_kern(i, j) > Am:
+            while j < n_minus - 1 and h_kern(i, j) - Am > Am_eps:
                 j += 1
-            P.append(j)
+            P.append(j-1)
+            
         P.reverse()
 
-        j = n_plus
+        j = n_minus - 1
         for i in xrange(0, n_plus):
-            while j > -1 and h_kern(i, j) < Am:
+            while j >= 0 and h_kern(i, j) - Am < -Am_eps:
                 j -= 1
-            Q.append(j)
+            Q.append(j+1)
 
         # Check on which side of those bounds the desired median of
         # the whole matrix may be.
-        sumP = sum(P)
+        sumP = sum(P) + len(P)
         sumQ = sum(Q)
-        if mid_idx <= sumP:
+        if debug:
+            print "P: ", sumP, P
+            print "Q: ", sumQ, Q
+            print
+        
+        if mid_idx <= sumP - 1:
             R = P
             Rtot = sumP
-        elif mid_idx > sumQ:
-            L = Q
-            Ltot = sumQ
         else:
-            return Am
+            if mid_idx > sumQ - 1:
+                L = Q
+                Ltot = sumQ
+            else:
+                return Am
 
-    # Uh, implement this part...
-    import ipdb; ipdb.set_trace()
+    # Didn't find the median, but now we have a very small search
+    # space to find it in, just between the left and right boundaries.
+    # This space is of size Rtot - Ltot which is <= n_plus
+    if debug:
+        print "L =", L
+        print "R =", R
+    A = []
+    for (i, (l, r)) in enumerate(izip(L, R)):
+        for j in xrange(l, r + 1):
+            A.append(h_kern(i, j, debug=debug))
+    A.sort()
+    A.reverse()
+
+    if debug:
+        print "len(A) = ", len(A)
+    Am =  A[mid_idx - Ltot]
+    return Am
     
 
 def signum(x):
@@ -212,7 +265,7 @@
     fname = sys.argv[1]
     with open(fname) as f:
         data = [float(x) for x in f.readlines() if x.strip() != ""]
-    print medcouple_1d(data)
+    print "%.16g" % medcouple_1d(data)
 
 if __name__ == "__main__":
     main()