comparison medcouple.py @ 4:74d0d08dbc95

Add Python implementation
author Jordi Gutiérrez Hermoso <jordigh@octave.org>
date Tue, 13 Jan 2015 16:55:48 -0500
parents
children
comparison
equal deleted inserted replaced
3:bb3a5c43fc65 4:74d0d08dbc95
1 import random
2 import numpy as np
3
4 from itertools import tee
5
6 def partsort(L, n):
7 """This is a partial sort of the weighted list L = [(a,w) for a in A,
8 w in W], so that the element at position n is in the correct spot.
9 Also known as quickselect. The elements of W are ignored for the
10 purposes of this partial sort.
11 """
12
13 beg = 0
14 end = len(L)
15
16 while True:
17 pivot = random.randint(beg,end-1)
18 pivotval = L[pivot][0]
19 L[pivot], L[end-1] = L[end-1], L[pivot]
20
21 idx = beg
22 for i in xrange(beg, end):
23 if L[i][0] < pivotval:
24 L[idx], L[i] = L[i], L[idx]
25 idx += 1
26
27 L[idx], L[end-1] = L[end-1], L[idx]
28
29 if idx == n:
30 return L[idx]
31 elif idx < n:
32 beg = idx+1
33 else:
34 end = idx
35
36 def wmedian(A, W):
37 """This computes the weighted median of array A with corresponding
38 weights W.
39 """
40
41 AW = zip(A,W)
42 n = len(AW)
43
44 wtot = sum(W)
45
46 beg = 0
47 end = n
48
49 while True:
50 mid = (beg + end)//2
51
52 partsort(AW, mid)
53 trial = AW[mid][0]
54
55 wleft = wright = 0
56 for (a,w) in AW:
57 if a < trial:
58 wleft += w
59 else:
60 # This also includes a == trial, i.e. the "middle"
61 # weight.
62 wright += w
63
64 if 2*wleft > wtot:
65 end = mid
66 elif 2*wright <= wtot:
67 beg = mid
68 else:
69 return trial
70
71 def medcouple_1d(X, eps1 = 2**-52, eps2 = 2**-1022 ):
72 """Calculates the medcouple robust measure of skewness.
73
74 Parameters
75 ----------
76 y : array-like, 1-d
77
78 Returns
79 -------
80 mc : float
81 The medcouple statistic
82
83 .. [1] G. Brys, M. Hubert, and A. Struyf "A Robust Measure of
84 Skewness." Journal of Computational and Graphical Statistics, Vol.
85 13, No. 4 (Dec., 2004), pp. 996- 1017
86
87 .. [2] D. B. Johnson and T. Mizoguchi "Selecting the Kth Element
88 in $X + Y$ and $X_1 + X_2 + \cdots X_m$". SIAM Journal of
89 Computing, 7, 147-53.
90
91 """
92 # FIXME: Figure out what to do about NaNs.
93
94 n = len(X)
95 n2 = (n-1)//2
96
97 if n < 3:
98 return 0
99
100 Z = sorted(X, reverse=True)
101
102 if n % 2 == 1:
103 Zmed = Z[n2 + 1]
104 else:
105 Zmed = (Z[n2] + Z[n2 + 1])/2
106
107 #Check if the median is at the edges up to relative epsilon
108 if abs(Z[0] - Zmed) < eps1*(eps1 + abs(Zmed)):
109 return -1.0
110 if abs(Z[-1] - Zmed) < eps1*(eps1 + abs(Zmed)):
111 return 1.0
112
113 # Centre Z wrt median, so that median(Z) = 0.
114 Z = [z - Zmed for z in Z]
115
116 # Scale inside [-0.5, 0.5], for greater numerical stability.
117 Zden = -2*max(-Z[0], Z[-1])
118 Z = [z/zden for z in Zden]
119 Zmed /= Zden
120
121 Zeps = eps1*(eps1 + abs(Zmed))
122
123 # These overlap on the entries that are tied with the median
124 Zplus = [z for z in Z if z >= -Zeps]
125 Zminus = [z for z in Z if Zeps >= z]
126
127 n_plus = len(Zplus)
128 n_minus = len(Zminus)
129
130 def h_kern(i, j):
131 """Kernel function h for the medcouple, closing over the values of
132 Zplus and Zminus just defined above.
133
134 In case a and be are within epsilon of the median, the kernel
135 is the signum of their position.
136
137 """
138 a = Zplus[i]
139 b = Zminus[j]
140
141 if abs(a - b) <= 2*eps2:
142 return signum(n_plus - 1 - i - j)
143
144 return (a+b)/(a-b)
145
146 # Init left and right borders
147 L = [0]*n_plus
148 R = [n_plus-1]*n_plus
149
150 Ltot = 0
151 Rtot = n_plus*n_minus
152
153 mid_idx = Rtot//2
154
155 # kth pair algorithm (Johnson & Mizoguchi)
156 while Rtot - Ltot > n_plus:
157
158 # First, compute the median inside the given bounds
159 I1 = (i for i in xrange(0, n_plus) if L[i] <= R[i])
160 I2 = tee(I1) # Be stingy, reuse same generator
161 A = [h_kern(i, (L[i] + R[i])//2) for i in I1]
162 W = [R[i] - L[i] for i in I2]
163 Am = wmedian(A,W)
164
165 # Compute new left and right boundaries, based on the weighted
166 # median
167 P = Q = []
168
169 j = -1
170 for i in xrange(n_plus - 1, -1, -1):
171 while j < n_plus - 1 and h_kern(i, j) > Am:
172 j += 1
173 P.append(j)
174 P.reverse()
175
176 j = n_plus
177 for i in xrange(0, n_plus):
178 while j > -1 and h_kern(i, j) < Am:
179 j -= 1
180 Q.append(j)
181
182 # Check on which side of those bounds the desired median of
183 # the whole matrix may be.
184 sumP = sum(P)
185 sumQ = sum(Q)
186 if mid_idx <= sumP:
187 R = P
188 Rtot = sumP
189 elif mid_idx > sumQ:
190 L = Q
191 Ltot = sumQ
192 else:
193 return Am
194
195 # Uh, implement this part...
196 import ipdb; ipdb.set_trace()
197
198
199 def signum(x):
200 if x > 0:
201 return 1
202 if x < 0:
203 return -1
204 else:
205 return 0