Mercurial > hg > medcouple
comparison medcouple.py @ 4:74d0d08dbc95
Add Python implementation
author | Jordi Gutiérrez Hermoso <jordigh@octave.org> |
---|---|
date | Tue, 13 Jan 2015 16:55:48 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3:bb3a5c43fc65 | 4:74d0d08dbc95 |
---|---|
1 import random | |
2 import numpy as np | |
3 | |
4 from itertools import tee | |
5 | |
6 def partsort(L, n): | |
7 """This is a partial sort of the weighted list L = [(a,w) for a in A, | |
8 w in W], so that the element at position n is in the correct spot. | |
9 Also known as quickselect. The elements of W are ignored for the | |
10 purposes of this partial sort. | |
11 """ | |
12 | |
13 beg = 0 | |
14 end = len(L) | |
15 | |
16 while True: | |
17 pivot = random.randint(beg,end-1) | |
18 pivotval = L[pivot][0] | |
19 L[pivot], L[end-1] = L[end-1], L[pivot] | |
20 | |
21 idx = beg | |
22 for i in xrange(beg, end): | |
23 if L[i][0] < pivotval: | |
24 L[idx], L[i] = L[i], L[idx] | |
25 idx += 1 | |
26 | |
27 L[idx], L[end-1] = L[end-1], L[idx] | |
28 | |
29 if idx == n: | |
30 return L[idx] | |
31 elif idx < n: | |
32 beg = idx+1 | |
33 else: | |
34 end = idx | |
35 | |
36 def wmedian(A, W): | |
37 """This computes the weighted median of array A with corresponding | |
38 weights W. | |
39 """ | |
40 | |
41 AW = zip(A,W) | |
42 n = len(AW) | |
43 | |
44 wtot = sum(W) | |
45 | |
46 beg = 0 | |
47 end = n | |
48 | |
49 while True: | |
50 mid = (beg + end)//2 | |
51 | |
52 partsort(AW, mid) | |
53 trial = AW[mid][0] | |
54 | |
55 wleft = wright = 0 | |
56 for (a,w) in AW: | |
57 if a < trial: | |
58 wleft += w | |
59 else: | |
60 # This also includes a == trial, i.e. the "middle" | |
61 # weight. | |
62 wright += w | |
63 | |
64 if 2*wleft > wtot: | |
65 end = mid | |
66 elif 2*wright <= wtot: | |
67 beg = mid | |
68 else: | |
69 return trial | |
70 | |
71 def medcouple_1d(X, eps1 = 2**-52, eps2 = 2**-1022 ): | |
72 """Calculates the medcouple robust measure of skewness. | |
73 | |
74 Parameters | |
75 ---------- | |
76 y : array-like, 1-d | |
77 | |
78 Returns | |
79 ------- | |
80 mc : float | |
81 The medcouple statistic | |
82 | |
83 .. [1] G. Brys, M. Hubert, and A. Struyf "A Robust Measure of | |
84 Skewness." Journal of Computational and Graphical Statistics, Vol. | |
85 13, No. 4 (Dec., 2004), pp. 996- 1017 | |
86 | |
87 .. [2] D. B. Johnson and T. Mizoguchi "Selecting the Kth Element | |
88 in $X + Y$ and $X_1 + X_2 + \cdots X_m$". SIAM Journal of | |
89 Computing, 7, 147-53. | |
90 | |
91 """ | |
92 # FIXME: Figure out what to do about NaNs. | |
93 | |
94 n = len(X) | |
95 n2 = (n-1)//2 | |
96 | |
97 if n < 3: | |
98 return 0 | |
99 | |
100 Z = sorted(X, reverse=True) | |
101 | |
102 if n % 2 == 1: | |
103 Zmed = Z[n2 + 1] | |
104 else: | |
105 Zmed = (Z[n2] + Z[n2 + 1])/2 | |
106 | |
107 #Check if the median is at the edges up to relative epsilon | |
108 if abs(Z[0] - Zmed) < eps1*(eps1 + abs(Zmed)): | |
109 return -1.0 | |
110 if abs(Z[-1] - Zmed) < eps1*(eps1 + abs(Zmed)): | |
111 return 1.0 | |
112 | |
113 # Centre Z wrt median, so that median(Z) = 0. | |
114 Z = [z - Zmed for z in Z] | |
115 | |
116 # Scale inside [-0.5, 0.5], for greater numerical stability. | |
117 Zden = -2*max(-Z[0], Z[-1]) | |
118 Z = [z/zden for z in Zden] | |
119 Zmed /= Zden | |
120 | |
121 Zeps = eps1*(eps1 + abs(Zmed)) | |
122 | |
123 # These overlap on the entries that are tied with the median | |
124 Zplus = [z for z in Z if z >= -Zeps] | |
125 Zminus = [z for z in Z if Zeps >= z] | |
126 | |
127 n_plus = len(Zplus) | |
128 n_minus = len(Zminus) | |
129 | |
130 def h_kern(i, j): | |
131 """Kernel function h for the medcouple, closing over the values of | |
132 Zplus and Zminus just defined above. | |
133 | |
134 In case a and be are within epsilon of the median, the kernel | |
135 is the signum of their position. | |
136 | |
137 """ | |
138 a = Zplus[i] | |
139 b = Zminus[j] | |
140 | |
141 if abs(a - b) <= 2*eps2: | |
142 return signum(n_plus - 1 - i - j) | |
143 | |
144 return (a+b)/(a-b) | |
145 | |
146 # Init left and right borders | |
147 L = [0]*n_plus | |
148 R = [n_plus-1]*n_plus | |
149 | |
150 Ltot = 0 | |
151 Rtot = n_plus*n_minus | |
152 | |
153 mid_idx = Rtot//2 | |
154 | |
155 # kth pair algorithm (Johnson & Mizoguchi) | |
156 while Rtot - Ltot > n_plus: | |
157 | |
158 # First, compute the median inside the given bounds | |
159 I1 = (i for i in xrange(0, n_plus) if L[i] <= R[i]) | |
160 I2 = tee(I1) # Be stingy, reuse same generator | |
161 A = [h_kern(i, (L[i] + R[i])//2) for i in I1] | |
162 W = [R[i] - L[i] for i in I2] | |
163 Am = wmedian(A,W) | |
164 | |
165 # Compute new left and right boundaries, based on the weighted | |
166 # median | |
167 P = Q = [] | |
168 | |
169 j = -1 | |
170 for i in xrange(n_plus - 1, -1, -1): | |
171 while j < n_plus - 1 and h_kern(i, j) > Am: | |
172 j += 1 | |
173 P.append(j) | |
174 P.reverse() | |
175 | |
176 j = n_plus | |
177 for i in xrange(0, n_plus): | |
178 while j > -1 and h_kern(i, j) < Am: | |
179 j -= 1 | |
180 Q.append(j) | |
181 | |
182 # Check on which side of those bounds the desired median of | |
183 # the whole matrix may be. | |
184 sumP = sum(P) | |
185 sumQ = sum(Q) | |
186 if mid_idx <= sumP: | |
187 R = P | |
188 Rtot = sumP | |
189 elif mid_idx > sumQ: | |
190 L = Q | |
191 Ltot = sumQ | |
192 else: | |
193 return Am | |
194 | |
195 # Uh, implement this part... | |
196 import ipdb; ipdb.set_trace() | |
197 | |
198 | |
199 def signum(x): | |
200 if x > 0: | |
201 return 1 | |
202 if x < 0: | |
203 return -1 | |
204 else: | |
205 return 0 |