Mercurial > hg > medcouple
comparison medcouple.py @ 33:9a531463663e
Rename pymedcouple to medcouple.py
author | Jordi Gutiérrez Hermoso <jordigh@octave.org> |
---|---|
date | Sun, 18 Jan 2015 21:00:59 -0500 |
parents | pymedcouple@1a2bb52722c2 |
children | 4fb3b87b8610 |
comparison
equal
deleted
inserted
replaced
32:5b77348383b7 | 33:9a531463663e |
---|---|
1 #!/usr/bin/python | |
2 | |
3 import random | |
4 | |
5 from itertools import tee, izip | |
6 | |
7 def wmedian(A, W): | |
8 """This computes the weighted median of array A with corresponding | |
9 weights W. | |
10 """ | |
11 | |
12 AW = zip(A, W) | |
13 n = len(AW) | |
14 | |
15 wtot = sum(W) | |
16 | |
17 beg = 0 | |
18 end = n - 1 | |
19 | |
20 while True: | |
21 mid = (beg + end)//2 | |
22 | |
23 AW = sorted(AW, key = lambda x: x[0]) # A partial sort would suffice here | |
24 | |
25 trial = AW[mid][0] | |
26 | |
27 wleft = wright = 0 | |
28 for (a, w) in AW: | |
29 if a < trial: | |
30 wleft += w | |
31 else: | |
32 # This also includes a == trial, i.e. the "middle" | |
33 # weight. | |
34 wright += w | |
35 | |
36 if 2*wleft > wtot: | |
37 end = mid | |
38 elif 2*wright < wtot: | |
39 beg = mid | |
40 else: | |
41 return trial | |
42 | |
43 | |
44 def medcouple_1d(X, eps1 = 2**-52, eps2 = 2**-1022): | |
45 """Calculates the medcouple robust measure of skewness. | |
46 | |
47 Parameters | |
48 ---------- | |
49 y : array-like, 1-d | |
50 | |
51 Returns | |
52 ------- | |
53 mc : float | |
54 The medcouple statistic | |
55 | |
56 .. [1] G. Brys, M. Hubert, and A. Struyf "A Robust Measure of | |
57 Skewness." Journal of Computational and Graphical Statistics, Vol. | |
58 13, No. 4 (Dec., 2004), pp. 996- 1017 | |
59 | |
60 .. [2] D. B. Johnson and T. Mizoguchi "Selecting the Kth Element | |
61 in $X + Y$ and $X_1 + X_2 + \cdots X_m$". SIAM Journal of | |
62 Computing, Vol. 7, No. 2 (May 1978), pp. 147-53. | |
63 | |
64 """ | |
65 # FIXME: Figure out what to do about NaNs. | |
66 | |
67 n = len(X) | |
68 n2 = (n - 1)//2 | |
69 | |
70 if n < 3: | |
71 return 0 | |
72 | |
73 Z = sorted(X, reverse=True) | |
74 | |
75 if n % 2 == 1: | |
76 Zmed = Z[n2] | |
77 else: | |
78 Zmed = (Z[n2] + Z[n2 + 1])/2 | |
79 | |
80 #Check if the median is at the edges up to relative epsilon | |
81 if abs(Z[0] - Zmed) < eps1*(eps1 + abs(Zmed)): | |
82 return -1.0 | |
83 if abs(Z[-1] - Zmed) < eps1*(eps1 + abs(Zmed)): | |
84 return 1.0 | |
85 | |
86 # Centre Z wrt median, so that median(Z) = 0. | |
87 Z = [z - Zmed for z in Z] | |
88 | |
89 # Scale inside [-0.5, 0.5], for greater numerical stability. | |
90 Zden = 2*max(Z[0], -Z[-1]) | |
91 Z = [z/Zden for z in Z] | |
92 Zmed /= Zden | |
93 | |
94 Zeps = eps1*(eps1 + abs(Zmed)) | |
95 | |
96 # These overlap on the entries that are tied with the median | |
97 Zplus = [z for z in Z if z >= -Zeps] | |
98 Zminus = [z for z in Z if Zeps >= z] | |
99 | |
100 n_plus = len(Zplus) | |
101 n_minus = len(Zminus) | |
102 | |
103 | |
104 def h_kern(i, j): | |
105 """Kernel function h for the medcouple, closing over the values of | |
106 Zplus and Zminus just defined above. | |
107 | |
108 In case a and be are within epsilon of the median, the kernel | |
109 is the signum of their position. | |
110 | |
111 """ | |
112 a = Zplus[i] | |
113 b = Zminus[j] | |
114 | |
115 if abs(a - b) <= 2*eps2: | |
116 h = signum(n_plus - 1 - i - j) | |
117 else: | |
118 h = (a + b)/(a - b) | |
119 | |
120 return h | |
121 | |
122 # Init left and right borders | |
123 L = [0]*n_plus | |
124 R = [n_minus - 1]*n_plus | |
125 | |
126 Ltot = 0 | |
127 Rtot = n_minus*n_plus | |
128 medc_idx = Rtot//2 | |
129 | |
130 # kth pair algorithm (Johnson & Mizoguchi) | |
131 while Rtot - Ltot > n_plus: | |
132 | |
133 # First, compute the median inside the given bounds | |
134 # (Be stingy, reuse same generator) | |
135 [I1, I2] = tee(i for i in xrange(0, n_plus) if L[i] <= R[i]) | |
136 | |
137 A = [h_kern(i, (L[i] + R[i])//2) for i in I1] | |
138 W = [R[i] - L[i] + 1 for i in I2] | |
139 Am = wmedian(A, W) | |
140 | |
141 Am_eps = eps1*(eps1 + abs(Am)) | |
142 | |
143 # Compute new left and right boundaries, based on the weighted | |
144 # median | |
145 P = [] | |
146 Q = [] | |
147 | |
148 j = 0 | |
149 for i in xrange(n_plus - 1, -1, -1): | |
150 while j < n_minus and h_kern(i, j) - Am > Am_eps: | |
151 j += 1 | |
152 P.append(j - 1) | |
153 | |
154 P.reverse() | |
155 | |
156 j = n_minus - 1 | |
157 for i in xrange(0, n_plus): | |
158 while j >= 0 and h_kern(i, j) - Am < -Am_eps: | |
159 j -= 1 | |
160 Q.append(j + 1) | |
161 | |
162 # Check on which side of those bounds the desired median of | |
163 # the whole matrix may be. | |
164 sumP = sum(P) + len(P) | |
165 sumQ = sum(Q) | |
166 | |
167 if medc_idx <= sumP - 1: | |
168 R = P | |
169 Rtot = sumP | |
170 else: | |
171 if medc_idx > sumQ - 1: | |
172 L = Q | |
173 Ltot = sumQ | |
174 else: | |
175 return Am | |
176 | |
177 # Didn't find the median, but now we have a very small search | |
178 # space to find it in, just between the left and right boundaries. | |
179 # This space is of size Rtot - Ltot which is <= n_plus | |
180 A = [] | |
181 for (i, (l, r)) in enumerate(izip(L, R)): | |
182 for j in xrange(l, r + 1): | |
183 A.append(h_kern(i, j)) | |
184 | |
185 A.sort() # A partial sort would suffice here | |
186 A.reverse() | |
187 | |
188 Am = A[medc_idx - Ltot] | |
189 | |
190 return Am | |
191 | |
192 | |
193 def signum(x): | |
194 if x > 0: | |
195 return 1 | |
196 if x < 0: | |
197 return -1 | |
198 else: | |
199 return 0 | |
200 | |
201 | |
202 def main(): | |
203 import sys | |
204 fname = sys.argv[1] | |
205 with open(fname) as f: | |
206 data = [float(x) for x in f.readlines() if x.strip() != ""] | |
207 | |
208 print "%.16g" % medcouple_1d(data) | |
209 | |
210 if __name__ == "__main__": | |
211 main() |