comparison medcouple.py @ 33:9a531463663e

Rename pymedcouple to medcouple.py
author Jordi Gutiérrez Hermoso <jordigh@octave.org>
date Sun, 18 Jan 2015 21:00:59 -0500
parents pymedcouple@1a2bb52722c2
children 4fb3b87b8610
comparison
equal deleted inserted replaced
32:5b77348383b7 33:9a531463663e
1 #!/usr/bin/python
2
3 import random
4
5 from itertools import tee, izip
6
7 def wmedian(A, W):
8 """This computes the weighted median of array A with corresponding
9 weights W.
10 """
11
12 AW = zip(A, W)
13 n = len(AW)
14
15 wtot = sum(W)
16
17 beg = 0
18 end = n - 1
19
20 while True:
21 mid = (beg + end)//2
22
23 AW = sorted(AW, key = lambda x: x[0]) # A partial sort would suffice here
24
25 trial = AW[mid][0]
26
27 wleft = wright = 0
28 for (a, w) in AW:
29 if a < trial:
30 wleft += w
31 else:
32 # This also includes a == trial, i.e. the "middle"
33 # weight.
34 wright += w
35
36 if 2*wleft > wtot:
37 end = mid
38 elif 2*wright < wtot:
39 beg = mid
40 else:
41 return trial
42
43
44 def medcouple_1d(X, eps1 = 2**-52, eps2 = 2**-1022):
45 """Calculates the medcouple robust measure of skewness.
46
47 Parameters
48 ----------
49 y : array-like, 1-d
50
51 Returns
52 -------
53 mc : float
54 The medcouple statistic
55
56 .. [1] G. Brys, M. Hubert, and A. Struyf "A Robust Measure of
57 Skewness." Journal of Computational and Graphical Statistics, Vol.
58 13, No. 4 (Dec., 2004), pp. 996- 1017
59
60 .. [2] D. B. Johnson and T. Mizoguchi "Selecting the Kth Element
61 in $X + Y$ and $X_1 + X_2 + \cdots X_m$". SIAM Journal of
62 Computing, Vol. 7, No. 2 (May 1978), pp. 147-53.
63
64 """
65 # FIXME: Figure out what to do about NaNs.
66
67 n = len(X)
68 n2 = (n - 1)//2
69
70 if n < 3:
71 return 0
72
73 Z = sorted(X, reverse=True)
74
75 if n % 2 == 1:
76 Zmed = Z[n2]
77 else:
78 Zmed = (Z[n2] + Z[n2 + 1])/2
79
80 #Check if the median is at the edges up to relative epsilon
81 if abs(Z[0] - Zmed) < eps1*(eps1 + abs(Zmed)):
82 return -1.0
83 if abs(Z[-1] - Zmed) < eps1*(eps1 + abs(Zmed)):
84 return 1.0
85
86 # Centre Z wrt median, so that median(Z) = 0.
87 Z = [z - Zmed for z in Z]
88
89 # Scale inside [-0.5, 0.5], for greater numerical stability.
90 Zden = 2*max(Z[0], -Z[-1])
91 Z = [z/Zden for z in Z]
92 Zmed /= Zden
93
94 Zeps = eps1*(eps1 + abs(Zmed))
95
96 # These overlap on the entries that are tied with the median
97 Zplus = [z for z in Z if z >= -Zeps]
98 Zminus = [z for z in Z if Zeps >= z]
99
100 n_plus = len(Zplus)
101 n_minus = len(Zminus)
102
103
104 def h_kern(i, j):
105 """Kernel function h for the medcouple, closing over the values of
106 Zplus and Zminus just defined above.
107
108 In case a and be are within epsilon of the median, the kernel
109 is the signum of their position.
110
111 """
112 a = Zplus[i]
113 b = Zminus[j]
114
115 if abs(a - b) <= 2*eps2:
116 h = signum(n_plus - 1 - i - j)
117 else:
118 h = (a + b)/(a - b)
119
120 return h
121
122 # Init left and right borders
123 L = [0]*n_plus
124 R = [n_minus - 1]*n_plus
125
126 Ltot = 0
127 Rtot = n_minus*n_plus
128 medc_idx = Rtot//2
129
130 # kth pair algorithm (Johnson & Mizoguchi)
131 while Rtot - Ltot > n_plus:
132
133 # First, compute the median inside the given bounds
134 # (Be stingy, reuse same generator)
135 [I1, I2] = tee(i for i in xrange(0, n_plus) if L[i] <= R[i])
136
137 A = [h_kern(i, (L[i] + R[i])//2) for i in I1]
138 W = [R[i] - L[i] + 1 for i in I2]
139 Am = wmedian(A, W)
140
141 Am_eps = eps1*(eps1 + abs(Am))
142
143 # Compute new left and right boundaries, based on the weighted
144 # median
145 P = []
146 Q = []
147
148 j = 0
149 for i in xrange(n_plus - 1, -1, -1):
150 while j < n_minus and h_kern(i, j) - Am > Am_eps:
151 j += 1
152 P.append(j - 1)
153
154 P.reverse()
155
156 j = n_minus - 1
157 for i in xrange(0, n_plus):
158 while j >= 0 and h_kern(i, j) - Am < -Am_eps:
159 j -= 1
160 Q.append(j + 1)
161
162 # Check on which side of those bounds the desired median of
163 # the whole matrix may be.
164 sumP = sum(P) + len(P)
165 sumQ = sum(Q)
166
167 if medc_idx <= sumP - 1:
168 R = P
169 Rtot = sumP
170 else:
171 if medc_idx > sumQ - 1:
172 L = Q
173 Ltot = sumQ
174 else:
175 return Am
176
177 # Didn't find the median, but now we have a very small search
178 # space to find it in, just between the left and right boundaries.
179 # This space is of size Rtot - Ltot which is <= n_plus
180 A = []
181 for (i, (l, r)) in enumerate(izip(L, R)):
182 for j in xrange(l, r + 1):
183 A.append(h_kern(i, j))
184
185 A.sort() # A partial sort would suffice here
186 A.reverse()
187
188 Am = A[medc_idx - Ltot]
189
190 return Am
191
192
193 def signum(x):
194 if x > 0:
195 return 1
196 if x < 0:
197 return -1
198 else:
199 return 0
200
201
202 def main():
203 import sys
204 fname = sys.argv[1]
205 with open(fname) as f:
206 data = [float(x) for x in f.readlines() if x.strip() != ""]
207
208 print "%.16g" % medcouple_1d(data)
209
210 if __name__ == "__main__":
211 main()