comparison pymedcouple @ 7:6c7c7dc9d8ef

hairball
author Jordi Gutiérrez Hermoso <jordigh@octave.org>
date Thu, 15 Jan 2015 15:53:19 -0500
parents e3b1dcc51e6a
children b3e878bb793d
comparison
equal deleted inserted replaced
6:e3b1dcc51e6a 7:6c7c7dc9d8ef
1 #!/usr/bin/python 1 #!/usr/bin/python
2 2
3 import random 3 import random
4 import numpy as np 4 import numpy as np
5 5
6 from itertools import tee 6 from itertools import tee, izip
7 7
8 def partsort(L, n): 8 def partsort(L, n):
9 """This is a partial sort of the weighted list L = [(a,w) for a in A, 9 """This is a partial sort of the weighted list L = [(a,w) for a in A,
10 w in W], so that the element at position n is in the correct spot. 10 w in W], so that the element at position n is in the correct spot.
11 Also known as quickselect. The elements of W are ignored for the 11 Also known as quickselect. The elements of W are ignored for the
44 n = len(AW) 44 n = len(AW)
45 45
46 wtot = sum(W) 46 wtot = sum(W)
47 47
48 beg = 0 48 beg = 0
49 end = n 49 end = n-1
50 50
51 while True: 51 while True:
52 mid = (beg + end)//2 52 mid = (beg + end)//2
53 53
54 partsort(AW, mid) 54 partsort(AW, mid)
68 elif 2*wright <= wtot: 68 elif 2*wright <= wtot:
69 beg = mid 69 beg = mid
70 else: 70 else:
71 return trial 71 return trial
72 72
73 def medcouple_1d(X, eps1 = 2**-52, eps2 = 2**-1022 ): 73 def medcouple_1d(X, eps1 = 2**-52, eps2 = 2**-1022, debug=False ):
74 """Calculates the medcouple robust measure of skewness. 74 """Calculates the medcouple robust measure of skewness.
75 75
76 Parameters 76 Parameters
77 ---------- 77 ----------
78 y : array-like, 1-d 78 y : array-like, 1-d
100 return 0 100 return 0
101 101
102 Z = sorted(X, reverse=True) 102 Z = sorted(X, reverse=True)
103 103
104 if n % 2 == 1: 104 if n % 2 == 1:
105 Zmed = Z[n2 + 1] 105 Zmed = Z[n2]
106 else: 106 else:
107 Zmed = (Z[n2] + Z[n2 + 1])/2 107 Zmed = (Z[n2] + Z[n2+1])/2
108 108
109 #Check if the median is at the edges up to relative epsilon 109 #Check if the median is at the edges up to relative epsilon
110 if abs(Z[0] - Zmed) < eps1*(eps1 + abs(Zmed)): 110 if abs(Z[0] - Zmed) < eps1*(eps1 + abs(Zmed)):
111 return -1.0 111 return -1.0
112 if abs(Z[-1] - Zmed) < eps1*(eps1 + abs(Zmed)): 112 if abs(Z[-1] - Zmed) < eps1*(eps1 + abs(Zmed)):
113 return 1.0 113 return 1.0
114 114
115 if debug:
116 print "Zmed = ", Zmed
117
115 # Centre Z wrt median, so that median(Z) = 0. 118 # Centre Z wrt median, so that median(Z) = 0.
116 Z = [z - Zmed for z in Z] 119 Z = [z - Zmed for z in Z]
117 120
118 # Scale inside [-0.5, 0.5], for greater numerical stability. 121 # Scale inside [-0.5, 0.5], for greater numerical stability.
119 Zden = -2*max(-Z[0], Z[-1]) 122 Zden = 2*max(Z[0], -Z[-1])
123 if debug:
124 print "Zden =", Zden
125
120 Z = [z/Zden for z in Z] 126 Z = [z/Zden for z in Z]
121 Zmed /= Zden 127 Zmed /= Zden
122 128
123 Zeps = eps1*(eps1 + abs(Zmed)) 129 Zeps = eps1*(eps1 + abs(Zmed))
124 130
125 # These overlap on the entries that are tied with the median 131 # These overlap on the entries that are tied with the median
126 Zplus = [z for z in Z if z >= -Zeps] 132 Zplus = [z for z in Z if z >= -Zeps]
127 Zminus = [z for z in Z if Zeps >= z] 133 Zminus = [z for z in Z if Zeps >= z]
128 134
135 if debug:
136 for zp in Zplus:
137 print "Zplus : %g" % zp
138 for zm in Zminus:
139 print "Zminus: %g" % zm
140
129 n_plus = len(Zplus) 141 n_plus = len(Zplus)
130 n_minus = len(Zminus) 142 n_minus = len(Zminus)
131 143
132 def h_kern(i, j): 144 if debug:
145 print "n_plus =", n_plus
146 print "n_minus = ", n_minus
147
148 def h_kern(i, j, debug=False):
133 """Kernel function h for the medcouple, closing over the values of 149 """Kernel function h for the medcouple, closing over the values of
134 Zplus and Zminus just defined above. 150 Zplus and Zminus just defined above.
135 151
136 In case a and be are within epsilon of the median, the kernel 152 In case a and be are within epsilon of the median, the kernel
137 is the signum of their position. 153 is the signum of their position.
139 """ 155 """
140 a = Zplus[i] 156 a = Zplus[i]
141 b = Zminus[j] 157 b = Zminus[j]
142 158
143 if abs(a - b) <= 2*eps2: 159 if abs(a - b) <= 2*eps2:
144 return signum(n_plus - 1 - i - j) 160 h = signum(n_plus - 1 - i - j)
145 161 else:
146 return (a+b)/(a-b) 162 h = (a+b)/(a-b)
163
164 if False:
165 print "a = {a}, b = {b}, h({i},{j}) = {h}".format(**locals())
166
167 return h
147 168
148 # Init left and right borders 169 # Init left and right borders
149 L = [0]*n_plus 170 L = [0]*n_plus
150 R = [n_plus-1]*n_plus 171 R = [n_minus - 1]*n_plus
151 172
152 Ltot = 0 173 Ltot = 0
153 Rtot = n_plus*n_minus 174 Rtot = n_minus*n_plus
154 175 mid_idx = (Rtot-1)//2
155 mid_idx = Rtot//2 176
177 if debug:
178 print "mid_idx = ", mid_idx
156 179
157 # kth pair algorithm (Johnson & Mizoguchi) 180 # kth pair algorithm (Johnson & Mizoguchi)
158 while Rtot - Ltot > n_plus: 181 while Rtot - Ltot > n_plus:
182 if debug:
183 print "L = ", L
184 print "R = ", R
159 185
160 # First, compute the median inside the given bounds 186 # First, compute the median inside the given bounds
161 # (Be stingy, reuse same generator) 187 # (Be stingy, reuse same generator)
162 [I1, I2] = tee(i for i in xrange(0, n_plus) if L[i] <= R[i]) 188 [I1, I2] = tee(i for i in xrange(0, n_plus) if L[i] <= R[i])
163 189
164 A = [h_kern(i, (L[i] + R[i])//2) for i in I1] 190 A = [h_kern(i, (L[i] + R[i])//2) for i in I1]
165 W = [R[i] - L[i] for i in I2] 191 W = [R[i] - L[i] + 1 for i in I2]
166 Am = wmedian(A,W) 192 Am = wmedian(A,W)
193 if debug:
194 print "Am = ", Am
195
196 Am_eps = eps1*(eps1 + abs(Am))
167 197
168 # Compute new left and right boundaries, based on the weighted 198 # Compute new left and right boundaries, based on the weighted
169 # median 199 # median
170 P = Q = [] 200 P = []
171 201 Q = []
172 j = -1 202
203 j = 0
173 for i in xrange(n_plus - 1, -1, -1): 204 for i in xrange(n_plus - 1, -1, -1):
174 while j < n_plus - 1 and h_kern(i, j) > Am: 205 while j < n_minus - 1 and h_kern(i, j) - Am > Am_eps:
175 j += 1 206 j += 1
176 P.append(j) 207 P.append(j-1)
208
177 P.reverse() 209 P.reverse()
178 210
179 j = n_plus 211 j = n_minus - 1
180 for i in xrange(0, n_plus): 212 for i in xrange(0, n_plus):
181 while j > -1 and h_kern(i, j) < Am: 213 while j >= 0 and h_kern(i, j) - Am < -Am_eps:
182 j -= 1 214 j -= 1
183 Q.append(j) 215 Q.append(j+1)
184 216
185 # Check on which side of those bounds the desired median of 217 # Check on which side of those bounds the desired median of
186 # the whole matrix may be. 218 # the whole matrix may be.
187 sumP = sum(P) 219 sumP = sum(P) + len(P)
188 sumQ = sum(Q) 220 sumQ = sum(Q)
189 if mid_idx <= sumP: 221 if debug:
222 print "P: ", sumP, P
223 print "Q: ", sumQ, Q
224 print
225
226 if mid_idx <= sumP - 1:
190 R = P 227 R = P
191 Rtot = sumP 228 Rtot = sumP
192 elif mid_idx > sumQ: 229 else:
193 L = Q 230 if mid_idx > sumQ - 1:
194 Ltot = sumQ 231 L = Q
195 else: 232 Ltot = sumQ
196 return Am 233 else:
197 234 return Am
198 # Uh, implement this part... 235
199 import ipdb; ipdb.set_trace() 236 # Didn't find the median, but now we have a very small search
237 # space to find it in, just between the left and right boundaries.
238 # This space is of size Rtot - Ltot which is <= n_plus
239 if debug:
240 print "L =", L
241 print "R =", R
242 A = []
243 for (i, (l, r)) in enumerate(izip(L, R)):
244 for j in xrange(l, r + 1):
245 A.append(h_kern(i, j, debug=debug))
246 A.sort()
247 A.reverse()
248
249 if debug:
250 print "len(A) = ", len(A)
251 Am = A[mid_idx - Ltot]
252 return Am
200 253
201 254
202 def signum(x): 255 def signum(x):
203 if x > 0: 256 if x > 0:
204 return 1 257 return 1
210 def main(): 263 def main():
211 import sys 264 import sys
212 fname = sys.argv[1] 265 fname = sys.argv[1]
213 with open(fname) as f: 266 with open(fname) as f:
214 data = [float(x) for x in f.readlines() if x.strip() != ""] 267 data = [float(x) for x in f.readlines() if x.strip() != ""]
215 print medcouple_1d(data) 268 print "%.16g" % medcouple_1d(data)
216 269
217 if __name__ == "__main__": 270 if __name__ == "__main__":
218 main() 271 main()