comparison pymedcouple @ 5:cbe17f888c79

Make it executable
author Jordi Gutiérrez Hermoso <jordigh@octave.org>
date Tue, 13 Jan 2015 17:00:49 -0500
parents medcouple.py@74d0d08dbc95
children e3b1dcc51e6a
comparison
equal deleted inserted replaced
4:74d0d08dbc95 5:cbe17f888c79
1 #!/usr/bin/python
2
3 import random
4 import numpy as np
5
6 from itertools import tee
7
8 def partsort(L, n):
9 """This is a partial sort of the weighted list L = [(a,w) for a in A,
10 w in W], so that the element at position n is in the correct spot.
11 Also known as quickselect. The elements of W are ignored for the
12 purposes of this partial sort.
13 """
14
15 beg = 0
16 end = len(L)
17
18 while True:
19 pivot = random.randint(beg,end-1)
20 pivotval = L[pivot][0]
21 L[pivot], L[end-1] = L[end-1], L[pivot]
22
23 idx = beg
24 for i in xrange(beg, end):
25 if L[i][0] < pivotval:
26 L[idx], L[i] = L[i], L[idx]
27 idx += 1
28
29 L[idx], L[end-1] = L[end-1], L[idx]
30
31 if idx == n:
32 return L[idx]
33 elif idx < n:
34 beg = idx+1
35 else:
36 end = idx
37
38 def wmedian(A, W):
39 """This computes the weighted median of array A with corresponding
40 weights W.
41 """
42
43 AW = zip(A,W)
44 n = len(AW)
45
46 wtot = sum(W)
47
48 beg = 0
49 end = n
50
51 while True:
52 mid = (beg + end)//2
53
54 partsort(AW, mid)
55 trial = AW[mid][0]
56
57 wleft = wright = 0
58 for (a,w) in AW:
59 if a < trial:
60 wleft += w
61 else:
62 # This also includes a == trial, i.e. the "middle"
63 # weight.
64 wright += w
65
66 if 2*wleft > wtot:
67 end = mid
68 elif 2*wright <= wtot:
69 beg = mid
70 else:
71 return trial
72
73 def medcouple_1d(X, eps1 = 2**-52, eps2 = 2**-1022 ):
74 """Calculates the medcouple robust measure of skewness.
75
76 Parameters
77 ----------
78 y : array-like, 1-d
79
80 Returns
81 -------
82 mc : float
83 The medcouple statistic
84
85 .. [1] G. Brys, M. Hubert, and A. Struyf "A Robust Measure of
86 Skewness." Journal of Computational and Graphical Statistics, Vol.
87 13, No. 4 (Dec., 2004), pp. 996- 1017
88
89 .. [2] D. B. Johnson and T. Mizoguchi "Selecting the Kth Element
90 in $X + Y$ and $X_1 + X_2 + \cdots X_m$". SIAM Journal of
91 Computing, 7, 147-53.
92
93 """
94 # FIXME: Figure out what to do about NaNs.
95
96 n = len(X)
97 n2 = (n-1)//2
98
99 if n < 3:
100 return 0
101
102 Z = sorted(X, reverse=True)
103
104 if n % 2 == 1:
105 Zmed = Z[n2 + 1]
106 else:
107 Zmed = (Z[n2] + Z[n2 + 1])/2
108
109 #Check if the median is at the edges up to relative epsilon
110 if abs(Z[0] - Zmed) < eps1*(eps1 + abs(Zmed)):
111 return -1.0
112 if abs(Z[-1] - Zmed) < eps1*(eps1 + abs(Zmed)):
113 return 1.0
114
115 # Centre Z wrt median, so that median(Z) = 0.
116 Z = [z - Zmed for z in Z]
117
118 # Scale inside [-0.5, 0.5], for greater numerical stability.
119 Zden = -2*max(-Z[0], Z[-1])
120 Z = [z/Zden for z in Z]
121 Zmed /= Zden
122
123 Zeps = eps1*(eps1 + abs(Zmed))
124
125 # These overlap on the entries that are tied with the median
126 Zplus = [z for z in Z if z >= -Zeps]
127 Zminus = [z for z in Z if Zeps >= z]
128
129 n_plus = len(Zplus)
130 n_minus = len(Zminus)
131
132 def h_kern(i, j):
133 """Kernel function h for the medcouple, closing over the values of
134 Zplus and Zminus just defined above.
135
136 In case a and be are within epsilon of the median, the kernel
137 is the signum of their position.
138
139 """
140 a = Zplus[i]
141 b = Zminus[j]
142
143 if abs(a - b) <= 2*eps2:
144 return signum(n_plus - 1 - i - j)
145
146 return (a+b)/(a-b)
147
148 # Init left and right borders
149 L = [0]*n_plus
150 R = [n_plus-1]*n_plus
151
152 Ltot = 0
153 Rtot = n_plus*n_minus
154
155 mid_idx = Rtot//2
156
157 # kth pair algorithm (Johnson & Mizoguchi)
158 while Rtot - Ltot > n_plus:
159
160 # First, compute the median inside the given bounds
161 I1 = (i for i in xrange(0, n_plus) if L[i] <= R[i])
162 I2 = tee(I1) # Be stingy, reuse same generator
163 A = [h_kern(i, (L[i] + R[i])//2) for i in I1]
164 W = [R[i] - L[i] for i in I2]
165 Am = wmedian(A,W)
166
167 # Compute new left and right boundaries, based on the weighted
168 # median
169 P = Q = []
170
171 j = -1
172 for i in xrange(n_plus - 1, -1, -1):
173 while j < n_plus - 1 and h_kern(i, j) > Am:
174 j += 1
175 P.append(j)
176 P.reverse()
177
178 j = n_plus
179 for i in xrange(0, n_plus):
180 while j > -1 and h_kern(i, j) < Am:
181 j -= 1
182 Q.append(j)
183
184 # Check on which side of those bounds the desired median of
185 # the whole matrix may be.
186 sumP = sum(P)
187 sumQ = sum(Q)
188 if mid_idx <= sumP:
189 R = P
190 Rtot = sumP
191 elif mid_idx > sumQ:
192 L = Q
193 Ltot = sumQ
194 else:
195 return Am
196
197 # Uh, implement this part...
198 import ipdb; ipdb.set_trace()
199
200
201 def signum(x):
202 if x > 0:
203 return 1
204 if x < 0:
205 return -1
206 else:
207 return 0
208
209 def main():
210 import sys
211 fname = sys.argv[1]
212 with open(fname) as f:
213 data = [float(x) for x in f.readlines() if x.strip() != ""]
214 print medcouple_1d(data)
215
216 if __name__ == "__main__":
217 main()