view medcouple.py @ 52:242afe8021b4

medcouple.py: make sure data is floats
author Jordi Gutiérrez Hermoso <jordigh@octave.org>
date Mon, 16 May 2016 22:26:11 -0400
parents 2493b21d7d8b
children
line wrap: on
line source

#!/usr/bin/python
# -*- coding: utf-8 -*-

# medcouple.py ---

# Copyright  © 2015 Jordi Gutiérrez Hermoso <jordigh@octave.org>

# Author: Jordi Gutiérrez Hermoso

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


import random

from itertools import tee, izip

def wmedian(A, W):
    """This computes the weighted median of array A with corresponding
    weights W.
    """

    AW = zip(A, W)
    n = len(AW)

    wtot = sum(W)

    beg = 0
    end = n - 1

    while True:
        mid = (beg + end)//2

        AW = sorted(AW, key = lambda x: x[0]) # A partial sort would suffice here

        trial = AW[mid][0]

        wleft = wright = 0
        for (a, w) in AW:
            if a < trial:
                wleft += w
            else:
                # This also includes a == trial, i.e. the "middle"
                # weight.
                wright += w

        if 2*wleft > wtot:
            end = mid
        elif 2*wright < wtot:
            beg = mid
        else:
            return trial


def medcouple_1d(X, eps1 = 2**-52, eps2 = 2**-1022):
    """Calculates the medcouple robust measure of skewness.

    Parameters
    ----------
    y : array-like, 1-d

    Returns
    -------
    mc : float
        The medcouple statistic

    .. [1] G. Brys, M. Hubert, and A. Struyf "A Robust Measure of
    Skewness." Journal of Computational and Graphical Statistics, Vol.
    13, No. 4 (Dec., 2004), pp. 996- 1017

    .. [2] D. B. Johnson and T. Mizoguchi "Selecting the Kth Element
    in $X + Y$ and $X_1 + X_2 + \cdots X_m$". SIAM Journal of
    Computing, Vol. 7, No. 2 (May 1978), pp. 147-53.

    """
    # FIXME: Figure out what to do about NaNs.

    n = len(X)
    n2 = (n - 1)//2

    if n < 3:
        return 0

    Z = sorted([float(x) for x in X], reverse=True)

    if n % 2 == 1:
        Zmed = Z[n2]
    else:
        Zmed = (Z[n2] + Z[n2 + 1])/2

    #Check if the median is at the edges up to relative epsilon
    if abs(Z[0] - Zmed) < eps1*(eps1 + abs(Zmed)):
        return -1.0
    if abs(Z[-1] - Zmed) < eps1*(eps1 + abs(Zmed)):
        return 1.0

    # Centre Z wrt median, so that median(Z) = 0.
    Z = [z - Zmed for z in Z]

    # Scale inside [-0.5, 0.5], for greater numerical stability.
    Zden = 2*max(Z[0], -Z[-1])
    Z = [z/Zden for z in Z]
    Zmed /= Zden

    Zeps = eps1*(eps1 + abs(Zmed))

    # These overlap on the entries that are tied with the median
    Zplus   = [z for z in Z if z >= -Zeps]
    Zminus  = [z for z in Z if Zeps >= z]

    n_plus = len(Zplus)
    n_minus = len(Zminus)


    def h_kern(i, j):
        """Kernel function h for the medcouple, closing over the values of
        Zplus and Zminus just defined above.

        In case a and be are within epsilon of the median, the kernel
        is the signum of their position.

        """
        a = Zplus[i]
        b = Zminus[j]

        if abs(a - b) <= 2*eps2:
            h = signum(n_plus - 1 -  i - j)
        else:
            h = (a + b)/(a - b)

        return h

    # Init left and right borders
    L = [0]*n_plus
    R = [n_minus - 1]*n_plus

    Ltot = 0
    Rtot = n_minus*n_plus
    medc_idx = Rtot//2

    # kth pair algorithm (Johnson & Mizoguchi)
    while Rtot - Ltot > n_plus:

        # First, compute the median inside the given bounds
        # (Be stingy, reuse same generator)
        [I1, I2] = tee(i for i in xrange(0, n_plus) if L[i] <= R[i])

        A = [h_kern(i, (L[i] + R[i])//2) for i in I1]
        W = [R[i] - L[i] + 1 for i in I2]
        Am = wmedian(A, W)

        Am_eps = eps1*(eps1 + abs(Am))

        # Compute new left and right boundaries, based on the weighted
        # median
        P = []
        Q = []

        j = 0
        for i in xrange(n_plus - 1, -1, -1):
            while j < n_minus and h_kern(i, j) - Am > Am_eps:
                j += 1
            P.append(j - 1)

        P.reverse()

        j = n_minus - 1
        for i in xrange(0, n_plus):
            while j >= 0 and h_kern(i, j) - Am < -Am_eps:
                j -= 1
            Q.append(j + 1)

        # Check on which side of those bounds the desired median of
        # the whole matrix may be.
        sumP = sum(P) + len(P)
        sumQ = sum(Q)

        if medc_idx <= sumP - 1:
            R = P
            Rtot = sumP
        else:
            if medc_idx > sumQ - 1:
                L = Q
                Ltot = sumQ
            else:
                return Am

    # Didn't find the median, but now we have a very small search
    # space to find it in, just between the left and right boundaries.
    # This space is of size Rtot - Ltot which is <= n_plus
    A = []
    for (i, (l, r)) in enumerate(izip(L, R)):
        for j in xrange(l, r + 1):
            A.append(h_kern(i, j))

    A.sort()     # A partial sort would suffice here
    A.reverse()

    Am = A[medc_idx - Ltot]

    return Am


def signum(x):
    if x > 0:
        return 1
    if x < 0:
        return -1
    else:
        return 0


def main():
    import sys
    fname = sys.argv[1]
    with open(fname) as f:
        data = [float(x) for x in f.readlines() if x.strip() != ""]

    print "%.16g" % medcouple_1d(data)

if __name__ == "__main__":
    main()