Mercurial > hg > medcouple

#!/usr/bin/python
# -*- coding: utf-8 -*-

# medcouple.py ---

# Copyright  © 2015 Jordi Gutiérrez Hermoso <jordigh@octave.org>

# Author: Jordi Gutiérrez Hermoso

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import sys

import numpy as np

from itertools import tee, izip

def wmedian(A, W):
    """This computes the weighted median of array A with corresponding
    weights W.
    """

    AW = zip(A, W)
    n = len(AW)

    wtot = sum(W)

    beg = 0
    end = n - 1

    while True:
        mid = (beg + end)//2

        AW = sorted(AW, key = lambda x: x[0]) # A partial sort would suffice here

        trial = AW[mid][0]

        wleft = wright = 0
        for (a, w) in AW:
            if a < trial:
                wleft += w
            else:
                # This also includes a == trial, i.e. the "middle"
                # weight.
                wright += w

        if 2*wleft > wtot:
            end = mid
        elif 2*wright < wtot:
            beg = mid
        else:
            return trial


def medcouple_1d(X, eps1 = 2**-52, eps2 = 2**-1022):
    """Calculates the medcouple robust measure of skewness.

    Parameters
    ----------
    y : array-like, 1-d

    Returns
    -------
    mc : float
        The medcouple statistic

    .. [1] G. Brys, M. Hubert, and A. Struyf "A Robust Measure of
    Skewness." Journal of Computational and Graphical Statistics, Vol.
    13, No. 4 (Dec., 2004), pp. 996- 1017

    .. [2] D. B. Johnson and T. Mizoguchi "Selecting the Kth Element
    in $X + Y$ and $X_1 + X_2 + \cdots X_m$". SIAM Journal of
    Computing, Vol. 7, No. 2 (May 1978), pp. 147-53.

    """

    # Yeah, yeah, leaky file handles; it'll be handled by the time the
    # program closes.
    Hfile = open("H", "w")
    Amfile = open("Am", "w")
    Rfile = open("R", "w")
    Lfile = open("L", "w")
    Pfile = open("P", "w")
    Qfile = open("Q", "w")
    RemFile = open("remaining", "w")
    Ptotfile = open("Ptotal", "w")
    Qtotfile = open("Qtotal", "w")
    medc_idxfile = open("medc_idx", "w")
    jackpot = open("jackpot", "w")

    # FIXME: Figure out what to do about NaNs.

    n = len(X)
    n2 = (n - 1)//2

    if n < 3:
        return 0

    Z = sorted([float(x) for x in X], reverse=True)

    if n % 2 == 1:
        Zmed = Z[n2]
    else:
        Zmed = (Z[n2] + Z[n2 + 1])/2

    #Check if the median is at the edges up to relative epsilon
    if abs(Z[0] - Zmed) < eps1*(eps1 + abs(Zmed)):
        return -1.0
    if abs(Z[-1] - Zmed) < eps1*(eps1 + abs(Zmed)):
        return 1.0

    # Centre Z wrt median, so that median(Z) = 0.
    Z = [z - Zmed for z in Z]

    # Scale inside [-0.5, 0.5], for greater numerical stability.
    Zden = 2*max(Z[0], -Z[-1])
    Z = [z/Zden for z in Z]
    Zmed /= Zden

    Zeps = eps1*(eps1 + abs(Zmed))

    # These overlap on the entries that are tied with the median
    Zplus   = [z for z in Z if z >= -Zeps]
    Zminus  = [z for z in Z if Zeps >= z]

    n_plus = len(Zplus)
    n_minus = len(Zminus)


    def h_kern(i, j):
        """Kernel function h for the medcouple, closing over the values of
        Zplus and Zminus just defined above.

        In case a and be are within epsilon of the median, the kernel
        is the signum of their position.

        """
        a = Zplus[i]
        b = Zminus[j]

        if abs(a - b) <= 2*eps2:
            h = signum(n_plus - 1 -  i - j)
        else:
            h = (a + b)/(a - b)

        return h

    for i, zp in enumerate(Zplus):
        for j, zm in enumerate(Zminus):
            Hfile.write("%s " % h_kern(i, j)),
        Hfile.write("\n")

    # Init left and right borders
    L = [0]*n_plus
    R = [n_minus - 1]*n_plus

    Ltot = 0
    Rtot = n_minus*n_plus
    medc_idx = Rtot//2
    medc_idxfile.write("%s\n" % medc_idx)

    Lfile.write("%s\n" % " ".join(str(x) for x in L))
    Rfile.write("%s\n" % " ".join(str(x) for x in R))
    RemFile.write("%s\n" % Rtot)

    # kth pair algorithm (Johnson & Mizoguchi)
    while Rtot - Ltot > n_plus:

        # First, compute the median inside the given bounds
        # (Be stingy, reuse same generator)
        [I1, I2] = tee(i for i in xrange(0, n_plus) if L[i] <= R[i])

        A = [h_kern(i, (L[i] + R[i])//2) for i in I1]
        W = [R[i] - L[i] + 1 for i in I2]
        Am = wmedian(A, W)

        Am_eps = eps1*(eps1 + abs(Am))

        Amfile.write("%s\n" % Am)

        # Compute new left and right boundaries, based on the weighted
        # median
        P = []
        Q = []

        j = 0
        for i in xrange(n_plus - 1, -1, -1):
            while j < n_minus and h_kern(i, j) - Am > Am_eps:
                j += 1
            P.append(j - 1)

        P.reverse()

        j = n_minus - 1
        for i in xrange(0, n_plus):
            while j >= 0 and h_kern(i, j) - Am < -Am_eps:
                j -= 1
            Q.append(j + 1)

        # Check on which side of those bounds the desired median of
        # the whole matrix may be.
        sumP = sum(P) + len(P)
        sumQ = sum(Q)

        Ptotfile.write("%s\n" % sumP)
        Qtotfile.write("%s\n" % sumQ)
        Pfile.write("%s\n" % " ".join(str(x) for x in P))
        Qfile.write("%s\n" % " ".join(str(x) for x in Q))

        if medc_idx <= sumP - 1:
            R = P
            Rtot = sumP
            Lfile.write("%s\n" % " ".join(str(x) for x in L))
            Rfile.write("%s\n" % " ".join(str(x) for x in R))
            RemFile.write("%s\n" % (Rtot - Ltot))
        else:
            if medc_idx > sumQ - 1:
                L = Q
                Ltot = sumQ
                Lfile.write("%s\n" % " ".join(str(x) for x in L))
                Rfile.write("%s\n" % " ".join(str(x) for x in R))
                RemFile.write("%s\n" % (Rtot - Ltot))
            else:
                jackpot.write("1")
                return Am

    # Didn't find the median, but now we have a very small search
    # space to find it in, just between the left and right boundaries.
    # This space is of size Rtot - Ltot which is <= n_plus
    jackpot.write("0")
    A = []
    for (i, (l, r)) in enumerate(izip(L, R)):
        for j in xrange(l, r + 1):
            A.append(h_kern(i, j))

    A.sort()     # A partial sort would suffice here
    A.reverse()

    Am = A[medc_idx - Ltot]

    return Am


def signum(x):
    if x > 0:
        return 1
    if x < 0:
        return -1
    else:
        return 0


def main():
    if len(sys.argv) < 2:
        print "Need a size"
        exit(1)

    data = np.random.uniform(size=int(sys.argv[1]))

    mc = medcouple_1d(data)
    with open("mc", "w") as f:
        f.write("%s\n" % mc)

if __name__ == "__main__":
    main()
author	Jordi Gutiérrez Hermoso <jordigh@octave.org>
date	Fri, 20 May 2016 08:43:48 -0400
parents	5510ac95a3d3
children	f81a65086498