view pymedcouple @ 6:e3b1dcc51e6a

Fix tee usage
author Jordi Gutiérrez Hermoso <jordigh@octave.org>
date Tue, 13 Jan 2015 17:26:39 -0500
parents cbe17f888c79
children 6c7c7dc9d8ef
line wrap: on
line source

#!/usr/bin/python

import random
import numpy as np

from itertools import tee

def partsort(L, n):
    """This is a partial sort of the weighted list L = [(a,w) for a in A,
    w in W], so that the element at position n is in the correct spot.
    Also known as quickselect. The elements of W are ignored for the
    purposes of this partial sort.
    """
    
    beg = 0
    end = len(L)
    
    while True:
        pivot = random.randint(beg,end-1)
        pivotval = L[pivot][0]
        L[pivot], L[end-1] = L[end-1], L[pivot]
        
        idx = beg
        for i in xrange(beg, end):
            if L[i][0] < pivotval:
                L[idx], L[i] = L[i], L[idx]
                idx += 1

        L[idx], L[end-1] = L[end-1], L[idx]

        if idx == n:
            return L[idx]
        elif idx < n:
            beg = idx+1
        else:
            end = idx

def wmedian(A, W):
    """This computes the weighted median of array A with corresponding
    weights W.
    """
    
    AW = zip(A,W)
    n = len(AW)

    wtot = sum(W)

    beg = 0
    end = n

    while True:
        mid = (beg + end)//2
        
        partsort(AW, mid)
        trial = AW[mid][0]

        wleft = wright = 0
        for (a,w) in AW:
            if a < trial:
                wleft += w
            else:
                # This also includes a == trial, i.e. the "middle"
                # weight.
                wright += w

        if 2*wleft > wtot:
            end = mid
        elif 2*wright <= wtot:
            beg = mid
        else:
            return trial

def medcouple_1d(X, eps1 = 2**-52, eps2 = 2**-1022 ):
    """Calculates the medcouple robust measure of skewness.

    Parameters
    ----------
    y : array-like, 1-d

    Returns
    -------
    mc : float
        The medcouple statistic
   
    .. [1] G. Brys, M. Hubert, and A. Struyf "A Robust Measure of
    Skewness." Journal of Computational and Graphical Statistics, Vol.
    13, No. 4 (Dec., 2004), pp. 996- 1017

    .. [2] D. B. Johnson and T. Mizoguchi "Selecting the Kth Element
    in $X + Y$ and $X_1 + X_2 + \cdots X_m$". SIAM Journal of
    Computing, 7, 147-53.

    """
    # FIXME: Figure out what to do about NaNs.
    
    n = len(X)
    n2 = (n-1)//2

    if n < 3:
        return 0

    Z = sorted(X, reverse=True)

    if n % 2 == 1:
        Zmed = Z[n2 + 1]
    else:
        Zmed = (Z[n2] + Z[n2 + 1])/2
    
    #Check if the median is at the edges up to relative epsilon
    if abs(Z[0] - Zmed) < eps1*(eps1 + abs(Zmed)):
        return -1.0
    if abs(Z[-1] - Zmed) < eps1*(eps1 + abs(Zmed)):
        return 1.0
    
    # Centre Z wrt median, so that median(Z) = 0.
    Z = [z - Zmed for z in Z]

    # Scale inside [-0.5, 0.5], for greater numerical stability.
    Zden = -2*max(-Z[0], Z[-1])
    Z = [z/Zden for z in Z]
    Zmed /= Zden
    
    Zeps = eps1*(eps1 + abs(Zmed))

    # These overlap on the entries that are tied with the median
    Zplus   = [z for z in Z if z >= -Zeps]
    Zminus  = [z for z in Z if Zeps >= z]

    n_plus = len(Zplus)
    n_minus = len(Zminus)

    def h_kern(i, j):
        """Kernel function h for the medcouple, closing over the values of
        Zplus and Zminus just defined above.

        In case a and be are within epsilon of the median, the kernel
        is the signum of their position.

        """
        a = Zplus[i]
        b = Zminus[j]

        if abs(a - b) <= 2*eps2:
            return signum(n_plus - 1 -  i - j)

        return (a+b)/(a-b)

    # Init left and right borders
    L = [0]*n_plus
    R = [n_plus-1]*n_plus

    Ltot = 0
    Rtot = n_plus*n_minus

    mid_idx = Rtot//2

    # kth pair algorithm (Johnson & Mizoguchi)
    while Rtot - Ltot > n_plus:
        
        # First, compute the median inside the given bounds
        # (Be stingy, reuse same generator)
        [I1, I2] = tee(i for i in xrange(0, n_plus) if L[i] <= R[i])

        A = [h_kern(i, (L[i] + R[i])//2) for i in I1]
        W = [R[i] - L[i] for i in I2]
        Am = wmedian(A,W)

        # Compute new left and right boundaries, based on the weighted
        # median
        P = Q = []

        j = -1
        for i in xrange(n_plus - 1, -1, -1):
            while j < n_plus - 1 and h_kern(i, j) > Am:
                j += 1
            P.append(j)
        P.reverse()

        j = n_plus
        for i in xrange(0, n_plus):
            while j > -1 and h_kern(i, j) < Am:
                j -= 1
            Q.append(j)

        # Check on which side of those bounds the desired median of
        # the whole matrix may be.
        sumP = sum(P)
        sumQ = sum(Q)
        if mid_idx <= sumP:
            R = P
            Rtot = sumP
        elif mid_idx > sumQ:
            L = Q
            Ltot = sumQ
        else:
            return Am

    # Uh, implement this part...
    import ipdb; ipdb.set_trace()
    

def signum(x):
    if x > 0:
        return 1
    if x < 0:
        return -1
    else:
        return 0

def main():
    import sys
    fname = sys.argv[1]
    with open(fname) as f:
        data = [float(x) for x in f.readlines() if x.strip() != ""]
    print medcouple_1d(data)

if __name__ == "__main__":
    main()