view medcouple.c++ @ 74:305b7361a5bd default tip @

showalgo: save a snapshot instead of waiting for keyboard input
author Jordi Gutiérrez Hermoso <jordigh@octave.org>
date Sun, 29 May 2016 19:05:01 -0400
parents 9b6502f3f3bc
children
line wrap: on
line source

/*
  Algorithm for the skewness estimator medcouple (MC)
  --------------------------------------------------
  ( originally matlabmc.c and also  mc/mcrsoft/spmc.c )
*/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <algorithm>

#include <inttypes.h>
// -> int64_t

template <typename T> int sign(T val) {
    return (T(0) < val) - (val < T(0));
}

/* Interface routines to be called via .C() and those from API : */
#include "wgt_himed.c"
/*
  including

  whimed_i(a,iw,n): the weighted high median of an array a of length n,
  using the positive integer weights iw[].

  * which is in ./wgt_himed.c_templ
  *		 ~~~~~~~~~~~~~~~~~
  */

/* Includes the auxiliary function

   h_kern(a,b, ai,bi,ab, eps):	the values h(a,b)  needed to compute the mc
*/
static
double h_kern(double a, double b, int ai, int bi, int ab, double eps);

/* MM:	The tolerance  'eps1' and 'eps2' can now be passed from R;
 *	the original code had only one 'eps' for both and hardcoded
 *	   eps =  0.0000000000001;  /.* == 1e-13 )) *./
 */
/* MK:  eps1: for (relative) "equality" checks
 *      eps2: used to check for over- and underflow, respectively
 *      therefore I suggest eps1 = DBL_EPS and eps2 = DBL_MIN
 */
double mc_C(double *z, int n, double *eps, int trace_lev)
{
  /* NOTE:
     eps	  = c(eps1, eps2)
     iter := c(maxit, trace.lev)  as input
     = c(it, converged)     as output
  */
  int it = 0;
  bool converged = true;
  double medc; // "the" result

  if (n < 3) {
    return 0.;
  }
  /* copy data before sort()ing in place, also reflecting it */
  /* NOTE: x[0] is empty --- so 1-indexing is used (MM: danger!) */
  double *x  = (double *) malloc((n+1)*sizeof(double));
  x[0] = 0;
  for (int i = 0; i < n; i++)
    x[i+1] = -z[i];

  std::sort(&x[1], &x[1] + n); /* full sort */

  double xmed; // := median( x[1:n] ) = - median( z[0:(n-1)] ):
  if (n%2) { /* n even */
    xmed = x[(n/2)+1];
  }
  else { /* n  odd */
    int ind = (n/2);
    xmed = (x[ind] + x[ind+1])/2;
  }

  if (fabs(x[1] - xmed) < eps[0] * (eps[0] + fabs(xmed))) {
    return -1;
  } else if (fabs(x[n] - xmed) < eps[0] * (eps[0] + fabs(xmed))) {
    return 1;
  }
  /* else : median is not at the border ------------------- */

  if(trace_lev)
    printf("mc_C(z[1:%d], trace_lev=%d): Median = %g (not at the border)\n", 
           n, trace_lev, -xmed);

  int i,j;
  /* center x[] wrt median --> such that then  median( x[1:n] ) == 0 */
  for (i = 1; i <= n; i++)
    x[i] -= xmed;

  /* Now scale to inside [-0.5, 0.5] and flip sign such that afterwards
   *  x[1] >= x[2] >= ... >= x[n] */
  double xden = -2 * std::max(-x[1], x[n]);
  for (i = 1; i <= n; i++)
    x[i] /= xden;
  xmed /= xden;
  if(trace_lev >= 2)
    printf(" x[] has been rescaled (* 1/s) with s = %g\n", -xden);

  j = 1;
  double x_eps = eps[0] * (eps[0] + fabs(xmed));
  while (x[j] > x_eps && j <= n) { /* test relative to xmed */
    /* x1[j] = x[j]; */
    j++;
  }
  if(trace_lev >= 2)
    printf("   x1[] := {x | x_j > x_eps = %g}    has %d (='j-1') entries\n", 
           x_eps, j-1);
  i = 1;
  double *x2 = x+j-1; /* pointer -- corresponding to  x2[i] = x[j]; */
  while (x[j] > -x_eps && j <= n) { /* test relative to xmed */
    /* x1[j] = x[j]; */
    /* x2[i] = x[j]; */
    j++;
    i++;
  }
  /* now  x1[] := {x | x_j > -eps}  also includes the median (0) */
  if(trace_lev >= 2)
    printf("'median-x' {x | -eps < x_i <= eps} has %d (= 'k') entries\n",
           i-1);
  int h1 = j-1, /* == size of x1[] == the sum of those two sizes above */
    /* conceptually,  x2[] := {x | x_j <= eps}   (which includes the median 0) */
    h2 = i + (n-j);// == size of x2[] == maximal size of whimed() arrays

  if(trace_lev)
    printf("  now allocating 2+5 work arrays of size (1+) h2=%d each:\n", h2);
  /* work arrays for whimed_i() :  allocate *once* only !! */
  double *acand  = (double *) malloc(h2*sizeof(double)),
    *a_srt  = (double *) malloc(h2*sizeof(double));

  int    *iw_cand= (int *)	malloc(h2*sizeof(int)),
    /* work arrays for the fast-median-of-table algorithm:
     *  currently still with  1-indexing */
    *left  = (int *) malloc((h2+1)*sizeof(int)),
    *right = (int *) malloc((h2+1)*sizeof(int)),
    *p     = (int *) malloc((h2+1)*sizeof(int)),
    *q     = (int *) malloc((h2+1)*sizeof(int));

  for (i = 1; i <= h2; i++) {
    left [i] = 1;
    right[i] = h1;
  }
  int64_t nr = ((int64_t) h1) * ((int64_t) h2), // <-- careful to *NOT* overflow
    knew = nr/2 +1;
  if(trace_lev >= 2)
    printf(" (h1,h2, nr, knew) = (%d,%d, %.0f, %.0f)\n",
           h1,h2, (double)nr, (double)knew);

  double trial = -2./* -Wall */;
  double *work= (double *) malloc(n*sizeof(double));
  int	   *iwt = (int *)    malloc(n*sizeof(int));
  bool IsFound = false;
  int nl = 0,
    neq = 0;
  /* MK:  'neq' counts the number of observations in the
   *      inside the tolerance range, i.e., where left > right + 1,
   *      since we would miss those when just using 'nl-nr'.
   *      This is to prevent index overflow in work[] later on.
   *      left might be larger than right + 1 since we are only
   *      testing with accuracy eps_trial and therefore there might
   *      be more than one observation in the `tolerance range`
   *      between < and <=.
   */
  while (!IsFound && (nr-nl+neq > n) && it < 1e5)
  {
    int64_t sum_p, sum_q;
    it++;
    j = 0;
    for (i = 1; i <= h2; i++)
	    if (left[i] <= right[i]) {
        iwt[j] = right[i] - left[i]+1;
        int k = left[i] + (iwt[j]/2);
        work[j] = h_kern(x[k], x2[i], k, i, h1+1, eps[1]);
        j++;
	    }
    if(trace_lev >= 4) {
	    printf(" before whimed(): work and iwt, each [0:(%d-1)]:\n", j);
	    if(j >= 100) {
        for(i=0; i < 90; i++) printf(" %8g", work[i]); printf("\n  ... ");
        for(i=j-4; i < j; i++)printf(" %8g", work[i]); printf("\n");
        for(i=0; i < 90; i++) printf(" %8d", iwt [i]); printf("\n  ... ");
        for(i=j-4; i < j; i++)printf(" %8d", iwt [i]); printf("\n");
	    } else { // j <= 99
        for(i=0; i < j; i++) printf(" %8g", work[i]); printf("\n");
        for(i=0; i < j; i++) printf(" %8d", iwt [i]); printf("\n");
	    }
    }
    trial = whimed_i(work, iwt, j, acand, a_srt, iw_cand);
    double eps_trial = eps[0] * (eps[0] + fabs(trial));
    if(trace_lev >= 3)
	    printf("%2s it=%2d, whimed(*, n=%6d)= %8g ", " ", it, j, trial);

    j = 1;
    for (i = h2; i >= 1; i--) {
	    while (j <= h1 && h_kern(x[j],x2[i],j,i,h1+1,eps[1]) - trial > eps_trial) {
        // while (j <= h1 && h_kern(x[j],x2[i],j,i,h1+1,eps[1]) > trial) {
        if (trace_lev >= 5)
          printf("\nj=%3d, i=%3d, x[j]=%g, x2[i]=%g, h=%g", 
                 j, i, x[j], x2[i],
                 h_kern(x[j],x2[i],j,i,h1+1,eps[1]));
        j++;
	    }
      /* 	    for(; j <= h1; j++) { */
      /* 		register double h = h_kern(x[j],x2[i],j,i,h1+1,eps[1]); */
      /* 		if(h > trial) break; */
      /* 	    } */
	    p[i] = j-1;
    }
    j = h1;
    for (i = 1, sum_p=0, sum_q=0; i <= h2; i++) {
	    while (j >= 1 && trial - h_kern(x[j],x2[i],j,i,h1+1,eps[1]) > eps_trial)
        // while (j >= 1 && h_kern(x[j],x2[i],j,i,h1+1,eps[1]) < trial)
        j--;
	    q[i] = j+1;

	    sum_p += p[i];
	    sum_q += j;/* = q[i]-1 */
    }

    if(trace_lev >= 3) {
	    if (trace_lev == 3)
        printf("sum_(p,q)= (%.0f,%.0f)", (double)sum_p, (double)sum_q);
	    else { /* trace_lev >= 4 */
        printf("\n%3s p[1:%d]:", "", h2);
        bool lrg = h2 >= 100; 
        int i_m = lrg ? 95 : h2;
        for(i = 1; i <= i_m; i++) printf(" %2d", p[i]); if(lrg) printf(" ...");
        printf(" sum=%4.0f\n%3s q[1:%d]:", (double)sum_p, "", h2);
        for(i = 1; i <= i_m; i++) printf(" %2d", q[i]); if(lrg) printf(" ...");
        printf(" sum=%4.0f\n", (double)sum_q);
	    }
    }

    if (knew <= sum_p) {
	    if(trace_lev >= 3)
        printf("; sum_p >= kn\n");
	    for (i = 1, neq = 0; i <= h2; i++) {
        right[i] = p[i];
        if (left[i] > right[i]+1) neq += left[i]-right[i]-1;
	    }
	    nr = sum_p;
    }
    else { /* knew > sum_p */
	    IsFound = (knew <= sum_q); /* i.e. sum_p < knew <= sum_q */;

	    if(trace_lev >= 3)
        printf("; s_p < kn ?<=? s_q: %s\n", IsFound ? "TRUE": "no");
	    if(IsFound) {
        medc = trial;
	    } else { /*	 knew > sum_q */
        for (i = 1; i <= h2; i++) {
          left[i] = q[i];
          if (left[i] > right[i]+1) neq += left[i]-right[i]-1;
        }
        nl = sum_q;
	    }
    }

  } /* end while loop */

  converged = IsFound || (nr-nl+neq <= n);
  if(!converged) {
    /* still: */
    medc = trial;
  }

  if (converged && !IsFound) { /* e.g., for  mc(1:4) : */
    j = 0;
    for (i = 1; i <= h2; i++) {
	    if (left[i] <= right[i]) {
        for (int k = left[i]; k <= right[i]; k++) {
          work[j] = -h_kern(x[k],x2[i],k,i,h1+1,eps[1]);
          j++;
        }
	    }
    }
    if(trace_lev)
	    printf("  not found [it=%d,  (nr,nl) = (%d,%d)],"
             " -> (knew-nl, j) = (%d,%d)\n",
             it, nr, nl, knew-nl, j);
    /* using rPsort(work, n,k), since we don't need work[] anymore:*/
    std::nth_element(work, work + knew-nl-1, work + j);
    medc = - work[knew-nl-1];
  }

  if(converged && trace_lev >= 2)
    printf("converged in %d iterations\n", it);

  return medc;

} /* end{ mc_C } */


static
double h_kern(double a, double b, int ai, int bi, int ab, double eps)
{
  /*     if (fabs(a-b) <= DBL_MIN) */
  /* check for zero division and positive b */
  if (fabs(a-b) < 2.0*eps || b > 0)
    return sign((double)(ab - (ai+bi)));

  /* else */
  return (a+b)/(a-b);
}

#include <vector>
#include <iostream>
#include <string>
#include <fstream>
#include <iomanip>
#include <sstream>

int main(int argc, char** argv) {
  using namespace std;

  vector<double> eps = {numeric_limits<double>::epsilon(), 
                        numeric_limits<double>::min()};

  string fname;
  if (argc > 1) {
    fname = argv[1];
  }
  else {
    cerr << "Please provide input file." << endl;
    return 1;
  }

  double trace_lev = 0;
  if(argc > 2) {
    stringstream ss;
    ss << argv[2];
    ss >> trace_lev;
  }

  ifstream ifs(fname);
  if (not ifs) {
    cerr << "File " << fname << " not found." << endl;
    return 1;
  }

  vector<double> data;  
  double datum;
  while (ifs >> datum) {
    data.push_back(datum);
  }
  
  cout << setprecision(16) << mc_C(&data[0], data.size(), &eps[0], trace_lev) << endl;

  return 0;
}