Mercurial > hg > medcouple
view medcouple.c++ @ 74:305b7361a5bd default tip @
showalgo: save a snapshot instead of waiting for keyboard input
author | Jordi Gutiérrez Hermoso <jordigh@octave.org> |
---|---|
date | Sun, 29 May 2016 19:05:01 -0400 |
parents | 9b6502f3f3bc |
children |
line wrap: on
line source
/* Algorithm for the skewness estimator medcouple (MC) -------------------------------------------------- ( originally matlabmc.c and also mc/mcrsoft/spmc.c ) */ #include <stdio.h> #include <stdlib.h> #include <math.h> #include <algorithm> #include <inttypes.h> // -> int64_t template <typename T> int sign(T val) { return (T(0) < val) - (val < T(0)); } /* Interface routines to be called via .C() and those from API : */ #include "wgt_himed.c" /* including whimed_i(a,iw,n): the weighted high median of an array a of length n, using the positive integer weights iw[]. * which is in ./wgt_himed.c_templ * ~~~~~~~~~~~~~~~~~ */ /* Includes the auxiliary function h_kern(a,b, ai,bi,ab, eps): the values h(a,b) needed to compute the mc */ static double h_kern(double a, double b, int ai, int bi, int ab, double eps); /* MM: The tolerance 'eps1' and 'eps2' can now be passed from R; * the original code had only one 'eps' for both and hardcoded * eps = 0.0000000000001; /.* == 1e-13 )) *./ */ /* MK: eps1: for (relative) "equality" checks * eps2: used to check for over- and underflow, respectively * therefore I suggest eps1 = DBL_EPS and eps2 = DBL_MIN */ double mc_C(double *z, int n, double *eps, int trace_lev) { /* NOTE: eps = c(eps1, eps2) iter := c(maxit, trace.lev) as input = c(it, converged) as output */ int it = 0; bool converged = true; double medc; // "the" result if (n < 3) { return 0.; } /* copy data before sort()ing in place, also reflecting it */ /* NOTE: x[0] is empty --- so 1-indexing is used (MM: danger!) */ double *x = (double *) malloc((n+1)*sizeof(double)); x[0] = 0; for (int i = 0; i < n; i++) x[i+1] = -z[i]; std::sort(&x[1], &x[1] + n); /* full sort */ double xmed; // := median( x[1:n] ) = - median( z[0:(n-1)] ): if (n%2) { /* n even */ xmed = x[(n/2)+1]; } else { /* n odd */ int ind = (n/2); xmed = (x[ind] + x[ind+1])/2; } if (fabs(x[1] - xmed) < eps[0] * (eps[0] + fabs(xmed))) { return -1; } else if (fabs(x[n] - xmed) < eps[0] * (eps[0] + fabs(xmed))) { return 1; } /* else : median is not at the border ------------------- */ if(trace_lev) printf("mc_C(z[1:%d], trace_lev=%d): Median = %g (not at the border)\n", n, trace_lev, -xmed); int i,j; /* center x[] wrt median --> such that then median( x[1:n] ) == 0 */ for (i = 1; i <= n; i++) x[i] -= xmed; /* Now scale to inside [-0.5, 0.5] and flip sign such that afterwards * x[1] >= x[2] >= ... >= x[n] */ double xden = -2 * std::max(-x[1], x[n]); for (i = 1; i <= n; i++) x[i] /= xden; xmed /= xden; if(trace_lev >= 2) printf(" x[] has been rescaled (* 1/s) with s = %g\n", -xden); j = 1; double x_eps = eps[0] * (eps[0] + fabs(xmed)); while (x[j] > x_eps && j <= n) { /* test relative to xmed */ /* x1[j] = x[j]; */ j++; } if(trace_lev >= 2) printf(" x1[] := {x | x_j > x_eps = %g} has %d (='j-1') entries\n", x_eps, j-1); i = 1; double *x2 = x+j-1; /* pointer -- corresponding to x2[i] = x[j]; */ while (x[j] > -x_eps && j <= n) { /* test relative to xmed */ /* x1[j] = x[j]; */ /* x2[i] = x[j]; */ j++; i++; } /* now x1[] := {x | x_j > -eps} also includes the median (0) */ if(trace_lev >= 2) printf("'median-x' {x | -eps < x_i <= eps} has %d (= 'k') entries\n", i-1); int h1 = j-1, /* == size of x1[] == the sum of those two sizes above */ /* conceptually, x2[] := {x | x_j <= eps} (which includes the median 0) */ h2 = i + (n-j);// == size of x2[] == maximal size of whimed() arrays if(trace_lev) printf(" now allocating 2+5 work arrays of size (1+) h2=%d each:\n", h2); /* work arrays for whimed_i() : allocate *once* only !! */ double *acand = (double *) malloc(h2*sizeof(double)), *a_srt = (double *) malloc(h2*sizeof(double)); int *iw_cand= (int *) malloc(h2*sizeof(int)), /* work arrays for the fast-median-of-table algorithm: * currently still with 1-indexing */ *left = (int *) malloc((h2+1)*sizeof(int)), *right = (int *) malloc((h2+1)*sizeof(int)), *p = (int *) malloc((h2+1)*sizeof(int)), *q = (int *) malloc((h2+1)*sizeof(int)); for (i = 1; i <= h2; i++) { left [i] = 1; right[i] = h1; } int64_t nr = ((int64_t) h1) * ((int64_t) h2), // <-- careful to *NOT* overflow knew = nr/2 +1; if(trace_lev >= 2) printf(" (h1,h2, nr, knew) = (%d,%d, %.0f, %.0f)\n", h1,h2, (double)nr, (double)knew); double trial = -2./* -Wall */; double *work= (double *) malloc(n*sizeof(double)); int *iwt = (int *) malloc(n*sizeof(int)); bool IsFound = false; int nl = 0, neq = 0; /* MK: 'neq' counts the number of observations in the * inside the tolerance range, i.e., where left > right + 1, * since we would miss those when just using 'nl-nr'. * This is to prevent index overflow in work[] later on. * left might be larger than right + 1 since we are only * testing with accuracy eps_trial and therefore there might * be more than one observation in the `tolerance range` * between < and <=. */ while (!IsFound && (nr-nl+neq > n) && it < 1e5) { int64_t sum_p, sum_q; it++; j = 0; for (i = 1; i <= h2; i++) if (left[i] <= right[i]) { iwt[j] = right[i] - left[i]+1; int k = left[i] + (iwt[j]/2); work[j] = h_kern(x[k], x2[i], k, i, h1+1, eps[1]); j++; } if(trace_lev >= 4) { printf(" before whimed(): work and iwt, each [0:(%d-1)]:\n", j); if(j >= 100) { for(i=0; i < 90; i++) printf(" %8g", work[i]); printf("\n ... "); for(i=j-4; i < j; i++)printf(" %8g", work[i]); printf("\n"); for(i=0; i < 90; i++) printf(" %8d", iwt [i]); printf("\n ... "); for(i=j-4; i < j; i++)printf(" %8d", iwt [i]); printf("\n"); } else { // j <= 99 for(i=0; i < j; i++) printf(" %8g", work[i]); printf("\n"); for(i=0; i < j; i++) printf(" %8d", iwt [i]); printf("\n"); } } trial = whimed_i(work, iwt, j, acand, a_srt, iw_cand); double eps_trial = eps[0] * (eps[0] + fabs(trial)); if(trace_lev >= 3) printf("%2s it=%2d, whimed(*, n=%6d)= %8g ", " ", it, j, trial); j = 1; for (i = h2; i >= 1; i--) { while (j <= h1 && h_kern(x[j],x2[i],j,i,h1+1,eps[1]) - trial > eps_trial) { // while (j <= h1 && h_kern(x[j],x2[i],j,i,h1+1,eps[1]) > trial) { if (trace_lev >= 5) printf("\nj=%3d, i=%3d, x[j]=%g, x2[i]=%g, h=%g", j, i, x[j], x2[i], h_kern(x[j],x2[i],j,i,h1+1,eps[1])); j++; } /* for(; j <= h1; j++) { */ /* register double h = h_kern(x[j],x2[i],j,i,h1+1,eps[1]); */ /* if(h > trial) break; */ /* } */ p[i] = j-1; } j = h1; for (i = 1, sum_p=0, sum_q=0; i <= h2; i++) { while (j >= 1 && trial - h_kern(x[j],x2[i],j,i,h1+1,eps[1]) > eps_trial) // while (j >= 1 && h_kern(x[j],x2[i],j,i,h1+1,eps[1]) < trial) j--; q[i] = j+1; sum_p += p[i]; sum_q += j;/* = q[i]-1 */ } if(trace_lev >= 3) { if (trace_lev == 3) printf("sum_(p,q)= (%.0f,%.0f)", (double)sum_p, (double)sum_q); else { /* trace_lev >= 4 */ printf("\n%3s p[1:%d]:", "", h2); bool lrg = h2 >= 100; int i_m = lrg ? 95 : h2; for(i = 1; i <= i_m; i++) printf(" %2d", p[i]); if(lrg) printf(" ..."); printf(" sum=%4.0f\n%3s q[1:%d]:", (double)sum_p, "", h2); for(i = 1; i <= i_m; i++) printf(" %2d", q[i]); if(lrg) printf(" ..."); printf(" sum=%4.0f\n", (double)sum_q); } } if (knew <= sum_p) { if(trace_lev >= 3) printf("; sum_p >= kn\n"); for (i = 1, neq = 0; i <= h2; i++) { right[i] = p[i]; if (left[i] > right[i]+1) neq += left[i]-right[i]-1; } nr = sum_p; } else { /* knew > sum_p */ IsFound = (knew <= sum_q); /* i.e. sum_p < knew <= sum_q */; if(trace_lev >= 3) printf("; s_p < kn ?<=? s_q: %s\n", IsFound ? "TRUE": "no"); if(IsFound) { medc = trial; } else { /* knew > sum_q */ for (i = 1; i <= h2; i++) { left[i] = q[i]; if (left[i] > right[i]+1) neq += left[i]-right[i]-1; } nl = sum_q; } } } /* end while loop */ converged = IsFound || (nr-nl+neq <= n); if(!converged) { /* still: */ medc = trial; } if (converged && !IsFound) { /* e.g., for mc(1:4) : */ j = 0; for (i = 1; i <= h2; i++) { if (left[i] <= right[i]) { for (int k = left[i]; k <= right[i]; k++) { work[j] = -h_kern(x[k],x2[i],k,i,h1+1,eps[1]); j++; } } } if(trace_lev) printf(" not found [it=%d, (nr,nl) = (%d,%d)]," " -> (knew-nl, j) = (%d,%d)\n", it, nr, nl, knew-nl, j); /* using rPsort(work, n,k), since we don't need work[] anymore:*/ std::nth_element(work, work + knew-nl-1, work + j); medc = - work[knew-nl-1]; } if(converged && trace_lev >= 2) printf("converged in %d iterations\n", it); return medc; } /* end{ mc_C } */ static double h_kern(double a, double b, int ai, int bi, int ab, double eps) { /* if (fabs(a-b) <= DBL_MIN) */ /* check for zero division and positive b */ if (fabs(a-b) < 2.0*eps || b > 0) return sign((double)(ab - (ai+bi))); /* else */ return (a+b)/(a-b); } #include <vector> #include <iostream> #include <string> #include <fstream> #include <iomanip> #include <sstream> int main(int argc, char** argv) { using namespace std; vector<double> eps = {numeric_limits<double>::epsilon(), numeric_limits<double>::min()}; string fname; if (argc > 1) { fname = argv[1]; } else { cerr << "Please provide input file." << endl; return 1; } double trace_lev = 0; if(argc > 2) { stringstream ss; ss << argv[2]; ss >> trace_lev; } ifstream ifs(fname); if (not ifs) { cerr << "File " << fname << " not found." << endl; return 1; } vector<double> data; double datum; while (ifs >> datum) { data.push_back(datum); } cout << setprecision(16) << mc_C(&data[0], data.size(), &eps[0], trace_lev) << endl; return 0; }