Mercurial > hg > medcouple
changeset 0:e374cdd5e1e7
init
author | Jordi Gutiérrez Hermoso <jordigh@octave.org> |
---|---|
date | Tue, 13 Jan 2015 16:34:00 -0500 |
parents | |
children | 135f5ebf0726 |
files | mc.c medcouple.c octmedcouple wgt_himed.c wgt_himed_templ.h |
diffstat | 5 files changed, 586 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
new file mode 100644 --- /dev/null +++ b/mc.c @@ -0,0 +1,40 @@ +#include "mex.h" +#include "mlmc.c" +#include<stdio.h> + +void mexFunction( int nlhs, mxArray *plhs[], + int nrhs, const mxArray*prhs[] ) + +{ + double *yout; + double *yin; + long m,n,i; + + /* Check for proper number of arguments */ + + if (nrhs != 1) { + mexErrMsgTxt("One input argument required."); + } else if (nlhs > 1) { + mexErrMsgTxt("Too many output arguments."); + } + + m = mxGetM(prhs[0]); + n = mxGetN(prhs[0]); + + if (n!=1 && m==1) + { + mexErrMsgTxt("Input must be a columnvector."); + } + /* Create a matrix for the return argument */ + plhs[0] = mxCreateDoubleMatrix(1, n, mxREAL); + + /* Assign pointers to the various parameters */ + yout = mxGetPr(plhs[0]); + yin = mxGetPr(prhs[0]); + + for (i=0;i<n;i++) + mlmc(&yout[i],&yin[i*m],&m); + + return; + +}
new file mode 100644 --- /dev/null +++ b/medcouple.c @@ -0,0 +1,355 @@ +/* + Algorithm for the skewness estimator medcouple (MC) + -------------------------------------------------- + ( originally matlabmc.c and also mc/mcrsoft/spmc.c ) +*/ + +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <algorithm> + +#include <inttypes.h> +// -> int64_t + +template <typename T> int sign(T val) { + return (T(0) < val) - (val < T(0)); +} + +/* Interface routines to be called via .C() and those from API : */ +#include "wgt_himed.c" +/* + including + + whimed_i(a,iw,n): the weighted high median of an array a of length n, + using the positive integer weights iw[]. + + * which is in ./wgt_himed.c_templ + * ~~~~~~~~~~~~~~~~~ + */ + +/* Includes the auxiliary function + + h_kern(a,b, ai,bi,ab, eps): the values h(a,b) needed to compute the mc +*/ +static +double h_kern(double a, double b, int ai, int bi, int ab, double eps); + +/* MM: The tolerance 'eps1' and 'eps2' can now be passed from R; + * the original code had only one 'eps' for both and hardcoded + * eps = 0.0000000000001; /.* == 1e-13 )) *./ + */ +/* MK: eps1: for (relative) "equality" checks + * eps2: used to check for over- and underflow, respectively + * therefore I suggest eps1 = DBL_EPS and eps2 = DBL_MIN + */ +double mc_C(double *z, int n, double *eps) +{ + /* NOTE: + eps = c(eps1, eps2) + iter := c(maxit, trace.lev) as input + = c(it, converged) as output + */ + int trace_lev = 0, it = 0; + bool converged = true; + double medc; // "the" result + + if (n < 3) { + return 0.; + } + /* copy data before sort()ing in place, also reflecting it */ + /* NOTE: x[0] is empty --- so 1-indexing is used (MM: danger!) */ + double *x = (double *) malloc((n+1)*sizeof(double)); + x[0] = 0; + for (int i = 0; i < n; i++) + x[i+1] = -z[i]; + + std::sort(&x[1], &x[1] + n); /* full sort */ + + double xmed; // := median( x[1:n] ) = - median( z[0:(n-1)] ): + if (n%2) { /* n even */ + xmed = x[(n/2)+1]; + } + else { /* n odd */ + int ind = (n/2); + xmed = (x[ind] + x[ind+1])/2; + } + + if (fabs(x[1] - xmed) < eps[0] * (eps[0] + fabs(xmed))) { + return -1; + } else if (fabs(x[n] - xmed) < eps[0] * (eps[0] + fabs(xmed))) { + return 1; + } + /* else : median is not at the border ------------------- */ + + if(trace_lev) + printf("mc_C(z[1:%d], trace_lev=%d): Median = %g (not at the border)\n", + n, trace_lev, -xmed); + + int i,j; + /* center x[] wrt median --> such that then median( x[1:n] ) == 0 */ + for (i = 1; i <= n; i++) + x[i] -= xmed; + + /* Now scale to inside [-0.5, 0.5] and flip sign such that afterwards + * x[1] >= x[2] >= ... >= x[n] */ + double xden = -2 * std::max(-x[1], x[n]); + for (i = 1; i <= n; i++) + x[i] /= xden; + xmed /= xden; + if(trace_lev >= 2) + printf(" x[] has been rescaled (* 1/s) with s = %g\n", -xden); + + j = 1; + double x_eps = eps[0] * (eps[0] + fabs(xmed)); + while (x[j] > x_eps && j <= n) { /* test relative to xmed */ + /* x1[j] = x[j]; */ + j++; + } + if(trace_lev >= 2) + printf(" x1[] := {x | x_j > x_eps = %g} has %d (='j-1') entries\n", + x_eps, j-1); + i = 1; + double *x2 = x+j-1; /* pointer -- corresponding to x2[i] = x[j]; */ + while (x[j] > -x_eps && j <= n) { /* test relative to xmed */ + /* x1[j] = x[j]; */ + /* x2[i] = x[j]; */ + j++; + i++; + } + /* now x1[] := {x | x_j > -eps} also includes the median (0) */ + if(trace_lev >= 2) + printf("'median-x' {x | -eps < x_i <= eps} has %d (= 'k') entries\n", + i-1); + int h1 = j-1, /* == size of x1[] == the sum of those two sizes above */ + /* conceptually, x2[] := {x | x_j <= eps} (which includes the median 0) */ + h2 = i + (n-j);// == size of x2[] == maximal size of whimed() arrays + + if(trace_lev) + printf(" now allocating 2+5 work arrays of size (1+) h2=%d each:\n", h2); + /* work arrays for whimed_i() : allocate *once* only !! */ + double *acand = (double *) malloc(h2*sizeof(double)), + *a_srt = (double *) malloc(h2*sizeof(double)); + + int *iw_cand= (int *) malloc(h2*sizeof(int)), + /* work arrays for the fast-median-of-table algorithm: + * currently still with 1-indexing */ + *left = (int *) malloc((h2+1)*sizeof(int)), + *right = (int *) malloc((h2+1)*sizeof(int)), + *p = (int *) malloc((h2+1)*sizeof(int)), + *q = (int *) malloc((h2+1)*sizeof(int)); + + for (i = 1; i <= h2; i++) { + left [i] = 1; + right[i] = h1; + } + int64_t nr = ((int64_t) h1) * ((int64_t) h2), // <-- careful to *NOT* overflow + knew = nr/2 +1; + if(trace_lev >= 2) + printf(" (h1,h2, nr, knew) = (%d,%d, %.0f, %.0f)\n", + h1,h2, (double)nr, (double)knew); + + double trial = -2./* -Wall */; + double *work= (double *) malloc(n*sizeof(double)); + int *iwt = (int *) malloc(n*sizeof(int)); + bool IsFound = false; + int nl = 0, + neq = 0; + /* MK: 'neq' counts the number of observations in the + * inside the tolerance range, i.e., where left > right + 1, + * since we would miss those when just using 'nl-nr'. + * This is to prevent index overflow in work[] later on. + * left might be larger than right + 1 since we are only + * testing with accuracy eps_trial and therefore there might + * be more than one observation in the `tolerance range` + * between < and <=. + */ + while (!IsFound && (nr-nl+neq > n) && it < 1e5) + { + int64_t sum_p, sum_q; + it++; + j = 0; + for (i = 1; i <= h2; i++) + if (left[i] <= right[i]) { + iwt[j] = right[i] - left[i]+1; + int k = left[i] + (iwt[j]/2); + work[j] = h_kern(x[k], x2[i], k, i, h1+1, eps[1]); + j++; + } + if(trace_lev >= 4) { + printf(" before whimed(): work and iwt, each [0:(%d-1)]:\n", j); + if(j >= 100) { + for(i=0; i < 90; i++) printf(" %8g", work[i]); printf("\n ... "); + for(i=j-4; i < j; i++)printf(" %8g", work[i]); printf("\n"); + for(i=0; i < 90; i++) printf(" %8d", iwt [i]); printf("\n ... "); + for(i=j-4; i < j; i++)printf(" %8d", iwt [i]); printf("\n"); + } else { // j <= 99 + for(i=0; i < j; i++) printf(" %8g", work[i]); printf("\n"); + for(i=0; i < j; i++) printf(" %8d", iwt [i]); printf("\n"); + } + } + trial = whimed_i(work, iwt, j, acand, a_srt, iw_cand); + double eps_trial = eps[0] * (eps[0] + fabs(trial)); + if(trace_lev >= 3) + printf("%2s it=%2d, whimed(*, n=%6d)= %8g ", " ", it, j, trial); + + j = 1; + for (i = h2; i >= 1; i--) { + while (j <= h1 && h_kern(x[j],x2[i],j,i,h1+1,eps[1]) - trial > eps_trial) { + // while (j <= h1 && h_kern(x[j],x2[i],j,i,h1+1,eps[1]) > trial) { + if (trace_lev >= 5) + printf("\nj=%3d, i=%3d, x[j]=%g, x2[i]=%g, h=%g", + j, i, x[j], x2[i], + h_kern(x[j],x2[i],j,i,h1+1,eps[1])); + j++; + } + /* for(; j <= h1; j++) { */ + /* register double h = h_kern(x[j],x2[i],j,i,h1+1,eps[1]); */ + /* if(h > trial) break; */ + /* } */ + p[i] = j-1; + } + j = h1; + for (i = 1, sum_p=0, sum_q=0; i <= h2; i++) { + while (j >= 1 && trial - h_kern(x[j],x2[i],j,i,h1+1,eps[1]) > eps_trial) + // while (j >= 1 && h_kern(x[j],x2[i],j,i,h1+1,eps[1]) < trial) + j--; + q[i] = j+1; + + sum_p += p[i]; + sum_q += j;/* = q[i]-1 */ + } + + if(trace_lev >= 3) { + if (trace_lev == 3) + printf("sum_(p,q)= (%.0f,%.0f)", (double)sum_p, (double)sum_q); + else { /* trace_lev >= 4 */ + printf("\n%3s p[1:%d]:", "", h2); + bool lrg = h2 >= 100; + int i_m = lrg ? 95 : h2; + for(i = 1; i <= i_m; i++) printf(" %2d", p[i]); if(lrg) printf(" ..."); + printf(" sum=%4.0f\n%3s q[1:%d]:", (double)sum_p, "", h2); + for(i = 1; i <= i_m; i++) printf(" %2d", q[i]); if(lrg) printf(" ..."); + printf(" sum=%4.0f\n", (double)sum_q); + } + } + + if (knew <= sum_p) { + if(trace_lev >= 3) + printf("; sum_p >= kn\n"); + for (i = 1, neq = 0; i <= h2; i++) { + right[i] = p[i]; + if (left[i] > right[i]+1) neq += left[i]-right[i]-1; + } + nr = sum_p; + } + else { /* knew > sum_p */ + IsFound = (knew <= sum_q); /* i.e. sum_p < knew <= sum_q */; + + if(trace_lev >= 3) + printf("; s_p < kn ?<=? s_q: %s\n", IsFound ? "TRUE": "no"); + if(IsFound) { + medc = trial; + } else { /* knew > sum_q */ + for (i = 1; i <= h2; i++) { + left[i] = q[i]; + if (left[i] > right[i]+1) neq += left[i]-right[i]-1; + } + nl = sum_q; + } + } + + } /* end while loop */ + + converged = IsFound || (nr-nl+neq <= n); + if(!converged) { + /* still: */ + medc = trial; + } + + if (converged && !IsFound) { /* e.g., for mc(1:4) : */ + j = 0; + for (i = 1; i <= h2; i++) { + if (left[i] <= right[i]) { + for (int k = left[i]; k <= right[i]; k++) { + work[j] = -h_kern(x[k],x2[i],k,i,h1+1,eps[1]); + j++; + } + } + } + if(trace_lev) + printf(" not found [it=%d, (nr,nl) = (%d,%d)]," + " -> (knew-nl, j) = (%d,%d)\n", + it, nr, nl, knew-nl, j); + /* using rPsort(work, n,k), since we don't need work[] anymore:*/ + std::nth_element(work, work + knew-nl-1, work + j); + medc = - work[knew-nl-1]; + } + + if(converged && trace_lev >= 2) + printf("converged in %d iterations\n", it); + + return medc; + +} /* end{ mc_C } */ + + +static +double h_kern(double a, double b, int ai, int bi, int ab, double eps) +{ + /* if (fabs(a-b) <= DBL_MIN) */ + /* check for zero division and positive b */ + if (fabs(a-b) < 2.0*eps || b > 0) + return sign((double)(ab - (ai+bi))); + + /* else */ + return (a+b)/(a-b); +} + +#include <vector> +#include <iostream> +#include <string> +#include <fstream> +#include <iomanip> + +int main(int argc, char** argv) { + using namespace std; + + vector<double> eps = {numeric_limits<double>::epsilon(), + numeric_limits<double>::min()}; + + string fname; + if (argc > 1) { + fname = argv[1]; + } + else { + cerr << "Please provide input file." << endl; + return 1; + } + + ifstream ifs(fname); + if (not ifs) { + cerr << "File " << fname << " not found." << endl; + return 1; + } + + vector<double> data; + double datum; + while (ifs >> datum) { + data.push_back(datum); + } + + cout << setprecision(16) << mc_C(&data[0], data.size(), &eps[0]) << endl; + + return 0; +} + + +/* Local variables section + + * Local variables: + * mode: c + * kept-old-versions: 12 + * kept-new-versions: 20 + * End: + */
new file mode 100755 --- /dev/null +++ b/octmedcouple @@ -0,0 +1,15 @@ +#!/usr/bin/octave -q + +args = argv(); +if isempty(args) + disp("Provide input filename"); + exit(1); +endif + +fname = args{1}; +format long; +x = load(fname); +tic; +m = mc(x); +disp(toc) +disp(m);
new file mode 100644 --- /dev/null +++ b/wgt_himed.c @@ -0,0 +1,64 @@ +/* + * R : A Computer Language for Statistical Data Analysis + * Copyright (C) 2006--2007 the R Development Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + Used to be part of ./qn_sn.c + + Note by MM: We have explicit permission from P.Rousseeuw to + licence it under the GNU Public Licence. + + See also ../inst/Copyrights +*/ +#include <inttypes.h> +/* ^^^^^^^^^^ is supposedly more common and standard than + * #include <stdint.h> + * or #include <sys/types.h> */ +/* --> int64_t ; if people don't have the above, they can forget about it.. */ +/* #include "int64.h" */ + +#include <math.h> /* -> <math.h> and much more */ + +#define _i_whimed_ +#include "wgt_himed_templ.h" + +#define _d_whimed_ +#include "wgt_himed_templ.h" + + +void wgt_himed_i(double *x, int *n, int *iw, double *res) +{ + double *a_srt, *acand; + int *iw_cand, nn = (int)*n; + + acand = (double *)malloc(nn*sizeof(double)); + a_srt = (double *)malloc(nn*sizeof(double)); + iw_cand= (int *) malloc(nn*sizeof(int)); + + *res = whimed_i(x, (int *)iw, nn, acand, a_srt, iw_cand); +} + +void wgt_himed(double *x, int *n, double *w, double *res) +{ + double *a_srt, *a_cand, *w_cand; + int nn = (int)*n; + + a_cand = (double *) malloc(nn*sizeof(double)); + a_srt = (double *) malloc(nn*sizeof(double)); + w_cand = (double *) malloc(nn*sizeof(double)); + + *res = whimed(x, w, nn, a_cand, a_srt, w_cand); +}
new file mode 100644 --- /dev/null +++ b/wgt_himed_templ.h @@ -0,0 +1,112 @@ +/*------ Definition of a template for whimed(_i) : * + * -------- ~~~~~~ + * i.e., included several times from ./wgt_himed.c + * ~~~~~~~~~~~~~ + */ + + +#ifdef _d_whimed_ + +# define _WHIMED_ whimed +# define _WGT_TYPE_ double +# define _WGT_SUM_TYPE_ double +# undef _d_whimed_ + +#elif defined (_i_whimed_) + +# define _WHIMED_ whimed_i +# define _WGT_TYPE_ int +# define _WGT_SUM_TYPE_ int64_t +# undef _i_whimed_ + +#else +# error "must define correct whimed_ macro !" +#endif + +#include <algorithm> + + +double _WHIMED_(double *a, _WGT_TYPE_ *w, int n, + double* a_cand, double *a_srt, _WGT_TYPE_* w_cand) +{ + + /* + Algorithm to compute the weighted high median in O(n) time. + + The whimed is defined as the smallest a[j] such that the sum + of the weights of all a[i] <= a[j] is strictly greater than + half of the total weight. + + Arguments: + + a: double array containing the observations + n: number of observations + w: array of (int/double) weights of the observations. + */ + + int n2, i, kcand; + /* sum of weights: `int' do overflow when n ~>= 1e5 */ + _WGT_SUM_TYPE_ wleft, wmid, wright, w_tot, wrest; + + double trial; + + w_tot = 0; + for (i = 0; i < n; ++i) + w_tot += w[i]; + wrest = 0; + + /* REPEAT : */ + do { + for (i = 0; i < n; ++i) + a_srt[i] = a[i]; + n2 = n/2;/* =^= n/2 +1 with 0-indexing */ + std::nth_element(a_srt, a_srt + n2, a_srt + n); + trial = a_srt[n2]; + + wleft = 0; wmid = 0; wright= 0; + for (i = 0; i < n; ++i) { + if (a[i] < trial) + wleft += w[i]; + else if (a[i] > trial) + wright += w[i]; + else + wmid += w[i]; + } + /* wleft = sum_{i; a[i] < trial} w[i] + * wmid = sum_{i; a[i] == trial} w[i] at least one 'i' since trial is one a[]! + * wright= sum_{i; a[i] > trial} w[i] + */ + kcand = 0; + if (2 * (wrest + wleft) > w_tot) { + for (i = 0; i < n; ++i) { + if (a[i] < trial) { + a_cand[kcand] = a[i]; + w_cand[kcand] = w[i]; ++kcand; + } + } + } + else if (2 * (wrest + wleft + wmid) <= w_tot) { + for (i = 0; i < n; ++i) { + if (a[i] > trial) { + a_cand[kcand] = a[i]; + w_cand[kcand] = w[i]; ++kcand; + } + } + wrest += wleft + wmid; + } + else { + return trial; + /*==========*/ + } + n = kcand; + for (i = 0; i < n; ++i) { + a[i] = a_cand[i]; + w[i] = w_cand[i]; + } + } while(1); + +} /* _WHIMED_ */ + +#undef _WHIMED_ +#undef _WGT_TYPE_ +#undef _WGT_SUM_TYPE_