changeset 0:e374cdd5e1e7

init
author Jordi Gutiérrez Hermoso <jordigh@octave.org>
date Tue, 13 Jan 2015 16:34:00 -0500
parents
children 135f5ebf0726
files mc.c medcouple.c octmedcouple wgt_himed.c wgt_himed_templ.h
diffstat 5 files changed, 586 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
new file mode 100644
--- /dev/null
+++ b/mc.c
@@ -0,0 +1,40 @@
+#include "mex.h"
+#include "mlmc.c"
+#include<stdio.h>
+
+void mexFunction( int nlhs, mxArray *plhs[],
+                  int nrhs, const mxArray*prhs[] )
+
+{
+    double *yout;
+    double *yin;
+    long m,n,i;
+
+    /* Check for proper number of arguments */
+
+    if (nrhs != 1) {
+        mexErrMsgTxt("One input argument required.");
+    } else if (nlhs > 1) {
+        mexErrMsgTxt("Too many output arguments.");
+    }
+
+    m = mxGetM(prhs[0]);
+    n = mxGetN(prhs[0]);
+
+    if (n!=1 && m==1)
+    {
+        mexErrMsgTxt("Input must be a columnvector.");
+    }
+    /* Create a matrix for the return argument */
+    plhs[0] = mxCreateDoubleMatrix(1, n, mxREAL);
+
+    /* Assign pointers to the various parameters */
+    yout = mxGetPr(plhs[0]);
+    yin = mxGetPr(prhs[0]);
+
+        for (i=0;i<n;i++)
+                mlmc(&yout[i],&yin[i*m],&m);
+
+    return;
+
+}
new file mode 100644
--- /dev/null
+++ b/medcouple.c
@@ -0,0 +1,355 @@
+/*
+  Algorithm for the skewness estimator medcouple (MC)
+  --------------------------------------------------
+  ( originally matlabmc.c and also  mc/mcrsoft/spmc.c )
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <algorithm>
+
+#include <inttypes.h>
+// -> int64_t
+
+template <typename T> int sign(T val) {
+    return (T(0) < val) - (val < T(0));
+}
+
+/* Interface routines to be called via .C() and those from API : */
+#include "wgt_himed.c"
+/*
+  including
+
+  whimed_i(a,iw,n): the weighted high median of an array a of length n,
+  using the positive integer weights iw[].
+
+  * which is in ./wgt_himed.c_templ
+  *		 ~~~~~~~~~~~~~~~~~
+  */
+
+/* Includes the auxiliary function
+
+   h_kern(a,b, ai,bi,ab, eps):	the values h(a,b)  needed to compute the mc
+*/
+static
+double h_kern(double a, double b, int ai, int bi, int ab, double eps);
+
+/* MM:	The tolerance  'eps1' and 'eps2' can now be passed from R;
+ *	the original code had only one 'eps' for both and hardcoded
+ *	   eps =  0.0000000000001;  /.* == 1e-13 )) *./
+ */
+/* MK:  eps1: for (relative) "equality" checks
+ *      eps2: used to check for over- and underflow, respectively
+ *      therefore I suggest eps1 = DBL_EPS and eps2 = DBL_MIN
+ */
+double mc_C(double *z, int n, double *eps)
+{
+  /* NOTE:
+     eps	  = c(eps1, eps2)
+     iter := c(maxit, trace.lev)  as input
+     = c(it, converged)     as output
+  */
+  int trace_lev = 0, it = 0;
+  bool converged = true;
+  double medc; // "the" result
+
+  if (n < 3) {
+    return 0.;
+  }
+  /* copy data before sort()ing in place, also reflecting it */
+  /* NOTE: x[0] is empty --- so 1-indexing is used (MM: danger!) */
+  double *x  = (double *) malloc((n+1)*sizeof(double));
+  x[0] = 0;
+  for (int i = 0; i < n; i++)
+    x[i+1] = -z[i];
+
+  std::sort(&x[1], &x[1] + n); /* full sort */
+
+  double xmed; // := median( x[1:n] ) = - median( z[0:(n-1)] ):
+  if (n%2) { /* n even */
+    xmed = x[(n/2)+1];
+  }
+  else { /* n  odd */
+    int ind = (n/2);
+    xmed = (x[ind] + x[ind+1])/2;
+  }
+
+  if (fabs(x[1] - xmed) < eps[0] * (eps[0] + fabs(xmed))) {
+    return -1;
+  } else if (fabs(x[n] - xmed) < eps[0] * (eps[0] + fabs(xmed))) {
+    return 1;
+  }
+  /* else : median is not at the border ------------------- */
+
+  if(trace_lev)
+    printf("mc_C(z[1:%d], trace_lev=%d): Median = %g (not at the border)\n", 
+           n, trace_lev, -xmed);
+
+  int i,j;
+  /* center x[] wrt median --> such that then  median( x[1:n] ) == 0 */
+  for (i = 1; i <= n; i++)
+    x[i] -= xmed;
+
+  /* Now scale to inside [-0.5, 0.5] and flip sign such that afterwards
+   *  x[1] >= x[2] >= ... >= x[n] */
+  double xden = -2 * std::max(-x[1], x[n]);
+  for (i = 1; i <= n; i++)
+    x[i] /= xden;
+  xmed /= xden;
+  if(trace_lev >= 2)
+    printf(" x[] has been rescaled (* 1/s) with s = %g\n", -xden);
+
+  j = 1;
+  double x_eps = eps[0] * (eps[0] + fabs(xmed));
+  while (x[j] > x_eps && j <= n) { /* test relative to xmed */
+    /* x1[j] = x[j]; */
+    j++;
+  }
+  if(trace_lev >= 2)
+    printf("   x1[] := {x | x_j > x_eps = %g}    has %d (='j-1') entries\n", 
+           x_eps, j-1);
+  i = 1;
+  double *x2 = x+j-1; /* pointer -- corresponding to  x2[i] = x[j]; */
+  while (x[j] > -x_eps && j <= n) { /* test relative to xmed */
+    /* x1[j] = x[j]; */
+    /* x2[i] = x[j]; */
+    j++;
+    i++;
+  }
+  /* now  x1[] := {x | x_j > -eps}  also includes the median (0) */
+  if(trace_lev >= 2)
+    printf("'median-x' {x | -eps < x_i <= eps} has %d (= 'k') entries\n",
+           i-1);
+  int h1 = j-1, /* == size of x1[] == the sum of those two sizes above */
+    /* conceptually,  x2[] := {x | x_j <= eps}   (which includes the median 0) */
+    h2 = i + (n-j);// == size of x2[] == maximal size of whimed() arrays
+
+  if(trace_lev)
+    printf("  now allocating 2+5 work arrays of size (1+) h2=%d each:\n", h2);
+  /* work arrays for whimed_i() :  allocate *once* only !! */
+  double *acand  = (double *) malloc(h2*sizeof(double)),
+    *a_srt  = (double *) malloc(h2*sizeof(double));
+
+  int    *iw_cand= (int *)	malloc(h2*sizeof(int)),
+    /* work arrays for the fast-median-of-table algorithm:
+     *  currently still with  1-indexing */
+    *left  = (int *) malloc((h2+1)*sizeof(int)),
+    *right = (int *) malloc((h2+1)*sizeof(int)),
+    *p     = (int *) malloc((h2+1)*sizeof(int)),
+    *q     = (int *) malloc((h2+1)*sizeof(int));
+
+  for (i = 1; i <= h2; i++) {
+    left [i] = 1;
+    right[i] = h1;
+  }
+  int64_t nr = ((int64_t) h1) * ((int64_t) h2), // <-- careful to *NOT* overflow
+    knew = nr/2 +1;
+  if(trace_lev >= 2)
+    printf(" (h1,h2, nr, knew) = (%d,%d, %.0f, %.0f)\n",
+           h1,h2, (double)nr, (double)knew);
+
+  double trial = -2./* -Wall */;
+  double *work= (double *) malloc(n*sizeof(double));
+  int	   *iwt = (int *)    malloc(n*sizeof(int));
+  bool IsFound = false;
+  int nl = 0,
+    neq = 0;
+  /* MK:  'neq' counts the number of observations in the
+   *      inside the tolerance range, i.e., where left > right + 1,
+   *      since we would miss those when just using 'nl-nr'.
+   *      This is to prevent index overflow in work[] later on.
+   *      left might be larger than right + 1 since we are only
+   *      testing with accuracy eps_trial and therefore there might
+   *      be more than one observation in the `tolerance range`
+   *      between < and <=.
+   */
+  while (!IsFound && (nr-nl+neq > n) && it < 1e5)
+  {
+    int64_t sum_p, sum_q;
+    it++;
+    j = 0;
+    for (i = 1; i <= h2; i++)
+	    if (left[i] <= right[i]) {
+        iwt[j] = right[i] - left[i]+1;
+        int k = left[i] + (iwt[j]/2);
+        work[j] = h_kern(x[k], x2[i], k, i, h1+1, eps[1]);
+        j++;
+	    }
+    if(trace_lev >= 4) {
+	    printf(" before whimed(): work and iwt, each [0:(%d-1)]:\n", j);
+	    if(j >= 100) {
+        for(i=0; i < 90; i++) printf(" %8g", work[i]); printf("\n  ... ");
+        for(i=j-4; i < j; i++)printf(" %8g", work[i]); printf("\n");
+        for(i=0; i < 90; i++) printf(" %8d", iwt [i]); printf("\n  ... ");
+        for(i=j-4; i < j; i++)printf(" %8d", iwt [i]); printf("\n");
+	    } else { // j <= 99
+        for(i=0; i < j; i++) printf(" %8g", work[i]); printf("\n");
+        for(i=0; i < j; i++) printf(" %8d", iwt [i]); printf("\n");
+	    }
+    }
+    trial = whimed_i(work, iwt, j, acand, a_srt, iw_cand);
+    double eps_trial = eps[0] * (eps[0] + fabs(trial));
+    if(trace_lev >= 3)
+	    printf("%2s it=%2d, whimed(*, n=%6d)= %8g ", " ", it, j, trial);
+
+    j = 1;
+    for (i = h2; i >= 1; i--) {
+	    while (j <= h1 && h_kern(x[j],x2[i],j,i,h1+1,eps[1]) - trial > eps_trial) {
+        // while (j <= h1 && h_kern(x[j],x2[i],j,i,h1+1,eps[1]) > trial) {
+        if (trace_lev >= 5)
+          printf("\nj=%3d, i=%3d, x[j]=%g, x2[i]=%g, h=%g", 
+                 j, i, x[j], x2[i],
+                 h_kern(x[j],x2[i],j,i,h1+1,eps[1]));
+        j++;
+	    }
+      /* 	    for(; j <= h1; j++) { */
+      /* 		register double h = h_kern(x[j],x2[i],j,i,h1+1,eps[1]); */
+      /* 		if(h > trial) break; */
+      /* 	    } */
+	    p[i] = j-1;
+    }
+    j = h1;
+    for (i = 1, sum_p=0, sum_q=0; i <= h2; i++) {
+	    while (j >= 1 && trial - h_kern(x[j],x2[i],j,i,h1+1,eps[1]) > eps_trial)
+        // while (j >= 1 && h_kern(x[j],x2[i],j,i,h1+1,eps[1]) < trial)
+        j--;
+	    q[i] = j+1;
+
+	    sum_p += p[i];
+	    sum_q += j;/* = q[i]-1 */
+    }
+
+    if(trace_lev >= 3) {
+	    if (trace_lev == 3)
+        printf("sum_(p,q)= (%.0f,%.0f)", (double)sum_p, (double)sum_q);
+	    else { /* trace_lev >= 4 */
+        printf("\n%3s p[1:%d]:", "", h2);
+        bool lrg = h2 >= 100; 
+        int i_m = lrg ? 95 : h2;
+        for(i = 1; i <= i_m; i++) printf(" %2d", p[i]); if(lrg) printf(" ...");
+        printf(" sum=%4.0f\n%3s q[1:%d]:", (double)sum_p, "", h2);
+        for(i = 1; i <= i_m; i++) printf(" %2d", q[i]); if(lrg) printf(" ...");
+        printf(" sum=%4.0f\n", (double)sum_q);
+	    }
+    }
+
+    if (knew <= sum_p) {
+	    if(trace_lev >= 3)
+        printf("; sum_p >= kn\n");
+	    for (i = 1, neq = 0; i <= h2; i++) {
+        right[i] = p[i];
+        if (left[i] > right[i]+1) neq += left[i]-right[i]-1;
+	    }
+	    nr = sum_p;
+    }
+    else { /* knew > sum_p */
+	    IsFound = (knew <= sum_q); /* i.e. sum_p < knew <= sum_q */;
+
+	    if(trace_lev >= 3)
+        printf("; s_p < kn ?<=? s_q: %s\n", IsFound ? "TRUE": "no");
+	    if(IsFound) {
+        medc = trial;
+	    } else { /*	 knew > sum_q */
+        for (i = 1; i <= h2; i++) {
+          left[i] = q[i];
+          if (left[i] > right[i]+1) neq += left[i]-right[i]-1;
+        }
+        nl = sum_q;
+	    }
+    }
+
+  } /* end while loop */
+
+  converged = IsFound || (nr-nl+neq <= n);
+  if(!converged) {
+    /* still: */
+    medc = trial;
+  }
+
+  if (converged && !IsFound) { /* e.g., for  mc(1:4) : */
+    j = 0;
+    for (i = 1; i <= h2; i++) {
+	    if (left[i] <= right[i]) {
+        for (int k = left[i]; k <= right[i]; k++) {
+          work[j] = -h_kern(x[k],x2[i],k,i,h1+1,eps[1]);
+          j++;
+        }
+	    }
+    }
+    if(trace_lev)
+	    printf("  not found [it=%d,  (nr,nl) = (%d,%d)],"
+             " -> (knew-nl, j) = (%d,%d)\n",
+             it, nr, nl, knew-nl, j);
+    /* using rPsort(work, n,k), since we don't need work[] anymore:*/
+    std::nth_element(work, work + knew-nl-1, work + j);
+    medc = - work[knew-nl-1];
+  }
+
+  if(converged && trace_lev >= 2)
+    printf("converged in %d iterations\n", it);
+
+  return medc;
+
+} /* end{ mc_C } */
+
+
+static
+double h_kern(double a, double b, int ai, int bi, int ab, double eps)
+{
+  /*     if (fabs(a-b) <= DBL_MIN) */
+  /* check for zero division and positive b */
+  if (fabs(a-b) < 2.0*eps || b > 0)
+    return sign((double)(ab - (ai+bi)));
+
+  /* else */
+  return (a+b)/(a-b);
+}
+
+#include <vector>
+#include <iostream>
+#include <string>
+#include <fstream>
+#include <iomanip>
+
+int main(int argc, char** argv) {
+  using namespace std;
+
+  vector<double> eps = {numeric_limits<double>::epsilon(), 
+                        numeric_limits<double>::min()};
+
+  string fname;
+  if (argc > 1) {
+    fname = argv[1];
+  }
+  else {
+    cerr << "Please provide input file." << endl;
+    return 1;
+  }
+
+  ifstream ifs(fname);
+  if (not ifs) {
+    cerr << "File " << fname << " not found." << endl;
+    return 1;
+  }
+
+  vector<double> data;  
+  double datum;
+  while (ifs >> datum) {
+    data.push_back(datum);
+  }
+  
+  cout << setprecision(16) << mc_C(&data[0], data.size(), &eps[0]) << endl;
+
+  return 0;
+}
+
+
+/* Local variables section
+
+ * Local variables:
+ * mode: c
+ * kept-old-versions: 12
+ * kept-new-versions: 20
+ * End:
+ */
new file mode 100755
--- /dev/null
+++ b/octmedcouple
@@ -0,0 +1,15 @@
+#!/usr/bin/octave -q
+
+args = argv();
+if isempty(args)
+   disp("Provide input filename");
+   exit(1);
+endif
+   
+fname = args{1};
+format long;
+x = load(fname);
+tic; 
+m = mc(x);
+disp(toc)
+disp(m);
new file mode 100644
--- /dev/null
+++ b/wgt_himed.c
@@ -0,0 +1,64 @@
+/*
+ *  R : A Computer Language for Statistical Data Analysis
+ *  Copyright (C) 2006--2007	the R Development Core Team
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+ Used to be part of ./qn_sn.c
+
+ Note by MM: We have explicit permission from P.Rousseeuw to
+ licence it under the GNU Public Licence.
+
+ See also ../inst/Copyrights
+*/
+#include <inttypes.h>
+/*        ^^^^^^^^^^ is supposedly more common and standard than
+ * #include <stdint.h>
+ * or #include <sys/types.h> */
+/* --> int64_t ; if people don't have the above, they can forget about it.. */
+/* #include "int64.h" */
+
+#include <math.h> /* -> <math.h> and much more */
+
+#define _i_whimed_
+#include "wgt_himed_templ.h"
+
+#define _d_whimed_
+#include "wgt_himed_templ.h"
+
+
+void wgt_himed_i(double *x, int *n, int *iw, double *res)
+{
+    double *a_srt, *acand;
+    int *iw_cand, nn = (int)*n;
+
+    acand  = (double *)malloc(nn*sizeof(double));
+    a_srt  = (double *)malloc(nn*sizeof(double));
+    iw_cand= (int *)   malloc(nn*sizeof(int));
+
+    *res = whimed_i(x, (int *)iw, nn, acand, a_srt, iw_cand);
+}
+
+void wgt_himed(double *x, int *n, double *w, double *res)
+{
+    double *a_srt, *a_cand, *w_cand;
+    int nn = (int)*n;
+
+    a_cand = (double *) malloc(nn*sizeof(double));
+    a_srt  = (double *) malloc(nn*sizeof(double));
+    w_cand = (double *) malloc(nn*sizeof(double));
+
+    *res = whimed(x, w, nn, a_cand, a_srt, w_cand);
+}
new file mode 100644
--- /dev/null
+++ b/wgt_himed_templ.h
@@ -0,0 +1,112 @@
+/*------ Definition of a template for whimed(_i) : *
+ *			 --------     ~~~~~~
+ * i.e., included several times from ./wgt_himed.c
+ *				     ~~~~~~~~~~~~~
+ */
+
+
+#ifdef _d_whimed_
+
+# define _WHIMED_	whimed
+# define _WGT_TYPE_	double
+# define _WGT_SUM_TYPE_ double
+# undef _d_whimed_
+
+#elif defined (_i_whimed_)
+
+# define _WHIMED_	whimed_i
+# define _WGT_TYPE_	int
+# define _WGT_SUM_TYPE_ int64_t
+# undef _i_whimed_
+
+#else
+# error "must define correct  whimed_  macro !"
+#endif
+
+#include <algorithm>
+
+
+double _WHIMED_(double *a, _WGT_TYPE_ *w, int n,
+                double* a_cand, double *a_srt, _WGT_TYPE_* w_cand)
+{
+
+  /*
+    Algorithm to compute the weighted high median in O(n) time.
+
+    The whimed is defined as the smallest a[j] such that the sum
+    of the weights of all a[i] <= a[j] is strictly greater than
+    half of the total weight.
+
+    Arguments:
+
+    a: double array containing the observations
+    n: number of observations
+    w: array of (int/double) weights of the observations.
+  */
+
+  int n2, i, kcand;
+  /* sum of weights: `int' do overflow when  n ~>= 1e5 */
+  _WGT_SUM_TYPE_ wleft, wmid, wright, w_tot, wrest;
+
+  double trial;
+
+  w_tot = 0;
+  for (i = 0; i < n; ++i)
+    w_tot += w[i];
+  wrest = 0;
+
+  /* REPEAT : */
+  do {
+    for (i = 0; i < n; ++i)
+	    a_srt[i] = a[i];
+    n2 = n/2;/* =^= n/2 +1 with 0-indexing */
+    std::nth_element(a_srt, a_srt + n2, a_srt + n);
+    trial = a_srt[n2];
+
+    wleft = 0;    wmid  = 0;    wright= 0;
+    for (i = 0; i < n; ++i) {
+	    if (a[i] < trial)
+        wleft += w[i];
+	    else if (a[i] > trial)
+        wright += w[i];
+	    else
+        wmid += w[i];
+    }
+    /* wleft = sum_{i; a[i]	 < trial}  w[i]
+     * wmid	 = sum_{i; a[i] == trial}  w[i] at least one 'i' since trial is one a[]!
+     * wright= sum_{i; a[i]	 > trial}  w[i]
+     */
+    kcand = 0;
+    if (2 * (wrest + wleft) > w_tot) {
+	    for (i = 0; i < n; ++i) {
+        if (a[i] < trial) {
+          a_cand[kcand] = a[i];
+          w_cand[kcand] = w[i];	++kcand;
+        }
+	    }
+    }
+    else if (2 * (wrest + wleft + wmid) <= w_tot) {
+	    for (i = 0; i < n; ++i) {
+        if (a[i] > trial) {
+          a_cand[kcand] = a[i];
+          w_cand[kcand] = w[i];	++kcand;
+        }
+	    }
+	    wrest += wleft + wmid;
+    }
+    else {
+	    return trial;
+	    /*==========*/
+    }
+    n = kcand;
+    for (i = 0; i < n; ++i) {
+	    a[i] = a_cand[i];
+	    w[i] = w_cand[i];
+    }
+  } while(1);
+
+} /* _WHIMED_ */
+
+#undef _WHIMED_
+#undef _WGT_TYPE_
+#undef _WGT_SUM_TYPE_