view jmedcouple.c++ @ 28:65f6fad65f49

jmedcouple: get rid of unused trace_lev parameter
author Jordi Gutiérrez Hermoso <jordigh@octave.org>
date Sun, 18 Jan 2015 20:41:04 -0500
parents 40cbfb64e952
children cf8c8c825c6d
line wrap: on
line source

#include <vector>
#include <limits>
#include <iostream>
#include <string>
#include <fstream>
#include <iomanip>
#include <algorithm>
#include <utility>

using namespace std;

template <typename T> 
int signum(T val) {
    return (T(0) < val) - (val < T(0));
}

template <typename Container, typename Out = typename Container::value_type>
Out sum(const Container& C)  {
  return accumulate(C.begin(), C.end(), Out());
} 

/*
  This computes the weighted median of array A with corresponding
  weights W.
*/
double wmedian(const vector<double>& A,
               const vector<int>& W)
{
  typedef pair<double,int> aw_t;

  int n = A.size();
  vector<aw_t> AW(n);
  for (int i = 0; i < n; i ++) 
    AW[i] = make_pair(A[i], W[i]);

  long wtot = sum(W);

  int beg = 0;
  int end = n - 1;

  while (true) {
    int mid = (beg + end)/2;
    nth_element(AW.begin(), AW.begin() + mid, AW.end(),
                [](const aw_t& l, const aw_t& r){return l.first > r.first;});
    double trial = AW[mid].first;

    long wleft = 0, wright = 0;

    for (aw_t& aw: AW ) {
      if (aw.first > trial)
        wleft += aw.second;
      else
        // This also includes a == trial, i.e. the "middle" weight.
        wright += aw.second;
    }

    if (2*wleft > wtot)
      end = mid;
    else if (2*wright < wtot)
      beg = mid;
    else
      return trial;
  }
  
  return 0;
}

double medcouple(const std::vector<double>& X,
                 double eps1 = numeric_limits<double>::epsilon(), 
                 double eps2 = numeric_limits<double>::min())
{
  size_t n = X.size(), n2 = (n - 1)/2;
  
  if (n < 3)
    return 0;

  vector<double> Z = X;
  sort(Z.begin(), Z.end(), std::greater<double>());

  double Zmed;
  if (n % 2 == 1) 
    Zmed = Z[n2];
  else
    Zmed = (Z[n2] + Z[n2 + 1])/2;

  // Check if the median is at the edges up to relative epsilon
  if (abs(Z[0] - Zmed) < eps1*(eps1 + abs(Zmed)))
    return -1.0;
  if (abs(Z[n - 1] - Zmed) < eps1*(eps1 + abs(Zmed)))
    return 1.0;
  
  // Centre Z wrt median, so that median(Z) = 0.
  for_each(Z.begin(), Z.end(), [&](double& z){ z -= Zmed;});

  // Scale inside [-0.5, 0.5], for greater numerical stability.
  double Zden = 2*max(Z[0], -Z[n - 1]);
  for_each(Z.begin(), Z.end(), [&](double& z){z /= Zden;});
  Zmed /= Zden;

  double Zeps = eps1*(eps1 + abs(Zmed));

  // These overlap on the entries that are tied with the median
  vector<double> Zplus, Zminus;
  copy_if(Z.begin(), Z.end(), back_inserter(Zplus), 
          [=](double z){return z >= -Zeps;});
  copy_if(Z.begin(), Z.end(), back_inserter(Zminus), 
          [=](double z){return Zeps >= z;});

  size_t 
    n_plus = Zplus.size(), 
    n_minus = Zminus.size();

  /*
    Kernel function h for the medcouple, closing over the values of
    Zplus and Zminus just defined above.

    In case a and be are within epsilon of the median, the kernel
    is the signum of their position.
  */
  auto h_kern = [&](int i, int j) {
    double a = Zplus[i];
    double b = Zminus[j];

    double h;
    if (abs(a - b) <= 2*eps2)
      h = signum(n_plus - 1 - i - j);
    else
      h = (a + b)/(a - b);

    return h;
  };
 
  // Init left and right borders
  vector<int> L(n_plus, 0);
  vector<int> R(n_plus, n_minus - 1);

  long Ltot = 0;
  long Rtot = n_minus*n_plus;
  long medc_idx = Rtot/2;
  
  // kth pair algorithm (Johnson & Mizoguchi)
  while (Rtot - Ltot > n_plus) {
    // First, compute the median inside the given bounds
    vector<double> A;
    vector<int> W;
    for (int i = 0; i < n_plus; i++) {
      if (L[i] <= R[i]) {
        A.push_back(h_kern(i, (L[i] + R[i])/2));
        W.push_back(R[i] - L[i] + 1);
      }
    }
    double Am = wmedian(A, W);
    
    double Am_eps = eps1*(eps1 + abs(Am));

    //Compute new left and right boundaries, based on the weighted
    //median
    vector<int> P(n_plus), Q(n_plus);

    {
      int j = 0;
      for (int i = n_plus - 1; i >= 0; i--) {
        while (j < n_minus and h_kern(i, j) - Am > Am_eps)
          j++;
        P[i] = j - 1;
      }
    }

    {
      int j = n_minus - 1;
      for (int i = 0; i < n_plus; i++) {
        while (j >= 0 and h_kern(i, j) - Am < -Am_eps)
          j--;
        Q[i] = j + 1;
      }
    }

    long 
      sumP = sum(P) + n_plus,
      sumQ = sum(Q);
    
    if (medc_idx <= sumP - 1) {
      R = P;
      Rtot = sumP;
    }
    else {
      if (medc_idx > sumQ - 1) {
        L = Q;
        Ltot = sumQ;
      }
      else
        return Am;
    }
  }

  // Didn't find the median, but now we have a very small search space
  // to find it in, just between the left and right boundaries. This
  // space is of size Rtot - Ltot which is <= n_plus
  vector<double> A;
  for (int i = 0; i < n_plus; i++) {
    for (int j = L[i]; j <= R[i]; j++)
      A.push_back(h_kern(i, j));
  }
  nth_element(A.begin(), A.begin() + medc_idx - Ltot, A.end(),
              [](double x, double y) {return x > y;});
  
  double Am = A[medc_idx - Ltot];
  return Am;
}



int main(int argc, char** argv) {
  double 
    eps1 = numeric_limits<double>::epsilon(), 
    eps2 = numeric_limits<double>::min();

  string fname;
  if (argc > 1) {
    fname = argv[1];
  }
  else {
    cerr << "Please provide input file." << endl;
    return 1;
  }

  ifstream ifs(fname);
  if (not ifs) {
    cerr << "File " << fname << " not found." << endl;
    return 1;
  }

  vector<double> data;  
  double datum;
  while (ifs >> datum) {
    data.push_back(datum);
  }
  
  cout << setprecision(16) 
       << medcouple(data, eps1, eps2) << endl;

  return 0;
}