view talk/talk.tex @ 74:305b7361a5bd default tip @

showalgo: save a snapshot instead of waiting for keyboard input
author Jordi Gutiérrez Hermoso <jordigh@octave.org>
date Sun, 29 May 2016 19:05:01 -0400
parents d5ec0659b26b
children
line wrap: on
line source

%%% BEGIN BEAMER PREAMBLE %%%
\documentclass[blue]{beamer}
\usepackage{bm, fourier, anyfontsize, xcolor}
\newcommand{\MC}{\operatorname{MC}}
\newcommand{\signum}{\operatorname{signum}}
\newcommand{\IQR}{\operatorname{IQR}}

\theoremstyle{definition}
\newtheorem*{defn}{Definition}


\mode<presentation>
{
  \usetheme{boxes}
  \usecolortheme{crane}
}
\beamertemplatenavigationsymbolsempty

\AtBeginSection[]
{
  \begin{frame}<beamer>
    \frametitle{Outline}
    \tableofcontents[currentsection]
  \end{frame}
}


\usepackage{times}
%%% END BEAMER PREAMBLE %%%


%%% BEGIN METADATA %%%

\author{Jordi G. H. $\langle$jordigh@octave.org$\rangle$ }


\title{The Medcouple}
\subtitle{A robust measure of skewness}
\date{}

%%% END METADATA%%%

\begin{document}

\begin{frame}
  \titlepage
\end{frame}

\begin{frame}
  \frametitle{Outline}
  \tableofcontents
\end{frame}

\section{Outliers and Boxplots}

\begin{frame}{What is an outlier?}
  \pause
  \begin{center}
    \pgfimage[height=2.5in]{img/normal-boxhistplot.pdf}
  \end{center}
  A simple answer: Tukey's boxplots
\end{frame}

\begin{frame}{Anatomy of a boxplot}
  \begin{overlayarea}{\textwidth}{8cm}
    \only<1>{\pgfimage[width=4in]{img/normal-points}}
    \only<2>{\pgfimage[width=4in]{img/normal-boxplot}}
    \only<3>{\pgfimage[width=4in]{img/normal-boxplot-bare/base}}
    \only<4>{\pgfimage[width=4in]{img/normal-boxplot-bare/median}}
    \only<5>{\pgfimage[width=4in]{img/normal-boxplot-bare/q1q3}}
    \only<6>{\pgfimage[width=4in]{img/normal-boxplot-bare/IQR}}
    \only<7>{\pgfimage[width=4in]{img/normal-boxplot-bare/whiskers}}
    \only<8>{\pgfimage[width=4in]{img/normal-boxplot-bare/15}}
    \only<9>{\pgfimage[width=4in]{img/normal-boxplot-bare/outliers}}
  \end{overlayarea}
\end{frame}

\begin{frame}{Anatomy of a boxplot}
  \begin{itemize}
    \item Why 1.5?
    \pause
    \item Tukey responded: ``it's less than 2 and more than 1''
  \end{itemize}
\end{frame}

\begin{frame}{Outliers}
  \pause
  \begin{center}
    \pgfimage[height=2.5in]{img/normal-boxhistplot}
  \end{center}

  The boxplot identifies $10$ outliers out of $1000$ points ($1\%$)
\end{frame}

\begin{frame}{Skew distributions}
  Remember:
  \begin{center}
    \pgfimage[width=4in]{img/skew-distributions}
  \end{center}
\end{frame}

\begin{frame}
  For skew distributions...
\end{frame}

\begin{frame}
  \begin{overlayarea}{\textwidth}{8cm}
    \only<1>{
      \pgfimage[height=3in]{img/geometric-boxhistplot}

      $433$ outliers out of $10 000$ points ($4.3\%$)
    }
    \only<2>{
      \pgfimage[height=3in]{img/boys-and-girls}

      $578$ and $644$ outliers for actors and actresses respectively
      ($1.2\%$ and $3\%$)
    }
  \end{overlayarea}
\end{frame}

\begin{frame}
  \begin{itemize}
    \item Too many outliers...
    \pause
    \item Idea: adjust whisker lengths taking into account skewness:
  \end{itemize}
  \emph{M. Hubert; E. Vandervieren (2008). "An adjusted boxplot for skewed
    distributions". Computational Statistics and Data Analysis 52
    (12): 5186-5201. doi:10.1016/j.csda.2007.11.008.}
\end{frame}

\begin{frame}{Adjusted boxplot}
  \begin{overlayarea}{\textwidth}{3cm}
    \only<1>{
      Recall normal whiskers:
      % Trick to hide medcouple, use whiteout, so that the text gets
      % positioned the same with or without it.
      \begin{align*}
        \text{lower} &= Q_1 - 1.5 \IQR\textcolor{white}{e^{a \MC}} \\
        \text{higher} &= Q_3 + 1.5 \IQR\textcolor{white}{e^{b \MC}}
      \end{align*}
    }
    \only<2>{
      Instead, use adjusted whiskers:
      \begin{align*}
        \text{lower} &= Q_1 - 1.5 \IQR\textcolor{red}{e^{a \MC}}  \\
        \text{higher} &= Q_3 + 1.5 \IQR\textcolor{red}{e^{b \MC}}
      \end{align*}
      \begin{itemize}
        \item[$\MC$] -- the \emph{medcouple}, a measure of skewness
        \item[$a, b$] -- parameters to fit across some sample distributions
      \end{itemize}
    }
  \end{overlayarea}
\end{frame}

\begin{frame}{Adjusted boxplot}
  For the whiskers, Hubert and Vandervieren recommend:
  \[
  \begin{cases}
    [Q_1 - 1.5 \IQR e^{-3 \MC},  Q_3 + 1.5 \IQR e^{4 \MC}] &\text{if } \MC > 0 \\
    [Q_1 - 1.5 \IQR e^{-4 \MC},  Q_3 + 1.5 \IQR e^{3 \MC}] &\text{if } \MC < 0
  \end{cases}
  \]
  \pause
  Of course, if $\MC = 0$ (no skewness) then no adjustment
\end{frame}

\begin{frame}
  Let's see some adjusted boxplots...
\end{frame}

\begin{frame}
  \begin{overlayarea}{\textwidth}{8cm}
    \only<1>{
      \pgfimage[height=3in]{img/geometric-boxhistplot}

      $433$ outliers out of $10 000$ points ($4.3\%$)
    }
    \only<2>{
      \pgfimage[height=3in]{img/geometric-boxhistplot-adjusted}

      \textcolor{red}{$25$ outliers} out of $10 000$ points
      (\textcolor{red}{$0.25\%$}) (\textcolor{blue}{$\MC = 0.25$})
    }
  \end{overlayarea}
\end{frame}

\begin{frame}
  \begin{overlayarea}{\textwidth}{8cm}
    \only<1>{
      \pgfimage[height=3in]{img/normal-boxhistplot}

      $10$ outliers out of $1 000$ points ($1\%$)
    }
    \only<2>{
      \pgfimage[height=3in]{img/normal-boxhistplot-adjusted}

      \textcolor{red}{$10$ outliers} out of $1 000$ points
      (\textcolor{red}{$1\%$}) (\textcolor{blue}{$\MC = 0.0006$})
    }
  \end{overlayarea}
\end{frame}

\begin{frame}
  \begin{overlayarea}{\textwidth}{8cm}
    \only<1>{
      \pgfimage[height=3in]{img/boys-and-girls}

      $578$ and $644$ outliers for actors and actresses respectively
      ($1.2\%$ and $3\%$)
    }
    \only<2>{
      \pgfimage[height=3in]{img/boys-and-girls-adjusted}

      \textcolor{red}{$346$} and \textcolor{red}{$657$} outliers for
      actors and actresses respectively
      (\textcolor{red}{$0.69\%$} and \textcolor{red}{$3\%$})
      (\textcolor{blue}{$\MC = 0.12$} and \textcolor{blue}{$\MC = 0.231$})

    }
  \end{overlayarea}
\end{frame}

\section{The Medcouple}

\begin{frame}
  \emph{G. Brys; M. Hubert; A. Struyf (November 2004). "A Robust
    Measure of Skewness". Journal of Computational and Graphical
    Statistics 13 (4): 996-1017. doi:10.1198/106186004X12632.}
\end{frame}

\begin{frame}{Motivation}
  Consider the quartile skewness:
  \[
  B_1 = \frac{(Q_3 - Q_2) - (Q_2 - Q_1)}{Q_3 - Q_1}
  \]
  $Q_2 = \text{median}$
\end{frame}

\begin{frame}{Definition}
  Idea: compute this kernel over all couples split along the median:
  \[
  h(x_i, x_j) =
  \begin{cases}
    \frac{(x_i - m) - (m - x_j)}{x_i - x_j} \\
    \signum(p - 1 - i - j) & \text{if } x_i = m = x_j
  \end{cases}
  \]
  \pause
  where
  \begin{itemize}
    \item $m = \text{median}$
    \item $x_i \geq m \geq x_j$
    \item $p = |\{x_i \geq m\}|$
  \end{itemize}
  \pause
  \begin{defn}
    The \emph{medcouple} is the median of the kernel of all couples
    above.
  \end{defn}
\end{frame}

\begin{frame}{Properties}
  It is easy to see that medcouple is
  \pause
  \begin{itemize}
    \item location-invariant
    \pause
    \item scale-invariant
    \pause
    \item between $-1$ and $1$
    \pause
    \item a measure of skewness
  \end{itemize}
\end{frame}

\begin{frame}{Properties}
  The medcouple is a \emph{robust} measure of skewness.
  \pause
  \begin{defn}
    A statistic is \emph{robust} if it does not depend on the values
    of extreme values (outliers).
  \end{defn}
  \pause
  \begin{itemize}
    \item The median has maximum robustness. Its breakdown point is $50\%$.
    \pause
    \item The medcouple's breakdown point is $25\%$.
  \end{itemize}
\end{frame}


\begin{frame}{Computing the medcouple}
  \begin{center}
    \pgfimage[width=4in]{img/naive/x-orig.png}
  \end{center}

  Take some $X$ random numbers.
\end{frame}

\begin{frame}{Computing the medcouple}
  \begin{center}
    \pgfimage[width=4in]{img/naive/x-sorted.png}
  \end{center}

  Sort them.
\end{frame}

\begin{frame}{Computing the medcouple}
  \begin{center}
    \pgfimage[width=4in]{img/naive/sortx-red.png}
  \end{center}

  Pick the median.
\end{frame}

\begin{frame}{Computing the medcouple}
  \begin{overlayarea}{\textwidth}{8cm}
    \only<1>{
      \begin{center}
        \pgfimage[height=2in]{img/naive/medc-computation-init.png}
      \end{center}
      Split up $X$ into $X^+$ and $X^-$ along the median.
    }
    \only<2>{
      \begin{center}
        \pgfimage[height=2in]{img/naive/medc-computation.png}
      \end{center}
      Evaluate the kernel for all couples.
      \[
      \frac{ (x_i - x_m) - (x_m - x_j)}{x_i - x_j},
      \quad x_i \in X^+, \quad x_j \in X^-
      \]
    }
    \only<3>{
      \begin{center}
        \pgfimage[height=2in]{img/naive/medc-computation-done.png}
      \end{center}
      The median of this matrix is the medcouple.
    }
  \end{overlayarea}
\end{frame}

\begin{frame}
  Problem: this algorithm is $O(n^2)$!!
\end{frame}

\section{Computation of the Medcouple}

\begin{frame}
  Now pure CS:
  \pause
  \begin{problem}
    Find the median of a matrix with sorted rows and sorted columns.
  \end{problem}
\end{frame}

\begin{frame}
  \emph{Donald B. Johnson; Tetsuo Mizoguchi (May 1978). "Selecting The
    Kth Element In X + Y And X1 + X2 +...+ Xm". SIAM Journal of
    Computing 7 (2): 147-153. doi:10.1137/0207013.}
  \pause
  \begin{itemize}
    \item Old paper, hard to read. But cool ideas!
    \pause
    \item Paper generalises to $n$-dimensional array with sorted slices
  \end{itemize}
\end{frame}

\begin{frame}{Preliminaries}
  Helper algorithms:
  \pause
  \begin{description}
    \item[Selection Algorithm:] e.g. quickselect,
    \texttt{nth\_element} in C++ or \texttt{partition} in numpy, $O(n)$ time
    \pause
    \item[Weighted Median:] binary search using
    selection algorithm, $O(n)$ time.
  \end{description}
\end{frame}

\begin{frame}
  We now make two important observations
\end{frame}

\begin{frame}{Two observations}
  Compare against whole matrix in $O(n)$ time
  \begin{overlayarea}{\textwidth}{6cm}
    \only<1>{
      \begin{center}
        \pgfimage[width=2in]{img/kth-pair/greater-than}
      \end{center}
      
      Bottom-up.
    }
    \only<2>{
      \begin{center}
        \pgfimage[width=2in]{img/kth-pair/less-than}
      \end{center}
      
      Top-down.
    }    
  \end{overlayarea}
\end{frame}

\begin{frame}{Two observations}
  Compare against half the matrix in $O(1)$ time
  \begin{center}
    \pgfimage[width=2in]{img/kth-pair/middle-of-middle}
  \end{center}
\end{frame}

\begin{frame}{Putting it together}
  Iteratively, we have bounds for the global median
  \begin{center}
    \pgfimage[width=2in]{img/kth-pair/row-medians}
  \end{center}
\end{frame}

\begin{frame}{Putting it together}
  If we align the row medians...
  \begin{center}
    \pgfimage[width=2in]{img/kth-pair/row-medians-aligned}
  \end{center}
\end{frame}

\begin{frame}{Putting it together}
  With the weighted median (in $O(n)$ time), we compare to at least half the
  remaining entries, discarding at least $1/4$.
  \begin{center}
    \pgfimage[width=2in]{img/kth-pair/row-medians-compared}
  \end{center}
\end{frame}

\begin{frame}
  So, final algorithm!
\end{frame}

\begin{frame}{Grand finale}
  Procedure (serves one medcouple)

  \begin{enumerate}
    \item Compute ingredients for medcouple matrix: $O(n \log n)$
    \pause
    \begin{enumerate}[(A)]
      \item \label{loop} Compute $t = \text{median of row medians}$: $O(n)$
      \pause
      \item \label{compare} Compare $t$ to the whole matrix: $O(n)$
      \pause
      \item Found the median? Found the medcouple! DONE.
      \pause
      \item \label{compare} Use comparison from \eqref{compare} to
      throw out entries.
      \pause
      \item More than $n$ entries remaining? LOOP TO \eqref{loop}.
    \end{enumerate}
    \pause
    \item Use selection algorithm on remaining entries: $O(n)$.
  \end{enumerate}
  \pause

  Step \eqref{compare} throws out at least $1/4$ of all entries, so
  the whole algorithm is $O(n \log n)$.
\end{frame}

\begin{frame}
  \begin{center}
    \LARGE{Questions?}
  \end{center}
\end{frame}

\end{document}