Mercurial > hg > medcouple
view talk/talk.tex @ 74:305b7361a5bd default tip @
showalgo: save a snapshot instead of waiting for keyboard input
author | Jordi Gutiérrez Hermoso <jordigh@octave.org> |
---|---|
date | Sun, 29 May 2016 19:05:01 -0400 |
parents | d5ec0659b26b |
children |
line wrap: on
line source
%%% BEGIN BEAMER PREAMBLE %%% \documentclass[blue]{beamer} \usepackage{bm, fourier, anyfontsize, xcolor} \newcommand{\MC}{\operatorname{MC}} \newcommand{\signum}{\operatorname{signum}} \newcommand{\IQR}{\operatorname{IQR}} \theoremstyle{definition} \newtheorem*{defn}{Definition} \mode<presentation> { \usetheme{boxes} \usecolortheme{crane} } \beamertemplatenavigationsymbolsempty \AtBeginSection[] { \begin{frame}<beamer> \frametitle{Outline} \tableofcontents[currentsection] \end{frame} } \usepackage{times} %%% END BEAMER PREAMBLE %%% %%% BEGIN METADATA %%% \author{Jordi G. H. $\langle$jordigh@octave.org$\rangle$ } \title{The Medcouple} \subtitle{A robust measure of skewness} \date{} %%% END METADATA%%% \begin{document} \begin{frame} \titlepage \end{frame} \begin{frame} \frametitle{Outline} \tableofcontents \end{frame} \section{Outliers and Boxplots} \begin{frame}{What is an outlier?} \pause \begin{center} \pgfimage[height=2.5in]{img/normal-boxhistplot.pdf} \end{center} A simple answer: Tukey's boxplots \end{frame} \begin{frame}{Anatomy of a boxplot} \begin{overlayarea}{\textwidth}{8cm} \only<1>{\pgfimage[width=4in]{img/normal-points}} \only<2>{\pgfimage[width=4in]{img/normal-boxplot}} \only<3>{\pgfimage[width=4in]{img/normal-boxplot-bare/base}} \only<4>{\pgfimage[width=4in]{img/normal-boxplot-bare/median}} \only<5>{\pgfimage[width=4in]{img/normal-boxplot-bare/q1q3}} \only<6>{\pgfimage[width=4in]{img/normal-boxplot-bare/IQR}} \only<7>{\pgfimage[width=4in]{img/normal-boxplot-bare/whiskers}} \only<8>{\pgfimage[width=4in]{img/normal-boxplot-bare/15}} \only<9>{\pgfimage[width=4in]{img/normal-boxplot-bare/outliers}} \end{overlayarea} \end{frame} \begin{frame}{Anatomy of a boxplot} \begin{itemize} \item Why 1.5? \pause \item Tukey responded: ``it's less than 2 and more than 1'' \end{itemize} \end{frame} \begin{frame}{Outliers} \pause \begin{center} \pgfimage[height=2.5in]{img/normal-boxhistplot} \end{center} The boxplot identifies $10$ outliers out of $1000$ points ($1\%$) \end{frame} \begin{frame}{Skew distributions} Remember: \begin{center} \pgfimage[width=4in]{img/skew-distributions} \end{center} \end{frame} \begin{frame} For skew distributions... \end{frame} \begin{frame} \begin{overlayarea}{\textwidth}{8cm} \only<1>{ \pgfimage[height=3in]{img/geometric-boxhistplot} $433$ outliers out of $10 000$ points ($4.3\%$) } \only<2>{ \pgfimage[height=3in]{img/boys-and-girls} $578$ and $644$ outliers for actors and actresses respectively ($1.2\%$ and $3\%$) } \end{overlayarea} \end{frame} \begin{frame} \begin{itemize} \item Too many outliers... \pause \item Idea: adjust whisker lengths taking into account skewness: \end{itemize} \emph{M. Hubert; E. Vandervieren (2008). "An adjusted boxplot for skewed distributions". Computational Statistics and Data Analysis 52 (12): 5186-5201. doi:10.1016/j.csda.2007.11.008.} \end{frame} \begin{frame}{Adjusted boxplot} \begin{overlayarea}{\textwidth}{3cm} \only<1>{ Recall normal whiskers: % Trick to hide medcouple, use whiteout, so that the text gets % positioned the same with or without it. \begin{align*} \text{lower} &= Q_1 - 1.5 \IQR\textcolor{white}{e^{a \MC}} \\ \text{higher} &= Q_3 + 1.5 \IQR\textcolor{white}{e^{b \MC}} \end{align*} } \only<2>{ Instead, use adjusted whiskers: \begin{align*} \text{lower} &= Q_1 - 1.5 \IQR\textcolor{red}{e^{a \MC}} \\ \text{higher} &= Q_3 + 1.5 \IQR\textcolor{red}{e^{b \MC}} \end{align*} \begin{itemize} \item[$\MC$] -- the \emph{medcouple}, a measure of skewness \item[$a, b$] -- parameters to fit across some sample distributions \end{itemize} } \end{overlayarea} \end{frame} \begin{frame}{Adjusted boxplot} For the whiskers, Hubert and Vandervieren recommend: \[ \begin{cases} [Q_1 - 1.5 \IQR e^{-3 \MC}, Q_3 + 1.5 \IQR e^{4 \MC}] &\text{if } \MC > 0 \\ [Q_1 - 1.5 \IQR e^{-4 \MC}, Q_3 + 1.5 \IQR e^{3 \MC}] &\text{if } \MC < 0 \end{cases} \] \pause Of course, if $\MC = 0$ (no skewness) then no adjustment \end{frame} \begin{frame} Let's see some adjusted boxplots... \end{frame} \begin{frame} \begin{overlayarea}{\textwidth}{8cm} \only<1>{ \pgfimage[height=3in]{img/geometric-boxhistplot} $433$ outliers out of $10 000$ points ($4.3\%$) } \only<2>{ \pgfimage[height=3in]{img/geometric-boxhistplot-adjusted} \textcolor{red}{$25$ outliers} out of $10 000$ points (\textcolor{red}{$0.25\%$}) (\textcolor{blue}{$\MC = 0.25$}) } \end{overlayarea} \end{frame} \begin{frame} \begin{overlayarea}{\textwidth}{8cm} \only<1>{ \pgfimage[height=3in]{img/normal-boxhistplot} $10$ outliers out of $1 000$ points ($1\%$) } \only<2>{ \pgfimage[height=3in]{img/normal-boxhistplot-adjusted} \textcolor{red}{$10$ outliers} out of $1 000$ points (\textcolor{red}{$1\%$}) (\textcolor{blue}{$\MC = 0.0006$}) } \end{overlayarea} \end{frame} \begin{frame} \begin{overlayarea}{\textwidth}{8cm} \only<1>{ \pgfimage[height=3in]{img/boys-and-girls} $578$ and $644$ outliers for actors and actresses respectively ($1.2\%$ and $3\%$) } \only<2>{ \pgfimage[height=3in]{img/boys-and-girls-adjusted} \textcolor{red}{$346$} and \textcolor{red}{$657$} outliers for actors and actresses respectively (\textcolor{red}{$0.69\%$} and \textcolor{red}{$3\%$}) (\textcolor{blue}{$\MC = 0.12$} and \textcolor{blue}{$\MC = 0.231$}) } \end{overlayarea} \end{frame} \section{The Medcouple} \begin{frame} \emph{G. Brys; M. Hubert; A. Struyf (November 2004). "A Robust Measure of Skewness". Journal of Computational and Graphical Statistics 13 (4): 996-1017. doi:10.1198/106186004X12632.} \end{frame} \begin{frame}{Motivation} Consider the quartile skewness: \[ B_1 = \frac{(Q_3 - Q_2) - (Q_2 - Q_1)}{Q_3 - Q_1} \] $Q_2 = \text{median}$ \end{frame} \begin{frame}{Definition} Idea: compute this kernel over all couples split along the median: \[ h(x_i, x_j) = \begin{cases} \frac{(x_i - m) - (m - x_j)}{x_i - x_j} \\ \signum(p - 1 - i - j) & \text{if } x_i = m = x_j \end{cases} \] \pause where \begin{itemize} \item $m = \text{median}$ \item $x_i \geq m \geq x_j$ \item $p = |\{x_i \geq m\}|$ \end{itemize} \pause \begin{defn} The \emph{medcouple} is the median of the kernel of all couples above. \end{defn} \end{frame} \begin{frame}{Properties} It is easy to see that medcouple is \pause \begin{itemize} \item location-invariant \pause \item scale-invariant \pause \item between $-1$ and $1$ \pause \item a measure of skewness \end{itemize} \end{frame} \begin{frame}{Properties} The medcouple is a \emph{robust} measure of skewness. \pause \begin{defn} A statistic is \emph{robust} if it does not depend on the values of extreme values (outliers). \end{defn} \pause \begin{itemize} \item The median has maximum robustness. Its breakdown point is $50\%$. \pause \item The medcouple's breakdown point is $25\%$. \end{itemize} \end{frame} \begin{frame}{Computing the medcouple} \begin{center} \pgfimage[width=4in]{img/naive/x-orig.png} \end{center} Take some $X$ random numbers. \end{frame} \begin{frame}{Computing the medcouple} \begin{center} \pgfimage[width=4in]{img/naive/x-sorted.png} \end{center} Sort them. \end{frame} \begin{frame}{Computing the medcouple} \begin{center} \pgfimage[width=4in]{img/naive/sortx-red.png} \end{center} Pick the median. \end{frame} \begin{frame}{Computing the medcouple} \begin{overlayarea}{\textwidth}{8cm} \only<1>{ \begin{center} \pgfimage[height=2in]{img/naive/medc-computation-init.png} \end{center} Split up $X$ into $X^+$ and $X^-$ along the median. } \only<2>{ \begin{center} \pgfimage[height=2in]{img/naive/medc-computation.png} \end{center} Evaluate the kernel for all couples. \[ \frac{ (x_i - x_m) - (x_m - x_j)}{x_i - x_j}, \quad x_i \in X^+, \quad x_j \in X^- \] } \only<3>{ \begin{center} \pgfimage[height=2in]{img/naive/medc-computation-done.png} \end{center} The median of this matrix is the medcouple. } \end{overlayarea} \end{frame} \begin{frame} Problem: this algorithm is $O(n^2)$!! \end{frame} \section{Computation of the Medcouple} \begin{frame} Now pure CS: \pause \begin{problem} Find the median of a matrix with sorted rows and sorted columns. \end{problem} \end{frame} \begin{frame} \emph{Donald B. Johnson; Tetsuo Mizoguchi (May 1978). "Selecting The Kth Element In X + Y And X1 + X2 +...+ Xm". SIAM Journal of Computing 7 (2): 147-153. doi:10.1137/0207013.} \pause \begin{itemize} \item Old paper, hard to read. But cool ideas! \pause \item Paper generalises to $n$-dimensional array with sorted slices \end{itemize} \end{frame} \begin{frame}{Preliminaries} Helper algorithms: \pause \begin{description} \item[Selection Algorithm:] e.g. quickselect, \texttt{nth\_element} in C++ or \texttt{partition} in numpy, $O(n)$ time \pause \item[Weighted Median:] binary search using selection algorithm, $O(n)$ time. \end{description} \end{frame} \begin{frame} We now make two important observations \end{frame} \begin{frame}{Two observations} Compare against whole matrix in $O(n)$ time \begin{overlayarea}{\textwidth}{6cm} \only<1>{ \begin{center} \pgfimage[width=2in]{img/kth-pair/greater-than} \end{center} Bottom-up. } \only<2>{ \begin{center} \pgfimage[width=2in]{img/kth-pair/less-than} \end{center} Top-down. } \end{overlayarea} \end{frame} \begin{frame}{Two observations} Compare against half the matrix in $O(1)$ time \begin{center} \pgfimage[width=2in]{img/kth-pair/middle-of-middle} \end{center} \end{frame} \begin{frame}{Putting it together} Iteratively, we have bounds for the global median \begin{center} \pgfimage[width=2in]{img/kth-pair/row-medians} \end{center} \end{frame} \begin{frame}{Putting it together} If we align the row medians... \begin{center} \pgfimage[width=2in]{img/kth-pair/row-medians-aligned} \end{center} \end{frame} \begin{frame}{Putting it together} With the weighted median (in $O(n)$ time), we compare to at least half the remaining entries, discarding at least $1/4$. \begin{center} \pgfimage[width=2in]{img/kth-pair/row-medians-compared} \end{center} \end{frame} \begin{frame} So, final algorithm! \end{frame} \begin{frame}{Grand finale} Procedure (serves one medcouple) \begin{enumerate} \item Compute ingredients for medcouple matrix: $O(n \log n)$ \pause \begin{enumerate}[(A)] \item \label{loop} Compute $t = \text{median of row medians}$: $O(n)$ \pause \item \label{compare} Compare $t$ to the whole matrix: $O(n)$ \pause \item Found the median? Found the medcouple! DONE. \pause \item \label{compare} Use comparison from \eqref{compare} to throw out entries. \pause \item More than $n$ entries remaining? LOOP TO \eqref{loop}. \end{enumerate} \pause \item Use selection algorithm on remaining entries: $O(n)$. \end{enumerate} \pause Step \eqref{compare} throws out at least $1/4$ of all entries, so the whole algorithm is $O(n \log n)$. \end{frame} \begin{frame} \begin{center} \LARGE{Questions?} \end{center} \end{frame} \end{document}