view talk/talk.tex @ 60:cd940f75aab6

Finish section 1, two more to go
author Jordi Gutiérrez Hermoso <jordigh@octave.org>
date Tue, 17 May 2016 20:54:41 -0400
parents 57a0f3beaba8
children 73b369370665
line wrap: on
line source

%%% BEGIN BEAMER PREAMBLE %%%
\documentclass[blue]{beamer}
\usepackage{bm, fourier, anyfontsize, xcolor}

% Upright operator names used in the whisker formulas: the medcouple (MC)
% and the interquartile range (IQR).  \DeclareMathOperator is the amsmath
% idiom for named operators and typesets the same as \operatorname.
\DeclareMathOperator{\MC}{MC}
\DeclareMathOperator{\IQR}{IQR}


% Theme selection, applied in presentation modes only.
\mode<presentation>
{
  \usetheme{boxes}
  \usecolortheme{crane}
}
% Hide beamer's navigation symbols.
\beamertemplatenavigationsymbolsempty

% At the start of every \section, insert an "Outline" frame whose table of
% contents highlights the section that is about to begin.
\AtBeginSection[]
{
  \begin{frame}<beamer>
    \frametitle{Outline}
    \tableofcontents[currentsection]
  \end{frame}
}


% NOTE(review): times is loaded after fourier, so its font defaults take
% precedence wherever the two overlap -- confirm this ordering is intended.
\usepackage{times}
%%% END BEAMER PREAMBLE %%%


%%% BEGIN METADATA %%%

\author{Jordi G. H. $\langle$jordigh@octave.org$\rangle$ }


\title{The Medcouple}
\subtitle{A robust measure of skewness}
\date{}

%%% END METADATA%%%

\begin{document}

\begin{frame}
  \titlepage
\end{frame}

\begin{frame}
  \frametitle{Outline}
  \tableofcontents
\end{frame}

\section{Outliers and Boxplots}

\begin{frame}{What is an outlier?}
  \pause
  \begin{center}
    \pgfimage[height=2.5in]{img/normal-boxhistplot.pdf}
  \end{center}
  A simple answer: Tukey's boxplots
\end{frame}

\begin{frame}{Anatomy of a boxplot}
  \begin{overlayarea}{\textwidth}{8cm}
    \only<1>{\pgfimage[width=4in]{img/normal-points}}
    \only<2>{\pgfimage[width=4in]{img/normal-boxplot}}
    \only<3>{\pgfimage[width=4in]{img/normal-boxplot-bare/base}}
    \only<4>{\pgfimage[width=4in]{img/normal-boxplot-bare/median}}
    \only<5>{\pgfimage[width=4in]{img/normal-boxplot-bare/q1q3}}
    \only<6>{\pgfimage[width=4in]{img/normal-boxplot-bare/IQR}}
    \only<7>{\pgfimage[width=4in]{img/normal-boxplot-bare/whiskers}}
    \only<8>{\pgfimage[width=4in]{img/normal-boxplot-bare/15}}
    \only<9>{\pgfimage[width=4in]{img/normal-boxplot-bare/outliers}}
  \end{overlayarea}
\end{frame}

\begin{frame}{Anatomy of a boxplot}
  \begin{itemize}
    \item Why 1.5?
    \pause
    \item Tukey responded: ``it's less than 2 and more than 1''
  \end{itemize}
\end{frame}

\begin{frame}{Outliers}
  \pause
  \begin{center}
    \pgfimage[height=2.5in]{img/normal-boxhistplot}
  \end{center}
  
  The boxplot identifies $10$ outliers out of $1000$ points ($1\%$)
\end{frame}

\begin{frame}{Skew distributions}
  Remember:
  \begin{center}
    \pgfimage[width=4in]{img/skew-distributions}
  \end{center}
\end{frame}

\begin{frame}
  For skew distributions\dots
\end{frame}

\begin{frame}
  \begin{overlayarea}{\textwidth}{8cm}
    \only<1>{
      \pgfimage[height=3in]{img/geometric-boxhistplot}
      
      $433$ outliers out of $10\,000$ points ($4.3\%$)
    }
    \only<2>{
      \pgfimage[height=3in]{img/boys-and-girls}
      
      $578$ and $644$ outliers for actors and actresses respectively
      ($1.2\%$ and $3\%$)
    }
  \end{overlayarea}
\end{frame}

\begin{frame}
  \begin{itemize}
    \item Too many outliers\dots
    \pause
    \item Idea: adjust whisker lengths taking into account skewness:
  \end{itemize}
  \emph{M.~Hubert; E.~Vandervieren (2008). ``An adjusted boxplot for skewed
    distributions''. Computational Statistics and Data Analysis 52
    (12): 5186--5201. doi:10.1016/j.csda.2007.11.008.}
\end{frame}

\begin{frame}{Adjusted boxplot}
  \begin{overlayarea}{\textwidth}{3cm}
    \only<1>{
      Recall normal whiskers:
      % Trick to hide medcouple, use whiteout, so that the text gets
      % positioned the same with or without it.
      \begin{align*}
        \text{lower} &= Q_1 - 1.5 \IQR\textcolor{white}{e^{a \MC}} \\
        \text{higher} &= Q_3 + 1.5 \IQR\textcolor{white}{e^{b \MC}}
      \end{align*}
    }
    \only<2>{
      Instead, use adjusted whiskers:
      \begin{align*}
        \text{lower} &= Q_1 - 1.5 \IQR\textcolor{red}{e^{a \MC}}  \\
        \text{higher} &= Q_3 + 1.5 \IQR\textcolor{red}{e^{b \MC}}
      \end{align*}
      \begin{itemize}
        \item[$\MC$] -- the \emph{medcouple}, a measure of skewness
        \item[$a, b$] -- parameters to fit across some sample distributions
      \end{itemize}
    }
  \end{overlayarea}
\end{frame}

\begin{frame}{Adjusted boxplot}
  For the whiskers, Hubert and Vandervieren recommend:
  \[
  \begin{cases}
    [Q_1 - 1.5 \IQR e^{-4 \MC},  Q_3 + 1.5 \IQR e^{3 \MC}] &\text{if } \MC > 0 \\
    [Q_1 - 1.5 \IQR e^{-3 \MC},  Q_3 + 1.5 \IQR e^{4 \MC}] &\text{if } \MC < 0
  \end{cases}
  \]
  \pause
  Of course, if $\MC = 0$ (no skewness) then no adjustment
\end{frame}

\begin{frame}
  Let's see some adjusted boxplots\dots
\end{frame}

\begin{frame}
  \begin{overlayarea}{\textwidth}{8cm}
    \only<1>{
      \pgfimage[height=3in]{img/geometric-boxhistplot}
      
      $433$ outliers out of $10\,000$ points ($4.3\%$)
    }
    \only<2>{
      \pgfimage[height=3in]{img/geometric-boxhistplot-adjusted}
      
      \textcolor{red}{$25$ outliers} out of $10\,000$ points
      (\textcolor{red}{$0.25\%$}) (\textcolor{blue}{$\MC = 0.25$})
    }
  \end{overlayarea}
\end{frame}

\begin{frame}
  \begin{overlayarea}{\textwidth}{8cm}
    \only<1>{
      \pgfimage[height=3in]{img/normal-boxhistplot}
      
      $10$ outliers out of $1\,000$ points ($1\%$)
    }
    \only<2>{
      \pgfimage[height=3in]{img/normal-boxhistplot-adjusted}
      
      \textcolor{red}{$10$ outliers} out of $1\,000$ points
      (\textcolor{red}{$1\%$}) (\textcolor{blue}{$\MC = 0.0006$})
    }
  \end{overlayarea}
\end{frame}

\begin{frame}
  \begin{overlayarea}{\textwidth}{8cm}
    \only<1>{
      \pgfimage[height=3in]{img/boys-and-girls}

      $578$ and $644$ outliers for actors and actresses respectively
      ($1.2\%$ and $3\%$)
    }
    \only<2>{
      \pgfimage[height=3in]{img/boys-and-girls-adjusted}

      \textcolor{red}{$346$} and \textcolor{red}{$657$} outliers for
      actors and actresses respectively
      (\textcolor{red}{$0.69\%$} and \textcolor{red}{$3\%$})
      (\textcolor{blue}{$\MC = 0.12$} and \textcolor{blue}{$\MC = 0.231$})

    }
  \end{overlayarea}
\end{frame}

\section{The Medcouple}

\begin{frame}
  
\end{frame}


\begin{frame}{Computing the medcouple}
  \begin{center}
    \pgfimage[width=4in]{img/naive/x-orig.png}
  \end{center}

  Take some random numbers $X$.
\end{frame}

\begin{frame}{Computing the medcouple}
  \begin{center}
    \pgfimage[width=4in]{img/naive/x-sorted.png}
  \end{center}

  Sort them.
\end{frame}

\begin{frame}{Computing the medcouple}
  \begin{center}
    \pgfimage[width=4in]{img/naive/sortx-red.png}
  \end{center}

  Pick the median.
\end{frame}

\begin{frame}{Computing the medcouple}
  \begin{overlayarea}{\textwidth}{8cm}
    \only<1>{%
      \begin{center}
        \pgfimage[height=2in]{img/naive/medc-computation-init.png}
      \end{center}
      Split up $X$ into $X^+$ and $X^-$ along the median.}
    \only<2>{%
      \begin{center}
        \pgfimage[height=2in]{img/naive/medc-computation.png}
      \end{center}
      Evaluate the kernel for all couples.
      \[
      \frac{ (x_i^+ - x_m) - (x_m - x_j^-)}{x_i^+ - x_j^-},
      \quad x_i^+ \in X^+, \quad x_j^- \in X^-
      \]}
    \only<3>{%
      \begin{center}
        \pgfimage[height=2in]{img/naive/medc-computation-done.png}
      \end{center}
      The median of this matrix is the medcouple.}
  \end{overlayarea}
\end{frame}

\section{Computation of the Medcouple}

\begin{frame}
  wtf
\end{frame}

\end{document}