# HG changeset patch # User Jordi Gutiérrez Hermoso # Date 1463532881 14400 # Node ID cd940f75aab69cf281a2695423f162e2207f2ea5 # Parent 049258e4b72a3b55a76648d38cd9c614f5b15063 Finish section 1, two more to go diff --git a/talk/code/plots.py b/talk/code/plots.py --- a/talk/code/plots.py +++ b/talk/code/plots.py @@ -26,6 +26,7 @@ # Do an adjusted boxplot if adjusted: mc = medcouple_1d(data) + print "MC = ", mc iqr = data_stats[0]['iqr'] q1 = data_stats[0]['q1'] q3 = data_stats[0]['q3'] diff --git a/talk/talk.tex b/talk/talk.tex --- a/talk/talk.tex +++ b/talk/talk.tex @@ -1,6 +1,8 @@ %%% BEGIN BEAMER PREAMBLE %%% -\documentclass[green]{beamer} -\usepackage{bm, fourier, anyfontsize} +\documentclass[blue]{beamer} +\usepackage{bm, fourier, anyfontsize, xcolor} +\newcommand{\MC}{\operatorname{MC}} +\newcommand{\IQR}{\operatorname{IQR}} \mode @@ -47,24 +49,184 @@ \section{Outliers and Boxplots} +\begin{frame}{What is an outlier?} + \pause + \begin{center} + \pgfimage[height=2.5in]{img/normal-boxhistplot.pdf} + \end{center} + A simple answer: Tukey's boxplots +\end{frame} + +\begin{frame}{Anatomy of a boxplot} + \begin{overlayarea}{\textwidth}{8cm} + \only<1>{\pgfimage[width=4in]{img/normal-points}} + \only<2>{\pgfimage[width=4in]{img/normal-boxplot}} + \only<3>{\pgfimage[width=4in]{img/normal-boxplot-bare/base}} + \only<4>{\pgfimage[width=4in]{img/normal-boxplot-bare/median}} + \only<5>{\pgfimage[width=4in]{img/normal-boxplot-bare/q1q3}} + \only<6>{\pgfimage[width=4in]{img/normal-boxplot-bare/IQR}} + \only<7>{\pgfimage[width=4in]{img/normal-boxplot-bare/whiskers}} + \only<8>{\pgfimage[width=4in]{img/normal-boxplot-bare/15}} + \only<9>{\pgfimage[width=4in]{img/normal-boxplot-bare/outliers}} + \end{overlayarea} +\end{frame} + +\begin{frame}{Anatomy of a boxplot} + \begin{itemize} + \item Why 1.5? 
+ \pause + \item Tukey responded: ``it's less than 2 and more than 1'' + \end{itemize} +\end{frame} + +\begin{frame}{Outliers} + \pause + \begin{center} + \pgfimage[height=2.5in]{img/normal-boxhistplot} + \end{center} + + The boxplot identifies $10$ outliers out of $1000$ points ($1\%$) +\end{frame} + +\begin{frame}{Skew distributions} + Remember: + \begin{center} + \pgfimage[width=4in]{img/skew-distributions} + \end{center} +\end{frame} + \begin{frame} - What is an outlier? + For skew distributions... +\end{frame} + +\begin{frame} + \begin{overlayarea}{\textwidth}{8cm} + \only<1>{ + \pgfimage[height=3in]{img/geometric-boxhistplot} + + $433$ outliers out of $10 000$ points ($4.3\%$) + } + \only<2>{ + \pgfimage[height=3in]{img/boys-and-girls} + + $578$ and $644$ outliers for actors and actresses respectively + ($1.2\%$ and $3\%$) + } + \end{overlayarea} \end{frame} \begin{frame} - \begin{center} - \pgfimage[width=4.5in,height=3.5in]{img/boys-and-girls} - \end{center} + \begin{itemize} + \item Too many outliers... + \pause + \item Idea: adjust whisker lengths taking into account skewness: + \end{itemize} + \emph{M. Hubert; E. Vandervieren (2008). "An adjusted boxplot for skewed + distributions". Computational Statistics and Data Analysis 52 + (12): 5186-5201. doi:10.1016/j.csda.2007.11.008.} +\end{frame} + +\begin{frame}{Adjusted boxplot} + \begin{overlayarea}{\textwidth}{3cm} + \only<1>{ + Recall normal whiskers: + % Trick to hide medcouple, use whiteout, so that the text gets + % positioned the same with or without it. 
+ \begin{align*} + \text{lower} &= Q_1 - 1.5 \IQR\textcolor{white}{e^{a \MC}} \\ + \text{higher} &= Q_3 + 1.5 \IQR\textcolor{white}{e^{b \MC}} + \end{align*} + } + \only<2>{ + Instead, use adjusted whiskers: + \begin{align*} + \text{lower} &= Q_1 - 1.5 \IQR\textcolor{red}{e^{a \MC}} \\ + \text{higher} &= Q_3 + 1.5 \IQR\textcolor{red}{e^{b \MC}} + \end{align*} + \begin{itemize} + \item[$\MC$] -- the \emph{medcouple}, a measure of skewness + \item[$a, b$] -- parameters to fit across some sample distributions + \end{itemize} + } + \end{overlayarea} +\end{frame} + +\begin{frame}{Adjusted boxplot} + For the whiskers, Hubert and Vandervieren recommend: + \[ + \begin{cases} + [Q_1 - 1.5 \IQR e^{-3 \MC}, Q_3 + 1.5 \IQR e^{4 \MC}] &\text{if} \MC > 0 \\ + [Q_1 - 1.5 \IQR e^{-4 \MC}, Q_3 + 1.5 \IQR e^{3 \MC}] &\text{if} \MC < 0 + \end{cases} + \] + \pause + Of course, if $\MC = 0$ (no skewness) then no adjustment +\end{frame} + +\begin{frame} + Let's see some adjusted boxplots... +\end{frame} + +\begin{frame} + \begin{overlayarea}{\textwidth}{8cm} + \only<1>{ + \pgfimage[height=3in]{img/geometric-boxhistplot} + + $433$ outliers out of $10 000$ points ($4.3\%$) + } + \only<2>{ + \pgfimage[height=3in]{img/geometric-boxhistplot-adjusted} + + \textcolor{red}{$25$ outliers} out of $10 000$ points + (\textcolor{red}{$0.25\%$}) (\textcolor{blue}{$\MC = 0.25$}) + } + \end{overlayarea} +\end{frame} + +\begin{frame} + \begin{overlayarea}{\textwidth}{8cm} + \only<1>{ + \pgfimage[height=3in]{img/normal-boxhistplot} + + $10$ outliers out of $1 000$ points ($1\%$) + } + \only<2>{ + \pgfimage[height=3in]{img/normal-boxhistplot-adjusted} + + \textcolor{red}{$10$ outliers} out of $1 000$ points + (\textcolor{red}{$1\%$}) (\textcolor{blue}{$\MC = 0.0006$}) + } + \end{overlayarea} +\end{frame} + +\begin{frame} + \begin{overlayarea}{\textwidth}{8cm} + \only<1>{ + \pgfimage[height=3in]{img/boys-and-girls} + + $578$ and $644$ outliers for actors and actresses respectively + ($1.2\%$ and $3\%$) + } 
+ \only<2>{ + \pgfimage[height=3in]{img/boys-and-girls-adjusted} + + \textcolor{red}{$346$} and \textcolor{red}{$657$} outliers for + actors and actresses respectively + (\textcolor{red}{$0.69\%$} and \textcolor{red}{$3\%$}) + (\textcolor{blue}{$\MC = 0.12$} and \textcolor{blue}{$\MC = 0.231$}) + + } + \end{overlayarea} \end{frame} \section{The Medcouple} \begin{frame} - omg + \end{frame} -\begin{frame} +\begin{frame}{Computing the medcouple} \begin{center} \pgfimage[width=4in]{img/naive/x-orig.png} \end{center} @@ -72,7 +234,7 @@ Take some $X$ random numbers. \end{frame} -\begin{frame} +\begin{frame}{Computing the medcouple} \begin{center} \pgfimage[width=4in]{img/naive/x-sorted.png} \end{center} @@ -80,7 +242,7 @@ Sort them. \end{frame} -\begin{frame} +\begin{frame}{Computing the medcouple} \begin{center} \pgfimage[width=4in]{img/naive/sortx-red.png} \end{center} @@ -88,7 +250,7 @@ Pick the median. \end{frame} -\begin{frame} +\begin{frame}{Computing the medcouple} \begin{overlayarea}{\textwidth}{8cm} \only<1>{% \begin{center}