% NOTE -- ONLY EDIT THE .Rnw FILE!!! The .tex file is
% likely to be overwritten.
\SweaveOpts{engine=R,eps=FALSE,pdf=TRUE,width=10,height=6.5,strip.white=all}
\SweaveOpts{prefix=TRUE,prefix.string=figs/data,include=TRUE}
\setkeys{Gin}{width=\textwidth}
<>=
# Setup chunk: narrow the printed output so it fits on the slides and
# attach the packages used throughout (lattice graphics, sparse
# matrices, and lme4 for the mixed-effects model fits).
options(width=65)
library(lattice)
library(Matrix)
library(lme4)
#lattice.options(default.theme = standard.theme(color = FALSE))
@
\begin{frame}\frametitle{Organizing data in R}
\begin{itemize}
\item Standard rectangular data sets (columns are variables,
rows are observations) are stored in \R{} as \Emph{data
frames}.
\item The columns can be \Emph{numeric} variables (e.g. measurements
or counts) or \Emph{factor} variables (categorical data) or
\Emph{ordered} factor variables. These types are called the
\Emph{class} of the variable.
\item The \code{str} function provides a concise description of the
structure of a data set (or any other class of object in R). The
\code{summary} function summarizes each variable according to its
class. Both are highly recommended for routine use.
\item Entering just the name of the data frame causes it to be
printed. For large data frames use the \code{head} and \code{tail}
functions to view the first few or last few rows.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{R packages}
\begin{itemize}
\item Packages incorporate functions, data and documentation.
\item You can produce packages for private or in-house use or you
can contribute your package to the Comprehensive R Archive Network
(CRAN), \url{http://cran.us.R-project.org}
\item We will be using the \Emph{lme4} package from CRAN.
Install it from the \Emph{Packages} menu item or with
<>=
install.packages("lme4")
@
\item You only need to install a package once. If a new version
becomes available you can update (see the menu item).
\item To use a package in an R session you attach it using
<>=
require(lme4)
@
or
<>=
library(lme4)
@
(This usage causes widespread confusion of the terms ``package'' and ``library''.)
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Accessing documentation}
\begin{itemize}
\item To be added to CRAN, a package must pass a series of quality
control checks. In particular, all functions and data sets must
be documented. Examples and tests can also be included.
\item The \code{data} function provides names and brief descriptions
of the data sets in a package.
\begin{Schunk}
\begin{Sinput}
> data(package = "lme4")
\end{Sinput}
\begin{Soutput}
Data sets in package 'lme4':
Dyestuff Yield of dyestuff by batch
Dyestuff2 Yield of dyestuff by batch
Pastes Paste strength by batch and cask
Penicillin Variation in penicillin testing
cake Breakage angle of chocolate cakes
cbpp Contagious bovine pleuropneumonia
sleepstudy Reaction times in a sleep deprivation study
\end{Soutput}
\end{Schunk}
\item Use \code{?} followed by the name of a function or data set to
view its documentation. If the documentation contains an example
section, you can execute it with the \code{example} function.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Lattice graphics}
\begin{itemize}
\item One of the strengths of R is its graphics capabilities.
\item There are several styles of graphics in R. The style in
Deepayan Sarkar's \Emph{lattice} package
is well-suited to the type of data we will be discussing.
\item I will not show every piece of code used to produce the data
graphics. The code is available in the script files for the
slides (and sometimes in the example sections of the data set's documentation).
\item Deepayan's book, \Emph{Lattice: Multivariate Data
Visualization with R} (Springer, 2008) provides in-depth
documentation and explanations of lattice graphics.
\item I also recommend Phil Spector's book, \Emph{Data Manipulation
with R} (Springer, 2008).
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{The Dyestuff data set}
\begin{itemize}
\item The \code{Dyestuff}, \code{Penicillin} and \code{Pastes} data
sets all come from the classic book \Emph{Statistical Methods in
Research and Production}, edited by O.L. Davies and first published
in 1947.
\item The \code{Dyestuff} data are a balanced one-way classification
of the \code{Yield} of dyestuff from samples produced from six
\code{Batch}es of an intermediate product. See \code{?Dyestuff}.
\end{itemize}
<>=
str(Dyestuff)
summary(Dyestuff)
@
\end{frame}
\begin{frame}
\frametitle{The effect of the batches}
\begin{itemize}
\item To emphasize that \code{Batch} is categorical, we use letters
instead of numbers to designate the levels.
\item Because there is no inherent ordering of the levels of
\code{Batch}, we will reorder the levels if, say, doing so can
make a plot more informative.
\item The particular batches observed are just a selection of the
possible batches and are entirely used up during the course of
the experiment.
\item It is not particularly important to estimate and compare
yields from these batches. Instead we wish to estimate the
variability in yields due to batch-to-batch variability.
\item The \code{Batch} factor will be used in \Emph{random-effects}
terms in models that we fit.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Dyestuff data plot}
\begin{center}
<>=
# Dot plot of Yield by Batch: reorder() sorts the batches by mean
# yield and type = c("p", "a") adds a line joining the batch means.
# lattice objects must be print()ed explicitly inside Sweave chunks.
print(dotplot(reorder(Batch, Yield) ~ Yield, Dyestuff,
ylab = "Batch", jitter.y = TRUE,
xlab = "Yield of dyestuff (grams of standard color)",
type = c("p", "a")))
@
\end{center}
\begin{itemize}
\item The line joins the mean yields of the six batches, which have
been reordered by increasing mean yield.
\item The vertical positions are jittered slightly to reduce
overplotting. The lowest yield for batch A was observed on two
distinct preparations from that batch.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{A mixed-effects model for the dyestuff yield}
<>=
fm1 <- lmer(Yield ~ 1 + (1|Batch), Dyestuff)
print(fm1)
@
\begin{itemize}
\item Fitted model \code{fm1} has one fixed-effect parameter, the mean
yield, and one random-effects term, generating a simple, scalar
random effect for each level of \code{Batch}.
\end{itemize}
\end{frame}
<>=
op <- options(digits=5)
@
\begin{frame}[fragile]
\frametitle{Extracting information from the fitted model}
\begin{itemize}
\item \code{fm1} is an object of class \code{"mer"} (mixed-effects
representation).
\item There are many \Emph{extractor} functions that can be applied
to such objects.
\end{itemize}
<>=
fixef(fm1)
ranef(fm1, drop = TRUE)
fitted(fm1)
@
\end{frame}
<>=
options(op)
@
\begin{frame}[fragile]
\frametitle{Definition of linear mixed-effects models}
\begin{itemize}
\item A mixed-effects model incorporates two vector-valued random
variables: the response, $\bm{\mathcal{Y}}$, and the random effects,
$\bm{\mathcal{B}}$. We observe the value, $\bm y$, of
$\bm{\mathcal{Y}}$. We do not observe the value of
$\bm{\mathcal{B}}$.
\item In a \Emph{linear mixed-effects model} the conditional distribution,
$\bm{\mathcal{Y}}|\bm{\mathcal{B}}$, and the marginal distribution,
$\bm{\mathcal{B}}$, are independent, multivariate normal (or
``Gaussian'') distributions,
\end{itemize}
\begin{displaymath}
\left(\bm{\mathcal{Y}}|\bm{\mathcal{B}}=\bm b\right)\sim\mathcal{N}\left(\bm
X\bm\beta+\bm Z\bm b,\sigma^2\bm I\right),\quad
\bm{\mathcal{B}}\sim\mathcal{N}\left(\bm 0,\sigma^2\bm\Sigma\right),
\quad(\bm{\mathcal{Y}}|\bm{\mathcal{B}})\perp\bm{\mathcal{B}} .
\end{displaymath}
\begin{itemize}
\item The scalar $\sigma$ is the \Emph{common scale parameter}; the
$p$-dimensional $\bm\beta$ is the \Emph{fixed-effects parameter};
the $n\times p$ $\bm X$ and the $n\times q$ $\bm Z$ are known, fixed
\Emph{model matrices}; and the $q\times q$ \Emph{relative
variance-covariance matrix} $\bm\Sigma(\bm\theta)$ is a positive
semidefinite, symmetric matrix that depends on the
parameter $\bm\theta$.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Conditional modes of the random effects}
\begin{itemize}
\item Technically we do not provide ``estimates'' of the random
effects because they are not parameters.
\item One answer to the question, ``so what are those numbers
anyway?'' is that they are BLUPs (Best Linear Unbiased Predictors)
but that answer is not informative and the concept does not
generalize.
\item A better answer is that those values are the conditional
means, $\mathsf{E}[\bm{\mathcal{B}}|\bm{\mathcal{Y}}=\bm y]$,
evaluated at the estimated parameters. Regrettably, we can only
evaluate the conditional means for linear mixed models.
\item However, these values are also the conditional modes and that
concept does generalize to other types of mixed models.
\item For linear mixed models we can evaluate the conditional
standard deviations of these random variables and plot a
prediction interval. These intervals can be arranged in a normal
probability plot, sometimes called a ``caterpillar plot''.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Caterpillar plot for fm1}
\begin{center}
<>=
print(qqmath(ranef(fm1, postVar = TRUE), strip = FALSE)[[1]])
@
\end{center}
\end{frame}
\begin{frame}[fragile]
\frametitle{Mixed-effects model formulas}
\begin{itemize}
\item In \code{lmer} the model is specified by the \code{formula}
argument. As in most R model-fitting functions, this is the first
argument.
\item The model formula consists of two expressions separated by the
\code{$\sim$} symbol.
\item The expression on the left, typically the name of a variable, is
evaluated as the response.
\item The right-hand side consists of one or more \Emph{terms} separated
by `\code{+}' symbols.
\item A random-effects term consists of two expressions separated by
the vertical bar, (`\code{|}'), symbol (read as ``given'' or
``by''). Typically, such terms are enclosed in parentheses.
\item The expression on the right of the `\code{|}' is evaluated as a factor,
which we call the \Emph{grouping factor} for that term.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Simple, scalar random-effects terms}
\begin{itemize}
\item In a \Emph{simple, scalar} random-effects term, the expression
on the left of the `\code{|}' is `\code{1}'. Such a term generates
one random effect (i.e. a scalar) for each level of the grouping
factor.
\item Each random-effects term contributes a set of columns to $\bm
Z$. For a simple, scalar r.e. term these are the indicator
columns for the levels of the grouping factor. The transpose of
the \code{Batch} indicators is
\end{itemize}
<>=
with(Dyestuff, as(Batch, "sparseMatrix"))
@
\end{frame}
\begin{frame}
\frametitle{Formulation of the marginal variance matrix}
\begin{itemize}
\item In addition to determining $\bm Z$, the random effects terms
determine the form and parameterization of the relative
variance-covariance matrix, $\bm\Sigma(\bm\theta)$.
\item The parameterization is based on a modified ``LDL$^\prime$''
Cholesky factorization
\begin{displaymath}
\bm\Sigma=\bm T\bm S\bm S\trans\bm T\trans
\end{displaymath}
where $\bm T$ is a $q\times q$ unit lower \Emph{T}riangular matrix
and $\bm S$ is a $q\times q$ diagonal \Emph{S}cale matrix with
nonnegative diagonal elements.
\item $\bm\Sigma$, $\bm T$ and $\bm S$ are all block-diagonal, with
blocks corresponding to the random-effects terms.
\item The diagonal block of $\bm T$ for a scalar random effects term
is the identity matrix, $\bm I$, and the block in $\bm S$ is a
nonnegative multiple of $\bm I$.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Verbose fitting, extracting T and S}
\begin{itemize}
\item The optional argument \code{verbose = TRUE} causes \code{lmer}
to print iteration information during the optimization of the
parameter estimates.
\item The quantity being minimized is the \Emph{profiled deviance}
of the model. The deviance is negative twice the log-likelihood.
It is profiled in the sense that it is a function of $\bm\theta$
only --- $\bm\beta$ and $\sigma$ are at their conditional estimates.
\item If you want to see exactly how the parameters $\bm\theta$
generate $\bm\Sigma$, use \code{expand} to obtain a list
with components \code{sigma}, \code{T} and \code{S}. The list
also contains a permutation matrix \code{P} whose role we will
discuss later.
\item $\bm T$, $\bm S$ and $\bm\Sigma$ can be very large but are
always highly patterned. The \code{image} function can be used to
examine their structure.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Obtain the verbose output for fitting fm1}
<>=
invisible(update(fm1, verbose = TRUE))
@
\begin{itemize}
\item The first number on each line is the iteration count --- iteration
0 is at the starting value for $\bm\theta$.
\item The second number is the profiled deviance --- the criterion
to be minimized at the estimates.
\item The third and subsequent numbers are the parameter vector $\bm\theta$.
\end{itemize}
\end{frame}
<>=
op <- options(digits = 5)
@
\begin{frame}[fragile]
\frametitle{Extract T and S}
\begin{itemize}
\item As previously indicated, $\bm T$ and $\bm S$ from \code{fm1}
are boring.
\end{itemize}
<>=
efm1 <- expand(fm1)
efm1$S
efm1$T
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Reconstructing $\bm\Sigma$}
<>=
(fm1S <- tcrossprod(efm1$T %*% efm1$S))
@
\begin{center}
<>=
# Images of T, S and the reconstructed Sigma for fm1, printed side by
# side.  The position rectangles overlap slightly to tighten spacing;
# more = TRUE keeps the plot device open until the final panel.
print(image(efm1[["T"]], xlab = NULL, ylab = NULL, sub = "T"),
position = c(0,0,0.36,1), more = TRUE)
print(image(efm1[["S"]], xlab = NULL, ylab = NULL, sub = "S"),
position = c(0.32,0,0.68,1), more = TRUE)
print(image(fm1S, xlab = NULL, ylab = NULL, sub = expression(Sigma)),
position = c(0.64,0,1,1))
@
\end{center}
\end{frame}
<>=
options(op)
@
\begin{frame}
\frametitle{REML estimates versus ML estimates}
\begin{itemize}
\item The default parameter estimation criterion for linear mixed
models is restricted (or ``residual'') maximum likelihood (REML).
\item Maximum likelihood (ML) estimates (sometimes called ``full
maximum likelihood'') can be requested by specifying \code{REML =
FALSE} in the call to \code{lmer}.
\item Generally REML estimates of variance components are
preferred. ML estimates are known to be biased. Although REML
estimates are not guaranteed to be unbiased, they are usually less
biased than ML estimates.
\item Roughly the difference between REML and ML estimates of
variance components is comparable to estimating $\sigma^2$ in a
fixed-effects regression by $\mathit{SSR}/(n-p)$ versus
$\mathit{SSR}/n$, where $\mathit{SSR}$ is the residual sum of
squares.
\item For a balanced, one-way classification like the
\code{Dyestuff} data, the REML and ML estimates of the fixed-effects
are identical.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Re-fitting the model for ML estimates}
<>=
(fm1M <- update(fm1, REML = FALSE))
@
(The extra parentheses around the assignment cause the value to
be printed. Generally the results of assignments are not printed.)
\end{frame}
\begin{frame}[fragile]
\frametitle{Recap of the Dyestuff model}
\begin{itemize}
\item The model is fit as
<>=
fm1@call
@
\item There is one random-effects term, \code{(1|Batch)}, in the model
formula. It is a simple, scalar term for the grouping factor
\code{Batch} with $n_1=6$ levels. Thus $q=6$.
\item The model matrix $\bm Z$ is the $30\times 6$ matrix of
indicators of the levels of \code{Batch}.
\item The relative variance-covariance matrix, $\bm\Sigma$, is a
nonnegative multiple of the $6\times 6$ identity matrix $\bm I_6$.
\item The fixed-effects parameter vector, $\bm\beta$, is of length
$p=1$. All the elements of the $30\times 1$ model matrix $\bm X$
are unity.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{The Penicillin data (also check the ?Penicillin description)}
<>=
str(Penicillin)
xtabs(~ sample + plate, Penicillin)
@
\begin{itemize}
\item These are measurements of the potency (measured by the diameter
of a clear area on a Petri dish) of penicillin samples in a
balanced, unreplicated two-way crossed classification with the test
medium, \code{plate}.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Penicillin data plot}
\begin{center}
<>=
# Dot plot of inhibition-zone diameter by plate (plates reordered by
# mean diameter), grouped by penicillin sample; auto.key adds a
# six-column legend and lines join the per-sample means.
print(dotplot(reorder(plate, diameter) ~ diameter, Penicillin, groups = sample,
ylab = "Plate", xlab = "Diameter of growth inhibition zone (mm)",
type = c("p", "a"), auto.key = list(columns = 6, lines = TRUE)))
@
\end{center}
\end{frame}
\begin{frame}[fragile]
\frametitle{Model with crossed simple random effects for Penicillin}
<>=
(fm2 <- lmer(diameter ~ 1 + (1|plate) + (1|sample), Penicillin))
@
\end{frame}
<>=
op <- options(digits = 5)
@
\begin{frame}[fragile]
\frametitle{Fixed and random effects for fm2}
\begin{itemize}
\item The model for the $n=144$ observations has $p=1$ fixed-effects
parameter and $q=30$ random effects from $k=2$ random effects
terms in the formula.
\end{itemize}
<>=
fixef(fm2)
ranef(fm2, drop = TRUE)
@
\end{frame}
<>=
options(op)
@
\begin{frame}[fragile]
\frametitle{Prediction intervals for random effects}
<>=
# Caterpillar plots (normal probability plots with prediction
# intervals) of the conditional modes of the random effects for fm2;
# the two grouping factors (plate, sample) are printed side by side.
qrr2 <- qqmath(ranef(fm2, postVar = TRUE), strip = FALSE)
print(qrr2[[1]], pos = c(0,0,0.5,1), more = TRUE)
print(qrr2[[2]], pos = c(0.5,0,1,1))
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Model matrix Z for fm2}
\begin{itemize}
\item Because the model matrix $\bm Z$ is generated from $k=2$
simple, scalar random effects terms, it consists of two sets of
indicator columns.
\item The structure of $\bm Z\trans$ is shown below. (Generally we
will show the transpose of these model matrices --- they fit better
on slides.)
\end{itemize}
\begin{center}
<>=
print(image(fm2@Zt, xlab = NULL, ylab = NULL, sub = "Z'"))
@
\end{center}
\end{frame}
\begin{frame}
\frametitle{Models with crossed random effects}
\begin{itemize}
\item Many people believe that mixed-effects models are equivalent
to hierarchical linear models (HLMs) or ``multilevel models''.
This is not true. The \code{plate} and \code{sample} factors in
\code{fm2} are crossed. They do not represent levels in a hierarchy.
\item There is no difficulty in defining and fitting models with
crossed random effects (meaning random-effects terms whose
grouping factors are crossed).
\item Crossing of random effects can affect the speed with which a
model can be fit.
\item The crucial calculation in each \code{lmer} iteration is
evaluation of the sparse, lower triangular, Cholesky factor, $\bm
L(\bm\theta)$, that satisfies
\begin{displaymath}
\bm L(\bm\theta)\bm L(\bm\theta)\trans =
\bm P(\bm A(\bm\theta)\bm A(\bm\theta)\trans + \bm I_q)\bm P\trans
\end{displaymath}
from $\bm A(\bm\theta)\trans=\bm Z\bm T(\bm\theta)\bm
S(\bm\theta)$. Crossing of grouping factors increases the number
of nonzeros in $\bm A\bm A\trans$ and also causes some
``fill-in'' when creating $\bm L$ from $\bm A$.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{All HLMs are mixed models but not vice-versa}
\begin{itemize}
\item Even though Raudenbush and Bryk do discuss models for crossed
factors in their HLM book, such models are not hierarchical.
\item Experimental situations with crossed random factors, such as
``subject'' and ``stimulus'', are common. We can and should model
such data according to its structure.
\item In longitudinal studies of subjects in social contexts (e.g.
students in classrooms or in schools) we almost always have partial
crossing of the subject and the context factors, meaning that, over
the course of the study, a particular student may be observed in
more than one class (partial crossing) but not all students are
observed in all classes. The student and class factors are
neither fully crossed nor strictly nested.
\item For longitudinal data, ``nested'' is only important if it means
``nested across time''. ``Nested at a particular time'' doesn't
count.
\item The \code{lme4} package in \R{} is different from most other
software for fitting mixed models in that it handles fully crossed
and partially crossed random effects gracefully.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Images of some of the $q\times q$ matrices for fm2}
\begin{itemize}
\item Because both random-effects terms are scalar terms, $\bm T$ is
a block-diagonal matrix of two blocks, both of which are identity
matrices. Hence $\bm T=\bm I_q$.
\item For this model it is also the case that $\bm P = \bm I_q$.
\item $\bm S$ consists of two diagonal blocks, both of which are
multiples of an identity matrix. The multiples are different.
\end{itemize}
\begin{center}
<>=
# Images of S, AA' and the sparse Cholesky factor L for fm2, printed
# side by side with slightly overlapping position rectangles.
print(image(expand(fm2)$S, xlab = NULL, ylab = NULL, sub = "S"),
position = c(0,0,0.36,1), more = TRUE)
print(image(tcrossprod(fm2@A), xlab = NULL, ylab = NULL, sub = "AA'"),
position = c(0.32,0,0.68,1), more = TRUE)
print(image(fm2@L, xlab = NULL, ylab = NULL, sub = "L", colorkey = FALSE),
position = c(0.64,0,1,1))
@
\end{center}
\end{frame}
\begin{frame}[fragile]
\frametitle{Recap of the Penicillin model}
\begin{itemize}
\item The model formula is
<>=
fm2@call[["formula"]]
@
\item There are two random-effects terms, \code{(1|plate)} and
\code{(1|sample)}. Both are simple, scalar ($q_1=q_2=1$) random
effects terms, with $n_1=24$ and $n_2=6$ levels, respectively.
Thus $q=q_1n_1+q_2n_2=30$.
\item The model matrix $\bm Z$ is the $144\times 30$ matrix created
from two sets of indicator columns.
\item The relative variance-covariance matrix, $\bm\Sigma$, is block
diagonal in two blocks that are nonnegative multiples of identity
matrices. The matrices $\bm A\bm A\trans$ and $\bm L$ show the
crossing of the factors. $\bm L$ has some fill-in relative to $\bm
A\bm A\trans$.
\item The fixed-effects parameter vector, $\bm\beta$, is of length
$p=1$. All the elements of the $144\times 1$ model matrix $\bm X$
are unity.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{The Pastes data (also check the ?Pastes description)}
<>=
str(Pastes)
xtabs(~ batch + sample, Pastes, sparse = TRUE)
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Structure of the Pastes data}
\begin{itemize}
\item The \code{sample} factor is nested within the \code{batch}
factor. Each sample is from one of three casks selected from a
particular batch.
\item Note that there are 30, not 3, distinct samples.
\item We can label the casks as `a', `b' and `c' but then the
\code{cask} factor by itself is meaningless (because cask `a' in
batch `A' is unrelated to cask `a' in batches `B', `C', $\dots$).
The \code{cask} factor is only meaningful within a \code{batch}.
\item Only the \code{batch} and \code{cask} factors, which are
apparently crossed, were present in the original data set.
\code{cask} may be described as being nested within \code{batch}
but that is not reflected in the data. It is \Emph{implicitly
nested}, not explicitly nested.
\item You can save yourself a lot of grief by immediately creating
the explicitly nested factor. The recipe is
\end{itemize}
<>=
# Create the explicitly nested factor: batch:cask forms the
# interaction of the two factors, and [drop = TRUE] removes any
# unused level combinations, leaving one level per distinct sample.
Pastes <- within(Pastes, sample <- (batch:cask)[drop = TRUE])
@
\end{frame}
\begin{frame}
\frametitle{Avoid implicitly nested representations}
\begin{itemize}
\item The \code{lme4} package allows for very general model
specifications. It does not require that factors associated with
random effects be hierarchical or ``multilevel'' factors in the
design.
\item The same model specification can be used for data with nested
or crossed or partially crossed factors. Nesting or crossing is
determined from the structure of the factors in the data, not the
model specification.
\item You can avoid confusion about nested and crossed factors by
following one simple rule: ensure that different levels of a
factor in the experiment correspond to different labels of the
factor in the data.
\item Samples were drawn from 30, not 3, distinct casks in this
experiment. We should specify models using the \code{sample}
factor with 30 levels, not the \code{cask} factor with 3 levels.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Pastes data plot}
<>=
# Reorder batches by mean strength (bb) and samples by mean strength
# within batch order (ss), then plot strength by sample in a
# one-column trellis, one panel per batch, with free y-scales;
# type = c("p", "a") joins the sample means with a line.
Pastes <- within(Pastes, bb <- reorder(batch, strength))
Pastes <- within(Pastes, ss <- reorder(reorder(sample, strength),
as.numeric(batch)))
print(dotplot(ss ~ strength | bb, Pastes,
strip = FALSE, strip.left = TRUE, layout = c(1, 10),
scales = list(y = list(relation = "free")),
ylab = "Sample within batch", type = c("p", "a"),
xlab = "Paste strength", jitter.y = TRUE))
@
\end{frame}
\begin{frame}[fragile]
\frametitle{A model with nested random effects}
<>=
(fm3 <- lmer(strength ~ 1 + (1|batch) + (1|sample), Pastes))
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Random effects from model fm3}
<>=
qrr3 <- qqmath(ranef(fm3, postVar = TRUE), strip = FALSE)
print(qrr3[[1]], pos = c(0,0,0.5,1), more = TRUE)
print(qrr3[[2]], pos = c(0.5,0,1,1))
@
\begin{itemize}
\item This plot and the data plot both show that the sample-to-sample
variability dominates the batch-to-batch variability. We will
return to this observation later.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Dimensions and relationships in fm3}
\begin{itemize}
\item There are $n=60$ observations, $p=1$ fixed-effects parameter,
$k=2$ simple, scalar random-effects terms ($q_1=q_2=1$) with
grouping factors having $n_1=30$ and $n_2=10$ levels.
\item Because both random-effects terms are scalar terms, $\bm T=\bm
I_{40}$ and $\bm S$ is block-diagonal in two diagonal blocks of
sizes $30$ and $10$, respectively. $\bm Z$ is generated from two
sets of indicators.
\end{itemize}
\begin{center}
<>=
print(image(fm3@Zt, xlab = NULL, ylab = NULL, sub = NULL))
@
\end{center}
\end{frame}
\begin{frame}[fragile]
\frametitle{Images of some of the $q\times q$ matrices for fm3}
\begin{itemize}
\item The permutation $\bm P$ has two purposes: reduce fill-in and
``post-order'' the columns to keep nonzeros near the diagonal.
\item In a model with strictly nested grouping factors there will
be no fill-in. The permutation $\bm P$ is chosen for
post-ordering only.
\end{itemize}
\begin{center}
<>=
# Images of AA', the fill-reducing permutation P, and the sparse
# Cholesky factor L for fm3, printed side by side.
print(image(tcrossprod(fm3@A), xlab = NULL, ylab = NULL, sub = "AA'"),
position = c(0,0,0.36,1), more = TRUE)
print(image(expand(fm3)$P, xlab = NULL, ylab = NULL, sub = "P"),
position = c(0.32,0,0.68,1), more = TRUE)
print(image(fm3@L, xlab = NULL, ylab = NULL, sub = "L", colorkey = FALSE),
position = c(0.64,0,1,1))
@
\end{center}
\end{frame}
\begin{frame}
\frametitle{Eliminate the random-effects term for batch?}
\begin{itemize}
\item We have seen that there is little batch-to-batch variability
beyond that induced by the variability of samples within batches.
\item We can fit a reduced model without that term and compare it to
the original model.
\item Somewhat confusingly, model comparisons from likelihood ratio
tests are obtained by calling the \code{anova} function on the two
models. (Put the simpler model first in the call to \code{anova}.)
\item Sometimes likelihood ratio tests can be evaluated using the REML
criterion and sometimes they can't. Instead of learning the rules
of when you can and when you can't, it is easiest always to refit the
models with \code{REML = FALSE} before comparing.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Comparing ML fits of the full and reduced models}
<>=
# Refit the full model by ML (REML = FALSE), fit the reduced model
# without the batch term, then compare with a likelihood-ratio test;
# the simpler model is listed first in the call to anova().
fm3M <- update(fm3, REML = FALSE)
fm4M <- lmer(strength ~ 1 + (1|sample), Pastes, REML = FALSE)
anova(fm4M, fm3M)
@
\end{frame}
\begin{frame}
\frametitle{p-values of LR tests on variance components}
\begin{itemize}
\item The likelihood ratio is a reasonable criterion for comparing
these two models. However, the theory behind using a $\chi^2$
distribution with 1 degree of freedom as a reference distribution
for this test statistic does not apply in this case. The null
hypothesis is on the boundary of the alternative hypothesis.
\item Even at the best of times, the p-values for such tests are only
approximate because they are based on the asymptotic behavior of the
test statistic. To carry the argument further, all results in
statistics are based on models and, as George Box famously said,
``All models are wrong; some models are useful.''
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{LR tests on variance components (cont'd)}
\begin{itemize}
\item In this case the problem with the boundary condition results in
a p-value that is larger than it would be if, say, you compared this
likelihood ratio to values obtained for data simulated from the null
hypothesis model. We say these results are ``conservative''.
\item As a rule of thumb, the p-value for a simple, scalar term is
roughly twice as large as it should be.
\item In this case, dividing the p-value in half would not affect our
conclusion.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Updated model, REML estimates}
<>=
(fm4 <- update(fm4M, REML = TRUE))
@
\end{frame}
\begin{frame}
\frametitle{Recap of the analysis of the Pastes data}
\begin{itemize}
\item The data consist of $n=60$ observations on $q_1=30$ samples
nested within $q_2=10$ batches.
\item The data are labelled with a \code{cask} factor with $3$
levels but that is an implicitly nested factor. Create the
explicit factor \code{sample} and ignore \code{cask} from then
on.
\item Specification of a model for nested factors is exactly the
same as specification of a model with crossed or partially crossed
factors --- provided that you avoid using implicitly nested factors.
\item In this case the \code{batch} factor was inert --- it did not
``explain'' substantial variability in addition to that attributed
to the \code{sample} factor. We therefore prefer the simpler model.
\item At the risk of ``beating a dead horse'', notice that, if we had
used the \code{cask} factor in some way, we would still need to
create a factor like \code{sample} to be able to reduce the
model. The \code{cask} factor is only meaningful within \code{batch}.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Recap of simple, scalar random-effects terms}
\begin{itemize}
\item For the \code{lmer} function (and also for \code{glmer} and
\code{nlmer}) a simple, scalar random effects term is of the form
\code{(1|F)}.
\item The number of random effects generated by the $i$th such
term is the number of levels, $n_i$, of \code{F} (after dropping
``unused'' levels --- those that do not occur in the data. The idea
of having such levels is not as peculiar as it may seem if, say,
you are fitting a model to a subset of the original data.)
\item Such a term contributes $n_i$ columns to $\bm Z$. These
columns are the indicator columns of the grouping factor.
\item Such a term contributes a diagonal block $\bm I_{n_i}$ to $\bm
T$. If all random effects terms are scalar terms then $\bm T=\bm
I$.
\item Such a term contributes a diagonal block $c_i\bm I_{n_i}$ to
$\bm S$. The multipliers $c_i$ can be different for different
terms. The term contributes exactly one element (which is $c_i$)
to $\bm\theta$.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{This is all very nice, but $\dots$}
\begin{itemize}
\item These methods are interesting but the results are not really
new. Similar results are quoted in \Emph{Statistical Methods in
Research and Production}, which is a very old book.
\item The approach described in that book is actually quite
sophisticated, especially when you consider that the methods
described there, based on observed and expected mean squares, are
for hand calculation (in pre-calculator days)!
\item Why go to all the trouble of working with sparse matrices and
all that if you could get the same results with paper and pencil?
The one-word answer is \Emph{balance}.
\item Those methods depend on the data being balanced. The design
must be completely balanced and the resulting data must also be
completely balanced.
\item Balance is fragile. Even if the design is balanced, a single
missing or questionable observation destroys the balance.
Observational studies (as opposed to, say, laboratory experiments)
cannot be expected to yield balanced data sets.
\item Also, the models involve only simple, scalar random effects
and do not incorporate covariates.
\end{itemize}
\end{frame}
\begin{frame}[fragile]\frametitle{A large observational data set}
\begin{itemize}
\item A major university (not mine) provided data on the grade point
score (\code{gr.pt}) by student (\code{id}), instructor
(\code{instr}) and department (\code{dept}) from a 10 year period.
I regret that I cannot make these data available to others.
\item These factors are unbalanced and partially crossed.
\end{itemize}
\begin{Schunk}
\begin{Sinput}
> str(anon.grades.df)
\end{Sinput}
\begin{Soutput}
'data.frame': 1721024 obs. of 9 variables:
$ instr : Factor w/ 7964 levels "10000","10001",..: 1 1 1 1 1 1 1 1 1 1 ...
$ dept : Factor w/ 106 levels "AERO","AFAM",..: 43 43 43 43 43 43 43 43 43 43 ...
$ id : Factor w/ 54711 levels "900000001","900000002",..: 12152 1405 23882 18875 18294 20922 4150 13540 5499 6425 ...
$ nclass : num 40 29 33 13 47 49 37 14 21 20 ...
$ vgpa : num NA NA NA NA NA NA NA NA NA NA ...
$ rawai : num 2.88 -1.15 -0.08 -1.94 3.00 ...
$ gr.pt : num 4 1.7 2 0 3.7 1.7 2 4 2 2.7 ...
$ section : Factor w/ 70366 levels "19959 AERO011A001",..: 18417 18417 18417 18417 9428 18417 18417 9428 9428 9428 ...
$ semester: num 19989 19989 19989 19989 19972 ...
\end{Soutput}
\end{Schunk}%$
\end{frame}
\begin{frame}[fragile]\frametitle{A preliminary model}
\begin{Schunk}
\begin{Soutput}
Linear mixed model fit by REML
Formula: gr.pt ~ (1 | id) + (1 | instr) + (1 | dept)
Data: anon.grades.df
AIC BIC logLik deviance REMLdev
3447389 3447451 -1723690 3447374 3447379
Random effects:
Groups Name Variance Std.Dev.
id (Intercept) 0.3085 0.555
instr (Intercept) 0.0795 0.282
dept (Intercept) 0.0909 0.301
Residual 0.4037 0.635
Number of obs: 1685394, groups: id, 54711; instr, 7915; dept, 102
Fixed effects:
Estimate Std. Error t value
(Intercept) 3.1996 0.0314 102
\end{Soutput}
\end{Schunk}
\end{frame}
\begin{frame}
\frametitle{Comments on the model fit}
\begin{itemize}
\item $n=1685394$, $p=1$, $k=3$, $n_1=54711$, $n_2=7915$, $n_3=102$,
$q_1=q_2=q_3=1$, $q=62728$
\item This model is sometimes called the ``unconditional'' model in
that it does not incorporate covariates beyond the grouping factors.
\item It takes less than an hour to fit an ``unconditional'' model
with random effects for student (\code{id}), instructor
(\code{instr}) and department (\code{dept}) to these data.
\item Naturally, this is just the first step. We want to look at
possible time trends and the possible influences of the
covariates.
\end{itemize}
\end{frame}