\documentclass[12pt]{article}
\usepackage{Sweave,amsmath,amsfonts,bm}
\usepackage[authoryear,round]{natbib}
\bibliographystyle{plainnat}
\DeclareMathOperator{\tr}{tr}
\DefineVerbatimEnvironment{Sinput}{Verbatim}
{formatcom={\vspace{-1ex}},fontshape=sl,
  fontfamily=courier,fontseries=b, fontsize=\footnotesize}
\DefineVerbatimEnvironment{Soutput}{Verbatim}
{formatcom={\vspace{-1ex}},fontfamily=courier,fontseries=b,%
  fontsize=\footnotesize}
%%\VignetteIndexEntry{PLS vs GLS for LMMs}
%%\VignetteDepends{lme4}
\title{Penalized least squares versus generalized least squares
  representations of linear mixed models}
\author{Douglas Bates\\Department of Statistics\\%
  University of Wisconsin -- Madison}
\SweaveOpts{engine=R,eps=FALSE,pdf=TRUE,strip.white=true,keep.source=TRUE}
\SweaveOpts{include=FALSE}
\setkeys{Gin}{width=\textwidth}
\newcommand{\code}[1]{\texttt{\small #1}}
\newcommand{\package}[1]{\textsf{\small #1}}
\newcommand{\trans}{\ensuremath{^\prime}}
\begin{document}
<<preliminaries,echo=FALSE,results=hide>>=
options(width=65,digits=5)
@
\maketitle
\begin{abstract}
33
The methods in the \code{lme4} package for \code{R} for fitting
34
linear mixed models are based on sparse matrix methods, especially
35
the Cholesky decomposition of sparse positive-semidefinite matrices,
36
in a penalized least squares representation of the conditional model
37
for the response given the random effects. The representation is
38
similar to that in Henderson's mixed-model equations. An
39
alternative representation of the calculations is as a generalized
40
least squares problem. We describe the two representations, show
41
the equivalence of the two representations and explain why we feel
42
that the penalized least squares approach is more versatile and more
43
computationally efficient.
\section{Definition of the model}
\label{sec:Definition}

We consider linear mixed models in which the random effects are
represented by a $q$-dimensional random vector, $\bm{\mathcal{B}}$, and
the response is represented by an $n$-dimensional random vector,
$\bm{\mathcal{Y}}$.  We observe a value, $\bm y$, of the response.  The
random effects are unobserved.
For our purposes, we will assume a ``spherical'' multivariate normal
conditional distribution of $\bm{\mathcal{Y}}$, given
$\bm{\mathcal{B}}$.  That is, we assume the variance-covariance matrix
of $\bm{\mathcal{Y}}|\bm{\mathcal{B}}$ is simply $\sigma^2\bm I_n$,
where $\bm I_n$ denotes the identity matrix of order $n$.  (The term
``spherical'' refers to the fact that contours of the conditional
density are concentric spheres.)
The conditional mean of $\bm{\mathcal{Y}}$, given $\bm{\mathcal{B}}=\bm b$,
$\mathrm{E}[\bm{\mathcal{Y}}|\bm{\mathcal{B}}=\bm b]$, is a linear
function of $\bm b$ and the $p$-dimensional fixed-effects parameter,
$\bm\beta$,
\begin{equation}
  \mathrm{E}[\bm{\mathcal{Y}}|\bm{\mathcal{B}}=\bm b]=
  \bm X\bm\beta+\bm Z\bm b ,
\end{equation}
where $\bm X$ and $\bm Z$ are known model matrices of sizes $n\times
p$ and $n\times q$, respectively.  Thus
\begin{equation}
  \label{eq:yconditional}
  \bm{\mathcal{Y}}|\bm{\mathcal{B}}\sim
  \mathcal{N}\left(\bm X\bm\beta+\bm Z\bm b,\sigma^2\bm I_n\right) .
\end{equation}
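As a small numerical illustration of (\ref{eq:yconditional}), the
following chunk simulates a response from the conditional model for
arbitrary, purely illustrative values of $\bm\beta$, $\bm b$ and
$\sigma$; it is a sketch only, not part of the model-fitting
calculations.
<<condsim,eval=FALSE>>=
set.seed(1)
n <- 30; q <- 3                     # 3 groups of 10 observations
X <- cbind(1, rnorm(n))             # fixed-effects model matrix (p = 2)
f <- gl(q, n/q)                     # a single grouping factor
Z <- model.matrix(~ 0 + f)          # indicator columns => sparse pattern
beta <- c(1, 3)                     # assumed fixed effects
b <- rnorm(q, sd = 2)               # a simulated value of B
y <- as.vector(X %*% beta + Z %*% b + rnorm(n))  # response given B = b
@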
The marginal distribution of the random effects
\begin{equation}
  \label{eq:remargin}
  \bm{\mathcal{B}}\sim\mathcal{N}\left(\bm 0,\sigma^2\bm\Sigma(\bm\theta)\right)
\end{equation}
is also multivariate normal, with mean $\bm 0$ and variance-covariance
matrix $\sigma^2\bm\Sigma(\bm\theta)$.  The scalar, $\sigma^2$, in
(\ref{eq:remargin}) is the same as the $\sigma^2$ in
(\ref{eq:yconditional}).  As described in the next section, the
relative variance-covariance matrix, $\bm\Sigma(\bm\theta)$, is a
$q\times q$ positive semidefinite matrix depending on a parameter
vector, $\bm\theta$.  Typically the dimension of $\bm\theta$ is much,
much smaller than $q$.
\subsection{Variance-covariance of the random effects}

The relative variance-covariance matrix, $\bm\Sigma(\bm\theta)$, must
be symmetric and positive semidefinite (i.e.\ $\bm x\trans\bm\Sigma\bm
x\ge0,\forall\bm x\in\mathbb{R}^q$).  Because the estimate of a
variance component can be zero, it is important to allow for a
semidefinite $\bm\Sigma$.  We do not assume that $\bm\Sigma$ is
positive definite (i.e.\ $\bm x\trans\bm\Sigma\bm x>0,\forall\bm
x\in\mathbb{R}^q, \bm x\ne\bm 0$) and, hence, we cannot assume that
$\bm\Sigma^{-1}$ exists.
A positive semidefinite matrix such as $\bm\Sigma$ has a Cholesky
decomposition of the so-called ``LDL$\trans$'' form.  We use a
slight modification of this form,
\begin{equation}
  \label{eq:TSST}
  \bm\Sigma(\bm\theta)=\bm T(\bm\theta)\bm S(\bm\theta)\bm
  S(\bm\theta)\bm T(\bm\theta)\trans ,
\end{equation}
where $\bm T(\bm\theta)$ is a unit lower-triangular $q\times q$ matrix
and $\bm S(\bm\theta)$ is a diagonal $q\times q$ matrix with
nonnegative diagonal elements that act as scale factors.  (They are
the relative standard deviations of certain linear combinations of the
random effects.)  Thus, $\bm T$ is a triangular matrix like the ``L''
of the LDL$\trans$ form, and $\bm S\bm S$ plays the role of the
diagonal ``D''.  Both $\bm T$ and $\bm S$ are highly patterned.
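To make the factorization (\ref{eq:TSST}) concrete, the following
chunk computes $\bm T$ and $\bm S$ for an arbitrary $2\times 2$
relative variance-covariance block, as could arise from correlated
random intercepts and slopes at one level of a grouping factor.  The
numerical values are invented for illustration only.
<<TSfactor,eval=FALSE>>=
Sigma <- matrix(c(4, 2,
                  2, 2), 2, 2)            # illustrative relative var-cov
s1  <- sqrt(Sigma[1, 1])                  # first scale factor
t21 <- Sigma[2, 1] / Sigma[1, 1]          # unit lower-triangular entry
s2  <- sqrt(Sigma[2, 2] - t21^2 * s1^2)   # second scale factor
T <- matrix(c(1, t21, 0, 1), 2, 2)        # unit lower-triangular T
S <- diag(c(s1, s2))                      # diagonal S, nonnegative
all.equal(T %*% S %*% S %*% t(T), Sigma)  # recovers Sigma
@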
\subsection{Orthogonal random effects}
\label{sec:orthogonal}

Let us define a $q$-dimensional random vector, $\bm{\mathcal{U}}$, of
orthogonal random effects with marginal distribution
\begin{equation}
  \bm{\mathcal{U}}\sim\mathcal{N}\left(\bm 0,\sigma^2\bm I_q\right)
\end{equation}
and, for a given value of $\bm\theta$, express $\bm{\mathcal{B}}$ as a
linear transformation of $\bm{\mathcal{U}}$,
\begin{equation}
  \label{eq:UtoB}
  \bm{\mathcal{B}}=\bm T(\bm\theta)\bm S(\bm\theta)\bm{\mathcal{U}} .
\end{equation}
Note that the transformation (\ref{eq:UtoB}) gives the desired
distribution of $\bm{\mathcal{B}}$ in that
$\mathrm{E}[\bm{\mathcal{B}}]=\bm T\bm
S\mathrm{E}[\bm{\mathcal{U}}]=\bm 0$ and
\begin{displaymath}
  \mathrm{Var}(\bm{\mathcal{B}})=\mathrm{E}[\bm{\mathcal{B}}\bm{\mathcal{B}}\trans]
  =\bm T\bm S\mathrm{E}[\bm{\mathcal{U}}\bm{\mathcal{U}}\trans]\bm
  S\bm T\trans=\sigma^2\bm T\bm S\bm S\bm T\trans=\sigma^2\bm\Sigma .
\end{displaymath}

The conditional distribution, $\bm{\mathcal{Y}}|\bm{\mathcal{U}}$, can
be derived from $\bm{\mathcal{Y}}|\bm{\mathcal{B}}$ as
\begin{equation}
  \bm{\mathcal{Y}}|\bm{\mathcal{U}}\sim\mathcal{N}\left(\bm X\bm\beta+\bm
    Z\bm T\bm S\bm u, \sigma^2\bm I_n\right) .
\end{equation}
We will write the transpose of $\bm Z\bm T\bm S$ as $\bm A$.  Because
the matrices $\bm T$ and $\bm S$ depend on the parameter $\bm\theta$,
$\bm A$ is also a function of $\bm\theta$,
\begin{equation}
  \bm A\trans(\bm\theta)=\bm Z\bm T(\bm\theta)\bm S(\bm\theta) .
\end{equation}
In applications, the matrix $\bm Z$ is derived from indicator columns
of the levels of one or more factors in the data and is a
\emph{sparse} matrix, in the sense that most of its elements are zero.
The matrix $\bm A$ is also sparse.  In fact, the structures of $\bm T$
and $\bm S$ are such that the pattern of nonzeros in $\bm A$ is the
same as that in $\bm Z\trans$.
\subsection{Sparse matrix methods}
\label{sec:sparseMatrix}

The reason for defining $\bm A$ as the transpose of a model matrix is
that $\bm A$ is stored and manipulated as a sparse matrix.  In the
compressed column-oriented storage form that we use for sparse
matrices, there are advantages to storing $\bm A$ as a matrix of $n$
columns and $q$ rows.  In particular, the CHOLMOD sparse matrix
library allows us to evaluate the sparse Cholesky factor, $\bm
L(\bm\theta)$, a sparse lower triangular matrix that satisfies
\begin{equation}
  \label{eq:SparseChol}
  \bm L(\bm\theta)\bm L(\bm\theta)\trans=
  \bm P\left(\bm A(\bm\theta)\bm A(\bm\theta)\trans+\bm I_q\right)\bm P\trans ,
\end{equation}
directly from $\bm A(\bm\theta)$.
In (\ref{eq:SparseChol}) the $q\times q$ matrix $\bm P$ is a
``fill-reducing'' permutation matrix determined from the pattern of
nonzeros in $\bm Z$.  $\bm P$ does not affect the statistical theory
(if $\bm{\mathcal{U}}\sim\mathcal{N}(\bm 0,\sigma^2\bm I)$ then $\bm
P\trans\bm{\mathcal{U}}$ also has a $\mathcal{N}(\bm 0,\sigma^2\bm I)$
distribution because $\bm P\bm P\trans=\bm P\trans\bm P=\bm I$) but,
because it affects the number of nonzeros in $\bm L$, it can have a
tremendous impact on the amount of storage required for $\bm L$ and the
time required to evaluate $\bm L$ from $\bm A$.  Indeed, it is
precisely because $\bm L(\bm\theta)$ can be evaluated quickly, even
for complex models applied to large data sets, that the \code{lmer}
function is effective in fitting such models.
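This factorization can be tried directly from \code{R}.  The chunk
below is a minimal sketch using the \package{Matrix} package (which
interfaces CHOLMOD); the matrix \code{A} is randomly generated and
purely illustrative.
<<cholmod,eval=FALSE>>=
library(Matrix)
set.seed(1)
A <- rsparsematrix(5, 50, density = 0.2)   # q = 5, n = 50, mostly zeros
## L L' = P (A A' + I) P' : Imult = 1 adds the identity and
## perm = TRUE requests a fill-reducing permutation P
L <- Cholesky(tcrossprod(A), perm = TRUE, LDL = FALSE, Imult = 1)
@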
\section{The penalized least squares approach to linear mixed models}
\label{sec:Penalized}

Given a value of $\bm\theta$ we form $\bm A(\bm\theta)$ from which we
evaluate $\bm L(\bm\theta)$.  We can then solve for the $q\times p$
matrix, $\bm R_{\bm{ZX}}$, in the system of equations
\begin{equation}
  \bm L(\bm\theta)\bm R_{\bm{ZX}}=\bm P\bm A(\bm\theta)\bm X
\end{equation}
and for the $p\times p$ upper triangular matrix, $\bm R_{\bm X}$, satisfying
\begin{equation}
  \bm R_{\bm X}\trans\bm R_{\bm X}=
  \bm X\trans\bm X-\bm R_{\bm{ZX}}\trans\bm R_{\bm{ZX}} .
\end{equation}
The conditional mode, $\tilde{\bm u}(\bm\theta)$, of the
orthogonal random effects and the conditional mle,
$\widehat{\bm\beta}(\bm\theta)$, of the fixed-effects parameters
can be determined simultaneously as the solutions to a penalized least
squares problem,
\begin{equation}
  \label{eq:PLS}
  \begin{bmatrix}
    \tilde{\bm u}(\bm\theta)\\
    \widehat{\bm\beta}(\bm\theta)
  \end{bmatrix}=
  \arg\min_{\bm u,\bm\beta}\left\|
    \begin{bmatrix}\bm y\\\bm 0\end{bmatrix} -
    \begin{bmatrix}
      \bm A\trans\bm P\trans & \bm X\\
      \bm I_q & \bm 0
    \end{bmatrix}
    \begin{bmatrix}\bm u\\\bm\beta\end{bmatrix}\right\|^2 ,
\end{equation}
for which the solution satisfies
\begin{equation}
  \label{eq:PLSsol}
  \begin{bmatrix}
    \bm P\left(\bm A\bm A\trans+\bm I\right)\bm P\trans &
    \bm P\bm A\bm X\\
    \bm X\trans\bm A\trans\bm P\trans & \bm X\trans\bm X
  \end{bmatrix}
  \begin{bmatrix}
    \tilde{\bm u}(\bm\theta)\\
    \widehat{\bm\beta}(\bm\theta)
  \end{bmatrix}=
  \begin{bmatrix}\bm P\bm A\bm y\\\bm X\trans\bm y\end{bmatrix} .
\end{equation}
The Cholesky factor of the system matrix for the PLS problem can be
expressed using $\bm L$, $\bm R_{\bm{ZX}}$ and $\bm R_{\bm X}$, because
\begin{equation}
  \begin{bmatrix}
    \bm P\left(\bm A\bm A\trans+\bm I\right)\bm P\trans & \bm P\bm A\bm X\\
    \bm X\trans\bm A\trans\bm P\trans & \bm X\trans\bm X
  \end{bmatrix}=
  \begin{bmatrix}
    \bm L & \bm 0\\
    \bm R_{\bm{ZX}}\trans & \bm R_{\bm X}\trans
  \end{bmatrix}
  \begin{bmatrix}
    \bm L\trans & \bm R_{\bm{ZX}}\\
    \bm 0 & \bm R_{\bm X}
  \end{bmatrix} .
\end{equation}
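Although \code{lmer} performs these calculations on sparse and dense
matrices in compiled code, the block triangular solves implied by this
factorization are easy to express in \code{R}.  The following sketch
uses ordinary dense matrices and takes \code{PAy} $=\bm P\bm A\bm y$
and \code{Xty} $=\bm X\trans\bm y$ as given; it is illustrative only.
<<plsSolve,eval=FALSE>>=
cu <- forwardsolve(L, PAy)                  # L cu = P A y
cb <- forwardsolve(t(RX), Xty - crossprod(RZX, cu)) # RX' cb = X'y - RZX' cu
beta <- backsolve(RX, cb)                   # RX beta-hat = cb
u <- backsolve(t(L), cu - RZX %*% beta)     # L' u-tilde = cu - RZX beta-hat
@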
In the \code{lme4} package the \code{"mer"} class is the
representation of a mixed-effects model.  Several slots in this class
are matrices corresponding directly to the matrices in the preceding
equations.  The \code{A} slot contains the sparse matrix $\bm
A(\bm\theta)$ and the \code{L} slot contains the sparse Cholesky
factor, $\bm L(\bm\theta)$.  The \code{RZX} and \code{RX} slots contain
$\bm R_{\bm{ZX}}(\bm\theta)$ and $\bm R_{\bm X}(\bm\theta)$,
respectively, stored as dense matrices.
It is not necessary to solve for $\tilde{\bm u}(\bm\theta)$ and
$\widehat{\bm\beta}(\bm\theta)$ to evaluate the \emph{profiled}
log-likelihood, which is the log-likelihood evaluated at $\bm\theta$ and
the conditional estimates of the other parameters,
$\widehat{\bm\beta}(\bm\theta)$ and $\widehat{\sigma^2}(\bm\theta)$.
All that is needed for evaluation of the profiled log-likelihood is
the (penalized) residual sum of squares, $r^2$, from the penalized
least squares problem (\ref{eq:PLS}) and the determinant $|\bm A\bm
A\trans+\bm I|=|\bm L|^2$.  Because $\bm L$ is triangular, its
determinant is easily evaluated as the product
of its diagonal elements.  Furthermore, $|\bm L|^2 > 0$ because it is
equal to $|\bm A\bm A\trans + \bm I|$, which is the determinant of a
positive definite matrix.  Thus $\log(|\bm L|^2)$ is both well-defined
and easily calculated from $\bm L$.
The profiled deviance (negative twice the profiled log-likelihood), as
a function of $\bm\theta$ only ($\bm\beta$ and $\sigma^2$ at their
conditional estimates), is
\begin{equation}
  \label{eq:profiledDev}
  d(\bm\theta|\bm y)=\log(|\bm L|^2)+
  n\left(1+\log\left(\frac{2\pi r^2}{n}\right)\right) .
\end{equation}
The maximum likelihood estimates, $\widehat{\bm\theta}$, satisfy
\begin{equation}
  \widehat{\bm\theta}=\arg\min_{\bm\theta}d(\bm\theta|\bm y) .
\end{equation}
Once the value of $\widehat{\bm\theta}$ has been determined, the mle
of $\bm\beta$ is evaluated from (\ref{eq:PLSsol}) and the mle of
$\sigma^2$ as $\widehat{\sigma^2}(\widehat{\bm\theta})=r^2/n$.
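Treating $\bm L$ as an ordinary dense lower-triangular matrix for
simplicity, the profiled deviance (\ref{eq:profiledDev}) could be
evaluated in \code{R} as in this sketch.
<<profDev,eval=FALSE>>=
profDev <- function(L, r2, n) {
    ldL2 <- 2 * sum(log(diag(L)))   # log(|L|^2), product of diagonals
    ldL2 + n * (1 + log(2 * pi * r2 / n))
}
@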
Note that nothing has been said about the form of the sparse model
matrix, $\bm Z$, other than the fact that it is sparse.  In contrast
to other methods for linear mixed models, these results apply to
models where $\bm Z$ is derived from crossed or partially crossed
grouping factors, in addition to models with multiple, nested grouping
factors.
The system (\ref{eq:PLSsol}) is similar to Henderson's ``mixed-model
319
equations'' (reference?). One important difference between
320
(\ref{eq:PLSsol}) and Henderson's formulation is that Henderson
321
represented his system of equations in terms of $\bm\Sigma^{-1}$ and,
322
in important practical examples, $\bm\Sigma^{-1}$ does not exist at
323
the parameter estimates. Also, Henderson assumed that equations like
324
(\ref{eq:PLSsol}) would need to be solved explicitly and, as we have
325
seen, only the decomposition of the system matrix is needed for
326
evaluation of the profiled log-likelihood. The same is true of the
327
profiled the logarithm of the REML criterion, which we define later.
\section{The generalized least squares approach to linear mixed models}

Another common approach to linear mixed models is to derive the
marginal variance-covariance matrix of $\bm{\mathcal{Y}}$ as a
function of $\bm\theta$ and use that to determine the conditional
estimates, $\widehat{\bm\beta}(\bm\theta)$, as the solution of a
generalized least squares (GLS) problem.  In the notation of
\S\ref{sec:Definition} the marginal mean of $\bm{\mathcal{Y}}$ is
$\mathrm{E}[\bm{\mathcal{Y}}]=\bm X\bm\beta$ and the marginal
variance-covariance matrix is
\begin{equation}
  \label{eq:marginalvarcovY}
  \mathrm{Var}(\bm{\mathcal{Y}})=\sigma^2\left(\bm I_n+\bm Z\bm T\bm
    S\bm S\bm T\trans\bm Z\trans\right)=\sigma^2\left(\bm I_n+\bm
    A\trans\bm A\right) =\sigma^2\bm V(\bm\theta) ,
\end{equation}
where $\bm V(\bm\theta)=\bm I_n+\bm A\trans\bm A$.
The conditional estimates of $\bm\beta$ are often written as
\begin{equation}
  \widehat{\bm\beta}(\bm\theta)=\left(\bm X\trans\bm V^{-1}\bm
    X\right)^{-1}\bm X\trans\bm V^{-1}\bm y
\end{equation}
but, of course, this formula is not suitable for computation.  The
matrix $\bm V(\bm\theta)$ is a symmetric $n\times n$ positive definite
matrix and hence has a Cholesky factor.  However, this factor is
$n\times n$, not $q\times q$, and $n$ is always larger than $q$ ---
sometimes orders of magnitude larger.  Blithely writing a formula in
terms of $\bm V^{-1}$ when $\bm V$ is $n\times n$, and $n$ can be in
the millions, does not a computational formula make.
\subsection{Relating the GLS approach to the Cholesky factor}

We can use the fact that
\begin{equation}
  \label{eq:Vinv}
  \bm V^{-1}(\bm\theta)=\left(\bm I_n+\bm A\trans\bm A\right)^{-1}=
  \bm I_n-\bm A\trans\left(\bm I_q+\bm A\bm A\trans\right)^{-1}\bm A
\end{equation}
to relate the GLS problem to the PLS problem.  One way to establish
(\ref{eq:Vinv}) is simply to show that the product
\begin{displaymath}
  \begin{aligned}
    \left(\bm I+\bm A\trans\bm A\right)
    &\left(\bm I-\bm A\trans\left(\bm I+\bm A\bm
        A\trans\right)^{-1}\bm A\right)\\
    &=\bm I+\bm A\trans\bm A-\bm A\trans\left(\bm I+\bm A\bm A\trans\right)
    \left(\bm I+\bm A\bm A\trans\right)^{-1}\bm A\\
    &=\bm I+\bm A\trans\bm A-\bm A\trans\bm A\\
    &=\bm I .
  \end{aligned}
\end{displaymath}
Incorporating the permutation matrix $\bm P$ we have
\begin{equation}
  \label{eq:PLA}
  \begin{aligned}
    \bm V^{-1}(\bm\theta)&=\bm I_n-\bm A\trans\bm P\trans\bm P\left(\bm
      I_q+\bm A\bm A\trans\right)^{-1}\bm P\trans\bm P\bm A\\
    &=\bm I_n-\bm A\trans\bm P\trans(\bm L\bm L\trans)^{-1}\bm P\bm A\\
    &=\bm I_n-\left(\bm L^{-1}\bm P\bm A\right)\trans\bm L^{-1}\bm P\bm A .
  \end{aligned}
\end{equation}
Even in this form we would not want to routinely evaluate $\bm
V^{-1}$.  However, (\ref{eq:PLA}) does allow us to simplify many of
the expressions that arise in the GLS formulation.
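The identity (\ref{eq:Vinv}) is easily checked numerically on a small,
randomly generated example (illustrative only).
<<VinvCheck,eval=FALSE>>=
set.seed(1)
A <- matrix(rnorm(3 * 10), 3, 10)           # q = 3, n = 10
V <- diag(10) + crossprod(A)                # I_n + A'A
Vinv <- diag(10) -
    crossprod(A, solve(diag(3) + tcrossprod(A), A))
all.equal(solve(V), Vinv)                   # TRUE
@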
For example, the variance-covariance of the estimator $\widehat{\bm
  \beta}$, conditional on $\bm\theta$ and $\sigma$, can be expressed as
\begin{equation}
  \label{eq:varcovbeta}
  \begin{aligned}
    \sigma^2\left(\bm X\trans\bm V^{-1}(\bm\theta)\bm X\right)^{-1}
    &=\sigma^2\left(\bm X\trans\bm X-\left(\bm L^{-1}\bm P\bm
        A\bm X\right)\trans\left(\bm L^{-1}\bm P\bm A\bm
        X\right)\right)^{-1}\\
    &=\sigma^2\left(\bm X\trans\bm X-\bm R_{\bm{ZX}}\trans\bm
      R_{\bm{ZX}}\right)^{-1}\\
    &=\sigma^2\left(\bm R_{\bm X}\trans\bm R_{\bm X}\right)^{-1} .
  \end{aligned}
\end{equation}
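Thus, once $\bm R_{\bm X}$ is available, the variance-covariance of
$\widehat{\bm\beta}$ requires only a $p\times p$ computation, as in
this one-line sketch, where \code{sigma2} denotes an estimate of
$\sigma^2$.
<<vcovBeta,eval=FALSE>>=
vcov_beta <- sigma2 * chol2inv(RX)   # sigma^2 (RX' RX)^{-1}
@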
\section{Trace of the ``hat'' matrix}

Another calculation that is of interest to some is the trace of the
``hat'' matrix, which can be written as
\begin{displaymath}
  \begin{aligned}
    &\tr\left(\begin{bmatrix}\bm A\trans&\bm X\end{bmatrix}
      \left(\begin{bmatrix}\bm A\trans&\bm X\\\bm I&\bm0\end{bmatrix}\trans
        \begin{bmatrix}\bm A\trans&\bm X\\\bm I&\bm0\end{bmatrix}\right)^{-1}
      \begin{bmatrix}\bm A\\\bm X\trans\end{bmatrix}\right)\\
    &= \tr\left(\begin{bmatrix}\bm A\trans&\bm X\end{bmatrix}
      \left(\begin{bmatrix}\bm L&\bm0\\
          \bm R_{\bm{ZX}}\trans&\bm R_{\bm X}\trans\end{bmatrix}
        \begin{bmatrix}\bm L\trans&\bm R_{\bm{ZX}}\\
          \bm0&\bm R_{\bm X}\end{bmatrix}\right)^{-1}
      \begin{bmatrix}\bm A\\\bm X\trans\end{bmatrix}\right) .
  \end{aligned}
\end{displaymath}
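Taking the permutation $\bm P$ to be the identity for simplicity,
this trace equals a squared Frobenius norm, because
$\tr\left(\bm B\trans(\bm F\bm F\trans)^{-1}\bm B\right)=
\|\bm F^{-1}\bm B\|^2_F$ for the blocked lower triangular factor
$\bm F$ and $\bm B=\begin{bmatrix}\bm A\\\bm X\trans\end{bmatrix}$.
A dense, illustrative sketch, assuming \code{L}, \code{RZX},
\code{RX}, \code{A} and \code{X} are available as ordinary matrices:
<<hatTrace,eval=FALSE>>=
Fm <- rbind(cbind(L, matrix(0, nrow(L), ncol(RX))),
            cbind(t(RZX), t(RX)))      # blocked lower-triangular factor
B <- rbind(A, t(X))                    # (q + p) x n right-hand side
sum(forwardsolve(Fm, B)^2)             # trace of the hat matrix
@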