## Demonstrate how the usual estimate of sigma^2 operates when the ## design matrix varies from being under-specified to over-specified. ## ## 40 potential covariates are given. Only covariates 1, 3, 5, ..., 19 ## are informative about EY. ## ## The expected value of sigma_hat^2 is calculated analytically, and ## simulation is used to get a picture of its distribution. ## The sample size. n = 100 ## The number of columns of the design matrix. p = 40 ## The observed covariates. X = runif(n*(p+1)) X = array(X, c(n,p+1)) X[,1] = 1 ## The expected value of Y (beta = 1,0,1,0,..., up to 20). EY = X[,seq(1,20,2)] %*% array(1,10) ## Get the squared L2 norm of the projection of EY onto the complement ## of the first k columns of the design matrix. NV = NULL for (k in 1:(p+1)) { qrd = qr(X[,1:k]) Q = qr.Q(qrd) U = Q %*% (t(Q) %*% EY) NV[k] = sum(EY^2) - sum(U^2) } ## The expected value of sigma_hat^2. VB = 1 + NV/(n - seq(length(NV))) ## Simulate sigma_hat^2 values for a sequence of design matrices. nrep = 100 S = array(0, c(n*(p+1),2)) ii = 1 for (k in 1:(p+1)) { for (r in 1:100) { ## Generate the responses. Y = EY + rnorm(n) ## Estimate the variance. m = lm(Y ~ X[,1:k]+0) S[ii,] = c(k, summary(m)$sigma^2) ii = ii+1 } } boxplot(S[,2] ~ S[,1], ylim=c(0,3), xlab='Number of regressors', ylab=expression(hat(sigma)^2), col='bisque') abline(1, 0, col='grey', lwd=2, lty=3) lines(seq(length(VB)), VB, t='l', col='red')