## ## Some calculations with multiple least squares. ## ## Sample size. n = 100 ## Number of covariates. p = 4 ## Simulate some data. X = rnorm(n*(p+1)) ## iid covariates X = array(X, c(n,p+1)) X[,1] = 1 coeff = c(1, (-1)**(1:p %% 2)) E = 0.5*rnorm(n) Y = X %*% coeff + E ## Make a simple scatterplot. L = c("Y") for (k in 1:p) { L = c(L, sprintf("X%d", k)) } pairs(cbind(Y, X[,2:(p+1)]), labels=L) ## Calculate the least square coefficients using linear algebra. qrd = qr(X) Q = qr.Q(qrd) R = qr.R(qrd) coeff_est_1 = solve(R, t(Q) %*% Y) ## Calculate the least squares coefficients using the built-in lm ## function. m = lm(Y ~ 0+X) coeff_est_2 = m$coeff ## Check that the fitted regression function passes through the center ## of the data. ybar = mean(Y) xbar = apply(X, 2, mean) dc = ybar - sum(coeff_est_1 * xbar) ## ## Some analysis of the residuals and fitted values. ## ## Three ways to get the fitted values. F1 = X %*% coeff_est_1 F2 = m$fitted.values F3 = Q %*% (t(Q) %*% Y) ## Two ways to get the residuals. R1 = Y - F1 R2 = m$residuals ## Check some properties of the residuals. print(sprintf("The residuals sum to: %f", sum(R1))) print(sprintf(" = %f", sum(F1*R2))) ## ## What happens if there is no intercept? ## X = rnorm(n*p) ## iid covariates X = array(X, c(n,p)) coeff = (-1)**(1:p %% 2) E = 0.5*rnorm(n) Y = X %*% coeff + E qrd = qr(X) Q = qr.Q(qrd) F = Q %*% (t(Q) %*% Y) R = Y - F print("No intercept:") print(sprintf("The residuals sum to: %f", sum(R))) print(sprintf(" = %f", sum(F*R))) ## ## Look into the effects of correlations among the predictors. ## print("Systematically vary the correlation between predictors:") for (r in seq(-0.9, 0.9, 0.1)) { ## Generate two covariate vectors with population correlation r ## between them. X1 = rnorm(n) U = rnorm(n) X2 = r*X1 + sqrt(1-r^2)*U ## Simulate data. X = cbind(array(1,n), X1, X2) coeff = c(0, 1, 2) E = 0.5*rnorm(n) Y = X %*% coeff + E ## Do the multiple least squares fit. qrd = qr(X) Q = qr.Q(qrd) R = qr.R(qrd) coeff_est = solve(R, t(Q) %*% Y) ## Do the simple least squares fits for each covariate. for (k in c(2,3)) { m = lm(Y ~ X[,k]) coeff_est = c(coeff_est, m$coeff[2]) } print(sprintf(rep("%5.2f", 5), c(r, coeff_est[2:5]))) }