## Apply the SIMEX procedure to get approximately unbiased slopes for ## the measurement error model. ## ## Arguments: ## Y the n-dimensional response vector ## X the nxp dimensional design matrix (no intercept) ## s the vector of known standard deviations of the measurement errors. ## ## Returns: ## beta_corrected The adjusted slopes ## B A matrix whose columns are fitted slopes for ## linearly increasing amounts of added measurement error. ## F The grid of values for the rows of B. simex = function(Y, X, s) { n = dim(X)[1] p = dim(X)[2] ## Add these factors of additional measurement error. F = seq(0, 2, by=0.2) B = array(0, c(11,p+1)) ## Take 11 multiples of the error. for (k in 1:11) { ## Average over 10 replicates to stabilize. for (j in 1:10) { ## Generate the additional measurement error. E = array(rnorm(n*p), c(n,p)) E = F[k]*E*outer(array(1,n), s) ## The artificially noisy design matrix. XM = X + E ## Fit the model. mm = lm(Y ~ XM) B[k,] = B[k,] + mm$coeff } } B = B/10 R = list() R$B = B R$F = F ## Use linear regression to extrapolate to no measurement error. beta_corrected = array(0, p+1) for (k in 1:(p+1)) { bhat = cov(B[,k], F) / var(F) ahat = mean(B[,k]) - bhat*mean(F) beta_corrected[k] = ahat - bhat } R$beta_corrected = beta_corrected return(R) } ## Sample size. n = 1000 ## Number of covariates. p = 2 ## Generate the population coefficients (the population intercept is zero). beta = rnorm(p) ## The measurement error standard deviations per covariate. MSD = c(0.5,0.5) ## The exact covariates. XT = array(rnorm(n*p), c(n,p)) ## The response. Y = XT %*% beta + rnorm(n) ## The observed covariates. E = outer(array(1,n),MSD)*array(rnorm(n*p), c(n,p)) X = XT + E ## Run SIMEX. R = simex(Y, X, c(0.5,0.5)) ## For comparison, this is what we would get if we observed the ## covariates exactly. m_true = lm(Y ~ XT) ymin = min(R$B, m_true$coeff, beta, R$beta_corrected) ymax = max(R$B, m_true$coeff, beta, R$beta_corrected) ymax = ymax + 0.2*(ymax-ymin) ## Plot the SIMEX trajectories in black. plot(NULL, NULL, xlab='SD of extra measurement error', ylab='Coefficient', xlim=c(-1.5,max(R$F)), ylim=c(ymin,ymax)) for (k in 1:dim(R$B)[2]) { points(R$F, R$B[,k], t='p') } ## Plot the OLS estimates using the observed covariate data in green. points(rep(0,dim(R$B)[2]), R$B[1,], col='green', t='p') ## Plot the SIMEX estimates in cyan. points(rep(-1, length(R$beta_corrected)), R$beta_corrected, col='cyan') ## Plot the values we would get if we had access to the exact ## covariate data in red. points(rep(-1.2, length(m_true$coeff)), m_true$coeff, col='red') ## Plot the population coefficients in blue. points(rep(-1.4, 1+length(beta)), c(0,beta), col='blue') legend('topright', c("Population", "OLS on observed X", "OLS on exact X", "SIMEX"), col=c('blue', 'green', 'red', 'cyan'), pch=c(1,1,1,1))