############################################################################################## #Created by: Stefan Metzger (stefan.met@gmail.com) #Last change by: Stefan Metzger (2013-12-29) ## #http://www.mail-archive.com/r-help@r-project.org/msg85070.html: #This is also called "Deming regression" and perhaps many other things. #It is a lively topic in the validation of competing assay methods in the #laboratory. I have a function 'deming.R' that does a generalized form #of this, based on the (very nice) article below, the code is attached. #(The attachments will be stripped by R-help, but the original requestor #will get them.) If someone thinks this to be of general enough interest #to package up on CRAN I'm happy donate the code to them -- I won't have #the time for some while. # Terry T. #BD Ripley and M Thompson, Regression techniques for the detection #of analytical bias, Analyst 112:377-383, 1987. # Generalized Deming regression, based on Ripley, Analyst, 1987:377-383. # eiv <- function( x, y, xstd, ystd, jackknife=T, dfbeta=F, scale=T, intercept=F ){ Call <- match.call() n <- length(x) p2 <- ifelse(intercept == F, 1, 2) #degrees of freedum in model if (length(y) !=n) stop("x and y must be the same length") if (length(xstd) != length(ystd)) stop("xstd and ystd must be the same length") # Do missing value processing nafun <- get(options()$na.action) if (length(xstd)==n) { tdata <- nafun(data.frame(x=x, y=y, xstd=xstd, ystd=ystd)) x <- tdata$x y <- tdata$y xstd <- tdata$xstd ystd <- tdata$ystd } else { tdata <- nafun(data.frame(x=x, y=y)) x <- tdata$x y <- tdata$y if (length(xstd) !=2) stop("Wrong length for std specification") xstd <- xstd[1] + xstd[2]*x ystd <- ystd[1] + ystd[2] * y } if (any(xstd <=0) || any(ystd <=0)) stop("Std must be positive") #optimality criteria for fit through zero minfun0 <- function(beta, x, y, xv, yv) { w <- 1/(yv + beta^2*xv) alphahat <- 0 #constrain to zero sum(w*(y-(alphahat + beta*x))^2) } #optimality criteria for free regression minfun <- function(beta, x, y, xv, yv) { w <- 1/(yv + beta^2*xv) alphahat <- sum(w * (y - beta*x))/ sum(w) sum(w*(y-(alphahat + beta*x))^2) } #function for slope calculation afun <-function(beta, x, y, xv, yv) { w <- 1/(yv + beta^2*xv) sum(w * (y - beta*x))/ sum(w) } #actual fit fit <- optimize(minfun, c(.1, 10), x=x, y=y, xv=xstd^2, yv=ystd^2) coef <- c(intercept=afun(fit$minimum, x, y, xstd^2, ystd^2), slope=fit$minimum) #switch between free and constrained regression if(intercept == F) { fit0 <- optimize(minfun0, coef[2]*c(.5, 1.5), x=x, y=y, xv=xstd^2, yv=ystd^2) coef <- c(intercept=0, slope=fit0$minimum) } #calculate scale #weights (in units Y^2) w <- 1 / (ystd^2 + (coef[2] * xstd)^2) #imputed "true" value of x u <- w * (ystd^2 * x + xstd^2 * coef[2] * (y-coef[1])) #fitted values (imputed "true" value of y) fit <- coef[1] + coef[2] * u #non-scaled residuals # print(sqrt(mean((x-u)^2 + (y-fit)^2))) if (is.logical(scale) & scale) { #scaled residuals in x, y and combined (only makes sense if X and Y carry the same units!) err1 <- (x - u) / xstd err2 <- (y - fit) / ystd #sigma <- sum(err1^2 + err2^2) / (n-p2) #weighted sum of squares / R-square #http://tolstoy.newcastle.edu.au/R/help/06/08/33441.html #in lm: values in weights being inversely proportional to the variances #in weighted regression, u and fit beeing the best guess of the 'true' X and Y value #in residuals SSEx <- sum(err1^2) SSEy <- sum(err2^2) SSE <- SSEx + SSEy ##------------------------------ ##INSERTION #percentage error #unweighted whr <- 1:length(x) # which(abs(x) > .20 & abs(y) > .20) err1n <- ((x - u) / u)[whr] err2n <- ((y - fit) / fit)[whr] SSExn <- sum(err1n^2) SSEyn <- sum(err2n^2) SSEn <- SSExn + SSEyn # SSEn <- sum(rowMeans(cbind(err1n^2, err2n^2))) RSEn <- sqrt(SSEn / (2*n-p2)) * 100 # RSEn <- sqrt(median(c(err1n^2, err2n^2))) * 100 mema <- MEDmad(test=c(x[whr], y[whr]), refe=c(u[whr], fit[whr]), perc=T) MADn <- sqrt(mema[1]^2 + mema[2]^2) MAD_abs <- MADn*MEDmad(u[whr]) rm(err1n, err2n, mema, SSExn, SSEyn, SSEn) #weighted err1n <- (x - u) / u / xstd err2n <- (y - fit) / fit / ystd SSExn <- sum(err1n^2) SSEyn <- sum(err2n^2) SSEn <- SSExn + SSEyn RSEnw <- sqrt(SSEn / (sum(w) * (n-p2)/n)) * 100 rm(err1n, err2n, SSExn, SSEyn, SSEn) ##------------------------------ #in variables if(intercept == T) { m <- sum(w * u / sum(w)) #weighted average (must use same weights as in regression fct.) SSFx <- sum(((u - m) / xstd)^2) m <- sum(w * fit / sum(w)) #weighted average SSFy <- sum(((fit - m) / ystd)^2) } else { SSFx <- sum((u / xstd)^2) SSFy <- sum((fit / ystd)^2) } SSF <- SSFx + SSFy #R-square RSQ <- SSF / (SSF + SSE) #F-statistic #degrees of freedom between models DFnum <- 1 #degrees of freedom of sample size DFden <- n - p2 #numerator num <- SSF / DFnum #denominator den <- SSE / DFden #value of F-statistic fsta <- num / den #false-rejection probability pval <- pf(fsta, DFnum, DFden, lower.tail=FALSE) #prepare output FSTA <- c(fsta, DFnum, DFden, pval) names(FSTA) <- c("value", "numdf", "dendf", "p-value") #scaled residuals after Ripley (the same): r <- (y - (coef[1] + coef[2]*x)) * sqrt(w) SSE <- sum(r^2) sigma <- SSE / (n-p2) #residual standard error RSE <- sqrt(SSE / (sum(w) * (n-p2)/n)) } else sigma <- scale^2 ##Z-test for significance: 5 % significance rejects values outside [-1.96, 1.96] ##what does that do, how does it work, how to interpret? #weighted mean of x x_tilde <- sum(w*x) / sum(w) #test for a=0 test2 <- coef[1] * sqrt(sum(w*x^2) / sum(w*(x-u)^2) / sigma) #test2 <- coef[1] / sqrt(sum(w*x^2) / sum(w) * sum(w * (x - x_tilde))) #following Ripley #test for beta=1 test1 <- (coef[2] - 1) * sqrt(sum(w * (x-u)^2) / sigma) #test1 <- (coef[2] - 1) * sqrt(sum(w * (x - x_tilde))) #following Ripley rlist <- list(coefficients=coef, test1=test1, test0=test2, scale=sigma, err1=err1, err2=err2, u=u, r=r, MADn=MADn, SSE=SSE, RSE=RSE, RSEn=RSEn, RSEnw=RSEnw, RSQ=RSQ, FSTA=FSTA, weights=w, fitted.values=fit) if (jackknife) { delta <- matrix(0., nrow=n, ncol=2) if(intercept == F) { for (i in 1:n) { fit <- optimize(minfun0, c(.5, 1.5)*coef[2], x=x[-i], y=y[-i], xv=xstd[-i]^2, yv=ystd[-i]^2) delta[i,] <- coef - c(0, fit$minimum) } } else { for (i in 1:n) { fit <- optimize(minfun, c(.5, 1.5)*coef[2], x=x[-i], y=y[-i], xv=xstd[-i]^2, yv=ystd[-i]^2) ahat <- afun(fit$minimum, x[-i], y[-i], xstd[-i]^2, ystd[-i]^2) delta[i,] <- coef - c(ahat, fit$minimum) } } rlist$variance <- t(delta) %*% delta if (dfbeta) rlist$dfbeta <- delta } rlist$call <- Call class(rlist) <- 'eiv' rlist } print.eiv <- function(x=rlist, ...) { cat("\nCall:\n", deparse(x$call), "\n\n", sep = "") if (is.null(x$variance)) { table <- matrix(0., nrow=2, ncol=3) table[,1] <- x$coefficients table[,2] <- c(x$test0, x$test1) table[,3] <- pnorm(-2*abs(table[,2])) dimnames(table) <- list(c("Intercept", "Slope"), c("Coef", "z", "p")) } else { table <- matrix(0., nrow=2, ncol=4) table[,1] <- x$coefficients table[,2] <- sqrt(diag(x$variance)) table[,3] <- c(x$test0, x$test1) table[,4] <- pnorm(-2*abs(table[,3])) dimnames(table) <- list(c("Intercept", "Slope"), c("Coef", "se(coef)", "z", "p")) } print(table, ...) cat("\n Normalized median residual standard error [%]: ", format(x$MADn, ...), "\n") cat("\n Residual standard error [unit of input]: ", format(x$RSE, ...), "\n") cat("\n Weighted adjusted R-squared: ", format(x$RSQ, ...), "\n") cat("\n p-value: ", format(x$FSTA[4], ...), "\n") invisible(x) }