Merge branch 'master' of ssh://gsa1/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Matt Hanna 2011-06-30 18:15:53 -04:00
commit 91f8ab115d
33 changed files with 1121 additions and 860 deletions

View File

@ -0,0 +1,190 @@
#!/bin/env Rscript
# Plots Ti/Tv ratio, dbSNP/truth-positive rates, and bin counts as a function
# of one variant annotation, from a binned summary table produced upstream.
# Usage: script <table> <annotationName> <minBinCutoff> <medianNumVariants: "true"|other>
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]                    # path to the binned annotation table
annotationName = args[2]           # annotation label used on the x-axes
minBinCutoff = as.numeric(args[3]) # drop bins with <= this many variants
medianNumVariants = args[4]        # "true": derive percentiles from cumulative variant counts
# NOTE(review): `c` masks the builtin c() here; R's function-call lookup skips
# non-function bindings so calls like c(ymin,ymax) below still work, but
# renaming this data frame would be safer.
c <- read.table(input, header=T)
# Per-category subsets restricted to sufficiently populated bins.
all = c[c$numVariants>minBinCutoff & c$category=="all",]
novel = c[c$numVariants>minBinCutoff & c$category=="novel",]
dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",]
truth = c[c$numVariants>minBinCutoff & c$category=="truth",]
#
# Calculate min, max, medians
#
d = c[c$numVariants>minBinCutoff,]
# Axis limits shared by every category so the plots are comparable.
ymin = min(d$titv)
ymax = max(d$titv)
xmin = min(d$value)
xmax = max(d$value)
# Default "percentile" markers: weighted mean overall (m) and weighted means
# of the bins above/below it (m75/m25), weights = per-bin variant fraction.
m = weighted.mean(all$value,all$numVariants/sum(all$numVariants))
ma = all[all$value > m,]
mb = all[all$value < m,]
m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants))
m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants))
# Alternatively take true 10/25/50/75/90 percentiles of the cumulative
# variant-count distribution (assumes bins are ordered by value -- TODO confirm).
if(medianNumVariants == "true") {
vc = cumsum( all$numVariants/sum(all$numVariants) )
m10 = all$value[ max(which(vc<=0.10)) ]
m25 = all$value[ max(which(vc<=0.25)) ]
m = all$value[ max(which(vc<=0.5)) ]
m75 = all$value[ min(which(vc>=0.75)) ]
m90 = all$value[ min(which(vc>=0.90)) ]
}
#
# Plot TiTv ratio as a function of the annotation
#
outfile = paste(input, ".TiTv.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
# All-variants Ti/Tv per bin; custom x-axis below avoids scientific notation.
plot(all$value,all$titv,xlab=annotationName,ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
# Vertical percentile markers computed in the setup section.
abline(v=m,lty=2,col="red")
abline(v=m75,lty=3)
abline(v=m25,lty=3)
text(m, ymin, "50", col="red", cex=0.6);
text(m75, ymin, "75", col="black", cex=0.6);
text(m25, ymin, "25", col="black", cex=0.6);
# 10th/90th markers only exist in the cumulative-percentile mode.
if(medianNumVariants == "true") {
abline(v=m90,lty=3)
abline(v=m10,lty=3)
text(m10, ymin, "10", col="black", cex=0.6);
text(m90, ymin, "90", col="black", cex=0.6);
}
# Overlay the novel and dbSNP subsets on the same axes.
points(novel$value,novel$titv,col="green",pch=20)
points(dbsnp$value,dbsnp$titv,col="blue",pch=20)
# Truth points/legend only when any bin has truePositive data (not all zero).
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(truth$value,truth$titv,col="magenta",pch=20)
legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20))
} else {
legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
}
dev.off()
#
# Plot TiTv ratio as a function of the annotation, log scale on the x-axis
#
# Identical to the previous section except for log="x"; assumes all values
# are positive or plot() will drop/complain about nonpositive x -- TODO confirm.
outfile = paste(input, ".TiTv_log.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
plot(all$value,all$titv,xlab=annotationName,log="x",ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
abline(v=m,lty=2,col="red")
abline(v=m75,lty=3)
abline(v=m25,lty=3)
text(m, ymin, "50", col="red", cex=0.6);
text(m75, ymin, "75", col="black", cex=0.6);
text(m25, ymin, "25", col="black", cex=0.6);
if(medianNumVariants == "true") {
abline(v=m90,lty=3)
abline(v=m10,lty=3)
text(m10, ymin, "10", col="black", cex=0.6);
text(m90, ymin, "90", col="black", cex=0.6);
}
points(novel$value,novel$titv,col="green",pch=20)
points(dbsnp$value,dbsnp$titv,col="blue",pch=20)
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(truth$value,truth$titv,col="magenta",pch=20)
legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20))
} else {
legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
}
dev.off()
#
# Plot dbsnp and true positive rate as a function of the annotation
#
# y-limits are recomputed from the dbSNP rate (and widened to cover the
# true-positive rate when truth data exists).
ymin = min(all$dbsnp)
ymax = max(all$dbsnp)
outfile = paste(input, ".truthRate.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
yLabel = "DBsnp Rate"
if( sum(all$truePositive==0) != length(all$truePositive) ) {
# NOTE(review): `t` masks the builtin t(); it is only defined when truth
# data exists and is only used under the same guard below.
t = all[all$truePositive>0,]
yLabel = "DBsnp/True Positive Rate"
ymin = min(min(all$dbsnp),min(t$truePositive))
ymax = max(max(all$dbsnp),max(t$truePositive))
}
plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
# Same percentile markers as the Ti/Tv plots.
abline(v=m,lty=2,col="red")
abline(v=m75,lty=3)
abline(v=m25,lty=3)
text(m, ymin, "50", col="red", cex=0.6);
text(m75, ymin, "75", col="black", cex=0.6);
text(m25, ymin, "25", col="black", cex=0.6);
if(medianNumVariants == "true") {
abline(v=m90,lty=3)
abline(v=m10,lty=3)
text(m10, ymin, "10", col="black", cex=0.6);
text(m90, ymin, "90", col="black", cex=0.6);
}
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(t$value,t$truePositive,col="magenta",pch=20);
legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
}
dev.off()
#
# Plot dbsnp and true positive rate as a function of the annotation, log scale on the x-axis
#
# Reuses ymin/ymax (and t, when truth data exists) from the previous section.
outfile = paste(input, ".truthRate_log.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
yLabel = "DBsnp Rate"
if( sum(all$truePositive==0) != length(all$truePositive) ) {
yLabel = "DBsnp/Truth Rate"
}
plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab=yLabel,ylim=c(ymin,ymax),pch=20,xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
abline(v=m,lty=2,col="red")
abline(v=m75,lty=3)
abline(v=m25,lty=3)
text(m, ymin, "50", col="red", cex=0.6);
text(m75, ymin, "75", col="black", cex=0.6);
text(m25, ymin, "25", col="black", cex=0.6);
if(medianNumVariants == "true") {
abline(v=m90,lty=3)
abline(v=m10,lty=3)
text(m10, ymin, "10", col="black", cex=0.6);
text(m90, ymin, "90", col="black", cex=0.6);
}
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(t$value,t$truePositive,col="magenta",pch=20);
legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
}
dev.off()
#
# Plot histogram of the annotation's value
#
# type="h" draws vertical bars at each bin value (pre-binned histogram).
outfile = paste(input, ".Histogram.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
plot(all$value,all$numVariants,xlab=annotationName,ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
dev.off()
#
# Plot histogram of the annotation's value, log scale on x-axis
#
outfile = paste(input, ".Histogram_log.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
plot(all$value,all$numVariants,xlab=annotationName,log="x",ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
dev.off()

View File

@ -0,0 +1,87 @@
#!/bin/env Rscript
# Plots recalibration tranches: stacked TP/FP bars of novel variants per
# tranche plus a sensitivity-vs-specificity curve, into <tranchesFile>.pdf.
# Usage: script <tranchesFile> <targetTITV> <targetSensitivity> [anything-to-suppress-legend]
args <- commandArgs(TRUE)
verbose = TRUE
tranchesFile = args[1]                  # comma-separated tranches table
targetTITV = as.numeric(args[2])        # expected novel Ti/Tv of an all-TP call set
targetSensitivity = as.numeric(args[3]) # truth-sensitivity target (vertical marker)
suppressLegend = ! is.na(args[4])       # presence of a 4th argument hides the legend
# -----------------------------------------------------------------------------------------------
# Useful general routines
# -----------------------------------------------------------------------------------------------
MIN_FP_RATE = 0.001 # 1 / 1000 is min error rate
# Estimate the false-positive rate of a call set from its observed Ti/Tv,
# given the Ti/Tv expected for true variants.  0.5 is the Ti/Tv of purely
# random (all-FP) calls; the estimate is clamped to [MIN_FP_RATE, 1].
titvFPEst <- function(titvExpected, titvObserved) {
  rawEstimate = 1 - (titvObserved - 0.5) / (titvExpected - 0.5)
  min(max(rawEstimate, MIN_FP_RATE), 1)
}
# Vectorized wrapper for titvFPEst over a vector of observed Ti/Tv values.
titvFPEstV <- function(titvExpected, titvs) {
  vapply(titvs, function(titvObserved) titvFPEst(titvExpected, titvObserved), numeric(1))
}
# Split a variant count into expected true and false positives for a given
# false-discovery rate (FDR expressed as a percentage).
nTPFP <- function(nVariants, FDR) {
  fdrFraction = FDR / 100
  list(TP = nVariants * (1 - fdrFraction), FP = nVariants * fdrFraction)
}
# Shift a vector one position to the left, padding the tail with leftValue.
# leftShift(c(1,2,3)) == c(2,3,0).
# Uses seq_len() rather than 1:(length(x)-1): for a length-1 input the old
# form iterated over c(1, 0), indexing past the end and returning NA; seq_len
# yields an empty loop so the single element is correctly replaced by leftValue.
leftShift <- function(x, leftValue = 0) {
  r = rep(leftValue, length(x))
  for ( i in seq_len(length(x) - 1) ) {
    r[i] = x[i+1]
  }
  r
}
# -----------------------------------------------------------------------------------------------
# Tranches plot
# -----------------------------------------------------------------------------------------------
data2 = read.table(tranchesFile,sep=",",head=T)
# Order tranches by novel Ti/Tv so bars stack from least to most stringent.
data2 = data2[order(data2$novelTiTv, decreasing=F),]
#data2 = data2[order(data2$FDRtranche, decreasing=T),]
# Bar colors/hatching: blue = true positives, orange = false positives;
# density 20 = hatched (cumulative), -1 = solid (tranche-specific).
cols = c("cornflowerblue", "cornflowerblue", "darkorange", "darkorange")
density=c(20, -1, -1, 20)
outfile = paste(tranchesFile, ".pdf", sep="")
pdf(outfile, height=5, width=8)
par(mar = c(5, 5, 4, 2) + 0.1)
# NOTE(review): concatenates both column spellings (novelTITV / novelTiTv);
# whichever column is absent is NULL and contributes nothing -- confirm the
# input schema only ever has one of the two.
novelTiTv = c(data2$novelTITV,data2$novelTiTv)
# alpha = estimated TP fraction of novel calls in each tranche.
alpha = 1 - titvFPEstV(targetTITV, novelTiTv)
#print(alpha)
numGood = round(alpha * data2$numNovel);
#numGood = round(data2$numNovel * (1-data2$targetTruthSensitivity/100))
numBad = data2$numNovel - numGood;
# Decompose cumulative counts into "previous tranche" + "new in this tranche".
numPrevGood = leftShift(numGood, 0)
numNewGood = numGood - numPrevGood
numPrevBad = leftShift(numBad, 0)
numNewBad = numBad - numPrevBad
d=matrix(c(numPrevGood,numNewGood, numNewBad, numPrevBad),4,byrow=TRUE)
#print(d)
# Horizontal stacked bars, counts in thousands.
barplot(d/1000,horiz=TRUE,col=cols,space=0.2,xlab="Number of Novel Variants (1000s)", density=density, cex.axis=1.25, cex.lab=1.25) # , xlim=c(250000,350000))
#abline(v= d[2,dim(d)[2]], lty=2)
#abline(v= d[1,3], lty=2)
if ( ! suppressLegend )
legend(3, length(data2$targetTruthSensitivity)/3 +1, c('Cumulative TPs','Tranch-specific TPs', 'Tranch-specific FPs', 'Cumulative FPs' ), fill=cols, density=density, bg='white', cex=1.25)
# Left-margin annotations: Ti/Tv and truth-sensitivity labels per bar
# (0.7 + i*1.2 matches barplot's default bar spacing with space=0.2).
mtext("Ti/Tv",2,line=2.25,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1)
mtext("truth",2,line=0,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1)
axis(2,line=-1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=data2$targetTruthSensitivity, las=1, cex.axis=1.0)
axis(2,line=1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=round(novelTiTv,3), las=1, cex.axis=1.0)
# plot sensitivity vs. specificity
# Second page: tranche truth sensitivity vs. novel Ti/Tv, with target lines.
sensitivity = data2$truthSensitivity
if ( ! is.null(sensitivity) ) {
#specificity = titvFPEstV(targetTITV, novelTiTv)
specificity = novelTiTv
plot(sensitivity, specificity, type="b", col="cornflowerblue", xlab="Tranche truth sensitivity", ylab="Specificity (Novel Ti/Tv ratio)")
abline(h=targetTITV, lty=2)
abline(v=targetSensitivity, lty=2)
#text(max(sensitivity), targetTITV-0.05, labels="Expected novel Ti/Tv", pos=2)
}
dev.off()

View File

@ -0,0 +1,108 @@
#!/bin/env Rscript
# Plots base-quality recalibration diagnostics for one covariate:
# residual error (empirical - reported quality), mean reported quality,
# and a histogram of observation counts, each versus the covariate value.
# Usage: script <table> <covariateName>
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
covariateName = args[2]
outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
# NOTE(review): `c` masks the builtin c(); calls still resolve to base::c.
c <- read.table(input, header=T)
# Sort rows by the first column (the covariate value).
c <- c[sort.list(c[,1]),]
#
# Plot residual error as a function of the covariate
#
# Split bins by evidence: >= 1000 observed bases (plotted dark) vs. fewer
# (plotted lighter, less trustworthy).
d.good <- c[c$nBases >= 1000,]
d.1000 <- c[c$nBases < 1000,]
# Base-count-weighted RMSE of the quality residuals.
rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) )
theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
# If every bin is well covered the two RMSEs coincide; report just one.
if( length(d.good$nBases) == length(c$nBases) ) {
theTitle = paste("RMSE =", round(rmseAll,digits=3))
}
# Don't let residual error go off the edge of the plot
# (residuals are clamped into [-10, 10]).
d.good$residualError = d.good$Qempirical-d.good$Qreported
d.good$residualError[which(d.good$residualError > 10)] = 10
d.good$residualError[which(d.good$residualError < -10)] = -10
d.1000$residualError = d.1000$Qempirical-d.1000$Qreported
d.1000$residualError[which(d.1000$residualError > 10)] = 10
d.1000$residualError[which(d.1000$residualError < -10)] = -10
c$residualError = c$Qempirical-c$Qreported
c$residualError[which(c$residualError > 10)] = 10
c$residualError[which(c$residualError < -10)] = -10
# Connect points with lines ("o") only when there are few covariate values.
pointType = "p"
if( length(c$Covariate) <= 20 ) {
pointType = "o"
}
if( is.numeric(c$Covariate) ) {
plot(d.good$Covariate, d.good$residualError, type=pointType, main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate)))
points(d.1000$Covariate, d.1000$residualError, type=pointType, col="cornflowerblue", pch=20)
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10))
points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue")
}
dev.off()
#
# Plot mean quality versus the covariate
#
# Reuses c, d.good and d.1000 computed in the residual-error section above.
outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
# Connect points with lines ("o") only when there are few covariate values.
pointType = "p"
if( length(c$Covariate) <= 20 ) {
pointType = "o"
}
theTitle = paste("Quality By", covariateName);
if( is.numeric(c$Covariate) ) {
plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate)))
points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20)
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40))
points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue")
}
dev.off()
#
# Plot histogram of the covariate
#
e = d.good
f = d.1000
outfile = paste(input, ".", covariateName,"_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
# Drop empty bins; data.frame() renames columns to e.Covariate / e.nBases etc.
hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0)
# Wider bars when there are few covariate values.
lwdSize=2
if( length(c$Covariate) <= 20 ) {
lwdSize=7
} else if( length(c$Covariate) <= 70 ) {
lwdSize=4
}
if( is.numeric(c$Covariate) ) {
# If no well-covered bins exist, plot only the low-coverage ones.
if( length(hst$e.Covariate) == 0 ) {
plot(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue", main=paste(covariateName,"histogram"), ylim=c(0, max(hst2$f.nBases)), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
} else {
plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), xlab=covariateName, ylim=c(0, max(hst$e.nBases)),ylab="Number of Bases",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue")
}
# Plain (non-scientific) tick labels for base counts.
axis(2,axTicks(2), format(axTicks(2), scientific=F))
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
# Non-numeric covariates: bars at integer positions with value labels,
# every other label when there are many categories.
hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0)
plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), ylim=c(0, max(hst$c.nBases)),xlab=covariateName, ylab="Number of Bases",yaxt="n",xaxt="n")
if( length(hst$c.Covariate) > 9 ) {
axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)])
} else {
axis(1, at=seq(1,length(hst$c.Covariate),1), labels = hst$c.Covariate)
}
axis(2,axTicks(2), format(axTicks(2), scientific=F))
}
dev.off()

View File

@ -0,0 +1,70 @@
#!/bin/env Rscript
# Plots base-quality recalibration summary: empirical vs. reported quality
# scatter plus histograms of both, stratified by per-bin evidence (nBases).
# Usage: script <table> <Qcutoff> <maxQ> <maxHist>
args <- commandArgs(TRUE)
input = args[1]
Qcutoff = as.numeric(args[2])  # bins reported below this Q are highlighted separately
maxQ = as.numeric(args[3])     # axis limit for quality scores
maxHist = as.numeric(args[4])  # fixed histogram y-max; 0 means auto-scale
# NOTE(review): `t` masks the builtin t(); function calls still resolve.
t=read.table(input, header=T)
#
# Plot of reported quality versus empirical quality
#
outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
pdf(outfile, height=7, width=7)
# Above-cutoff bins stratified by evidence; f holds the below-cutoff bins.
d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,]
d.1000 <- t[t$nBases < 1000 & t$Qreported >= Qcutoff,]
d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000 & t$Qreported >= Qcutoff,]
f <- t[t$Qreported < Qcutoff,]
e <- rbind(d.good, d.1000, d.10000)
# Base-count-weighted RMSE over the well-covered bins and over all above-cutoff bins.
rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
rmseAll = sqrt( sum(as.numeric((e$Qempirical-e$Qreported)^2 * e$nBases)) / sum(as.numeric(e$nBases)) )
theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
# When all above-cutoff bins are well covered the two RMSEs coincide.
if( length(t$nBases) - length(f$nBases) == length(d.good$nBases) ) {
theTitle = paste("RMSE =", round(rmseAll,digits=3));
}
# Darker blue = more evidence; maroon = below the quality cutoff.
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,maxQ), ylim=c(0,maxQ), pch=16, xlab="Reported quality score", ylab="Empirical quality score")
points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16)
points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16)
points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16)
abline(0,1, lty=2) # y = x: perfectly calibrated qualities fall on this line
dev.off()
#
# Plot Q empirical histogram
#
outfile = paste(input, ".quality_emp_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
# Drop empty bins; data.frame() renames columns to e.Qempirical / e.nBases etc.
hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0)
# Shannon entropy (bits) of the above-cutoff base-count distribution.
percentBases=hst$e.nBases / sum(as.numeric(hst$e.nBases))
entropy = -sum(log2(percentBases)*percentBases)
yMax = max(hst$e.nBases)
if(maxHist != 0) {
yMax = maxHist
}
plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n")
points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1")
# Plain (non-scientific) tick labels for base counts.
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()
#
# Plot Q reported histogram
#
outfile = paste(input, ".quality_rep_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0)
yMax = max(hst$e.nBases)
if(maxHist != 0) {
yMax = maxHist
}
# NOTE(review): the title reuses `entropy` computed from the EMPIRICAL
# histogram above; it is not recomputed for the reported distribution --
# confirm this is intentional.
plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n")
points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1")
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()

View File

@ -0,0 +1,138 @@
# Estimate the false-positive rate of a call set from its observed Ti/Tv,
# given the Ti/Tv expected for true variants (0.5 is the Ti/Tv of random,
# all-FP calls).  The result is clamped into [0.001, 1].
titvFPEst <- function(titvExpected, titvObserved) {
  estimate = 1 - (titvObserved - 0.5) / (titvExpected - 0.5)
  min(max(estimate, 0.001), 1)
}
# Vectorized wrapper for titvFPEst over a vector of observed Ti/Tv values.
titvFPEstV <- function(titvExpected, titvs) {
  vapply(titvs, function(titvObserved) titvFPEst(titvExpected, titvObserved), numeric(1))
}
# Heterozygosity estimate: true positives = known variants plus the estimated
# TP fraction of novels (via titvFPEst), scaled by 2/3 over callable bases.
calcHet <- function(nknown, knownTiTv, nnovel, novelTiTv, callable) {
  fpRate <- titvFPEst(knownTiTv, novelTiTv)
  truePositives <- nknown + (1 - fpRate) * nnovel
  (2 * truePositives) / (3 * callable)
}
# Ti/Tv of the marginal set x - y: decompose each set's count into
# transitions and transversions using its Ti/Tv ratio, subtract component-wise,
# and re-form the ratio.
marginalTiTv <- function( nx, titvx, ny, titvy ) {
  transversionsX = nx / (titvx + 1)
  transitionsX   = nx - transversionsX
  transversionsY = ny / (titvy + 1)
  transitionsY   = ny - transversionsY
  (transitionsX - transitionsY) / (transversionsX - transversionsY)
}
# dbSNP rate (percent) of the marginal set x - y, where dbx/dby are each
# set's dbSNP percentage: subtract known and novel counts separately, then
# recompute the percentage on the difference.
marginaldbSNPRate <- function( nx, dbx, ny, dby ) {
  knownX = nx * dbx / 100
  knownY = ny * dby / 100
  novelX = nx - knownX
  novelY = ny - knownY
  marginalKnown = knownX - knownY
  marginalNovel = novelX - novelY
  100 * marginalKnown / (marginalKnown + marginalNovel)
}
# Expected number of variant calls over a region of length L at diversity
# theta: harmonic sum over the 2N sampled chromosomes times theta * L,
# restricted to the called fraction, then split by the dbSNP (known) rate.
numExpectedCalls <- function(L, theta, calledFractionOfRegion, nIndividuals, dbSNPRate) {
  harmonicFactor <- sum(1 / seq(1, 2 * nIndividuals))
  nCalls <- L * theta * calledFractionOfRegion * harmonicFactor
  list(nCalls = nCalls, nKnown = dbSNPRate * nCalls, nNovel = (1 - dbSNPRate) * nCalls)
}
# Scale a numeric vector so its elements sum to 1.
normalize <- function(x) x / sum(x)
# Empirical CDF of x: running sum of the normalized (sum-to-1) vector.
# The normalization is inlined (x / sum(x)) rather than calling normalize().
normcumsum <- function(x) {
  cumsum(x / sum(x))
}
# Plot the values of d in ascending order as an orange point-and-line curve;
# extra graphics arguments are forwarded to plot().
cumhist <- function(d, ...) {
  ascending = d[order(d)]
  plot(ascending, type="b", col="orange", lwd=2, ...)
}
# Cumulative sum taken from the right-hand end of the vector:
# revcumsum(c(1,2,3)) == c(6,5,3).
revcumsum <- function(x) {
  reversed = rev(x)
  rev(cumsum(reversed))
}
# Convert an error probability to a phred-scaled quality (-10 * log10(p)).
# Probabilities are floored at 10^-9.9 so phred(0) caps at Q99 instead of Inf.
# pmax (element-wise) replaces max so the function also works on vector
# input; scalar behavior is unchanged.
phred <- function(x) {
  -10 * log10(pmax(x, 10^(-9.9)))
}
# Probability of observing base b when the true base is B at quality Q:
# 1 - 10^(-Q/10) when they match, the error probability otherwise.
pOfB <- function(b, B, Q) {
  #print(paste(b, B, Q))
  pCorrect = 1 - 10^(-Q/10)
  if ( b == B ) pCorrect else 1 - pCorrect
}
# log10 likelihood of diploid genotype G (a length-2 allele vector) given
# observed bases bs with qualities qs: each base is drawn from either allele
# with probability 1/2, errors independent across bases.
# seq_along replaces 1:length(bs): the old form iterated over c(1, 0) for
# empty input and indexed out of bounds; seq_along gives an empty loop
# (log-likelihood 0 for no data).
pOfG <- function(bs, qs, G) {
  alleleOne = G[1]
  alleleTwo = G[2]
  log10p = 0
  for ( i in seq_along(bs) ) {
    perBase = pOfB(bs[i], alleleOne, qs[i]) / 2 + pOfB(bs[i], alleleTwo, qs[i]) / 2
    log10p = log10p + log10(perBase)
  }
  return(log10p)
}
# Genotype likelihoods for a pileup of nAs 'a' bases and nBs 't' bases, all
# at quality Q.  Returns log10 likelihoods of the three genotypes aa/at/tt
# (p1/p2/p3) plus Qsample, the phred-scaled confidence that the site is het.
pOfGs <- function(nAs, nBs, Q) {
  bases = c(rep("a", nAs), rep("t", nBs))
  quals = rep(Q, nAs + nBs)
  logLikHomRef = pOfG(bases, quals, c("a", "a"))
  logLikHet    = pOfG(bases, quals, c("a", "t"))
  logLikHomVar = pOfG(bases, quals, c("t", "t"))
  totalLik = sum(10^(c(logLikHomRef, logLikHet, logLikHomVar)))
  Qsample = phred(1 - 10^logLikHet / totalLik)
  return(list(p1=logLikHomRef, p2=logLikHet, p3=logLikHomVar, Qsample=Qsample))
}
# Expected phred-scaled het-call quality at Poisson-average coverage `depth`
# and per-base quality Q: averages pOfGs()$Qsample over depth d (truncated at
# 3x the mean) and binomial allele sampling of the d bases.
# NOTE(review): prints a progress data.frame every inner iteration -- noisy,
# but preserved here; confirm whether callers rely on the trace output.
QsampleExpected <- function(depth, Q) {
weightedAvg = 0
for ( d in 1:(depth*3) ) {
Qsample = 0
pOfD = dpois(d, depth)  # probability of observing exactly d reads
for ( nBs in 0:d ) {
pOfnB = dbinom(nBs, d, 0.5)  # probability nBs of the d bases come from the alt allele
nAs = d - nBs
Qsample = pOfGs(nAs, nBs, Q)$Qsample
#Qsample = 1
weightedAvg = weightedAvg + Qsample * pOfD * pOfnB
print(as.data.frame(list(d=d, nBs = nBs, pOfD=pOfD, pOfnB = pOfnB, Qsample=Qsample, weightedAvg = weightedAvg)))
}
}
return(weightedAvg)
}
# Draws QsampleExpected as a function of coverage: one rainbow-colored curve
# per base quality in Qs, on a fixed [0, Qmax] y-axis.
plotQsamples <- function(depths, Qs, Qmax) {
cols = rainbow(length(Qs))
# Empty frame first (type="n"); curves are added one at a time below.
plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling")
for ( i in 1:length(Qs) ) {
Q = Qs[i]
y = as.numeric(lapply(depths, function(x) QsampleExpected(x, Q)))
points(depths, y, col=cols[i], type="b")
}
legend("topleft", paste("Q", Qs), fill=cols)
}
# For each possible coverage d in 0..2*depth, the joint probability that
# coverage is exactly d (Poisson with mean `depth`) AND more than
# nallelesToCall alternate alleles are sampled (binomial p = 0.5).
pCallHetGivenDepth <- function(depth, nallelesToCall) {
  coverages = 0:(2*depth)
  pTooFewAltAlleles = sapply(coverages, function(d) sum(dbinom(0:nallelesToCall, d, 0.5)))
  dpois(coverages, depth) * (1 - pTooFewAltAlleles)
}
# Total probability of calling a het site at Poisson-average coverage `depth`,
# marginalized over the per-coverage probabilities.
pCallHets <- function(depth, nallelesToCall) {
  perCoverage = pCallHetGivenDepth(depth, nallelesToCall)
  sum(perCoverage)
}
# Probability the het is called in at least one of nsamples independent
# samples: complement of missing it in every sample.
pCallHetMultiSample <- function(depth, nallelesToCall, nsamples) {
  pMissOneSample = 1 - pCallHets(depth, nallelesToCall)
  1 - pMissOneSample^nsamples
}

View File

@ -798,11 +798,11 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
private void generateAlternateConsensesFromKnownIndels(final Set<Consensus> altConsensesToPopulate, final int leftmostIndex, final byte[] reference) {
for ( VariantContext knownIndel : knownIndelsToTry ) {
if ( knownIndel == null || !knownIndel.isIndel() )
if ( knownIndel == null || !knownIndel.isIndel() || knownIndel.isComplexIndel() )
continue;
byte[] indelStr = knownIndel.isInsertion() ? knownIndel.getAlternateAllele(0).getBases() : Utils.dupBytes((byte)'-', knownIndel.getReference().length());
int start = knownIndel.getStart() - leftmostIndex + 1;
Consensus c = createAlternateConsensus(start, reference, indelStr, knownIndel.isDeletion());
Consensus c = createAlternateConsensus(start, reference, indelStr, knownIndel);
if ( c != null )
altConsensesToPopulate.add(c);
}
@ -988,7 +988,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
}
// create a Consensus from just the indel string that falls on the reference
private Consensus createAlternateConsensus(final int indexOnRef, final byte[] reference, final byte[] indelStr, final boolean isDeletion) {
private Consensus createAlternateConsensus(final int indexOnRef, final byte[] reference, final byte[] indelStr, final VariantContext indel) {
if ( indexOnRef < 0 || indexOnRef >= reference.length )
return null;
@ -1002,14 +1002,16 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
if ( indexOnRef > 0 )
cigar.add(new CigarElement(indexOnRef, CigarOperator.M));
if ( isDeletion ) {
if ( indel.isDeletion() ) {
refIdx += indelStr.length;
cigar.add(new CigarElement(indelStr.length, CigarOperator.D));
}
else {
else if ( indel.isInsertion() ) {
for ( byte b : indelStr )
sb.append((char)b);
cigar.add(new CigarElement(indelStr.length, CigarOperator.I));
} else {
throw new IllegalStateException("Creating an alternate consensus from a complex indel is not allows");
}
if ( reference.length - refIdx > 0 )

View File

@ -0,0 +1,143 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.commandline.Argument;
import java.io.File;
import java.util.*;
@By(DataSource.READS)
// walker to count realigned reads
// Walks reads against a sorted interval list (output of RealignerTargetCreator)
// and tallies how many intervals/reads were actually changed by realignment,
// detected via the presence of the original-cigar ("OC") tag.
public class RealignedReadCounter extends ReadWalker<Integer, Integer> {
// SAM tags written by the realigner: original cigar and original position.
public static final String ORIGINAL_CIGAR_TAG = "OC";
public static final String ORIGINAL_POSITION_TAG = "OP";
@Argument(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true)
protected String intervalsFile = null;
// the intervals input by the user
private Iterator<GenomeLoc> intervals = null;
// the current interval in the list
private GenomeLoc currentInterval = null;
// Running totals reported in onTraversalDone.
private long updatedIntervals = 0, updatedReads = 0, affectedBases = 0;
// True once the current interval has contributed to updatedIntervals,
// so each interval is counted at most once.
private boolean intervalWasUpdated = false;
public void initialize() {
// prepare to read intervals one-by-one, as needed (assuming they are sorted).
intervals = new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new File(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY );
currentInterval = intervals.hasNext() ? intervals.next() : null;
}
// Per-read visitor: counts reads whose cigar differs from their stored
// original ("OC") cigar within the current interval; advances the interval
// pointer once reads pass it.  Always returns 0 (counting is via fields).
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
if ( currentInterval == null ) {
return 0;
}
GenomeLoc readLoc = ref.getGenomeLocParser().createGenomeLoc(read);
// hack to get around unmapped reads having screwy locations
if ( readLoc.getStop() == 0 )
readLoc = ref.getGenomeLocParser().createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart());
// 454 reads are excluded from the tally entirely.
if ( readLoc.isBefore(currentInterval) || ReadUtils.is454Read(read) )
return 0;
if ( readLoc.overlapsP(currentInterval) ) {
if ( doNotTryToClean(read) )
return 0;
// An OC tag means the realigner rewrote this read's cigar.
if ( read.getAttribute(ORIGINAL_CIGAR_TAG) != null ) {
String newCigar = (String)read.getAttribute(ORIGINAL_CIGAR_TAG);
// deal with an old bug
// (OC tag present but cigar unchanged -- not a real update)
if ( read.getCigar().toString().equals(newCigar) ) {
//System.out.println(currentInterval + ": " + read.getReadName() + " " + read.getCigarString() + " " + newCigar);
return 0;
}
if ( !intervalWasUpdated ) {
intervalWasUpdated = true;
updatedIntervals++;
// NOTE(review): 20 appears to be a fixed per-interval padding added
// to the indel length -- confirm where this constant comes from.
affectedBases += 20 + getIndelSize(read);
}
updatedReads++;
}
} else {
// Read is past the current interval: skip forward to the first interval
// not before this read (or exhaust the list).
do {
intervalWasUpdated = false;
currentInterval = intervals.hasNext() ? intervals.next() : null;
} while ( currentInterval != null && currentInterval.isBefore(readLoc) );
}
return 0;
}
// Size of the first indel in the read's cigar.
// NOTE(review): insertions return 0 while deletions return their length --
// confirm this asymmetry is intended for the affectedBases tally.
private int getIndelSize(SAMRecord read) {
for ( CigarElement ce : read.getCigar().getCigarElements() ) {
if ( ce.getOperator() == CigarOperator.I )
return 0;
if ( ce.getOperator() == CigarOperator.D )
return ce.getLength();
}
logger.warn("We didn't see an indel for this read: " + read.getReadName() + " " + read.getAlignmentStart() + " " + read.getCigar());
return 0;
}
// Mirrors the realigner's own read filters: reads it would never have
// touched are excluded from the counts.
private boolean doNotTryToClean(SAMRecord read) {
return read.getReadUnmappedFlag() ||
read.getNotPrimaryAlignmentFlag() ||
read.getReadFailsVendorQualityCheckFlag() ||
read.getMappingQuality() == 0 ||
read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ||
(BadMateFilter.hasBadMate(read));
}
public Integer reduceInit() {
return 0;
}
public Integer reduce(Integer value, Integer sum) {
return sum + value;
}
// Final report; the reduce result is ignored (all counting is in fields).
public void onTraversalDone(Integer result) {
System.out.println(updatedIntervals + " intervals were updated");
System.out.println(updatedReads + " reads were updated");
System.out.println(affectedBases + " bases were affected");
}
}

View File

@ -161,7 +161,7 @@ public class AnnotateMNPsWalker extends RodWalker<Integer, Integer> {
boolean atStartOfVc = curLocus.getStart() == vcLoc.getStart();
boolean atEndOfVc = curLocus.getStart() == vcLoc.getStop();
if (vc.getType() == VariantContext.Type.MNP) {
if (vc.isMNP()) {
logger.debug("Observed MNP at " + vcLoc);
if (isChrM(vc)) {

View File

@ -0,0 +1,62 @@
package org.broadinstitute.sting.gatk.walkers.qc;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.RefWalker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.collections.Pair;
import java.util.List;
import java.io.PrintStream;
/**
* Counts the number of contiguous regions the walker traverses over. Slower than it needs to be, but
* very useful since overlapping intervals get merged, so you can count the number of intervals the GATK merges down to.
* This was its very first use.
*/
public class CountIntervals extends RefWalker<Long, Long> {
    @Output
    PrintStream out;

    @Argument(fullName="numOverlaps",shortName="no",doc="Count all occurrences of X or more overlapping intervals; defaults to 2", required=false)
    int numOverlaps = 2;

    /** Per-interval running maximum starts at zero overlapping features. */
    public Long reduceInit() {
        return 0L; // uppercase L: lowercase 'l' is easily misread as the digit 1
    }

    /** Reduce per merged interval so onTraversalDone gets one count per interval. */
    public boolean isReduceByInterval() { return true; }

    /**
     * @return the number of "check" features overlapping this locus, or null
     *         (no data at this locus) which reduce treats as "no change".
     */
    public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if ( tracker == null ) {
            return null;
        }
        List<GATKFeature> checkIntervals = tracker.getGATKFeatureMetaData("check", false);
        return (long) checkIntervals.size();
    }

    /**
     * Tracks the maximum simultaneous overlap within the current interval.
     * Fix: a locus with no data (null) previously reset the accumulator to 0,
     * discarding the maximum seen so far in this interval; now it preserves prev.
     */
    public Long reduce(Long loc, Long prev) {
        return loc == null ? prev : Math.max(prev, loc);
    }

    /** Reports how many merged intervals reached at least numOverlaps overlaps. */
    public void onTraversalDone(List<Pair<GenomeLoc,Long>> finalReduce) {
        long count = 0;
        for ( Pair<GenomeLoc,Long> g : finalReduce ) {
            if ( g.second >= numOverlaps ) {
                count++;
            }
        }
        // %n terminates the report line (the original omitted the newline).
        out.printf("Number of contiguous intervals: %d%n", count);
    }
}

View File

@ -1,157 +0,0 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.qc;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
import org.broad.tribble.readers.AsciiLineReader;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.SimpleTimer;
import java.io.PrintStream;
import java.io.File;
import java.io.FileInputStream;
import java.util.*;
/**
 * Profiles the raw I/O and decoding performance of the ROD (reference-ordered data) system
 * on a single input file, timing several read modes from raw byte reads up to full VCF
 * decoding, then timing the GATK traversal itself.
 */
@Requires(value={})
public class ProfileRodSystem extends RodWalker<Integer, Integer> {
    @Output(doc="File to which results should be written",required=true)
    protected PrintStream out;

    @Argument(fullName="nIterations", shortName="N", doc="Number of raw reading iterations to perform", required=false)
    int nIterations = 1;

    @Argument(fullName="maxRecords", shortName="M", doc="Max. number of records to process", required=false)
    int MAX_RECORDS = -1;

    // Single timer reused for every measurement; restarted before each timed section.
    SimpleTimer timer = new SimpleTimer("myTimer");

    /**
     * Runs the raw-file benchmarks (bytes, lines, split parts, loc-only decode, full decode)
     * nIterations times before the traversal starts, then restarts the timer so that
     * onTraversalDone() reports the walltime of the GATK traversal itself.
     */
    public void initialize() {
        File rodFile = getRodFile();

        out.printf("# walltime is in seconds%n");
        out.printf("# file is %s%n", rodFile);
        out.printf("# file size is %d bytes%n", rodFile.length());
        out.printf("operation\titeration\twalltime%n");

        for ( int i = 0; i < nIterations; i++ ) {
            out.printf("read.bytes\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_BYTE));
            out.printf("read.line\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_LINE));
            out.printf("line.and.parts\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_PARTS));
            out.printf("decode.loc\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.DECODE_LOC));
            out.printf("full.decode\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.DECODE));
        }

        timer.start(); // start up timer for map itself
    }

    // How much work each benchmark pass performs per record.
    private enum ReadMode { BY_BYTE, BY_LINE, BY_PARTS, DECODE_LOC, DECODE };

    /**
     * Times one pass over the file in the given mode.
     *
     * @param f    the file to read
     * @param mode how much decoding work to perform per record
     * @return elapsed walltime in seconds for the pass
     */
    private final double readFile(File f, ReadMode mode) {
        timer.start();

        FileInputStream s = null;
        try {
            byte[] data = new byte[100000];
            s = new FileInputStream(f);
            if ( mode == ReadMode.BY_BYTE ) {
                while (true) {
                    if ( s.read(data) == -1 )
                        break;
                }
            } else {
                int counter = 0;
                VCFCodec codec = new VCFCodec();
                String[] parts = new String[100000];
                AsciiLineReader lineReader = new AsciiLineReader(s);
                // the VCF header must be consumed before decode/decodeLoc calls will work
                if ( mode == ReadMode.DECODE_LOC || mode == ReadMode.DECODE )
                    codec.readHeader(lineReader);

                while (counter++ < MAX_RECORDS || MAX_RECORDS == -1) {
                    String line = lineReader.readLine();
                    if ( line == null )
                        break;
                    else if ( mode == ReadMode.BY_PARTS ) {
                        ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR);
                    }
                    else if ( mode == ReadMode.DECODE_LOC ) {
                        codec.decodeLoc(line);
                    }
                    else if ( mode == ReadMode.DECODE ) {
                        processOneVC((VariantContext)codec.decode(line));
                    }
                }
            }
        } catch ( Exception e ) {
            throw new RuntimeException(e);
        } finally {
            // BUGFIX: the stream was previously never closed, leaking a file descriptor
            // per benchmark pass; close it best-effort here.
            if ( s != null ) {
                try { s.close(); } catch ( Exception e ) { /* ignore close failure */ }
            }
        }

        return timer.getElapsedTime();
    }

    /**
     * @return the first ROD file bound on the command line; assumes at least one ROD was provided
     */
    private File getRodFile() {
        List<ReferenceOrderedDataSource> rods = this.getToolkit().getRodDataSources();
        ReferenceOrderedDataSource rod = rods.get(0);
        return rod.getFile();
    }

    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if ( tracker == null ) // RodWalkers can make funky map calls
            return 0;

        VariantContext vc = tracker.getVariantContext(ref, "rod", context.getLocation());
        processOneVC(vc);

        return 0;
    }

    // Touch the genotypes so that lazy sample parsing is actually exercised.
    private static final void processOneVC(VariantContext vc) {
        vc.getNSamples(); // force us to parse the samples
    }

    public Integer reduceInit() {
        return 0;
    }

    public Integer reduce(Integer counter, Integer sum) {
        return counter + sum;
    }

    public void onTraversalDone(Integer sum) {
        out.printf("gatk.traversal\t%d\t%.2f%n", 0, timer.getElapsedTime());
    }
}

View File

@ -0,0 +1,153 @@
package org.broadinstitute.sting.gatk.walkers.qc;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.*;
import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Collection;
import java.util.List;
/**
 * A walker for validating (in the style of validating pile-up) the ROD system.
 * Emits a header with the name, type, and MD5 of every bound ROD file, then one line
 * per site that carries at least one ROD record.
 */
@Reference(window=@Window(start=-40,stop=40))
public class RodSystemValidationWalker extends RodWalker<Integer,Integer> {
    // the divider to use in some of the text output
    private static final String DIVIDER = ",";

    @Output
    public PrintStream out;

    @Argument(fullName="PerLocusEqual",required=false,doc="Should we check that all records at the same site produce equivilent variant contexts")
    public boolean allRecordsVariantContextEquivalent = false;

    // used to calculate the MD5 of a file
    MessageDigest digest = null;

    // we sometimes need to know what rods the engine's seen
    List<ReferenceOrderedDataSource> rodList;

    /**
     * emit the md5 sums for each of the input ROD files (will save up a lot of time if and when the ROD files change
     * underneath us).
     */
    public void initialize() {
        // setup the MD5-er
        try {
            digest = MessageDigest.getInstance("MD5");
        } catch (NoSuchAlgorithmException e) {
            throw new ReviewedStingException("Unable to find MD5 checksumer");
        }
        out.println("Header:");
        // enumerate the list of ROD's we've loaded
        rodList = this.getToolkit().getRodDataSources();
        for (ReferenceOrderedDataSource rod : rodList) {
            out.println(rod.getName() + DIVIDER + rod.getType());
            out.println(rod.getName() + DIVIDER + rod.getFile());
            out.println(rod.getName() + DIVIDER + md5sum(rod.getFile()));
        }
        out.println("Data:");
    }

    /**
     *
     * @param tracker the ref meta data tracker to get RODs
     * @param ref reference context
     * @param context the reads
     * @return an 1 for each site with a rod(s), 0 otherwise
     */
    @Override
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        int ret = 0;
        if (tracker != null && tracker.getAllRods().size() > 0) {
            out.print(context.getLocation() + DIVIDER);
            Collection<GATKFeature> features = tracker.getAllRods();
            for (GATKFeature feat : features)
                out.print(feat.getName() + DIVIDER);
            out.println(";");
            ret++;
        }
        // if the argument was set, check for equivalence
        if (allRecordsVariantContextEquivalent && tracker != null) {
            Collection<VariantContext> col = tracker.getAllVariantContexts(ref);
            VariantContext con = null;
            for (VariantContext contextInList : col) {
                if (con == null) con = contextInList;
                // BUGFIX: this previously compared con against the whole Collection (col),
                // which can never equal a VariantContext, so every record after the first
                // was reported as a failure. Compare against the individual record instead.
                else if (!con.equals(contextInList)) out.println("FAIL: context " + contextInList + " doesn't match " + con);
            }
        }
        return ret;
    }

    /**
     * Provide an initial value for reduce computations.
     *
     * @return Initial value of reduce.
     */
    @Override
    public Integer reduceInit() {
        return 0;
    }

    /**
     * Reduces a single map with the accumulator provided as the ReduceType.
     *
     * @param value result of the map.
     * @param sum accumulator for the reduce.
     * @return accumulator with result of the map taken into account.
     */
    @Override
    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }

    @Override
    public void onTraversalDone(Integer result) {
        // Double check traversal result to make count is the same.
        // TODO: Is this check necessary?
        out.println("[REDUCE RESULT] Traversal result is: " + result);
    }

    // shamelessly absconded and adapted from http://www.javalobby.org/java/forums/t84420.html
    /**
     * Computes the hex MD5 of a file's contents using the shared digest.
     * MessageDigest.digest() resets the digest, so reuse across files is safe.
     *
     * @param f file to checksum
     * @return hex MD5 string, or "Not a file" when f cannot be opened
     */
    private String md5sum(File f) {
        InputStream is;
        try {
            is = new FileInputStream(f);
        } catch (FileNotFoundException e) {
            return "Not a file";
        }
        byte[] buffer = new byte[8192];
        int read = 0;
        try {
            while ((read = is.read(buffer)) > 0) {
                digest.update(buffer, 0, read);
            }
            byte[] md5sum = digest.digest();
            BigInteger bigInt = new BigInteger(1, md5sum);
            return bigInt.toString(16);
        }
        catch (IOException e) {
            throw new RuntimeException("Unable to process file for MD5", e);
        }
        finally {
            try {
                is.close();
            }
            catch (IOException e) {
                throw new RuntimeException("Unable to close input stream for MD5 calculation", e);
            }
        }
    }
}

View File

@ -1,298 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.qc;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.Argument;
import java.io.PrintStream;
/**
 * Validation walker for the BAQ (Base Alignment Quality) implementation: for each eligible
 * read it recomputes BAQ with the HMM and compares the result, base by base, against the BAQ
 * encoded in the read's BQ tag, reporting (and, in -strict mode, failing on) disagreements.
 * With -profile set it instead times the BAQ code paths over each read.
 */
@BAQMode(QualityMode = BAQ.QualityMode.DONT_MODIFY, ApplicationTime = BAQ.ApplicationTime.HANDLED_IN_WALKER)
@Reference(window=@Window(start=-5,stop=5))
@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
public class ValidateBAQWalker extends ReadWalker<Integer, Integer> {
    @Output(doc="File to which results should be written",required=true)
    protected PrintStream out;

    @Argument(doc="maximum read length to apply the BAQ calculation too",required=false)
    protected int maxReadLen = 1000;

    // band width passed to the samtools-style HMM; only consulted when samtoolsMode is set
    @Argument(doc="",required=false)
    protected int bw = 7;

    @Argument(doc="",required=false)
    protected boolean samtoolsMode = false;

    @Argument(doc="only operates on reads with this name",required=false)
    protected String readName = null;

    @Argument(doc="If true, all differences are errors", required=false)
    protected boolean strict = false;

    @Argument(doc="prints info for each read", required=false)
    protected boolean printEachRead = false;

    @Argument(doc="Also prints out detailed comparison information when for known calculation differences", required=false)
    protected boolean alsoPrintWarnings = false;

    @Argument(doc="Include reads without BAQ tag", required=false)
    protected boolean includeReadsWithoutBAQTag = false;

    // how many times each read is re-processed in profiling mode (magnifies the timings)
    @Argument(doc="x each read is processed", required=false)
    protected int magnification = 1;

    @Argument(doc="Profile performance", required=false)
    protected boolean profile = false;

    // number of reads examined so far; drives the periodic "Checking read" progress line
    int counter = 0;

    BAQ baqHMM = null; // matches current samtools parameters

    public void initialize() {
        // samtoolsMode mirrors samtools' exact HMM parameterization; otherwise GATK defaults
        if ( samtoolsMode )
            baqHMM = new BAQ(1e-3, 0.1, bw, (byte)0, true);
        else
            baqHMM = new BAQ();
    }

    // tallies of BAQ-tagged reads whose tag/HMM BAQs agreed vs. differed (see validateBAQ)
    long goodReads = 0, badReads = 0;

    public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) {
        // only consider reads passing the user's name/length/tag filters
        if ( (readName == null || readName.equals(read.getReadName())) && read.getReadLength() <= maxReadLen && (includeReadsWithoutBAQTag || BAQ.hasBAQTag(read) ) ) {
            if ( baqHMM.excludeReadFromBAQ(read) )
                return 0;
            if ( profile ) {
                profileBAQ(ref, read);
            } else {
                validateBAQ(ref, read);
            }
            return 1;
        }
        return 0;
    }

    SimpleTimer tagTimer = new SimpleTimer("from.tag");
    SimpleTimer baqReadTimer = new SimpleTimer("baq.read");
    SimpleTimer glocalTimer = new SimpleTimer("hmm.glocal");

    // Times the BAQ code paths on a single read, repeating each `magnification` times.
    // NOTE(review): the baq.read and hmm.glocal sections invoke the identical baqRead call,
    // so the two timers measure the same work — confirm whether glocal was meant to call
    // the HMM entry point directly.
    private void profileBAQ(ReferenceContext ref, SAMRecord read) {
        IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference();

        BAQ.BAQCalculationResult baq = null;

        tagTimer.restart();
        for ( int i = 0; i < magnification; i++ ) { BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag); }
        tagTimer.stop();

        baqReadTimer.restart();
        for ( int i = 0; i < magnification; i++ ) { baqHMM.baqRead(read, refReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.DONT_MODIFY ); }
        baqReadTimer.stop();

        glocalTimer.restart();
        for ( int i = 0; i < magnification; i++ )
            baqHMM.baqRead(read, refReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.DONT_MODIFY);
        glocalTimer.stop();
    }

    // Recomputes BAQ via the HMM and diffs it against the BQ-tag BAQ, byte by byte.
    // Known causes of disagreement (cigar/read length mismatch, Q0 bases, soft clips,
    // deletions, sub-minimum quals) are reported but only fail the run when -strict is set
    // (cigar-length mismatch never fails); an unexplained difference always fails.
    private void validateBAQ(ReferenceContext ref, SAMRecord read) {
        IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference();

        byte[] baqFromTag = BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag);
        if (counter++ % 1000 == 0 || printEachRead) out.printf("Checking read %s (%d)%n", read.getReadName(), counter);
        BAQ.BAQCalculationResult baq = baqHMM.calcBAQFromHMM(read, refReader);

        boolean fail = false;   // true => throw at the end of the method
        boolean print = false;  // true => a difference worth reporting was seen
        int badi = 0;           // index of the first differing base
        if ( BAQ.hasBAQTag(read) ) {
            for ( badi = 0; badi < baqFromTag.length; badi++ ) {
                if ( baqFromTag[badi] != baq.bq[badi] ) {
                    if ( cigarLength(read) != read.getReadLength() ) {
                        print = true;
                        fail = false;
                        out.printf(" different, but cigar length != read length%n");
                        break;
                    }
                    if (MathUtils.arrayMin(read.getBaseQualities()) == 0) {
                        print = true;
                        fail = strict;
                        out.printf(" different, but Q0 base detected%n");
                        break;
                    }
                    else if (readHasSoftClip(read) && ! samtoolsMode) {
                        print = true;
                        fail = strict;
                        out.printf(" different, but soft clip detected%n");
                        break;
                    } else if (readHasDeletion(read) ) { // && ! samtoolsMode) {
                        print = true;
                        fail = strict;
                        out.printf(" different, but deletion detected%n");
                        break;
                    } else if ( baq.bq[badi] < baqHMM.getMinBaseQual() ) {
                        print = fail = true;
                        out.printf(" Base quality %d < min %d", baq.bq[badi], baqHMM.getMinBaseQual());
                        break;
                    } else {
                        // no known explanation for the difference: always an error
                        print = fail = true;
                        break;
                    }
                }
            }

            // note: only reads carrying a BAQ tag are counted in the good/bad tallies
            if ( fail || print )
                badReads++;
            else
                goodReads++;
        }

        if ( fail || printEachRead || ( print && alsoPrintWarnings ) ) {
            // dump a full side-by-side picture of the read and both BAQ computations
            byte[] pos = new byte[baq.bq.length];
            for ( int i = 0; i < pos.length; i++ ) pos[i] = (byte)i;
            out.printf(" read length : %d%n", read.getReadLength());
            out.printf(" read start : %d (%d unclipped)%n", read.getAlignmentStart(), read.getUnclippedStart());
            out.printf(" cigar : %s%n", read.getCigarString());
            out.printf(" ref bases : %s%n", new String(baq.refBases));
            out.printf(" read bases : %s%n", new String(read.getReadBases()));
            out.printf(" ref length : %d%n", baq.refBases.length);
            out.printf(" BQ tag : %s%n", read.getStringAttribute(BAQ.BAQ_TAG));
            if ( BAQ.hasBAQTag(read) ) printQuals(" BQ deltas : ", getBAQDeltas(read), true);
            printQuals(" original quals: ", read.getBaseQualities(), true);
            printQuals(" baq quals: ", baq.bq, true);
            printQuals(" positions : ", pos, true);
            printQuals(" original quals: ", read.getBaseQualities());
            if ( BAQ.hasBAQTag(read) ) printQuals(" tag quals: ", baqFromTag);
            printQuals(" hmm quals: ", baq.bq);
            out.printf(" read bases : %s%n", new String(read.getReadBases()));
            out.println(Utils.dupString('-', 80));
        }

        if ( fail )
            throw new StingException(String.format("BAQ from read and from HMM differ in read %s at position %d: tag qual = %d, hmm qual = %d",
                    read.getReadName(), badi, baqFromTag[badi], baq.bq[badi]));
    }

    // true if any cigar element of the read is a soft clip
    private final static boolean readHasSoftClip(SAMRecord read) {
        for (CigarElement e : read.getCigar().getCigarElements()) {
            if ( e.getOperator() == CigarOperator.SOFT_CLIP )
                return true;
        }
        return false;
    }

    // true if any cigar element of the read is a deletion
    private final static boolean readHasDeletion(SAMRecord read) {
        for (CigarElement e : read.getCigar().getCigarElements()) {
            if ( e.getOperator() == CigarOperator.DELETION )
                return true;
        }
        return false;
    }

    // convenience: character-mode print to this walker's output stream
    public final void printQuals( String prefix, byte[] quals ) {
        printQuals(prefix, quals, false);
    }

    // convenience: print to this walker's output stream
    public final void printQuals( String prefix, byte[] quals, boolean asInt ) {
        printQuals(out, prefix, quals, asInt);
    }

    /**
     * Prints prefix followed by the quals on one line: comma-separated integers when asInt
     * is set, otherwise FASTQ-style (qual+33) characters.
     */
    public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) {
        out.print(prefix);
        for ( int i = 0; i < quals.length; i++) {
            if ( asInt ) {
                out.printf("%2d", (int)quals[i]);
                if ( i+1 != quals.length ) out.print(",");
            } else
                out.print((char)(quals[i]+33));
        }
        out.println();
    }

    /**
     * Get the BAQ delta bytes from the tag in read. Returns null if no BAQ tag is present.
     *
     * @param read read whose BAQ tag should be decoded
     * @return per-base deltas (sign flipped, 64 offset removed), or null when the tag is absent
     */
    public static byte[] getBAQDeltas(SAMRecord read) {
        byte[] baq = BAQ.getBAQTag(read);
        if ( baq != null ) {
            byte[] deltas = new byte[baq.length];
            for ( int i = 0; i < deltas.length; i++)
                deltas[i] = (byte)(-1 * (baq[i] - 64));
            return deltas;
        } else
            return null;
    }

    // Sums the read-consuming cigar lengths (S/I/M; D/H/P consume none); returns 0 for
    // reads containing N elements, which this walker cannot handle.
    private int cigarLength(SAMRecord read) {
        int readI = 0;
        for ( CigarElement elt : read.getCigar().getCigarElements() ) {
            int l = elt.getLength();
            switch (elt.getOperator()) {
                case N: // cannot handle these
                    return 0;
                case H : case P : // ignore pads and hard clips
                    break;
                case S :
                case I :
                    readI += l;
                    break;
                case D : break;
                case M :
                    readI += l;
                    break;
                default:
                    throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
            }
        }
        return readI;
    }

    public Integer reduceInit() { return 0; }

    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }

    // In validation mode reports concordance percentages; in profile mode, per-path timings.
    // NOTE(review): percentages divide by nreads — zero processed reads would divide by zero.
    public void onTraversalDone(Integer nreads) {
        if ( profile ) {
            out.printf("n.reads baq.per.read calculation time.in.secs%n");
            printTimer(nreads, tagTimer);
            printTimer(nreads, glocalTimer);
            printTimer(nreads, baqReadTimer);
        } else {
            out.printf("total reads BAQ'd %d; concordant BAQ reads %d %.4f; discordant BAQ reads %d %.4f%n", nreads,
                    goodReads, (100.0 * goodReads) / nreads,
                    badReads, (100.0 * badReads) / nreads);
        }
    }

    private final void printTimer(int nreads, SimpleTimer timer) {
        out.printf("%d %d %s %.2f%n", nreads, magnification, timer.getName(), timer.getElapsedTime());
    }
}

View File

@ -209,7 +209,7 @@ public class PickSequenomProbes extends RodWalker<String, String> {
String assay_sequence;
if ( vc.isSNP() )
assay_sequence = leading_bases + "[" + (char)ref.getBase() + "/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases;
else if ( vc.getType() == VariantContext.Type.MNP )
else if ( vc.isMNP() )
assay_sequence = leading_bases + "[" + new String(vc.getReference().getBases()) + "/" + new String(vc.getAlternateAllele(0).getBases())+"]"+trailing_bases.substring(vc.getReference().length()-1);
else if ( vc.isInsertion() )
assay_sequence = leading_bases + (char)ref.getBase() + "[-/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases;

View File

@ -113,7 +113,7 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
observedRefAllele = Allele.create(Allele.NULL_ALLELE_STRING);
}
// deletions
else if ( vc.isDeletion() || vc.isMixed() || vc.getType() == VariantContext.Type.MNP ) {
else if ( vc.isDeletion() || vc.isMixed() || vc.isMNP() ) {
// we can't validate arbitrarily long deletions
if ( reportedRefAllele.length() > 100 ) {
logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", reportedRefAllele.length(), vc.getChr(), vc.getStart()));
@ -123,7 +123,7 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
// deletions are associated with the (position of) the last (preceding) non-deleted base;
// hence to get actually deleted bases we need offset = 1
int offset = 1 ;
if ( vc.getType() == VariantContext.Type.MNP ) offset = 0; // if it's an MNP, the reported position IS the first modified base
if ( vc.isMNP() ) offset = 0; // if it's an MNP, the reported position IS the first modified base
byte[] refBytes = ref.getBases();
byte[] trueRef = new byte[reportedRefAllele.length()];
for (int i = 0; i < reportedRefAllele.length(); i++)

View File

@ -414,7 +414,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
* @return vc subcontext
*/
public VariantContext subContextFromGenotypes(Collection<Genotype> genotypes, Set<Allele> alleles) {
return new VariantContext(getSource(), contig, start, stop, alleles, genotypes, getNegLog10PError(), getFilters(), getAttributes());
return new VariantContext(getSource(), contig, start, stop, alleles, genotypes, getNegLog10PError(), filtersWereApplied() ? getFilters() : null, getAttributes());
}
@ -566,10 +566,21 @@ public class VariantContext implements Feature { // to enable tribble intergrati
return getType() == Type.INDEL && getAlternateAllele(0).isNull();
}
/**
* @return true if the alleles indicate neither a simple deletion nor a simple insertion
*/
public boolean isComplexIndel() {
return isIndel() && !isDeletion() && !isInsertion();
}
public boolean isSymbolic() {
return getType() == Type.SYMBOLIC;
}
public boolean isMNP() {
return getType() == Type.MNP;
}
/**
* convenience method for indels
*

View File

@ -83,7 +83,7 @@ public abstract class BaseTest {
*/
public static final String MD5_FILE_DB_SUBDIR = "integrationtests";
public static final String testDir = "testdata/";
public static final String testDir = "public/testdata/";
/** before the class starts up */
static {

View File

@ -60,8 +60,8 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest {
GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine();
Collection<SAMReaderID> samFiles = new ArrayList<SAMReaderID>();
samFiles.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags()));
samFiles.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags()));
samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags()));
samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags()));
testEngine.setSAMFileIDs(samFiles);
testEngine.checkForDuplicateSamFiles();
@ -72,10 +72,10 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest {
GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine();
Collection<SAMReaderID> samFiles = new ArrayList<SAMReaderID>();
samFiles.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags()));
samFiles.add(new SAMReaderID(new File("testdata/exampleNORG.bam"), new Tags()));
samFiles.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags()));
samFiles.add(new SAMReaderID(new File("testdata/exampleNORG.bam"), new Tags()));
samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags()));
samFiles.add(new SAMReaderID(new File("public/testdata/exampleNORG.bam"), new Tags()));
samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags()));
samFiles.add(new SAMReaderID(new File("public/testdata/exampleNORG.bam"), new Tags()));
testEngine.setSAMFileIDs(samFiles);
testEngine.checkForDuplicateSamFiles();
@ -97,7 +97,7 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest {
GATKArgumentCollection argCollection = new GATKArgumentCollection();
testEngine.setArguments(argCollection);
File fastaFile = new File("testdata/exampleFASTA.fasta");
File fastaFile = new File("public/testdata/exampleFASTA.fasta");
GenomeLocParser genomeLocParser = new GenomeLocParser(new IndexedFastaSequenceFile(fastaFile));
testEngine.setGenomeLocParser(genomeLocParser);

View File

@ -19,7 +19,7 @@ public class RealignerTargetCreatorIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
"-T RealignerTargetCreator -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_b36.rod -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s",
1,
Arrays.asList("fd2d9dbf718f7a6d82ae1787ac1fe61e"));
Arrays.asList("f23ba17ee0f9573dd307708175d90cd2"));
executeTest("test dbsnp", spec2);
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(

View File

@ -60,7 +60,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest {
" --NO_HEADER" +
" -o %s",
1,
Arrays.asList("2cae3d16f9ed00b07d87e9c49272d877")
Arrays.asList("debbbf3e661b6857cc8d99ff7635bb1d")
);
executeTest("testSimpleVCFStreaming", spec);
@ -98,7 +98,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest {
" -EV CompOverlap -noEV -noST" +
" -o %s",
1,
Arrays.asList("f60729c900bc8368717653b3fad80d1e")
Arrays.asList("f60729c900bc8368717653b3fad80d1e") //"f60729c900bc8368717653b3fad80d1e"
);
executeTest("testVCFStreamingChain", selectTestSpec);

View File

@ -12,19 +12,16 @@ import org.testng.annotations.DataProvider;
import org.testng.annotations.BeforeMethod;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.qc.ValidateBAQWalker;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.Utils;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.Arrays;
import java.io.PrintStream;
import java.util.List;
import java.util.ArrayList;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.samtools.*;
/**
@ -188,8 +185,8 @@ public class BAQUnitTest extends BaseTest {
System.out.println(Utils.dupString('-', 40));
System.out.println("reads : " + new String(test.readBases));
ValidateBAQWalker.printQuals(System.out, "in-quals:", test.quals, false);
ValidateBAQWalker.printQuals(System.out, "bq-quals:", result.bq, false);
printQuals(System.out, "in-quals:", test.quals, false);
printQuals(System.out, "bq-quals:", result.bq, false);
for (int i = 0; i < test.quals.length; i++) {
//result.bq[i] = baqHMM.capBaseByBAQ(result.rawQuals[i], result.bq[i], result.state[i], i);
Assert.assertTrue(result.bq[i] >= baqHMM.getMinBaseQual() || test.expected[i] < baqHMM.getMinBaseQual(), "BQ < min base quality");
@ -197,4 +194,16 @@ public class BAQUnitTest extends BaseTest {
}
}
/**
 * Writes a quality array to the given stream on a single line, preceded by prefix:
 * comma-separated 2-wide integers when asInt is set, otherwise FASTQ-style (qual+33) chars.
 */
public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) {
    out.print(prefix);
    int n = quals.length;
    for ( int idx = 0; idx < n; idx++ ) {
        if ( asInt ) {
            out.printf("%2d", (int) quals[idx]);
            boolean isLast = (idx + 1 == n);
            if ( ! isLast )
                out.print(",");
        } else {
            out.print((char) (quals[idx] + 33));
        }
    }
    out.println();
}
}

View File

@ -21,7 +21,7 @@ public class BedParserUnitTest extends BaseTest {
private static IndexedFastaSequenceFile seq;
private GenomeLocParser genomeLocParser;
private File bedFile = new File("testdata/sampleBedFile.bed");
private File bedFile = new File("public/testdata/sampleBedFile.bed");
@BeforeClass
public void beforeTests() {

View File

@ -39,7 +39,7 @@ import java.io.IOException;
*/
public class HapMapUnitTest {
// our sample hapmap file
private final static File hapMapFile = new File("testdata/genotypes_chr1_ASW_phase3.3_first500.hapmap");
private final static File hapMapFile = new File("public/testdata/genotypes_chr1_ASW_phase3.3_first500.hapmap");
private final static String knownLine = "rs2185539 C/T chr1 556738 + ncbi_b36 bbs urn:lsid:bbs.hapmap.org:Protocol:Phase3.r3:1 urn:lsid:bbs.hapmap.org:Assay:Phase3.r3_r" +
"s2185539:1 urn:lsid:dcc.hapmap.org:Panel:US_African-30-trios:4 QC+ CC TC TT CT CC CC CC CC CC CC CC CC CC";
/**

View File

@ -17,8 +17,8 @@ import java.util.*;
*/
public class IndexFactoryUnitTest {
File inputFile = new File("testdata/HiSeq.10000.vcf");
File outputFile = new File("testdata/onTheFlyOutputTest.vcf");
File inputFile = new File("public/testdata/HiSeq.10000.vcf");
File outputFile = new File("public/testdata/onTheFlyOutputTest.vcf");
File outputFileIndex = Tribble.indexFile(outputFile);
/**

View File

@ -49,12 +49,12 @@ public class ListFileUtilsUnitTest extends BaseTest {
public void testIgnoreBlankLinesInBAMListFiles() throws Exception {
File tempListFile = createTempListFile("testIgnoreBlankLines",
"",
"testdata/exampleBAM.bam",
"public/testdata/exampleBAM.bam",
" "
);
List<SAMReaderID> expectedBAMFileListAfterUnpacking = new ArrayList<SAMReaderID>();
expectedBAMFileListAfterUnpacking.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags()));
expectedBAMFileListAfterUnpacking.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags()));
performBAMListFileUnpackingTest(tempListFile, expectedBAMFileListAfterUnpacking);
}
@ -63,13 +63,13 @@ public class ListFileUtilsUnitTest extends BaseTest {
public void testCommentSupportInBAMListFiles() throws Exception {
File tempListFile = createTempListFile("testCommentSupport",
"#",
"testdata/exampleBAM.bam",
"#testdata/foo.bam",
" # testdata/bar.bam"
"public/testdata/exampleBAM.bam",
"#public/testdata/foo.bam",
" # public/testdata/bar.bam"
);
List<SAMReaderID> expectedBAMFileListAfterUnpacking = new ArrayList<SAMReaderID>();
expectedBAMFileListAfterUnpacking.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags()));
expectedBAMFileListAfterUnpacking.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags()));
performBAMListFileUnpackingTest(tempListFile, expectedBAMFileListAfterUnpacking);
}

View File

@ -29,7 +29,7 @@ public class GenomeLocProcessingTrackerUnitTest extends BaseTest {
IndexedFastaSequenceFile fasta = null;
GenomeLocParser genomeLocParser = null;
String chr1 = null;
private final static String FILE_ROOT = "testdata/GLPTFile";
private final static String FILE_ROOT = "public/testdata/GLPTFile";
@BeforeTest
public void before() {

View File

@ -1,15 +1,15 @@
package core
package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.function.ListWriterFunction
import net.sf.samtools.{SAMFileReader,SAMReadGroupRecord}
import scala.io.Source._
import collection.JavaConversions._
import org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel
import org.broadinstitute.sting.queue.extensions.picard._
import net.sf.samtools.{SAMFileReader, SAMReadGroupRecord}
import net.sf.samtools.SAMFileHeader.SortOrder
class DataProcessingPipeline extends QScript {
@ -69,6 +69,8 @@ class DataProcessingPipeline extends QScript {
@Input(doc="Decompose input BAM file and fully realign it using BWA and assume Pair Ended reads", fullName="use_bwa_pair_ended", shortName="bwape", required=false)
var useBWApe: Boolean = false
@Input(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false)
var bwaThreads: Int = 1
/****************************************************************************
@ -180,6 +182,7 @@ class DataProcessingPipeline extends QScript {
var realignedBams: List[File] = List()
var index = 1
for (bam <- bams) {
val readSortedBam = swapExt(bam, ".bam", "." + index + ".sorted.bam" )
val saiFile1 = swapExt(bam, ".bam", "." + index + ".1.sai")
val saiFile2 = swapExt(bam, ".bam", "." + index + ".2.sai")
val realignedSamFile = swapExt(bam, ".bam", "." + index + ".realigned.sam")
@ -190,11 +193,12 @@ class DataProcessingPipeline extends QScript {
bwa_sam_se(bam, saiFile1, realignedSamFile))
}
else {
add(bwa_aln_pe(bam, saiFile1, 1),
bwa_aln_pe(bam, saiFile2, 2),
bwa_sam_pe(bam, saiFile1, saiFile2, realignedSamFile))
add(sortSam(bam, readSortedBam, SortOrder.queryname),
bwa_aln_pe(readSortedBam, saiFile1, 1),
bwa_aln_pe(readSortedBam, saiFile2, 2),
bwa_sam_pe(readSortedBam, saiFile1, saiFile2, realignedSamFile))
}
add(sortSam(realignedSamFile, realignedBamFile))
add(sortSam(realignedSamFile, realignedBamFile, SortOrder.coordinate))
addReadGroups(realignedBamFile, rgRealignedBamFile, new SAMFileReader(bam))
realignedBams :+= rgRealignedBamFile
index = index + 1
@ -385,9 +389,10 @@ class DataProcessingPipeline extends QScript {
this.jobName = queueLogDir + outBam + ".joinBams"
}
case class sortSam (inSam: File, outBam: File) extends SortSam {
case class sortSam (inSam: File, outBam: File, sortOrderP: SortOrder) extends SortSam {
this.input = List(inSam)
this.output = outBam
this.sortOrder = sortOrderP
this.memoryLimit = 4
this.isIntermediate = true
this.analysisName = queueLogDir + outBam + ".sortSam"
@ -430,7 +435,7 @@ class DataProcessingPipeline extends QScript {
case class bwa_aln_se (inBam: File, outSai: File) extends CommandLineFunction with BWACommonArgs {
@Input(doc="bam file to be aligned") var bam = inBam
@Output(doc="output sai file") var sai = outSai
def commandLine = bwaPath + " aln -q 5 " + reference + " -b " + bam + " > " + sai
def commandLine = bwaPath + " aln -t " + bwaThreads + " -q 5 " + reference + " -b " + bam + " > " + sai
this.analysisName = queueLogDir + outSai + ".bwa_aln_se"
this.jobName = queueLogDir + outSai + ".bwa_aln_se"
}
@ -438,7 +443,7 @@ class DataProcessingPipeline extends QScript {
case class bwa_aln_pe (inBam: File, outSai1: File, index: Int) extends CommandLineFunction with BWACommonArgs {
@Input(doc="bam file to be aligned") var bam = inBam
@Output(doc="output sai file for 1st mating pair") var sai = outSai1
def commandLine = bwaPath + " aln -q 5 " + reference + " -b" + index + " " + bam + " > " + sai
def commandLine = bwaPath + " aln -t " + bwaThreads + " -q 5 " + reference + " -b" + index + " " + bam + " > " + sai
this.analysisName = queueLogDir + outSai1 + ".bwa_aln_pe1"
this.jobName = queueLogDir + outSai1 + ".bwa_aln_pe1"
}

View File

@ -1,4 +1,4 @@
package core
package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._

View File

@ -1,3 +1,5 @@
package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.commandline.Hidden
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.QScript

View File

@ -0,0 +1,91 @@
package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._
import net.sf.samtools.SAMFileReader
/**
* Created by IntelliJ IDEA.
* User: carneiro
* Date: 4/20/11
* Time: 16:29 PM
*/
class RecalibrateBaseQualities extends QScript {
@Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true)
var GATKjar: File = _
@Input(doc="input BAM file - or list of BAM files", shortName="i", required=true)
var input: File = _
@Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=false)
var R: String = new File("/humgen/gsa-scr1/carneiro/stable/R")
@Input(doc="Reference fasta file", shortName="R", required=false)
var reference: File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
@Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=false)
var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
val queueLogDir: String = ".qlog/"
var nContigs: Int = 0
def getNumberOfContigs(bamFile: File): Int = {
val samReader = new SAMFileReader(new File(bamFile))
return samReader.getFileHeader.getSequenceDictionary.getSequences.size()
}
def script = {
nContigs = getNumberOfContigs(input)
val recalFile1: File = new File("recal1.csv")
val recalFile2: File = new File("recal2.csv")
val recalBam: File = swapExt(input, ".bam", "recal.bam")
val path1: String = "before"
val path2: String = "after"
add(cov(input, recalFile1),
recal(input, recalFile1, recalBam),
cov(recalBam, recalFile2),
analyzeCovariates(recalFile1, path1),
analyzeCovariates(recalFile2, path2))
}
trait CommandLineGATKArgs extends CommandLineGATK {
this.jarFile = GATKjar
this.reference_sequence = reference
this.memoryLimit = 4
this.isIntermediate = true
}
case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs {
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
this.input_file :+= inBam
this.recal_file = outRecalFile
this.analysisName = queueLogDir + outRecalFile + ".covariates"
this.jobName = queueLogDir + outRecalFile + ".covariates"
this.scatterCount = nContigs
}
case class recal (inBam: File, inRecalFile: File, outBam: File) extends TableRecalibration with CommandLineGATKArgs {
this.input_file :+= inBam
this.recal_file = inRecalFile
this.out = outBam
this.isIntermediate = false
this.analysisName = queueLogDir + outBam + ".recalibration"
this.jobName = queueLogDir + outBam + ".recalibration"
this.scatterCount = nContigs
}
case class analyzeCovariates (inRecalFile: File, outPath: String) extends AnalyzeCovariates {
this.resources = R
this.recal_file = inRecalFile
this.output_dir = outPath.toString
this.analysisName = queueLogDir + inRecalFile + ".analyze_covariates"
this.jobName = queueLogDir + inRecalFile + ".analyze_covariates"
}
}

View File

@ -1,4 +1,4 @@
package core
package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk.RodBind

View File

@ -1,211 +0,0 @@
package org.broadinstitute.sting.scala
/**
import gatk.walkers.genotyper.{UnifiedGenotyper, GenotypeCall}
import java.io.File
import net.sf.samtools.SAMRecord
import org.broadinstitute.sting.gatk.walkers.LocusWalker
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker
import org.broadinstitute.sting.gatk.contexts.ReferenceContext
import org.broadinstitute.sting.gatk.contexts.AlignmentContext
import org.broadinstitute.sting.utils.Pair
import org.broadinstitute.sting.utils.genotype.GenotypeMetaData
import utils._
import cmdLine.Argument
class TransitionTable() {
var table: Array[Array[Double]] = new Array[Array[Double]](4,4)
def makeKey(b: Char): Int = {
return BaseUtils.simpleBaseToBaseIndex(b)
}
def fromKey(i: Int): Char = {
return BaseUtils.baseIndexToSimpleBase(i)
}
def add(ref: Char, subst: Char) = {
table(makeKey(ref))(makeKey(subst)) += 1
}
def get(i: Int, j: Int): Double = {
return table(i)(j)
}
def set(i: Int, j: Int, d: Double) = {
//printf("Setting %d / %d => %f%n", i, j, d)
table(i)(j) = d
}
def mapEntries[A](f: ((Int, Int, Double) => A)): List[A] = {
return for { i <- List.range(0, 4); j <- List.range(0,4) } yield f(i, j, get(i,j))
}
def header(): String = {
//return Utils.join("\t", mapEntries((i,j,x) => "P(%c|true=%c)" format (fromKey(j), fromKey(i))).toArray)
return Utils.join("\t", mapEntries((i,j,x) => "[%c|%c]" format (fromKey(j), fromKey(i))).toArray)
}
override def toString(): String = {
return Utils.join("\t", mapEntries((i,j,x) => "%.2f" format x).toArray)
}
def sum(): Double = {
return sumList(table.toList map (x => sumList(x.toList)))
}
def sumList(xs: List[Double]): Double = {
return (0.0 :: xs) reduceLeft {(x, y) => x + y}
}
def getRateMatrix(): TransitionTable = {
var sums = (table.toList map (x => sumList(x.toList))).toArray
var t = new TransitionTable()
//println(sums.toList)
this mapEntries ((i,j,x) => t.set(i, j, x / sums(i)))
return t
}
}
**/class BaseTransitionTableCalculator /**{ // extends LocusWalker[Unit,Int] {
private var MIN_MAPPING_QUALITY = 30
private var MIN_BASE_QUALITY = 20
private var MIN_LOD = 5
private var PRINT_FREQ = 1000000
private var CARE_ABOUT_STRAND = true
private val SSG = new UnifiedGenotyper()
private val table1 = new TransitionTable()
private val tableFWD = new TransitionTable()
private val tableREV = new TransitionTable()
private val table2 = new TransitionTable()
@Argument() { val shortName = "v", val doc = "Print out verbose output", val required = false }
var VERBOSE: Boolean = false
override def initialize() {
SSG.initialize();
}
override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext): Unit = {
val callPair = SSG.map(tracker, ref, context)
//val call = callPair.getFirst().get(0)
val pileup = new ReadBackedPileup(ref.getBase(), context)
def hasNoNs(): Boolean = {
return ! (pileup.getBases() exists ('N' ==))
}
if ( isDefinitelyHomRef(callPair) && hasNoNs() ) {
var (refBases, nonRefBases) = splitBases(ref.getBase, pileup.getBases)
//printf("nonRefBases %s%n", nonRefBases)
if ( nonRefBases.length > 0 ) {
var (nonRefReads, offsets) = getNonRefReads(ref.getBase, pileup)
var nonRefRead: SAMRecord = nonRefReads.head
nonRefBases.length match {
case 1 =>
//if ( goodRead(nonRefRead,offsets.head) ) {
//printf("Including site:%n nonRefBases=%s%n refBases=%s%n nonRefRead=%s%n offset=%s%n", nonRefBases, refBases, nonRefRead.format, offsets.head)
//}
addRead(nonRefRead, offsets.head, nonRefBases.head, ref, table1)
addRead(nonRefRead, offsets.head, nonRefBases.head, ref, if ( nonRefRead.getReadNegativeStrandFlag ) tableREV else tableFWD )
case 2 =>
addRead(nonRefRead, offsets.head, nonRefBases.head, ref, table2)
addRead(nonRefReads.tail.head, offsets.tail.head, nonRefBases.tail.head, ref, table2)
case x =>
}
if ( VERBOSE && pileup.size > 30 && nonRefReads.length < 4 )
printNonRefBases(ref, nonRefReads, offsets, context.getReads.size())
}
}
}
implicit def listToJavaList[T](l: List[T]) = java.util.Arrays.asList(l.toArray: _*)
def printNonRefBases(ref: ReferenceContext, nonRefReads: List[SAMRecord], offsets: List[Integer], depth: Integer): Unit = {
out.println(new ReadBackedPileup(ref.getLocus(), ref.getBase(), listToJavaList(nonRefReads), offsets) + " " + depth)
}
def addRead(nonRefRead: SAMRecord, offset: Integer, nonRefBase: Char, ref: ReferenceContext, table: TransitionTable): Unit = {
if ( goodRead(nonRefRead, offset) ) {
//printf("Including site:%n call=%s%n nonRefBases=%s%n refBases=%s%n nonRefRead=%s%n offset=%s%n", nonRefBases, refBases, nonRefRead.format, offsets.head)
//println(call.getReadDepth, call.getReferencebase, nonRefBases, refBases, nonRefRead.getMappingQuality)
if ( CARE_ABOUT_STRAND && nonRefRead.getReadNegativeStrandFlag() ) {
// it's on the reverse strand
val refComp: Char = BaseUtils.simpleComplement(ref.getBase)
val nonRefComp: Char = BaseUtils.simpleComplement(nonRefBase)
//printf("Negative: %c/%c is actually %c/%c%n", ref.getBase, nonRefBases.head, refComp, nonRefComp)
table.add(refComp, nonRefComp)
} else {
table.add(ref.getBase, nonRefBase)
}
}
}
def getNonRefReads(ref: Char, pileup: ReadBackedPileup): (List[SAMRecord], List[Integer]) = {
def getBase(i: Int): Char = {
var offset = pileup.getOffsets().get(i)
return pileup.getReads().get(i).getReadString().charAt(offset.asInstanceOf[Int])
}
var subset = for { i <- List.range(0, pileup.size) if getBase(i) != ref } yield i
var reads = subset map (i => pileup.getReads().get(i))
var offsets = subset map (i => pileup.getOffsets().get(i))
//println(reads, offsets, subset)
return (reads, offsets)
}
def hasFewNonRefBases(refBase: Char, pileup: String): List[Char] = {
splitBases(refBase, pileup) match {
case (refs, nonRefs) =>
//printf("nrf=%s, rb=%s%n", nonRefs.mkString, refs.mkString)
return nonRefs
}
}
def goodRead( read: SAMRecord, offset: Integer ): Boolean = {
return read.getMappingQuality() > MIN_MAPPING_QUALITY && read.getBaseQualities()(offset.intValue) > MIN_BASE_QUALITY
}
def splitBases(refBase: Char, pileup: String): (List[Char], List[Char]) = {
val refBases = pileup.toList filter (refBase ==)
val nonRefBases = pileup.toList filter (refBase !=)
return (refBases, nonRefBases)
}
def isDefinitelyHomRef(call: Pair[java.util.List[GenotypeCall],GenotypeMetaData]): Boolean = {
if ( call == null )
return false;
return (! call.getFirst().get(0).isVariant()) && call.getFirst().get(0).getNegLog10PError() > MIN_LOD
}
def reduceInit(): Int = {
return 0;
}
def reduce(m: Unit, r: Int): Int = {
return r;
}
override def onTraversalDone(r: Int) = {
out.println("-------------------- FINAL RESULT --------------------")
out.println("type\t" + table1.header)
def print1(t: String, x: TransitionTable) = {
out.println(t + "\t" + x)
}
print1("1-mismatch", table1)
print1("2-mismatch", table2)
print1("1-mismatch-fwd", tableFWD)
print1("1-mismatch-rev", tableREV)
}
}
*/

View File

@ -1,120 +0,0 @@
package org.broadinstitute.sting.scala
import java.io.PrintStream
import org.broadinstitute.sting.gatk.contexts.{ReferenceContext, AlignmentContext}
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker
import collection.JavaConversions._
import collection.immutable.List
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature
import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature
import org.broadinstitute.sting.gatk.walkers.{TreeReducible, RefWalker}
import org.broadinstitute.sting.commandline.{Output, Argument}
import org.broadinstitute.sting.utils.{BaseUtils, GenomeLoc}
import collection.mutable.{ListBuffer, HashSet}
import java.lang.Math
class IntervalAnnotationWalker extends RefWalker[AnnotationMetaData,List[IntervalInfoBuilder]] {
@Argument(doc="Min proportion of bases overlapping between an interval of interest and an annotation interval for annotation to occur",shortName="mpb")
var minPropOverlap : Double = 0.50 // default to 50%
@Output var out : PrintStream = _
override def reduceInit : List[IntervalInfoBuilder] = {
Nil
}
override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext) : AnnotationMetaData = {
val ivalsOfInterest : List[GATKFeature] = tracker.getGATKFeatureMetaData("interval_list",false).toList
val refSeqData : List[Object] = tracker.getReferenceMetaData("refseq").toList
val tableMetaData : List[Object] = tracker.getReferenceMetaData("table",false).toList
var amd : AnnotationMetaData = new AnnotationMetaData(ref.getLocus,ref.getBase)
if ( refSeqData != null ) {
amd.refSeqFeatures = refSeqData.map( (o: Object) => o.asInstanceOf[RefSeqFeature])
}
if ( tableMetaData != null ) {
amd.tableFeatures = tableMetaData.map( (o: Object) => o.asInstanceOf[TableFeature]).map( (u: TableFeature) =>
new Tuple2(u.getLocation,u.getAllValues.toList.reduceLeft(_ + "," + _)))
}
amd.newIvals = ivalsOfInterest.map(_.getLocation)
return amd
}
override def reduce(mapMetaData : AnnotationMetaData, rList : List[IntervalInfoBuilder]) : List[IntervalInfoBuilder] = {
rList.filter( (u : IntervalInfoBuilder) => u.location.isBefore(mapMetaData.locus)).foreach(u => out.print("%s%n".format(u.done)))
val newList : List[IntervalInfoBuilder] = rList.filter( ! _.finalized ) :::
mapMetaData.newIvals.filter( (g: GenomeLoc) => g.getStart == mapMetaData.locus.getStart ).map( u => new IntervalInfoBuilder(u,minPropOverlap) )
newList.foreach(_.update(mapMetaData))
return newList
}
override def onTraversalDone(finalList : List[IntervalInfoBuilder]) = {
finalList.foreach(u => out.print("%s%n".format(u.done)))
}
}
class AnnotationMetaData(loc: GenomeLoc, ref: Byte) {
val locus : GenomeLoc = loc
var newIvals : List[GenomeLoc] = Nil
var refSeqFeatures : List[RefSeqFeature] = Nil
var tableFeatures : List[(GenomeLoc,String)] = Nil
val refBase : Byte = ref
}
class IntervalInfoBuilder(loc : GenomeLoc, minProp : Double) {
val OVERLAP_PROP : Double = minProp
var metaData : HashSet[String] = new HashSet[String]
var seenTranscripts : HashSet[String] = new HashSet[String]
var baseContent : ListBuffer[Byte] = new ListBuffer[Byte]
var location : GenomeLoc = loc
var gcContent : Double = _
var entropy : Double = _
var finalized : Boolean = false
def update(mmd : AnnotationMetaData) = {
baseContent += mmd.refBase
mmd.refSeqFeatures.filter( p => ! seenTranscripts.contains(p.getTranscriptUniqueGeneName)).view.foreach(u => addRefSeq(u))
mmd.tableFeatures.filter( (t : (GenomeLoc,String)) =>
! metaData.contains(t._2) && (t._1.intersect(location).size.asInstanceOf[Double]/location.size() >= OVERLAP_PROP) ).foreach( t => metaData.add(t._2))
}
def addRefSeq( rs: RefSeqFeature ) : Unit = {
seenTranscripts.add(rs.getTranscriptUniqueGeneName)
val exons : List[(GenomeLoc,Int)] = rs.getExons.toList.zipWithIndex.filter( p => ((p._1.intersect(location).size+0.0)/location.size) >= minProp )
if ( exons.size > 0 ) {
metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,exons.map( p => "exon%d".format(p._2) ).reduceLeft(_ + "," + _)))
} else {
if ( location.isBefore(rs.getExons.get(0)) || location.isPast(rs.getExons.get(rs.getNumExons-1)) ) {
metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,"UTR"))
} else {
metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,"Intron"))
}
}
}
def done : String = {
finalized = true
def isGC(b : Byte) : Int = if ( BaseUtils.gIndex == b || BaseUtils.cIndex == b ) { 1 } else { 0 }
gcContent = baseContent.foldLeft[Int](0)( (a,b) => a + isGC(b)).asInstanceOf[Double]/location.size()
entropy = calcEntropy(baseContent.map(b => ListBuffer(b))) + calcEntropy(baseContent.reverse.map(b => ListBuffer(b)))
val meta : String = metaData.reduceLeft(_ + "\t" + _)
return "%s\t%d\t%d\t%.2f\t%.2f\t%s".format(location.getContig,location.getStart,location.getStop,gcContent,entropy,meta)
}
def calcEntropy(byteList : ListBuffer[ListBuffer[Byte]]) : Double = {
if(byteList.size == 1) return 0
Math.log(1+byteList.tail.size-byteList.tail.dropWhile( u => u.equals(byteList(1))).size) +
calcEntropy(byteList.tail.foldLeft(ListBuffer(byteList(0)))( (a,b) => {
if ( b.equals(byteList(1)) ) {
a.dropRight(1) :+ (a.last ++ b)
} else {
a :+ b
}
}))
}
}

View File

@ -1,24 +0,0 @@
package org.broadinstitute.sting.scala
import org.broadinstitute.sting.gatk.walkers.LocusWalker
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker
import org.broadinstitute.sting.gatk.contexts.ReferenceContext
import org.broadinstitute.sting.gatk.contexts.AlignmentContext
class ScalaCountLoci extends LocusWalker[Int,Int] {
override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext): Int = {
return 1
}
def reduceInit(): Int = {
return 0;
}
def reduce(m: Int, r: Int): Int = {
return m + r;
}
def main(args: Array[String]) {
println("Hello, world!")
}
}