diff --git a/build.xml b/build.xml
index cf8623bed..fe1723587 100644
--- a/build.xml
+++ b/build.xml
@@ -64,7 +64,7 @@
-
+
@@ -896,7 +896,7 @@
-
+
@@ -934,7 +934,7 @@
-
+
diff --git a/public/R/plot_Tranches.R b/public/R/plot_Tranches.R
new file mode 100755
index 000000000..a79ddd3ab
--- /dev/null
+++ b/public/R/plot_Tranches.R
@@ -0,0 +1,87 @@
+#!/usr/bin/env Rscript
+
+args <- commandArgs(TRUE)
+verbose = TRUE
+
+tranchesFile = args[1]
+targetTITV = as.numeric(args[2])
+targetSensitivity = as.numeric(args[3])
+suppressLegend = ! is.na(args[4])
+
+# -----------------------------------------------------------------------------------------------
+# Useful general routines
+# -----------------------------------------------------------------------------------------------
+
+MIN_FP_RATE = 0.001 # 1 / 1000 is min error rate
+
+titvFPEst <- function(titvExpected, titvObserved) {
+ max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), MIN_FP_RATE)
+}
+
+titvFPEstV <- function(titvExpected, titvs) {
+ sapply(titvs, function(x) titvFPEst(titvExpected, x))
+}
+
+nTPFP <- function(nVariants, FDR) {
+ return(list(TP = nVariants * (1 - FDR/100), FP = nVariants * (FDR / 100)))
+}
+
+leftShift <- function(x, leftValue = 0) {
+ r = rep(leftValue, length(x))
+ for ( i in 1:(length(x)-1) ) {
+ #print(list(i=i))
+ r[i] = x[i+1]
+ }
+ r
+}
+
+# -----------------------------------------------------------------------------------------------
+# Tranches plot
+# -----------------------------------------------------------------------------------------------
+data2 = read.table(tranchesFile,sep=",",head=T)
+data2 = data2[order(data2$novelTiTv, decreasing=F),]
+#data2 = data2[order(data2$FDRtranche, decreasing=T),]
+cols = c("cornflowerblue", "cornflowerblue", "darkorange", "darkorange")
+density=c(20, -1, -1, 20)
+outfile = paste(tranchesFile, ".pdf", sep="")
+pdf(outfile, height=5, width=8)
+par(mar = c(5, 5, 4, 2) + 0.1)
+novelTiTv = data2$novelTiTv # fix: was c(data2$novelTITV,data2$novelTiTv); $novelTITV is a case typo (NULL column) and c(NULL, x) == x
+alpha = 1 - titvFPEstV(targetTITV, novelTiTv)
+#print(alpha)
+
+numGood = round(alpha * data2$numNovel);
+
+#numGood = round(data2$numNovel * (1-data2$targetTruthSensitivity/100))
+numBad = data2$numNovel - numGood;
+
+numPrevGood = leftShift(numGood, 0)
+numNewGood = numGood - numPrevGood
+numPrevBad = leftShift(numBad, 0)
+numNewBad = numBad - numPrevBad
+
+d=matrix(c(numPrevGood,numNewGood, numNewBad, numPrevBad),4,byrow=TRUE)
+#print(d)
+barplot(d/1000,horiz=TRUE,col=cols,space=0.2,xlab="Number of Novel Variants (1000s)", density=density, cex.axis=1.25, cex.lab=1.25) # , xlim=c(250000,350000))
+#abline(v= d[2,dim(d)[2]], lty=2)
+#abline(v= d[1,3], lty=2)
+if ( ! suppressLegend )
+ legend(3, length(data2$targetTruthSensitivity)/3 +1, c('Cumulative TPs','Tranche-specific TPs', 'Tranche-specific FPs', 'Cumulative FPs' ), fill=cols, density=density, bg='white', cex=1.25)
+
+mtext("Ti/Tv",2,line=2.25,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1)
+mtext("truth",2,line=0,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1)
+axis(2,line=-1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=data2$targetTruthSensitivity, las=1, cex.axis=1.0)
+axis(2,line=1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=round(novelTiTv,3), las=1, cex.axis=1.0)
+
+# plot sensitivity vs. specificity
+sensitivity = data2$truthSensitivity
+if ( ! is.null(sensitivity) ) {
+ #specificity = titvFPEstV(targetTITV, novelTiTv)
+ specificity = novelTiTv
+ plot(sensitivity, specificity, type="b", col="cornflowerblue", xlab="Tranche truth sensitivity", ylab="Specificity (Novel Ti/Tv ratio)")
+ abline(h=targetTITV, lty=2)
+ abline(v=targetSensitivity, lty=2)
+ #text(max(sensitivity), targetTITV-0.05, labels="Expected novel Ti/Tv", pos=2)
+}
+
+dev.off()
diff --git a/public/R/plot_residualError_OtherCovariate.R b/public/R/plot_residualError_OtherCovariate.R
new file mode 100644
index 000000000..a1385ff3f
--- /dev/null
+++ b/public/R/plot_residualError_OtherCovariate.R
@@ -0,0 +1,108 @@
+#!/usr/bin/env Rscript
+
+args <- commandArgs(TRUE)
+verbose = TRUE
+
+input = args[1]
+covariateName = args[2]
+
+outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="")
+pdf(outfile, height=7, width=7)
+par(cex=1.1)
+c <- read.table(input, header=T) # NOTE(review): 'c' shadows base::c(); renaming would ripple into derived column names (c.Covariate), so left as-is
+c <- c[sort.list(c[,1]),]
+
+#
+# Plot residual error as a function of the covariate
+#
+
+d.good <- c[c$nBases >= 1000,]
+d.1000 <- c[c$nBases < 1000,]
+rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
+rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) )
+theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
+if( length(d.good$nBases) == length(c$nBases) ) {
+ theTitle = paste("RMSE =", round(rmseAll,digits=3))
+}
+# Don't let residual error go off the edge of the plot
+d.good$residualError = d.good$Qempirical-d.good$Qreported
+d.good$residualError[which(d.good$residualError > 10)] = 10
+d.good$residualError[which(d.good$residualError < -10)] = -10
+d.1000$residualError = d.1000$Qempirical-d.1000$Qreported
+d.1000$residualError[which(d.1000$residualError > 10)] = 10
+d.1000$residualError[which(d.1000$residualError < -10)] = -10
+c$residualError = c$Qempirical-c$Qreported
+c$residualError[which(c$residualError > 10)] = 10
+c$residualError[which(c$residualError < -10)] = -10
+pointType = "p"
+if( length(c$Covariate) <= 20 ) {
+ pointType = "o"
+}
+if( is.numeric(c$Covariate) ) {
+ plot(d.good$Covariate, d.good$residualError, type=pointType, main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate)))
+ points(d.1000$Covariate, d.1000$residualError, type=pointType, col="cornflowerblue", pch=20)
+} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
+ plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10))
+ points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue")
+}
+dev.off()
+
+
+#
+# Plot mean quality versus the covariate
+#
+
+outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="")
+pdf(outfile, height=7, width=7)
+par(cex=1.1)
+pointType = "p"
+if( length(c$Covariate) <= 20 ) {
+ pointType = "o"
+}
+theTitle = paste("Quality By", covariateName);
+if( is.numeric(c$Covariate) ) {
+ plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate)))
+ points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20)
+} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
+ plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40))
+ points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue")
+}
+dev.off()
+
+#
+# Plot histogram of the covariate
+#
+
+e = d.good
+f = d.1000
+outfile = paste(input, ".", covariateName,"_hist.pdf", sep="")
+pdf(outfile, height=7, width=7)
+hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0)
+hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0)
+
+lwdSize=2
+if( length(c$Covariate) <= 20 ) {
+ lwdSize=7
+} else if( length(c$Covariate) <= 70 ) {
+ lwdSize=4
+}
+
+if( is.numeric(c$Covariate) ) {
+ if( length(hst$e.Covariate) == 0 ) {
+ plot(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue", main=paste(covariateName,"histogram"), ylim=c(0, max(hst2$f.nBases)), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
+ } else {
+ plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), xlab=covariateName, ylim=c(0, max(hst$e.nBases)),ylab="Number of Bases",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
+ points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue")
+ }
+ axis(2,axTicks(2), format(axTicks(2), scientific=F))
+} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
+ hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0)
+ plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), ylim=c(0, max(hst$c.nBases)),xlab=covariateName, ylab="Number of Bases",yaxt="n",xaxt="n")
+ if( length(hst$c.Covariate) > 9 ) {
+ axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)])
+ } else {
+ axis(1, at=seq(1,length(hst$c.Covariate),1), labels = hst$c.Covariate)
+ }
+ axis(2,axTicks(2), format(axTicks(2), scientific=F))
+}
+dev.off()
diff --git a/public/R/plot_residualError_QualityScoreCovariate.R b/public/R/plot_residualError_QualityScoreCovariate.R
new file mode 100644
index 000000000..81bc9460d
--- /dev/null
+++ b/public/R/plot_residualError_QualityScoreCovariate.R
@@ -0,0 +1,70 @@
+#!/usr/bin/env Rscript
+
+args <- commandArgs(TRUE)
+
+input = args[1]
+Qcutoff = as.numeric(args[2])
+maxQ = as.numeric(args[3])
+maxHist = as.numeric(args[4])
+
+t=read.table(input, header=T)
+
+#
+# Plot of reported quality versus empirical quality
+#
+
+outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
+pdf(outfile, height=7, width=7)
+d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,]
+d.1000 <- t[t$nBases < 1000 & t$Qreported >= Qcutoff,]
+d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000 & t$Qreported >= Qcutoff,]
+f <- t[t$Qreported < Qcutoff,]
+e <- rbind(d.good, d.1000, d.10000)
+rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
+rmseAll = sqrt( sum(as.numeric((e$Qempirical-e$Qreported)^2 * e$nBases)) / sum(as.numeric(e$nBases)) )
+theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
+if( length(t$nBases) - length(f$nBases) == length(d.good$nBases) ) {
+ theTitle = paste("RMSE =", round(rmseAll,digits=3));
+}
+plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,maxQ), ylim=c(0,maxQ), pch=16, xlab="Reported quality score", ylab="Empirical quality score")
+points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16)
+points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16)
+points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16)
+abline(0,1, lty=2)
+dev.off()
+
+#
+# Plot Q empirical histogram
+#
+
+outfile = paste(input, ".quality_emp_hist.pdf", sep="")
+pdf(outfile, height=7, width=7)
+hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0)
+hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0)
+percentBases=hst$e.nBases / sum(as.numeric(hst$e.nBases))
+entropy = -sum(log2(percentBases)*percentBases)
+yMax = max(hst$e.nBases)
+if(maxHist != 0) {
+yMax = maxHist
+}
+plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n")
+points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1")
+axis(2,axTicks(2), format(axTicks(2), scientific=F))
+dev.off()
+
+#
+# Plot Q reported histogram
+#
+
+outfile = paste(input, ".quality_rep_hist.pdf", sep="")
+pdf(outfile, height=7, width=7)
+hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0)
+hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0)
+yMax = max(hst$e.nBases)
+if(maxHist != 0) {
+yMax = maxHist
+}
+plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n") # NOTE(review): 'entropy' here is the value computed from the *empirical* histogram above; confirm the reuse is intended
+points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1")
+axis(2,axTicks(2), format(axTicks(2), scientific=F))
+dev.off()
diff --git a/public/R/titvFPEst.R b/public/R/titvFPEst.R
new file mode 100755
index 000000000..7af5e8bbb
--- /dev/null
+++ b/public/R/titvFPEst.R
@@ -0,0 +1,138 @@
+titvFPEst <- function(titvExpected, titvObserved) { max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), 0.001) } # estimated false-positive fraction from observed vs expected Ti/Tv; clamped to [0.001, 1]
+
+titvFPEstV <- function(titvExpected, titvs) { # vectorized titvFPEst over a vector of observed Ti/Tv values
+ sapply(titvs, function(x) titvFPEst(titvExpected, x))
+}
+
+calcHet <- function(nknown, knownTiTv, nnovel, novelTiTv, callable) { # heterozygosity estimate: 2*TP / (3*callable), counting non-FP novels as TP
+ TP <- nknown + (1-titvFPEst(knownTiTv, novelTiTv)) * nnovel
+ 2 * TP / 3 / callable
+}
+
+marginalTiTv <- function( nx, titvx, ny, titvy ) { # Ti/Tv of set x minus set y, reconstructed from counts and ratios
+ tvx = nx / (titvx + 1)
+ tix = nx - tvx
+ tvy = ny / (titvy + 1)
+ tiy = ny - tvy
+ tiz = tix - tiy
+ tvz = tvx - tvy
+ return(tiz / tvz)
+}
+marginaldbSNPRate <- function( nx, dbx, ny, dby ) { # dbSNP rate (%) of set x minus set y, from counts and per-set rates
+ knownx = nx * dbx / 100
+ novelx = nx - knownx
+ knowny = ny * dby / 100
+ novely = ny - knowny
+ knownz = knownx - knowny
+ novelz = novelx - novely
+ return(knownz / ( knownz + novelz ) * 100)
+}
+
+numExpectedCalls <- function(L, theta, calledFractionOfRegion, nIndividuals, dbSNPRate) { # expected calls: L * theta * fraction * H(2N), split into known/novel by dbSNPRate
+ nCalls <- L * theta * calledFractionOfRegion * sum(1 / seq(1, 2 * nIndividuals))
+ return(list(nCalls = nCalls, nKnown = dbSNPRate * nCalls, nNovel = (1-dbSNPRate) * nCalls))
+}
+
+normalize <- function(x) { # rescale x so it sums to 1
+ x / sum(x)
+}
+
+normcumsum <- function(x) { # cumulative distribution of a (possibly unnormalized) histogram
+ cumsum(normalize(x))
+}
+
+cumhist <- function(d, ...) { # quick sorted-values plot of d; extra args passed to plot()
+ plot(d[order(d)], type="b", col="orange", lwd=2, ...)
+}
+
+revcumsum <- function(x) { # cumulative sum taken from the right end of x
+ return(rev(cumsum(rev(x))))
+}
+
+phred <- function(x) { # probability -> Phred scale (-10*log10); input floored at 10^-9.9 to avoid Inf
+ log10(max(x,10^(-9.9)))*-10
+}
+
+pOfB <- function(b, B, Q) { # P(observed base b | true base B, base quality Q)
+ #print(paste(b, B, Q))
+ p = 1 - 10^(-Q/10)
+ if ( b == B )
+ return(p)
+ else
+ return(1 - p)
+}
+
+pOfG <- function(bs, qs, G) { # log10 likelihood of diploid genotype G (two alleles) given observed bases bs with quals qs
+ a1 = G[1]
+ a2 = G[2]
+
+ log10p = 0
+ for ( i in 1:length(bs) ) {
+ b = bs[i]
+ q = qs[i]
+ p1 = pOfB(b, a1, q) / 2 + pOfB(b, a2, q) / 2
+ log10p = log10p + log10(p1)
+ }
+
+ return(log10p)
+}
+
+pOfGs <- function(nAs, nBs, Q) { # genotype likelihoods for nAs 'a' reads and nBs 't' reads at quality Q; Qsample is the Phred confidence of the het genotype
+ bs = c(rep("a", nAs), rep("t", nBs))
+ qs = rep(Q, nAs + nBs)
+ G1 = c("a", "a")
+ G2 = c("a", "t")
+ G3 = c("t", "t")
+
+ log10p1 = pOfG(bs, qs, G1)
+ log10p2 = pOfG(bs, qs, G2)
+ log10p3 = pOfG(bs, qs, G3)
+ Qsample = phred(1 - 10^log10p2 / sum(10^(c(log10p1, log10p2, log10p3))))
+
+ return(list(p1=log10p1, p2=log10p2, p3=log10p3, Qsample=Qsample))
+}
+
+QsampleExpected <- function(depth, Q) { # E[Qsample] at a het site, averaging over Poisson(depth) coverage and Binomial(d, 0.5) allele sampling; depths summed to 3x mean
+ weightedAvg = 0
+ for ( d in 1:(depth*3) ) {
+ Qsample = 0
+ pOfD = dpois(d, depth)
+ for ( nBs in 0:d ) {
+ pOfnB = dbinom(nBs, d, 0.5)
+ nAs = d - nBs
+ Qsample = pOfGs(nAs, nBs, Q)$Qsample
+ #Qsample = 1
+ weightedAvg = weightedAvg + Qsample * pOfD * pOfnB
+ print(as.data.frame(list(d=d, nBs = nBs, pOfD=pOfD, pOfnB = pOfnB, Qsample=Qsample, weightedAvg = weightedAvg))) # debug/progress trace printed for every (d, nBs) pair
+ }
+ }
+
+ return(weightedAvg)
+}
+
+plotQsamples <- function(depths, Qs, Qmax) { # one expected-Qsample-vs-coverage curve per base quality Q
+ cols = rainbow(length(Qs))
+ plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling")
+
+ for ( i in 1:length(Qs) ) {
+ Q = Qs[i]
+ y = as.numeric(lapply(depths, function(x) QsampleExpected(x, Q)))
+ points(depths, y, col=cols[i], type="b")
+ }
+
+ legend("topleft", paste("Q", Qs), fill=cols)
+}
+
+pCallHetGivenDepth <- function(depth, nallelesToCall) { # per-depth P(calling a het): Poisson depth weight * P(enough alt reads); NOTE(review): 0:nallelesToCall counts nallelesToCall itself as "too few" -- confirm the threshold is meant to be inclusive
+ depths = 0:(2*depth)
+ pNoAllelesToCall = apply(as.matrix(depths),1,function(d) sum(dbinom(0:nallelesToCall,d,0.5)))
+ dpois(depths,depth)*(1-pNoAllelesToCall)
+}
+
+pCallHets <- function(depth, nallelesToCall) { # total P(calling a het), marginalized over depth
+ sum(pCallHetGivenDepth(depth,nallelesToCall))
+}
+
+pCallHetMultiSample <- function(depth, nallelesToCall, nsamples) { # P(het is called in at least one of nsamples independent samples)
+ 1-(1-pCallHets(depth,nallelesToCall))^nsamples
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnalyzeAnnotationsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnalyzeAnnotationsWalker.java
deleted file mode 100755
index 54fd0a20c..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnalyzeAnnotationsWalker.java
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * Copyright (c) 2010 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
- * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.walkers.analyzeannotations;
-
-import org.broadinstitute.sting.utils.variantcontext.VariantContext;
-import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
-import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.RodWalker;
-import org.broadinstitute.sting.commandline.Argument;
-
-import java.util.HashMap;
-import java.util.Collection;
-
-/**
- * Takes variant calls as .vcf files and creates plots of truth metrics as a function of the various annotations found in the INFO field.
- *
- * @author rpoplin
- * @since Jan 15, 2010
- * @help.summary Takes variant calls as .vcf files and creates plots of truth metrics as a function of the various annotations found in the INFO field.
- */
-
-public class AnalyzeAnnotationsWalker extends RodWalker {
-
- /////////////////////////////
- // Command Line Arguments
- /////////////////////////////
- @Argument(fullName = "output_prefix", shortName = "output", doc = "The output path and name to prepend to all plots and intermediate data files", required = false)
- private String OUTPUT_PREFIX = "analyzeAnnotations/";
- @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
- private String PATH_TO_RSCRIPT = "Rscript";
- @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
- private String PATH_TO_RESOURCES = "R/";
- @Argument(fullName = "min_variants_per_bin", shortName = "minBinSize", doc = "The minimum number of variants in a bin in order to calculate truth metrics.", required = false)
- private int MIN_VARIANTS_PER_BIN = 1000;
- @Argument(fullName = "max_variants_per_bin", shortName = "maxBinSize", doc = "The maximum number of variants in a bin.", required = false)
- private int MAX_VARIANTS_PER_BIN = 20000;
- @Argument(fullName = "sampleName", shortName = "sampleName", doc = "If supplied, only process variants found in this sample.", required = false)
- private String SAMPLE_NAME = null;
- @Argument(fullName = "name", shortName = "name", doc = "Labels for the annotations to make plots look nicer. Each name is a separate -name argument. For example, -name DP,Depth -name AB,AlleleBalance", required = false)
- private String[] ANNOTATION_NAMES = null;
- @Argument(fullName = "indicate_mean_num_vars", shortName = "meanNumVars", doc = "If supplied, plots will indicate the distribution of number of variants instead of distribution of value of annotation", required = false)
- private boolean INDICATE_MEAN_NUM_VARS = false;
-
- /////////////////////////////
- // Private Member Variables
- /////////////////////////////
- private AnnotationDataManager dataManager;
-
- //---------------------------------------------------------------------------------------------------------------
- //
- // initialize
- //
- //---------------------------------------------------------------------------------------------------------------
-
- public void initialize() {
-
- // Create a HashMap associating the names of the annotations to full Strings that can be used as labels on plots
- HashMap nameMap = null;
- if( ANNOTATION_NAMES != null ) {
- nameMap = new HashMap();
- for( String nameLine : ANNOTATION_NAMES ) {
- String[] vals = nameLine.split(",");
- nameMap.put(vals[0],vals[1]);
- }
- }
- dataManager = new AnnotationDataManager( nameMap, INDICATE_MEAN_NUM_VARS );
-
- if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
- }
-
- //---------------------------------------------------------------------------------------------------------------
- //
- // map
- //
- //---------------------------------------------------------------------------------------------------------------
-
- public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
-
- if( tracker != null ) {
- Collection VCs = tracker.getAllVariantContexts(ref);
-
- // First find out if this variant is in the truth sets
- boolean isInTruthSet = false;
- boolean isTrueVariant = false;
- for ( VariantContext vc : VCs ) {
- if( vc != null && vc.isSNP() && !vc.isFiltered() ) {
- if( vc.getSource().toUpperCase().startsWith("TRUTH") ) {
- isInTruthSet = true;
- if( vc.isBiallelic() && vc.isVariant() ) {
- isTrueVariant = true;
- }
- }
- }
- }
-
- // Add each annotation in this VCF Record to the dataManager
- for ( VariantContext vc : VCs ) {
- if( vc != null && vc.isSNP() && vc.isBiallelic() && !vc.isFiltered() ) {
- if( !vc.getSource().toUpperCase().startsWith("TRUTH") ) {
- if( vc.isVariant() ) {
- dataManager.addAnnotations( vc, SAMPLE_NAME, isInTruthSet, isTrueVariant );
- }
- }
- }
- }
- }
-
- return 1; // This value isn't actually used for anything
- }
-
- //---------------------------------------------------------------------------------------------------------------
- //
- // reduce
- //
- //---------------------------------------------------------------------------------------------------------------
-
- public Integer reduceInit() {
- return 0; // Nothing to do here
- }
-
- public Integer reduce( Integer value, Integer sum ) {
- return 0; // Nothing to do here
- }
-
- public void onTraversalDone( Integer sum ) {
-
- // For each annotation, decide how to cut up the data, output intermediate cumulative p(true) tables, and call RScript to plot the tables
- dataManager.plotCumulativeTables(PATH_TO_RSCRIPT, PATH_TO_RESOURCES, OUTPUT_PREFIX, MIN_VARIANTS_PER_BIN, MAX_VARIANTS_PER_BIN);
- }
-}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDataManager.java
deleted file mode 100755
index 2c06b30d6..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDataManager.java
+++ /dev/null
@@ -1,169 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.analyzeannotations;
-
-import org.broadinstitute.sting.utils.variantcontext.VariantContext;
-import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
-import org.broadinstitute.sting.utils.BaseUtils;
-import org.broadinstitute.sting.utils.exceptions.UserException;
-
-import java.io.File;
-import java.util.*;
-import java.io.IOException;
-import java.io.PrintStream;
-import java.io.FileNotFoundException;
-
-/*
- * Copyright (c) 2010 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: Jan 18, 2010
- */
-
-public class AnnotationDataManager {
-
- private final HashMap> data;
- private final HashMap nameMap;
- private final boolean INDICATE_MEAN_NUM_VARS;
-
- public AnnotationDataManager( HashMap _nameMap, boolean _INDICATE_MEAN_NUM_VARS ) {
- data = new HashMap>();
- nameMap = _nameMap;
- INDICATE_MEAN_NUM_VARS = _INDICATE_MEAN_NUM_VARS;
- }
-
- public void addAnnotations( final VariantContext vc, final String sampleName, final boolean isInTruthSet, final boolean isTrueVariant ) {
-
- if( sampleName != null ) { // Only process variants that are found in the sample with this sampleName
- if( vc.getGenotype(sampleName).isNoCall() ) { // This variant isn't found in this sample so break out
- return;
- }
- } // else, process all samples
-
- // Loop over each annotation in the vcf record
- final Map infoField = new HashMap(vc.getAttributes());
- infoField.put("QUAL", ((Double)vc.getPhredScaledQual()).toString() ); // add QUAL field to annotations
- for( Map.Entry annotation : infoField.entrySet() ) {
-
- float value;
- try {
- value = Float.parseFloat( annotation.getValue().toString() );
- } catch( NumberFormatException e ) {
- continue; // Skip over annotations that aren't floats, like "DB"
- }
-
- TreeSet treeSet = data.get( annotation.getKey() );
- if( treeSet == null ) { // This annotation hasn't been seen before
- treeSet = new TreeSet( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation
- data.put( annotation.getKey(), treeSet );
- }
- AnnotationDatum datum = new AnnotationDatum( value );
- if( treeSet.contains(datum) ) { // contains() uses AnnotationDatum's equals function, so it only checks if the value field is already present
- datum = treeSet.tailSet(datum).first();
- } else {
- treeSet.add(datum);
- }
-
- final boolean isNovelVariant = !vc.hasID() || !vc.getID().contains("rs");
-
- // Decide if the variant is a transition or transversion
- if( VariantContextUtils.getSNPSubstitutionType(vc).compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0 ) {
- datum.incrementTi( isNovelVariant, isInTruthSet, isTrueVariant );
- } else {
- datum.incrementTv( isNovelVariant, isInTruthSet, isTrueVariant );
- }
- }
- }
-
- public void plotCumulativeTables( final String PATH_TO_RSCRIPT, final String PATH_TO_RESOURCES, final String OUTPUT_PREFIX,
- final int MIN_VARIANTS_PER_BIN, final int MAX_VARIANTS_PER_BIN ) {
-
- final AnnotationDatum thisAnnotationBin = new AnnotationDatum();
- System.out.println( "\nFinished reading variants into memory. Executing RScript commands:" );
-
- // For each annotation we've seen
- for( final String annotationKey : data.keySet() ) {
-
- String filename = OUTPUT_PREFIX + annotationKey + ".dat";
- PrintStream output;
- try {
- output = new PrintStream(filename); // Create the intermediate data file for this annotation
- } catch ( FileNotFoundException e ) {
- throw new UserException.CouldNotCreateOutputFile(new File(filename), "Can't create intermediate output annotation data file. Does the output directory exist?", e);
- }
-
- // Output a header line
- output.println("value\ttitv\tdbsnp\ttruePositive\tnumVariants\tcategory");
-
- // Bin SNPs and calculate truth metrics for each bin
- thisAnnotationBin.clearBin();
- for( final AnnotationDatum datum : data.get( annotationKey ) ) {
- thisAnnotationBin.combine( datum );
- if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full
- output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + "\t" + thisAnnotationBin.calcTPrate() +
- "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
- output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
- output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
- output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.TRUTH_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.TRUTH_SET ) + "\ttruth");
- thisAnnotationBin.clearBin();
- }
- // else, continue accumulating variants because this bin isn't full yet
- }
-
- // One final bin that may not have been dumped out
- if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) != 0 ) {
- output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + "\t" + thisAnnotationBin.calcTPrate() +
- "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
- output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
- output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
- output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.TRUTH_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.TRUTH_SET ) + "\ttruth");
- thisAnnotationBin.clearBin();
- }
-
- // Close the PrintStream
- output.close();
-
- String annotationName = null;
- if( nameMap != null ) {
- annotationName = nameMap.get(annotationKey);
- }
- if( annotationName == null ) { // This annotation is not in the name map so use the key instead
- annotationName = annotationKey;
- }
-
- // Print out the command line to make it clear to the user what is being executed and how one might modify it
- final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTruthMetrics.R" + " " +
- OUTPUT_PREFIX + annotationKey + ".dat" + " " + annotationName + " " + MIN_VARIANTS_PER_BIN + " " + INDICATE_MEAN_NUM_VARS;
- System.out.println( rScriptCommandLine );
-
- // Execute the RScript command to plot the table of truth values
- try {
- Runtime.getRuntime().exec( rScriptCommandLine );
- } catch ( IOException e ) {
- throw new UserException.CannotExecuteRScript( rScriptCommandLine, e );
- }
- }
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDatum.java
deleted file mode 100755
index 888eb7e6e..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/analyzeannotations/AnnotationDatum.java
+++ /dev/null
@@ -1,170 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.analyzeannotations;
-
-import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
-
-import java.util.Comparator;
-
-/*
- * Copyright (c) 2010 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: Jan 18, 2010
- */
-
-public class AnnotationDatum implements Comparator {
-
- public float value;
- private final int[] ti;
- private final int[] tv;
-
- public static final int FULL_SET = 0;
- public static final int NOVEL_SET = 1;
- public static final int DBSNP_SET = 2;
- public static final int TRUTH_SET = 3;
- public static final int TRUE_POSITIVE = 4;
- private static final int NUM_SETS = 5;
-
- public AnnotationDatum() {
-
- value = 0.0f;
- ti = new int[NUM_SETS];
- tv = new int[NUM_SETS];
- for( int iii = 0; iii < NUM_SETS; iii++ ) {
- ti[iii] = 0;
- tv[iii] = 0;
- }
- }
-
- public AnnotationDatum( final float _value ) {
-
- value = _value;
- ti = new int[NUM_SETS];
- tv = new int[NUM_SETS];
- for( int iii = 0; iii < NUM_SETS; iii++ ) {
- ti[iii] = 0;
- tv[iii] = 0;
- }
- }
-
- final public void incrementTi( final boolean isNovelVariant, final boolean isInTruthSet, final boolean isTrueVariant ) {
-
- ti[FULL_SET]++;
- if( isNovelVariant ) {
- ti[NOVEL_SET]++;
- } else { // Is known, in DBsnp
- ti[DBSNP_SET]++;
- }
- if( isInTruthSet ) {
- ti[TRUTH_SET]++;
- if( isTrueVariant ) {
- ti[TRUE_POSITIVE]++;
- }
- }
- }
-
- final public void incrementTv( final boolean isNovelVariant, final boolean isInTruthSet, final boolean isTrueVariant ) {
-
- tv[FULL_SET]++;
- if( isNovelVariant ) {
- tv[NOVEL_SET]++;
- } else { // Is known, in DBsnp
- tv[DBSNP_SET]++;
- }
- if( isInTruthSet ) {
- tv[TRUTH_SET]++;
- if( isTrueVariant ) {
- tv[TRUE_POSITIVE]++;
- }
- }
- }
-
- final public void combine( final AnnotationDatum that ) {
-
- for( int iii = 0; iii < NUM_SETS; iii++ ) {
- this.ti[iii] += that.ti[iii];
- this.tv[iii] += that.tv[iii];
- }
- this.value = that.value; // Overwrite this bin's value
- }
-
- final public float calcTiTv( final int INDEX ) {
-
- if( ti[INDEX] < 0 || tv[INDEX] < 0 ) {
- throw new ReviewedStingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." );
- }
-
- if( tv[INDEX] == 0 ) { // Don't divide by zero
- return 0.0f;
- }
-
- return ((float) ti[INDEX]) / ((float) tv[INDEX]);
- }
-
- final public float calcDBsnpRate() {
-
- if( ti[FULL_SET] + tv[FULL_SET] == 0 ) { // Don't divide by zero
- return 0.0f;
- }
-
- return 100.0f * ((float) ti[DBSNP_SET] + tv[DBSNP_SET]) /
- ((float) ti[FULL_SET] + tv[FULL_SET]);
- }
-
- final public float calcTPrate() {
-
- if( ti[TRUTH_SET] + tv[TRUTH_SET] == 0 ) { // Don't divide by zero
- return 0.0f;
- }
-
- return 100.0f * ((float) ti[TRUE_POSITIVE] + tv[TRUE_POSITIVE]) /
- ((float) ti[TRUTH_SET] + tv[TRUTH_SET]);
- }
-
- final public int numVariants( final int INDEX ) {
- return ti[INDEX] + tv[INDEX];
- }
-
- final public void clearBin() {
- value = 0.0f;
- for( int iii = 0; iii < NUM_SETS; iii++ ) {
- ti[iii] = 0;
- tv[iii] = 0;
- }
- }
-
- public int compare( AnnotationDatum a1, AnnotationDatum a2 ) { // Function needed for this to be a Comparator
- if( a1.value < a2.value ) { return -1; }
- if( a1.value > a2.value ) { return 1; }
- return 0;
- }
-
- public int equals( AnnotationDatum that ) { // Function needed for this to be sorted correctly in a TreeSet
- if( this.value < that.value ) { return -1; }
- if( this.value > that.value ) { return 1; }
- return 0;
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java
index 33069f1f5..f23433bb5 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java
@@ -42,5 +42,5 @@ public class LowMQ implements InfoFieldAnnotation {
public List getKeyNames() { return Arrays.asList("LowMQ"); }
- public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 3, VCFHeaderLineType.Integer, "3-tuple: ,,")); }
+ public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 3, VCFHeaderLineType.Float, "3-tuple: ,,")); }
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
index 3cfa58c02..a53665d64 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
@@ -1011,7 +1011,7 @@ public class IndelRealigner extends ReadWalker {
sb.append((char)b);
cigar.add(new CigarElement(indelStr.length, CigarOperator.I));
} else {
- throw new ReviewedStingException("Creating an alternate consensus from a complex indel is not allowed");
+ throw new IllegalStateException("Creating an alternate consensus from a complex indel is not allows");
}
if ( reference.length - refIdx > 0 )
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java
new file mode 100755
index 000000000..fc196e712
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2010.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.indels;
+
+import net.sf.samtools.*;
+import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.*;
+import org.broadinstitute.sting.gatk.walkers.*;
+import org.broadinstitute.sting.gatk.filters.BadMateFilter;
+import org.broadinstitute.sting.utils.*;
+import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator;
+import org.broadinstitute.sting.utils.sam.ReadUtils;
+import org.broadinstitute.sting.commandline.Argument;
+
+import java.io.File;
+import java.util.*;
+
+@By(DataSource.READS)
+// walker to count realigned reads
+public class RealignedReadCounter extends ReadWalker {
+
+ public static final String ORIGINAL_CIGAR_TAG = "OC";
+ public static final String ORIGINAL_POSITION_TAG = "OP";
+
+ @Argument(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true)
+ protected String intervalsFile = null;
+
+ // the intervals input by the user
+ private Iterator intervals = null;
+
+ // the current interval in the list
+ private GenomeLoc currentInterval = null;
+
+ private long updatedIntervals = 0, updatedReads = 0, affectedBases = 0;
+ private boolean intervalWasUpdated = false;
+
+ public void initialize() {
+ // prepare to read intervals one-by-one, as needed (assuming they are sorted).
+ intervals = new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new File(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY );
+ currentInterval = intervals.hasNext() ? intervals.next() : null;
+ }
+
+ public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
+ if ( currentInterval == null ) {
+ return 0;
+ }
+
+ GenomeLoc readLoc = ref.getGenomeLocParser().createGenomeLoc(read);
+ // hack to get around unmapped reads having screwy locations
+ if ( readLoc.getStop() == 0 )
+ readLoc = ref.getGenomeLocParser().createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart());
+
+ if ( readLoc.isBefore(currentInterval) || ReadUtils.is454Read(read) )
+ return 0;
+
+ if ( readLoc.overlapsP(currentInterval) ) {
+ if ( doNotTryToClean(read) )
+ return 0;
+
+ if ( read.getAttribute(ORIGINAL_CIGAR_TAG) != null ) {
+ String newCigar = (String)read.getAttribute(ORIGINAL_CIGAR_TAG);
+ // deal with an old bug
+ if ( read.getCigar().toString().equals(newCigar) ) {
+ //System.out.println(currentInterval + ": " + read.getReadName() + " " + read.getCigarString() + " " + newCigar);
+ return 0;
+ }
+
+ if ( !intervalWasUpdated ) {
+ intervalWasUpdated = true;
+ updatedIntervals++;
+ affectedBases += 20 + getIndelSize(read);
+ }
+ updatedReads++;
+
+ }
+ } else {
+ do {
+ intervalWasUpdated = false;
+ currentInterval = intervals.hasNext() ? intervals.next() : null;
+ } while ( currentInterval != null && currentInterval.isBefore(readLoc) );
+ }
+
+ return 0;
+ }
+
+ private int getIndelSize(SAMRecord read) {
+ for ( CigarElement ce : read.getCigar().getCigarElements() ) {
+ if ( ce.getOperator() == CigarOperator.I )
+ return 0;
+ if ( ce.getOperator() == CigarOperator.D )
+ return ce.getLength();
+ }
+ logger.warn("We didn't see an indel for this read: " + read.getReadName() + " " + read.getAlignmentStart() + " " + read.getCigar());
+ return 0;
+ }
+
+ private boolean doNotTryToClean(SAMRecord read) {
+ return read.getReadUnmappedFlag() ||
+ read.getNotPrimaryAlignmentFlag() ||
+ read.getReadFailsVendorQualityCheckFlag() ||
+ read.getMappingQuality() == 0 ||
+ read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ||
+ (BadMateFilter.hasBadMate(read));
+ }
+
+ public Integer reduceInit() {
+ return 0;
+ }
+
+ public Integer reduce(Integer value, Integer sum) {
+ return sum + value;
+ }
+
+ public void onTraversalDone(Integer result) {
+ System.out.println(updatedIntervals + " intervals were updated");
+ System.out.println(updatedReads + " reads were updated");
+ System.out.println(affectedBases + " bases were affected");
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java
new file mode 100755
index 000000000..feb5f62af
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java
@@ -0,0 +1,62 @@
+package org.broadinstitute.sting.gatk.walkers.qc;
+
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
+import org.broadinstitute.sting.gatk.walkers.LocusWalker;
+import org.broadinstitute.sting.gatk.walkers.RefWalker;
+import org.broadinstitute.sting.gatk.walkers.RodWalker;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.collections.Pair;
+
+import java.util.List;
+import java.io.PrintStream;
+
+/**
+ * Counts the number of contiguous regions the walker traverses over. Slower than it needs to be, but
+ * very useful since overlapping intervals get merged, so you can count the number of intervals the GATK merges down to.
+ * This was its very first use.
+ */
+public class CountIntervals extends RefWalker {
+ @Output
+ PrintStream out;
+
+ @Argument(fullName="numOverlaps",shortName="no",doc="Count all occurrences of X or more overlapping intervals; defaults to 2", required=false)
+ int numOverlaps = 2;
+
+ public Long reduceInit() {
+ return 0l;
+ }
+
+ public boolean isReduceByInterval() { return true; }
+
+ public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+ if ( tracker == null ) {
+ return null;
+ }
+
+ List checkIntervals = tracker.getGATKFeatureMetaData("check",false);
+ return (long) checkIntervals.size();
+ }
+
+ public Long reduce(Long loc, Long prev) {
+ if ( loc == null ) {
+ return 0l;
+ } else {
+ return Math.max(prev,loc);
+ }
+ }
+
+ public void onTraversalDone(List> finalReduce) {
+ long count = 0;
+ for ( Pair g : finalReduce ) {
+ if ( g.second >= numOverlaps) {
+ count ++;
+ }
+ }
+ out.printf("Number of contiguous intervals: %d",count);
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java
deleted file mode 100755
index 67879a442..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Copyright (c) 2010, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.walkers.qc;
-
-import org.broadinstitute.sting.utils.variantcontext.VariantContext;
-import org.broad.tribble.util.ParsingUtils;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
-import org.broad.tribble.readers.AsciiLineReader;
-import org.broadinstitute.sting.commandline.Argument;
-import org.broadinstitute.sting.commandline.Output;
-import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
-import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
-import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.Requires;
-import org.broadinstitute.sting.gatk.walkers.RodWalker;
-import org.broadinstitute.sting.utils.SimpleTimer;
-
-import java.io.PrintStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.util.*;
-
-/**
- * Emits specific fields as dictated by the user from one or more VCF files.
- */
-@Requires(value={})
-public class ProfileRodSystem extends RodWalker {
- @Output(doc="File to which results should be written",required=true)
- protected PrintStream out;
-
- @Argument(fullName="nIterations", shortName="N", doc="Number of raw reading iterations to perform", required=false)
- int nIterations = 1;
-
- @Argument(fullName="maxRecords", shortName="M", doc="Max. number of records to process", required=false)
- int MAX_RECORDS = -1;
-
- SimpleTimer timer = new SimpleTimer("myTimer");
-
- public void initialize() {
- File rodFile = getRodFile();
-
- out.printf("# walltime is in seconds%n");
- out.printf("# file is %s%n", rodFile);
- out.printf("# file size is %d bytes%n", rodFile.length());
- out.printf("operation\titeration\twalltime%n");
- for ( int i = 0; i < nIterations; i++ ) {
- out.printf("read.bytes\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_BYTE));
- out.printf("read.line\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_LINE));
- out.printf("line.and.parts\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_PARTS));
- out.printf("decode.loc\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.DECODE_LOC));
- out.printf("full.decode\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.DECODE));
- }
-
- timer.start(); // start up timer for map itself
- }
-
- private enum ReadMode { BY_BYTE, BY_LINE, BY_PARTS, DECODE_LOC, DECODE };
-
- private final double readFile(File f, ReadMode mode) {
- timer.start();
-
- try {
- byte[] data = new byte[100000];
- FileInputStream s = new FileInputStream(f);
-
- if ( mode == ReadMode.BY_BYTE ) {
- while (true) {
- if ( s.read(data) == -1 )
- break;
- }
- } else {
- int counter = 0;
- VCFCodec codec = new VCFCodec();
- String[] parts = new String[100000];
- AsciiLineReader lineReader = new AsciiLineReader(s);
-
- if ( mode == ReadMode.DECODE_LOC || mode == ReadMode.DECODE )
- codec.readHeader(lineReader);
-
- while (counter++ < MAX_RECORDS || MAX_RECORDS == -1) {
- String line = lineReader.readLine();
- if ( line == null )
- break;
- else if ( mode == ReadMode.BY_PARTS ) {
- ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR);
- }
- else if ( mode == ReadMode.DECODE_LOC ) {
- codec.decodeLoc(line);
- }
- else if ( mode == ReadMode.DECODE ) {
- processOneVC((VariantContext)codec.decode(line));
- }
- }
- }
- } catch ( Exception e ) {
- throw new RuntimeException(e);
- }
-
- return timer.getElapsedTime();
- }
-
- private File getRodFile() {
- List rods = this.getToolkit().getRodDataSources();
- ReferenceOrderedDataSource rod = rods.get(0);
- return rod.getFile();
- }
-
- public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
- if ( tracker == null ) // RodWalkers can make funky map calls
- return 0;
-
- VariantContext vc = tracker.getVariantContext(ref, "rod", context.getLocation());
- processOneVC(vc);
-
- return 0;
- }
-
- private static final void processOneVC(VariantContext vc) {
- vc.getNSamples(); // force us to parse the samples
- }
-
- public Integer reduceInit() {
- return 0;
- }
-
- public Integer reduce(Integer counter, Integer sum) {
- return counter + sum;
- }
-
- public void onTraversalDone(Integer sum) {
- out.printf("gatk.traversal\t%d\t%.2f%n", 0, timer.getElapsedTime());
- }
-}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java
new file mode 100644
index 000000000..9cb715507
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java
@@ -0,0 +1,153 @@
+package org.broadinstitute.sting.gatk.walkers.qc;
+
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
+import org.broadinstitute.sting.gatk.walkers.*;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+import java.io.*;
+import java.math.BigInteger;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.Collection;
+import java.util.List;
+
+/**
+ * a walker for validating (in the style of validating pile-up) the ROD system.
+ */
+@Reference(window=@Window(start=-40,stop=40))
+public class RodSystemValidationWalker extends RodWalker {
+
+ // the divider to use in some of the text output
+ private static final String DIVIDER = ",";
+
+ @Output
+ public PrintStream out;
+
+ @Argument(fullName="PerLocusEqual",required=false,doc="Should we check that all records at the same site produce equivilent variant contexts")
+ public boolean allRecordsVariantContextEquivalent = false;
+
+ // used to calculate the MD5 of a file
+ MessageDigest digest = null;
+
+ // we sometimes need to know what rods the engine's seen
+ List rodList;
+
+ /**
+ * emit the md5 sums for each of the input ROD files (will save up a lot of time if and when the ROD files change
+ * underneath us).
+ */
+ public void initialize() {
+ // setup the MD5-er
+ try {
+ digest = MessageDigest.getInstance("MD5");
+ } catch (NoSuchAlgorithmException e) {
+ throw new ReviewedStingException("Unable to find MD5 checksumer");
+ }
+ out.println("Header:");
+ // enumerate the list of ROD's we've loaded
+ rodList = this.getToolkit().getRodDataSources();
+ for (ReferenceOrderedDataSource rod : rodList) {
+ out.println(rod.getName() + DIVIDER + rod.getType());
+ out.println(rod.getName() + DIVIDER + rod.getFile());
+ out.println(rod.getName() + DIVIDER + md5sum(rod.getFile()));
+ }
+ out.println("Data:");
+ }
+
+ /**
+ *
+ * @param tracker the ref meta data tracker to get RODs
+ * @param ref reference context
+ * @param context the reads
+ * @return an 1 for each site with a rod(s), 0 otherwise
+ */
+ @Override
+ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+ int ret = 0;
+ if (tracker != null && tracker.getAllRods().size() > 0) {
+ out.print(context.getLocation() + DIVIDER);
+ Collection features = tracker.getAllRods();
+ for (GATKFeature feat : features)
+ out.print(feat.getName() + DIVIDER);
+ out.println(";");
+ ret++;
+ }
+
+ // if the argument was set, check for equivalence
+ if (allRecordsVariantContextEquivalent && tracker != null) {
+ Collection col = tracker.getAllVariantContexts(ref);
+ VariantContext con = null;
+ for (VariantContext contextInList : col)
+ if (con == null) con = contextInList;
+ else if (!con.equals(col)) out.println("FAIL: context " + col + " doesn't match " + con);
+ }
+ return ret;
+ }
+
+ /**
+ * Provide an initial value for reduce computations.
+ *
+ * @return Initial value of reduce.
+ */
+ @Override
+ public Integer reduceInit() {
+ return 0;
+ }
+
+ /**
+ * Reduces a single map with the accumulator provided as the ReduceType.
+ *
+ * @param value result of the map.
+ * @param sum accumulator for the reduce.
+ * @return accumulator with result of the map taken into account.
+ */
+ @Override
+ public Integer reduce(Integer value, Integer sum) {
+ return value + sum;
+ }
+
+ @Override
+ public void onTraversalDone(Integer result) {
+ // Double check traversal result to make count is the same.
+ // TODO: Is this check necessary?
+ out.println("[REDUCE RESULT] Traversal result is: " + result);
+ }
+
+ // shamelessly absconded and adapted from http://www.javalobby.org/java/forums/t84420.html
+ private String md5sum(File f) {
+ InputStream is;
+ try {
+ is = new FileInputStream(f);
+ } catch (FileNotFoundException e) {
+ return "Not a file";
+ }
+ byte[] buffer = new byte[8192];
+ int read = 0;
+ try {
+ while ((read = is.read(buffer)) > 0) {
+ digest.update(buffer, 0, read);
+ }
+ byte[] md5sum = digest.digest();
+ BigInteger bigInt = new BigInteger(1, md5sum);
+ return bigInt.toString(16);
+ }
+ catch (IOException e) {
+ throw new RuntimeException("Unable to process file for MD5", e);
+ }
+ finally {
+ try {
+ is.close();
+ }
+ catch (IOException e) {
+ throw new RuntimeException("Unable to close input stream for MD5 calculation", e);
+ }
+ }
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java
deleted file mode 100755
index eda01bdb4..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java
+++ /dev/null
@@ -1,298 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.qc;
-
-import net.sf.samtools.SAMRecord;
-import net.sf.samtools.CigarElement;
-import net.sf.samtools.CigarOperator;
-import net.sf.picard.reference.IndexedFastaSequenceFile;
-import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.*;
-import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.utils.exceptions.StingException;
-import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
-import org.broadinstitute.sting.utils.baq.BAQ;
-import org.broadinstitute.sting.utils.MathUtils;
-import org.broadinstitute.sting.utils.Utils;
-import org.broadinstitute.sting.utils.SimpleTimer;
-import org.broadinstitute.sting.commandline.Output;
-import org.broadinstitute.sting.commandline.Argument;
-
-import java.io.PrintStream;
-
-/**
- * Walks over the input data set, calculating the number of reads seen for diagnostic purposes.
- * Can also count the number of reads matching a given criterion using read filters (see the
- * --read-filter command line argument). Simplest example of a read-backed analysis.
- */
-@BAQMode(QualityMode = BAQ.QualityMode.DONT_MODIFY, ApplicationTime = BAQ.ApplicationTime.HANDLED_IN_WALKER)
-@Reference(window=@Window(start=-5,stop=5))
-@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
-public class ValidateBAQWalker extends ReadWalker {
- @Output(doc="File to which results should be written",required=true)
- protected PrintStream out;
-
- @Argument(doc="maximum read length to apply the BAQ calculation too",required=false)
- protected int maxReadLen = 1000;
-
- @Argument(doc="",required=false)
- protected int bw = 7;
-
- @Argument(doc="",required=false)
- protected boolean samtoolsMode = false;
-
- @Argument(doc="only operates on reads with this name",required=false)
- protected String readName = null;
-
- @Argument(doc="If true, all differences are errors", required=false)
- protected boolean strict = false;
-
- @Argument(doc="prints info for each read", required=false)
- protected boolean printEachRead = false;
-
- @Argument(doc="Also prints out detailed comparison information when for known calculation differences", required=false)
- protected boolean alsoPrintWarnings = false;
-
- @Argument(doc="Include reads without BAQ tag", required=false)
- protected boolean includeReadsWithoutBAQTag = false;
-
- @Argument(doc="x each read is processed", required=false)
- protected int magnification = 1;
-
- @Argument(doc="Profile performance", required=false)
- protected boolean profile = false;
-
- int counter = 0;
-
- BAQ baqHMM = null; // matches current samtools parameters
-
- public void initialize() {
- if ( samtoolsMode )
- baqHMM = new BAQ(1e-3, 0.1, bw, (byte)0, true);
- else
- baqHMM = new BAQ();
- }
-
- long goodReads = 0, badReads = 0;
-
- public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) {
-
- if ( (readName == null || readName.equals(read.getReadName())) && read.getReadLength() <= maxReadLen && (includeReadsWithoutBAQTag || BAQ.hasBAQTag(read) ) ) {
- if ( baqHMM.excludeReadFromBAQ(read) )
- return 0;
-
- if ( profile ) {
- profileBAQ(ref, read);
- } else {
- validateBAQ(ref, read);
- }
-
- return 1;
- }
-
- return 0;
- }
-
- SimpleTimer tagTimer = new SimpleTimer("from.tag");
- SimpleTimer baqReadTimer = new SimpleTimer("baq.read");
- SimpleTimer glocalTimer = new SimpleTimer("hmm.glocal");
-
- private void profileBAQ(ReferenceContext ref, SAMRecord read) {
- IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference();
- BAQ.BAQCalculationResult baq = null;
-
- tagTimer.restart();
- for ( int i = 0; i < magnification; i++ ) { BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag); }
- tagTimer.stop();
-
- baqReadTimer.restart();
- for ( int i = 0; i < magnification; i++ ) { baqHMM.baqRead(read, refReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.DONT_MODIFY ); }
- baqReadTimer.stop();
-
- glocalTimer.restart();
- for ( int i = 0; i < magnification; i++ )
- baqHMM.baqRead(read, refReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.DONT_MODIFY);
- glocalTimer.stop();
- }
-
-
- private void validateBAQ(ReferenceContext ref, SAMRecord read) {
- IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference();
- byte[] baqFromTag = BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag);
- if (counter++ % 1000 == 0 || printEachRead) out.printf("Checking read %s (%d)%n", read.getReadName(), counter);
-
- BAQ.BAQCalculationResult baq = baqHMM.calcBAQFromHMM(read, refReader);
-
- boolean fail = false;
- boolean print = false;
- int badi = 0;
-
- if ( BAQ.hasBAQTag(read) ) {
- for ( badi = 0; badi < baqFromTag.length; badi++ ) {
- if ( baqFromTag[badi] != baq.bq[badi] ) {
- if ( cigarLength(read) != read.getReadLength() ) {
- print = true;
- fail = false;
- out.printf(" different, but cigar length != read length%n");
- break;
- }
- if (MathUtils.arrayMin(read.getBaseQualities()) == 0) {
- print = true;
- fail = strict;
- out.printf(" different, but Q0 base detected%n");
- break;
- }
- else if (readHasSoftClip(read) && ! samtoolsMode) {
- print = true;
- fail = strict;
- out.printf(" different, but soft clip detected%n");
- break;
- } else if (readHasDeletion(read) ) { // && ! samtoolsMode) {
- print = true;
- fail = strict;
- out.printf(" different, but deletion detected%n");
- break;
- } else if ( baq.bq[badi] < baqHMM.getMinBaseQual() ) {
- print = fail = true;
- out.printf(" Base quality %d < min %d", baq.bq[badi], baqHMM.getMinBaseQual());
- break;
- } else {
- print = fail = true;
- break;
- }
- }
- }
- if ( fail || print )
- badReads++;
- else
- goodReads++;
- }
-
- if ( fail || printEachRead || ( print && alsoPrintWarnings ) ) {
- byte[] pos = new byte[baq.bq.length];
- for ( int i = 0; i < pos.length; i++ ) pos[i] = (byte)i;
-
- out.printf(" read length : %d%n", read.getReadLength());
- out.printf(" read start : %d (%d unclipped)%n", read.getAlignmentStart(), read.getUnclippedStart());
- out.printf(" cigar : %s%n", read.getCigarString());
- out.printf(" ref bases : %s%n", new String(baq.refBases));
- out.printf(" read bases : %s%n", new String(read.getReadBases()));
- out.printf(" ref length : %d%n", baq.refBases.length);
- out.printf(" BQ tag : %s%n", read.getStringAttribute(BAQ.BAQ_TAG));
- if ( BAQ.hasBAQTag(read) ) printQuals(" BQ deltas : ", getBAQDeltas(read), true);
- printQuals(" original quals: ", read.getBaseQualities(), true);
- printQuals(" baq quals: ", baq.bq, true);
- printQuals(" positions : ", pos, true);
- printQuals(" original quals: ", read.getBaseQualities());
- if ( BAQ.hasBAQTag(read) ) printQuals(" tag quals: ", baqFromTag);
- printQuals(" hmm quals: ", baq.bq);
- out.printf(" read bases : %s%n", new String(read.getReadBases()));
- out.println(Utils.dupString('-', 80));
- }
-
-
- if ( fail )
- throw new StingException(String.format("BAQ from read and from HMM differ in read %s at position %d: tag qual = %d, hmm qual = %d",
- read.getReadName(), badi, baqFromTag[badi], baq.bq[badi]));
- }
-
- private final static boolean readHasSoftClip(SAMRecord read) {
- for (CigarElement e : read.getCigar().getCigarElements()) {
- if ( e.getOperator() == CigarOperator.SOFT_CLIP )
- return true;
- }
-
- return false;
- }
-
- private final static boolean readHasDeletion(SAMRecord read) {
- for (CigarElement e : read.getCigar().getCigarElements()) {
- if ( e.getOperator() == CigarOperator.DELETION )
- return true;
- }
-
- return false;
- }
-
- public final void printQuals( String prefix, byte[] quals ) {
- printQuals(prefix, quals, false);
- }
-
- public final void printQuals( String prefix, byte[] quals, boolean asInt ) {
- printQuals(out, prefix, quals, asInt);
- }
-
- public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) {
- out.print(prefix);
- for ( int i = 0; i < quals.length; i++) {
- if ( asInt ) {
- out.printf("%2d", (int)quals[i]);
- if ( i+1 != quals.length ) out.print(",");
- } else
- out.print((char)(quals[i]+33));
- }
- out.println();
- }
-
- /**
- * Get the BAQ delta bytes from the tag in read. Returns null if no BAQ tag is present.
- * @param read
- * @return
- */
- public static byte[] getBAQDeltas(SAMRecord read) {
- byte[] baq = BAQ.getBAQTag(read);
- if ( baq != null ) {
- byte[] deltas = new byte[baq.length];
- for ( int i = 0; i < deltas.length; i++)
- deltas[i] = (byte)(-1 * (baq[i] - 64));
- return deltas;
- } else
- return null;
- }
-
- private int cigarLength(SAMRecord read) {
- int readI = 0;
- for ( CigarElement elt : read.getCigar().getCigarElements() ) {
- int l = elt.getLength();
- switch (elt.getOperator()) {
- case N: // cannot handle these
- return 0;
- case H : case P : // ignore pads and hard clips
- break;
- case S :
- case I :
- readI += l;
- break;
- case D : break;
- case M :
- readI += l;
- break;
- default:
- throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
- }
- }
- return readI;
- }
-
- public Integer reduceInit() { return 0; }
-
- public Integer reduce(Integer value, Integer sum) {
- return value + sum;
- }
-
- public void onTraversalDone(Integer nreads) {
- if ( profile ) {
- out.printf("n.reads baq.per.read calculation time.in.secs%n");
- printTimer(nreads, tagTimer);
- printTimer(nreads, glocalTimer);
- printTimer(nreads, baqReadTimer);
- } else {
- out.printf("total reads BAQ'd %d; concordant BAQ reads %d %.4f; discordant BAQ reads %d %.4f%n", nreads,
- goodReads, (100.0 * goodReads) / nreads,
- badReads, (100.0 * badReads) / nreads);
- }
- }
-
- private final void printTimer(int nreads, SimpleTimer timer) {
- out.printf("%d %d %s %.2f%n", nreads, magnification, timer.getName(), timer.getElapsedTime());
- }
-}
-
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java
index 9877781d1..02d850211 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java
@@ -55,7 +55,6 @@ import java.util.*;
public class ApplyRecalibration extends RodWalker {
-
/////////////////////////////
// Inputs
/////////////////////////////
@@ -86,6 +85,7 @@ public class ApplyRecalibration extends RodWalker {
final private List tranches = new ArrayList();
final private Set inputNames = new HashSet();
final private NestedHashMap lodMap = new NestedHashMap();
+ final private NestedHashMap annotationMap = new NestedHashMap();
final private Set ignoreInputFilterSet = new TreeSet();
//---------------------------------------------------------------------------------------------------------------
@@ -124,6 +124,7 @@ public class ApplyRecalibration extends RodWalker {
final Set hInfo = new HashSet();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model"));
+ hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out"));
final TreeSet samples = new TreeSet();
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames));
@@ -149,6 +150,7 @@ public class ApplyRecalibration extends RodWalker {
for ( final String line : new XReadLines( RECAL_FILE ) ) {
final String[] vals = line.split(",");
lodMap.put( Double.parseDouble(vals[3]), vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys
+ annotationMap.put( vals[4], vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys
}
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(RECAL_FILE, e);
@@ -174,11 +176,15 @@ public class ApplyRecalibration extends RodWalker {
String filterString = null;
final Map attrs = new HashMap(vc.getAttributes());
final Double lod = (Double) lodMap.get( vc.getChr(), vc.getStart(), vc.getEnd() );
+ final String worstAnnotation = (String) annotationMap.get( vc.getChr(), vc.getStart(), vc.getEnd() );
if( lod == null ) {
throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. First seen at: " + vc );
}
+ // Annotate the new record with its VQSLOD and the worst performing annotation
attrs.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", lod));
+ attrs.put(VariantRecalibrator.CULPRIT_KEY, worstAnnotation);
+
for( int i = tranches.size() - 1; i >= 0; i-- ) {
final Tranche tranche = tranches.get(i);
if( lod >= tranche.minVQSLod ) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java
index 9ffe7be7a..a09a30145 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java
@@ -190,6 +190,19 @@ public class GaussianMixtureModel {
return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
}
+ public Double evaluateDatumInOneDimension( final VariantDatum datum, final int iii ) {
+ if(datum.isNull[iii]) { return null; }
+
+ final Normal normal = new Normal(0.0, 1.0, null);
+ final double[] pVarInGaussianLog10 = new double[gaussians.size()];
+ int gaussianIndex = 0;
+ for( final MultivariateGaussian gaussian : gaussians ) {
+ normal.setState( gaussian.mu[iii], gaussian.sigma.get(iii, iii) );
+ pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + Math.log10( normal.pdf( datum.annotations[iii] ) );
+ }
+ return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
+ }
+
public double evaluateDatumMarginalized( final VariantDatum datum ) {
int numVals = 0;
double sumPVarInGaussian = 0.0;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java
index 2fd1326fe..efff5b780 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java
@@ -254,7 +254,9 @@ public class VariantDataManager {
public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) {
for( final VariantDatum datum : data ) {
- RECAL_FILE.println(String.format("%s,%d,%d,%.4f", datum.contig, datum.start, datum.stop, datum.lod));
+ RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s",
+ datum.contig, datum.start, datum.stop, datum.lod,
+ (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")));
}
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java
index ac875b645..38e2affb6 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java
@@ -24,6 +24,7 @@ public class VariantDatum implements Comparable {
public String contig;
public int start;
public int stop;
+ public int worstAnnotation;
public MultivariateGaussian assignment; // used in K-means implementation
public int compareTo( final VariantDatum other ) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
index e651b62e0..b5fda4443 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
@@ -60,6 +60,7 @@ import java.util.*;
public class VariantRecalibrator extends RodWalker, ExpandingArrayList> implements TreeReducible> {
public static final String VQS_LOD_KEY = "VQSLOD";
+ public static final String CULPRIT_KEY = "culprit";
@ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection();
@@ -232,6 +233,7 @@ public class VariantRecalibrator extends RodWalker randomData = dataManager.getRandomDataForPlotting( 6000 );
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java
index a0fbc572d..81e4d190c 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java
@@ -57,6 +57,23 @@ public class VariantRecalibratorEngine {
}
}
+ public void calculateWorstPerformingAnnotation( final List data, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel ) {
+ for( final VariantDatum datum : data ) {
+ int worstAnnotation = -1;
+ double minProb = Double.MAX_VALUE;
+ for( int iii = 0; iii < datum.annotations.length; iii++ ) {
+ final Double goodProbLog10 = goodModel.evaluateDatumInOneDimension(datum, iii);
+ final Double badProbLog10 = badModel.evaluateDatumInOneDimension(datum, iii);
+ if( goodProbLog10 != null && badProbLog10 != null ) {
+ final double prob = goodProbLog10 - badProbLog10;
+ if(prob < minProb) { minProb = prob; worstAnnotation = iii; }
+ }
+ }
+ datum.worstAnnotation = worstAnnotation;
+ }
+ }
+
+
/////////////////////////////
// Private Methods used for generating a GaussianMixtureModel
/////////////////////////////
diff --git a/public/java/src/org/broadinstitute/sting/datasources/pipeline/Pipeline.java b/public/java/src/org/broadinstitute/sting/pipeline/Pipeline.java
similarity index 97%
rename from public/java/src/org/broadinstitute/sting/datasources/pipeline/Pipeline.java
rename to public/java/src/org/broadinstitute/sting/pipeline/Pipeline.java
index f8f8b2d29..e0e75c353 100644
--- a/public/java/src/org/broadinstitute/sting/datasources/pipeline/Pipeline.java
+++ b/public/java/src/org/broadinstitute/sting/pipeline/Pipeline.java
@@ -22,7 +22,7 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-package org.broadinstitute.sting.datasources.pipeline;
+package org.broadinstitute.sting.pipeline;
import java.util.ArrayList;
import java.util.List;
diff --git a/public/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java b/public/java/src/org/broadinstitute/sting/pipeline/PipelineProject.java
similarity index 98%
rename from public/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java
rename to public/java/src/org/broadinstitute/sting/pipeline/PipelineProject.java
index 7967ba5df..8d33047bf 100644
--- a/public/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java
+++ b/public/java/src/org/broadinstitute/sting/pipeline/PipelineProject.java
@@ -22,7 +22,7 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-package org.broadinstitute.sting.datasources.pipeline;
+package org.broadinstitute.sting.pipeline;
import java.io.File;
import java.util.Map;
diff --git a/public/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineSample.java b/public/java/src/org/broadinstitute/sting/pipeline/PipelineSample.java
similarity index 97%
rename from public/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineSample.java
rename to public/java/src/org/broadinstitute/sting/pipeline/PipelineSample.java
index 701841302..7cd25fed5 100644
--- a/public/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineSample.java
+++ b/public/java/src/org/broadinstitute/sting/pipeline/PipelineSample.java
@@ -22,7 +22,7 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-package org.broadinstitute.sting.datasources.pipeline;
+package org.broadinstitute.sting.pipeline;
import java.io.File;
import java.util.Map;
diff --git a/public/java/src/org/broadinstitute/sting/utils/broad/PicardAggregationUtils.java b/public/java/src/org/broadinstitute/sting/utils/broad/PicardAggregationUtils.java
deleted file mode 100755
index 8f3ffd741..000000000
--- a/public/java/src/org/broadinstitute/sting/utils/broad/PicardAggregationUtils.java
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.utils.broad;
-
-import org.apache.commons.io.filefilter.RegexFileFilter;
-
-import java.io.File;
-import java.io.FileFilter;
-import java.io.FileNotFoundException;
-import java.util.Arrays;
-
-public class PicardAggregationUtils {
- public static final String PICARD_AGGREGATION_DIR = "/seq/picard_aggregation/";
-
- /**
- * Returns the path to the sample BAM.
- * @param project Project
- * @param sample Sample
- * @param version Version
- * @return The path to the sample BAM.
- */
- public static String getSampleBam(String project, String sample, int version) {
- return getSampleDir(project, sample, version) + sample + ".bam";
- }
-
- /**
- * Returns the path to the latest BAM.
- * @param project Project
- * @param sample Sample
- * @return The path to the latest BAM.
- * @throws FileNotFoundException If a finished directory cannot be found for a sample.
- */
- public static String getSampleBam(String project, String sample) throws FileNotFoundException {
- return getSampleDir(project, sample) + sample + ".bam";
- }
-
- /**
- * Returns the sample directory.
- * @param project Project
- * @param sample Sample
- * @param version Version
- * @return the sample directory.
- */
- public static String getSampleDir(String project, String sample, int version) {
- return PICARD_AGGREGATION_DIR + String.format("%s/%s/v%d/", project, sample, version);
- }
-
- /**
- * Returns the latest finished directory for this project sample.
- * @param project Project
- * @param sample Sample
- * @return The path to the latest finished directory.
- * @throws FileNotFoundException If a finished directory cannot be found for a sample.
- */
- public static String getSampleDir(String project, String sample) throws FileNotFoundException {
- int latestVersion = getLatestVersion(project, sample);
- if (latestVersion == 0)
- throw new FileNotFoundException("Unable to find a finished directory for project sample " + project + "/" + sample);
- return getSampleDir(project, sample, latestVersion);
- }
-
- /**
- * Returns the latest finished version directory.
- * Because isilon metadata operations are relatively slow this method
- * tries not to look for every possible versioned directory.
- * @param project Project
- * @param sample Sample
- * @return The highest finished version directory or 0 if a finished directory was not found.
- */
- public static int getLatestVersion(String project, String sample) {
- return getLatestVersion(project, sample, 0);
- }
-
- /**
- * Returns the latest finished version directory after startVersion.
- * Because isilon metadata operations are relatively slow this method
- * tries not to look for every possible versioned directory.
- * @param project Project
- * @param sample Sample
- * @param startVersion minimum version to return
- * @return The highest finished version directory after startVersion
- */
- public static int getLatestVersion(String project, String sample, int startVersion) {
- int version = Math.max(0, startVersion);
- boolean nextExists = true;
- while (nextExists) {
- nextExists = false;
- for (int next = 3; next > 0; next--)
- if (isFinished(project, sample, version + next)) {
- version += next;
- nextExists = true;
- break;
- }
- }
- // Special case when the version is 0
- // Because isilon storage takes a while to do meta data operations only look through every file if we have to.
- if (version == 0) {
- File sampleDir = new File(PICARD_AGGREGATION_DIR + project + "/" + sample);
- if (sampleDir.exists()) {
- FileFilter filter = new RegexFileFilter("v\\d+");
- File[] files = sampleDir.listFiles(filter);
- int[] versions = new int[files.length];
- for (int i = 0; i < files.length; i++)
- versions[i] = Integer.parseInt(files[i].getName().substring(1));
- Arrays.sort(versions);
- for (int i = versions.length - 1; i >= 0; i--) {
- if (isFinished(project, sample, versions[i])) {
- version = versions[i];
- break;
- }
- }
- }
- }
- return version == 0 ? startVersion : version;
- }
-
- /**
- * Returns true if the project sample directory contains a finished.txt
- * @param project Project
- * @param sample Sample
- * @param version Version
- * @return true if the project sample directory contains a finished.txt
- */
- public static boolean isFinished(String project, String sample, int version) {
- return new File(getSampleDir(project, sample, version), "finished.txt").exists();
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/utils/broad/PicardAnalysisFiles.java b/public/java/src/org/broadinstitute/sting/utils/broad/PicardAnalysisFiles.java
deleted file mode 100755
index 02381916e..000000000
--- a/public/java/src/org/broadinstitute/sting/utils/broad/PicardAnalysisFiles.java
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.utils.broad;
-
-import org.apache.commons.lang.ArrayUtils;
-import org.broadinstitute.sting.utils.exceptions.StingException;
-import org.broadinstitute.sting.utils.text.XReadLines;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-
-public class PicardAnalysisFiles {
- private static final String REFERENCE_SEQUENCE_HEADER = "REFERENCE_SEQUENCE";
- private static final String TARGET_INTERVALS_HEADER = "TARGET_INTERVALS";
- private static final String BAIT_INTERVALS_HEADER = "BAIT_INTERVALS";
- private static final String[] ANALYSIS_HEADERS = {REFERENCE_SEQUENCE_HEADER, TARGET_INTERVALS_HEADER, BAIT_INTERVALS_HEADER};
- private static final String ANALYSIS_FILES = "analysis_files.txt";
-
- private String path;
- private Map> headerValues = new HashMap>();
-
- public PicardAnalysisFiles(String project, String sample) throws FileNotFoundException {
- this(PicardAggregationUtils.getSampleDir(project, sample) + ANALYSIS_FILES);
- }
-
- public PicardAnalysisFiles(String project, String sample, int version) throws FileNotFoundException {
- this(PicardAggregationUtils.getSampleDir(project, sample, version) + ANALYSIS_FILES);
- }
-
- public PicardAnalysisFiles(String path) throws FileNotFoundException {
- this.path = path;
- HashMap headerIndexes = null;
- for (String line: new XReadLines(new File(path))) {
- if (line.startsWith("#"))
- continue;
- String[] values = line.split("\t");
- if (headerIndexes == null) {
- headerIndexes = new HashMap();
- for (String header: ANALYSIS_HEADERS) {
- headerIndexes.put(header, ArrayUtils.indexOf(values, header));
- headerValues.put(header, new HashSet());
- }
- } else {
- for (String header: ANALYSIS_HEADERS) {
- int index = headerIndexes.get(header);
- if (values.length <= index)
- throw new StingException(String.format("Unable to parse line in %s: %n%s", path, line));
- String value = values[index];
- headerValues.get(header).add(value);
- }
- }
- }
- }
-
- public String getPath() {
- return path;
- }
-
- public String getReferenceSequence() {
- return getSingle(REFERENCE_SEQUENCE_HEADER);
- }
-
- public String getTargetIntervals() {
- return getSingle(TARGET_INTERVALS_HEADER);
- }
-
- public String getBaitIntervals() {
- return getSingle(BAIT_INTERVALS_HEADER);
- }
-
- private String getSingle(String header) {
- Set values = headerValues.get(header);
- if (values.size() > 1) {
- throw new UnsupportedOperationException(path + " contains more than one value for " + header + ": " + values);
- } else if (values.size() == 0) {
- return null;
- } else {
- String value = values.iterator().next();
- return "null".equals(value) ? null : value;
- }
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/utils/broad/PicardPipeline.java b/public/java/src/org/broadinstitute/sting/utils/broad/PicardPipeline.java
deleted file mode 100755
index 96bbb2455..000000000
--- a/public/java/src/org/broadinstitute/sting/utils/broad/PicardPipeline.java
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.utils.broad;
-
-import org.apache.commons.io.FilenameUtils;
-import org.apache.commons.lang.NullArgumentException;
-import org.broadinstitute.sting.datasources.pipeline.Pipeline;
-import org.broadinstitute.sting.datasources.pipeline.PipelineProject;
-import org.broadinstitute.sting.datasources.pipeline.PipelineSample;
-import org.broadinstitute.sting.utils.exceptions.UserException;
-import org.broadinstitute.sting.utils.text.XReadLines;
-import org.broadinstitute.sting.utils.yaml.YamlUtils;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-
-/**
- * Automatically gets the latest version using PicardAggregationUtils.
- */
-public class PicardPipeline {
-
- protected static final String PROJECT_TAG = "SQUIDProject";
- protected static final String SAMPLE_TAG = "CollaboratorID";
- protected static final String PICARD_BAM_TYPE = "cleaned";
-
- private PicardPipeline() {}
-
- /**
- * Creates a new PicardPipeline
- * @param path Path to a tsv with project [tab] sample on each line or a pipeline yaml.
- * @return a new Picard
- * @throws FileNotFoundException when unable to find the file or any supporting files.
- */
- public static Pipeline parse(File path) throws FileNotFoundException {
- if (path == null)
- throw new NullArgumentException("path");
-
- Pipeline pipeline;
- if (path.getName().endsWith(".tsv")) {
- pipeline = new Pipeline();
- pipeline.getProject().setName(FilenameUtils.getBaseName(path.getPath()));
- for (String line: new XReadLines(path)) {
- String[] projectSample = line.split("\t");
- addSample(pipeline, projectSample[0], projectSample[1]);
- }
- } else if (path.getName().endsWith(".yaml")) {
- pipeline = YamlUtils.load(Pipeline.class, path);
- } else {
- throw new UserException.BadInput("Path does not end with .tsv or .yaml: " + path.getPath());
- }
-
- update(pipeline);
- return pipeline;
- }
-
- private static void update(Pipeline pipeline) throws FileNotFoundException {
- for (PipelineSample sample: pipeline.getSamples())
- updateSample(pipeline.getProject(), sample);
- }
-
- private static void addSample(Pipeline pipeline, String project, String sample) {
- PipelineSample pipelineSample = new PipelineSample();
- pipelineSample.getTags().put(PROJECT_TAG, project);
- pipelineSample.getTags().put(SAMPLE_TAG, sample);
- pipeline.getSamples().add(pipelineSample);
- }
-
- private static void updateSample(PipelineProject pipelineProject, PipelineSample pipelineSample) throws FileNotFoundException {
- if (!pipelineSample.getTags().containsKey(PROJECT_TAG) && !pipelineSample.getTags().containsKey(SAMPLE_TAG))
- return;
- String project = pipelineSample.getTags().get(PROJECT_TAG);
- String sample = pipelineSample.getTags().get(SAMPLE_TAG);
- int version = PicardAggregationUtils.getLatestVersion(project, sample);
- if (version <= 0)
- throw new UserException.BadInput("Project sample not found: " + project + "/" + sample);
- String bam = PicardAggregationUtils.getSampleBam(project, sample, version);
- if (pipelineSample.getId() == null)
- pipelineSample.setId(project + "_" + sample);
- pipelineSample.getBamFiles().put(PICARD_BAM_TYPE, new File(bam));
-
- PicardAnalysisFiles analysis = new PicardAnalysisFiles(project, sample, version);
- if (pipelineProject.getReferenceFile() == null) {
- String referenceSequence = analysis.getReferenceSequence();
- ReferenceData referenceData = ReferenceData.getByReference(referenceSequence);
- pipelineProject.setReferenceFile(new File(referenceData.getReference()));
- pipelineProject.setRefseqTable(new File(referenceData.getRefseq()));
- if (analysis.getTargetIntervals() != null)
- pipelineProject.setIntervalList(new File(analysis.getTargetIntervals()));
- pipelineProject.setEvalDbsnp(new File(referenceData.getDbsnp(129)));
- if (referenceData.getDbsnpVersions().contains(132)) {
- pipelineProject.setGenotypeDbsnp(new File(referenceData.getDbsnp(132)));
- } else {
- pipelineProject.setGenotypeDbsnp(new File(referenceData.getDbsnp(129)));
- }
- } else {
- String referenceSequence = analysis.getReferenceSequence();
- if (!pipelineProject.getReferenceFile().getPath().equals(referenceSequence))
- throw new UserException.BadInput("Samples sequenced with different references");
- }
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/utils/broad/ReferenceData.java b/public/java/src/org/broadinstitute/sting/utils/broad/ReferenceData.java
deleted file mode 100755
index 36a59ec70..000000000
--- a/public/java/src/org/broadinstitute/sting/utils/broad/ReferenceData.java
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.utils.broad;
-
-import java.util.Collections;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeMap;
-
-/**
- * Tracks data related to reference files at the Broad.
- */
-public enum ReferenceData {
- /**
- * HG18 reference data
- */
- HG18("hg18"),
-
- /**
- * HG19 reference data
- */
- HG19("hg19");
-
- private static final String REFSEQ_DIR = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/";
- private static final String DBSNP_DIR = "/humgen/gsa-hpprojects/GATK/data/";
-
- private final String name;
- private final String reference;
- private final String refseq;
- private final Map dbsnps;
-
- ReferenceData(String name) {
- this.name = name;
- Map dbsnps = new TreeMap();
- if ("hg18".equals(name)) {
- this.reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta";
- this.refseq = REFSEQ_DIR + "refGene-big-table-hg18.txt";
- dbsnps.put(129, DBSNP_DIR + "dbsnp_129_hg18.rod");
- } else if ("hg19".equals(name)) {
- this.reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta";
- this.refseq = REFSEQ_DIR + "refGene-big-table-hg19.txt";
- dbsnps.put(129, DBSNP_DIR + "dbsnp_129_b37.vcf");
- dbsnps.put(132, DBSNP_DIR + "dbsnp_132_b37.vcf");
- } else
- throw new UnsupportedOperationException("Unknown reference: " + name);
- this.dbsnps = Collections.unmodifiableMap(dbsnps);
- }
-
- /**
- * Returns the name of the reference.
- * @return the name of the reference.
- */
- public String getName() {
- return name;
- }
-
- /**
- * Returns the path to the fasta.
- * @return the path to the fasta.
- */
- public String getReference() {
- return reference;
- }
-
- /**
- * Returns the path to the refseq table.
- * @return the path to the refseq table.
- */
- public String getRefseq() {
- return refseq;
- }
-
- /**
- * Returns the dbsnp versions available.
- * @return the dbsnp versions available.
- */
- public Set getDbsnpVersions() {
- return dbsnps.keySet();
- }
-
- /**
- * Returns the dbsnp path for the version.
- * @param version version from getDbsnpVersions()
- * @return the dbsnp path for the version.
- */
- public String getDbsnp(int version) {
- return dbsnps.get(version);
- }
-
- /**
- * Returns the dbsnp type for the version, "VCF" or "DBSNP".
- * @param version version from getDbsnpVersions()
- * @return the dbsnp type for the version, "VCF" or "DBSNP".
- */
- public String getDbsnpType(int version) {
- String dbsnp = getDbsnp(version);
- if (dbsnp == null)
- return null;
- return dbsnp.toLowerCase().endsWith(".vcf") ? "VCF" : "DBSNP";
- }
-
- /**
- * Returns the reference data based on the path or null.
- * @param reference path to the reference
- * @return the reference data based on the path or null.
- */
- public static ReferenceData getByReference(String reference) {
- for (ReferenceData data: ReferenceData.values())
- if (data.reference.equals(reference))
- return data;
- return null;
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java
index bda52dd4e..0c5085cc7 100644
--- a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java
+++ b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java
@@ -68,13 +68,13 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
ReferenceSequence seq = null;
}
- private static ThreadLocal cache;
+ private ThreadLocal cache;
- static {
+ {
resetThreadLocalCache();
- };
+ }
- protected static void resetThreadLocalCache() {
+ protected void resetThreadLocalCache() {
cache = new ThreadLocal () {
@Override protected Cache initialValue() {
return new Cache();
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
index 92262f1bf..3d375aba2 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
@@ -414,7 +414,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
* @return vc subcontext
*/
public VariantContext subContextFromGenotypes(Collection genotypes, Set alleles) {
- return new VariantContext(getSource(), contig, start, stop, alleles, genotypes, getNegLog10PError(), getFilters(), getAttributes());
+ return new VariantContext(getSource(), contig, start, stop, alleles, genotypes, getNegLog10PError(), filtersWereApplied() ? getFilters() : null, getAttributes());
}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java
index acfcbab7b..4b225aaea 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java
@@ -19,7 +19,7 @@ public class RealignerTargetCreatorIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
"-T RealignerTargetCreator -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_b36.rod -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s",
1,
- Arrays.asList("fd2d9dbf718f7a6d82ae1787ac1fe61e"));
+ Arrays.asList("f23ba17ee0f9573dd307708175d90cd2"));
executeTest("test dbsnp", spec2);
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java
index eb6a1a4c6..9600046da 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java
@@ -26,8 +26,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
VRTest lowPass = new VRTest("phase1.projectConsensus.chr20.raw.snps.vcf",
"d33212a84368e821cbedecd4f59756d6", // tranches
- "a35cd067f378442eee8cd5edeea92be0", // recal file
- "126d52843f4a57199ee97750ffc16a07"); // cut VCF
+ "4652dca41222bebdf9d9fda343b2a835", // recal file
+ "5350b1a4c1250cf3b77ca45327c04711"); // cut VCF
@DataProvider(name = "VRTest")
public Object[][] createData1() {
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java
index 15733e104..cf0673ee6 100644
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java
@@ -60,7 +60,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest {
" --NO_HEADER" +
" -o %s",
1,
- Arrays.asList("2cae3d16f9ed00b07d87e9c49272d877")
+ Arrays.asList("debbbf3e661b6857cc8d99ff7635bb1d")
);
executeTest("testSimpleVCFStreaming", spec);
@@ -98,7 +98,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest {
" -EV CompOverlap -noEV -noST" +
" -o %s",
1,
- Arrays.asList("f60729c900bc8368717653b3fad80d1e")
+ Arrays.asList("f60729c900bc8368717653b3fad80d1e") //"f60729c900bc8368717653b3fad80d1e"
);
executeTest("testVCFStreamingChain", selectTestSpec);
diff --git a/public/java/test/org/broadinstitute/sting/datasources/pipeline/PipelineUnitTest.java b/public/java/test/org/broadinstitute/sting/pipeline/PipelineUnitTest.java
similarity index 96%
rename from public/java/test/org/broadinstitute/sting/datasources/pipeline/PipelineUnitTest.java
rename to public/java/test/org/broadinstitute/sting/pipeline/PipelineUnitTest.java
index 8e18fac6f..891356670 100644
--- a/public/java/test/org/broadinstitute/sting/datasources/pipeline/PipelineUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/pipeline/PipelineUnitTest.java
@@ -22,8 +22,10 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-package org.broadinstitute.sting.datasources.pipeline;
+package org.broadinstitute.sting.pipeline;
+import org.broadinstitute.sting.pipeline.Pipeline;
+import org.broadinstitute.sting.pipeline.PipelineSample;
import org.testng.Assert;
import org.broadinstitute.sting.utils.yaml.YamlUtils;
diff --git a/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java
index 08be4f742..f4c3163cf 100644
--- a/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java
@@ -12,19 +12,16 @@ import org.testng.annotations.DataProvider;
import org.testng.annotations.BeforeMethod;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.qc.ValidateBAQWalker;
-import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
-import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.Utils;
import java.io.File;
import java.io.FileNotFoundException;
-import java.util.Arrays;
+import java.io.PrintStream;
import java.util.List;
import java.util.ArrayList;
import net.sf.picard.reference.IndexedFastaSequenceFile;
-import net.sf.picard.reference.ReferenceSequence;
import net.sf.samtools.*;
/**
@@ -188,8 +185,8 @@ public class BAQUnitTest extends BaseTest {
System.out.println(Utils.dupString('-', 40));
System.out.println("reads : " + new String(test.readBases));
- ValidateBAQWalker.printQuals(System.out, "in-quals:", test.quals, false);
- ValidateBAQWalker.printQuals(System.out, "bq-quals:", result.bq, false);
+ printQuals(System.out, "in-quals:", test.quals, false);
+ printQuals(System.out, "bq-quals:", result.bq, false);
for (int i = 0; i < test.quals.length; i++) {
//result.bq[i] = baqHMM.capBaseByBAQ(result.rawQuals[i], result.bq[i], result.state[i], i);
Assert.assertTrue(result.bq[i] >= baqHMM.getMinBaseQual() || test.expected[i] < baqHMM.getMinBaseQual(), "BQ < min base quality");
@@ -197,4 +194,16 @@ public class BAQUnitTest extends BaseTest {
}
}
+
+ public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) {
+ out.print(prefix);
+ for ( int i = 0; i < quals.length; i++) {
+ if ( asInt ) {
+ out.printf("%2d", (int)quals[i]);
+ if ( i+1 != quals.length ) out.print(",");
+ } else
+ out.print((char)(quals[i]+33));
+ }
+ out.println();
+ }
}
\ No newline at end of file
diff --git a/public/java/test/org/broadinstitute/sting/utils/broad/PicardAggregationUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/broad/PicardAggregationUtilsUnitTest.java
deleted file mode 100755
index 77a8a1ac8..000000000
--- a/public/java/test/org/broadinstitute/sting/utils/broad/PicardAggregationUtilsUnitTest.java
+++ /dev/null
@@ -1,61 +0,0 @@
-package org.broadinstitute.sting.utils.broad;
-
-import org.testng.Assert;
-import org.testng.annotations.Test;
-
-import java.io.FileNotFoundException;
-
-public class PicardAggregationUtilsUnitTest {
- public static final String PROJECT = "C474";
- public static final String SAMPLE = "NA19651";
- public static final String MISSING_PROJECT = "C0";
- public static final String MISSING_SAMPLE = "0";
- private int latestVersion = -1;
-
- @Test
- public void testGetLatestVersion() {
- latestVersion = PicardAggregationUtils.getLatestVersion(PROJECT, SAMPLE);
- System.out.println(String.format("Latest version for %s %s is %d", PROJECT, SAMPLE, latestVersion));
- Assert.assertTrue(latestVersion > 0);
- Assert.assertEquals(PicardAggregationUtils.getLatestVersion(PROJECT, SAMPLE, latestVersion), latestVersion);
- }
-
- @Test(dependsOnMethods = "testGetLatestVersion")
- public void testGetSampleBam() throws Exception {
- String test = PicardAggregationUtils.getSampleBam(PROJECT, SAMPLE);
- String latest = PicardAggregationUtils.getSampleBam(PROJECT, SAMPLE, latestVersion);
- Assert.assertEquals(test, latest);
- }
-
- @Test(dependsOnMethods = "testGetLatestVersion")
- public void testGetSampleDir() throws Exception {
- String test = PicardAggregationUtils.getSampleDir(PROJECT, SAMPLE);
- String latest = PicardAggregationUtils.getSampleDir(PROJECT, SAMPLE, latestVersion);
- Assert.assertEquals(test, latest);
- }
-
- @Test(dependsOnMethods = "testGetLatestVersion")
- public void testIsFinished() {
- Assert.assertTrue(PicardAggregationUtils.isFinished(PROJECT, SAMPLE, latestVersion));
- Assert.assertFalse(PicardAggregationUtils.isFinished(PROJECT, SAMPLE, latestVersion + 1));
- }
-
- @Test(expectedExceptions = FileNotFoundException.class)
- public void testMissingSampleBam() throws Exception {
- PicardAggregationUtils.getSampleBam(MISSING_PROJECT, MISSING_SAMPLE);
- }
-
- @Test(expectedExceptions = FileNotFoundException.class)
- public void testMissingSampleDir() throws Exception {
- PicardAggregationUtils.getSampleDir(MISSING_PROJECT, MISSING_SAMPLE);
- }
-
- @Test
- public void testLatestVersionMissing() {
- Assert.assertEquals(PicardAggregationUtils.getLatestVersion(MISSING_PROJECT, MISSING_SAMPLE), 0);
- Assert.assertEquals(PicardAggregationUtils.getLatestVersion(MISSING_PROJECT, MISSING_SAMPLE, -1), -1);
- Assert.assertEquals(PicardAggregationUtils.getLatestVersion(MISSING_PROJECT, MISSING_SAMPLE, 0), 0);
- Assert.assertEquals(PicardAggregationUtils.getLatestVersion(MISSING_PROJECT, MISSING_SAMPLE, 1), 1);
- Assert.assertEquals(PicardAggregationUtils.getLatestVersion(MISSING_PROJECT, MISSING_SAMPLE, 2), 2);
- }
-}
diff --git a/public/java/test/org/broadinstitute/sting/utils/broad/PicardAnalysisFilesUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/broad/PicardAnalysisFilesUnitTest.java
deleted file mode 100755
index de58bf983..000000000
--- a/public/java/test/org/broadinstitute/sting/utils/broad/PicardAnalysisFilesUnitTest.java
+++ /dev/null
@@ -1,56 +0,0 @@
-package org.broadinstitute.sting.utils.broad;
-
-import org.broadinstitute.sting.BaseTest;
-import org.testng.Assert;
-import org.testng.annotations.Test;
-
-import java.io.FileNotFoundException;
-
-import static org.broadinstitute.sting.utils.broad.PicardAggregationUtilsUnitTest.*;
-
-public class PicardAnalysisFilesUnitTest extends BaseTest {
- @Test
- public void testParseLatest() throws Exception {
- PicardAnalysisFiles files = new PicardAnalysisFiles(PROJECT, SAMPLE);
- Assert.assertNotNull(files.getPath());
- files = new PicardAnalysisFiles(PROJECT, SAMPLE, PicardAggregationUtils.getLatestVersion(PROJECT, SAMPLE));
- Assert.assertNotNull(files.getPath());
- }
-
- @Test
- public void testParseValid() throws Exception {
- PicardAnalysisFiles file = new PicardAnalysisFiles(BaseTest.validationDataLocation + "picard_analysis_file.txt");
- Assert.assertEquals(file.getReferenceSequence(), "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta");
- Assert.assertEquals(file.getTargetIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list");
- Assert.assertEquals(file.getBaitIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.baits.interval_list");
- }
-
- @Test
- public void testParseValidWithComments() throws Exception {
- PicardAnalysisFiles file = new PicardAnalysisFiles(BaseTest.validationDataLocation + "picard_analysis_file_with_comments.txt");
- Assert.assertEquals(file.getReferenceSequence(), "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta");
- Assert.assertEquals(file.getTargetIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list");
- Assert.assertEquals(file.getBaitIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.baits.interval_list");
- }
-
- @Test(expectedExceptions = FileNotFoundException.class)
- public void testParseBadPath() throws Exception {
- new PicardAnalysisFiles(BaseTest.validationDataLocation + "non_existent_picard_analysis_file.txt");
- }
-
- @Test(expectedExceptions = FileNotFoundException.class)
- public void testParseMissingLatest() throws Exception {
- new PicardAnalysisFiles(MISSING_PROJECT, MISSING_SAMPLE);
- }
-
- @Test(expectedExceptions = FileNotFoundException.class)
- public void testParseMissingVersion() throws Exception {
- new PicardAnalysisFiles(PROJECT, SAMPLE, PicardAggregationUtils.getLatestVersion(PROJECT, SAMPLE) + 2);
- }
-
- @Test(expectedExceptions = UnsupportedOperationException.class)
- public void testParseMultipleReferences() throws Exception {
- PicardAnalysisFiles file = new PicardAnalysisFiles(BaseTest.validationDataLocation + "picard_analysis_file_with_different_references.txt");
- file.getReferenceSequence();
- }
-}
diff --git a/public/java/test/org/broadinstitute/sting/utils/broad/PicardPipelineUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/broad/PicardPipelineUnitTest.java
deleted file mode 100755
index 121ca4619..000000000
--- a/public/java/test/org/broadinstitute/sting/utils/broad/PicardPipelineUnitTest.java
+++ /dev/null
@@ -1,71 +0,0 @@
-package org.broadinstitute.sting.utils.broad;
-
-import org.testng.Assert;
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.io.FilenameUtils;
-import org.broadinstitute.sting.BaseTest;
-import org.broadinstitute.sting.datasources.pipeline.Pipeline;
-import org.broadinstitute.sting.datasources.pipeline.PipelineSample;
-import org.broadinstitute.sting.utils.yaml.YamlUtils;
-import org.testng.annotations.Test;
-import static org.broadinstitute.sting.utils.broad.PicardAggregationUtilsUnitTest.*;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Collections;
-
-public class PicardPipelineUnitTest {
- @Test
- public void testParseTsv() throws IOException {
- File tsv = writeTsv(PROJECT, SAMPLE);
- Pipeline pipeline = PicardPipeline.parse(tsv);
- validatePipeline(pipeline, FilenameUtils.getBaseName(tsv.getPath()));
- }
-
- @Test
- public void testParseTsvWithPicardComments() throws Exception {
- File tsv = writeTsv("C460", "HG01359");
- PicardPipeline.parse(tsv);
- }
-
- @Test
- public void testParseYaml() throws IOException {
- File yaml = writeYaml("project_name", PROJECT, SAMPLE);
- Pipeline pipeline = PicardPipeline.parse(yaml);
- validatePipeline(pipeline, "project_name");
- }
-
- private void validatePipeline(Pipeline pipeline, String name) {
- Assert.assertEquals(pipeline.getProject().getName(), name);
- Assert.assertTrue(pipeline.getProject().getReferenceFile().exists(), "reference not found");
- Assert.assertTrue(pipeline.getProject().getIntervalList().exists(), "intervals not found");
- Assert.assertTrue(pipeline.getProject().getRefseqTable().exists(), "refseq not found");
- Assert.assertTrue(pipeline.getProject().getGenotypeDbsnp().exists(), "genotype dbsnp not found");
- Assert.assertTrue(pipeline.getProject().getEvalDbsnp().exists(), "eval dbsnp not found");
- Assert.assertEquals(pipeline.getSamples().size(), 1);
- for (PipelineSample sample: pipeline.getSamples()) {
- Assert.assertEquals(sample.getId(), PROJECT + "_" + SAMPLE);
- Assert.assertTrue(sample.getBamFiles().get(PicardPipeline.PICARD_BAM_TYPE).exists(), "bam not found");
- Assert.assertEquals(sample.getTags().get(PicardPipeline.PROJECT_TAG), PROJECT);
- Assert.assertEquals(sample.getTags().get(PicardPipeline.SAMPLE_TAG), SAMPLE);
- }
- }
-
- private File writeTsv(String project, String sample) throws IOException {
- File tsv = BaseTest.createTempFile("pipeline", ".tsv");
- FileUtils.writeLines(tsv, Collections.singletonList(project + "\t" + sample));
- return tsv;
- }
-
- private File writeYaml(String projectName, String project, String sample) throws IOException {
- File yaml = BaseTest.createTempFile("pipeline", ".yaml");
- PipelineSample pipelineSample = new PipelineSample();
- pipelineSample.getTags().put(PicardPipeline.PROJECT_TAG, project);
- pipelineSample.getTags().put(PicardPipeline.SAMPLE_TAG, sample);
- Pipeline pipeline = new Pipeline();
- pipeline.getProject().setName(projectName);
- pipeline.getSamples().add(pipelineSample);
- YamlUtils.dump(pipeline, yaml);
- return yaml;
- }
-}
diff --git a/public/java/test/org/broadinstitute/sting/utils/broad/ReferenceDataUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/broad/ReferenceDataUnitTest.java
deleted file mode 100755
index e58ea3858..000000000
--- a/public/java/test/org/broadinstitute/sting/utils/broad/ReferenceDataUnitTest.java
+++ /dev/null
@@ -1,49 +0,0 @@
-package org.broadinstitute.sting.utils.broad;
-
-import org.broadinstitute.sting.BaseTest;
-import org.testng.Assert;
-import org.testng.annotations.Test;
-
-import java.io.File;
-
-public class ReferenceDataUnitTest {
- @Test
- public void testNames() {
- Assert.assertEquals(ReferenceData.HG18.getName(), "hg18");
- Assert.assertEquals(ReferenceData.HG19.getName(), "hg19");
- }
-
- @Test
- public void testFilesExist() {
- for (ReferenceData data: ReferenceData.values()) {
- Assert.assertTrue(new File(data.getReference()).exists());
- Assert.assertTrue(new File(data.getRefseq()).exists());
- for (int version: data.getDbsnpVersions()) {
- Assert.assertTrue(new File(data.getDbsnp(version)).exists());
- }
- }
- }
-
- @Test
- public void testDbsnps() {
- Assert.assertTrue(new File(ReferenceData.HG18.getDbsnp(129)).exists());
- Assert.assertTrue(new File(ReferenceData.HG19.getDbsnp(129)).exists());
- Assert.assertTrue(new File(ReferenceData.HG19.getDbsnp(132)).exists());
- Assert.assertNull(ReferenceData.HG19.getDbsnp(130));
- }
-
- @Test
- public void testDbsnpTypes() {
- Assert.assertEquals(ReferenceData.HG18.getDbsnpType(129), "DBSNP");
- Assert.assertEquals(ReferenceData.HG19.getDbsnpType(129), "VCF");
- Assert.assertEquals(ReferenceData.HG19.getDbsnpType(132), "VCF");
- Assert.assertNull(ReferenceData.HG19.getDbsnpType(130));
- }
-
- @Test
- public void testGetByReference() {
- Assert.assertEquals(ReferenceData.getByReference(BaseTest.hg18Reference), ReferenceData.HG18);
- Assert.assertEquals(ReferenceData.getByReference(BaseTest.hg19Reference), ReferenceData.HG19);
- Assert.assertEquals(ReferenceData.getByReference("none"), null);
- }
-}
diff --git a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java
index a60f1b0fe..74cd8d8fe 100644
--- a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java
@@ -34,11 +34,6 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
private static final List QUERY_SIZES = Arrays.asList(1, 10, 100, 1000);
private static final List CACHE_SIZES = Arrays.asList(-1, 10, 1000);
- @BeforeMethod
- public void clearCache() {
- CachingIndexedFastaSequenceFile.resetThreadLocalCache();
- }
-
@DataProvider(name = "fastas")
public Object[][] createData1() {
List