diff --git a/R/plot_Tranches.R b/R/plot_Tranches.R index 343d48e28..858d2ddf3 100755 --- a/R/plot_Tranches.R +++ b/R/plot_Tranches.R @@ -3,7 +3,7 @@ args <- commandArgs(TRUE) verbose = TRUE -input = args[1] +tranchesFile = args[1] targetTITV = as.numeric(args[2]) suppressLegend = ! is.na(args[3]) @@ -37,12 +37,15 @@ leftShift <- function(x, leftValue = 0) { # ----------------------------------------------------------------------------------------------- # Tranches plot # ----------------------------------------------------------------------------------------------- -data2 = read.table(paste(input,".tranches",sep=""),sep=",",head=T) +data2 = read.table(tranchesFile,sep=",",head=T) +data2 = data2[order(data2$FDRtranche, decreasing=T),] cols = c("cornflowerblue", "cornflowerblue", "darkorange", "darkorange") density=c(20, -1, -1, 20) -outfile = paste(input, ".FDRtranches.pdf", sep="") +outfile = paste(tranchesFile, ".pdf", sep="") pdf(outfile, height=5, width=8) -alpha = 1 - titvFPEstV(targetTITV, data2$novelTITV) +par(mar = c(5, 5, 4, 2) + 0.1) +novelTiTv = c(data2$novelTITV,data2$novelTiTv) +alpha = 1 - titvFPEstV(targetTITV, novelTiTv) #print(alpha) numGood = round(alpha * data2$numNovel); @@ -61,26 +64,10 @@ barplot(d/1000,horiz=TRUE,col=cols,space=0.2,xlab="Number of Novel Variants (100 #abline(v= d[2,dim(d)[2]], lty=2) #abline(v= d[1,3], lty=2) if ( ! suppressLegend ) - legend(5, 2.25, c('Cumulative TPs','Tranch-specific TPs', 'Tranch-specific FPs', 'Cumulative FPs' ), fill=cols, density=density, bg='white', cex=1.25) + legend("topright", c('Cumulative TPs','Tranch-specific TPs', 'Tranch-specific FPs', 'Cumulative FPs' ), fill=cols, density=density, bg='white', cex=1.25) mtext("Ti/Tv",2,line=2.25,at=length(data2$FDRtranche)*1.2,las=1, cex=1) mtext("FDR",2,line=0,at=length(data2$FDRtranche)*1.2,las=1, cex=1) -axis(2,line=-1,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=data2$FDRtranche, las=1, cex.axis=1.25) -axis(2,line=1,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=data2$novelTITV, las=1, cex.axis=1.25) +axis(2,line=-1,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=data2$FDRtranche, las=1, cex.axis=1.0) +axis(2,line=1,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=round(novelTiTv,3), las=1, cex.axis=1.0) dev.off() - - -# -#data2 = read.table(paste(input,".tranches",sep=""),sep=",",head=T) -#cols = c("steelblue","orange") -#outfile = paste(input, ".FDRtranches.pdf", sep="") -#pdf(outfile, height=7, width=8) -#alpha = (data2$novelTITV - 0.5) / (targetTITV - 0.5); -#numGood = round(alpha * data2$numNovel); -#numBad = data2$numNovel - numGood; -#d=matrix(c(numGood,numBad),2,byrow=TRUE) -#barplot(d,horiz=TRUE,col=cols,space=0.2,xlab="Number of Novel Variants",ylab="Novel Ti/Tv --> FDR (%)") -#legend('topright',c('implied TP','implied FP'),col=cols,lty=1,lwd=16) -#axis(2,line=-1,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=data2$FDRtranche) -#axis(2,line=0.4,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=data2$novelTITV) -#dev.off() diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyVariantCuts.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyVariantCuts.java index eab6b76b9..d4e0153d6 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyVariantCuts.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyVariantCuts.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.vcf.VCFUtils; import java.io.File; @@ -86,9 +87,9 @@ public class ApplyVariantCuts extends RodWalker { tranches.add(t); //statusMsg = "Keeping, above FDR threshold"; } - logger.info(String.format("Tranche %s with %.2f FDR, TsTv %.2f and pCut %.2f, threshold %.2f", - t.name, t.fdr, t.novelTiTv, t.pCut, FDR_FILTER_LEVEL)); + logger.info(String.format("Read tranche " + t)); } + Collections.reverse(tranches); // this algorithm wants the tranches ordered from worst to best // setup the header fields final Set hInfo = new HashSet(); @@ -104,8 +105,12 @@ public class ApplyVariantCuts extends RodWalker { } if( tranches.size() >= 1 ) { hInfo.add(new VCFFilterHeaderLine(tranches.get(0).name + "+", String.format("FDR tranche level at qual < " + tranches.get(0).pCut))); + } else { + throw new UserException("No tranches were found in the file or were above the FDR Filter level " + FDR_FILTER_LEVEL); } + logger.info("Keeping all variants in tranche " + tranches.get(tranches.size()-1)); + final VCFHeader vcfHeader = new VCFHeader(hInfo, samples); vcfWriter.writeHeader(vcfHeader); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java index 63a7faafb..b4ffbfeff 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java @@ -36,6 +36,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.vcf.VCFUtils; import org.broadinstitute.sting.utils.text.XReadLines; @@ -45,39 +46,69 @@ import java.util.*; /** */ -public class Tranche { - public double fdr, pCut, knownTiTv, novelTiTv; +public class Tranche implements Comparable { + private static final int CURRENT_VERSION = 2; + + public double fdr, pCut, targetTiTv, knownTiTv, novelTiTv; public int numKnown,numNovel; public String name; - public Tranche(double fdr, double pCut, int numKnown, double knownTiTv, int numNovel, double novelTiTv) { - this(fdr, pCut, numKnown, knownTiTv, numNovel, novelTiTv, "anonymous"); + public Tranche(double fdr, double targetTiTv, double pCut, int numKnown, double knownTiTv, int numNovel, double novelTiTv) { + this(fdr, targetTiTv, pCut, numKnown, knownTiTv, numNovel, novelTiTv, "anonymous"); } - public Tranche(double fdr, double pCut, int numKnown, double knownTiTv, int numNovel, double novelTiTv, String name) { + public Tranche(double fdr, double targetTiTv, double pCut, int numKnown, double knownTiTv, int numNovel, double novelTiTv, String name) { this.fdr = fdr; + this.targetTiTv = targetTiTv; this.pCut = pCut; this.novelTiTv = novelTiTv; this.numNovel = numNovel; this.knownTiTv = knownTiTv; this.numKnown = numKnown; this.name = name; + +// if ( targetTiTv < 0.5 || targetTiTv > 10 ) +// throw new UserException("Target Ti/Tv ratio is unreasonable " + targetTiTv); +// +// if ( numKnown < 0 || numNovel < 0) +// throw new ReviewedStingException("Invalid tranch - no. variants is < 0 : known " + numKnown + " novel " + numNovel); + + if ( name == null ) + throw new ReviewedStingException("BUG -- name cannot be null"); + + } + + public int compareTo(Tranche other) { + return Double.compare(this.fdr, other.fdr); } public String toString() { - return String.format("[Tranche cut = %.3f with %d novels @ %.2f]", pCut, numNovel, novelTiTv); + return String.format("Tranche fdr=%.2f minQual=%.2f known=(%d @ %.2f) novel=(%d @ %.2f) name=%s]", + fdr, pCut, numKnown, knownTiTv, numNovel, novelTiTv, name); } - public static String tranchesString(List tranches) { + /** + * Returns an appropriately formated string representing the raw tranches file on disk. + * + * @param rawTranches + * @return + */ + public static String tranchesString(List rawTranches) { ByteArrayOutputStream bytes = new ByteArrayOutputStream(); PrintStream stream = new PrintStream(bytes); - stream.println("FDRtranche,numKnown,numNovel,knownTiTv,novelTiTv,pCut,filterName"); + List tranches = new ArrayList(); + tranches.addAll(rawTranches); + Collections.sort(tranches); + + stream.println("# Variant quality score tranches file"); + stream.println("# Version number " + CURRENT_VERSION); + stream.println("FDRtranche,targetTiTv,numKnown,numNovel,knownTiTv,novelTiTv,pCut,filterName"); Tranche prev = null; for ( Tranche t : tranches ) { - stream.printf("%.2f,%d,%d,%.4f,%.4f,%.4f,FDRtranche%.2fto%.2f%n", - t.fdr,t.numKnown,t.numNovel,t.knownTiTv,t.novelTiTv, t.pCut, + stream.printf("%.2f,%.2f,%d,%d,%.4f,%.4f,%.2f,FDRtranche%.2fto%.2f%n", + t.fdr,t.targetTiTv,t.numKnown,t.numNovel,t.knownTiTv,t.novelTiTv, t.pCut, (prev == null ? 0.0 : prev.fdr), t.fdr); prev = t; } @@ -85,39 +116,65 @@ public class Tranche { return bytes.toString(); } - private static double getDouble(Map bindings, String key) { - return bindings.containsKey(key) ? Double.valueOf(bindings.get(key)) : -1.0; + private static double getDouble(Map bindings, String key, boolean required) { + if ( bindings.containsKey(key) ) + return Double.valueOf(bindings.get(key)); + else if ( required ) { + throw new UserException("Malformed tranches file. Missing required key " + key); + } + else + return -1; } - private static int getInteger(Map bindings, String key) { - return bindings.containsKey(key) ? Integer.valueOf(bindings.get(key)) : -1; + private static int getInteger(Map bindings, String key, boolean required) { + if ( bindings.containsKey(key) ) + return Integer.valueOf(bindings.get(key)); + else if ( required ) { + throw new UserException("Malformed tranches file. Missing required key " + key); + } + else + return -1; } + /** + * Returns a list of tranches, sorted from most to least specific, read in from file f + * + * @param f + * @return + */ public static List readTraches(File f) { String[] header = null; List tranches = new ArrayList(); try { for( final String line : new XReadLines(f) ) { + if ( line.startsWith("#") ) + continue; + final String[] vals = line.split(","); if( header == null ) { header = vals; } else { + if ( header.length != vals.length ) + throw new UserException.MalformedFile(f, "Line had too few/many fields. Header = " + header.length + " vals " + vals.length + " line " + line); + Map bindings = new HashMap(); for ( int i = 0; i < vals.length; i++ ) bindings.put(header[i], vals[i]); - tranches.add(new Tranche(getDouble(bindings,"FDRtranche"), - getDouble(bindings,"pCut"), - getInteger(bindings,"numKnown"), - getDouble(bindings,"knownTiTv"), - getInteger(bindings,"numNovel"), - Math.max(getDouble(bindings,"novelTiTv"), getDouble(bindings,"novelTITV")), + tranches.add(new Tranche(getDouble(bindings,"FDRtranche", true), + getDouble(bindings,"targetTiTv", false), + getDouble(bindings,"pCut", true), + getInteger(bindings,"numKnown", false), + getDouble(bindings,"knownTiTv", false), + getInteger(bindings,"numNovel", true), + Math.max(getDouble(bindings,"novelTiTv", false), getDouble(bindings,"novelTITV", false)), bindings.get("filterName"))); } } + Collections.sort(tranches); // sort this in the standard order return tranches; } catch( FileNotFoundException e ) { - throw new UserException.CouldNotCreateOutputFile(f, e); + throw new UserException.CouldNotReadInputFile(f, e); } } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java index d8f283093..dc2ba54da 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java @@ -32,6 +32,7 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; */ public class VariantDatum implements Comparable { + public int pos; public double[] annotations; public boolean isTransition; public boolean isKnown; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java index e80aae9d3..67376edca 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java @@ -474,6 +474,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel List tranches = new ArrayList(); for ( double fdr : FDRtranches ) { Tranche t = findTranche(tranchesData, runningTiTv, fdr, targetTiTv); + // todo -- should abort early when t's qual is 0 -- that's the lowest we'll get to if ( t == null ) { if ( tranches.size() == 0 ) @@ -514,7 +515,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel for ( int i = 0; i < runningTiTv.length; i++ ) { if ( runningTiTv[i] >= titvThreshold ) { // we've found the largest group of variants with Ti/Tv >= our target titv - return trancheOfVariants(data, i, desiredFDR); + return trancheOfVariants(data, i, desiredFDR, targetTiTv); } } @@ -522,173 +523,36 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel return null; } - public final static Tranche trancheOfVariants( final List data, int minI, double fdr ) { + public final static Tranche trancheOfVariants( final List data, int minI, double fdr, double targetTiTv ) { int numKnown = 0, numNovel = 0, knownTi = 0, knownTv = 0, novelTi = 0, novelTv = 0; - for ( int i = minI; i < data.size(); i++ ) { - VariantDatum datum = data.get(i); - if ( datum.isKnown ) { - numKnown++; - if ( datum.isTransition ) { knownTi++; } else { knownTv++; } - } else { - numNovel++; - if ( datum.isTransition ) { novelTi++; } else { novelTv++; } + double qualThreshold = data.get(minI).qual; + VariantDatum last = null; + for ( VariantDatum datum : data ) { + if ( datum.qual >= qualThreshold ) { + //if( ! datum.isKnown ) System.out.println(datum.pos); + if ( datum.isKnown ) { + numKnown++; + if ( datum.isTransition ) { knownTi++; } else { knownTv++; } + } else { + numNovel++; + if ( datum.isTransition ) { novelTi++; } else { novelTv++; } + } } + last = datum; } double knownTiTv = knownTi / Math.max(1.0 * knownTv, 1.0); double novelTiTv = novelTi / Math.max(1.0 * novelTv, 1.0); - return new Tranche(fdr, data.get(minI).qual, numKnown, knownTiTv, numNovel, novelTiTv); + return new Tranche(fdr, targetTiTv, qualThreshold, numKnown, knownTiTv, numNovel, novelTiTv); } public final static double fdrToTiTv(double desiredFDR, double targetTiTv) { return (1.0 - desiredFDR / 100.0) * (targetTiTv - 0.5) + 0.5; } - public final void outputOptimizationCurve( final VariantDatum[] data, final PrintStream outputReportDatFile, final PrintStream tranchesOutputFile, - final int desiredNumVariants, final Double[] FDRtranches, final double MAX_QUAL ) { - final int numVariants = data.length; - final boolean[] markedVariant = new boolean[numVariants]; - - int NUM_BINS = 400 * 2; - double QUAL_STEP1 = (MAX_QUAL * 9.0 / 10.0 ) / ((double) NUM_BINS / 2.0); - if( QUAL_STEP1 < 0.01 ) { QUAL_STEP1 = 0.01; } // QUAL field in VCF file is rounded to two decimal places - double QUAL_STEP2 = (MAX_QUAL * 1.0 / 10.0) / ((double) NUM_BINS / 2.0); - if( QUAL_STEP2 < 0.01 ) { QUAL_STEP2 = 0.01; } // QUAL field in VCF file is rounded to two decimal places - - final int numKnownAtCut[] = new int[NUM_BINS+2]; - final int numNovelAtCut[] = new int[NUM_BINS+2]; - final double knownTiTvAtCut[] = new double[NUM_BINS+2]; - final double novelTiTvAtCut[] = new double[NUM_BINS+2]; - final double theCut[] = new double[NUM_BINS+2]; - - final double fdrCutAsTiTv[] = new double[FDRtranches.length]; - for( int iii = 0; iii < FDRtranches.length; iii++ ) { - fdrCutAsTiTv[iii] = (1.0 - FDRtranches[iii] / 100.0) * (targetTITV - 0.5) + 0.5; - } - - for( int iii = 0; iii < numVariants; iii++ ) { - markedVariant[iii] = false; - } - - tranchesOutputFile.println("FDRtranche,novelTITV,pCut,numNovel,filterName"); - - int numKnown = 0; - int numNovel = 0; - int numKnownTi = 0; - int numKnownTv = 0; - int numNovelTi = 0; - int numNovelTv = 0; - boolean foundDesiredNumVariants = false; - int jjj = 0; - outputReportDatFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV"); - double qCut = MAX_QUAL; - - while( qCut >= 0.0 - 1E-2 + 1E-8 ) { - if( qCut < 1E-2 ) { qCut = 0.0; } - - for( int iii = 0; iii < numVariants; iii++ ) { - if( !markedVariant[iii] ) { - if( data[iii].qual >= qCut ) { - markedVariant[iii] = true; - if( data[iii].isKnown ) { // known - numKnown++; - if( data[iii].isTransition ) { // transition - numKnownTi++; - } else { // transversion - numKnownTv++; - } - } else { // novel - numNovel++; - if( data[iii].isTransition ) { // transition - numNovelTi++; - } else { // transversion - numNovelTv++; - } - } - } - } - } - if( desiredNumVariants != 0 && !foundDesiredNumVariants && (numKnown + numNovel) >= desiredNumVariants ) { - logger.info( "Keeping variants with QUAL >= " + String.format("%.2f",qCut) + " results in a filtered set with: " ); - logger.info("\t" + numKnown + " known variants"); - logger.info("\t" + numNovel + " novel variants, (dbSNP rate = " + String.format("%.2f",((double) numKnown * 100.0) / ((double) numKnown + numNovel) ) + "%)"); - logger.info("\t" + String.format("%.4f known Ti/Tv ratio", ((double)numKnownTi) / ((double)numKnownTv))); - logger.info("\t" + String.format("%.4f novel Ti/Tv ratio", ((double)numNovelTi) / ((double)numNovelTv))); - foundDesiredNumVariants = true; - } - outputReportDatFile.println( String.format("%.2f,%d,%d,%.4f,%.4f", qCut, numKnown, numNovel, - ( numKnownTv == 0 ? 0.0 : ( ((double)numKnownTi) / ((double)numKnownTv) ) ), - ( numNovelTv == 0 ? 0.0 : ( ((double)numNovelTi) / ((double)numNovelTv) ) ))); - - numKnownAtCut[jjj] = numKnown; - numNovelAtCut[jjj] = numNovel; - knownTiTvAtCut[jjj] = ( numKnownTi == 0 || numKnownTv == 0 ? 0.0 : ( ((double)numKnownTi) / ((double)numKnownTv) ) ); - novelTiTvAtCut[jjj] = ( numNovelTi == 0 || numNovelTv == 0 ? 0.0 : ( ((double)numNovelTi) / ((double)numNovelTv) ) ); - theCut[jjj] = qCut; - jjj++; - - if( qCut >= (MAX_QUAL / 10.0) ) { - qCut -= QUAL_STEP1; - } else { - qCut -= QUAL_STEP2; - } - } - - // loop back through the data points looking for appropriate places to cut the data to get the target novel titv ratio - int checkQuantile = 0; - int tranche = FDRtranches.length - 1; - for( ; jjj >= 0; jjj-- ) { - - if( tranche >= 0 && novelTiTvAtCut[jjj] >= fdrCutAsTiTv[tranche] ) { - tranchesOutputFile.println(String.format("%.2f,%.4f,%.4f,%d,FDRtranche%.2fto%.2f", - FDRtranches[tranche],novelTiTvAtCut[jjj],theCut[jjj],numNovelAtCut[jjj], - (tranche == 0 ? 0.0 : FDRtranches[tranche-1]) ,FDRtranches[tranche])); - tranche--; - } - - boolean foundCut = false; - if( checkQuantile == 0 ) { - if( novelTiTvAtCut[jjj] >= 0.9 * targetTITV ) { - foundCut = true; - checkQuantile++; - } - } else if( checkQuantile == 1 ) { - if( novelTiTvAtCut[jjj] >= 0.95 * targetTITV ) { - foundCut = true; - checkQuantile++; - } - } else if( checkQuantile == 2 ) { - if( novelTiTvAtCut[jjj] >= 0.98 * targetTITV ) { - foundCut = true; - checkQuantile++; - } - } else if( checkQuantile == 3 ) { - if( novelTiTvAtCut[jjj] >= targetTITV ) { - foundCut = true; - checkQuantile++; - } - } else if( checkQuantile == 4 ) { - break; // break out - } - - if( foundCut ) { - logger.info( "Keeping variants with QUAL >= " + String.format("%.2f",theCut[jjj]) + " results in a filtered set with: " ); - logger.info("\t" + numKnownAtCut[jjj] + " known variants"); - logger.info("\t" + numNovelAtCut[jjj] + " novel variants, (dbSNP rate = " + - String.format("%.2f",((double) numKnownAtCut[jjj] * 100.0) / ((double) numKnownAtCut[jjj] + numNovelAtCut[jjj]) ) + "%)"); - logger.info("\t" + String.format("%.4f known Ti/Tv ratio", knownTiTvAtCut[jjj])); - logger.info("\t" + String.format("%.4f novel Ti/Tv ratio", novelTiTvAtCut[jjj])); - logger.info("\t" + String.format("--> with an implied novel FDR of %.2f percent", Math.abs(100.0 * (1.0-((novelTiTvAtCut[jjj] - 0.5) / (targetTITV - 0.5)))))); - } - } - if( tranche >= 0 ) { // Didn't find all the tranches - throw new UserException.BadInput("Couldn't find appropriate cuts for all the requested tranches. Please ask for fewer tranches with higher false discovery rates using the --FDRtranche argument"); - } - } - private double evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster, final int startCluster, final int stopCluster ) { final int numAnnotations = data[0].annotations.length; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index b20a1eacc..defe1ceb4 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -68,9 +68,7 @@ public class VariantRecalibrator extends RodWalker maxQualObserved ) { maxQualObserved = variantDatum.qual; } @@ -329,25 +333,17 @@ public class VariantRecalibrator extends RodWalker tranches = VariantGaussianMixtureModel.findTranches( dataManager.data, FDR_TRANCHES, TARGET_TITV ); + tranchesStream.print(Tranche.tranchesString(tranches)); // Execute Rscript command to plot the optimization curve // Print out the command line to make it clear to the user what is being executed and how one might modify it - final String rScriptOptimizationCurveCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + REPORT_DAT_FILE.getName() + " " + TARGET_TITV; - final String rScriptTranchesCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Tranches.R" + " " + REPORT_DAT_FILE.getName() + " " + TARGET_TITV; - logger.info( rScriptOptimizationCurveCommandLine ); + final String rScriptTranchesCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Tranches.R" + " " + TRANCHES_FILE.getAbsolutePath() + " " + TARGET_TITV; logger.info( rScriptTranchesCommandLine ); // Execute the RScript command to plot the table of truth values try { Process p; - p = Runtime.getRuntime().exec( rScriptOptimizationCurveCommandLine ); - p.waitFor(); p = Runtime.getRuntime().exec( rScriptTranchesCommandLine ); p.waitFor(); } catch ( Exception e ) { diff --git a/java/test/org/broadinstitute/sting/WalkerTest.java b/java/test/org/broadinstitute/sting/WalkerTest.java index 22a5180d5..9349fe0b4 100755 --- a/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/java/test/org/broadinstitute/sting/WalkerTest.java @@ -71,7 +71,7 @@ public class WalkerTest extends BaseTest { } } - private static File getFileForMD5(final String md5) { + protected static File getFileForMD5(final String md5) { final String basename = String.format("%s.integrationtest", md5); return new File(MD5_FILE_DB_SUBDIR + "/" + basename); } @@ -208,15 +208,23 @@ public class WalkerTest extends BaseTest { protected Map auxillaryFiles = new HashMap(); + public WalkerTestSpec(String args, List md5s) { + this(args, -1, md5s); + } + public WalkerTestSpec(String args, int nOutputFiles, List md5s) { this.args = args; - this.nOutputFiles = nOutputFiles; + this.nOutputFiles = md5s.size(); this.md5s = md5s; } + public WalkerTestSpec(String args, List exts, List md5s) { + this(args, -1, exts, md5s); + } + public WalkerTestSpec(String args, int nOutputFiles, List exts, List md5s) { this.args = args; - this.nOutputFiles = nOutputFiles; + this.nOutputFiles = md5s.size(); this.md5s = md5s; this.exts = exts; } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 3b39767b1..9f84a3e65 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -7,8 +7,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.Map; -public class - VariantEvalIntegrationTest extends WalkerTest { +public class VariantEvalIntegrationTest extends WalkerTest { private static String cmdRoot = "-T VariantEval" + " -R " + b36KGReference; diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModelUnitTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModelUnitTest.java index 075af3006..b0933735c 100644 --- a/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModelUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModelUnitTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The Broad Institute + * Copyright (c) 2010, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -12,15 +12,14 @@ * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. */ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; @@ -34,6 +33,7 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.BaseTest; import org.testng.annotations.Test; @@ -58,35 +58,93 @@ public final class VariantGaussianMixtureModelUnitTest extends BaseTest { private static int N_VARIANTS = 100; VariantDatum[] variantData1 = new VariantDatum[N_VARIANTS]; - @BeforeTest(enabled=false) - public void beforeTest() { - for ( int i = 0; i < N_VARIANTS; i++ ) { - variantData1[i].isKnown = i % 2 == 0; // every other is know - variantData1[i].qual = (N_VARIANTS - i) * 1.0; + private final File QUAL_DATA = new File(testDir + "tranches.raw.dat"); + private final double[] FDRS = new double[]{0.1, 1, 10, 100}; + private final double TARGET_TITV = 2.8; + private final File EXPECTED_TRANCHES_NEW = new File(testDir + "tranches.6.txt"); + private final File EXPECTED_TRANCHES_OLD = new File(testDir + "tranches.4.txt"); + + private List readData() { + List vd = new ArrayList(); + try { + for ( String line : new XReadLines(QUAL_DATA, true) ) { + String[] parts = line.split(" "); + if ( ! parts[0].equals("QUAL") ) { + VariantDatum datum = new VariantDatum(); + datum.qual = Double.valueOf(parts[0]); + datum.isTransition = parts[1].equals("1"); + datum.isKnown = parts[2].equals("1"); + vd.add(datum); + } + } + } catch (FileNotFoundException e) { + throw new StingException("foo", e); } - // first 25 are tv, 25-75 are 50/50, and 75+ are all transitions - int i = 0; - for ( ; i < (N_VARIANTS * 0.25); i++ ) { variantData1[i].isTransition = true; } - for ( ; i < (N_VARIANTS * 0.75); i++ ) { variantData1[i].isTransition = i % 2 == 0; } - for ( ; i < N_VARIANTS; i++ ) { variantData1[i].isTransition = false; } + return vd; } - @Test(enabled=false) + @Test(expectedExceptions = {UserException.MalformedFile.class}) + public final void readBadFormat() { + Tranche.readTraches(QUAL_DATA); + } + + @Test + public final void readNewFormat() { + read(EXPECTED_TRANCHES_NEW); + } + + @Test + public final void readOldFormat() { + read(EXPECTED_TRANCHES_OLD); + } + + public final List read(File f) { + return Tranche.readTraches(f); + } + + @Test + public final void testNewAndOldAretheSame() { + List newFormat = read(EXPECTED_TRANCHES_NEW); + List oldFormat = read(EXPECTED_TRANCHES_OLD); + assertTranchesAreTheSame(newFormat, oldFormat, false, true); + } + + private static void assertTranchesAreTheSame(List newFormat, List oldFormat, boolean completeP, boolean includeName) { + Assert.assertEquals(oldFormat.size(), newFormat.size()); + for ( int i = 0; i < newFormat.size(); i++ ) { + Tranche n = newFormat.get(i); + Tranche o = oldFormat.get(i); + Assert.assertEquals(n.fdr, o.fdr, 1e-3); + Assert.assertEquals(n.numNovel, o.numNovel); + Assert.assertEquals(n.novelTiTv, o.novelTiTv, 1e-3); + if ( includeName ) + Assert.assertEquals(n.name, o.name); + if ( completeP ) { + Assert.assertEquals(n.numKnown, o.numKnown); + Assert.assertEquals(n.knownTiTv, o.knownTiTv, 1e-3); + Assert.assertEquals(n.targetTiTv, o.targetTiTv, 1e-3); + } + } + } + + @Test public final void testFindTranches1() { - List tranches = VariantGaussianMixtureModel.findTranches(variantData1, new double[]{0.1, 20}, 2.0); - Assert.assertEquals( tranches.size(), 2 ); - - Tranche t1 = tranches.get(0); - Assert.assertEquals( t1.fdr, 0.1 ); - Assert.assertEquals( t1.pCut, 26 ); - Assert.assertEquals( t1.numKnown, 37 ); - Assert.assertEquals( t1.numNovel, 37 ); - - Tranche t2 = tranches.get(1); - Assert.assertEquals( t2.fdr, 20 ); - Assert.assertEquals( t2.pCut, 21 ); - Assert.assertEquals( t2.numKnown, 37 ); - Assert.assertEquals( t2.numNovel, 37 ); + List vd = readData(); + List tranches = VariantGaussianMixtureModel.findTranches(vd.toArray(new VariantDatum[0]), FDRS, TARGET_TITV); + System.out.printf(Tranche.tranchesString(tranches)); + assertTranchesAreTheSame(read(EXPECTED_TRANCHES_NEW), tranches, true, false); } + +// @Test(expectedExceptions = {UserException.class}) +// public final void testBadFDR() { +// List vd = readData(); +// VariantGaussianMixtureModel.findTranches(vd.toArray(new VariantDatum[0]), new double[]{-1}, TARGET_TITV); +// } +// +// @Test(expectedExceptions = {UserException.class}) +// public final void testBadTargetTiTv() { +// List vd = readData(); +// VariantGaussianMixtureModel.findTranches(vd.toArray(new VariantDatum[0]), FDRS, 0.1); +// } } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index 8e2768aa3..52f6958ee 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; import java.util.*; import java.io.File; @@ -12,130 +13,112 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { static HashMap tranchesFiles = new HashMap(); static HashMap inputVCFFiles = new HashMap(); - @Test(enabled = true) - public void testGenerateVariantClusters() { - HashMap e = new HashMap(); - e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "ab2629d67e378fd3aceb8318f0fbfe04" ); - e.put( validationDataLocation + "lowpass.N3.chr1.raw.vcf", "725489156426e4ddd8d623ab3d4b1023" ); - - for ( Map.Entry entry : e.entrySet() ) { - String vcf = entry.getKey(); - String md5 = entry.getValue(); - - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b36KGReference + - " -NO_HEADER" + - " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + - " -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/genotypes_r27_nr.b36_fwd.vcf" + - " -weightDBSNP 0.2 -weightHapMap 1.0" + - " -T GenerateVariantClusters" + - " -B:input,VCF " + vcf + - " -L 1:50,000,000-200,000,000" + - " -qual 50.0" + - " --ignore_filter GATK_STANDARD" + - " -an QD -an HRun -an SB" + - " -clusterFile %s", - 1, // just one output file - Arrays.asList(md5)); - List result = executeTest("testGenerateVariantClusters", spec).getFirst(); - clusterFiles.put(vcf, result.get(0).getAbsolutePath()); + private static class VRTest { + String inVCF; + String clusterMD5; + String tranchesMD5; + String recalVCFMD5; + String cutVCFMD5; + public VRTest(String inVCF, String clusterMD5, String tranchesMD5, String recalVCFMD5, String cutVCFMD5) { + this.inVCF = validationDataLocation + inVCF; + this.clusterMD5 = clusterMD5; + this.tranchesMD5 = tranchesMD5; + this.recalVCFMD5 = recalVCFMD5; + this.cutVCFMD5 = cutVCFMD5; } } - @Test(enabled = true) - public void testVariantRecalibrator() { - HashMap> e = new HashMap>(); - e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", - Arrays.asList("4e893672230fca625f70b0491f3b36cb", "40b3fb6632304cebc56a9ed5853cc72e","0cfbeb9f2db7edc1d69b5b43ea17670c")); // Each test checks the md5 of three output files - e.put( validationDataLocation + "lowpass.N3.chr1.raw.vcf", - Arrays.asList("d52e4f511c9c00f8c21dffea81c47103", "780920fcfa3009c66ebcf18265edaa75","010023ddb95fac071e0f208e6bb40c61")); // Each test checks the md5 of three output files + VRTest yriTrio = new VRTest("yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", + "ab2629d67e378fd3aceb8318f0fbfe04", // in vcf + "4dd95e9d8e5d21a6ab73de67d9663492", // tranches + "4e893672230fca625f70b0491f3b36cb", // recalVCF + "371b0e2796982485ae050e46892f6660"); // cut VCF - for ( Map.Entry> entry : e.entrySet() ) { - String vcf = entry.getKey(); - List md5s = entry.getValue(); - String clusterFile = clusterFiles.get(vcf); - System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); - if ( clusterFile != null ) { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b36KGReference + - " -NO_HEADER" + - " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + - " -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/genotypes_r27_nr.b36_fwd.vcf" + - " -T VariantRecalibrator" + - " -B:input,VCF " + vcf + - " -L 1:20,000,000-100,000,000" + - " --ignore_filter GATK_STANDARD" + - " --ignore_filter HARD_TO_VALIDATE" + - " -clusterFile " + clusterFile + - " -titv 2.07" + - " -o %s" + - " -tranchesFile %s" + - " -reportDatFile %s", - 3, // three output files - md5s); - List result = executeTest("testVariantRecalibrator", spec).getFirst(); - inputVCFFiles.put(vcf, result.get(0).getAbsolutePath()); - tranchesFiles.put(vcf, result.get(1).getAbsolutePath()); - } - } + VRTest lowPass = new VRTest("lowpass.N3.chr1.raw.vcf", + "725489156426e4ddd8d623ab3d4b1023", // in vcf + "dfc07132f592a811d0d6c25a4cb67a09", // tranches + "d52e4f511c9c00f8c21dffea81c47103", // recalVCF + "39716b2f03b50e88855d7975dd1b9b3e"); // cut VCF + + @DataProvider(name = "VRTest") + public Object[][] createData1() { + return new Object[][]{ {yriTrio}, {lowPass} }; } - @Test - public void testApplyVariantCuts() { - HashMap e = new HashMap(); - e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "646805c4619a15d3f9f4976b96b89ecd" ); - e.put( validationDataLocation + "lowpass.N3.chr1.raw.vcf", "df9e37af16610ccd44d375eae2c4479c" ); + @Test(dataProvider = "VRTest", enabled = true) + public void testGenerateVariantClusters(VRTest params) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b36KGReference + + " -NO_HEADER" + + " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + + " -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/genotypes_r27_nr.b36_fwd.vcf" + + " -weightDBSNP 0.2 -weightHapMap 1.0" + + " -T GenerateVariantClusters" + + " -B:input,VCF " + params.inVCF + + " -L 1:50,000,000-200,000,000" + + " -qual 50.0" + + " --ignore_filter GATK_STANDARD" + + " -an QD -an HRun -an SB" + + " -clusterFile %s", + Arrays.asList(params.clusterMD5)); + executeTest("testGenerateVariantClusters-"+params.inVCF, spec).getFirst(); + } - for ( Map.Entry entry : e.entrySet() ) { - String vcf = entry.getKey(); - String md5 = entry.getValue(); - String inputVCFFile = inputVCFFiles.get(vcf); - String tranchesFile = tranchesFiles.get(vcf); - System.out.printf("PARAMS FOR %s is %s%n", vcf, inputVCFFile); - System.out.printf("PARAMS FOR %s is %s%n", vcf, tranchesFile); - if ( inputVCFFile != null && tranchesFile != null ) { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b36KGReference + - " -NO_HEADER" + - " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + - " -T ApplyVariantCuts" + - " -L 1:20,000,000-100,000,000" + - " -B:input,VCF " + inputVCFFile + - " -o %s" + - " -tranchesFile " + tranchesFile, - 1, // just one output file - Arrays.asList(md5)); - executeTest("testApplyVariantCuts", spec); - } - } + @Test(dataProvider = "VRTest", enabled = true) + public void testVariantRecalibrator(VRTest params) { + //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b36KGReference + + " -NO_HEADER" + + " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + + " -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" + + " -T VariantRecalibrator" + + " -B:input,VCF " + params.inVCF + + " -L 1:20,000,000-100,000,000" + + " --ignore_filter GATK_STANDARD" + + " --ignore_filter HARD_TO_VALIDATE" + + " -clusterFile " + getFileForMD5(params.clusterMD5) + + " -titv 2.07" + + " -o %s" + + " -tranchesFile %s", + Arrays.asList(params.recalVCFMD5, params.tranchesMD5)); + executeTest("testVariantRecalibrator-"+params.inVCF, spec).getFirst(); + } + + @Test(dataProvider = "VRTest", enabled = true) + public void testApplyVariantCuts(VRTest params) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b36KGReference + + " -NO_HEADER" + + " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + + " -T ApplyVariantCuts" + + " -L 1:20,000,000-100,000,000" + + " -B:input,VCF " + getFileForMD5(params.recalVCFMD5) + + " -o %s" + + " -tranchesFile " + getFileForMD5(params.tranchesMD5), + Arrays.asList(params.cutVCFMD5)); + executeTest("testApplyVariantCuts-"+params.inVCF, spec); } - @Test + @Test() public void testFailWithBadAnnotation() { - HashMap e = new HashMap(); - e.put( validationDataLocation + "lowpass.N3.chr1.raw.vcf", "" ); - - for ( Map.Entry entry : e.entrySet() ) { - String vcf = entry.getKey(); - - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b36KGReference + - " -NO_HEADER" + - " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + - " -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/genotypes_r27_nr.b36_fwd.vcf" + - " -weightDBSNP 0.2 -weightHapMap 1.0" + - " -T GenerateVariantClusters" + - " -B:input,VCF " + vcf + - " -L 1:50,000,000-200,000,000" + - " -qual 50.0" + - " --ignore_filter GATK_STANDARD" + - " -an QD -an HRun -an ThisAnnotationIsBAD" + // There is a bad annotation here - " -clusterFile %s", - 1, // just one output file - UserException.MalformedFile.class); - executeTest("testFailWithBadAnnotation", spec); - } + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b36KGReference + + " -NO_HEADER" + + " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + + " -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" + + " -weightDBSNP 0.2 -weightHapMap 1.0" + + " -T GenerateVariantClusters" + + " -B:input,VCF " + lowPass.inVCF + + " -L 1:50,000,000-200,000,000" + + " -qual 50.0" + + " --ignore_filter GATK_STANDARD" + + " -an QD -an HRun -an ThisAnnotationIsBAD" + // There is a bad annotation here + " -clusterFile %s", + 1, // just one output file + UserException.MalformedFile.class); + executeTest("testFailWithBadAnnotation", spec); } }