New version of cutting routines for VQSR. Old code removed. Working unit tests. Best practice with testng integration test (everyone look at it). Walker test now allows you to not specify no. input files, if it can infer input counts from MD5s
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4664 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
62a106ca5a
commit
44d0cb6cde
|
|
@ -3,7 +3,7 @@
|
|||
args <- commandArgs(TRUE)
|
||||
verbose = TRUE
|
||||
|
||||
input = args[1]
|
||||
tranchesFile = args[1]
|
||||
targetTITV = as.numeric(args[2])
|
||||
suppressLegend = ! is.na(args[3])
|
||||
|
||||
|
|
@ -37,12 +37,15 @@ leftShift <- function(x, leftValue = 0) {
|
|||
# -----------------------------------------------------------------------------------------------
|
||||
# Tranches plot
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
data2 = read.table(paste(input,".tranches",sep=""),sep=",",head=T)
|
||||
data2 = read.table(tranchesFile,sep=",",head=T)
|
||||
data2 = data2[order(data2$FDRtranche, decreasing=T),]
|
||||
cols = c("cornflowerblue", "cornflowerblue", "darkorange", "darkorange")
|
||||
density=c(20, -1, -1, 20)
|
||||
outfile = paste(input, ".FDRtranches.pdf", sep="")
|
||||
outfile = paste(tranchesFile, ".pdf", sep="")
|
||||
pdf(outfile, height=5, width=8)
|
||||
alpha = 1 - titvFPEstV(targetTITV, data2$novelTITV)
|
||||
par(mar = c(5, 5, 4, 2) + 0.1)
|
||||
novelTiTv = c(data2$novelTITV,data2$novelTiTv)
|
||||
alpha = 1 - titvFPEstV(targetTITV, novelTiTv)
|
||||
#print(alpha)
|
||||
|
||||
numGood = round(alpha * data2$numNovel);
|
||||
|
|
@ -61,26 +64,10 @@ barplot(d/1000,horiz=TRUE,col=cols,space=0.2,xlab="Number of Novel Variants (100
|
|||
#abline(v= d[2,dim(d)[2]], lty=2)
|
||||
#abline(v= d[1,3], lty=2)
|
||||
if ( ! suppressLegend )
|
||||
legend(5, 2.25, c('Cumulative TPs','Tranch-specific TPs', 'Tranch-specific FPs', 'Cumulative FPs' ), fill=cols, density=density, bg='white', cex=1.25)
|
||||
legend("topright", c('Cumulative TPs','Tranch-specific TPs', 'Tranch-specific FPs', 'Cumulative FPs' ), fill=cols, density=density, bg='white', cex=1.25)
|
||||
|
||||
mtext("Ti/Tv",2,line=2.25,at=length(data2$FDRtranche)*1.2,las=1, cex=1)
|
||||
mtext("FDR",2,line=0,at=length(data2$FDRtranche)*1.2,las=1, cex=1)
|
||||
axis(2,line=-1,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=data2$FDRtranche, las=1, cex.axis=1.25)
|
||||
axis(2,line=1,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=data2$novelTITV, las=1, cex.axis=1.25)
|
||||
axis(2,line=-1,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=data2$FDRtranche, las=1, cex.axis=1.0)
|
||||
axis(2,line=1,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=round(novelTiTv,3), las=1, cex.axis=1.0)
|
||||
dev.off()
|
||||
|
||||
|
||||
#
|
||||
#data2 = read.table(paste(input,".tranches",sep=""),sep=",",head=T)
|
||||
#cols = c("steelblue","orange")
|
||||
#outfile = paste(input, ".FDRtranches.pdf", sep="")
|
||||
#pdf(outfile, height=7, width=8)
|
||||
#alpha = (data2$novelTITV - 0.5) / (targetTITV - 0.5);
|
||||
#numGood = round(alpha * data2$numNovel);
|
||||
#numBad = data2$numNovel - numGood;
|
||||
#d=matrix(c(numGood,numBad),2,byrow=TRUE)
|
||||
#barplot(d,horiz=TRUE,col=cols,space=0.2,xlab="Number of Novel Variants",ylab="Novel Ti/Tv --> FDR (%)")
|
||||
#legend('topright',c('implied TP','implied FP'),col=cols,lty=1,lwd=16)
|
||||
#axis(2,line=-1,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=data2$FDRtranche)
|
||||
#axis(2,line=0.4,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=data2$novelTITV)
|
||||
#dev.off()
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -86,9 +87,9 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
|
|||
tranches.add(t);
|
||||
//statusMsg = "Keeping, above FDR threshold";
|
||||
}
|
||||
logger.info(String.format("Tranche %s with %.2f FDR, TsTv %.2f and pCut %.2f, threshold %.2f",
|
||||
t.name, t.fdr, t.novelTiTv, t.pCut, FDR_FILTER_LEVEL));
|
||||
logger.info(String.format("Read tranche " + t));
|
||||
}
|
||||
Collections.reverse(tranches); // this algorithm wants the tranches ordered from worst to best
|
||||
|
||||
// setup the header fields
|
||||
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
|
|
@ -104,8 +105,12 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
if( tranches.size() >= 1 ) {
|
||||
hInfo.add(new VCFFilterHeaderLine(tranches.get(0).name + "+", String.format("FDR tranche level at qual < " + tranches.get(0).pCut)));
|
||||
} else {
|
||||
throw new UserException("No tranches were found in the file or were above the FDR Filter level " + FDR_FILTER_LEVEL);
|
||||
}
|
||||
|
||||
logger.info("Keeping all variants in tranche " + tranches.get(tranches.size()-1));
|
||||
|
||||
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
|
||||
vcfWriter.writeHeader(vcfHeader);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
|||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
|
|
@ -45,39 +46,69 @@ import java.util.*;
|
|||
/**
|
||||
*/
|
||||
|
||||
public class Tranche {
|
||||
public double fdr, pCut, knownTiTv, novelTiTv;
|
||||
public class Tranche implements Comparable<Tranche> {
|
||||
private static final int CURRENT_VERSION = 2;
|
||||
|
||||
public double fdr, pCut, targetTiTv, knownTiTv, novelTiTv;
|
||||
public int numKnown,numNovel;
|
||||
public String name;
|
||||
|
||||
public Tranche(double fdr, double pCut, int numKnown, double knownTiTv, int numNovel, double novelTiTv) {
|
||||
this(fdr, pCut, numKnown, knownTiTv, numNovel, novelTiTv, "anonymous");
|
||||
public Tranche(double fdr, double targetTiTv, double pCut, int numKnown, double knownTiTv, int numNovel, double novelTiTv) {
|
||||
this(fdr, targetTiTv, pCut, numKnown, knownTiTv, numNovel, novelTiTv, "anonymous");
|
||||
}
|
||||
|
||||
public Tranche(double fdr, double pCut, int numKnown, double knownTiTv, int numNovel, double novelTiTv, String name) {
|
||||
public Tranche(double fdr, double targetTiTv, double pCut, int numKnown, double knownTiTv, int numNovel, double novelTiTv, String name) {
|
||||
this.fdr = fdr;
|
||||
this.targetTiTv = targetTiTv;
|
||||
this.pCut = pCut;
|
||||
this.novelTiTv = novelTiTv;
|
||||
this.numNovel = numNovel;
|
||||
this.knownTiTv = knownTiTv;
|
||||
this.numKnown = numKnown;
|
||||
this.name = name;
|
||||
|
||||
// if ( targetTiTv < 0.5 || targetTiTv > 10 )
|
||||
// throw new UserException("Target Ti/Tv ratio is unreasonable " + targetTiTv);
|
||||
//
|
||||
// if ( numKnown < 0 || numNovel < 0)
|
||||
// throw new ReviewedStingException("Invalid tranch - no. variants is < 0 : known " + numKnown + " novel " + numNovel);
|
||||
|
||||
if ( name == null )
|
||||
throw new ReviewedStingException("BUG -- name cannot be null");
|
||||
|
||||
}
|
||||
|
||||
public int compareTo(Tranche other) {
|
||||
return Double.compare(this.fdr, other.fdr);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("[Tranche cut = %.3f with %d novels @ %.2f]", pCut, numNovel, novelTiTv);
|
||||
return String.format("Tranche fdr=%.2f minQual=%.2f known=(%d @ %.2f) novel=(%d @ %.2f) name=%s]",
|
||||
fdr, pCut, numKnown, knownTiTv, numNovel, novelTiTv, name);
|
||||
}
|
||||
|
||||
public static String tranchesString(List<Tranche> tranches) {
|
||||
/**
|
||||
* Returns an appropriately formated string representing the raw tranches file on disk.
|
||||
*
|
||||
* @param rawTranches
|
||||
* @return
|
||||
*/
|
||||
public static String tranchesString(List<Tranche> rawTranches) {
|
||||
ByteArrayOutputStream bytes = new ByteArrayOutputStream();
|
||||
PrintStream stream = new PrintStream(bytes);
|
||||
|
||||
stream.println("FDRtranche,numKnown,numNovel,knownTiTv,novelTiTv,pCut,filterName");
|
||||
List<Tranche> tranches = new ArrayList<Tranche>();
|
||||
tranches.addAll(rawTranches);
|
||||
Collections.sort(tranches);
|
||||
|
||||
stream.println("# Variant quality score tranches file");
|
||||
stream.println("# Version number " + CURRENT_VERSION);
|
||||
stream.println("FDRtranche,targetTiTv,numKnown,numNovel,knownTiTv,novelTiTv,pCut,filterName");
|
||||
|
||||
Tranche prev = null;
|
||||
for ( Tranche t : tranches ) {
|
||||
stream.printf("%.2f,%d,%d,%.4f,%.4f,%.4f,FDRtranche%.2fto%.2f%n",
|
||||
t.fdr,t.numKnown,t.numNovel,t.knownTiTv,t.novelTiTv, t.pCut,
|
||||
stream.printf("%.2f,%.2f,%d,%d,%.4f,%.4f,%.2f,FDRtranche%.2fto%.2f%n",
|
||||
t.fdr,t.targetTiTv,t.numKnown,t.numNovel,t.knownTiTv,t.novelTiTv, t.pCut,
|
||||
(prev == null ? 0.0 : prev.fdr), t.fdr);
|
||||
prev = t;
|
||||
}
|
||||
|
|
@ -85,39 +116,65 @@ public class Tranche {
|
|||
return bytes.toString();
|
||||
}
|
||||
|
||||
private static double getDouble(Map<String,String> bindings, String key) {
|
||||
return bindings.containsKey(key) ? Double.valueOf(bindings.get(key)) : -1.0;
|
||||
private static double getDouble(Map<String,String> bindings, String key, boolean required) {
|
||||
if ( bindings.containsKey(key) )
|
||||
return Double.valueOf(bindings.get(key));
|
||||
else if ( required ) {
|
||||
throw new UserException("Malformed tranches file. Missing required key " + key);
|
||||
}
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
||||
private static int getInteger(Map<String,String> bindings, String key) {
|
||||
return bindings.containsKey(key) ? Integer.valueOf(bindings.get(key)) : -1;
|
||||
private static int getInteger(Map<String,String> bindings, String key, boolean required) {
|
||||
if ( bindings.containsKey(key) )
|
||||
return Integer.valueOf(bindings.get(key));
|
||||
else if ( required ) {
|
||||
throw new UserException("Malformed tranches file. Missing required key " + key);
|
||||
}
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a list of tranches, sorted from most to least specific, read in from file f
|
||||
*
|
||||
* @param f
|
||||
* @return
|
||||
*/
|
||||
public static List<Tranche> readTraches(File f) {
|
||||
String[] header = null;
|
||||
List<Tranche> tranches = new ArrayList<Tranche>();
|
||||
|
||||
try {
|
||||
for( final String line : new XReadLines(f) ) {
|
||||
if ( line.startsWith("#") )
|
||||
continue;
|
||||
|
||||
final String[] vals = line.split(",");
|
||||
if( header == null ) {
|
||||
header = vals;
|
||||
} else {
|
||||
if ( header.length != vals.length )
|
||||
throw new UserException.MalformedFile(f, "Line had too few/many fields. Header = " + header.length + " vals " + vals.length + " line " + line);
|
||||
|
||||
Map<String,String> bindings = new HashMap<String, String>();
|
||||
for ( int i = 0; i < vals.length; i++ ) bindings.put(header[i], vals[i]);
|
||||
tranches.add(new Tranche(getDouble(bindings,"FDRtranche"),
|
||||
getDouble(bindings,"pCut"),
|
||||
getInteger(bindings,"numKnown"),
|
||||
getDouble(bindings,"knownTiTv"),
|
||||
getInteger(bindings,"numNovel"),
|
||||
Math.max(getDouble(bindings,"novelTiTv"), getDouble(bindings,"novelTITV")),
|
||||
tranches.add(new Tranche(getDouble(bindings,"FDRtranche", true),
|
||||
getDouble(bindings,"targetTiTv", false),
|
||||
getDouble(bindings,"pCut", true),
|
||||
getInteger(bindings,"numKnown", false),
|
||||
getDouble(bindings,"knownTiTv", false),
|
||||
getInteger(bindings,"numNovel", true),
|
||||
Math.max(getDouble(bindings,"novelTiTv", false), getDouble(bindings,"novelTITV", false)),
|
||||
bindings.get("filterName")));
|
||||
}
|
||||
}
|
||||
|
||||
Collections.sort(tranches); // sort this in the standard order
|
||||
return tranches;
|
||||
} catch( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotCreateOutputFile(f, e);
|
||||
throw new UserException.CouldNotReadInputFile(f, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -32,6 +32,7 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
|||
*/
|
||||
|
||||
public class VariantDatum implements Comparable<VariantDatum> {
|
||||
public int pos;
|
||||
public double[] annotations;
|
||||
public boolean isTransition;
|
||||
public boolean isKnown;
|
||||
|
|
|
|||
|
|
@ -474,6 +474,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
List<Tranche> tranches = new ArrayList<Tranche>();
|
||||
for ( double fdr : FDRtranches ) {
|
||||
Tranche t = findTranche(tranchesData, runningTiTv, fdr, targetTiTv);
|
||||
// todo -- should abort early when t's qual is 0 -- that's the lowest we'll get to
|
||||
|
||||
if ( t == null ) {
|
||||
if ( tranches.size() == 0 )
|
||||
|
|
@ -514,7 +515,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
for ( int i = 0; i < runningTiTv.length; i++ ) {
|
||||
if ( runningTiTv[i] >= titvThreshold ) {
|
||||
// we've found the largest group of variants with Ti/Tv >= our target titv
|
||||
return trancheOfVariants(data, i, desiredFDR);
|
||||
return trancheOfVariants(data, i, desiredFDR, targetTiTv);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -522,173 +523,36 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
return null;
|
||||
}
|
||||
|
||||
public final static Tranche trancheOfVariants( final List<VariantDatum> data, int minI, double fdr ) {
|
||||
public final static Tranche trancheOfVariants( final List<VariantDatum> data, int minI, double fdr, double targetTiTv ) {
|
||||
int numKnown = 0, numNovel = 0, knownTi = 0, knownTv = 0, novelTi = 0, novelTv = 0;
|
||||
|
||||
for ( int i = minI; i < data.size(); i++ ) {
|
||||
VariantDatum datum = data.get(i);
|
||||
if ( datum.isKnown ) {
|
||||
numKnown++;
|
||||
if ( datum.isTransition ) { knownTi++; } else { knownTv++; }
|
||||
} else {
|
||||
numNovel++;
|
||||
if ( datum.isTransition ) { novelTi++; } else { novelTv++; }
|
||||
double qualThreshold = data.get(minI).qual;
|
||||
VariantDatum last = null;
|
||||
for ( VariantDatum datum : data ) {
|
||||
if ( datum.qual >= qualThreshold ) {
|
||||
//if( ! datum.isKnown ) System.out.println(datum.pos);
|
||||
if ( datum.isKnown ) {
|
||||
numKnown++;
|
||||
if ( datum.isTransition ) { knownTi++; } else { knownTv++; }
|
||||
} else {
|
||||
numNovel++;
|
||||
if ( datum.isTransition ) { novelTi++; } else { novelTv++; }
|
||||
|
||||
}
|
||||
}
|
||||
last = datum;
|
||||
}
|
||||
|
||||
double knownTiTv = knownTi / Math.max(1.0 * knownTv, 1.0);
|
||||
double novelTiTv = novelTi / Math.max(1.0 * novelTv, 1.0);
|
||||
|
||||
return new Tranche(fdr, data.get(minI).qual, numKnown, knownTiTv, numNovel, novelTiTv);
|
||||
return new Tranche(fdr, targetTiTv, qualThreshold, numKnown, knownTiTv, numNovel, novelTiTv);
|
||||
}
|
||||
|
||||
public final static double fdrToTiTv(double desiredFDR, double targetTiTv) {
|
||||
return (1.0 - desiredFDR / 100.0) * (targetTiTv - 0.5) + 0.5;
|
||||
}
|
||||
|
||||
public final void outputOptimizationCurve( final VariantDatum[] data, final PrintStream outputReportDatFile, final PrintStream tranchesOutputFile,
|
||||
final int desiredNumVariants, final Double[] FDRtranches, final double MAX_QUAL ) {
|
||||
final int numVariants = data.length;
|
||||
final boolean[] markedVariant = new boolean[numVariants];
|
||||
|
||||
int NUM_BINS = 400 * 2;
|
||||
double QUAL_STEP1 = (MAX_QUAL * 9.0 / 10.0 ) / ((double) NUM_BINS / 2.0);
|
||||
if( QUAL_STEP1 < 0.01 ) { QUAL_STEP1 = 0.01; } // QUAL field in VCF file is rounded to two decimal places
|
||||
double QUAL_STEP2 = (MAX_QUAL * 1.0 / 10.0) / ((double) NUM_BINS / 2.0);
|
||||
if( QUAL_STEP2 < 0.01 ) { QUAL_STEP2 = 0.01; } // QUAL field in VCF file is rounded to two decimal places
|
||||
|
||||
final int numKnownAtCut[] = new int[NUM_BINS+2];
|
||||
final int numNovelAtCut[] = new int[NUM_BINS+2];
|
||||
final double knownTiTvAtCut[] = new double[NUM_BINS+2];
|
||||
final double novelTiTvAtCut[] = new double[NUM_BINS+2];
|
||||
final double theCut[] = new double[NUM_BINS+2];
|
||||
|
||||
final double fdrCutAsTiTv[] = new double[FDRtranches.length];
|
||||
for( int iii = 0; iii < FDRtranches.length; iii++ ) {
|
||||
fdrCutAsTiTv[iii] = (1.0 - FDRtranches[iii] / 100.0) * (targetTITV - 0.5) + 0.5;
|
||||
}
|
||||
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
markedVariant[iii] = false;
|
||||
}
|
||||
|
||||
tranchesOutputFile.println("FDRtranche,novelTITV,pCut,numNovel,filterName");
|
||||
|
||||
int numKnown = 0;
|
||||
int numNovel = 0;
|
||||
int numKnownTi = 0;
|
||||
int numKnownTv = 0;
|
||||
int numNovelTi = 0;
|
||||
int numNovelTv = 0;
|
||||
boolean foundDesiredNumVariants = false;
|
||||
int jjj = 0;
|
||||
outputReportDatFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV");
|
||||
double qCut = MAX_QUAL;
|
||||
|
||||
while( qCut >= 0.0 - 1E-2 + 1E-8 ) {
|
||||
if( qCut < 1E-2 ) { qCut = 0.0; }
|
||||
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
if( !markedVariant[iii] ) {
|
||||
if( data[iii].qual >= qCut ) {
|
||||
markedVariant[iii] = true;
|
||||
if( data[iii].isKnown ) { // known
|
||||
numKnown++;
|
||||
if( data[iii].isTransition ) { // transition
|
||||
numKnownTi++;
|
||||
} else { // transversion
|
||||
numKnownTv++;
|
||||
}
|
||||
} else { // novel
|
||||
numNovel++;
|
||||
if( data[iii].isTransition ) { // transition
|
||||
numNovelTi++;
|
||||
} else { // transversion
|
||||
numNovelTv++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if( desiredNumVariants != 0 && !foundDesiredNumVariants && (numKnown + numNovel) >= desiredNumVariants ) {
|
||||
logger.info( "Keeping variants with QUAL >= " + String.format("%.2f",qCut) + " results in a filtered set with: " );
|
||||
logger.info("\t" + numKnown + " known variants");
|
||||
logger.info("\t" + numNovel + " novel variants, (dbSNP rate = " + String.format("%.2f",((double) numKnown * 100.0) / ((double) numKnown + numNovel) ) + "%)");
|
||||
logger.info("\t" + String.format("%.4f known Ti/Tv ratio", ((double)numKnownTi) / ((double)numKnownTv)));
|
||||
logger.info("\t" + String.format("%.4f novel Ti/Tv ratio", ((double)numNovelTi) / ((double)numNovelTv)));
|
||||
foundDesiredNumVariants = true;
|
||||
}
|
||||
outputReportDatFile.println( String.format("%.2f,%d,%d,%.4f,%.4f", qCut, numKnown, numNovel,
|
||||
( numKnownTv == 0 ? 0.0 : ( ((double)numKnownTi) / ((double)numKnownTv) ) ),
|
||||
( numNovelTv == 0 ? 0.0 : ( ((double)numNovelTi) / ((double)numNovelTv) ) )));
|
||||
|
||||
numKnownAtCut[jjj] = numKnown;
|
||||
numNovelAtCut[jjj] = numNovel;
|
||||
knownTiTvAtCut[jjj] = ( numKnownTi == 0 || numKnownTv == 0 ? 0.0 : ( ((double)numKnownTi) / ((double)numKnownTv) ) );
|
||||
novelTiTvAtCut[jjj] = ( numNovelTi == 0 || numNovelTv == 0 ? 0.0 : ( ((double)numNovelTi) / ((double)numNovelTv) ) );
|
||||
theCut[jjj] = qCut;
|
||||
jjj++;
|
||||
|
||||
if( qCut >= (MAX_QUAL / 10.0) ) {
|
||||
qCut -= QUAL_STEP1;
|
||||
} else {
|
||||
qCut -= QUAL_STEP2;
|
||||
}
|
||||
}
|
||||
|
||||
// loop back through the data points looking for appropriate places to cut the data to get the target novel titv ratio
|
||||
int checkQuantile = 0;
|
||||
int tranche = FDRtranches.length - 1;
|
||||
for( ; jjj >= 0; jjj-- ) {
|
||||
|
||||
if( tranche >= 0 && novelTiTvAtCut[jjj] >= fdrCutAsTiTv[tranche] ) {
|
||||
tranchesOutputFile.println(String.format("%.2f,%.4f,%.4f,%d,FDRtranche%.2fto%.2f",
|
||||
FDRtranches[tranche],novelTiTvAtCut[jjj],theCut[jjj],numNovelAtCut[jjj],
|
||||
(tranche == 0 ? 0.0 : FDRtranches[tranche-1]) ,FDRtranches[tranche]));
|
||||
tranche--;
|
||||
}
|
||||
|
||||
boolean foundCut = false;
|
||||
if( checkQuantile == 0 ) {
|
||||
if( novelTiTvAtCut[jjj] >= 0.9 * targetTITV ) {
|
||||
foundCut = true;
|
||||
checkQuantile++;
|
||||
}
|
||||
} else if( checkQuantile == 1 ) {
|
||||
if( novelTiTvAtCut[jjj] >= 0.95 * targetTITV ) {
|
||||
foundCut = true;
|
||||
checkQuantile++;
|
||||
}
|
||||
} else if( checkQuantile == 2 ) {
|
||||
if( novelTiTvAtCut[jjj] >= 0.98 * targetTITV ) {
|
||||
foundCut = true;
|
||||
checkQuantile++;
|
||||
}
|
||||
} else if( checkQuantile == 3 ) {
|
||||
if( novelTiTvAtCut[jjj] >= targetTITV ) {
|
||||
foundCut = true;
|
||||
checkQuantile++;
|
||||
}
|
||||
} else if( checkQuantile == 4 ) {
|
||||
break; // break out
|
||||
}
|
||||
|
||||
if( foundCut ) {
|
||||
logger.info( "Keeping variants with QUAL >= " + String.format("%.2f",theCut[jjj]) + " results in a filtered set with: " );
|
||||
logger.info("\t" + numKnownAtCut[jjj] + " known variants");
|
||||
logger.info("\t" + numNovelAtCut[jjj] + " novel variants, (dbSNP rate = " +
|
||||
String.format("%.2f",((double) numKnownAtCut[jjj] * 100.0) / ((double) numKnownAtCut[jjj] + numNovelAtCut[jjj]) ) + "%)");
|
||||
logger.info("\t" + String.format("%.4f known Ti/Tv ratio", knownTiTvAtCut[jjj]));
|
||||
logger.info("\t" + String.format("%.4f novel Ti/Tv ratio", novelTiTvAtCut[jjj]));
|
||||
logger.info("\t" + String.format("--> with an implied novel FDR of %.2f percent", Math.abs(100.0 * (1.0-((novelTiTvAtCut[jjj] - 0.5) / (targetTITV - 0.5))))));
|
||||
}
|
||||
}
|
||||
if( tranche >= 0 ) { // Didn't find all the tranches
|
||||
throw new UserException.BadInput("Couldn't find appropriate cuts for all the requested tranches. Please ask for fewer tranches with higher false discovery rates using the --FDRtranche argument");
|
||||
}
|
||||
}
|
||||
|
||||
private double evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster, final int startCluster, final int stopCluster ) {
|
||||
|
||||
final int numAnnotations = data[0].annotations.length;
|
||||
|
|
|
|||
|
|
@ -68,9 +68,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
// Outputs
|
||||
/////////////////////////////
|
||||
@Output(fullName="tranches_file", shortName="tranchesFile", doc="The output tranches file used by ApplyVariantCuts", required=true)
|
||||
private PrintStream TRANCHES_FILE;
|
||||
@Output(fullName="report_dat_file", shortName="reportDatFile", doc="The output report .dat file used with Rscript to create the optimization curve PDF file", required=true)
|
||||
private File REPORT_DAT_FILE;
|
||||
private File TRANCHES_FILE;
|
||||
@Output(doc="File to which recalibrated variants should be written", required=true)
|
||||
private VCFWriter vcfWriter = null;
|
||||
|
||||
|
|
@ -96,7 +94,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
@Argument(fullName="prior1KG", shortName="prior1KG", doc="A prior on the quality of 1000 Genomes Project variants, a phred scaled probability of being true.", required=false)
|
||||
private double PRIOR_1KG = 12.0;
|
||||
@Argument(fullName="FDRtranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
|
||||
private Double[] FDR_TRANCHES = null;
|
||||
private double[] FDR_TRANCHES = new double[]{0.1, 1.0, 5.0, 10.0};
|
||||
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required=false)
|
||||
private String PATH_TO_RSCRIPT = "Rscript";
|
||||
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required=false)
|
||||
|
|
@ -132,6 +130,14 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
private NestedHashMap priorCache = new NestedHashMap();
|
||||
private boolean trustACField = false;
|
||||
private double maxQualObserved = 0.0;
|
||||
private PrintStream tranchesStream = null;
|
||||
|
||||
private static double round2(double num) {
|
||||
double result = num * 100.0;
|
||||
result = Math.round(result);
|
||||
result = result / 100.0;
|
||||
return result;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -195,13 +201,10 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
|
||||
vcfWriter.writeHeader(vcfHeader);
|
||||
|
||||
// Set up default values for the FDR tranches if necessary
|
||||
if( FDR_TRANCHES == null ) {
|
||||
FDR_TRANCHES = new Double[4];
|
||||
FDR_TRANCHES[0] = 0.1;
|
||||
FDR_TRANCHES[1] = 1.0;
|
||||
FDR_TRANCHES[2] = 5.0;
|
||||
FDR_TRANCHES[3] = 10.0;
|
||||
try {
|
||||
tranchesStream = new PrintStream(TRANCHES_FILE);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new UserException.CouldNotCreateOutputFile(TRANCHES_FILE, e);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -285,7 +288,8 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
|
||||
final double pVar = theModel.evaluateVariant( ref.getGenomeLocParser(),vc );
|
||||
final double lod = priorLodFactor + Math.log10(pVar);
|
||||
variantDatum.qual = Math.abs( QUALITY_SCALE_FACTOR * QualityUtils.lodToPhredScaleErrorRate(lod) );
|
||||
variantDatum.pos = vc.getStart();
|
||||
variantDatum.qual = round2(Math.abs( QUALITY_SCALE_FACTOR * QualityUtils.lodToPhredScaleErrorRate(lod)));
|
||||
if( variantDatum.qual > maxQualObserved ) {
|
||||
maxQualObserved = variantDatum.qual;
|
||||
}
|
||||
|
|
@ -329,25 +333,17 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys );
|
||||
reduceSum.clear(); // Don't need this ever again, clean up some memory
|
||||
|
||||
try {
|
||||
PrintStream reportDatFilePrintStream = new PrintStream(REPORT_DAT_FILE);
|
||||
theModel.outputOptimizationCurve( dataManager.data, reportDatFilePrintStream, TRANCHES_FILE, DESIRED_NUM_VARIANTS, FDR_TRANCHES, maxQualObserved );
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotCreateOutputFile(REPORT_DAT_FILE, e);
|
||||
}
|
||||
List<Tranche> tranches = VariantGaussianMixtureModel.findTranches( dataManager.data, FDR_TRANCHES, TARGET_TITV );
|
||||
tranchesStream.print(Tranche.tranchesString(tranches));
|
||||
|
||||
// Execute Rscript command to plot the optimization curve
|
||||
// Print out the command line to make it clear to the user what is being executed and how one might modify it
|
||||
final String rScriptOptimizationCurveCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + REPORT_DAT_FILE.getName() + " " + TARGET_TITV;
|
||||
final String rScriptTranchesCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Tranches.R" + " " + REPORT_DAT_FILE.getName() + " " + TARGET_TITV;
|
||||
logger.info( rScriptOptimizationCurveCommandLine );
|
||||
final String rScriptTranchesCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Tranches.R" + " " + TRANCHES_FILE.getAbsolutePath() + " " + TARGET_TITV;
|
||||
logger.info( rScriptTranchesCommandLine );
|
||||
|
||||
// Execute the RScript command to plot the table of truth values
|
||||
try {
|
||||
Process p;
|
||||
p = Runtime.getRuntime().exec( rScriptOptimizationCurveCommandLine );
|
||||
p.waitFor();
|
||||
p = Runtime.getRuntime().exec( rScriptTranchesCommandLine );
|
||||
p.waitFor();
|
||||
} catch ( Exception e ) {
|
||||
|
|
|
|||
|
|
@ -71,7 +71,7 @@ public class WalkerTest extends BaseTest {
|
|||
}
|
||||
}
|
||||
|
||||
private static File getFileForMD5(final String md5) {
|
||||
protected static File getFileForMD5(final String md5) {
|
||||
final String basename = String.format("%s.integrationtest", md5);
|
||||
return new File(MD5_FILE_DB_SUBDIR + "/" + basename);
|
||||
}
|
||||
|
|
@ -208,15 +208,23 @@ public class WalkerTest extends BaseTest {
|
|||
|
||||
protected Map<String, File> auxillaryFiles = new HashMap<String, File>();
|
||||
|
||||
public WalkerTestSpec(String args, List<String> md5s) {
|
||||
this(args, -1, md5s);
|
||||
}
|
||||
|
||||
public WalkerTestSpec(String args, int nOutputFiles, List<String> md5s) {
|
||||
this.args = args;
|
||||
this.nOutputFiles = nOutputFiles;
|
||||
this.nOutputFiles = md5s.size();
|
||||
this.md5s = md5s;
|
||||
}
|
||||
|
||||
public WalkerTestSpec(String args, List<String> exts, List<String> md5s) {
|
||||
this(args, -1, exts, md5s);
|
||||
}
|
||||
|
||||
public WalkerTestSpec(String args, int nOutputFiles, List<String> exts, List<String> md5s) {
|
||||
this.args = args;
|
||||
this.nOutputFiles = nOutputFiles;
|
||||
this.nOutputFiles = md5s.size();
|
||||
this.md5s = md5s;
|
||||
this.exts = exts;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,8 +7,7 @@ import java.util.Arrays;
|
|||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class
|
||||
VariantEvalIntegrationTest extends WalkerTest {
|
||||
public class VariantEvalIntegrationTest extends WalkerTest {
|
||||
private static String cmdRoot = "-T VariantEval" +
|
||||
" -R " + b36KGReference;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
|
|
@ -12,15 +12,14 @@
|
|||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
|
@ -34,6 +33,7 @@ import org.broadinstitute.sting.utils.MathUtils;
|
|||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.annotations.Test;
|
||||
|
|
@ -58,35 +58,93 @@ public final class VariantGaussianMixtureModelUnitTest extends BaseTest {
|
|||
private static int N_VARIANTS = 100;
|
||||
VariantDatum[] variantData1 = new VariantDatum[N_VARIANTS];
|
||||
|
||||
@BeforeTest(enabled=false)
|
||||
public void beforeTest() {
|
||||
for ( int i = 0; i < N_VARIANTS; i++ ) {
|
||||
variantData1[i].isKnown = i % 2 == 0; // every other is know
|
||||
variantData1[i].qual = (N_VARIANTS - i) * 1.0;
|
||||
private final File QUAL_DATA = new File(testDir + "tranches.raw.dat");
|
||||
private final double[] FDRS = new double[]{0.1, 1, 10, 100};
|
||||
private final double TARGET_TITV = 2.8;
|
||||
private final File EXPECTED_TRANCHES_NEW = new File(testDir + "tranches.6.txt");
|
||||
private final File EXPECTED_TRANCHES_OLD = new File(testDir + "tranches.4.txt");
|
||||
|
||||
private List<VariantDatum> readData() {
|
||||
List<VariantDatum> vd = new ArrayList<VariantDatum>();
|
||||
try {
|
||||
for ( String line : new XReadLines(QUAL_DATA, true) ) {
|
||||
String[] parts = line.split(" ");
|
||||
if ( ! parts[0].equals("QUAL") ) {
|
||||
VariantDatum datum = new VariantDatum();
|
||||
datum.qual = Double.valueOf(parts[0]);
|
||||
datum.isTransition = parts[1].equals("1");
|
||||
datum.isKnown = parts[2].equals("1");
|
||||
vd.add(datum);
|
||||
}
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("foo", e);
|
||||
}
|
||||
|
||||
// first 25 are tv, 25-75 are 50/50, and 75+ are all transitions
|
||||
int i = 0;
|
||||
for ( ; i < (N_VARIANTS * 0.25); i++ ) { variantData1[i].isTransition = true; }
|
||||
for ( ; i < (N_VARIANTS * 0.75); i++ ) { variantData1[i].isTransition = i % 2 == 0; }
|
||||
for ( ; i < N_VARIANTS; i++ ) { variantData1[i].isTransition = false; }
|
||||
return vd;
|
||||
}
|
||||
|
||||
@Test(enabled=false)
|
||||
@Test(expectedExceptions = {UserException.MalformedFile.class})
|
||||
public final void readBadFormat() {
|
||||
Tranche.readTraches(QUAL_DATA);
|
||||
}
|
||||
|
||||
@Test
|
||||
public final void readNewFormat() {
|
||||
read(EXPECTED_TRANCHES_NEW);
|
||||
}
|
||||
|
||||
@Test
|
||||
public final void readOldFormat() {
|
||||
read(EXPECTED_TRANCHES_OLD);
|
||||
}
|
||||
|
||||
public final List<Tranche> read(File f) {
|
||||
return Tranche.readTraches(f);
|
||||
}
|
||||
|
||||
@Test
|
||||
public final void testNewAndOldAretheSame() {
|
||||
List<Tranche> newFormat = read(EXPECTED_TRANCHES_NEW);
|
||||
List<Tranche> oldFormat = read(EXPECTED_TRANCHES_OLD);
|
||||
assertTranchesAreTheSame(newFormat, oldFormat, false, true);
|
||||
}
|
||||
|
||||
private static void assertTranchesAreTheSame(List<Tranche> newFormat, List<Tranche> oldFormat, boolean completeP, boolean includeName) {
|
||||
Assert.assertEquals(oldFormat.size(), newFormat.size());
|
||||
for ( int i = 0; i < newFormat.size(); i++ ) {
|
||||
Tranche n = newFormat.get(i);
|
||||
Tranche o = oldFormat.get(i);
|
||||
Assert.assertEquals(n.fdr, o.fdr, 1e-3);
|
||||
Assert.assertEquals(n.numNovel, o.numNovel);
|
||||
Assert.assertEquals(n.novelTiTv, o.novelTiTv, 1e-3);
|
||||
if ( includeName )
|
||||
Assert.assertEquals(n.name, o.name);
|
||||
if ( completeP ) {
|
||||
Assert.assertEquals(n.numKnown, o.numKnown);
|
||||
Assert.assertEquals(n.knownTiTv, o.knownTiTv, 1e-3);
|
||||
Assert.assertEquals(n.targetTiTv, o.targetTiTv, 1e-3);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public final void testFindTranches1() {
|
||||
List<Tranche> tranches = VariantGaussianMixtureModel.findTranches(variantData1, new double[]{0.1, 20}, 2.0);
|
||||
Assert.assertEquals( tranches.size(), 2 );
|
||||
|
||||
Tranche t1 = tranches.get(0);
|
||||
Assert.assertEquals( t1.fdr, 0.1 );
|
||||
Assert.assertEquals( t1.pCut, 26 );
|
||||
Assert.assertEquals( t1.numKnown, 37 );
|
||||
Assert.assertEquals( t1.numNovel, 37 );
|
||||
|
||||
Tranche t2 = tranches.get(1);
|
||||
Assert.assertEquals( t2.fdr, 20 );
|
||||
Assert.assertEquals( t2.pCut, 21 );
|
||||
Assert.assertEquals( t2.numKnown, 37 );
|
||||
Assert.assertEquals( t2.numNovel, 37 );
|
||||
List<VariantDatum> vd = readData();
|
||||
List<Tranche> tranches = VariantGaussianMixtureModel.findTranches(vd.toArray(new VariantDatum[0]), FDRS, TARGET_TITV);
|
||||
System.out.printf(Tranche.tranchesString(tranches));
|
||||
assertTranchesAreTheSame(read(EXPECTED_TRANCHES_NEW), tranches, true, false);
|
||||
}
|
||||
|
||||
// @Test(expectedExceptions = {UserException.class})
|
||||
// public final void testBadFDR() {
|
||||
// List<VariantDatum> vd = readData();
|
||||
// VariantGaussianMixtureModel.findTranches(vd.toArray(new VariantDatum[0]), new double[]{-1}, TARGET_TITV);
|
||||
// }
|
||||
//
|
||||
// @Test(expectedExceptions = {UserException.class})
|
||||
// public final void testBadTargetTiTv() {
|
||||
// List<VariantDatum> vd = readData();
|
||||
// VariantGaussianMixtureModel.findTranches(vd.toArray(new VariantDatum[0]), FDRS, 0.1);
|
||||
// }
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
|||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.testng.annotations.Test;
|
||||
import org.testng.annotations.DataProvider;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.File;
|
||||
|
|
@ -12,130 +13,112 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
static HashMap<String, String> tranchesFiles = new HashMap<String, String>();
|
||||
static HashMap<String, String> inputVCFFiles = new HashMap<String, String>();
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testGenerateVariantClusters() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "ab2629d67e378fd3aceb8318f0fbfe04" );
|
||||
e.put( validationDataLocation + "lowpass.N3.chr1.raw.vcf", "725489156426e4ddd8d623ab3d4b1023" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String vcf = entry.getKey();
|
||||
String md5 = entry.getValue();
|
||||
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + b36KGReference +
|
||||
" -NO_HEADER" +
|
||||
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
||||
" -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/genotypes_r27_nr.b36_fwd.vcf" +
|
||||
" -weightDBSNP 0.2 -weightHapMap 1.0" +
|
||||
" -T GenerateVariantClusters" +
|
||||
" -B:input,VCF " + vcf +
|
||||
" -L 1:50,000,000-200,000,000" +
|
||||
" -qual 50.0" +
|
||||
" --ignore_filter GATK_STANDARD" +
|
||||
" -an QD -an HRun -an SB" +
|
||||
" -clusterFile %s",
|
||||
1, // just one output file
|
||||
Arrays.asList(md5));
|
||||
List<File> result = executeTest("testGenerateVariantClusters", spec).getFirst();
|
||||
clusterFiles.put(vcf, result.get(0).getAbsolutePath());
|
||||
private static class VRTest {
|
||||
String inVCF;
|
||||
String clusterMD5;
|
||||
String tranchesMD5;
|
||||
String recalVCFMD5;
|
||||
String cutVCFMD5;
|
||||
public VRTest(String inVCF, String clusterMD5, String tranchesMD5, String recalVCFMD5, String cutVCFMD5) {
|
||||
this.inVCF = validationDataLocation + inVCF;
|
||||
this.clusterMD5 = clusterMD5;
|
||||
this.tranchesMD5 = tranchesMD5;
|
||||
this.recalVCFMD5 = recalVCFMD5;
|
||||
this.cutVCFMD5 = cutVCFMD5;
|
||||
}
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testVariantRecalibrator() {
|
||||
HashMap<String, List<String>> e = new HashMap<String, List<String>>();
|
||||
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf",
|
||||
Arrays.asList("4e893672230fca625f70b0491f3b36cb", "40b3fb6632304cebc56a9ed5853cc72e","0cfbeb9f2db7edc1d69b5b43ea17670c")); // Each test checks the md5 of three output files
|
||||
e.put( validationDataLocation + "lowpass.N3.chr1.raw.vcf",
|
||||
Arrays.asList("d52e4f511c9c00f8c21dffea81c47103", "780920fcfa3009c66ebcf18265edaa75","010023ddb95fac071e0f208e6bb40c61")); // Each test checks the md5 of three output files
|
||||
VRTest yriTrio = new VRTest("yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf",
|
||||
"ab2629d67e378fd3aceb8318f0fbfe04", // in vcf
|
||||
"4dd95e9d8e5d21a6ab73de67d9663492", // tranches
|
||||
"4e893672230fca625f70b0491f3b36cb", // recalVCF
|
||||
"371b0e2796982485ae050e46892f6660"); // cut VCF
|
||||
|
||||
for ( Map.Entry<String, List<String>> entry : e.entrySet() ) {
|
||||
String vcf = entry.getKey();
|
||||
List<String> md5s = entry.getValue();
|
||||
String clusterFile = clusterFiles.get(vcf);
|
||||
System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile);
|
||||
if ( clusterFile != null ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + b36KGReference +
|
||||
" -NO_HEADER" +
|
||||
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
||||
" -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/genotypes_r27_nr.b36_fwd.vcf" +
|
||||
" -T VariantRecalibrator" +
|
||||
" -B:input,VCF " + vcf +
|
||||
" -L 1:20,000,000-100,000,000" +
|
||||
" --ignore_filter GATK_STANDARD" +
|
||||
" --ignore_filter HARD_TO_VALIDATE" +
|
||||
" -clusterFile " + clusterFile +
|
||||
" -titv 2.07" +
|
||||
" -o %s" +
|
||||
" -tranchesFile %s" +
|
||||
" -reportDatFile %s",
|
||||
3, // three output files
|
||||
md5s);
|
||||
List<File> result = executeTest("testVariantRecalibrator", spec).getFirst();
|
||||
inputVCFFiles.put(vcf, result.get(0).getAbsolutePath());
|
||||
tranchesFiles.put(vcf, result.get(1).getAbsolutePath());
|
||||
}
|
||||
}
|
||||
VRTest lowPass = new VRTest("lowpass.N3.chr1.raw.vcf",
|
||||
"725489156426e4ddd8d623ab3d4b1023", // in vcf
|
||||
"dfc07132f592a811d0d6c25a4cb67a09", // tranches
|
||||
"d52e4f511c9c00f8c21dffea81c47103", // recalVCF
|
||||
"39716b2f03b50e88855d7975dd1b9b3e"); // cut VCF
|
||||
|
||||
@DataProvider(name = "VRTest")
|
||||
public Object[][] createData1() {
|
||||
return new Object[][]{ {yriTrio}, {lowPass} };
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testApplyVariantCuts() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "646805c4619a15d3f9f4976b96b89ecd" );
|
||||
e.put( validationDataLocation + "lowpass.N3.chr1.raw.vcf", "df9e37af16610ccd44d375eae2c4479c" );
|
||||
@Test(dataProvider = "VRTest", enabled = true)
|
||||
public void testGenerateVariantClusters(VRTest params) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + b36KGReference +
|
||||
" -NO_HEADER" +
|
||||
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
||||
" -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/genotypes_r27_nr.b36_fwd.vcf" +
|
||||
" -weightDBSNP 0.2 -weightHapMap 1.0" +
|
||||
" -T GenerateVariantClusters" +
|
||||
" -B:input,VCF " + params.inVCF +
|
||||
" -L 1:50,000,000-200,000,000" +
|
||||
" -qual 50.0" +
|
||||
" --ignore_filter GATK_STANDARD" +
|
||||
" -an QD -an HRun -an SB" +
|
||||
" -clusterFile %s",
|
||||
Arrays.asList(params.clusterMD5));
|
||||
executeTest("testGenerateVariantClusters-"+params.inVCF, spec).getFirst();
|
||||
}
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String vcf = entry.getKey();
|
||||
String md5 = entry.getValue();
|
||||
String inputVCFFile = inputVCFFiles.get(vcf);
|
||||
String tranchesFile = tranchesFiles.get(vcf);
|
||||
System.out.printf("PARAMS FOR %s is %s%n", vcf, inputVCFFile);
|
||||
System.out.printf("PARAMS FOR %s is %s%n", vcf, tranchesFile);
|
||||
if ( inputVCFFile != null && tranchesFile != null ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + b36KGReference +
|
||||
" -NO_HEADER" +
|
||||
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
||||
" -T ApplyVariantCuts" +
|
||||
" -L 1:20,000,000-100,000,000" +
|
||||
" -B:input,VCF " + inputVCFFile +
|
||||
" -o %s" +
|
||||
" -tranchesFile " + tranchesFile,
|
||||
1, // just one output file
|
||||
Arrays.asList(md5));
|
||||
executeTest("testApplyVariantCuts", spec);
|
||||
}
|
||||
}
|
||||
@Test(dataProvider = "VRTest", enabled = true)
|
||||
public void testVariantRecalibrator(VRTest params) {
|
||||
//System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile);
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + b36KGReference +
|
||||
" -NO_HEADER" +
|
||||
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
||||
" -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
|
||||
" -T VariantRecalibrator" +
|
||||
" -B:input,VCF " + params.inVCF +
|
||||
" -L 1:20,000,000-100,000,000" +
|
||||
" --ignore_filter GATK_STANDARD" +
|
||||
" --ignore_filter HARD_TO_VALIDATE" +
|
||||
" -clusterFile " + getFileForMD5(params.clusterMD5) +
|
||||
" -titv 2.07" +
|
||||
" -o %s" +
|
||||
" -tranchesFile %s",
|
||||
Arrays.asList(params.recalVCFMD5, params.tranchesMD5));
|
||||
executeTest("testVariantRecalibrator-"+params.inVCF, spec).getFirst();
|
||||
}
|
||||
|
||||
@Test(dataProvider = "VRTest", enabled = true)
|
||||
public void testApplyVariantCuts(VRTest params) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + b36KGReference +
|
||||
" -NO_HEADER" +
|
||||
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
||||
" -T ApplyVariantCuts" +
|
||||
" -L 1:20,000,000-100,000,000" +
|
||||
" -B:input,VCF " + getFileForMD5(params.recalVCFMD5) +
|
||||
" -o %s" +
|
||||
" -tranchesFile " + getFileForMD5(params.tranchesMD5),
|
||||
Arrays.asList(params.cutVCFMD5));
|
||||
executeTest("testApplyVariantCuts-"+params.inVCF, spec);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Test()
|
||||
public void testFailWithBadAnnotation() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( validationDataLocation + "lowpass.N3.chr1.raw.vcf", "" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String vcf = entry.getKey();
|
||||
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + b36KGReference +
|
||||
" -NO_HEADER" +
|
||||
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
||||
" -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/genotypes_r27_nr.b36_fwd.vcf" +
|
||||
" -weightDBSNP 0.2 -weightHapMap 1.0" +
|
||||
" -T GenerateVariantClusters" +
|
||||
" -B:input,VCF " + vcf +
|
||||
" -L 1:50,000,000-200,000,000" +
|
||||
" -qual 50.0" +
|
||||
" --ignore_filter GATK_STANDARD" +
|
||||
" -an QD -an HRun -an ThisAnnotationIsBAD" + // There is a bad annotation here
|
||||
" -clusterFile %s",
|
||||
1, // just one output file
|
||||
UserException.MalformedFile.class);
|
||||
executeTest("testFailWithBadAnnotation", spec);
|
||||
}
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + b36KGReference +
|
||||
" -NO_HEADER" +
|
||||
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
||||
" -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
|
||||
" -weightDBSNP 0.2 -weightHapMap 1.0" +
|
||||
" -T GenerateVariantClusters" +
|
||||
" -B:input,VCF " + lowPass.inVCF +
|
||||
" -L 1:50,000,000-200,000,000" +
|
||||
" -qual 50.0" +
|
||||
" --ignore_filter GATK_STANDARD" +
|
||||
" -an QD -an HRun -an ThisAnnotationIsBAD" + // There is a bad annotation here
|
||||
" -clusterFile %s",
|
||||
1, // just one output file
|
||||
UserException.MalformedFile.class);
|
||||
executeTest("testFailWithBadAnnotation", spec);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue