Removed the qScale argument in VariantRecalibrator. The walker is now smarter about how it tries to find a cut, so the arbitrary scale factor is hopefully no longer necessary. The recalibrated variant quality score now more accurately reflects our believed LOD of the call.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4451 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ee00dcb79d
commit
0de658534d
|
|
@ -87,8 +87,6 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
hInfo.add(new VCFInfoHeaderLine("R2", 1, VCFHeaderLineType.Float, "r2 Value reported by Beagle on each site"));
|
||||
hInfo.add(new VCFInfoHeaderLine("NumGenotypesChanged", 1, VCFHeaderLineType.Integer, "r2 Value reported by Beagle on each site"));
|
||||
|
||||
hInfo.add(new VCFHeaderLine("source", "BeagleImputation"));
|
||||
|
||||
// Open output file specified by output VCF ROD
|
||||
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
|
||||
|
||||
|
|
|
|||
|
|
@ -462,19 +462,21 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
}
|
||||
|
||||
public final void outputOptimizationCurve( final VariantDatum[] data, final PrintStream outputReportDatFile, final PrintStream tranchesOutputFile,
|
||||
final int desiredNumVariants, final Double[] FDRtranches, final double QUAL_STEP ) {
|
||||
|
||||
final int desiredNumVariants, final Double[] FDRtranches, final double MAX_QUAL ) {
|
||||
final int numVariants = data.length;
|
||||
final boolean[] markedVariant = new boolean[numVariants];
|
||||
|
||||
final double MAX_QUAL = 100.0;
|
||||
final int NUM_BINS = (int) ((MAX_QUAL / QUAL_STEP) + 1);
|
||||
int NUM_BINS = 400 * 2;
|
||||
double QUAL_STEP1 = (MAX_QUAL * 9.0 / 10.0 ) / ((double) NUM_BINS / 2.0);
|
||||
if( QUAL_STEP1 < 0.01 ) { QUAL_STEP1 = 0.01; } // QUAL field in VCF file is rounded to two decimal places
|
||||
double QUAL_STEP2 = (MAX_QUAL * 1.0 / 10.0) / ((double) NUM_BINS / 2.0);
|
||||
if( QUAL_STEP2 < 0.01 ) { QUAL_STEP2 = 0.01; } // QUAL field in VCF file is rounded to two decimal places
|
||||
|
||||
final int numKnownAtCut[] = new int[NUM_BINS];
|
||||
final int numNovelAtCut[] = new int[NUM_BINS];
|
||||
final double knownTiTvAtCut[] = new double[NUM_BINS];
|
||||
final double novelTiTvAtCut[] = new double[NUM_BINS];
|
||||
final double theCut[] = new double[NUM_BINS];
|
||||
final int numKnownAtCut[] = new int[NUM_BINS+1];
|
||||
final int numNovelAtCut[] = new int[NUM_BINS+1];
|
||||
final double knownTiTvAtCut[] = new double[NUM_BINS+1];
|
||||
final double novelTiTvAtCut[] = new double[NUM_BINS+1];
|
||||
final double theCut[] = new double[NUM_BINS+1];
|
||||
|
||||
final double fdrCutAsTiTv[] = new double[FDRtranches.length];
|
||||
for( int iii = 0; iii < FDRtranches.length; iii++ ) {
|
||||
|
|
@ -496,7 +498,11 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
boolean foundDesiredNumVariants = false;
|
||||
int jjj = 0;
|
||||
outputReportDatFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV");
|
||||
for( double qCut = MAX_QUAL; qCut >= -0.001; qCut -= QUAL_STEP ) {
|
||||
double qCut = MAX_QUAL;
|
||||
|
||||
while( qCut >= 0.0 - 1E-2 + 1E-8 ) {
|
||||
if( qCut < 1E-2 ) { qCut = 0.0; }
|
||||
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
if( !markedVariant[iii] ) {
|
||||
if( data[iii].qual >= qCut ) {
|
||||
|
|
@ -520,14 +526,14 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
}
|
||||
}
|
||||
if( desiredNumVariants != 0 && !foundDesiredNumVariants && (numKnown + numNovel) >= desiredNumVariants ) {
|
||||
logger.info( "Keeping variants with QUAL >= " + String.format("%.1f",qCut) + " results in a filtered set with: " );
|
||||
logger.info( "Keeping variants with QUAL >= " + String.format("%.2f",qCut) + " results in a filtered set with: " );
|
||||
logger.info("\t" + numKnown + " known variants");
|
||||
logger.info("\t" + numNovel + " novel variants, (dbSNP rate = " + String.format("%.2f",((double) numKnown * 100.0) / ((double) numKnown + numNovel) ) + "%)");
|
||||
logger.info("\t" + String.format("%.4f known Ti/Tv ratio", ((double)numKnownTi) / ((double)numKnownTv)));
|
||||
logger.info("\t" + String.format("%.4f novel Ti/Tv ratio", ((double)numNovelTi) / ((double)numNovelTv)));
|
||||
foundDesiredNumVariants = true;
|
||||
}
|
||||
outputReportDatFile.println( String.format("%.6f,%d,%d,%.4f,%.4f", qCut, numKnown, numNovel,
|
||||
outputReportDatFile.println( String.format("%.2f,%d,%d,%.4f,%.4f", qCut, numKnown, numNovel,
|
||||
( numKnownTv == 0 ? 0.0 : ( ((double)numKnownTi) / ((double)numKnownTv) ) ),
|
||||
( numNovelTv == 0 ? 0.0 : ( ((double)numNovelTi) / ((double)numNovelTv) ) )));
|
||||
|
||||
|
|
@ -536,13 +542,19 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
knownTiTvAtCut[jjj] = ( numKnownTi == 0 || numKnownTv == 0 ? 0.0 : ( ((double)numKnownTi) / ((double)numKnownTv) ) );
|
||||
novelTiTvAtCut[jjj] = ( numNovelTi == 0 || numNovelTv == 0 ? 0.0 : ( ((double)numNovelTi) / ((double)numNovelTv) ) );
|
||||
theCut[jjj] = qCut;
|
||||
jjj++;
|
||||
jjj++;
|
||||
|
||||
if( qCut >= (MAX_QUAL / 10.0) ) {
|
||||
qCut -= QUAL_STEP1;
|
||||
} else {
|
||||
qCut -= QUAL_STEP2;
|
||||
}
|
||||
}
|
||||
|
||||
// loop back through the data points looking for appropriate places to cut the data to get the target novel titv ratio
|
||||
int checkQuantile = 0;
|
||||
int tranche = FDRtranches.length - 1;
|
||||
for( jjj = NUM_BINS-1; jjj >= 0; jjj-- ) {
|
||||
for( ; jjj >= 0; jjj-- ) {
|
||||
|
||||
if( tranche >= 0 && novelTiTvAtCut[jjj] >= fdrCutAsTiTv[tranche] ) {
|
||||
tranchesOutputFile.println(String.format("%.2f,%.4f,%.4f,%d,FDRtranche%.2fto%.2f",
|
||||
|
|
@ -577,7 +589,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
}
|
||||
|
||||
if( foundCut ) {
|
||||
logger.info( "Keeping variants with QUAL >= " + String.format("%.1f",theCut[jjj]) + " results in a filtered set with: " );
|
||||
logger.info( "Keeping variants with QUAL >= " + String.format("%.2f",theCut[jjj]) + " results in a filtered set with: " );
|
||||
logger.info("\t" + numKnownAtCut[jjj] + " known variants");
|
||||
logger.info("\t" + numNovelAtCut[jjj] + " novel variants, (dbSNP rate = " +
|
||||
String.format("%.2f",((double) numKnownAtCut[jjj] * 100.0) / ((double) numKnownAtCut[jjj] + numNovelAtCut[jjj]) ) + "%)");
|
||||
|
|
@ -586,6 +598,9 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
logger.info("\t" + String.format("--> with an implied novel FDR of %.2f percent", Math.abs(100.0 * (1.0-((novelTiTvAtCut[jjj] - 0.5) / (targetTITV - 0.5))))));
|
||||
}
|
||||
}
|
||||
if( tranche >= 0 ) { // Didn't find all the tranches
|
||||
throw new UserException.BadInput("Couldn't find appropriate cuts for all the requested tranches. Please ask for fewer tranches with higher false discovery rates using the --FDRtranche argument");
|
||||
}
|
||||
}
|
||||
|
||||
private double evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster, final int startCluster, final int stopCluster ) {
|
||||
|
|
|
|||
|
|
@ -101,14 +101,10 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
private String PATH_TO_RSCRIPT = "Rscript";
|
||||
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required=false)
|
||||
private String PATH_TO_RESOURCES = "R/";
|
||||
@Argument(fullName="quality_step", shortName="qStep", doc="Resolution in QUAL units for optimization and tranche calculations", required=false)
|
||||
private double QUAL_STEP = 0.1;
|
||||
@Argument(fullName="singleton_fp_rate", shortName="fp_rate", doc="Prior expectation that a singleton call would be a FP", required=false)
|
||||
private double SINGLETON_FP_RATE = 0.5;
|
||||
@Argument(fullName="max_ac_prior", shortName="maxACPrior", doc="Maximum value for the prior expectation based on allele count. Needed because (1 - 0.5^x) approaches 1.0 very quickly.", required=false)
|
||||
private double MAX_AC_PRIOR = 0.99;
|
||||
@Argument(fullName="quality_scale_factor", shortName="qScale", doc="Multiply all final quality scores by this value. Needed to normalize the quality scores.", required=false)
|
||||
private double QUALITY_SCALE_FACTOR = 100.0;
|
||||
@Argument(fullName="dontTrustACField", shortName="dontTrustACField", doc="If specified the VR will not use the AC field and will instead always parse the genotypes to figure out how many variant chromosomes there are at a given site.", required=false)
|
||||
private boolean NEVER_TRUST_AC_FIELD = false;
|
||||
|
||||
|
|
@ -119,8 +115,12 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
@Argument(fullName = "NoByHapMapValidationStatus", shortName = "NoByHapMapValidationStatus", doc = "Don't consider sites in dbsnp rod tagged as by-hapmap validation status as real HapMap sites. FOR DEBUGGING PURPOSES ONLY.", required=false)
|
||||
private Boolean NO_BY_HAPMAP_VALIDATION_STATUS = false;
|
||||
@Hidden
|
||||
@Argument(fullName = "qual", shortName = "qual", doc = "Don't use sites below the qual threshold. FOR DEBUGGING PURPOSES ONLY.", required=false)
|
||||
@Argument(fullName = "qual", shortName = "qual", doc = "Don't use sites with original quality scores below the qual threshold. FOR DEBUGGING PURPOSES ONLY.", required=false)
|
||||
private double QUAL_THRESHOLD = 0.0;
|
||||
@Hidden
|
||||
@Argument(fullName="quality_scale_factor", shortName="qScaleFactor", doc="Multiply all final quality scores by this value. FOR DEBUGGING PURPOSES ONLY.", required=false)
|
||||
private double QUALITY_SCALE_FACTOR = 1.0;
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
|
|
@ -131,6 +131,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
private Set<String> inputNames = new HashSet<String>();
|
||||
private NestedHashMap priorCache = new NestedHashMap();
|
||||
private boolean trustACField = false;
|
||||
private double maxQualObserved = 0.0;
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -194,15 +195,13 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
|
||||
vcfWriter.writeHeader(vcfHeader);
|
||||
|
||||
|
||||
// Set up default values for the FDR tranches if necessary
|
||||
if( FDR_TRANCHES == null ) {
|
||||
FDR_TRANCHES = new Double[5];
|
||||
FDR_TRANCHES = new Double[4];
|
||||
FDR_TRANCHES[0] = 0.1;
|
||||
FDR_TRANCHES[1] = 1.0;
|
||||
FDR_TRANCHES[2] = 5.0;
|
||||
FDR_TRANCHES[3] = 10.0;
|
||||
FDR_TRANCHES[4] = 15.0;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -274,7 +273,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
final double totalPrior = 1.0 - ((1.0 - acPrior) * (1.0 - knownPrior));
|
||||
|
||||
if( MathUtils.compareDoubles(totalPrior, 1.0, 1E-8) == 0 || MathUtils.compareDoubles(totalPrior, 0.0, 1E-8) == 0 ) {
|
||||
throw new UserException.CommandLineException("Something is wrong with the prior that was entered by the user: Prior = " + totalPrior); // TODO - fix this up later
|
||||
throw new UserException.CommandLineException("Something is wrong with the priors that were entered by the user: Prior = " + totalPrior); // TODO - fix this up later
|
||||
}
|
||||
|
||||
priorLodFactor = Math.log10(totalPrior) - Math.log10(1.0 - totalPrior) - Math.log10(1.0);
|
||||
|
|
@ -285,6 +284,9 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
final double pVar = theModel.evaluateVariant( vc );
|
||||
final double lod = priorLodFactor + Math.log10(pVar);
|
||||
variantDatum.qual = Math.abs( QUALITY_SCALE_FACTOR * QualityUtils.lodToPhredScaleErrorRate(lod) );
|
||||
if( variantDatum.qual > maxQualObserved ) {
|
||||
maxQualObserved = variantDatum.qual;
|
||||
}
|
||||
|
||||
mapList.add( variantDatum );
|
||||
final Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
|
||||
|
|
@ -327,7 +329,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
|
||||
try {
|
||||
PrintStream reportDatFilePrintStream = new PrintStream(REPORT_DAT_FILE);
|
||||
theModel.outputOptimizationCurve( dataManager.data, reportDatFilePrintStream, TRANCHES_FILE, DESIRED_NUM_VARIANTS, FDR_TRANCHES, QUAL_STEP );
|
||||
theModel.outputOptimizationCurve( dataManager.data, reportDatFilePrintStream, TRANCHES_FILE, DESIRED_NUM_VARIANTS, FDR_TRANCHES, maxQualObserved );
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotCreateOutputFile(REPORT_DAT_FILE, e);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,9 +46,9 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
public void testVariantRecalibrator() {
|
||||
HashMap<String, List<String>> e = new HashMap<String, List<String>>();
|
||||
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf",
|
||||
Arrays.asList("ca80c95e47d22a79eb0700c039457aca", "ab6b91a3f97b682817d2cc068c34c317","a4802dbf883138ae1152109be990fb9d")); // Each test checks the md5 of three output files
|
||||
Arrays.asList("4e893672230fca625f70b0491f3b36cb", "40b3fb6632304cebc56a9ed5853cc72e","0cfbeb9f2db7edc1d69b5b43ea17670c")); // Each test checks the md5 of three output files
|
||||
e.put( validationDataLocation + "lowpass.N3.chr1.raw.vcf",
|
||||
Arrays.asList("a40d3b1dd7bfbb66a52145600f87d744", "e2c89c74debc1c202c56060b77575dff","8e8b55b521aaba1c44169a19e3ff2355")); // Each test checks the md5 of three output files
|
||||
Arrays.asList("d52e4f511c9c00f8c21dffea81c47103", "780920fcfa3009c66ebcf18265edaa75","010023ddb95fac071e0f208e6bb40c61")); // Each test checks the md5 of three output files
|
||||
|
||||
for ( Map.Entry<String, List<String>> entry : e.entrySet() ) {
|
||||
String vcf = entry.getKey();
|
||||
|
|
@ -68,7 +68,6 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
" --ignore_filter HARD_TO_VALIDATE" +
|
||||
" -clusterFile " + clusterFile +
|
||||
" -titv 2.07" +
|
||||
" -qScale 20.0" +
|
||||
" -o %s" +
|
||||
" -tranchesFile %s" +
|
||||
" -reportDatFile %s",
|
||||
|
|
@ -84,8 +83,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testApplyVariantCuts() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "ece0c15c34926fc585e12503f6ce6271" );
|
||||
e.put( validationDataLocation + "lowpass.N3.chr1.raw.vcf", "994b329a35d01e9564f5581cf3d9feac" );
|
||||
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "8e6528fba350e466f5b6c2858bc20556" );
|
||||
e.put( validationDataLocation + "lowpass.N3.chr1.raw.vcf", "df9e37af16610ccd44d375eae2c4479c" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String vcf = entry.getKey();
|
||||
|
|
|
|||
Loading…
Reference in New Issue