Updating VQSR for new RodBinding syntax. Cleaning up indel-specific parts of VQSR.

This commit is contained in:
Ryan Poplin 2011-08-10 10:20:37 -04:00
parent 9e53fd6880
commit c60cf52f73
4 changed files with 129 additions and 103 deletions

View File

@ -28,9 +28,9 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.SampleUtils;
@ -56,6 +56,11 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
/////////////////////////////
// Inputs
/////////////////////////////
/**
* The raw input variants to be recalibrated.
*/
@Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true)
public List<RodBinding<VariantContext>> input;
@Input(fullName="recal_file", shortName="recalFile", doc="The output recal file used by ApplyRecalibration", required=true)
private File RECAL_FILE;
@Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true)
@ -101,17 +106,8 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
}
Collections.reverse(tranches); // this algorithm wants the tranches ordered from best (lowest truth sensitivity) to worst (highest truth sensitivity)
for( final ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
if( d.getName().startsWith("input") ) {
inputNames.add(d.getName());
logger.info("Found input variant track with name " + d.getName());
} else {
logger.info("Not evaluating ROD binding " + d.getName());
}
}
if( inputNames.size() == 0 ) {
throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." );
for( final RodBinding rod : input ) {
inputNames.add( rod.getName() );
}
if( IGNORE_INPUT_FILTERS != null ) {
@ -168,7 +164,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
return 1;
}
for( VariantContext vc : tracker.getValues(VariantContext.class, inputNames, context.getLocation()) ) {
for( VariantContext vc : tracker.getValues(input, context.getLocation()) ) {
if( vc != null ) {
if( VariantRecalibrator.checkRecalibrationMode( vc, MODE ) && (vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
String filterString = null;

View File

@ -26,10 +26,10 @@
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
/**
@ -105,31 +106,6 @@ public class VariantDataManager {
}
}
public void addTrainingSet( final TrainingSet trainingSet ) {
trainingSets.add( trainingSet );
}
public boolean checkHasTrainingSet() {
for( final TrainingSet trainingSet : trainingSets ) {
if( trainingSet.isTraining ) { return true; }
}
return false;
}
public boolean checkHasTruthSet() {
for( final TrainingSet trainingSet : trainingSets ) {
if( trainingSet.isTruth ) { return true; }
}
return false;
}
public boolean checkHasKnownSet() {
for( final TrainingSet trainingSet : trainingSets ) {
if( trainingSet.isKnown ) { return true; }
}
return false;
}
public ExpandingArrayList<VariantDatum> getTrainingData() {
final ExpandingArrayList<VariantDatum> trainingData = new ExpandingArrayList<VariantDatum>();
for( final VariantDatum datum : data ) {
@ -240,13 +216,14 @@ public class VariantDataManager {
if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM
value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble();
}
if (vc.isIndel() && annotationKey.equalsIgnoreCase("QD")) {
// normalize QD by event length for indel case
int eventLength = Math.abs(vc.getAlternateAllele(0).getBaseString().length() - vc.getReference().getBaseString().length()); // ignore multi-allelic complication here for now
if (eventLength > 0) // sanity check
value /= (double)eventLength;
}
if (vc.isIndel() && annotationKey.equalsIgnoreCase("QD")) {
// normalize QD by event length for indel case
int eventLength = Math.abs(vc.getAlternateAllele(0).getBaseString().length() - vc.getReference().getBaseString().length()); // ignore multi-allelic complication here for now
if (eventLength > 0) { // sanity check
value /= (double)eventLength;
}
}
if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
@ -257,30 +234,44 @@ public class VariantDataManager {
return value;
}
public void parseTrainingSets( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC ) {
public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC, final HashMap<String, Double> rodToPriorMap,
final List<RodBinding<VariantContext>> training, final List<RodBinding<VariantContext>> truth, final List<RodBinding<VariantContext>> known, final List<RodBinding<VariantContext>> badSites) {
datum.isKnown = false;
datum.atTruthSite = false;
datum.atTrainingSite = false;
datum.atAntiTrainingSite = false;
datum.prior = 2.0;
datum.consensusCount = 0;
for( final TrainingSet trainingSet : trainingSets ) {
for( final VariantContext trainVC : tracker.getValues(VariantContext.class, trainingSet.name, ref.getLocus()) ) {
if( trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() &&
((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) &&
(TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic()) ) {
datum.isKnown = datum.isKnown || trainingSet.isKnown;
datum.atTruthSite = datum.atTruthSite || trainingSet.isTruth;
datum.atTrainingSite = datum.atTrainingSite || trainingSet.isTraining;
datum.prior = Math.max( datum.prior, trainingSet.prior );
datum.consensusCount += ( trainingSet.isConsensus ? 1 : 0 );
for( final RodBinding<VariantContext> rod : training ) {
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
datum.atTrainingSite = true;
datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
}
}
}
for( final RodBinding<VariantContext> rod : truth ) {
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
datum.atTruthSite = true;
datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
}
}
}
for( final RodBinding<VariantContext> rod : known ) {
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
datum.isKnown = true;
datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
}
}
}
for( final RodBinding<VariantContext> rod : badSites ) {
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
if( trainVC != null ) {
datum.atAntiTrainingSite = datum.atAntiTrainingSite || trainingSet.isAntiTraining;
datum.atAntiTrainingSite = true;
datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
}
}
}
}
@ -292,4 +283,10 @@ public class VariantDataManager {
(datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")));
}
}
private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) {
return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() &&
((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) &&
(TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic());
}
}

View File

@ -25,13 +25,9 @@
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
@ -57,11 +53,51 @@ import java.util.*;
public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> implements TreeReducible<ExpandingArrayList<VariantDatum>> {
public static final String VQS_LOD_KEY = "VQSLOD";
public static final String CULPRIT_KEY = "culprit";
public static final String VQS_LOD_KEY = "VQSLOD"; // Log odds ratio of being a true variant versus being false under the trained gaussian mixture model
public static final String CULPRIT_KEY = "culprit"; // The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out
@ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection();
/////////////////////////////
// Inputs
/////////////////////////////
/**
* The raw input variants to be recalibrated.
*/
@Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true)
public List<RodBinding<VariantContext>> input;
/**
* A list of training variants used to train the Gaussian mixture model.
*
* Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
*/
@Input(fullName="training", shortName = "training", doc="A list of training variants used to train the Gaussian mixture model", required=true)
public List<RodBinding<VariantContext>> training;
/**
* A list of true variants to be used when deciding the truth sensitivity cut of the final callset.
*
* When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
* Typically one might want to say I dropped my threshold until I got back 99% of HapMap sites, for example.
*/
@Input(fullName="truth", shortName = "truth", doc="A list of true variants to be used when deciding the truth sensitivity cut of the final callset", required=true)
public List<RodBinding<VariantContext>> truth;
/**
* A list of known variants to be used for metric comparison purposes.
*
* The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
* The output metrics are stratified by known status in order to aid in comparisons with other call sets.
*/
@Input(fullName="known", shortName = "known", doc="A list of known variants to be used for metric comparison purposes", required=false)
public List<RodBinding<VariantContext>> known = Collections.emptyList();
/**
* A list of known bad variants used to supplement training the negative model.
*
* In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list
* with a database of known bad variants. Maybe these are loci which are frequently filtered out in many projects (centromere, for example).
*/
@Input(fullName="badSites", shortName = "badSites", doc="A list of known bad variants used to supplement training the negative model", required=false)
public List<RodBinding<VariantContext>> badSites = Collections.emptyList();
/////////////////////////////
// Outputs
/////////////////////////////
@ -96,9 +132,9 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
@Hidden
@Argument(fullName = "trustAllPolymorphic", shortName = "allPoly", doc = "Trust that all the input training sets' unfiltered records contain only polymorphic sites to drastically speed up the computation.", required = false)
protected Boolean TRUST_ALL_POLYMORPHIC = false;
@Hidden
@Argument(fullName = "projectConsensus", shortName = "projectConsensus", doc = "Perform 1000G project consensus. This implies an extra prior factor based on the individual participant callsets passed in with consensus=true rod binding tags.", required = false)
protected Boolean PERFORM_PROJECT_CONSENSUS = false;
//@Hidden
//@Argument(fullName = "projectConsensus", shortName = "projectConsensus", doc = "Perform 1000G project consensus. This implies an extra prior factor based on the individual participant callsets passed in with consensus=true rod binding tags.", required = false)
//protected Boolean PERFORM_PROJECT_CONSENSUS = false;
/////////////////////////////
// Private Member Variables
@ -106,8 +142,8 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
private VariantDataManager dataManager;
private PrintStream tranchesStream;
private final Set<String> ignoreInputFilterSet = new TreeSet<String>();
private final Set<String> inputNames = new HashSet<String>();
private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( VRAC );
private final HashMap<String, Double> rodToPriorMap = new HashMap<String, Double>();
//---------------------------------------------------------------------------------------------------------------
//
@ -123,31 +159,24 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
ignoreInputFilterSet.addAll( Arrays.asList(IGNORE_INPUT_FILTERS) );
}
for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
if( d.getName().toLowerCase().startsWith("input") ) {
inputNames.add(d.getName());
logger.info( "Found input variant track with name " + d.getName() );
} else {
dataManager.addTrainingSet( new TrainingSet(d.getName(), d.getTags()) );
}
}
if( !dataManager.checkHasTrainingSet() ) {
throw new UserException.CommandLineException( "No training set found! Please provide sets of known polymorphic loci marked with the training=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
}
if( !dataManager.checkHasTruthSet() ) {
throw new UserException.CommandLineException( "No truth set found! Please provide sets of known polymorphic loci marked with the truth=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
}
if( inputNames.size() == 0 ) {
throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." );
}
try {
tranchesStream = new PrintStream(TRANCHES_FILE);
} catch (FileNotFoundException e) {
throw new UserException.CouldNotCreateOutputFile(TRANCHES_FILE, e);
}
final ArrayList<RodBinding<VariantContext>> allInputBindings = new ArrayList<RodBinding<VariantContext>>();
allInputBindings.addAll(truth);
allInputBindings.addAll(training);
allInputBindings.addAll(known);
allInputBindings.addAll(badSites);
for( final RodBinding<VariantContext> rod : allInputBindings ) {
try {
rodToPriorMap.put(rod.getName(), (rod.getTags().containsKey("prior") ? Double.parseDouble(rod.getTags().getValue("prior")) : 0.0) );
} catch( NumberFormatException e ) {
throw new UserException.BadInput("Bad rod binding syntax. Prior key-value tag detected but isn't parsable. Expecting something like -training:prior=12.0 my.set.vcf");
}
}
}
//---------------------------------------------------------------------------------------------------------------
@ -163,10 +192,12 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
return mapList;
}
for( final VariantContext vc : tracker.getValues(VariantContext.class, inputNames, context.getLocation()) ) {
for( final VariantContext vc : tracker.getValues(input, context.getLocation()) ) {
if( vc != null && ( vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters()) ) ) {
if( checkRecalibrationMode( vc, VRAC.MODE ) ) {
final VariantDatum datum = new VariantDatum();
// Populate the datum with lots of fields from the VariantContext, unfortunately the VC is too big so we just pull in only the things we absolutely need.
dataManager.decodeAnnotations( datum, vc, true ); //BUGBUG: when run with HierarchicalMicroScheduler this is non-deterministic because order of calls depends on load of machine
datum.contig = vc.getChr();
datum.start = vc.getStart();
@ -176,12 +207,12 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
datum.isTransition = datum.isSNP && VariantContextUtils.isTransition(vc);
// Loop through the training data sets and if they overlap this loci then update the prior and training status appropriately
dataManager.parseTrainingSets( tracker, ref, context, vc, datum, TRUST_ALL_POLYMORPHIC );
dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC, rodToPriorMap, training, truth, known, badSites );
double priorFactor = QualityUtils.qualToProb( datum.prior );
if( PERFORM_PROJECT_CONSENSUS ) {
final double consensusPrior = QualityUtils.qualToProb( 1.0 + 5.0 * datum.consensusCount );
priorFactor = 1.0 - ((1.0 - priorFactor) * (1.0 - consensusPrior));
}
//if( PERFORM_PROJECT_CONSENSUS ) {
// final double consensusPrior = QualityUtils.qualToProb( 1.0 + 5.0 * datum.consensusCount );
// priorFactor = 1.0 - ((1.0 - priorFactor) * (1.0 - consensusPrior));
//}
datum.prior = Math.log10( priorFactor ) - Math.log10( 1.0 - priorFactor );
mapList.add( datum );

View File

@ -41,11 +41,13 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
//System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile);
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b37KGReference +
" -B:dbsnp,VCF,known=true,training=false,truth=false,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" +
" -B:hapmap,VCF,known=false,training=true,truth=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" +
" -B:omni,VCF,known=false,training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" +
" -known:prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" +
" -training:prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" +
" -truth:prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" +
" -training:prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" +
" -truth:prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" +
" -T VariantRecalibrator" +
" -B:input,VCF " + params.inVCF +
" -input " + params.inVCF +
" -L 20:1,000,000-40,000,000" +
" -an QD -an HaplotypeScore -an HRun" +
" -percentBad 0.07" +
@ -64,7 +66,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
" -T ApplyRecalibration" +
" -L 20:12,000,000-30,000,000" +
" -NO_HEADER" +
" -B:input,VCF " + params.inVCF +
" -input " + params.inVCF +
" -o %s" +
" -tranchesFile " + MD5DB.getMD5FilePath(params.tranchesMD5, null) +
" -recalFile " + MD5DB.getMD5FilePath(params.recalMD5, null),