Changing the VQSR command line syntax back to the parsed tags approach. This cleans up the code and makes sure we won't be parsing the same rod file multiple times. I've tried to update the appropriate qscripts.

2011-09-12 12:17:43 -04:00 · 2011-09-12 12:17:43 -04:00 · 981b78ea50
parent 60ebe68aff
commit 981b78ea50
5 changed files with 147 additions and 106 deletions
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java
@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2011 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
+
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.commandline.RodBinding;
+import org.broadinstitute.sting.commandline.Tags;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: rpoplin
+ * Date: 3/12/11
+ */
+
+public class TrainingSet {
+
+    public RodBinding<VariantContext> rodBinding;
+    public boolean isKnown = false;
+    public boolean isTraining = false;
+    public boolean isAntiTraining = false;
+    public boolean isTruth = false;
+    public boolean isConsensus = false;
+    public double prior = 0.0;
+
+    protected final static Logger logger = Logger.getLogger(TrainingSet.class);
+
+    public TrainingSet( final RodBinding<VariantContext> rodBinding) {
+        this.rodBinding = rodBinding;
+
+        final Tags tags = rodBinding.getTags();
+        final String name = rodBinding.getName();
+
+        // Parse the tags to decide which tracks have which properties
+        if( tags != null ) {
+            isKnown = tags.containsKey("known") && tags.getValue("known").equals("true");
+            isTraining = tags.containsKey("training") && tags.getValue("training").equals("true");
+            isAntiTraining = tags.containsKey("bad") && tags.getValue("bad").equals("true");
+            isTruth = tags.containsKey("truth") && tags.getValue("truth").equals("true");
+            isConsensus = tags.containsKey("consensus") && tags.getValue("consensus").equals("true");
+            prior = ( tags.containsKey("prior") ? Double.parseDouble(tags.getValue("prior")) : prior );
+        }
+
+        // Report back to the user which tracks were found and the properties that were detected
+        if( !isConsensus && !isAntiTraining ) {
+            logger.info( String.format( "Found %s track: \tKnown = %s \tTraining = %s \tTruth = %s \tPrior = Q%.1f", name, isKnown, isTraining, isTruth, prior) );
+        } else if( isConsensus ) {
+            logger.info( String.format( "Found consensus track: %s", name) );
+        } else {
+            logger.info( String.format( "Found bad sites training track: %s", name) );
+        }
+    }
+}
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java
@ -51,10 +51,10 @@ public class VariantDataManager {
    private ExpandingArrayList<VariantDatum> data;
    private final double[] meanVector;
    private final double[] varianceVector; // this is really the standard deviation
-    public final ArrayList<String> annotationKeys;
+    public final List<String> annotationKeys;
    private final VariantRecalibratorArgumentCollection VRAC;
    protected final static Logger logger = Logger.getLogger(VariantDataManager.class);
-
+    protected final List<TrainingSet> trainingSets;

    public VariantDataManager( final List<String> annotationKeys, final VariantRecalibratorArgumentCollection VRAC ) {
        this.data = null;
@ -62,6 +62,7 @@ public class VariantDataManager {
        this.VRAC = VRAC;
        meanVector = new double[this.annotationKeys.size()];
        varianceVector = new double[this.annotationKeys.size()];
+        trainingSets = new ArrayList<TrainingSet>();
    }

    public void setData( final ExpandingArrayList<VariantDatum> data ) {
@ -104,6 +105,31 @@ public class VariantDataManager {
        }
    }

+     public void addTrainingSet( final TrainingSet trainingSet ) {
+         trainingSets.add( trainingSet );
+     }
+
+     public boolean checkHasTrainingSet() {
+         for( final TrainingSet trainingSet : trainingSets ) {
+             if( trainingSet.isTraining ) { return true; }
+         }
+         return false;
+     }
+
+     public boolean checkHasTruthSet() {
+         for( final TrainingSet trainingSet : trainingSets ) {
+             if( trainingSet.isTruth ) { return true; }
+         }
+         return false;
+     }
+
+     public boolean checkHasKnownSet() {
+         for( final TrainingSet trainingSet : trainingSets ) {
+             if( trainingSet.isKnown ) { return true; }
+         }
+         return false;
+     }
+
    public ExpandingArrayList<VariantDatum> getTrainingData() {
        final ExpandingArrayList<VariantDatum> trainingData = new ExpandingArrayList<VariantDatum>();
        for( final VariantDatum datum : data ) {
@ -232,57 +258,35 @@ public class VariantDataManager {
        return value;
    }

-    public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC, final HashMap<String, Double> rodToPriorMap,
-                                   final List<RodBinding<VariantContext>> training, final List<RodBinding<VariantContext>> truth, final List<RodBinding<VariantContext>> known, final List<RodBinding<VariantContext>> badSites, final List<RodBinding<VariantContext>> resource) {
+    public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC ) {
        datum.isKnown = false;
        datum.atTruthSite = false;
        datum.atTrainingSite = false;
        datum.atAntiTrainingSite = false;
        datum.prior = 2.0;

-        //BUGBUG: need to clean this up
-
-        for( final RodBinding<VariantContext> rod : training ) {
-            for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
+        for( final TrainingSet trainingSet : trainingSets ) {
+            for( final VariantContext trainVC : tracker.getValues(trainingSet.rodBinding, genomeLoc) ) {
                if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
-                    datum.atTrainingSite = true;
-                    datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
+                    datum.isKnown = datum.isKnown || trainingSet.isKnown;
+                    datum.atTruthSite = datum.atTruthSite || trainingSet.isTruth;
+                    datum.atTrainingSite = datum.atTrainingSite || trainingSet.isTraining;
+                    datum.prior = Math.max( datum.prior, trainingSet.prior );
+                    datum.consensusCount += ( trainingSet.isConsensus ? 1 : 0 );
                }
-            }
-        }
-        for( final RodBinding<VariantContext> rod : truth ) {
-            for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
-                if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
-                    datum.atTruthSite = true;
-                    datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
-                }
-            }
-        }
-        for( final RodBinding<VariantContext> rod : known ) {
-            for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
-                if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
-                    datum.isKnown = true;
-                    datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
-                }
-            }
-        }
-        for( final RodBinding<VariantContext> rod : resource ) {
-            for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
-                if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
-                    datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
-                }
-            }
-        }
-        for( final RodBinding<VariantContext> rod : badSites ) {
-            for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
                if( trainVC != null ) {
-                    datum.atAntiTrainingSite = true;
-                    datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
+                    datum.atAntiTrainingSite = datum.atAntiTrainingSite || trainingSet.isAntiTraining;
                }
            }
        }
    }

+    private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) {
+        return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() &&
+                        ((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) &&
+                        (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic());
+    }
+
    public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) {
        for( final VariantDatum datum : data ) {
            RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s",
@ -290,10 +294,4 @@ public class VariantDataManager {
                    (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")));
        }
    }
-
-    private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) {
-        return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() &&
-                        ((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) &&
-                        (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic());
-    }
 }
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
@ -77,16 +77,15 @@ import java.util.*;
 * <p>
 * A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data.  
 *
- * <h2>Examples</h2>
+ * <h2>Example</h2>
 * <pre>
 * java -Xmx4g -jar GenomeAnalysisTK.jar \
 *   -T VariantRecalibrator \
 *   -R reference/human_g1k_v37.fasta \
 *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
- *   -truth:prior=15.0 hapmap_3.3.b37.sites.vcf \
- *   -training:prior=15.0 hapmap_3.3.b37.sites.vcf \
- *   -training:prior=12.0 1000G_omni2.5.b37.sites.vcf \
- *   -known:prior=8.0 dbsnp_132.b37.vcf \
+ *   -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
+ *   -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
+ *   -resource:dbsnp,known=true,training=false,truth=false,prior=8.0 dbsnp_132.b37.vcf \
 *   -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ \
 *   -recalFile path/to/output.recal \
 *   -tranchesFile path/to/output.tranches \
@ -112,34 +111,11 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
    public List<RodBinding<VariantContext>> input;

    /**
-     * Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
-     */
-    @Input(fullName="training", shortName = "training", doc="A list of training variants used to train the Gaussian mixture model", required=true)
-    public List<RodBinding<VariantContext>> training;
-
-    /**
-     * When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
-     * Typically one might want to say I dropped my threshold until I got back 99% of HapMap sites, for example.
-     */
-    @Input(fullName="truth", shortName = "truth", doc="A list of true variants to be used when deciding the truth sensitivity cut of the final callset", required=true)
-    public List<RodBinding<VariantContext>> truth;
-
-    /**
-     * The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
-     * The output metrics are stratified by known status in order to aid in comparisons with other call sets.
-     */
-    @Input(fullName="known", shortName = "known", doc="A list of known variants to be used for metric comparison purposes", required=false)
-    public List<RodBinding<VariantContext>> known = Collections.emptyList();
-
-    /**
-     * In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list
-     * with a database of known bad variants. Maybe these are loci which are frequently filtered out in many projects (centromere, for example).
-     */
-    @Input(fullName="badSites", shortName = "badSites", doc="A list of known bad variants used to supplement training the negative model", required=false)
-    public List<RodBinding<VariantContext>> badSites = Collections.emptyList();
-
-    /**
-     * Any set of sites for which you would like to apply a prior probability but for which you don't want to use as training, truth, or known sites.
+     * Any set of VCF files to use as lists of training, truth, or known sites.
+     * Training - Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
+     * Truth - When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
+     * Known - The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
+     * Bad - In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list with a database of known bad variants.
     */
    @Input(fullName="resource", shortName = "resource", doc="A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm", required=false)
    public List<RodBinding<VariantContext>> resource = Collections.emptyList();
@ -205,7 +181,6 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
    private PrintStream tranchesStream;
    private final Set<String> ignoreInputFilterSet = new TreeSet<String>();
    private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( VRAC );
-    private final HashMap<String, Double> rodToPriorMap = new HashMap<String, Double>();

    //---------------------------------------------------------------------------------------------------------------
    //
@ -227,18 +202,15 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
            throw new UserException.CouldNotCreateOutputFile(TRANCHES_FILE, e);
        }

-        final ArrayList<RodBinding<VariantContext>> allInputBindings = new ArrayList<RodBinding<VariantContext>>();
-        allInputBindings.addAll(truth);
-        allInputBindings.addAll(training);
-        allInputBindings.addAll(known);
-        allInputBindings.addAll(badSites);
-        allInputBindings.addAll(resource);
-        for( final RodBinding<VariantContext> rod : allInputBindings ) {
-            try {
-                rodToPriorMap.put(rod.getName(), (rod.getTags().containsKey("prior") ? Double.parseDouble(rod.getTags().getValue("prior")) : 0.0) );
-            } catch( NumberFormatException e ) {
-                throw new UserException.BadInput("Bad rod binding syntax. Prior key-value tag detected but isn't parsable. Expecting something like -training:prior=12.0 my.set.vcf");
-            }
+        for( RodBinding<VariantContext> rod : resource ) {
+            dataManager.addTrainingSet( new TrainingSet( rod ) );
+        }
+
+        if( !dataManager.checkHasTrainingSet() ) {
+            throw new UserException.CommandLineException( "No training set found! Please provide sets of known polymorphic loci marked with the training=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
+        }
+        if( !dataManager.checkHasTruthSet() ) {
+            throw new UserException.CommandLineException( "No truth set found! Please provide sets of known polymorphic loci marked with the truth=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
        }
    }

@ -270,7 +242,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
                    datum.isTransition = datum.isSNP && VariantContextUtils.isTransition(vc);

                    // Loop through the training data sets and if they overlap this loci then update the prior and training status appropriately
-                    dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC, rodToPriorMap, training, truth, known, badSites, resource ); // BUGBUG: need to clean this up to be a class, not a list of all the rod bindings
+                    dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC );
                    double priorFactor = QualityUtils.qualToProb( datum.prior );
                    //if( PERFORM_PROJECT_CONSENSUS ) { // BUGBUG: need to resurrect this functionality?
                    //    final double consensusPrior = QualityUtils.qualToProb( 1.0 + 5.0 * datum.consensusCount );
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java
@ -41,11 +41,9 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
        //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile);
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-R " + b37KGReference +
-                        " -known:prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" +
-                        " -training:prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" +
-                        " -truth:prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" +
-                        " -training:prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" +
-                        " -truth:prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" +
+                        " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" +
+                        " -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" +
+                        " -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" +
                        " -T VariantRecalibrator" +
                        " -input " + params.inVCF +
                        " -L 20:1,000,000-40,000,000" +
@ -89,9 +87,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
        //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile);
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-R " + b37KGReference +
-                        " -known:prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" +
-                        " -training:prior=15.0 " + comparisonDataLocation + "Validated/Mills_Devine_Indels_2011/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.vcf" +
-                        " -truth:prior=15.0 " + comparisonDataLocation + "Validated/Mills_Devine_Indels_2011/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.vcf" +
+                        " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" +
+                        " -resource:training=true,truth=true,prior=15.0 " + comparisonDataLocation + "Validated/Mills_Devine_Indels_2011/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.vcf" +
                        " -T VariantRecalibrator" +
                        " -input " + params.inVCF +
                        " -L 20:1,000,000-40,000,000" +
--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala
@ -248,12 +248,10 @@ class MethodsDevelopmentCallingPipeline extends QScript {
    this.reference_sequence = t.reference
    this.intervalsString ++= List(t.intervals)
    this.input :+= ( if ( goldStandard ) { t.goldStandard_VCF } else { t.rawVCF } )
-    this.training :+= new TaggedFile( t.hapmapFile, "prior=15.0")
-    this.truth :+= new TaggedFile( t.hapmapFile, "prior=15.0")
-    this.training :+= new TaggedFile( omni_b37, "prior=12.0")
-    this.truth :+= new TaggedFile( omni_b37, "prior=12.0")
-    this.training :+= new TaggedFile( training_1000G, "prior=10.0" )
-    this.known :+= new TaggedFile( t.dbsnpFile, "prior=2.0" )
+    this.resource :+= new TaggedFile( t.hapmapFile, "training=true,truth=true,prior=15.0" )
+    this.resource :+= new TaggedFile( omni_b37, "training=true,truth=true,prior=12.0" )
+    this.resource :+= new TaggedFile( training_1000G, "training=true,prior=10.0" )
+    this.resource :+= new TaggedFile( t.dbsnpFile, "known=true,prior=2.0" )
    this.resource :+= new TaggedFile( projectConsensus_1000G, "prior=8.0" )
    this.use_annotation ++= List("QD", "HaplotypeScore", "MQRankSum", "ReadPosRankSum", "MQ", "FS")
    if(t.nSamples >= 10) {