diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java new file mode 100755 index 000000000..5f688d001 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.variantrecalibration; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: 3/12/11 + */ + +public class TrainingSet { + + public RodBinding rodBinding; + public boolean isKnown = false; + public boolean isTraining = false; + public boolean isAntiTraining = false; + public boolean isTruth = false; + public boolean isConsensus = false; + public double prior = 0.0; + + protected final static Logger logger = Logger.getLogger(TrainingSet.class); + + public TrainingSet( final RodBinding rodBinding) { + this.rodBinding = rodBinding; + + final Tags tags = rodBinding.getTags(); + final String name = rodBinding.getName(); + + // Parse the tags to decide which tracks have which properties + if( tags != null ) { + isKnown = tags.containsKey("known") && tags.getValue("known").equals("true"); + isTraining = tags.containsKey("training") && tags.getValue("training").equals("true"); + isAntiTraining = tags.containsKey("bad") && tags.getValue("bad").equals("true"); + isTruth = tags.containsKey("truth") && tags.getValue("truth").equals("true"); + isConsensus = tags.containsKey("consensus") && tags.getValue("consensus").equals("true"); + prior = ( tags.containsKey("prior") ? Double.parseDouble(tags.getValue("prior")) : prior ); + } + + // Report back to the user which tracks were found and the properties that were detected + if( !isConsensus && !isAntiTraining ) { + logger.info( String.format( "Found %s track: \tKnown = %s \tTraining = %s \tTruth = %s \tPrior = Q%.1f", name, isKnown, isTraining, isTruth, prior) ); + } else if( isConsensus ) { + logger.info( String.format( "Found consensus track: %s", name) ); + } else { + logger.info( String.format( "Found bad sites training track: %s", name) ); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 429becfc7..e04bfab76 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -51,10 +51,10 @@ public class VariantDataManager { private ExpandingArrayList data; private final double[] meanVector; private final double[] varianceVector; // this is really the standard deviation - public final ArrayList annotationKeys; + public final List annotationKeys; private final VariantRecalibratorArgumentCollection VRAC; protected final static Logger logger = Logger.getLogger(VariantDataManager.class); - + protected final List trainingSets; public VariantDataManager( final List annotationKeys, final VariantRecalibratorArgumentCollection VRAC ) { this.data = null; @@ -62,6 +62,7 @@ public class VariantDataManager { this.VRAC = VRAC; meanVector = new double[this.annotationKeys.size()]; varianceVector = new double[this.annotationKeys.size()]; + trainingSets = new ArrayList(); } public void setData( final ExpandingArrayList data ) { @@ -104,6 +105,31 @@ public class VariantDataManager { } } + public void addTrainingSet( final TrainingSet trainingSet ) { + trainingSets.add( trainingSet ); + } + + public boolean checkHasTrainingSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isTraining ) { return true; } + } + return false; + } + + public boolean checkHasTruthSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isTruth ) { return true; } + } + return false; + } + + public boolean checkHasKnownSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isKnown ) { return true; } + } + return false; + } + public ExpandingArrayList getTrainingData() { final ExpandingArrayList trainingData = new ExpandingArrayList(); for( final VariantDatum datum : data ) { @@ -232,57 +258,35 @@ public class VariantDataManager { return value; } - public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC, final HashMap rodToPriorMap, - final List> training, final List> truth, final List> known, final List> badSites, final List> resource) { + public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC ) { datum.isKnown = false; datum.atTruthSite = false; datum.atTrainingSite = false; datum.atAntiTrainingSite = false; datum.prior = 2.0; - //BUGBUG: need to clean this up - - for( final RodBinding rod : training ) { - for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { + for( final TrainingSet trainingSet : trainingSets ) { + for( final VariantContext trainVC : tracker.getValues(trainingSet.rodBinding, genomeLoc) ) { if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { - datum.atTrainingSite = true; - datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); + datum.isKnown = datum.isKnown || trainingSet.isKnown; + datum.atTruthSite = datum.atTruthSite || trainingSet.isTruth; + datum.atTrainingSite = datum.atTrainingSite || trainingSet.isTraining; + datum.prior = Math.max( datum.prior, trainingSet.prior ); + datum.consensusCount += ( trainingSet.isConsensus ? 1 : 0 ); } - } - } - for( final RodBinding rod : truth ) { - for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { - if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { - datum.atTruthSite = true; - datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); - } - } - } - for( final RodBinding rod : known ) { - for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { - if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { - datum.isKnown = true; - datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); - } - } - } - for( final RodBinding rod : resource ) { - for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { - if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { - datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); - } - } - } - for( final RodBinding rod : badSites ) { - for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { if( trainVC != null ) { - datum.atAntiTrainingSite = true; - datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); + datum.atAntiTrainingSite = datum.atAntiTrainingSite || trainingSet.isAntiTraining; } } } } + private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) { + return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && + ((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) && + (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic()); + } + public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) { for( final VariantDatum datum : data ) { RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s", @@ -290,10 +294,4 @@ public class VariantDataManager { (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL"))); } } - - private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) { - return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && - ((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) && - (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic()); - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index df4faebd1..529d17285 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -77,16 +77,15 @@ import java.util.*; *

* A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data. * - *

Examples

+ *

Example

*
  * java -Xmx4g -jar GenomeAnalysisTK.jar \
  *   -T VariantRecalibrator \
  *   -R reference/human_g1k_v37.fasta \
  *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
- *   -truth:prior=15.0 hapmap_3.3.b37.sites.vcf \
- *   -training:prior=15.0 hapmap_3.3.b37.sites.vcf \
- *   -training:prior=12.0 1000G_omni2.5.b37.sites.vcf \
- *   -known:prior=8.0 dbsnp_132.b37.vcf \
+ *   -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
+ *   -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
+ *   -resource:dbsnp,known=true,training=false,truth=false,prior=8.0 dbsnp_132.b37.vcf \
  *   -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ \
  *   -recalFile path/to/output.recal \
  *   -tranchesFile path/to/output.tranches \
@@ -112,34 +111,11 @@ public class VariantRecalibrator extends RodWalker> input;
 
     /**
-     * Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
-     */
-    @Input(fullName="training", shortName = "training", doc="A list of training variants used to train the Gaussian mixture model", required=true)
-    public List> training;
-
-    /**
-     * When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
-     * Typically one might want to say I dropped my threshold until I got back 99% of HapMap sites, for example.
-     */
-    @Input(fullName="truth", shortName = "truth", doc="A list of true variants to be used when deciding the truth sensitivity cut of the final callset", required=true)
-    public List> truth;
-
-    /**
-     * The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
-     * The output metrics are stratified by known status in order to aid in comparisons with other call sets.
-     */
-    @Input(fullName="known", shortName = "known", doc="A list of known variants to be used for metric comparison purposes", required=false)
-    public List> known = Collections.emptyList();
-
-    /**
-     * In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list
-     * with a database of known bad variants. Maybe these are loci which are frequently filtered out in many projects (centromere, for example).
-     */
-    @Input(fullName="badSites", shortName = "badSites", doc="A list of known bad variants used to supplement training the negative model", required=false)
-    public List> badSites = Collections.emptyList();
-
-    /**
-     * Any set of sites for which you would like to apply a prior probability but for which you don't want to use as training, truth, or known sites.
+     * Any set of VCF files to use as lists of training, truth, or known sites.
+     * Training - Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
+     * Truth - When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
+     * Known - The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
+     * Bad - In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list with a database of known bad variants.
      */
     @Input(fullName="resource", shortName = "resource", doc="A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm", required=false)
     public List> resource = Collections.emptyList();
@@ -205,7 +181,6 @@ public class VariantRecalibrator extends RodWalker ignoreInputFilterSet = new TreeSet();
     private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( VRAC );
-    private final HashMap rodToPriorMap = new HashMap();
 
     //---------------------------------------------------------------------------------------------------------------
     //
@@ -227,18 +202,15 @@ public class VariantRecalibrator extends RodWalker> allInputBindings = new ArrayList>();
-        allInputBindings.addAll(truth);
-        allInputBindings.addAll(training);
-        allInputBindings.addAll(known);
-        allInputBindings.addAll(badSites);
-        allInputBindings.addAll(resource);
-        for( final RodBinding rod : allInputBindings ) {
-            try {
-                rodToPriorMap.put(rod.getName(), (rod.getTags().containsKey("prior") ? Double.parseDouble(rod.getTags().getValue("prior")) : 0.0) );
-            } catch( NumberFormatException e ) {
-                throw new UserException.BadInput("Bad rod binding syntax. Prior key-value tag detected but isn't parsable. Expecting something like -training:prior=12.0 my.set.vcf");
-            }
+        for( RodBinding rod : resource ) {
+            dataManager.addTrainingSet( new TrainingSet( rod ) );
+        }
+
+        if( !dataManager.checkHasTrainingSet() ) {
+            throw new UserException.CommandLineException( "No training set found! Please provide sets of known polymorphic loci marked with the training=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
+        }
+        if( !dataManager.checkHasTruthSet() ) {
+            throw new UserException.CommandLineException( "No truth set found! Please provide sets of known polymorphic loci marked with the truth=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
         }
     }
 
@@ -270,7 +242,7 @@ public class VariantRecalibrator extends RodWalker= 10) {