From d170187896bc58200a2e827373046b03c4c260cd Mon Sep 17 00:00:00 2001 From: Matt Hanna Date: Wed, 17 Aug 2011 16:16:05 -0400 Subject: [PATCH 1/9] Disable optimization that increases marginal speed of the GATK slightly but can produce data loss in a narrow corner case where the BGZF block(s) locations and offsets in the last index bucket of contig n overlap exactly with the BGZF block locations and offset in the last index bucket of contig n+1. A proper fix that keeps the optimization has already been introduced into unstable, but disabling the optimization is a low risk way to make sure that users of stable experience no data loss. --- .../gatk/datasources/reads/LowMemoryIntervalSharder.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java index ba6321121..198f7d7d3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java @@ -59,8 +59,8 @@ public class LowMemoryIntervalSharder implements Iterator { */ public FilePointer next() { FilePointer current = wrappedIterator.next(); - while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0) - current = current.combine(parser,wrappedIterator.next()); + //while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0) + // current = current.combine(parser,wrappedIterator.next()); return current; } From c193f52e5de69e8dd837f42aae299a31ed5b016e Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 17 Aug 2011 16:29:45 -0400 Subject: [PATCH 2/9] Fixed up examples: pasting from wiki still had old rod syntax --- .../gatk/walkers/beagle/BeagleOutputToVCFWalker.java | 10 +++++----- .../gatk/walkers/beagle/ProduceBeagleInputWalker.java | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java index aca176bc2..51fe543df 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java @@ -64,11 +64,11 @@ import static java.lang.Math.log10; *
  *     java -Xmx4000m -jar dist/GenomeAnalysisTK.jar \
  *      -R reffile.fasta -T BeagleOutputToVCF \
- *      -B:variant,VCF input_vcf.vcf \
- *      -B:beagleR2,BEAGLE /myrun.beagle_output.r2 \
- *      -B:beaglePhased,BEAGLE /myrun.beagle_output.phased \
- *      -B:beagleProbs,BEAGLE /myrun.beagle_output.gprobs \
- *      --out output_vcf.vcf
+ *      -V input_vcf.vcf \
+ *      -beagleR2:BEAGLE /myrun.beagle_output.r2 \
+ *      -beaglePhased:BEAGLE /myrun.beagle_output.phased \
+ *      -beagleProbs:BEAGLE /myrun.beagle_output.gprobs \
+ *      -o output_vcf.vcf
  *      

Note that Beagle produces some of these files compressed as .gz, so gunzip must be run on them before walker is run in order to decompress them

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index 6ac817555..07793fd7b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -68,7 +68,7 @@ import java.util.*; *
  *     java -Xmx2g -jar dist/GenomeAnalysisTK.jar -L 20 \
  *      -R reffile.fasta -T ProduceBeagleInput \
- *      -B:variant,VCF path_to_input_vcf/inputvcf.vcf -o path_to_beagle_output/beagle_output
+ *      -V path_to_input_vcf/inputvcf.vcf -o path_to_beagle_output/beagle_output
  * 
* */ From 81a792afeb6f6bcdf81437a58417217924aeb61b Mon Sep 17 00:00:00 2001 From: Matt Hanna Date: Wed, 17 Aug 2011 16:58:24 -0400 Subject: [PATCH 3/9] Reverting optimization disable in unstable. --- .../gatk/datasources/reads/LowMemoryIntervalSharder.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java index 198f7d7d3..ba6321121 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java @@ -59,8 +59,8 @@ public class LowMemoryIntervalSharder implements Iterator { */ public FilePointer next() { FilePointer current = wrappedIterator.next(); - //while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0) - // current = current.combine(parser,wrappedIterator.next()); + while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0) + current = current.combine(parser,wrappedIterator.next()); return current; } From 8e83b6646ba9934c8c9475538e984d684b2d37eb Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 17 Aug 2011 21:49:14 -0400 Subject: [PATCH 4/9] Bug fix for Chris: don't validate ref base for complex events. --- .../sting/utils/variantcontext/VariantContext.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index ca3399c78..dacc2d94a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -1080,8 +1080,8 @@ public class VariantContext implements Feature { // to enable tribble intergrati } public void validateReferenceBases(Allele reference, Byte paddedRefBase) { - // don't validate if we're an insertion - if ( !reference.isNull() && !reference.basesMatch(getReference()) ) { + // don't validate if we're an insertion or complex event + if ( !reference.isNull() && getReference().length() == 1 && !reference.basesMatch(getReference()) ) { throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, %s vs. %s", getChr(), getStart(), reference.getBaseString(), getReference().getBaseString())); } From a9df3653642da21106d8c9d86cc88300a8323d1f Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 17 Aug 2011 17:48:04 -0400 Subject: [PATCH 5/9] GenotypeAndValidate walker updated Updated the walker to comply with the new RodBinding system and the new GATKDocs. Will move it to public after writing integration tests. --- settings/helpTemplates/style.css | 48 ++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/settings/helpTemplates/style.css b/settings/helpTemplates/style.css index 1d7bcc576..2abcf0397 100644 --- a/settings/helpTemplates/style.css +++ b/settings/helpTemplates/style.css @@ -95,6 +95,12 @@ dd { padding: 0 0 0.5em 0; } +pre { + border: thin solid lightgray; + margin-left: 1em; + margin-right: 4em; + background-color: #e0fdff; +} /* * clean table layouts */ @@ -128,6 +134,48 @@ dd { } th#row-divider +{ + font-weight: bolder; + font-size: larger; +} + + +/* + * Table design for input/ouptut description + */ + +#description-table +{ + font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; + font-size: 12px; + background: #fff; + margin: 5px; + border-collapse: collapse; + text-align: left; +} +#description-table th +{ + font-size: 16px; + font-weight: bold; + background-color: lightgray; + color: #039; + text-align: center; + padding: 10px 8px; + border-bottom: 2px solid #6678b1; +} +#description-table td +{ + border-bottom: 1px solid #ccc; + color: #669; + padding: 6px 8px; + text-align: right; +} +#description-table tbody tr:hover td +{ + color: #009; +} + +th#row-divider { font-weight: bolder; font-size: larger; From cc3df8f11a454cdabdd139b4e52f70447ce5756d Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 17 Aug 2011 21:54:40 -0400 Subject: [PATCH 6/9] Moving GAV walker to public Walker is updated to the new RodBinding system and has the new GATKDocs layout. --- .../validation/GenotypeAndValidateWalker.java | 438 ++++++++++++++++++ 1 file changed, 438 insertions(+) create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java new file mode 100755 index 000000000..fc23200af --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java @@ -0,0 +1,438 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.validation; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; +import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.MutableVariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.Map; +import java.util.Set; + +import static org.broadinstitute.sting.utils.IndelUtils.isInsideExtendedIndel; + +/** + * Genotypes a dataset and validates the calls of another dataset using the Unified Genotyper. + * + *

+ * Genotype and Validate is a tool to evaluate the quality of a dataset for calling SNPs + * and Indels given a secondary (validation) data source. The data sources are BAM or VCF + * files. You can use them interchangeably (i.e. a BAM to validate calls in a VCF or a VCF + * to validate calls on a BAM). + *

+ * + *

+ * The simplest scenario is when you have a VCF of hand annotated SNPs and Indels, and you + * want to know how well a particular technology performs calling these snps. With a + * dataset (BAM file) generated by the technology in test, and the hand annotated VCF, you + * can run GenotypeAndValidate to asses the accuracy of the calls with the new technology's + * dataset. + *

+ * + *

+ * Another option is to validate the calls on a VCF file, using a deep coverage BAM file + * that you trust the calls on. The GenotypeAndValidate walker will make calls using the + * reads in the BAM file and take them as truth, then compare to the calls in the VCF file + * and produce a truth table. + *

+ * + * + *

Input

+ *

+ * A BAM file to make calls on and a VCF file to use as truth validation dataset. + * + * You also have the option to invert the roles of the files using the command line options listed below. + *

+ * + *

Output

+ *

+ * GenotypeAndValidate has two outputs. The truth table and the optional VCF file. The truth table is a + * 2x2 table correlating what was called in the dataset with the truth of the call (whether it's a true + * positive or a false positive). The table should look like this: + *

+ *
+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
ALTREFPredictive Value
called altTrue Positive (TP)False Positive (FP)Positive PV
called refFalse Negative (FN)True Negative (TN)Negative PV
+ *
+ * + *

+ * The positive predictive value (PPV) is the proportion of subjects with positive test results + * who are correctly diagnosed. + *

+ *

+ * The negative predictive value (NPV) is the proportion of subjects with a negative test result + * who are correctly diagnosed. + *

+ *

+ * The VCF file will contain only the variants that were called or not called, excluding the ones that + * were uncovered or didn't pass the filters. This file is useful if you are trying to compare + * the PPV and NPV of two different technologies on the exact same sites (so you can compare apples to + * apples). + *

+ * + *

+ * Here is an example of an annotated VCF file (info field clipped for clarity) + * + *

+ * #CHROM  POS ID  REF ALT QUAL    FILTER  INFO    FORMAT  NA12878
+ * 1   20568807    .   C   T   0    HapMapHet        AC=1;AF=0.50;AN=2;DP=0;GV=T  GT  0/1
+ * 1   22359922    .   T   C   282  WG-CG-HiSeq      AC=2;AF=0.50;GV=T;AN=4;DP=42 GT:AD:DP:GL:GQ  1/0 ./. 0/1:20,22:39:-72.79,-11.75,-67.94:99    ./.
+ * 13  102391461   .   G   A   341  Indel;SnpCluster AC=1;GV=F;AF=0.50;AN=2;DP=45 GT:AD:DP:GL:GQ  ./. ./. 0/1:32,13:45:-50.99,-13.56,-112.17:99   ./.
+ * 1   175516757   .   C   G   655  SnpCluster,WG    AC=1;AF=0.50;AN=2;GV=F;DP=74 GT:AD:DP:GL:GQ  ./. ./. 0/1:52,22:67:-89.02,-20.20,-191.27:99   ./.
+ * 
+ * + *

+ * + *

Additional Details

+ *
    + *
  • + * You should always use -BTI on your VCF track, so that the GATK only looks at the sites on the VCF file. + * This speeds up the process a lot. + *
  • + *
  • + * The total number of visited bases may be greater than the number of variants in the original + * VCF file because of extended indels, as they trigger one call per new insertion or deletion. + * (i.e. ACTG/- will count as 4 genotyper calls, but it's only one line in the VCF). + *
  • + *
+ * + *

Examples

+ *
    + *
  1. + * Genotypes BAM file from new technology using the VCF as a truth dataset: + *
  2. + * + *
    + *  java
    + *      -jar /GenomeAnalysisTK.jar
    + *      -T  GenotypeAndValidate
    + *      -R human_g1k_v37.fasta
    + *      -I myNewTechReads.bam
    + *      -alleles handAnnotatedVCF.vcf
    + *      -BTI alleles
    + * 
    + * + *
  3. + * Using a BAM file as the truth dataset: + *
  4. + * + *
    + *  java
    + *      -jar /GenomeAnalysisTK.jar
    + *      -T  GenotypeAndValidate
    + *      -R human_g1k_v37.fasta
    + *      -I myTruthDataset.bam
    + *      -alleles callsToValidate.vcf
    + *      -BTI alleles
    + *      -bt
    + *      -o gav.vcf
    + * 
    + * + * + * @author Mauricio Carneiro + * @since ${DATE} + */ + +@Requires(value={DataSource.READS, DataSource.REFERENCE}) +@Allows(value={DataSource.READS, DataSource.REFERENCE}) + +@By(DataSource.REFERENCE) +@Reference(window=@Window(start=-200,stop=200)) + + +public class GenotypeAndValidateWalker extends RodWalker implements TreeReducible { + + @Output(doc="Generate a VCF file with the variants considered by the walker, with a new annotation \"callStatus\" which will carry the value called in the validation VCF or BAM file", required=false) + protected VCFWriter vcfWriter = null; + + @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype", required=true) + public RodBinding alleles; + + @Argument(fullName ="set_bam_truth", shortName ="bt", doc="Use the calls on the reads (bam file) as the truth dataset and validate the calls on the VCF", required=false) + private boolean bamIsTruth = false; + + @Argument(fullName="minimum_base_quality_score", shortName="mbq", doc="Minimum base quality score for calling a genotype", required=false) + private int mbq = -1; + + @Argument(fullName="maximum_deletion_fraction", shortName="deletions", doc="Maximum deletion fraction for calling a genotype", required=false) + private double deletions = -1; + + @Argument(fullName="standard_min_confidence_threshold_for_calling", shortName="stand_call_conf", doc="the minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls", required=false) + private double callConf = -1; + + @Argument(fullName="standard_min_confidence_threshold_for_emitting", shortName="stand_emit_conf", doc="the minimum phred-scaled Qscore threshold to emit low confidence calls", required=false) + private double emitConf = -1; + + @Argument(fullName="condition_on_depth", shortName="depth", doc="Condition validation on a minimum depth of coverage by the reads", required=false) + private int minDepth = -1; + + @Argument(fullName ="sample", shortName ="sn", doc="Name of the sample to validate (in case your VCF/BAM has more than one sample)", required=false) + private String sample = ""; + + private UnifiedGenotyperEngine snpEngine; + private UnifiedGenotyperEngine indelEngine; + + public static class CountedData { + private long nAltCalledAlt = 0L; + private long nAltCalledRef = 0L; + private long nRefCalledAlt = 0L; + private long nRefCalledRef = 0L; + private long nNotConfidentCalls = 0L; + private long nUncovered = 0L; + + /** + * Adds the values of other to this, returning this + * @param other the other object + */ + public void add(CountedData other) { + nAltCalledAlt += other.nAltCalledAlt; + nAltCalledRef += other.nAltCalledRef; + nRefCalledAlt += other.nRefCalledAlt; + nRefCalledRef += other.nRefCalledRef; + nUncovered += other.nUncovered; + nNotConfidentCalls += other.nNotConfidentCalls; + } + } + + + + //--------------------------------------------------------------------------------------------------------------- + // + // initialize + // + //--------------------------------------------------------------------------------------------------------------- + + public void initialize() { + + // Initialize VCF header + if (vcfWriter != null) { + Map header = VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), alleles.getName()); + Set samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + Set headerLines = VCFUtils.smartMergeHeaders(header.values(), logger); + headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate")); + vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); + } + + // Filling in SNP calling arguments for UG + UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); + uac.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES; + uac.alleles = alleles; + if (!bamIsTruth) uac.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; + if (mbq >= 0) uac.MIN_BASE_QUALTY_SCORE = mbq; + if (deletions >= 0) uac.MAX_DELETION_FRACTION = deletions; + if (emitConf >= 0) uac.STANDARD_CONFIDENCE_FOR_EMITTING = emitConf; + if (callConf >= 0) uac.STANDARD_CONFIDENCE_FOR_CALLING = callConf; + + uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; + snpEngine = new UnifiedGenotyperEngine(getToolkit(), uac); + + // Adding the INDEL calling arguments for UG + uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.INDEL; + indelEngine = new UnifiedGenotyperEngine(getToolkit(), uac); + + // make sure we have callConf set to the threshold set by the UAC so we can use it later. + callConf = uac.STANDARD_CONFIDENCE_FOR_CALLING; + } + + //--------------------------------------------------------------------------------------------------------------- + // + // map + // + //--------------------------------------------------------------------------------------------------------------- + + public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { + + final CountedData counter = new CountedData(); + + // For some reason RodWalkers get map calls with null trackers + if( tracker == null ) + return counter; + + VariantContext vcComp = tracker.getFirstValue(alleles); + if( vcComp == null ) + return counter; + + //todo - not sure I want this, may be misleading to filter extended indel events. + if (isInsideExtendedIndel(vcComp, ref)) + return counter; + + // Do not operate on variants that are not covered to the optional minimum depth + if (!context.hasReads() || (minDepth > 0 && context.getBasePileup().getBases().length < minDepth)) { + counter.nUncovered = 1L; + return counter; + } + + VariantCallContext call; + if ( vcComp.isSNP() ) + call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context); + else if ( vcComp.isIndel() ) { + call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context); + } + else { + logger.info("Not SNP or INDEL " + vcComp.getChr() + ":" + vcComp.getStart() + " " + vcComp.getAlleles()); + return counter; + } + + + boolean writeVariant = true; + + if (bamIsTruth) { + if (call.confidentlyCalled) { + // If truth is a confident REF call + if (call.isVariant()) { + if (vcComp.isVariant()) + counter.nAltCalledAlt = 1L; // todo -- may wanna check if the alts called are the same? + else + counter.nAltCalledRef = 1L; + } + // If truth is a confident ALT call + else { + if (vcComp.isVariant()) + counter.nRefCalledAlt = 1L; + else + counter.nRefCalledRef = 1L; + } + } + else { + counter.nNotConfidentCalls = 1L; + writeVariant = false; + } + } + else { + if (!vcComp.hasAttribute("GV")) + throw new UserException.BadInput("Variant has no GV annotation in the INFO field. " + vcComp.getChr() + ":" + vcComp.getStart()); + + + + if (call.isCalledAlt(callConf)) { + if (vcComp.getAttribute("GV").equals("T")) + counter.nAltCalledAlt = 1L; + else + counter.nRefCalledAlt = 1L; + } + else if (call.isCalledRef(callConf)) { + if (vcComp.getAttribute("GV").equals("T")) + counter.nAltCalledRef = 1L; + else + counter.nRefCalledRef = 1L; + } + else { + counter.nNotConfidentCalls = 1L; + writeVariant = false; + } + } + + if (vcfWriter != null && writeVariant) { + if (!vcComp.hasAttribute("callStatus")) { + MutableVariantContext mvc = new MutableVariantContext(vcComp); + mvc.putAttribute("callStatus", call.isCalledAlt(callConf) ? "ALT" : "REF" ); + vcfWriter.add(mvc); + } + else + vcfWriter.add(vcComp); + } + return counter; + } + + //--------------------------------------------------------------------------------------------------------------- + // + // reduce + // + //--------------------------------------------------------------------------------------------------------------- + + public CountedData reduceInit() { + return new CountedData(); + } + + public CountedData treeReduce( final CountedData sum1, final CountedData sum2) { + sum2.add(sum1); + return sum2; + } + + public CountedData reduce( final CountedData mapValue, final CountedData reduceSum ) { + reduceSum.add(mapValue); + return reduceSum; + } + + public void onTraversalDone( CountedData reduceSum ) { + double ppv = 100 * ((double) reduceSum.nAltCalledAlt /( reduceSum.nAltCalledAlt + reduceSum.nRefCalledAlt)); + double npv = 100 * ((double) reduceSum.nRefCalledRef /( reduceSum.nRefCalledRef + reduceSum.nAltCalledRef)); + double sensitivity = 100 * ((double) reduceSum.nAltCalledAlt /( reduceSum.nAltCalledAlt + reduceSum.nAltCalledRef)); + double specificity = (reduceSum.nRefCalledRef + reduceSum.nRefCalledAlt > 0) ? 100 * ((double) reduceSum.nRefCalledRef /( reduceSum.nRefCalledRef + reduceSum.nRefCalledAlt)) : 100; + logger.info(String.format("Resulting Truth Table Output\n\n" + + "---------------------------------------------------\n" + + "\t\t|\tALT\t|\tREF\t\n" + + "---------------------------------------------------\n" + + "called alt\t|\t%d\t|\t%d\n" + + "called ref\t|\t%d\t|\t%d\n" + + "---------------------------------------------------\n" + + "positive predictive value: %f%%\n" + + "negative predictive value: %f%%\n" + + "---------------------------------------------------\n" + + "sensitivity: %f%%\n" + + "specificity: %f%%\n" + + "---------------------------------------------------\n" + + "not confident: %d\n" + + "not covered: %d\n" + + "---------------------------------------------------\n", reduceSum.nAltCalledAlt, reduceSum.nRefCalledAlt, reduceSum.nAltCalledRef, reduceSum.nRefCalledRef, ppv, npv, sensitivity, specificity, reduceSum.nNotConfidentCalls, reduceSum.nUncovered)); + } +} From a7b70e6bb46c10c3a00bcedfcf9cf916321e9d3b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 17 Aug 2011 22:28:22 -0400 Subject: [PATCH 7/9] Adding feature for Khalid: ability to exclude particular samples. --- .../walkers/variantutils/SelectVariants.java | 39 ++++++++++++++----- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index b52863c8f..f6b6a8d65 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -191,16 +191,28 @@ public class SelectVariants extends RodWalker { protected VCFWriter vcfWriter = null; @Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample. Can be specified multiple times", required=false) - public Set sampleNames; + public Set sampleNames = new HashSet(0); @Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times", required=false) - public Set sampleExpressions; + public Set sampleExpressions ; @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false) public Set sampleFiles; /** - * Note that thse expressions are evaluated *after* the specified samples are extracted and the INFO field annotations are updated. + * Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded. + */ + @Argument(fullName="exclude_sample_name", shortName="xl_sn", doc="Exclude genotypes from this sample. Can be specified multiple times", required=false) + public Set XLsampleNames = new HashSet(0); + + /** + * Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded. + */ + @Argument(fullName="exclude_sample_file", shortName="xl_sf", doc="File containing a list of samples (one per line) to exclude. Can be specified multiple times", required=false) + public Set XLsampleFiles; + + /** + * Note that these expressions are evaluated *after* the specified samples are extracted and the INFO field annotations are updated. */ @Argument(shortName="select", doc="One or more criteria to use when selecting the data", required=false) public ArrayList SELECT_EXPRESSIONS = new ArrayList(); @@ -304,8 +316,7 @@ public class SelectVariants extends RodWalker { private ArrayList afBoosts = null; double bkDelta = 0.0; - - private PrintStream outMVFileStream = null; + private PrintStream outMVFileStream = null; /** @@ -321,19 +332,27 @@ public class SelectVariants extends RodWalker { Collection samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); Collection samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions); + // first, add any requested samples samples.addAll(samplesFromFile); samples.addAll(samplesFromExpressions); - if (sampleNames != null) - samples.addAll(sampleNames); + samples.addAll(sampleNames); - if(samples.isEmpty()) { + // if none were requested, we want all of them + if ( samples.isEmpty() ) { samples.addAll(vcfSamples); NO_SAMPLES_SPECIFIED = true; } - for (String sample : samples) { + // now, exclude any requested samples + Collection XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles); + samples.removeAll(XLsamplesFromFile); + samples.removeAll(XLsampleNames); + + if ( samples.size() == 0 ) + throw new UserException("All samples requested to be included were also requested to be excluded."); + + for ( String sample : samples ) logger.info("Including sample '" + sample + "'"); - } // Initialize VCF header Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); From b75a1807e3242fd3dda158aba33362f826fb9041 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 17 Aug 2011 22:40:09 -0400 Subject: [PATCH 8/9] Adding integration test to cover sample exclusion --- .../SelectVariantsIntegrationTest.java | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index bec0d5dd4..cec377f5f 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -7,7 +7,7 @@ import java.util.Arrays; public class SelectVariantsIntegrationTest extends WalkerTest { public static String baseTestString(String args) { - return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s" + args; + return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s -NO_HEADER" + args; } @Test @@ -16,7 +16,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant:VCF3 " + testfile + " -NO_HEADER"), + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant:VCF3 " + testfile), 1, Arrays.asList("d18516c1963802e92cb9e425c0b75fd6") ); @@ -24,12 +24,26 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testComplexSelection--" + testfile, spec); } + @Test + public void testSampleExclusion() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s -NO_HEADER -xl_sn A -xl_sf " + samplesFile + " --variant:VCF3 " + testfile, + 1, + Arrays.asList("730f021fd6ecf1d195dabbee2e233bfd") + ); + + executeTest("testSampleExclusion--" + testfile, spec); + } + @Test public void testRepeatedLineSelection() { String testfile = validationDataLocation + "test.dup.vcf"; WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -sn A -sn B -sn C --variant:VCF3 " + testfile + " -NO_HEADER"), + baseTestString(" -sn A -sn B -sn C --variant:VCF3 " + testfile), 1, Arrays.asList("b74038779fe6485dbb8734ae48178356") );