From db645a94ca0e5f533402284367b5ce042de61ad7 Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Wed, 25 Jan 2012 16:10:59 -0500 Subject: [PATCH 3/3] Added options to make the batch-merger more all-inclusive: keep all indels, SNPs (even filtered ones) but maintain their annotations. Also, VariantContextUtils.simpleMerge can now merge variants of all types using the Hidden non-default enum MultipleAllelesMergeType=MIX_TYPES --- .../genotyper/ExactAFCalculationModel.java | 5 +- .../walkers/genotyper/UGCalcLikelihoods.java | 114 ------------- .../walkers/genotyper/UGCallVariants.java | 152 ------------------ .../walkers/variantutils/CombineVariants.java | 29 +++- .../variantcontext/VariantContextUtils.java | 13 ++ 5 files changed, 38 insertions(+), 275 deletions(-) delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 1594c92cb..a91928bc3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; @@ -39,6 +38,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 + // TODO: PERMITS WALKER USED TO HAVE A TEMPORARY FIX to prevent NullPointerException caused by bug: + public static boolean PRESERVE_AC_DATA = false; protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { super(UAC, N, logger, verboseWriter); @@ -51,7 +52,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final int numAlleles = alleles.size(); //linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); - linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, result, false); + linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, result, PRESERVE_AC_DATA); } private static final ArrayList getGLs(GenotypesContext GLs) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java deleted file mode 100755 index c7e577393..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.DownsampleType; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.HashSet; -import java.util.Set; - - -/** - * Uses the UG engine to determine per-sample genotype likelihoods and emits them as a VCF (using PLs). - * Absolutely not supported or recommended for public use. - * Run this as you would the UnifiedGenotyper, except that you must additionally pass in a VCF bound to - * the name 'allele' so we know which alternate allele to use at each site. - */ -@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_INPUT) -@Reference(window=@Window(start=-200,stop=200)) -@By(DataSource.READS) -@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250) -public class UGCalcLikelihoods extends LocusWalker implements TreeReducible { - - @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); - - // control the output - @Output(doc="File to which variants should be written",required=true) - protected VCFWriter writer = null; - - // the calculation arguments - private UnifiedGenotyperEngine UG_engine = null; - - // enable deletions in the pileup - public boolean includeReadsWithDeletionAtLoci() { return true; } - - // enable extended events for indels - public boolean generateExtendedEvents() { return UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP; } - - public void initialize() { - // get all of the unique sample names - Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); - - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples); - - // initialize the header - Set headerInfo = new HashSet(); - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic")); - - writer.writeHeader(new VCFHeader(headerInfo, samples)) ; - } - - public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { - VariantContext call = UG_engine.calculateLikelihoods(tracker, refContext, rawContext); - return call == null ? null : new VariantCallContext(call, true); - } - - public Integer reduceInit() { return 0; } - - public Integer treeReduce(Integer lhs, Integer rhs) { - return lhs + rhs; - } - - public Integer reduce(VariantCallContext value, Integer sum) { - if ( value == null ) - return sum; - - try { - writer.add(value); - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); - } - - return sum + 1; - } - - public void onTraversalDone(Integer sum) { - logger.info(String.format("Visited bases: %d", sum)); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java deleted file mode 100755 index 97f7b21eb..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.*; - -import java.util.*; - -/** - * Uses the UG engine to call variants based off of VCFs annotated with GLs (or PLs). - * Absolutely not supported or recommended for public use. - * Run this as you would the UnifiedGenotyper, except that instead of '-I reads' it expects any number - * of GL/PL-annotated VCFs bound to a name starting with 'variant'. - */ -public class UGCallVariants extends RodWalker { - - @ArgumentCollection - private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); - - @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) - public List> variants; - - // control the output - @Output(doc="File to which variants should be written",required=true) - protected VCFWriter writer = null; - - // the calculation arguments - private UnifiedGenotyperEngine UG_engine = null; - - // variant track names - private Set trackNames = new HashSet(); - - public void initialize() { - - for ( RodBinding rb : variants ) - trackNames.add(rb.getName()); - Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), trackNames); - - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples); - - Set headerInfo = new HashSet(); - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, -1, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed")); - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, -1, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed")); - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic")); - if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING ) - headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality")); - - // initialize the header - writer.writeHeader(new VCFHeader(headerInfo, samples)); - } - - public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return null; - - List VCs = tracker.getValues(variants, context.getLocation()); - - VariantContext mergedVC = mergeVCsWithGLs(VCs); - if ( mergedVC == null ) - return null; - - return UG_engine.calculateGenotypes(tracker, ref, context, mergedVC); - } - - public Integer reduceInit() { return 0; } - - public Integer reduce(VariantCallContext value, Integer sum) { - if ( value == null ) - return sum; - - try { - VariantContextBuilder builder = new VariantContextBuilder(value); - VariantContextUtils.calculateChromosomeCounts(builder, true); - writer.add(builder.make()); - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); - } - - return sum + 1; - } - - public void onTraversalDone(Integer result) { - logger.info(String.format("Visited sites: %d", result)); - } - - private static VariantContext mergeVCsWithGLs(List VCs) { - // we can't use the VCUtils classes because our VCs can all be no-calls - if ( VCs.size() == 0 ) - return null; - - VariantContext variantVC = null; - GenotypesContext genotypes = GenotypesContext.create(); - for ( VariantContext vc : VCs ) { - if ( variantVC == null && vc.isVariant() ) - variantVC = vc; - genotypes.addAll(getGenotypesWithGLs(vc.getGenotypes())); - } - - if ( variantVC == null ) { - VariantContext vc = VCs.get(0); - throw new UserException("There is no ALT allele in any of the VCF records passed in at " + vc.getChr() + ":" + vc.getStart()); - } - - return new VariantContextBuilder(variantVC).source("VCwithGLs").genotypes(genotypes).make(); - } - - private static GenotypesContext getGenotypesWithGLs(GenotypesContext genotypes) { - GenotypesContext genotypesWithGLs = GenotypesContext.create(genotypes.size()); - for ( final Genotype g : genotypes ) { - if ( g.hasLikelihoods() && g.getLikelihoods().getAsVector() != null ) - genotypesWithGLs.add(g); - } - return genotypesWithGLs; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index af05c0dc4..684b9102a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -120,6 +120,10 @@ public class CombineVariants extends RodWalker { @Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false) public VariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED; + @Hidden + @Argument(shortName="multipleAllelesMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different allele types (for example, SNP vs. indel)", required=false) + public VariantContextUtils.MultipleAllelesMergeType multipleAllelesMergeType = VariantContextUtils.MultipleAllelesMergeType.BY_TYPE; + /** * Used when taking the union of variants that contain genotypes. A complete priority list MUST be provided. */ @@ -236,13 +240,24 @@ public class CombineVariants extends RodWalker { return 0; List mergedVCs = new ArrayList(); - Map> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs); - // iterate over the types so that it's deterministic - for ( VariantContext.Type type : VariantContext.Type.values() ) { - if ( VCsByType.containsKey(type) ) - mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type), - priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, - SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + + if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.BY_TYPE) { + Map> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs); + // iterate over the types so that it's deterministic + for (VariantContext.Type type : VariantContext.Type.values()) { + if (VCsByType.containsKey(type)) + mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type), + priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + } + } + else if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) { + mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), vcs, + priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + } + else { + logger.warn("Ignoring all records at site " + ref.getLocus()); } for ( VariantContext mergedVC : mergedVCs ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 39045ea21..179c91660 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -29,6 +29,7 @@ import org.apache.commons.jexl2.Expression; import org.apache.commons.jexl2.JexlEngine; import org.apache.log4j.Logger; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; +import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -471,6 +472,18 @@ public class VariantContextUtils { KEEP_UNCONDITIONAL } + @Hidden + public enum MultipleAllelesMergeType { + /** + * Combine only alleles of the same type (SNP, indel, etc.) into a single VCF record. + */ + BY_TYPE, + /** + * Merge all allele types at the same start position into the same VCF record. + */ + MIX_TYPES + } + /** * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. * If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with