From 9121b9816741fd597321a587f3fa9d18c24e0ed4 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 19 Aug 2012 10:29:38 -0400 Subject: [PATCH 1/3] CombineVariants outputs the first non-MISSING qual, not the maximum -- When merging multiple VCF records at a site, the combined VCF record has the QUAL of the first VCF record with a non-MISSING QUAL value. The previous behavior was to take the max QUAL, which resulted in sometime strange downstream confusion. --- .../sting/gatk/walkers/variantutils/CombineVariants.java | 7 +++++++ .../sting/utils/variantcontext/VariantContextUtils.java | 6 ++++-- .../variantutils/CombineVariantsIntegrationTest.java | 6 +++--- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 8dabd49b8..555999bdb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -73,6 +73,13 @@ import java.util.*; * efficiency. However, since this merge runs in only one thread, you can quickly reach diminishing * returns with the number of parallel threads. -nt 4 works well but -nt 8 may be too much. * + * Some fine details about the merging algorithm: + * + * *

Input

*

* One or more variant sets to combine. diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index ff6b0be70..d7e4a7135 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -514,7 +514,7 @@ public class VariantContextUtils { int depth = 0; int maxAC = -1; final Map attributesWithMaxAC = new LinkedHashMap(); - double log10PError = 1; + double log10PError = CommonInfo.NO_LOG10_PERROR; VariantContext vcWithMaxAC = null; GenotypesContext genotypes = GenotypesContext.create(); @@ -542,7 +542,9 @@ public class VariantContextUtils { mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); - log10PError = Math.min(log10PError, vc.isVariant() ? vc.getLog10PError() : 1); + // We always take the QUAL of the first VC with a non-MISSING qual for the combined value + if ( log10PError == CommonInfo.NO_LOG10_PERROR ) + log10PError = vc.getLog10PError(); filters.addAll(vc.getFilters()); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 9ea751b72..c32d77f82 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -113,13 +113,13 @@ public class CombineVariantsIntegrationTest extends WalkerTest { @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "4efdf983918db822e4ac13d911509576"); } // official project VCF files in tabix format @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "848d4408ee953053d2307cefebc6bd6d"); } // official project VCF files in tabix format - @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "5d04f22ef88ed9226cbd7b4483c5cb23"); } + @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "629656bfef7713c23f3a593523503b2f"); } @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "e54d0dcf14f90d5c8e58b45191dd0219"); } @Test public void uniqueSNPs() { // parallelism must be disabled because the input VCF is malformed (DB=0) and parallelism actually fixes this which breaks the md5s - combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "acc70f33be741b564f7be9aa3f819dd4", true); + combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "e5ea6ac3905bd9eeea1a2ef5d2cb5af7", true); } @Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "def52bcd3942bbe39cd7ebe845c4f206"); } @@ -137,7 +137,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest { " -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" + " -genotypeMergeOptions UNIQUIFY -L 1"), 1, - Arrays.asList("3039cfff7abee6aa7fbbafec66a1b019")); + Arrays.asList("e5f0e7a80cd392172ebf5ddb06b91a00")); cvExecuteTest("threeWayWithRefs", spec, true); } From 7fa76f719b7f1c348d6f8d70bad3ae03bda42e09 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 19 Aug 2012 10:32:55 -0400 Subject: [PATCH 2/3] Print "Parsing data stream with BCF version BCFx.y" in BCF2 codec as .debug not .info --- .../org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index 60fcb6585..c221b8fba 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -149,7 +149,7 @@ public final class BCF2Codec implements FeatureCodec { if ( bcfVersion.getMinorVersion() < MIN_MINOR_VERSION ) error("BCF2Codec can only process BCF2 files with minor version >= " + MIN_MINOR_VERSION + " but this file has minor version " + bcfVersion.getMinorVersion()); - logger.info("Parsing data stream with BCF version " + bcfVersion); + logger.debug("Parsing data stream with BCF version " + bcfVersion); final int headerSizeInBytes = BCF2Type.INT32.read(inputStream); From 97b191f5787638223d858f6f6cdd6099d0166396 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 20 Aug 2012 01:16:23 -0400 Subject: [PATCH 3/3] Thanks to Guillermo I was able to isolate an instance of where the MLEAC > AN. It turns out that this is valid, e.g. when PLs are all 0s for a sample we no-call it but it's allowed to factor into the MLE (since that's the contract with the exact model). Removing the check in UG and instead protecting for it in the AlleleCount stratification. --- .../gatk/walkers/genotyper/UnifiedGenotyperEngine.java | 10 ++-------- .../varianteval/stratifications/AlleleCount.java | 5 +++-- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 67ade390f..3d9724ffb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -444,16 +444,10 @@ public class UnifiedGenotyperEngine { if ( alleleCountsofMLE.size() > 0 ) { attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE); final int AN = builder.make().getCalledChrCount(); - - // let's sanity check that we don't have an invalid MLE value in there - for ( int MLEAC : alleleCountsofMLE ) { - if ( MLEAC > AN ) - throw new ReviewedStingException(String.format("MLEAC value (%d) is larger than AN (%d) at position %s:%d", MLEAC, AN, loc.getContig(), loc.getStart())); - } - final ArrayList MLEfrequencies = new ArrayList(alleleCountsofMLE.size()); + // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT is ./. but the exact model may arbitrarily choose an AC>1) for ( int AC : alleleCountsofMLE ) - MLEfrequencies.add((double)AC / (double)AN); + MLEfrequencies.add(Math.min(1.0, (double)AC / (double)AN)); attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index 00a593768..2b1bd9c62 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -46,7 +46,8 @@ public class AlleleCount extends VariantStratifier { int AC = 0; // by default, the site is considered monomorphic if ( eval.hasAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY) && eval.isBiallelic() ) { - AC = eval.getAttributeAsInt(VCFConstants.MLE_ALLELE_COUNT_KEY, 0); + // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT is ./. but the exact model may arbitrarily choose an AC>1) + AC = Math.min(eval.getAttributeAsInt(VCFConstants.MLE_ALLELE_COUNT_KEY, 0), nchrom); } else if ( eval.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) && eval.isBiallelic() ) { AC = eval.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0); } else if ( eval.isVariant() ) { @@ -56,7 +57,7 @@ public class AlleleCount extends VariantStratifier { // make sure that the AC isn't invalid if ( AC > nchrom ) - throw new UserException.MalformedVCF(String.format("The AC or MLEAC value (%d) at position %s:%d " + + throw new UserException.MalformedVCF(String.format("The AC value (%d) at position %s:%d " + "is larger than the number of chromosomes over all samples (%d)", AC, eval.getChr(), eval.getStart(), nchrom));