From 687dee22acc9877f6c6222e64e2c399eb52ebbf6 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Fri, 10 Mar 2017 10:44:42 -0500 Subject: [PATCH] Fix CombineVariants merge for different types --- .../CombineVariantsIntegrationTest.java | 23 +++++++++++++++---- .../variant/GATKVariantContextUtils.java | 21 +++++++++++++---- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariantsIntegrationTest.java index eddf113ac..acdee6644 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -144,18 +144,18 @@ public class CombineVariantsIntegrationTest extends WalkerTest { @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "9bdda937754e1407183406808f560723"); } // official project VCF files in tabix format @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "6344953a82a422115bd647ec1d696b94"); } // official project VCF files in tabix format - @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "51cf4543e46c8434e32c426f59507d1e"); } + @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "9a62e340fe17c8659786f56cc992975a"); } @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "f9d1d7e6246f0ce9e493357d5b320323"); } @Test public void uniqueSNPs() { // parallelism must be disabled because the input VCF is malformed (DB=0) and parallelism actually fixes this which breaks the md5s //both of these files have the YRI trio and merging of duplicate samples without priority must be specified with UNSORTED merge type - combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", " -genotypeMergeOptions UNSORTED", "5aece78046bfb7d6ee8dc4d551542e3a", true); + combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", " -genotypeMergeOptions UNSORTED", "698f591bdc8dca1c9a3473ebc447c787", true); } - @Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "0897efcc0046bd94760315838d4d0fa5"); } - @Test public void omniHM3Intersect() { combineSites(" -filteredRecordsMergeType KEEP_IF_ALL_UNFILTERED", "8b12b09a6ec4e3fde2352bbf82637f1e"); } + @Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "7750b77316b8f02df5c9a60cc64c156d"); } + @Test public void omniHM3Intersect() { combineSites(" -filteredRecordsMergeType KEEP_IF_ALL_UNFILTERED", "3af8a310fca5a4ace3172528d0bc1d38"); } @Test public void threeWayWithRefs() { WalkerTestSpec spec = new WalkerTestSpec( @@ -169,7 +169,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest { " -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" + " -genotypeMergeOptions UNIQUIFY -L 1"), 1, - Arrays.asList("8b75e835ed19c06c358a2185cd0e14db")); + Arrays.asList("085258a3c507e30860ec87b6b65f5f11")); cvExecuteTest("threeWayWithRefs", spec, true); } @@ -251,4 +251,17 @@ public class CombineVariantsIntegrationTest extends WalkerTest { Arrays.asList("")); executeTest("combineSpanningDels: ", spec); } + + @Test + public void combineDifferentTypes() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineVariants --no_cmdline_in_header -o %s " + + " -R " + b37KGReference + + " -V " + privateTestDir + "same.variant.indel.vcf" + + " -V " + privateTestDir + "same.variant.mixed.vcf" + + " -multipleAllelesMergeType MIX_TYPES", + 1, + Arrays.asList("629bb763e9af8edbec0a77cc91eea617")); + executeTest("combineDifferentTypes: ", spec); + } } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java index ce40acebe..8df0185c8 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java @@ -99,7 +99,7 @@ public class GATKVariantContextUtils { return Collections.nCopies(ploidy,allele); } - private static boolean hasPLIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { + private static boolean hasIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { final Iterator it1 = alleleSet1.iterator(); final Iterator it2 = alleleSet2.iterator(); @@ -1135,8 +1135,6 @@ public class GATKVariantContextUtils { * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use * SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge. * - * For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/ - * * @param unsortedVCs collection of unsorted VCs * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs * @param filteredRecordMergeType merge type for filtered records @@ -1190,6 +1188,7 @@ public class GATKVariantContextUtils { final Set inconsistentAttributes = new HashSet<>(); final Set variantSources = new HashSet<>(); // contains the set of sources we found in our set of VCs that are variant final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id + final Map nonBooleanAttributeOccurrences = new HashMap<>(); VariantContext longestVC = first; int depth = 0; @@ -1262,6 +1261,13 @@ public class GATKVariantContextUtils { for (final Map.Entry p : vc.getAttributes().entrySet()) { final String key = p.getKey(); final Object value = p.getValue(); + if ( !(value instanceof Boolean) ) { + if ( nonBooleanAttributeOccurrences.containsKey(key) ) { + nonBooleanAttributeOccurrences.put(key, nonBooleanAttributeOccurrences.get(key) + 1); + } else { + nonBooleanAttributeOccurrences.put(key, 1); + } + } // only output annotations that have the same value in every input VC // if we don't like the key already, don't go anywhere if ( ! inconsistentAttributes.contains(key) ) { @@ -1280,12 +1286,15 @@ public class GATKVariantContextUtils { } } + // if the non-boolean attribute is not in all of the VCs, remove it. + nonBooleanAttributeOccurrences.entrySet().stream().filter(a -> a.getValue() < VCs.size()).map(a -> a.getKey()).forEach(attributes::remove); + // if we have more alternate alleles in the merged VC than in one or more of the // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD for ( final VariantContext vc : VCs ) { if (vc.getAlleles().size() == 1) continue; - if ( hasPLIncompatibleAlleles(alleles, vc.getAlleles())) { + if ( hasIncompatibleAlleles(alleles, vc.getAlleles())) { if ( ! genotypes.isEmpty() ) { logger.debug(String.format("Stripping PLs at %s:%d-%d due to incompatible alleles merged=%s vs. single=%s", vc.getChr(), vc.getStart(), vc.getEnd(), alleles, vc.getAlleles())); @@ -1348,6 +1357,10 @@ public class GATKVariantContextUtils { // Trim the padded bases of all alleles if necessary final VariantContext merged = builder.make(); + + // Recalculate chromosome count annotations or remove them if no genotypes + VariantContextUtils.calculateChromosomeCounts(merged, attributes, true); + if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged); return merged; }