Merge pull request #1565 from broadinstitute/rhl_fix_cv_merge_1547

Fix CombineVariants merge for different types
This commit is contained in:
Ron Levine 2017-03-14 16:43:33 -04:00 committed by GitHub
commit c449b133d0
2 changed files with 35 additions and 9 deletions

View File

@ -144,18 +144,18 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
@Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "9bdda937754e1407183406808f560723"); } // official project VCF files in tabix format
@Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "6344953a82a422115bd647ec1d696b94"); } // official project VCF files in tabix format
@Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "51cf4543e46c8434e32c426f59507d1e"); }
@Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "9a62e340fe17c8659786f56cc992975a"); }
@Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "f9d1d7e6246f0ce9e493357d5b320323"); }
@Test public void uniqueSNPs() {
// parallelism must be disabled because the input VCF is malformed (DB=0) and parallelism actually fixes this which breaks the md5s
//both of these files have the YRI trio and merging of duplicate samples without priority must be specified with UNSORTED merge type
combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", " -genotypeMergeOptions UNSORTED", "5aece78046bfb7d6ee8dc4d551542e3a", true);
combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", " -genotypeMergeOptions UNSORTED", "698f591bdc8dca1c9a3473ebc447c787", true);
}
@Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "0897efcc0046bd94760315838d4d0fa5"); }
@Test public void omniHM3Intersect() { combineSites(" -filteredRecordsMergeType KEEP_IF_ALL_UNFILTERED", "8b12b09a6ec4e3fde2352bbf82637f1e"); }
@Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "7750b77316b8f02df5c9a60cc64c156d"); }
@Test public void omniHM3Intersect() { combineSites(" -filteredRecordsMergeType KEEP_IF_ALL_UNFILTERED", "3af8a310fca5a4ace3172528d0bc1d38"); }
@Test public void threeWayWithRefs() {
WalkerTestSpec spec = new WalkerTestSpec(
@ -169,7 +169,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
" -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" +
" -genotypeMergeOptions UNIQUIFY -L 1"),
1,
Arrays.asList("8b75e835ed19c06c358a2185cd0e14db"));
Arrays.asList("085258a3c507e30860ec87b6b65f5f11"));
cvExecuteTest("threeWayWithRefs", spec, true);
}
@ -251,4 +251,17 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
Arrays.asList(""));
executeTest("combineSpanningDels: ", spec);
}
@Test
public void combineDifferentTypes() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T CombineVariants --no_cmdline_in_header -o %s "
+ " -R " + b37KGReference
+ " -V " + privateTestDir + "same.variant.indel.vcf"
+ " -V " + privateTestDir + "same.variant.mixed.vcf"
+ " -multipleAllelesMergeType MIX_TYPES",
1,
Arrays.asList("629bb763e9af8edbec0a77cc91eea617"));
executeTest("combineDifferentTypes: ", spec);
}
}

View File

@ -99,7 +99,7 @@ public class GATKVariantContextUtils {
return Collections.nCopies(ploidy,allele);
}
private static boolean hasPLIncompatibleAlleles(final Collection<Allele> alleleSet1, final Collection<Allele> alleleSet2) {
private static boolean hasIncompatibleAlleles(final Collection<Allele> alleleSet1, final Collection<Allele> alleleSet2) {
final Iterator<Allele> it1 = alleleSet1.iterator();
final Iterator<Allele> it2 = alleleSet2.iterator();
@ -1135,8 +1135,6 @@ public class GATKVariantContextUtils {
* simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use
* SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge.
*
* For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/
*
* @param unsortedVCs collection of unsorted VCs
* @param priorityListOfVCs priority list detailing the order in which we should grab the VCs
* @param filteredRecordMergeType merge type for filtered records
@ -1190,6 +1188,7 @@ public class GATKVariantContextUtils {
final Set<String> inconsistentAttributes = new HashSet<>();
final Set<String> variantSources = new HashSet<>(); // contains the set of sources we found in our set of VCs that are variant
final Set<String> rsIDs = new LinkedHashSet<>(1); // most of the time there's one id
final Map<String, Integer> nonBooleanAttributeOccurrences = new HashMap<>();
VariantContext longestVC = first;
int depth = 0;
@ -1262,6 +1261,13 @@ public class GATKVariantContextUtils {
for (final Map.Entry<String, Object> p : vc.getAttributes().entrySet()) {
final String key = p.getKey();
final Object value = p.getValue();
if ( !(value instanceof Boolean) ) {
if ( nonBooleanAttributeOccurrences.containsKey(key) ) {
nonBooleanAttributeOccurrences.put(key, nonBooleanAttributeOccurrences.get(key) + 1);
} else {
nonBooleanAttributeOccurrences.put(key, 1);
}
}
// only output annotations that have the same value in every input VC
// if we don't like the key already, don't go anywhere
if ( ! inconsistentAttributes.contains(key) ) {
@ -1280,12 +1286,15 @@ public class GATKVariantContextUtils {
}
}
// if the non-boolean attribute is not in all of the VCs, remove it.
nonBooleanAttributeOccurrences.entrySet().stream().filter(a -> a.getValue() < VCs.size()).map(a -> a.getKey()).forEach(attributes::remove);
// if we have more alternate alleles in the merged VC than in one or more of the
// original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD
for ( final VariantContext vc : VCs ) {
if (vc.getAlleles().size() == 1)
continue;
if ( hasPLIncompatibleAlleles(alleles, vc.getAlleles())) {
if ( hasIncompatibleAlleles(alleles, vc.getAlleles())) {
if ( ! genotypes.isEmpty() ) {
logger.debug(String.format("Stripping PLs at %s:%d-%d due to incompatible alleles merged=%s vs. single=%s",
vc.getChr(), vc.getStart(), vc.getEnd(), alleles, vc.getAlleles()));
@ -1348,6 +1357,10 @@ public class GATKVariantContextUtils {
// Trim the padded bases of all alleles if necessary
final VariantContext merged = builder.make();
// Recalculate chromosome count annotations or remove them if no genotypes
VariantContextUtils.calculateChromosomeCounts(merged, attributes, true);
if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged);
return merged;
}