diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index ae648d547..437693a46 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -29,6 +29,7 @@ import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.samtools.util.StringUtil; import org.apache.commons.jexl2.Expression; import org.apache.commons.jexl2.JexlEngine; +import org.apache.log4j.Logger; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broadinstitute.sting.gatk.walkers.phasing.ReadBackedPhasingWalker; import org.broadinstitute.sting.utils.BaseUtils; @@ -44,6 +45,7 @@ import java.io.Serializable; import java.util.*; public class VariantContextUtils { + private static Logger logger = Logger.getLogger(VariantContextUtils.class); public final static String MERGE_INTERSECTION = "Intersection"; public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; @@ -511,7 +513,7 @@ public class VariantContextUtils { final String name = first.getSource(); final Allele refAllele = determineReferenceAllele(VCs); - final Set alleles = new TreeSet(); + final Set alleles = new LinkedHashSet(); final Set filters = new TreeSet(); final Map attributes = new TreeMap(); final Set inconsistentAttributes = new HashSet(); @@ -604,11 +606,14 @@ public class VariantContextUtils { } } - // if we have more alternate alleles in the merged VC than in one or more of the original VCs, we need to strip out the GL/PLs (because they are no longer accurate) + // if we have more alternate alleles in the merged VC than in one or more of the + // original VCs, we need to strip out the GL/PLs (because they are no longer accurate) for ( VariantContext vc : VCs ) { if (vc.alleles.size() == 1) continue; - if ( vc.alleles.size() != alleles.size()) { + if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) { + logger.warn(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s", + genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles)); genotypes = stripPLs(genotypes); break; } @@ -661,6 +666,24 @@ public class VariantContextUtils { return merged; } + private static final boolean hasPLIncompatibleAlleles(final Set alleleSet1, final Set alleleSet2) { + final Iterator it1 = alleleSet1.iterator(); + final Iterator it2 = alleleSet2.iterator(); + + while ( it1.hasNext() && it2.hasNext() ) { + final Allele a1 = it1.next(); + final Allele a2 = it2.next(); + if ( ! a1.equals(a2) ) + return true; + } + + // by this point, at least one of the iterators is empty. All of the elements + // we've compared are equal up until this point. But it's possible that the + // sets aren't the same size, which is indicated by the test below. If they + // are of the same size, though, the sets are compatible + return it1.hasNext() || it2.hasNext(); + } + public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) { // if all alleles of vc1 are a contained in alleles of vc2, return true if (!vc1.getReference().equals(vc2.getReference())) diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java index 08db7bcde..c8cbbfd8e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java @@ -66,8 +66,8 @@ public class VariantContextUtilsUnitTest extends BaseTest { return new Genotype(sample, Arrays.asList(a1, a2)); } - private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError, double l1, double l2, double l3) { - return new Genotype(sample, Arrays.asList(a1, a2), log10pError, new double[]{l1,l2,l3}); + private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError, double... pls) { + return new Genotype(sample, Arrays.asList(a1, a2), log10pError, pls); } @@ -144,15 +144,18 @@ public class VariantContextUtilsUnitTest extends BaseTest { new MergeAllelesTest(Arrays.asList(Aref, T), Arrays.asList(Aref, C), - Arrays.asList(Aref, C, T)); // sorted by allele + Arrays.asList(Aref, T, C)); // in order of appearence new MergeAllelesTest(Arrays.asList(Aref, C, T), Arrays.asList(Aref, C), Arrays.asList(Aref, C, T)); + new MergeAllelesTest(Arrays.asList(Aref, C, T), Arrays.asList(Aref, C, T)); + new MergeAllelesTest(Arrays.asList(Aref, T, C), Arrays.asList(Aref, T, C)); + new MergeAllelesTest(Arrays.asList(Aref, T, C), Arrays.asList(Aref, C), - Arrays.asList(Aref, C, T)); // sorted by allele + Arrays.asList(Aref, T, C)); // in order of appearence // The following is actually a pathological case - there's no way on a vcf to represent a null allele that's non-variant. // The code converts this (correctly) to a single-base non-variant vc with whatever base was there as a reference. @@ -167,10 +170,12 @@ public class VariantContextUtilsUnitTest extends BaseTest { Arrays.asList(delRef, ATC, ATCATC), Arrays.asList(delRef, ATC, ATCATC)); + // alleles in the order we see them new MergeAllelesTest(Arrays.asList(delRef, ATCATC), Arrays.asList(delRef, ATC, ATCATC), - Arrays.asList(delRef, ATC, ATCATC)); + Arrays.asList(delRef, ATCATC, ATC)); + // same new MergeAllelesTest(Arrays.asList(delRef, ATC), Arrays.asList(delRef, ATCATC), Arrays.asList(delRef, ATC, ATCATC)); @@ -446,7 +451,31 @@ public class VariantContextUtilsUnitTest extends BaseTest { makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2), makeG("s3", Aref, T, 3)), makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2), makeG("s3", Aref, T, 3))); + // // merging genothpes with PLs + // + + // first, do no harm + new MergeGenotypesTest("OrderedPLs", "1", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1, 1, 2, 3)), + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1, 1, 2, 3))); + + // first, do no harm + new MergeGenotypesTest("OrderedPLs-3Alleles", "1", + makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6))); + + // first, do no harm + new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1", + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6))); + + // first, do no harm + new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1", + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, T, C), makeG("s2", Aref, C, 1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6), makeG("s2", Aref, C, 1, 1, 2, 3, 4, 5, 6))); + new MergeGenotypesTest("TakeGenotypePartialOverlapWithPLs-2,1", "2,1", makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1,5,0,3)), makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2,4,0,2), makeG("s3", Aref, T, 3,3,0,2)), @@ -458,6 +487,12 @@ public class VariantContextUtilsUnitTest extends BaseTest { // no likelihoods on result since type changes to mixed multiallelic makeVC("3", Arrays.asList(Aref, ATC, T), makeG("s1", Aref, ATC, 1), makeG("s3", Aref, T, 3))); + new MergeGenotypesTest("MultipleSamplePLsDifferentOrder", "1,2", + makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, 1, 1, 2, 3, 4, 5, 6)), + makeVC("2", Arrays.asList(Aref, T, C), makeG("s2", Aref, T, 2, 6, 5, 4, 3, 2, 1)), + // no likelihoods on result since type changes to mixed multiallelic + makeVC("3", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, 1), makeG("s2", Aref, T, 2))); + return MergeGenotypesTest.getTests(MergeGenotypesTest.class); } @@ -493,11 +528,11 @@ public class VariantContextUtilsUnitTest extends BaseTest { Genotype value = entry.getValue(); Genotype expectedValue = expected.get(key); - Assert.assertEquals(value.alleles, expectedValue.alleles); - Assert.assertEquals(value.getNegLog10PError(), expectedValue.getNegLog10PError()); - Assert.assertEquals(value.hasLikelihoods(), expectedValue.hasLikelihoods()); + Assert.assertEquals(value.alleles, expectedValue.alleles, "Alleles in Genotype aren't equal"); + Assert.assertEquals(value.getNegLog10PError(), expectedValue.getNegLog10PError(), "GQ values aren't equal"); + Assert.assertEquals(value.hasLikelihoods(), expectedValue.hasLikelihoods(), "Either both have likelihoods or both not"); if ( value.hasLikelihoods() ) - Assert.assertEquals(value.getLikelihoods().getAsVector(), expectedValue.getLikelihoods().getAsVector()); + Assert.assertEquals(value.getLikelihoods().getAsVector(), expectedValue.getLikelihoods().getAsVector(), "Genotype likelihoods aren't equal"); } }