simpleMerge now preserves allele order

-- UnitTests for dangerous PL merging cases in the multi-allelic case.  The new behavior is correct
This commit is contained in:
Mark DePristo 2011-10-08 17:39:53 -07:00
parent e94e6ba101
commit c67f6c076b
2 changed files with 70 additions and 12 deletions

View File

@ -29,6 +29,7 @@ import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.util.StringUtil;
import org.apache.commons.jexl2.Expression;
import org.apache.commons.jexl2.JexlEngine;
import org.apache.log4j.Logger;
import org.broad.tribble.util.popgen.HardyWeinbergCalculation;
import org.broadinstitute.sting.gatk.walkers.phasing.ReadBackedPhasingWalker;
import org.broadinstitute.sting.utils.BaseUtils;
@ -44,6 +45,7 @@ import java.io.Serializable;
import java.util.*;
public class VariantContextUtils {
private static Logger logger = Logger.getLogger(VariantContextUtils.class);
public final static String MERGE_INTERSECTION = "Intersection";
public final static String MERGE_FILTER_IN_ALL = "FilteredInAll";
public final static String MERGE_REF_IN_ALL = "ReferenceInAll";
@ -511,7 +513,7 @@ public class VariantContextUtils {
final String name = first.getSource();
final Allele refAllele = determineReferenceAllele(VCs);
final Set<Allele> alleles = new TreeSet<Allele>();
final Set<Allele> alleles = new LinkedHashSet<Allele>();
final Set<String> filters = new TreeSet<String>();
final Map<String, Object> attributes = new TreeMap<String, Object>();
final Set<String> inconsistentAttributes = new HashSet<String>();
@ -604,11 +606,14 @@ public class VariantContextUtils {
}
}
// if we have more alternate alleles in the merged VC than in one or more of the original VCs, we need to strip out the GL/PLs (because they are no longer accurate)
// if we have more alternate alleles in the merged VC than in one or more of the
// original VCs, we need to strip out the GL/PLs (because they are no longer accurate)
for ( VariantContext vc : VCs ) {
if (vc.alleles.size() == 1)
continue;
if ( vc.alleles.size() != alleles.size()) {
if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) {
logger.warn(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s",
genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles));
genotypes = stripPLs(genotypes);
break;
}
@ -661,6 +666,24 @@ public class VariantContextUtils {
return merged;
}
private static final boolean hasPLIncompatibleAlleles(final Set<Allele> alleleSet1, final Set<Allele> alleleSet2) {
final Iterator<Allele> it1 = alleleSet1.iterator();
final Iterator<Allele> it2 = alleleSet2.iterator();
while ( it1.hasNext() && it2.hasNext() ) {
final Allele a1 = it1.next();
final Allele a2 = it2.next();
if ( ! a1.equals(a2) )
return true;
}
// by this point, at least one of the iterators is empty. All of the elements
// we've compared are equal up until this point. But it's possible that the
// sets aren't the same size, which is indicated by the test below. If they
// are of the same size, though, the sets are compatible
return it1.hasNext() || it2.hasNext();
}
public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) {
// if all alleles of vc1 are a contained in alleles of vc2, return true
if (!vc1.getReference().equals(vc2.getReference()))

View File

@ -66,8 +66,8 @@ public class VariantContextUtilsUnitTest extends BaseTest {
return new Genotype(sample, Arrays.asList(a1, a2));
}
private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError, double l1, double l2, double l3) {
return new Genotype(sample, Arrays.asList(a1, a2), log10pError, new double[]{l1,l2,l3});
private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError, double... pls) {
return new Genotype(sample, Arrays.asList(a1, a2), log10pError, pls);
}
@ -144,15 +144,18 @@ public class VariantContextUtilsUnitTest extends BaseTest {
new MergeAllelesTest(Arrays.asList(Aref, T),
Arrays.asList(Aref, C),
Arrays.asList(Aref, C, T)); // sorted by allele
Arrays.asList(Aref, T, C)); // in order of appearence
new MergeAllelesTest(Arrays.asList(Aref, C, T),
Arrays.asList(Aref, C),
Arrays.asList(Aref, C, T));
new MergeAllelesTest(Arrays.asList(Aref, C, T), Arrays.asList(Aref, C, T));
new MergeAllelesTest(Arrays.asList(Aref, T, C), Arrays.asList(Aref, T, C));
new MergeAllelesTest(Arrays.asList(Aref, T, C),
Arrays.asList(Aref, C),
Arrays.asList(Aref, C, T)); // sorted by allele
Arrays.asList(Aref, T, C)); // in order of appearence
// The following is actually a pathological case - there's no way on a vcf to represent a null allele that's non-variant.
// The code converts this (correctly) to a single-base non-variant vc with whatever base was there as a reference.
@ -167,10 +170,12 @@ public class VariantContextUtilsUnitTest extends BaseTest {
Arrays.asList(delRef, ATC, ATCATC),
Arrays.asList(delRef, ATC, ATCATC));
// alleles in the order we see them
new MergeAllelesTest(Arrays.asList(delRef, ATCATC),
Arrays.asList(delRef, ATC, ATCATC),
Arrays.asList(delRef, ATC, ATCATC));
Arrays.asList(delRef, ATCATC, ATC));
// same
new MergeAllelesTest(Arrays.asList(delRef, ATC),
Arrays.asList(delRef, ATCATC),
Arrays.asList(delRef, ATC, ATCATC));
@ -446,7 +451,31 @@ public class VariantContextUtilsUnitTest extends BaseTest {
makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2), makeG("s3", Aref, T, 3)),
makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2), makeG("s3", Aref, T, 3)));
//
// merging genothpes with PLs
//
// first, do no harm
new MergeGenotypesTest("OrderedPLs", "1",
makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1, 1, 2, 3)),
makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1, 1, 2, 3)));
// first, do no harm
new MergeGenotypesTest("OrderedPLs-3Alleles", "1",
makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6)),
makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6)));
// first, do no harm
new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1",
makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6)),
makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6)));
// first, do no harm
new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1",
makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6)),
makeVC("1", Arrays.asList(Aref, T, C), makeG("s2", Aref, C, 1, 1, 2, 3, 4, 5, 6)),
makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6), makeG("s2", Aref, C, 1, 1, 2, 3, 4, 5, 6)));
new MergeGenotypesTest("TakeGenotypePartialOverlapWithPLs-2,1", "2,1",
makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1,5,0,3)),
makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2,4,0,2), makeG("s3", Aref, T, 3,3,0,2)),
@ -458,6 +487,12 @@ public class VariantContextUtilsUnitTest extends BaseTest {
// no likelihoods on result since type changes to mixed multiallelic
makeVC("3", Arrays.asList(Aref, ATC, T), makeG("s1", Aref, ATC, 1), makeG("s3", Aref, T, 3)));
new MergeGenotypesTest("MultipleSamplePLsDifferentOrder", "1,2",
makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, 1, 1, 2, 3, 4, 5, 6)),
makeVC("2", Arrays.asList(Aref, T, C), makeG("s2", Aref, T, 2, 6, 5, 4, 3, 2, 1)),
// no likelihoods on result since type changes to mixed multiallelic
makeVC("3", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, 1), makeG("s2", Aref, T, 2)));
return MergeGenotypesTest.getTests(MergeGenotypesTest.class);
}
@ -493,11 +528,11 @@ public class VariantContextUtilsUnitTest extends BaseTest {
Genotype value = entry.getValue();
Genotype expectedValue = expected.get(key);
Assert.assertEquals(value.alleles, expectedValue.alleles);
Assert.assertEquals(value.getNegLog10PError(), expectedValue.getNegLog10PError());
Assert.assertEquals(value.hasLikelihoods(), expectedValue.hasLikelihoods());
Assert.assertEquals(value.alleles, expectedValue.alleles, "Alleles in Genotype aren't equal");
Assert.assertEquals(value.getNegLog10PError(), expectedValue.getNegLog10PError(), "GQ values aren't equal");
Assert.assertEquals(value.hasLikelihoods(), expectedValue.hasLikelihoods(), "Either both have likelihoods or both not");
if ( value.hasLikelihoods() )
Assert.assertEquals(value.getLikelihoods().getAsVector(), expectedValue.getLikelihoods().getAsVector());
Assert.assertEquals(value.getLikelihoods().getAsVector(), expectedValue.getLikelihoods().getAsVector(), "Genotype likelihoods aren't equal");
}
}