simpleMerge now preserves allele order
-- UnitTests for dangerous PL merging cases in the multi-allelic case. The new behavior is correct
This commit is contained in:
parent
e94e6ba101
commit
c67f6c076b
|
|
@ -29,6 +29,7 @@ import net.sf.picard.reference.ReferenceSequenceFile;
|
|||
import net.sf.samtools.util.StringUtil;
|
||||
import org.apache.commons.jexl2.Expression;
|
||||
import org.apache.commons.jexl2.JexlEngine;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.popgen.HardyWeinbergCalculation;
|
||||
import org.broadinstitute.sting.gatk.walkers.phasing.ReadBackedPhasingWalker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
|
@ -44,6 +45,7 @@ import java.io.Serializable;
|
|||
import java.util.*;
|
||||
|
||||
public class VariantContextUtils {
|
||||
private static Logger logger = Logger.getLogger(VariantContextUtils.class);
|
||||
public final static String MERGE_INTERSECTION = "Intersection";
|
||||
public final static String MERGE_FILTER_IN_ALL = "FilteredInAll";
|
||||
public final static String MERGE_REF_IN_ALL = "ReferenceInAll";
|
||||
|
|
@ -511,7 +513,7 @@ public class VariantContextUtils {
|
|||
final String name = first.getSource();
|
||||
final Allele refAllele = determineReferenceAllele(VCs);
|
||||
|
||||
final Set<Allele> alleles = new TreeSet<Allele>();
|
||||
final Set<Allele> alleles = new LinkedHashSet<Allele>();
|
||||
final Set<String> filters = new TreeSet<String>();
|
||||
final Map<String, Object> attributes = new TreeMap<String, Object>();
|
||||
final Set<String> inconsistentAttributes = new HashSet<String>();
|
||||
|
|
@ -604,11 +606,14 @@ public class VariantContextUtils {
|
|||
}
|
||||
}
|
||||
|
||||
// if we have more alternate alleles in the merged VC than in one or more of the original VCs, we need to strip out the GL/PLs (because they are no longer accurate)
|
||||
// if we have more alternate alleles in the merged VC than in one or more of the
|
||||
// original VCs, we need to strip out the GL/PLs (because they are no longer accurate)
|
||||
for ( VariantContext vc : VCs ) {
|
||||
if (vc.alleles.size() == 1)
|
||||
continue;
|
||||
if ( vc.alleles.size() != alleles.size()) {
|
||||
if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) {
|
||||
logger.warn(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s",
|
||||
genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles));
|
||||
genotypes = stripPLs(genotypes);
|
||||
break;
|
||||
}
|
||||
|
|
@ -661,6 +666,24 @@ public class VariantContextUtils {
|
|||
return merged;
|
||||
}
|
||||
|
||||
private static final boolean hasPLIncompatibleAlleles(final Set<Allele> alleleSet1, final Set<Allele> alleleSet2) {
|
||||
final Iterator<Allele> it1 = alleleSet1.iterator();
|
||||
final Iterator<Allele> it2 = alleleSet2.iterator();
|
||||
|
||||
while ( it1.hasNext() && it2.hasNext() ) {
|
||||
final Allele a1 = it1.next();
|
||||
final Allele a2 = it2.next();
|
||||
if ( ! a1.equals(a2) )
|
||||
return true;
|
||||
}
|
||||
|
||||
// by this point, at least one of the iterators is empty. All of the elements
|
||||
// we've compared are equal up until this point. But it's possible that the
|
||||
// sets aren't the same size, which is indicated by the test below. If they
|
||||
// are of the same size, though, the sets are compatible
|
||||
return it1.hasNext() || it2.hasNext();
|
||||
}
|
||||
|
||||
public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) {
|
||||
// if all alleles of vc1 are a contained in alleles of vc2, return true
|
||||
if (!vc1.getReference().equals(vc2.getReference()))
|
||||
|
|
|
|||
|
|
@ -66,8 +66,8 @@ public class VariantContextUtilsUnitTest extends BaseTest {
|
|||
return new Genotype(sample, Arrays.asList(a1, a2));
|
||||
}
|
||||
|
||||
private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError, double l1, double l2, double l3) {
|
||||
return new Genotype(sample, Arrays.asList(a1, a2), log10pError, new double[]{l1,l2,l3});
|
||||
private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError, double... pls) {
|
||||
return new Genotype(sample, Arrays.asList(a1, a2), log10pError, pls);
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -144,15 +144,18 @@ public class VariantContextUtilsUnitTest extends BaseTest {
|
|||
|
||||
new MergeAllelesTest(Arrays.asList(Aref, T),
|
||||
Arrays.asList(Aref, C),
|
||||
Arrays.asList(Aref, C, T)); // sorted by allele
|
||||
Arrays.asList(Aref, T, C)); // in order of appearence
|
||||
|
||||
new MergeAllelesTest(Arrays.asList(Aref, C, T),
|
||||
Arrays.asList(Aref, C),
|
||||
Arrays.asList(Aref, C, T));
|
||||
|
||||
new MergeAllelesTest(Arrays.asList(Aref, C, T), Arrays.asList(Aref, C, T));
|
||||
new MergeAllelesTest(Arrays.asList(Aref, T, C), Arrays.asList(Aref, T, C));
|
||||
|
||||
new MergeAllelesTest(Arrays.asList(Aref, T, C),
|
||||
Arrays.asList(Aref, C),
|
||||
Arrays.asList(Aref, C, T)); // sorted by allele
|
||||
Arrays.asList(Aref, T, C)); // in order of appearence
|
||||
|
||||
// The following is actually a pathological case - there's no way on a vcf to represent a null allele that's non-variant.
|
||||
// The code converts this (correctly) to a single-base non-variant vc with whatever base was there as a reference.
|
||||
|
|
@ -167,10 +170,12 @@ public class VariantContextUtilsUnitTest extends BaseTest {
|
|||
Arrays.asList(delRef, ATC, ATCATC),
|
||||
Arrays.asList(delRef, ATC, ATCATC));
|
||||
|
||||
// alleles in the order we see them
|
||||
new MergeAllelesTest(Arrays.asList(delRef, ATCATC),
|
||||
Arrays.asList(delRef, ATC, ATCATC),
|
||||
Arrays.asList(delRef, ATC, ATCATC));
|
||||
Arrays.asList(delRef, ATCATC, ATC));
|
||||
|
||||
// same
|
||||
new MergeAllelesTest(Arrays.asList(delRef, ATC),
|
||||
Arrays.asList(delRef, ATCATC),
|
||||
Arrays.asList(delRef, ATC, ATCATC));
|
||||
|
|
@ -446,7 +451,31 @@ public class VariantContextUtilsUnitTest extends BaseTest {
|
|||
makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2), makeG("s3", Aref, T, 3)),
|
||||
makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2), makeG("s3", Aref, T, 3)));
|
||||
|
||||
//
|
||||
// merging genothpes with PLs
|
||||
//
|
||||
|
||||
// first, do no harm
|
||||
new MergeGenotypesTest("OrderedPLs", "1",
|
||||
makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1, 1, 2, 3)),
|
||||
makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1, 1, 2, 3)));
|
||||
|
||||
// first, do no harm
|
||||
new MergeGenotypesTest("OrderedPLs-3Alleles", "1",
|
||||
makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6)),
|
||||
makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6)));
|
||||
|
||||
// first, do no harm
|
||||
new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1",
|
||||
makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6)),
|
||||
makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6)));
|
||||
|
||||
// first, do no harm
|
||||
new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1",
|
||||
makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6)),
|
||||
makeVC("1", Arrays.asList(Aref, T, C), makeG("s2", Aref, C, 1, 1, 2, 3, 4, 5, 6)),
|
||||
makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6), makeG("s2", Aref, C, 1, 1, 2, 3, 4, 5, 6)));
|
||||
|
||||
new MergeGenotypesTest("TakeGenotypePartialOverlapWithPLs-2,1", "2,1",
|
||||
makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1,5,0,3)),
|
||||
makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2,4,0,2), makeG("s3", Aref, T, 3,3,0,2)),
|
||||
|
|
@ -458,6 +487,12 @@ public class VariantContextUtilsUnitTest extends BaseTest {
|
|||
// no likelihoods on result since type changes to mixed multiallelic
|
||||
makeVC("3", Arrays.asList(Aref, ATC, T), makeG("s1", Aref, ATC, 1), makeG("s3", Aref, T, 3)));
|
||||
|
||||
new MergeGenotypesTest("MultipleSamplePLsDifferentOrder", "1,2",
|
||||
makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, 1, 1, 2, 3, 4, 5, 6)),
|
||||
makeVC("2", Arrays.asList(Aref, T, C), makeG("s2", Aref, T, 2, 6, 5, 4, 3, 2, 1)),
|
||||
// no likelihoods on result since type changes to mixed multiallelic
|
||||
makeVC("3", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, 1), makeG("s2", Aref, T, 2)));
|
||||
|
||||
return MergeGenotypesTest.getTests(MergeGenotypesTest.class);
|
||||
}
|
||||
|
||||
|
|
@ -493,11 +528,11 @@ public class VariantContextUtilsUnitTest extends BaseTest {
|
|||
Genotype value = entry.getValue();
|
||||
Genotype expectedValue = expected.get(key);
|
||||
|
||||
Assert.assertEquals(value.alleles, expectedValue.alleles);
|
||||
Assert.assertEquals(value.getNegLog10PError(), expectedValue.getNegLog10PError());
|
||||
Assert.assertEquals(value.hasLikelihoods(), expectedValue.hasLikelihoods());
|
||||
Assert.assertEquals(value.alleles, expectedValue.alleles, "Alleles in Genotype aren't equal");
|
||||
Assert.assertEquals(value.getNegLog10PError(), expectedValue.getNegLog10PError(), "GQ values aren't equal");
|
||||
Assert.assertEquals(value.hasLikelihoods(), expectedValue.hasLikelihoods(), "Either both have likelihoods or both not");
|
||||
if ( value.hasLikelihoods() )
|
||||
Assert.assertEquals(value.getLikelihoods().getAsVector(), expectedValue.getLikelihoods().getAsVector());
|
||||
Assert.assertEquals(value.getLikelihoods().getAsVector(), expectedValue.getLikelihoods().getAsVector(), "Genotype likelihoods aren't equal");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue