Combine Variants was actually outputting invalid VCFs in cases where it was combining Variant Contexts with different alternate alleles: if any of the genotypes had PLs they were no longer valid/correct. Added a check for such cases (the combined VC has more alleles than an original VC) and strip out the PLs when triggered; added integration test to cover it. I also added the check to Select Variants, although it currently doesn't remove unused alleles so it should never trigger. Is there any reason not to strip out unused alleles after a select?

This commit is contained in:
Eric Banks 2011-08-08 16:25:35 -04:00
parent 197169e47b
commit d7813db217
5 changed files with 51 additions and 0 deletions

View File

@ -544,6 +544,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
VariantContext sub = vc.subContextFromGenotypes(genotypes, vc.getAlleles());
// if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs (because they are no longer accurate)
if ( vc.getAlleles().size() != sub.getAlleles().size() )
sub = VariantContext.modifyGenotypes(sub, VariantContextUtils.stripPLs(vc.getGenotypes()));
HashMap<String, Object> attributes = new HashMap<String, Object>(sub.getAttributes());
int depth = 0;

View File

@ -57,6 +57,13 @@ public class Genotype {
return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attributes, g.isPhased());
}
public static Genotype removePLs(Genotype g) {
Map<String, Object> attrs = new HashMap<String, Object>(g.getAttributes());
attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY);
attrs.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY);
return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attrs, g.isPhased());
}
public static Genotype modifyAlleles(Genotype g, List<Allele> alleles) {
return new Genotype(g.getSampleName(), alleles, g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, g.getAttributes(), g.isPhased());
}

View File

@ -588,6 +588,14 @@ public class VariantContextUtils {
}
}
// if we have more alternate alleles in the merged VC than in one or more of the original VCs, we need to strip out the GL/PLs (because they are no longer accurate)
for ( VariantContext vc : VCs ) {
if ( vc.alleles.size() != alleles.size() ) {
genotypes = stripPLs(genotypes);
break;
}
}
// take the VC with the maxAC and pull the attributes into a modifiable map
if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) {
attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes());
@ -633,6 +641,16 @@ public class VariantContextUtils {
return merged;
}
public static Map<String, Genotype> stripPLs(Map<String, Genotype> genotypes) {
Map<String, Genotype> newGs = new HashMap<String, Genotype>(genotypes.size());
for ( Map.Entry<String, Genotype> g : genotypes.entrySet() ) {
newGs.put(g.getKey(), g.getValue().hasLikelihoods() ? Genotype.removePLs(g.getValue()) : g.getValue());
}
return newGs;
}
public static Map<VariantContext.Type, List<VariantContext>> separateVariantContextsByType(Collection<VariantContext> VCs) {
HashMap<VariantContext.Type, List<VariantContext>> mappedVCs = new HashMap<VariantContext.Type, List<VariantContext>>();
for ( VariantContext vc : VCs ) {

View File

@ -70,6 +70,14 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
executeTest("combineSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec);
}
public void combinePLs(String file1, String file2, String md5) {
WalkerTestSpec spec = new WalkerTestSpec(
"-T CombineVariants -NO_HEADER -o %s -R " + b36KGReference + " -priority v1,v2 -B:v1,VCF " + validationDataLocation + file1 + " -B:v2,VCF " + validationDataLocation + file2,
1,
Arrays.asList(md5));
executeTest("combine PLs 1:" + new File(file1).getName() + " 2:" + new File(file2).getName(), spec);
}
@Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "c608b9fc1e36dba6cebb4f259883f9f0", true); }
@Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "20caad94411d6ab48153b214de916df8", " -setKey foo", true); }
@Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "004f3065cb1bc2ce2f9afd695caf0b48", " -setKey null", true); }
@ -78,6 +86,8 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
@Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "7593be578d4274d672fc22fced38012b", false); }
@Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "1cd467863c4e948fadd970681552d57e", false); }
@Test public void combineWithPLs() { combinePLs("combine.3.vcf", "combine.4.vcf", "0f873fed02aa99db5b140bcd6282c10a"); }
@Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "1d5a021387a8a86554db45a29f66140f", false); } // official project VCF files in tabix format
@Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "20163d60f18a46496f6da744ab5cc0f9", false); } // official project VCF files in tabix format
@Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "f1cf095c2fe9641b7ca1f8ee2c46fd4a", false); }

View File

@ -63,4 +63,16 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
executeTest("testConcordance--" + testFile, spec);
}
@Test(enabled=false)
public void testRemovePLs() {
String testFile = validationDataLocation + "combine.3.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -sn NA12892 -B:variant,VCF " + testFile + " -o %s -NO_HEADER",
1,
Arrays.asList("")
);
executeTest("testWithPLs--" + testFile, spec);
}
}