Adding 80% support for multi-allelic variants

-- Multi-allelic variants are split into their bi-allelic version, trimmed, and we attempt to provide a meaningful genotype for NA12878 here.  It's not perfect and needs some discussion on how to handle het/alt variants
-- Adding splitInBiallelic funtion to VariantContextUtils as well as extensive unit tests that also indirectly test reverseTrimAlleles (which worked perfectly FYI)
This commit is contained in:
Mark DePristo 2012-11-21 17:23:41 -05:00
parent e14bfa9f5c
commit 48f271c5bd
3 changed files with 150 additions and 5 deletions

View File

@ -979,6 +979,40 @@ public class VariantContextUtils {
private static final List<Allele> NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
/**
* Split variant context into its biallelic components if there are more than 2 alleles
*
* For VC has A/B/C alleles, returns A/B and A/C contexts.
* Genotypes are all no-calls now (it's not possible to fix them easily)
* Alleles are right trimmed to satisfy VCF conventions
*
* If vc is biallelic or non-variant it is just returned
*
* Chromosome counts are updated (but they are by definition 0)
*
* @param vc a potentially multi-allelic variant context
* @return a list of bi-allelic (or monomorphic) variant context
*/
public static List<VariantContext> splitVariantContextToBiallelics(final VariantContext vc) {
if ( ! vc.isVariant() || vc.isBiallelic() )
// non variant or biallelics already satisfy the contract
return Collections.singletonList(vc);
else {
final List<VariantContext> biallelics = new LinkedList<VariantContext>();
for ( final Allele alt : vc.getAlternateAlleles() ) {
VariantContextBuilder builder = new VariantContextBuilder(vc);
final List<Allele> alleles = Arrays.asList(vc.getReference(), alt);
builder.alleles(alleles);
builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false));
calculateChromosomeCounts(builder, true);
biallelics.add(reverseTrimAlleles(builder.make()));
}
return biallelics;
}
}
/**
* subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately)
*

View File

@ -782,7 +782,7 @@ public class VariantContextTestProvider {
Assert.assertEquals(actual.getStart(), expected.getStart(), "start");
Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end");
Assert.assertEquals(actual.getID(), expected.getID(), "id");
Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles");
Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles for " + expected + " vs " + actual);
assertAttributesEquals(actual.getAttributes(), expected.getAttributes());
Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied");

View File

@ -26,7 +26,7 @@ package org.broadinstitute.sting.utils.variantcontext;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.testng.Assert;
@ -39,7 +39,7 @@ import java.io.FileNotFoundException;
import java.util.*;
public class VariantContextUtilsUnitTest extends BaseTest {
Allele Aref, T, C, Cref, ATC, ATCATC;
Allele Aref, T, C, G, Cref, ATC, ATCATC;
private GenomeLocParser genomeLocParser;
@BeforeSuite
@ -58,6 +58,7 @@ public class VariantContextUtilsUnitTest extends BaseTest {
Cref = Allele.create("C", true);
T = Allele.create("T");
C = Allele.create("C");
G = Allele.create("G");
ATC = Allele.create("ATC");
ATCATC = Allele.create("ATCATC");
}
@ -697,10 +698,120 @@ public class VariantContextUtilsUnitTest extends BaseTest {
return ReverseClippingPositionTestProvider.getTests(ReverseClippingPositionTestProvider.class);
}
@Test(dataProvider = "ReverseClippingPositionTestProvider")
public void testReverseClippingPositionTestProvider(ReverseClippingPositionTestProvider cfg) {
int result = VariantContextUtils.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false);
Assert.assertEquals(result, cfg.expectedClip);
}
}
// --------------------------------------------------------------------------------
//
// test splitting into bi-allelics
//
// --------------------------------------------------------------------------------
@DataProvider(name = "SplitBiallelics")
public Object[][] makeSplitBiallelics() throws CloneNotSupportedException {
List<Object[]> tests = new ArrayList<Object[]>();
final VariantContextBuilder root = new VariantContextBuilder("x", "20", 10, 10, Arrays.asList(Aref, C));
// biallelic -> biallelic
tests.add(new Object[]{root.make(), Arrays.asList(root.make())});
// monos -> monos
root.alleles(Arrays.asList(Aref));
tests.add(new Object[]{root.make(), Arrays.asList(root.make())});
root.alleles(Arrays.asList(Aref, C, T));
tests.add(new Object[]{root.make(),
Arrays.asList(
root.alleles(Arrays.asList(Aref, C)).make(),
root.alleles(Arrays.asList(Aref, T)).make())});
root.alleles(Arrays.asList(Aref, C, T, G));
tests.add(new Object[]{root.make(),
Arrays.asList(
root.alleles(Arrays.asList(Aref, C)).make(),
root.alleles(Arrays.asList(Aref, T)).make(),
root.alleles(Arrays.asList(Aref, G)).make())});
final Allele C = Allele.create("C");
final Allele CA = Allele.create("CA");
final Allele CAA = Allele.create("CAA");
final Allele CAAAA = Allele.create("CAAAA");
final Allele CAAAAA = Allele.create("CAAAAA");
final Allele Cref = Allele.create("C", true);
final Allele CAref = Allele.create("CA", true);
final Allele CAAref = Allele.create("CAA", true);
final Allele CAAAref = Allele.create("CAAA", true);
root.alleles(Arrays.asList(Cref, CA, CAA));
tests.add(new Object[]{root.make(),
Arrays.asList(
root.alleles(Arrays.asList(Cref, CA)).make(),
root.alleles(Arrays.asList(Cref, CAA)).make())});
root.alleles(Arrays.asList(CAAref, C, CA)).stop(12);
tests.add(new Object[]{root.make(),
Arrays.asList(
root.alleles(Arrays.asList(CAAref, C)).make(),
root.alleles(Arrays.asList(CAref, C)).stop(11).make())});
root.alleles(Arrays.asList(CAAAref, C, CA, CAA)).stop(13);
tests.add(new Object[]{root.make(),
Arrays.asList(
root.alleles(Arrays.asList(CAAAref, C)).make(),
root.alleles(Arrays.asList(CAAref, C)).stop(12).make(),
root.alleles(Arrays.asList(CAref, C)).stop(11).make())});
root.alleles(Arrays.asList(CAAAref, CAAAAA, CAAAA, CAA, C)).stop(13);
tests.add(new Object[]{root.make(),
Arrays.asList(
root.alleles(Arrays.asList(Cref, CAA)).stop(10).make(),
root.alleles(Arrays.asList(Cref, CA)).stop(10).make(),
root.alleles(Arrays.asList(CAref, C)).stop(11).make(),
root.alleles(Arrays.asList(CAAAref, C)).stop(13).make())});
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "SplitBiallelics")
public void testSplitBiallelicsNoGenotypes(final VariantContext vc, final List<VariantContext> expectedBiallelics) {
final List<VariantContext> biallelics = VariantContextUtils.splitVariantContextToBiallelics(vc);
Assert.assertEquals(biallelics.size(), expectedBiallelics.size());
for ( int i = 0; i < biallelics.size(); i++ ) {
final VariantContext actual = biallelics.get(i);
final VariantContext expected = expectedBiallelics.get(i);
VariantContextTestProvider.assertEquals(actual, expected);
}
}
@Test(dataProvider = "SplitBiallelics", dependsOnMethods = "testSplitBiallelicsNoGenotypes")
public void testSplitBiallelicsGenotypes(final VariantContext vc, final List<VariantContext> expectedBiallelics) {
final List<Genotype> genotypes = new ArrayList<Genotype>();
int sampleI = 0;
for ( final List<Allele> alleles : Utils.makePermutations(vc.getAlleles(), 2, true) ) {
genotypes.add(GenotypeBuilder.create("sample" + sampleI, alleles));
}
genotypes.add(GenotypeBuilder.createMissing("missing", 2));
final VariantContext vcWithGenotypes = new VariantContextBuilder(vc).genotypes(genotypes).make();
final List<VariantContext> biallelics = VariantContextUtils.splitVariantContextToBiallelics(vcWithGenotypes);
for ( int i = 0; i < biallelics.size(); i++ ) {
final VariantContext actual = biallelics.get(i);
Assert.assertEquals(actual.getNSamples(), vcWithGenotypes.getNSamples()); // not dropping any samples
for ( final Genotype inputGenotype : genotypes ) {
final Genotype actualGenotype = actual.getGenotype(inputGenotype.getSampleName());
Assert.assertNotNull(actualGenotype);
if ( ! vc.isVariant() || vc.isBiallelic() )
Assert.assertEquals(actualGenotype, vcWithGenotypes.getGenotype(inputGenotype.getSampleName()));
else
Assert.assertTrue(actualGenotype.isNoCall());
}
}
}
}