From 1592841c93a5c50f0a76f33c5e27e50decc8e7fa Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 19 Jul 2012 13:16:33 -0400 Subject: [PATCH] New function for merging nearby events into MNPs or complex substitutions. Added extensive unit tests. --- .../haplotypecaller/GenotypingEngine.java | 50 +++--- .../GenotypingEngineUnitTest.java | 143 +++++++++++++++++- 2 files changed, 174 insertions(+), 19 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 47a7030a8..bdd04cba1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -33,6 +33,7 @@ import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.codecs.vcf.VCFAlleleClipper; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.*; @@ -381,24 +382,7 @@ public class GenotypingEngine { } if( R2 > MERGE_EVENTS_R2_THRESHOLD ) { - byte[] refBases = thisVC.getReference().getBases(); - byte[] altBases = thisVC.getAlternateAllele(0).getBases(); - final int thisPadding = ( thisVC.getReference().isNull() || thisVC.getAlternateAllele(0).isNull() ? 1 : 0 ); - final int nextPadding = ( nextVC.getReference().isNull() || nextVC.getAlternateAllele(0).isNull() ? 1 : 0 ); - for( int locus = thisStart + refBases.length + thisPadding ; locus < nextStart + nextPadding; locus++ ) { - final byte refByte = ref[ locus - refLoc.getStart() ]; - refBases = ArrayUtils.add(refBases, refByte); - altBases = ArrayUtils.add(altBases, refByte); - } - refBases = ArrayUtils.addAll(refBases, nextVC.getReference().getBases()); - altBases = ArrayUtils.addAll(altBases, nextVC.getAlternateAllele(0).getBases()); - - final ArrayList mergedAlleles = new ArrayList(); - mergedAlleles.add( Allele.create( refBases, true ) ); - mergedAlleles.add( Allele.create( altBases, false ) ); - final VariantContext mergedVC = ( refBases.length == altBases.length ? - new VariantContextBuilder("MNP", thisVC.getChr(), thisVC.getStart() + (thisVC.isIndel() && !thisVC.isComplexIndel() ? 1 : 0) , nextVC.getEnd(), mergedAlleles).make() : - new VariantContextBuilder("Complex", thisVC.getChr(), thisVC.getStart(), nextVC.getEnd(), mergedAlleles).referenceBaseForIndel(ref[thisStart-refLoc.getStart()]).make() ); + final VariantContext mergedVC = createMergedVariantContext(thisVC, nextVC, ref, refLoc); // remove the old event from the eventMap on every haplotype and the start pos key set, replace with merged event for( final Haplotype h : haplotypes ) { @@ -431,6 +415,36 @@ public class GenotypingEngine { } } + // BUGBUG: make this merge function more general + protected static VariantContext createMergedVariantContext( final VariantContext thisVC, final VariantContext nextVC, final byte[] ref, final GenomeLoc refLoc ) { + final int thisStart = thisVC.getStart(); + final int nextStart = nextVC.getStart(); + byte[] refBases = ( thisVC.hasReferenceBaseForIndel() ? new byte[]{ thisVC.getReferenceBaseForIndel() } : new byte[]{} ); + byte[] altBases = ( thisVC.hasReferenceBaseForIndel() ? new byte[]{ thisVC.getReferenceBaseForIndel() } : new byte[]{} ); + refBases = ArrayUtils.addAll(refBases, thisVC.getReference().getBases()); + altBases = ArrayUtils.addAll(altBases, thisVC.getAlternateAllele(0).getBases()); + for( int locus = thisStart + refBases.length; locus < nextStart; locus++ ) { + final byte refByte = ref[locus - refLoc.getStart()]; + refBases = ArrayUtils.add(refBases, refByte); + altBases = ArrayUtils.add(altBases, refByte); + } + if( nextVC.hasReferenceBaseForIndel() ) { + refBases = ArrayUtils.add(refBases, nextVC.getReferenceBaseForIndel()); + altBases = ArrayUtils.add(altBases, nextVC.getReferenceBaseForIndel()); + } + refBases = ArrayUtils.addAll(refBases, nextVC.getReference().getBases()); + altBases = ArrayUtils.addAll(altBases, nextVC.getAlternateAllele(0).getBases()); + + int iii = 0; + if( refBases.length == altBases.length && VCFAlleleClipper.needsPadding(thisVC) ) { // special case of insertion + deletion of same length creates an MNP --> trim padding bases off the allele + while( iii < refBases.length && refBases[iii] == altBases[iii] ) { iii++; } + } + final ArrayList mergedAlleles = new ArrayList(); + mergedAlleles.add( Allele.create( ArrayUtils.subarray(refBases, iii, refBases.length), true ) ); + mergedAlleles.add( Allele.create( ArrayUtils.subarray(altBases, iii, altBases.length), false ) ); + return new VariantContextBuilder("merged", thisVC.getChr(), thisVC.getStart() + iii, nextVC.getEnd(), mergedAlleles).make(); + } + @Requires({"x11 >= 0", "x12 >= 0", "x21 >= 0", "x22 >= 0"}) @Ensures({"result >= 0.0", "result <= 1.0"}) protected static double calculateR2LD( final int x11, final int x12, final int x21, final int x22 ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java index 4826bfb16..04bb3a753 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java @@ -12,6 +12,7 @@ import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; @@ -232,7 +233,7 @@ public class GenotypingEngineUnitTest extends BaseTest { } /** - * Tests that we get the right values from the binomial distribution + * Tests that we get the right values from the R^2 calculation */ @Test public void testCalculateR2LD() { @@ -244,6 +245,146 @@ public class GenotypingEngineUnitTest extends BaseTest { Assert.assertEquals(GenotypingEngine.calculateR2LD(100,0,0,100), 1.0, 0.00001); Assert.assertEquals(GenotypingEngine.calculateR2LD(1,2,3,4), (0.1 - 0.12) * (0.1 - 0.12) / (0.3 * 0.7 * 0.4 * 0.6), 0.00001); } + + @Test + public void testCreateMergedVariantContext() { + logger.warn("Executing testCreateMergedVariantContext"); + + final byte[] ref = "AATTCCGGAATTCCGGAATT".getBytes(); + final GenomeLoc refLoc = genomeLocParser.createGenomeLoc("2", 1700, 1700 + ref.length); + + // SNP + SNP = simple MNP + VariantContext thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); + VariantContext nextVC = new VariantContextBuilder().loc("2", 1704, 1704).alleles("C","G").make(); + VariantContext truthVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","GG").source("merged").make(); + VariantContext mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel()); + Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel()); + + // SNP + ref + SNP = MNP with ref base gap + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCG").source("merged").make(); + mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel()); + Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel()); + + // insertion + SNP + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("-","AAAAA").referenceBaseForIndel("T").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TAAAAACG").source("merged").make(); + mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel()); + Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel()); + + // SNP + insertion + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("-","AAAAA").referenceBaseForIndel("C").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCCAAAAA").source("merged").make(); + mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel()); + Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel()); + + // deletion + SNP + thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("C","-").referenceBaseForIndel("T").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TG").source("merged").make(); + mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel()); + Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel()); + + // SNP + deletion + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("G","-").referenceBaseForIndel("C").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","GCC").source("merged").make(); + mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel()); + Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel()); + + // insertion + deletion = MNP + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("-","A").referenceBaseForIndel("T").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("G","-").referenceBaseForIndel("C").make(); + truthVC = new VariantContextBuilder().loc("2", 1704, 1706).alleles("CCG","ACC").source("merged").make(); + mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel()); + Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel()); + + // insertion + deletion + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("-","AAAAA").referenceBaseForIndel("T").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("G","-").referenceBaseForIndel("C").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","TAAAAACC").source("merged").make(); + mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel()); + Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel()); + + // insertion + insertion + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("-","A").referenceBaseForIndel("T").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("-","A").referenceBaseForIndel("C").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TACCA").source("merged").make(); + mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel()); + Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel()); + + // deletion + deletion + thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("T","-").referenceBaseForIndel("A").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("G","-").referenceBaseForIndel("C").make(); + truthVC = new VariantContextBuilder().loc("2", 1701, 1706).alleles("ATTCCG","ATCC").source("merged").make(); + mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel()); + Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel()); + + // complex + complex + thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","AAA").make(); + nextVC = new VariantContextBuilder().loc("2", 1706, 1707).alleles("GG","AC").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1707).alleles("TCCGG","AAACAC").source("merged").make(); + mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel()); + Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel()); + } /** * Private function to compare HashMap of VCs, it only checks the types and start locations of the VariantContext