New function for merging nearby events into MNPs or complex substitutions. Added extensive unit tests.

This commit is contained in:
Ryan Poplin 2012-07-19 13:16:33 -04:00
parent a4884f82cd
commit 1592841c93
2 changed files with 174 additions and 19 deletions

View File

@ -33,6 +33,7 @@ import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.codecs.vcf.VCFAlleleClipper;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.*;
@ -381,24 +382,7 @@ public class GenotypingEngine {
}
if( R2 > MERGE_EVENTS_R2_THRESHOLD ) {
byte[] refBases = thisVC.getReference().getBases();
byte[] altBases = thisVC.getAlternateAllele(0).getBases();
final int thisPadding = ( thisVC.getReference().isNull() || thisVC.getAlternateAllele(0).isNull() ? 1 : 0 );
final int nextPadding = ( nextVC.getReference().isNull() || nextVC.getAlternateAllele(0).isNull() ? 1 : 0 );
for( int locus = thisStart + refBases.length + thisPadding ; locus < nextStart + nextPadding; locus++ ) {
final byte refByte = ref[ locus - refLoc.getStart() ];
refBases = ArrayUtils.add(refBases, refByte);
altBases = ArrayUtils.add(altBases, refByte);
}
refBases = ArrayUtils.addAll(refBases, nextVC.getReference().getBases());
altBases = ArrayUtils.addAll(altBases, nextVC.getAlternateAllele(0).getBases());
final ArrayList<Allele> mergedAlleles = new ArrayList<Allele>();
mergedAlleles.add( Allele.create( refBases, true ) );
mergedAlleles.add( Allele.create( altBases, false ) );
final VariantContext mergedVC = ( refBases.length == altBases.length ?
new VariantContextBuilder("MNP", thisVC.getChr(), thisVC.getStart() + (thisVC.isIndel() && !thisVC.isComplexIndel() ? 1 : 0) , nextVC.getEnd(), mergedAlleles).make() :
new VariantContextBuilder("Complex", thisVC.getChr(), thisVC.getStart(), nextVC.getEnd(), mergedAlleles).referenceBaseForIndel(ref[thisStart-refLoc.getStart()]).make() );
final VariantContext mergedVC = createMergedVariantContext(thisVC, nextVC, ref, refLoc);
// remove the old event from the eventMap on every haplotype and the start pos key set, replace with merged event
for( final Haplotype h : haplotypes ) {
@ -431,6 +415,36 @@ public class GenotypingEngine {
}
}
// BUGBUG: make this merge function more general
protected static VariantContext createMergedVariantContext( final VariantContext thisVC, final VariantContext nextVC, final byte[] ref, final GenomeLoc refLoc ) {
final int thisStart = thisVC.getStart();
final int nextStart = nextVC.getStart();
byte[] refBases = ( thisVC.hasReferenceBaseForIndel() ? new byte[]{ thisVC.getReferenceBaseForIndel() } : new byte[]{} );
byte[] altBases = ( thisVC.hasReferenceBaseForIndel() ? new byte[]{ thisVC.getReferenceBaseForIndel() } : new byte[]{} );
refBases = ArrayUtils.addAll(refBases, thisVC.getReference().getBases());
altBases = ArrayUtils.addAll(altBases, thisVC.getAlternateAllele(0).getBases());
for( int locus = thisStart + refBases.length; locus < nextStart; locus++ ) {
final byte refByte = ref[locus - refLoc.getStart()];
refBases = ArrayUtils.add(refBases, refByte);
altBases = ArrayUtils.add(altBases, refByte);
}
if( nextVC.hasReferenceBaseForIndel() ) {
refBases = ArrayUtils.add(refBases, nextVC.getReferenceBaseForIndel());
altBases = ArrayUtils.add(altBases, nextVC.getReferenceBaseForIndel());
}
refBases = ArrayUtils.addAll(refBases, nextVC.getReference().getBases());
altBases = ArrayUtils.addAll(altBases, nextVC.getAlternateAllele(0).getBases());
int iii = 0;
if( refBases.length == altBases.length && VCFAlleleClipper.needsPadding(thisVC) ) { // special case of insertion + deletion of same length creates an MNP --> trim padding bases off the allele
while( iii < refBases.length && refBases[iii] == altBases[iii] ) { iii++; }
}
final ArrayList<Allele> mergedAlleles = new ArrayList<Allele>();
mergedAlleles.add( Allele.create( ArrayUtils.subarray(refBases, iii, refBases.length), true ) );
mergedAlleles.add( Allele.create( ArrayUtils.subarray(altBases, iii, altBases.length), false ) );
return new VariantContextBuilder("merged", thisVC.getChr(), thisVC.getStart() + iii, nextVC.getEnd(), mergedAlleles).make();
}
@Requires({"x11 >= 0", "x12 >= 0", "x21 >= 0", "x22 >= 0"})
@Ensures({"result >= 0.0", "result <= 1.0"})
protected static double calculateR2LD( final int x11, final int x12, final int x21, final int x22 ) {

View File

@ -12,6 +12,7 @@ import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
@ -232,7 +233,7 @@ public class GenotypingEngineUnitTest extends BaseTest {
}
/**
* Tests that we get the right values from the binomial distribution
* Tests that we get the right values from the R^2 calculation
*/
@Test
public void testCalculateR2LD() {
@ -244,6 +245,146 @@ public class GenotypingEngineUnitTest extends BaseTest {
Assert.assertEquals(GenotypingEngine.calculateR2LD(100,0,0,100), 1.0, 0.00001);
Assert.assertEquals(GenotypingEngine.calculateR2LD(1,2,3,4), (0.1 - 0.12) * (0.1 - 0.12) / (0.3 * 0.7 * 0.4 * 0.6), 0.00001);
}
@Test
public void testCreateMergedVariantContext() {
logger.warn("Executing testCreateMergedVariantContext");
final byte[] ref = "AATTCCGGAATTCCGGAATT".getBytes();
final GenomeLoc refLoc = genomeLocParser.createGenomeLoc("2", 1700, 1700 + ref.length);
// SNP + SNP = simple MNP
VariantContext thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make();
VariantContext nextVC = new VariantContextBuilder().loc("2", 1704, 1704).alleles("C","G").make();
VariantContext truthVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","GG").source("merged").make();
VariantContext mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
logger.warn(truthVC + " == " + mergedVC);
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
// SNP + ref + SNP = MNP with ref base gap
thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make();
nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make();
truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCG").source("merged").make();
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
logger.warn(truthVC + " == " + mergedVC);
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
// insertion + SNP
thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("-","AAAAA").referenceBaseForIndel("T").make();
nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make();
truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TAAAAACG").source("merged").make();
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
logger.warn(truthVC + " == " + mergedVC);
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
// SNP + insertion
thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make();
nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("-","AAAAA").referenceBaseForIndel("C").make();
truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCCAAAAA").source("merged").make();
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
logger.warn(truthVC + " == " + mergedVC);
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
// deletion + SNP
thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("C","-").referenceBaseForIndel("T").make();
nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make();
truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TG").source("merged").make();
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
logger.warn(truthVC + " == " + mergedVC);
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
// SNP + deletion
thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make();
nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("G","-").referenceBaseForIndel("C").make();
truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","GCC").source("merged").make();
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
logger.warn(truthVC + " == " + mergedVC);
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
// insertion + deletion = MNP
thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("-","A").referenceBaseForIndel("T").make();
nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("G","-").referenceBaseForIndel("C").make();
truthVC = new VariantContextBuilder().loc("2", 1704, 1706).alleles("CCG","ACC").source("merged").make();
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
logger.warn(truthVC + " == " + mergedVC);
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
// insertion + deletion
thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("-","AAAAA").referenceBaseForIndel("T").make();
nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("G","-").referenceBaseForIndel("C").make();
truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","TAAAAACC").source("merged").make();
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
logger.warn(truthVC + " == " + mergedVC);
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
// insertion + insertion
thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("-","A").referenceBaseForIndel("T").make();
nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("-","A").referenceBaseForIndel("C").make();
truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TACCA").source("merged").make();
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
logger.warn(truthVC + " == " + mergedVC);
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
// deletion + deletion
thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("T","-").referenceBaseForIndel("A").make();
nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("G","-").referenceBaseForIndel("C").make();
truthVC = new VariantContextBuilder().loc("2", 1701, 1706).alleles("ATTCCG","ATCC").source("merged").make();
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
logger.warn(truthVC + " == " + mergedVC);
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
// complex + complex
thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","AAA").make();
nextVC = new VariantContextBuilder().loc("2", 1706, 1707).alleles("GG","AC").make();
truthVC = new VariantContextBuilder().loc("2", 1703, 1707).alleles("TCCGG","AAACAC").source("merged").make();
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
logger.warn(truthVC + " == " + mergedVC);
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
}
/**
* Private function to compare HashMap of VCs, it only checks the types and start locations of the VariantContext