From 198923b597e3635ec5f71aaa192ac4bcaca36ddd Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Mon, 26 Nov 2012 15:01:13 -0500 Subject: [PATCH] Add ActiveRegionReadState handling --- .../haplotypecaller/HaplotypeCaller.java | 11 +- .../traversals/TraverseActiveRegions.java | 16 +- .../gatk/walkers/ActiveRegionWalker.java | 20 +- .../activeregion/ActiveRegionReadState.java | 16 ++ .../traversals/TraverseActiveRegionsTest.java | 207 +++++++++++++++--- 5 files changed, 230 insertions(+), 40 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 24b3309f1..d194e2620 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -45,6 +45,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.codecs.vcf.*; @@ -295,9 +296,15 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Override public boolean includeReadsWithDeletionAtLoci() { return true; } - // enable non primary reads in the active region + // enable non primary and extended reads in the active region @Override - public boolean wantsNonPrimaryReads() { return true; } + public EnumSet desiredReadStates() { + return EnumSet.of( + ActiveRegionReadState.PRIMARY, + ActiveRegionReadState.NONPRIMARY, + ActiveRegionReadState.EXTENDED + ); + } @Override @Ensures({"result.isActiveProb >= 0.0", "result.isActiveProb <= 1.0"}) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 3f20db0af..06fc01232 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -258,13 +258,23 @@ public class TraverseActiveRegions extends TraversalEngine extends Walker desiredReadStates() { + return EnumSet.of(ActiveRegionReadState.PRIMARY); + } + + public final boolean wantsNonPrimaryReads() { + return desiredReadStates().contains(ActiveRegionReadState.NONPRIMARY); + } + + public boolean wantsExtendedReads() { + return desiredReadStates().contains(ActiveRegionReadState.EXTENDED); + } + + public boolean wantsUnmappedReads() { + return desiredReadStates().contains(ActiveRegionReadState.UNMAPPED); } // Determine probability of active status over the AlignmentContext diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java new file mode 100644 index 000000000..00e491eb0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java @@ -0,0 +1,16 @@ +package org.broadinstitute.sting.utils.activeregion; + +/** + * Created with IntelliJ IDEA. + * User: thibault + * Date: 11/26/12 + * Time: 2:35 PM + * + * Describes how a read relates to an assigned ActiveRegion + */ +public enum ActiveRegionReadState { + PRIMARY, // This is the read's primary region + NONPRIMARY, // This region overlaps the read, but it is not primary + EXTENDED, // This region would overlap the read if it were extended + UNMAPPED // This read is not mapped +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index 66504da11..b70085eff 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.traversals; import com.google.java.contract.PreconditionError; import net.sf.samtools.*; +import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -46,6 +47,8 @@ public class TraverseActiveRegionsTest extends BaseTest { private class DummyActiveRegionWalker extends ActiveRegionWalker { private final double prob; + private EnumSet states = super.desiredReadStates(); + protected List isActiveCalls = new ArrayList(); protected Map mappedActiveRegions = new HashMap(); @@ -57,6 +60,16 @@ public class TraverseActiveRegionsTest extends BaseTest { this.prob = constProb; } + public DummyActiveRegionWalker(EnumSet wantStates) { + this.prob = 1.0; + this.states = wantStates; + } + + @Override + public EnumSet desiredReadStates() { + return states; + } + @Override public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { isActiveCalls.add(ref.getLocus()); @@ -202,12 +215,158 @@ public class TraverseActiveRegionsTest extends BaseTest { } @Test - public void testReadMapping() { + public void testPrimaryReadMapping() { DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); // Contract: Each read has the Primary state in a single region (or none) // This is the region of maximum overlap for the read (earlier if tied) + // simple: Primary in 1:1-999 + // overlap_equal: Primary in 1:1-999 + // overlap_unequal: Primary in 1:1-999 + // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 + // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // extended_only: Extended in 1:2000-2999 + // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 + // outside_intervals: none + // simple20: Primary in 20:10000-10100 + + Map activeRegions = getActiveRegions(walker, intervals); + ActiveRegion region; + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); + + getRead(region, "simple"); + getRead(region, "overlap_equal"); + getRead(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + getRead(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + // TODO: fail getRead(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + getRead(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + getRead(region, "simple20"); + + // TODO: more tests and edge cases + } + + @Test + public void testNonPrimaryReadMapping() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker( + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY)); + + // Contract: Each read has the Primary state in a single region (or none) + // This is the region of maximum overlap for the read (earlier if tied) + + // Contract: Each read has the Non-Primary state in all other regions it overlaps + + // simple: Primary in 1:1-999 + // overlap_equal: Primary in 1:1-999 + // overlap_unequal: Primary in 1:1-999 + // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 + // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // extended_only: Extended in 1:2000-2999 + // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 + // outside_intervals: none + // simple20: Primary in 20:10000-10100 + + Map activeRegions = getActiveRegions(walker, intervals); + ActiveRegion region; + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); + + getRead(region, "simple"); + getRead(region, "overlap_equal"); + getRead(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + // TODO: fail getRead(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + getRead(region, "boundary_equal"); + getRead(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + // TODO: fail getRead(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + getRead(region, "boundary_equal"); + getRead(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + getRead(region, "simple20"); + + // TODO: more tests and edge cases + } + + @Test + public void testExtendedReadMapping() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker( + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED)); + + // Contract: Each read has the Primary state in a single region (or none) + // This is the region of maximum overlap for the read (earlier if tied) + // Contract: Each read has the Non-Primary state in all other regions it overlaps // Contract: Each read has the Extended state in regions where it only overlaps if the region is extended @@ -226,13 +385,13 @@ public class TraverseActiveRegionsTest extends BaseTest { region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); - verifyReadPrimary(region, "simple"); - verifyReadPrimary(region, "overlap_equal"); - verifyReadPrimary(region, "overlap_unequal"); + getRead(region, "simple"); + getRead(region, "overlap_equal"); + getRead(region, "overlap_unequal"); verifyReadNotPlaced(region, "boundary_equal"); verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_only"); - // TODO: fail verifyReadNonPrimary(region, "extended_and_np"); + // TODO: fail getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); verifyReadNotPlaced(region, "simple20"); @@ -241,10 +400,10 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "simple"); verifyReadNotPlaced(region, "overlap_equal"); verifyReadNotPlaced(region, "overlap_unequal"); - // TODO: fail verifyReadNonPrimary(region, "boundary_equal"); - verifyReadPrimary(region, "boundary_unequal"); + getRead(region, "boundary_equal"); + getRead(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_only"); - // TODO: fail verifyReadPrimary(region, "extended_and_np"); + // TODO: fail getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); verifyReadNotPlaced(region, "simple20"); @@ -253,10 +412,10 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "simple"); verifyReadNotPlaced(region, "overlap_equal"); verifyReadNotPlaced(region, "overlap_unequal"); - verifyReadPrimary(region, "boundary_equal"); - // TODO: fail verifyReadNonPrimary(region, "boundary_unequal"); - // TODO: fail verifyReadExtended(region, "extended_only"); - // TODO: fail verifyReadExtended(region, "extended_and_np"); + getRead(region, "boundary_equal"); + getRead(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); verifyReadNotPlaced(region, "simple20"); @@ -267,33 +426,19 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "overlap_unequal"); verifyReadNotPlaced(region, "boundary_equal"); verifyReadNotPlaced(region, "boundary_unequal"); - verifyReadNotPlaced(region, "extended_only"); - verifyReadNotPlaced(region, "extended_and_np"); + // TODO: fail getRead(region, "extended_only"); + // TODO: fail getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); - verifyReadPrimary(region, "simple20"); + getRead(region, "simple20"); // TODO: more tests and edge cases } - private void verifyReadPrimary(ActiveRegion region, String readName) { - SAMRecord read = getRead(region, readName); - Assert.assertFalse(read.getNotPrimaryAlignmentFlag(), "Read " + read + " not primary in active region " + region); - } - - private void verifyReadNonPrimary(ActiveRegion region, String readName) { - SAMRecord read = getRead(region, readName); - Assert.assertTrue(read.getNotPrimaryAlignmentFlag(), "Read " + read + " primary in active region " + region); - } - - private void verifyReadExtended(ActiveRegion region, String readName) { - Assert.fail("The Extended read state has not been implemented"); - } - private void verifyReadNotPlaced(ActiveRegion region, String readName) { for (SAMRecord read : region.getReads()) { if (read.getReadName().equals(readName)) Assert.fail("Read " + readName + " found in active region " + region); - } + } } private SAMRecord getRead(ActiveRegion region, String readName) { @@ -302,7 +447,7 @@ public class TraverseActiveRegionsTest extends BaseTest { return read; } - Assert.fail("Read " + readName + " not found in active region " + region); + Assert.fail("Read " + readName + " not assigned to active region " + region); return null; }