From 36c27155afef2d7d8b69db00a6686dfc647ff98f Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Thu, 2 Oct 2014 14:25:49 -0400 Subject: [PATCH] Made the threshold for the probability of a state being active a command line argument remove TODO comment after activeProbThreshold recover static ACTIVE_PROB_THRESHOLD for unit tests Add min/max values for active_probability_threshold parameter Move activeProbThreshold parameter to GATKArguemtnCollection define ACTIVE_PROB_THRESHOLD in unit tests add construction of argCollection in in ctor Move arguments from GATKArgumentCollection to ActiveRegionWalker Throw exception if threshold < 0 or > 1 in ActivityProfile ctor max propogation distance parameter to ActiveRegionWalker for AcrtivityProfile Use polymorphic getMaxProbPropagationDistance() so BandPassActivityProfile computes the crrect region size cutoff Get the maxProbPropagationDistance from the super class's method, instead of directly, this is safer Removed extraneous command line imports and make maxProbPropagationDistance a hidden argument remove limit check for activeProbThreshold, not necessary because the check is made when imput as a command line arg Remove extra 'region' in the doxygen param description for maxProbPropagationDistance --- .../HaplotypeCallerIntegrationTest.java | 2 +- .../traversals/TraverseActiveRegions.java | 3 ++- .../engine/walkers/ActiveRegionWalker.java | 11 ++++++++ .../utils/activeregion/ActivityProfile.java | 26 ++++++++++++------- .../activeregion/BandPassActivityProfile.java | 22 ++++++++++------ .../activeregion/ActivityProfileUnitTest.java | 15 ++++++----- .../BandPassActivityProfileUnitTest.java | 19 +++++++++----- 7 files changed, 65 insertions(+), 33 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 2853fce29..11576911f 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -78,7 +78,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final static String GGA_INTERVALS_FILE = privateTestDir + "haplotype-caller-reduced-test-interval.list"; private void HCTest(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE --maxReadsInRegionPerSample 1000 --minReadsPerAlignmentStart 5 -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; + final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE --maxReadsInRegionPerSample 1000 --minReadsPerAlignmentStart 5 --maxProbPropagationDistance 50 --activeProbabilityThreshold 0.002 -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCaller: args=" + args, spec); } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegions.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegions.java index 3a8c3ffab..7d93311f2 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegions.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegions.java @@ -159,7 +159,8 @@ public final class TraverseActiveRegions extends TraversalEngine extends Walker stateList; protected final GenomeLocParser parser; protected final GenomeLocSortedSet restrictToIntervals; + protected final int maxProbPropagationDistance; + protected final double activeProbThreshold; + protected GenomeLoc regionStartLoc = null; protected GenomeLoc regionStopLoc = null; @@ -60,22 +60,28 @@ public class ActivityProfile { /** * Create a new empty ActivityProfile * @param parser the parser we can use to create genome locs, cannot be null + * @param maxProbPropagationDistance region probability propagation distance beyond it's maximum size + * @param activeProbThreshold threshold for the probability of am active profile state being active */ - public ActivityProfile(final GenomeLocParser parser) { - this(parser, null); + public ActivityProfile(final GenomeLocParser parser, final int maxProbPropagationDistance, final double activeProbThreshold) { + this(parser, maxProbPropagationDistance, activeProbThreshold, null); } /** * Create a empty ActivityProfile, restricting output to profiles overlapping intervals, if not null * @param parser the parser we can use to create genome locs, cannot be null + * @param maxProbPropagationDistance region probability propagation distance beyond it's maximum size + * @param activeProbThreshold threshold for the probability of a profile state being active * @param intervals only include states that are within these intervals, if not null */ - public ActivityProfile(final GenomeLocParser parser, final GenomeLocSortedSet intervals) { + public ActivityProfile(final GenomeLocParser parser, final int maxProbPropagationDistance, final double activeProbThreshold, final GenomeLocSortedSet intervals) { if ( parser == null ) throw new IllegalArgumentException("parser cannot be null"); this.parser = parser; this.stateList = new ArrayList(); this.restrictToIntervals = intervals; + this.maxProbPropagationDistance = maxProbPropagationDistance; + this.activeProbThreshold = activeProbThreshold; } @Override @@ -98,7 +104,7 @@ public class ActivityProfile { */ @Ensures("result >= 0") public int getMaxProbPropagationDistance() { - return MAX_PROB_PROPAGATION_DISTANCE; + return maxProbPropagationDistance; } /** @@ -359,7 +365,7 @@ public class ActivityProfile { } final ActivityProfileState first = stateList.get(0); - final boolean isActiveRegion = first.isActiveProb > ACTIVE_PROB_THRESHOLD; + final boolean isActiveRegion = first.isActiveProb > activeProbThreshold; final int offsetOfNextRegionEnd = findEndOfRegion(isActiveRegion, minRegionSize, maxRegionSize, forceConversion); if ( offsetOfNextRegionEnd == -1 ) // couldn't find a valid ending offset, so we return null @@ -453,7 +459,7 @@ public class ActivityProfile { * Find the first index into the state list where the state is considered ! isActiveRegion * * Note that each state has a probability of being active, and this function thresholds that - * value on ACTIVE_PROB_THRESHOLD, coloring each state as active or inactive. Finds the + * value on activeProbThreshold, coloring each state as active or inactive. Finds the * largest contiguous stretch of states starting at the first state (index 0) with the same isActive * state as isActiveRegion. If the entire state list has the same isActive value, then returns * maxRegionSize @@ -470,7 +476,7 @@ public class ActivityProfile { int endOfActiveRegion = 0; while ( endOfActiveRegion < nStates && endOfActiveRegion < maxRegionSize ) { - if ( getProb(endOfActiveRegion) > ACTIVE_PROB_THRESHOLD != isActiveRegion ) { + if ( getProb(endOfActiveRegion) > activeProbThreshold != isActiveRegion ) { break; } endOfActiveRegion++; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/activeregion/BandPassActivityProfile.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/activeregion/BandPassActivityProfile.java index b79050c87..52437a846 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/activeregion/BandPassActivityProfile.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/activeregion/BandPassActivityProfile.java @@ -55,19 +55,22 @@ public class BandPassActivityProfile extends ActivityProfile { /** * Create a new BandPassActivityProfile with default sigma and filter sizes * - * @see #BandPassActivityProfile(org.broadinstitute.gatk.utils.GenomeLocParser, org.broadinstitute.gatk.utils.GenomeLocSortedSet, int, double, boolean) + * @see #BandPassActivityProfile(org.broadinstitute.gatk.utils.GenomeLocParser, org.broadinstitute.gatk.utils.GenomeLocSortedSet, int, double, int, double, boolean) */ - public BandPassActivityProfile(final GenomeLocParser parser, final GenomeLocSortedSet restrictToIntervals) { - this(parser, restrictToIntervals, MAX_FILTER_SIZE, DEFAULT_SIGMA, true); + public BandPassActivityProfile(final GenomeLocParser parser, final GenomeLocSortedSet restrictToIntervals, + final int maxProbPropagationDistance, final double activeProbThreshold) { + this(parser, restrictToIntervals, maxProbPropagationDistance, activeProbThreshold, MAX_FILTER_SIZE, DEFAULT_SIGMA); } /** - * @see #BandPassActivityProfile(org.broadinstitute.gatk.utils.GenomeLocParser, org.broadinstitute.gatk.utils.GenomeLocSortedSet, int, double, boolean) + * @see #BandPassActivityProfile(org.broadinstitute.gatk.utils.GenomeLocParser, org.broadinstitute.gatk.utils.GenomeLocSortedSet, int, double, int, double, boolean) * * sets adaptiveFilterSize to true */ - public BandPassActivityProfile(final GenomeLocParser parser, final GenomeLocSortedSet restrictToIntervals, final int maxFilterSize, final double sigma) { - this(parser, restrictToIntervals, maxFilterSize, sigma, true); + public BandPassActivityProfile(final GenomeLocParser parser, final GenomeLocSortedSet restrictToIntervals, + final int maxProbPropagationDistance, final double activeProbThreshold, + final int maxFilterSize, final double sigma) { + this(parser, restrictToIntervals, maxProbPropagationDistance, activeProbThreshold, maxFilterSize, sigma, true); } /** @@ -75,12 +78,15 @@ public class BandPassActivityProfile extends ActivityProfile { * * @param parser our genome loc parser * @param restrictToIntervals only include states that are within these intervals, if not null + * @param maxProbPropagationDistance region probability propagation distance beyond it's maximum size + * @param activeProbThreshold threshold for the probability of a profile state being active * @param maxFilterSize the maximum size of the band pass filter we are allowed to create, regardless of sigma * @param sigma the variance of the Gaussian kernel for this band pass filter * @param adaptiveFilterSize if true, use the kernel itself to determine the best filter size */ - public BandPassActivityProfile(final GenomeLocParser parser, final GenomeLocSortedSet restrictToIntervals, final int maxFilterSize, final double sigma, final boolean adaptiveFilterSize) { - super(parser, restrictToIntervals); + public BandPassActivityProfile(final GenomeLocParser parser, final GenomeLocSortedSet restrictToIntervals, final int maxProbPropagationDistance, + final double activeProbThreshold, final int maxFilterSize, final double sigma, final boolean adaptiveFilterSize) { + super(parser, maxProbPropagationDistance, activeProbThreshold, restrictToIntervals); if ( sigma < 0 ) throw new IllegalArgumentException("Sigma must be greater than or equal to 0 but got " + sigma); diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActivityProfileUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActivityProfileUnitTest.java index d450312e8..b3442b331 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActivityProfileUnitTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActivityProfileUnitTest.java @@ -51,6 +51,9 @@ public class ActivityProfileUnitTest extends BaseTest { private GenomeLocParser genomeLocParser; private GenomeLoc startLoc; + private final static int MAX_PROB_PROPAGATION_DISTANCE = 50; + private final static double ACTIVE_PROB_THRESHOLD= 0.002; + @BeforeClass public void init() throws FileNotFoundException { // sequence @@ -86,10 +89,10 @@ public class ActivityProfileUnitTest extends BaseTest { public ActivityProfile makeProfile() { switch ( type ) { - case Base: return new ActivityProfile(genomeLocParser); + case Base: return new ActivityProfile(genomeLocParser, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD); case BandPass: // zero size => equivalent to ActivityProfile - return new BandPassActivityProfile(genomeLocParser, null, 0, 0.01, false); + return new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD, 0, 0.01, false); default: throw new IllegalStateException(type.toString()); } } @@ -230,7 +233,7 @@ public class ActivityProfileUnitTest extends BaseTest { @Test(enabled = !DEBUG, dataProvider = "RegionCreationTests") public void testRegionCreation(final int start, final List probs, int maxRegionSize, final int nParts, final boolean forceConversion, final boolean waitUntilEnd) { - final ActivityProfile profile = new ActivityProfile(genomeLocParser); + final ActivityProfile profile = new ActivityProfile(genomeLocParser, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD); Assert.assertNotNull(profile.toString()); final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName(); @@ -316,7 +319,7 @@ public class ActivityProfileUnitTest extends BaseTest { @Test(enabled = ! DEBUG, dataProvider = "SoftClipsTest") public void testSoftClips(final int start, int nPrecedingSites, final int softClipSize) { - final ActivityProfile profile = new ActivityProfile(genomeLocParser); + final ActivityProfile profile = new ActivityProfile(genomeLocParser, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD); final int contigLength = genomeLocParser.getContigs().getSequences().get(0).getSequenceLength(); final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName(); @@ -450,7 +453,7 @@ public class ActivityProfileUnitTest extends BaseTest { private double[] makeGaussian(final int mean, final int range, final double sigma) { final double[] gauss = new double[range]; for( int iii = 0; iii < range; iii++ ) { - gauss[iii] = MathUtils.normalDistribution(mean, sigma, iii) + ActivityProfile.ACTIVE_PROB_THRESHOLD; + gauss[iii] = MathUtils.normalDistribution(mean, sigma, iii) + ACTIVE_PROB_THRESHOLD; } return gauss; } @@ -469,7 +472,7 @@ public class ActivityProfileUnitTest extends BaseTest { @Test(dataProvider = "ActiveRegionCutTests") public void testActiveRegionCutTests(final int minRegionSize, final int maxRegionSize, final int expectedRegionSize, final List probs) { - final ActivityProfile profile = new ActivityProfile(genomeLocParser); + final ActivityProfile profile = new ActivityProfile(genomeLocParser, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD); final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName(); for ( int i = 0; i <= maxRegionSize + profile.getMaxProbPropagationDistance(); i++ ) { diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/activeregion/BandPassActivityProfileUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/activeregion/BandPassActivityProfileUnitTest.java index abf599a50..2087d9a0c 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/activeregion/BandPassActivityProfileUnitTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/activeregion/BandPassActivityProfileUnitTest.java @@ -59,6 +59,9 @@ public class BandPassActivityProfileUnitTest extends BaseTest { private final static boolean DEBUG = false; private GenomeLocParser genomeLocParser; + private final static int MAX_PROB_PROPAGATION_DISTANCE = 50; + private final static double ACTIVE_PROB_THRESHOLD= 0.002; + @BeforeClass public void init() throws FileNotFoundException { // sequence @@ -91,7 +94,7 @@ public class BandPassActivityProfileUnitTest extends BaseTest { @Test(enabled = ! DEBUG, dataProvider = "BandPassBasicTest") public void testBandPass(final int start, final boolean precedingIsActive, final int nPrecedingSites, final int bandPassSize, final double sigma) { - final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, bandPassSize, sigma, false); + final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD, bandPassSize, sigma, false); final int expectedBandSize = bandPassSize * 2 + 1; Assert.assertEquals(profile.getFilteredSize(), bandPassSize, "Wrong filter size"); @@ -155,7 +158,8 @@ public class BandPassActivityProfileUnitTest extends BaseTest { @Test( enabled = ! DEBUG, dataProvider = "BandPassComposition") public void testBandPassComposition(final int bandPassSize, final int integrationLength) { final int start = 1; - final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, bandPassSize, BandPassActivityProfile.DEFAULT_SIGMA); + final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, + ACTIVE_PROB_THRESHOLD, bandPassSize, BandPassActivityProfile.DEFAULT_SIGMA); final double[] rawActiveProbs = new double[integrationLength + bandPassSize * 2]; // add a buffer so that we can get all of the band pass values @@ -228,7 +232,8 @@ public class BandPassActivityProfileUnitTest extends BaseTest { @Test( enabled = ! DEBUG, dataProvider = "KernelCreation") public void testKernelCreation(final double sigma, final int maxSize, final double[] expectedKernel) { - final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, maxSize, sigma, true); + final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD, + maxSize, sigma, true); final double[] kernel = profile.getKernel(); Assert.assertEquals(kernel.length, expectedKernel.length); @@ -268,8 +273,8 @@ public class BandPassActivityProfileUnitTest extends BaseTest { final Pair> reader = GATKVCFUtils.readAllVCs(file, codec); final List incRegions = new ArrayList(); - final BandPassActivityProfile incProfile = new BandPassActivityProfile(genomeLocParser, null); - final BandPassActivityProfile fullProfile = new BandPassActivityProfile(genomeLocParser, null); + final BandPassActivityProfile incProfile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD); + final BandPassActivityProfile fullProfile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD); int pos = start; for ( final VariantContext vc : reader.getSecond() ) { if ( vc == null ) continue; @@ -324,10 +329,10 @@ public class BandPassActivityProfileUnitTest extends BaseTest { lastPosSeen = region.getLocation().getStop(); for ( final ActivityProfileState state : region.getSupportingStates() ) { - Assert.assertEquals(state.isActiveProb > ActivityProfile.ACTIVE_PROB_THRESHOLD, region.isActive(), + Assert.assertEquals(state.isActiveProb > ACTIVE_PROB_THRESHOLD, region.isActive(), "Region is active=" + region.isActive() + " but contains a state " + state + " with prob " + state.isActiveProb + " not within expected values given threshold for activity of " - + ActivityProfile.ACTIVE_PROB_THRESHOLD); + + ACTIVE_PROB_THRESHOLD); } } }