From 592f90aaefdb46315862db069e57e4397bb558cc Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 24 Jan 2013 11:28:02 -0500 Subject: [PATCH] ActivityProfile now cuts intelligently at the best local minimum when in a larger than max size active region -- This new algorithm is essential to properly handle activity profiles that have many large active regions generated from lots of dense variant events. The new algorithm passes unit tests and passes visualize visual inspection of both running on 1000G and NA12878 -- Misc. commenting of the code -- Updated ActiveRegionExtension to include a min active region size -- Renamed ActiveRegionExtension to ActiveRegionTraversalParameters, as it carries more than just the traversal extension now --- .../targets/FindCoveredIntervals.java | 4 +- .../haplotypecaller/HaplotypeCaller.java | 2 +- .../traversals/TraverseActiveRegions.java | 14 +- ...a => ActiveRegionTraversalParameters.java} | 36 +++- .../gatk/walkers/ActiveRegionWalker.java | 4 +- .../utils/activeregion/ActivityProfile.java | 82 +++++++-- .../activeregion/ActivityProfileState.java | 5 +- .../activeregion/ActivityProfileUnitTest.java | 170 +++++++++++++++++- 8 files changed, 283 insertions(+), 34 deletions(-) rename public/java/src/org/broadinstitute/sting/gatk/walkers/{ActiveRegionExtension.java => ActiveRegionTraversalParameters.java} (54%) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java index 74ff77e4b..3712a8e51 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java @@ -52,7 +52,7 @@ import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionTraversalParameters; import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; import org.broadinstitute.sting.gatk.walkers.PartitionBy; import org.broadinstitute.sting.gatk.walkers.PartitionType; @@ -64,7 +64,7 @@ import java.io.PrintStream; @DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} ) @PartitionBy(PartitionType.CONTIG) -@ActiveRegionExtension(extension = 0, maxRegion = 50000) +@ActiveRegionTraversalParameters(extension = 0, maxRegion = 50000) public class FindCoveredIntervals extends ActiveRegionWalker { @Output(required = true) private PrintStream out; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 9bb04421c..a3d764141 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -131,7 +131,7 @@ import java.util.*; @DocumentedGATKFeature( groupName = "Variant Discovery Tools", extraDocs = {CommandLineGATK.class} ) @PartitionBy(PartitionType.LOCUS) @BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) -@ActiveRegionExtension(extension=65, maxRegion=300) +@ActiveRegionTraversalParameters(extension=65, maxRegion=300) //@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=5) public class HaplotypeCaller extends ActiveRegionWalker implements AnnotatorCompatible { diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 52ac783a9..bff696f13 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -34,7 +34,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.*; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionTraversalParameters; import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.Walker; @@ -75,6 +75,7 @@ public class TraverseActiveRegions extends TraversalEngine workQueue = new LinkedList(); @@ -102,9 +103,10 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine extends TraversalEngine walker, T sum, final boolean flushActivityProfile, final boolean forceAllRegionsToBeActive) { if ( ! walkerHasPresetRegions ) { // We don't have preset regions, so we get our regions from the activity profile - final Collection activeRegions = activityProfile.popReadyActiveRegions(getActiveRegionExtension(), getMaxRegionSize(), flushActivityProfile); + final Collection activeRegions = activityProfile.popReadyActiveRegions(getActiveRegionExtension(), getMinRegionSize(), getMaxRegionSize(), flushActivityProfile); workQueue.addAll(activeRegions); if ( ! activeRegions.isEmpty() && logger.isDebugEnabled() ) logger.debug("Integrated " + activityProfile.size() + " isActive calls into " + activeRegions.size() + " regions." ); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionTraversalParameters.java similarity index 54% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionTraversalParameters.java index 72c409f62..cdb45db7b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionTraversalParameters.java @@ -33,7 +33,8 @@ import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; /** - * Describes the size of the buffer region that is added to each active region when pulling in covered reads. + * Describes the parameters that this walker requires of the active region traversal + * * User: rpoplin * Date: 1/18/12 */ @@ -41,13 +42,40 @@ import java.lang.annotation.RetentionPolicy; @Inherited @Retention(RetentionPolicy.RUNTIME) -public @interface ActiveRegionExtension { +public @interface ActiveRegionTraversalParameters { + /** + * How far to either side of the active region itself should we include reads? + * + * That is, if the active region is 10 bp wide, and extension is 5, ART will provide + * the walker with active regions 10 bp, with 5 bp of extension on either side, and + * all reads that cover the 20 bp of the region + extension. + * + * @return the size of the active region extension we'd like + */ public int extension() default 0; + + /** + * The minimum number of bp for an active region, when we need to chop it up into pieces because + * it's become too big. This only comes into effect when there's literally no good place to chop + * that does make the region smaller than this value. + * + * @return the min size in bp of regions + */ + public int minRegion() default 50; + + /** + * The maximum size in bp of active regions wanted by this walker + * + * Active regions larger than this value are automatically cut up by ART into smaller + * regions of size <= this value. + * + * @return the max size in bp of regions + */ public int maxRegion() default 1500; /** - * The sigma value for the Gaussian kernel of the band pass filter - * @return + * The variance value for the Gaussian kernel of the band pass filter employed by ART + * @return the breadth of the band pass gaussian kernel we want for our traversal */ public double bandPassSigma() default BandPassActivityProfile.DEFAULT_SIGMA; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index 92504e3ba..e14e50b1a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -55,7 +55,7 @@ import java.util.*; @By(DataSource.READS) @Requires({DataSource.READS, DataSource.REFERENCE}) @PartitionBy(PartitionType.READ) -@ActiveRegionExtension(extension=50,maxRegion=1500) +@ActiveRegionTraversalParameters(extension=50,maxRegion=1500) @ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class}) @RemoveProgramRecords public abstract class ActiveRegionWalker extends Walker { @@ -160,7 +160,7 @@ public abstract class ActiveRegionWalker extends Walker allIntervals = new ArrayList(); for( final GenomeLoc interval : intervals.toList() ) { final int start = Math.max( 1, interval.getStart() - activeRegionExtension ); diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index 16cb2fd84..f265f9d60 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -41,7 +41,7 @@ import java.util.*; */ public class ActivityProfile { private final static int MAX_PROB_PROPOGATION_DISTANCE = 50; - private final static double ACTIVE_PROB_THRESHOLD = 0.002; // TODO: needs to be set-able by the walker author + protected final static double ACTIVE_PROB_THRESHOLD = 0.002; // TODO: needs to be set-able by the walker author protected final List stateList; protected final GenomeLocParser parser; @@ -197,7 +197,6 @@ public class ActivityProfile { regionStopLoc = loc; contigLength = parser.getContigInfo(regionStartLoc.getContig()).getSequenceLength(); } else { - // TODO -- need to figure out where to add loc as the regions will be popping off the front if ( regionStopLoc.getStart() != loc.getStart() - 1 ) throw new IllegalArgumentException("Bad add call to ActivityProfile: loc " + loc + " not immediate after last loc " + regionStopLoc ); regionStopLoc = loc; @@ -294,6 +293,7 @@ public class ActivityProfile { * No returned region will be larger than maxRegionSize. * * @param activeRegionExtension the extension value to provide to the constructed regions + * @param minRegionSize the minimum region size, in the case where we have to cut up regions that are too large * @param maxRegionSize the maximize size of the returned region * @param forceConversion if true, we'll return a region whose end isn't sufficiently far from the end of the * stateList. Used to close out the active region when we've hit some kind of end (such @@ -301,14 +301,15 @@ public class ActivityProfile { * @return a non-null list of active regions */ @Ensures("result != null") - public List popReadyActiveRegions(final int activeRegionExtension, final int maxRegionSize, final boolean forceConversion) { + public List popReadyActiveRegions(final int activeRegionExtension, final int minRegionSize, final int maxRegionSize, final boolean forceConversion) { if ( activeRegionExtension < 0 ) throw new IllegalArgumentException("activeRegionExtension must be >= 0 but got " + activeRegionExtension); + if ( minRegionSize < 1 ) throw new IllegalArgumentException("minRegionSize must be >= 1 but got " + minRegionSize); if ( maxRegionSize < 1 ) throw new IllegalArgumentException("maxRegionSize must be >= 1 but got " + maxRegionSize); final LinkedList regions = new LinkedList(); while ( true ) { - final ActiveRegion nextRegion = popNextReadyActiveRegion(activeRegionExtension, maxRegionSize, forceConversion); + final ActiveRegion nextRegion = popNextReadyActiveRegion(activeRegionExtension, minRegionSize, maxRegionSize, forceConversion); if ( nextRegion == null ) return regions; else { @@ -325,19 +326,20 @@ public class ActivityProfile { * are also updated. * * @param activeRegionExtension the extension value to provide to the constructed regions + * @param minRegionSize the minimum region size, in the case where we have to cut up regions that are too large * @param maxRegionSize the maximize size of the returned region * @param forceConversion if true, we'll return a region whose end isn't sufficiently far from the end of the * stateList. Used to close out the active region when we've hit some kind of end (such * as the end of the contig) * @return a fully formed active region, or null if none can be made */ - private ActiveRegion popNextReadyActiveRegion(final int activeRegionExtension, final int maxRegionSize, final boolean forceConversion) { + private ActiveRegion popNextReadyActiveRegion(final int activeRegionExtension, final int minRegionSize, final int maxRegionSize, final boolean forceConversion) { if ( stateList.isEmpty() ) return null; final ActivityProfileState first = stateList.get(0); final boolean isActiveRegion = first.isActiveProb > ACTIVE_PROB_THRESHOLD; - final int offsetOfNextRegionEnd = findEndOfRegion(isActiveRegion, maxRegionSize, forceConversion); + final int offsetOfNextRegionEnd = findEndOfRegion(isActiveRegion, minRegionSize, maxRegionSize, forceConversion); if ( offsetOfNextRegionEnd == -1 ) // couldn't find a valid ending offset, so we return null return null; @@ -363,9 +365,14 @@ public class ActivityProfile { * The current region is defined from the start of the stateList, looking for elements that have the same isActiveRegion * flag (i.e., if isActiveRegion is true we are looking for states with isActiveProb > threshold, or alternatively * for states < threshold). The maximize size of the returned region is maxRegionSize. If forceConversion is - * true, then we'll return the region end even if this isn't safely beyond the max prob propogation distance. + * true, then we'll return the region end even if this isn't safely beyond the max prob propagation distance. + * + * Note that if isActiveRegion is true, and we can construct a active region > maxRegionSize in bp, we + * find the further local minimum within that max region, and cut the region there, under the constraint + * that the resulting region must be at least minRegionSize in bp. * * @param isActiveRegion is the region we're looking for an active region or inactive region? + * @param minRegionSize the minimum region size, in the case where we have to cut up regions that are too large * @param maxRegionSize the maximize size of the returned region * @param forceConversion if true, we'll return a region whose end isn't sufficiently far from the end of the * stateList. Used to close out the active region when we've hit some kind of end (such @@ -376,16 +383,65 @@ public class ActivityProfile { "result >= -1", "result == -1 || result < maxRegionSize", "! (result == -1 && forceConversion)"}) - private int findEndOfRegion(final boolean isActiveRegion, final int maxRegionSize, final boolean forceConversion) { - int i = 0; - while ( i < stateList.size() && i < maxRegionSize ) { - if ( stateList.get(i).isActiveProb > ACTIVE_PROB_THRESHOLD != isActiveRegion ) { + private int findEndOfRegion(final boolean isActiveRegion, final int minRegionSize, final int maxRegionSize, final boolean forceConversion) { + final int nStates = stateList.size(); + int endOfActiveRegion = 0; + while ( endOfActiveRegion < nStates && endOfActiveRegion < maxRegionSize ) { + if ( getProb(endOfActiveRegion) > ACTIVE_PROB_THRESHOLD != isActiveRegion ) { break; } - i++; + endOfActiveRegion++; + } + + if ( isActiveRegion && endOfActiveRegion == maxRegionSize ) { + // we've run to the end of the region, let's find a good place to cut + int minI = endOfActiveRegion - 1; + double minP = Double.MAX_VALUE; + for ( int i = minI; i >= minRegionSize - 1; i-- ) { + double cur = getProb(i); + if ( cur < minP && isMinimum(i) ) { + minP = cur; + minI = i; + } + } + + endOfActiveRegion = minI + 1; } // we're one past the end, so i must be decremented - return forceConversion || i + getMaxProbPropagationDistance() < stateList.size() ? i - 1 : -1; + return forceConversion || endOfActiveRegion + getMaxProbPropagationDistance() < stateList.size() ? endOfActiveRegion - 1 : -1; + } + + /** + * Helper function to get the probability of the state at offset index + * @param index a valid offset into the state list + * @return the isActiveProb of the state at index + */ + @Requires({"index >= 0", "index < stateList.size()"}) + private double getProb(final int index) { + return stateList.get(index).isActiveProb; + } + + /** + * Is the probability at index in a local minimum? + * + * Checks that the probability at index is <= both the probabilities to either side. + * Returns false if index is at the end or the start of the state list. + * + * @param index the index of the state we want to test + * @return true if prob at state is a minimum, false otherwise + */ + @Requires({"index >= 0", "index < stateList.size()"}) + private boolean isMinimum(final int index) { + if ( index == stateList.size() - 1 ) + // we cannot be at a minimum if the current position is the last in the state list + return false; + else if ( index < 1 ) + // we cannot be at a minimum if the current position is the first or second + return false; + else { + final double indexP = getProb(index); + return indexP <= getProb(index+1) && indexP < getProb(index-1); + } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java index 272596be3..5bba7db17 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java @@ -59,7 +59,10 @@ public class ActivityProfileState { /** * Create a new ActivityProfileState at loc with probability of being active of isActiveProb that maintains some - * information about the result state and value (TODO RYAN -- what do these mean?) + * information about the result state and value + * + * The only state value in use is HIGH_QUALITY_SOFT_CLIPS, and here the value is interpreted as the number + * of bp affected by the soft clips. * * @param loc the position of the result profile (for debugging purposes) * @param isActiveProb the probability of being active (between 0 and 1) diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java index bce1722cd..f6246b137 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java @@ -33,6 +33,7 @@ import net.sf.picard.reference.ReferenceSequenceFile; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.Assert; @@ -46,6 +47,7 @@ import java.util.*; public class ActivityProfileUnitTest extends BaseTest { + private final static boolean DEBUG = false; private GenomeLocParser genomeLocParser; private GenomeLoc startLoc; @@ -123,7 +125,7 @@ public class ActivityProfileUnitTest extends BaseTest { return BasicActivityProfileTestProvider.getTests(BasicActivityProfileTestProvider.class); } - @Test(dataProvider = "BasicActivityProfileTestProvider") + @Test(enabled = ! DEBUG, dataProvider = "BasicActivityProfileTestProvider") public void testBasicActivityProfile(BasicActivityProfileTestProvider cfg) { ActivityProfile profile = cfg.makeProfile(); @@ -226,7 +228,7 @@ public class ActivityProfileUnitTest extends BaseTest { } - @Test(enabled = true, dataProvider = "RegionCreationTests") + @Test(enabled = !DEBUG, dataProvider = "RegionCreationTests") public void testRegionCreation(final int start, final List probs, int maxRegionSize, final int nParts, final boolean forceConversion, final boolean waitUntilEnd) { final ActivityProfile profile = new ActivityProfile(genomeLocParser); Assert.assertNotNull(profile.toString()); @@ -242,13 +244,13 @@ public class ActivityProfileUnitTest extends BaseTest { Assert.assertNotNull(profile.toString()); if ( ! waitUntilEnd ) { - final List regions = profile.popReadyActiveRegions(0, maxRegionSize, false); + final List regions = profile.popReadyActiveRegions(0, 1, maxRegionSize, false); lastRegion = assertGoodRegions(start, regions, maxRegionSize, lastRegion, probs, seenSites); } } if ( waitUntilEnd || forceConversion ) { - final List regions = profile.popReadyActiveRegions(0, maxRegionSize, forceConversion); + final List regions = profile.popReadyActiveRegions(0, 1, maxRegionSize, forceConversion); lastRegion = assertGoodRegions(start, regions, maxRegionSize, lastRegion, probs, seenSites); } @@ -312,7 +314,7 @@ public class ActivityProfileUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "SoftClipsTest") + @Test(enabled = ! DEBUG, dataProvider = "SoftClipsTest") public void testSoftClips(final int start, int nPrecedingSites, final int softClipSize) { final ActivityProfile profile = new ActivityProfile(genomeLocParser); @@ -327,14 +329,15 @@ public class ActivityProfileUnitTest extends BaseTest { final GenomeLoc softClipLoc = genomeLocParser.createGenomeLoc(contig, nPrecedingSites + start); profile.add(new ActivityProfileState(softClipLoc, 1.0, ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS, softClipSize)); + final int actualNumOfSoftClips = Math.min(softClipSize, profile.getMaxProbPropagationDistance()); if ( nPrecedingSites == 0 ) { - final int profileSize = Math.min(start + softClipSize, contigLength) - start + 1; + final int profileSize = Math.min(start + actualNumOfSoftClips, contigLength) - start + 1; Assert.assertEquals(profile.size(), profileSize, "Wrong number of states in the profile"); } for ( int i = 0; i < profile.size(); i++ ) { final ActivityProfileState state = profile.getStateList().get(i); - final boolean withinSCRange = state.getLoc().distance(softClipLoc) <= softClipSize; + final boolean withinSCRange = state.getLoc().distance(softClipLoc) <= actualNumOfSoftClips; if ( withinSCRange ) { Assert.assertTrue(state.isActiveProb > 0.0, "active prob should be changed within soft clip size"); } else { @@ -342,4 +345,157 @@ public class ActivityProfileUnitTest extends BaseTest { } } } + + // ------------------------------------------------------------------------------------- + // + // Tests to ensure we cut large active regions in the right place + // + // ------------------------------------------------------------------------------------- + + private void addProb(final List l, final double v) { + l.add(v); + } + + @DataProvider(name = "ActiveRegionCutTests") + public Object[][] makeActiveRegionCutTests() { + final List tests = new LinkedList(); + +// for ( final int activeRegionSize : Arrays.asList(30) ) { +// for ( final int minRegionSize : Arrays.asList(5) ) { + for ( final int activeRegionSize : Arrays.asList(10, 12, 20, 30, 40) ) { + for ( final int minRegionSize : Arrays.asList(1, 5, 10) ) { + final int maxRegionSize = activeRegionSize * 2 / 3; + if ( minRegionSize >= maxRegionSize ) continue; + { // test flat activity profile + final List probs = Collections.nCopies(activeRegionSize, 1.0); + tests.add(new Object[]{minRegionSize, maxRegionSize, maxRegionSize, probs}); + } + + { // test point profile is properly handled + for ( int end = 1; end < activeRegionSize; end++ ) { + final List probs = Collections.nCopies(end, 1.0); + tests.add(new Object[]{minRegionSize, maxRegionSize, Math.min(end, maxRegionSize), probs}); + } + } + + { // test increasing activity profile + final List probs = new ArrayList(activeRegionSize); + for ( int i = 0; i < activeRegionSize; i++ ) { + addProb(probs, (1.0*(i+1))/ activeRegionSize); + } + tests.add(new Object[]{minRegionSize, maxRegionSize, maxRegionSize, probs}); + } + + { // test decreasing activity profile + final List probs = new ArrayList(activeRegionSize); + for ( int i = 0; i < activeRegionSize; i++ ) { + addProb(probs, 1 - (1.0*(i+1))/ activeRegionSize); + } + tests.add(new Object[]{minRegionSize, maxRegionSize, maxRegionSize, probs}); + } + + { // test two peaks +// for ( final double rootSigma : Arrays.asList(2.0) ) { +// int maxPeak1 = 9; { +// int maxPeak2 = 16; { + for ( final double rootSigma : Arrays.asList(1.0, 2.0, 3.0) ) { + for ( int maxPeak1 = 0; maxPeak1 < activeRegionSize / 2; maxPeak1++ ) { + for ( int maxPeak2 = activeRegionSize / 2 + 1; maxPeak2 < activeRegionSize; maxPeak2++ ) { + final double[] gauss1 = makeGaussian(maxPeak1, activeRegionSize, rootSigma); + final double[] gauss2 = makeGaussian(maxPeak2, activeRegionSize, rootSigma+1); + final List probs = new ArrayList(activeRegionSize); + for ( int i = 0; i < activeRegionSize; i++ ) { + addProb(probs, gauss1[i] + gauss2[i]); + } + final int cutSite = findCutSiteForTwoMaxPeaks(probs, minRegionSize); + if ( cutSite != -1 && cutSite < maxRegionSize ) + tests.add(new Object[]{minRegionSize, maxRegionSize, Math.max(cutSite, minRegionSize), probs}); + } + } + } + } + + { // test that the lowest of two minima is taken + // looks like a bunch of 1s, 0.5, some 1.0s, 0.75, some more 1s +// int firstMin = 0; { +// int secondMin = 4; { + for ( int firstMin = 1; firstMin < activeRegionSize; firstMin++ ) { + for ( int secondMin = firstMin + 1; secondMin < activeRegionSize; secondMin++ ) { + final List probs = new ArrayList(Collections.nCopies(activeRegionSize, 1.0)); + probs.set(firstMin, 0.5); + probs.set(secondMin, 0.75); + final int expectedCut; + if ( firstMin + 1 < minRegionSize ) { + if ( firstMin == secondMin - 1 ) // edge case for non-min at minRegionSize + expectedCut = maxRegionSize; + else + expectedCut = secondMin + 1 > maxRegionSize ? maxRegionSize : ( secondMin + 1 < minRegionSize ? maxRegionSize : secondMin + 1); + } else if ( firstMin + 1 > maxRegionSize ) + expectedCut = maxRegionSize; + else { + expectedCut = firstMin + 1; + } + + Math.min(firstMin + 1, maxRegionSize); + tests.add(new Object[]{minRegionSize, maxRegionSize, expectedCut, probs}); + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private double[] makeGaussian(final int mean, final int range, final double sigma) { + final double[] gauss = new double[range]; + for( int iii = 0; iii < range; iii++ ) { + gauss[iii] = MathUtils.NormalDistribution(mean, sigma, iii) + ActivityProfile.ACTIVE_PROB_THRESHOLD; + } + return gauss; + } + + private int findCutSiteForTwoMaxPeaks(final List probs, final int minRegionSize) { + for ( int i = probs.size() - 2; i > minRegionSize; i-- ) { + double prev = probs.get(i - 1); + double next = probs.get(i + 1); + double cur = probs.get(i); + if ( cur < next && cur < prev ) + return i + 1; + } + + return -1; + } + + +// private int findCutSite(final List probs) { +// for ( int i = probs.size() - 2; i > 0; i-- ) { +// double prev = probs.get(i + 1); +// double next = probs.get(i-1); +// double cur = probs.get(i); +// if ( cur < next && cur < prev ) +// return i + 1; +// } +// +// return -1; +// } + + @Test(dataProvider = "ActiveRegionCutTests") + public void testActiveRegionCutTests(final int minRegionSize, final int maxRegionSize, final int expectedRegionSize, final List probs) { + final ActivityProfile profile = new ActivityProfile(genomeLocParser); + + final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName(); + for ( int i = 0; i <= maxRegionSize + profile.getMaxProbPropagationDistance(); i++ ) { + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, i + 1); + final double prob = i < probs.size() ? probs.get(i) : 0.0; + final ActivityProfileState state = new ActivityProfileState(loc, prob); + profile.add(state); + } + + final List regions = profile.popReadyActiveRegions(0, minRegionSize, maxRegionSize, false); + Assert.assertTrue(regions.size() >= 1, "Should only be one regions for this test"); + final ActiveRegion region = regions.get(0); + Assert.assertEquals(region.getLocation().getStart(), 1, "Region should start at 1"); + Assert.assertEquals(region.getLocation().size(), expectedRegionSize, "Incorrect region size; cut must have been incorrect"); + } } \ No newline at end of file