From 7fd27a5167ca0fd0eda062954565ed19831bc6e9 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 22 Jan 2013 15:40:09 -0500 Subject: [PATCH] Add band pass filtering activity profile -- Based on the new incremental activity profile -- Unit Tested! Fixed a few bugs with the old band pass filter -- Expand IncrementalActivityProfileUnitTest to test the band pass filter as well for basic properties -- Add new UnitTest for BandPassIncrementalActivityProfile -- Added normalizeFromRealSpace to MathUtils -- Cleanup unused code in new activity profiles --- .../broadinstitute/sting/utils/MathUtils.java | 24 +++ .../BandPassIncrementalActivityProfile.java | 127 +++++++++++++ .../IncrementalActivityProfile.java | 41 ++--- ...assIncrementalActivityProfileUnitTest.java | 167 ++++++++++++++++++ .../IncrementalActivityProfileUnitTest.java | 27 ++- 5 files changed, 345 insertions(+), 41 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassIncrementalActivityProfile.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassIncrementalActivityProfileUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 7462416bc..f1f0ab9b1 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -634,6 +634,30 @@ public class MathUtils { return normalizeFromLog10(array, false); } + /** + * normalizes the real-space probability array. + * + * Does not assume anything about the values in the array, beyond that no elements are below 0. It's ok + * to have values in the array of > 1, or have the sum go above 0. + * + * @param array the array to be normalized + * @return a newly allocated array corresponding the normalized values in array + */ + @Requires("array != null") + @Ensures({"result != null"}) + public static double[] normalizeFromRealSpace(final double[] array) { + if ( array.length == 0 ) + return array; + + final double sum = sum(array); + final double[] normalized = new double[array.length]; + if ( sum < 0.0 || sum > 1.0 ) throw new IllegalArgumentException("Values in probability array sum to a negative number " + sum); + for ( int i = 0; i < array.length; i++ ) { + normalized[i] = array[i] / sum; + } + return normalized; + } + public static int maxElementIndex(final double[] array) { return maxElementIndex(array, array.length); } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassIncrementalActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassIncrementalActivityProfile.java new file mode 100644 index 000000000..805a0b60a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassIncrementalActivityProfile.java @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.activeregion; + +import com.google.java.contract.Ensures; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; + +import java.util.Collection; +import java.util.LinkedList; + +/** + * A band pass filtering version of the activity profile + * + * Applies a band pass filter with a Gaussian kernel to the input state probabilities to smooth + * them out of an interval + * + * @author Mark DePristo + * @since 2011 + */ +public class BandPassIncrementalActivityProfile extends IncrementalActivityProfile { + public static final int DEFAULT_FILTER_SIZE = 80; + + private final int filterSize; + private final double[] GaussianKernel; + + /** + * Create a band pass activity profile with the default band size + * @param parser our genome loc parser + */ + public BandPassIncrementalActivityProfile(final GenomeLocParser parser) { + this(parser, DEFAULT_FILTER_SIZE); + } + + /** + * Create an activity profile that implements a band pass filter on the states + * @param parser our genome loc parser + * @param filterSize the size (in bp) of the band pass filter. The filter size is the number of bp to each + * side that are included in the band. So a filter size of 1 implies that the actual band + * is 3 bp, 1 for the center site and 1 on each size. 2 => 5, etc. + */ + public BandPassIncrementalActivityProfile(final GenomeLocParser parser, final int filterSize) { + super(parser); + + if ( filterSize < 0 ) throw new IllegalArgumentException("Filter size must be greater than or equal to 0 but got " + filterSize); + + // setup the Gaussian kernel for the band pass filter + this.filterSize = filterSize; + final double[] kernel = new double[getBandSize()]; + for( int iii = 0; iii < 2* filterSize + 1; iii++ ) { + kernel[iii] = MathUtils.NormalDistribution(filterSize, 55.0, iii); + } + this.GaussianKernel = MathUtils.normalizeFromRealSpace(kernel); + } + + /** + * Get the size (in bp) of the band pass filter + * @return a positive integer + */ + @Ensures("result >= 1") + public int getBandSize() { + return 2 * filterSize + 1; + } + + /** + * Get the filter size (which is the size of each wing of the band, minus the center point) + * @return a positive integer + */ + @Ensures("result >= 0") + public int getFilteredSize() { + return filterSize; + } + + /** + * Get the kernel of this band pass filter. Do not modify returned result + * @return the kernel used in this band pass filter + */ + @Ensures({"result != null", "result.length == getBandSize()"}) + protected double[] getKernel() { + return GaussianKernel; + } + + /** + * Band pass the probabilities in the ActivityProfile, producing a new profile that's band pass filtered + * @return a new double[] that's the band-pass filtered version of this profile + */ + @Override + protected Collection processState(final ActivityProfileState justAddedState) { + final Collection states = new LinkedList(); + + for ( final ActivityProfileState superState : super.processState(justAddedState) ) { + for( int jjj = -filterSize; jjj <= filterSize; jjj++ ) { + final GenomeLoc loc = getLocForOffset(justAddedState.getLoc(), jjj); + if ( loc != null ) { + final double newProb = superState.isActiveProb * GaussianKernel[jjj + filterSize]; + states.add(new ActivityProfileState(loc, newProb)); + } + } + } + + return states; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/IncrementalActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/IncrementalActivityProfile.java index 3cbad54e9..1292b3176 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/IncrementalActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/IncrementalActivityProfile.java @@ -51,35 +51,13 @@ public class IncrementalActivityProfile { /** * Create a new empty IncrementalActivityProfile - * @param parser the parser we can use to create genome locs + * @param parser the parser we can use to create genome locs, cannot be null */ public IncrementalActivityProfile(final GenomeLocParser parser) { - this(parser, new ArrayList(), null); - } + if ( parser == null ) throw new IllegalArgumentException("parser cannot be null"); - /** - * Create a new IncrementalActivityProfile using state list (not copied) and starting at regionStartLoc - * @param parser the parser we can use to create genome locs - */ - @Deprecated - protected IncrementalActivityProfile(final GenomeLocParser parser, final List stateList, final GenomeLoc regionStartLoc) { this.parser = parser; - this.stateList = stateList; - this.regionStartLoc = regionStartLoc; - } - - /** - * Create a profile of the same class as this object containing just the provided stateList - * - * Used by clients to create derived activity profiles (such as ones without the starting X - * sites because they've been removed in an ActiveRegion) of the same class. - * - * @param isActiveList the active results list to use in the derived instance - * @return a freshly allocated data set - */ - @Deprecated - protected IncrementalActivityProfile createDerivedProfile(final List isActiveList) { - return new IncrementalActivityProfile(parser, isActiveList, regionStartLoc); + this.stateList = new ArrayList(); } @Override @@ -150,6 +128,19 @@ public class IncrementalActivityProfile { return stateList; } + /** + * Get the probabilities of the states as a single linear array of doubles + * @return a non-null array + */ + @Ensures("result != null") + protected double[] getProbabilitiesAsArray() { + final double[] probs = new double[getStateList().size()]; + int i = 0; + for ( final ActivityProfileState state : getStateList() ) + probs[i++] = state.isActiveProb; + return probs; + } + /** * Helper function that gets the genome loc for a site offset from relativeLoc, protecting ourselves from * falling off the edge of the contig. diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassIncrementalActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassIncrementalActivityProfileUnitTest.java new file mode 100644 index 000000000..be90353b3 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassIncrementalActivityProfileUnitTest.java @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.activeregion; + + +// the imports for unit testing. + + +import net.sf.picard.reference.ReferenceSequenceFile; +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + + +public class BandPassIncrementalActivityProfileUnitTest extends BaseTest { + private GenomeLocParser genomeLocParser; + + @BeforeClass + public void init() throws FileNotFoundException { + // sequence + ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); + genomeLocParser = new GenomeLocParser(seq); + } + + @DataProvider(name = "BandPassBasicTest") + public Object[][] makeBandPassTest() { + final List tests = new LinkedList(); + + for ( int start : Arrays.asList(1, 10, 100, 1000) ) { + for ( boolean precedingIsActive : Arrays.asList(true, false) ) { + for ( int precedingSites: Arrays.asList(0, 1, 10, 100) ) { + for ( int bandPassSize : Arrays.asList(0, 1, 10, 100) ) { +// for ( int start : Arrays.asList(10) ) { +// for ( boolean precedingIsActive : Arrays.asList(false) ) { +// for ( int precedingSites: Arrays.asList(0) ) { +// for ( int bandPassSize : Arrays.asList(1) ) { + tests.add(new Object[]{ start, precedingIsActive, precedingSites, bandPassSize }); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "BandPassBasicTest") + public void testBandPass(final int start, final boolean precedingIsActive, final int nPrecedingSites, final int bandPassSize) { + final BandPassIncrementalActivityProfile profile = new BandPassIncrementalActivityProfile(genomeLocParser, bandPassSize); + + final int expectedBandSize = bandPassSize * 2 + 1; + Assert.assertEquals(profile.getBandSize(), expectedBandSize, "Wrong expected band size"); + + final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName(); + final double precedingProb = precedingIsActive ? 1.0 : 0.0; + for ( int i = 0; i < nPrecedingSites; i++ ) { + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, i + start); + final ActivityProfileState state = new ActivityProfileState(loc, precedingProb); + profile.add(state); + } + + final GenomeLoc nextLoc = genomeLocParser.createGenomeLoc(contig, nPrecedingSites + start); + profile.add(new ActivityProfileState(nextLoc, 1.0)); + + if ( precedingIsActive == false && nPrecedingSites >= bandPassSize && bandPassSize < start ) { + // we have enough space that all probs fall on the genome + final double[] probs = profile.getProbabilitiesAsArray(); + Assert.assertEquals(MathUtils.sum(probs), 1.0 * (nPrecedingSites * precedingProb + 1), 1e-3, "Activity profile doesn't sum to number of non-zero prob states"); + } + } + + private double[] bandPassInOnePass(final BandPassIncrementalActivityProfile profile, final double[] activeProbArray) { + final double[] bandPassProbArray = new double[activeProbArray.length]; + + // apply the band pass filter for activeProbArray into filteredProbArray + final double[] GaussianKernel = profile.getKernel(); + for( int iii = 0; iii < activeProbArray.length; iii++ ) { + final double[] kernel = ArrayUtils.subarray(GaussianKernel, Math.max(profile.getFilteredSize() - iii, 0), Math.min(GaussianKernel.length, profile.getFilteredSize() + activeProbArray.length - iii)); + final double[] activeProbSubArray = ArrayUtils.subarray(activeProbArray, Math.max(0,iii - profile.getFilteredSize()), Math.min(activeProbArray.length,iii + profile.getFilteredSize() + 1)); + bandPassProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel); + } + + return bandPassProbArray; + } + + @DataProvider(name = "BandPassComposition") + public Object[][] makeBandPassComposition() { + final List tests = new LinkedList(); + + for ( int bandPassSize : Arrays.asList(0, 1, 10, 100, BandPassIncrementalActivityProfile.DEFAULT_FILTER_SIZE) ) { + for ( int integrationLength : Arrays.asList(1, 10, 100, 1000) ) { + tests.add(new Object[]{ bandPassSize, integrationLength }); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test( dataProvider = "BandPassComposition") + public void testBandPassComposition(final int bandPassSize, final int integrationLength) { + final int start = 1; + final BandPassIncrementalActivityProfile profile = new BandPassIncrementalActivityProfile(genomeLocParser, bandPassSize); + final double[] rawActiveProbs = new double[integrationLength + bandPassSize * 2]; + + // add a buffer so that we can get all of the band pass values + final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName(); + int pos = start; + int rawProbsOffset = 0; + for ( int i = 0; i < bandPassSize; i++ ) { + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos++); + final ActivityProfileState state = new ActivityProfileState(loc, 0.0); + profile.add(state); + rawActiveProbs[rawProbsOffset++] = 0.0; + rawActiveProbs[rawActiveProbs.length - rawProbsOffset] = 0.0; + } + + for ( int i = 0; i < integrationLength; i++ ) { + final GenomeLoc nextLoc = genomeLocParser.createGenomeLoc(contig, pos++); + profile.add(new ActivityProfileState(nextLoc, 1.0)); + rawActiveProbs[rawProbsOffset++] = 1.0; + + for ( int j = 0; j < profile.size(); j++ ) { + Assert.assertTrue(profile.getStateList().get(j).isActiveProb >= 0.0, "State probability < 0 at " + j); + Assert.assertTrue(profile.getStateList().get(j).isActiveProb <= 1.0 + 1e-3, "State probability > 1 at " + j); + } + } + + final double[] expectedProbs = bandPassInOnePass(profile, rawActiveProbs); + for ( int j = 0; j < profile.size(); j++ ) { + Assert.assertEquals(profile.getStateList().get(j).isActiveProb, expectedProbs[j], "State probability not expected at " + j); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/IncrementalActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/IncrementalActivityProfileUnitTest.java index 16b9b1877..64065029c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/IncrementalActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/IncrementalActivityProfileUnitTest.java @@ -85,7 +85,9 @@ public class IncrementalActivityProfileUnitTest extends BaseTest { public IncrementalActivityProfile makeProfile() { switch ( type ) { case Base: return new IncrementalActivityProfile(genomeLocParser); - case BandPass: //return new BandPassActivityProfile(genomeLocParser); + case BandPass: + // zero size => equivalent to IncrementalActivityProfile + return new BandPassIncrementalActivityProfile(genomeLocParser, 0); default: throw new IllegalStateException(type.toString()); } } @@ -111,13 +113,11 @@ public class IncrementalActivityProfileUnitTest extends BaseTest { @DataProvider(name = "BasicActivityProfileTestProvider") public Object[][] makeQualIntervalTestProvider() { for ( final ProfileType type : ProfileType.values() ) { - if ( type != ProfileType.BandPass ) { // todo -- re-enable - new BasicActivityProfileTestProvider(type, Arrays.asList(1.0), true, 0, 1); - new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 0.0), true, 0, 1, 2); - new BasicActivityProfileTestProvider(type, Arrays.asList(0.0, 1.0), false, 0, 1, 2); - new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 0.0, 1.0), true, 0, 1, 2, 3); - new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 1.0, 1.0), true, 0, 3); - } + new BasicActivityProfileTestProvider(type, Arrays.asList(1.0), true, 0, 1); + new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 0.0), true, 0, 1, 2); + new BasicActivityProfileTestProvider(type, Arrays.asList(0.0, 1.0), false, 0, 1, 2); + new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 0.0, 1.0), true, 0, 1, 2, 3); + new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 1.0, 1.0), true, 0, 3); } return BasicActivityProfileTestProvider.getTests(BasicActivityProfileTestProvider.class); @@ -135,20 +135,15 @@ public class IncrementalActivityProfileUnitTest extends BaseTest { double p = cfg.probs.get(i); GenomeLoc loc = genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart() + i, cfg.regionStart.getStart() + i); profile.add(new ActivityProfileState(loc, p)); - Assert.assertFalse(profile.isEmpty()); + Assert.assertFalse(profile.isEmpty(), "Profile shouldn't be empty after adding a state"); } - Assert.assertEquals(profile.regionStartLoc, genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart(), cfg.regionStart.getStart() )); + Assert.assertEquals(profile.regionStartLoc, genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart(), cfg.regionStart.getStart() ), "Start loc should be the start of the region"); - Assert.assertEquals(profile.size(), cfg.probs.size()); + Assert.assertEquals(profile.size(), cfg.probs.size(), "Should have exactly the number of states we expected to add"); assertProbsAreEqual(profile.stateList, cfg.probs); // TODO -- reanble tests //assertRegionsAreEqual(profile.createActiveRegions(0, 100), cfg.expectedRegions); - - Assert.assertEquals(profile.createDerivedProfile(profile.stateList).getClass(), profile.getClass()); - - final List empty = new LinkedList(); - Assert.assertEquals(profile.createDerivedProfile(empty).size(), 0); } private void assertRegionsAreEqual(List actual, List expected) {