Add band pass filtering activity profile

-- Based on the new incremental activity profile
-- Unit Tested!  Fixed a few bugs with the old band pass filter
-- Expand IncrementalActivityProfileUnitTest to test the band pass filter as well for basic properties
-- Add new UnitTest for BandPassIncrementalActivityProfile
-- Added normalizeFromRealSpace to MathUtils
-- Cleanup unused code in new activity profiles
This commit is contained in:
Mark DePristo 2013-01-22 15:40:09 -05:00
parent eb60235dcd
commit 7fd27a5167
5 changed files with 345 additions and 41 deletions

View File

@ -634,6 +634,30 @@ public class MathUtils {
return normalizeFromLog10(array, false);
}
/**
* normalizes the real-space probability array.
*
* Does not assume anything about the values in the array, beyond that no elements are below 0. It's ok
* to have values in the array of > 1, or have the sum go above 0.
*
* @param array the array to be normalized
* @return a newly allocated array corresponding the normalized values in array
*/
@Requires("array != null")
@Ensures({"result != null"})
public static double[] normalizeFromRealSpace(final double[] array) {
if ( array.length == 0 )
return array;
final double sum = sum(array);
final double[] normalized = new double[array.length];
if ( sum < 0.0 || sum > 1.0 ) throw new IllegalArgumentException("Values in probability array sum to a negative number " + sum);
for ( int i = 0; i < array.length; i++ ) {
normalized[i] = array[i] / sum;
}
return normalized;
}
public static int maxElementIndex(final double[] array) {
return maxElementIndex(array, array.length);
}

View File

@ -0,0 +1,127 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.activeregion;
import com.google.java.contract.Ensures;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import java.util.Collection;
import java.util.LinkedList;
/**
* A band pass filtering version of the activity profile
*
* Applies a band pass filter with a Gaussian kernel to the input state probabilities to smooth
* them out of an interval
*
* @author Mark DePristo
* @since 2011
*/
public class BandPassIncrementalActivityProfile extends IncrementalActivityProfile {
public static final int DEFAULT_FILTER_SIZE = 80;
private final int filterSize;
private final double[] GaussianKernel;
/**
* Create a band pass activity profile with the default band size
* @param parser our genome loc parser
*/
public BandPassIncrementalActivityProfile(final GenomeLocParser parser) {
this(parser, DEFAULT_FILTER_SIZE);
}
/**
* Create an activity profile that implements a band pass filter on the states
* @param parser our genome loc parser
* @param filterSize the size (in bp) of the band pass filter. The filter size is the number of bp to each
* side that are included in the band. So a filter size of 1 implies that the actual band
* is 3 bp, 1 for the center site and 1 on each size. 2 => 5, etc.
*/
public BandPassIncrementalActivityProfile(final GenomeLocParser parser, final int filterSize) {
super(parser);
if ( filterSize < 0 ) throw new IllegalArgumentException("Filter size must be greater than or equal to 0 but got " + filterSize);
// setup the Gaussian kernel for the band pass filter
this.filterSize = filterSize;
final double[] kernel = new double[getBandSize()];
for( int iii = 0; iii < 2* filterSize + 1; iii++ ) {
kernel[iii] = MathUtils.NormalDistribution(filterSize, 55.0, iii);
}
this.GaussianKernel = MathUtils.normalizeFromRealSpace(kernel);
}
/**
* Get the size (in bp) of the band pass filter
* @return a positive integer
*/
@Ensures("result >= 1")
public int getBandSize() {
return 2 * filterSize + 1;
}
/**
* Get the filter size (which is the size of each wing of the band, minus the center point)
* @return a positive integer
*/
@Ensures("result >= 0")
public int getFilteredSize() {
return filterSize;
}
/**
* Get the kernel of this band pass filter. Do not modify returned result
* @return the kernel used in this band pass filter
*/
@Ensures({"result != null", "result.length == getBandSize()"})
protected double[] getKernel() {
return GaussianKernel;
}
/**
* Band pass the probabilities in the ActivityProfile, producing a new profile that's band pass filtered
* @return a new double[] that's the band-pass filtered version of this profile
*/
@Override
protected Collection<ActivityProfileState> processState(final ActivityProfileState justAddedState) {
final Collection<ActivityProfileState> states = new LinkedList<ActivityProfileState>();
for ( final ActivityProfileState superState : super.processState(justAddedState) ) {
for( int jjj = -filterSize; jjj <= filterSize; jjj++ ) {
final GenomeLoc loc = getLocForOffset(justAddedState.getLoc(), jjj);
if ( loc != null ) {
final double newProb = superState.isActiveProb * GaussianKernel[jjj + filterSize];
states.add(new ActivityProfileState(loc, newProb));
}
}
}
return states;
}
}

View File

@ -51,35 +51,13 @@ public class IncrementalActivityProfile {
/**
* Create a new empty IncrementalActivityProfile
* @param parser the parser we can use to create genome locs
* @param parser the parser we can use to create genome locs, cannot be null
*/
public IncrementalActivityProfile(final GenomeLocParser parser) {
this(parser, new ArrayList<ActivityProfileState>(), null);
}
if ( parser == null ) throw new IllegalArgumentException("parser cannot be null");
/**
* Create a new IncrementalActivityProfile using state list (not copied) and starting at regionStartLoc
* @param parser the parser we can use to create genome locs
*/
@Deprecated
protected IncrementalActivityProfile(final GenomeLocParser parser, final List<ActivityProfileState> stateList, final GenomeLoc regionStartLoc) {
this.parser = parser;
this.stateList = stateList;
this.regionStartLoc = regionStartLoc;
}
/**
* Create a profile of the same class as this object containing just the provided stateList
*
* Used by clients to create derived activity profiles (such as ones without the starting X
* sites because they've been removed in an ActiveRegion) of the same class.
*
* @param isActiveList the active results list to use in the derived instance
* @return a freshly allocated data set
*/
@Deprecated
protected IncrementalActivityProfile createDerivedProfile(final List<ActivityProfileState> isActiveList) {
return new IncrementalActivityProfile(parser, isActiveList, regionStartLoc);
this.stateList = new ArrayList<ActivityProfileState>();
}
@Override
@ -150,6 +128,19 @@ public class IncrementalActivityProfile {
return stateList;
}
/**
* Get the probabilities of the states as a single linear array of doubles
* @return a non-null array
*/
@Ensures("result != null")
protected double[] getProbabilitiesAsArray() {
final double[] probs = new double[getStateList().size()];
int i = 0;
for ( final ActivityProfileState state : getStateList() )
probs[i++] = state.isActiveProb;
return probs;
}
/**
* Helper function that gets the genome loc for a site offset from relativeLoc, protecting ourselves from
* falling off the edge of the contig.

View File

@ -0,0 +1,167 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.activeregion;
// the imports for unit testing.
import net.sf.picard.reference.ReferenceSequenceFile;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
public class BandPassIncrementalActivityProfileUnitTest extends BaseTest {
private GenomeLocParser genomeLocParser;
@BeforeClass
public void init() throws FileNotFoundException {
// sequence
ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
genomeLocParser = new GenomeLocParser(seq);
}
@DataProvider(name = "BandPassBasicTest")
public Object[][] makeBandPassTest() {
final List<Object[]> tests = new LinkedList<Object[]>();
for ( int start : Arrays.asList(1, 10, 100, 1000) ) {
for ( boolean precedingIsActive : Arrays.asList(true, false) ) {
for ( int precedingSites: Arrays.asList(0, 1, 10, 100) ) {
for ( int bandPassSize : Arrays.asList(0, 1, 10, 100) ) {
// for ( int start : Arrays.asList(10) ) {
// for ( boolean precedingIsActive : Arrays.asList(false) ) {
// for ( int precedingSites: Arrays.asList(0) ) {
// for ( int bandPassSize : Arrays.asList(1) ) {
tests.add(new Object[]{ start, precedingIsActive, precedingSites, bandPassSize });
}
}
}
}
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "BandPassBasicTest")
public void testBandPass(final int start, final boolean precedingIsActive, final int nPrecedingSites, final int bandPassSize) {
final BandPassIncrementalActivityProfile profile = new BandPassIncrementalActivityProfile(genomeLocParser, bandPassSize);
final int expectedBandSize = bandPassSize * 2 + 1;
Assert.assertEquals(profile.getBandSize(), expectedBandSize, "Wrong expected band size");
final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName();
final double precedingProb = precedingIsActive ? 1.0 : 0.0;
for ( int i = 0; i < nPrecedingSites; i++ ) {
final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, i + start);
final ActivityProfileState state = new ActivityProfileState(loc, precedingProb);
profile.add(state);
}
final GenomeLoc nextLoc = genomeLocParser.createGenomeLoc(contig, nPrecedingSites + start);
profile.add(new ActivityProfileState(nextLoc, 1.0));
if ( precedingIsActive == false && nPrecedingSites >= bandPassSize && bandPassSize < start ) {
// we have enough space that all probs fall on the genome
final double[] probs = profile.getProbabilitiesAsArray();
Assert.assertEquals(MathUtils.sum(probs), 1.0 * (nPrecedingSites * precedingProb + 1), 1e-3, "Activity profile doesn't sum to number of non-zero prob states");
}
}
private double[] bandPassInOnePass(final BandPassIncrementalActivityProfile profile, final double[] activeProbArray) {
final double[] bandPassProbArray = new double[activeProbArray.length];
// apply the band pass filter for activeProbArray into filteredProbArray
final double[] GaussianKernel = profile.getKernel();
for( int iii = 0; iii < activeProbArray.length; iii++ ) {
final double[] kernel = ArrayUtils.subarray(GaussianKernel, Math.max(profile.getFilteredSize() - iii, 0), Math.min(GaussianKernel.length, profile.getFilteredSize() + activeProbArray.length - iii));
final double[] activeProbSubArray = ArrayUtils.subarray(activeProbArray, Math.max(0,iii - profile.getFilteredSize()), Math.min(activeProbArray.length,iii + profile.getFilteredSize() + 1));
bandPassProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel);
}
return bandPassProbArray;
}
@DataProvider(name = "BandPassComposition")
public Object[][] makeBandPassComposition() {
final List<Object[]> tests = new LinkedList<Object[]>();
for ( int bandPassSize : Arrays.asList(0, 1, 10, 100, BandPassIncrementalActivityProfile.DEFAULT_FILTER_SIZE) ) {
for ( int integrationLength : Arrays.asList(1, 10, 100, 1000) ) {
tests.add(new Object[]{ bandPassSize, integrationLength });
}
}
return tests.toArray(new Object[][]{});
}
@Test( dataProvider = "BandPassComposition")
public void testBandPassComposition(final int bandPassSize, final int integrationLength) {
final int start = 1;
final BandPassIncrementalActivityProfile profile = new BandPassIncrementalActivityProfile(genomeLocParser, bandPassSize);
final double[] rawActiveProbs = new double[integrationLength + bandPassSize * 2];
// add a buffer so that we can get all of the band pass values
final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName();
int pos = start;
int rawProbsOffset = 0;
for ( int i = 0; i < bandPassSize; i++ ) {
final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos++);
final ActivityProfileState state = new ActivityProfileState(loc, 0.0);
profile.add(state);
rawActiveProbs[rawProbsOffset++] = 0.0;
rawActiveProbs[rawActiveProbs.length - rawProbsOffset] = 0.0;
}
for ( int i = 0; i < integrationLength; i++ ) {
final GenomeLoc nextLoc = genomeLocParser.createGenomeLoc(contig, pos++);
profile.add(new ActivityProfileState(nextLoc, 1.0));
rawActiveProbs[rawProbsOffset++] = 1.0;
for ( int j = 0; j < profile.size(); j++ ) {
Assert.assertTrue(profile.getStateList().get(j).isActiveProb >= 0.0, "State probability < 0 at " + j);
Assert.assertTrue(profile.getStateList().get(j).isActiveProb <= 1.0 + 1e-3, "State probability > 1 at " + j);
}
}
final double[] expectedProbs = bandPassInOnePass(profile, rawActiveProbs);
for ( int j = 0; j < profile.size(); j++ ) {
Assert.assertEquals(profile.getStateList().get(j).isActiveProb, expectedProbs[j], "State probability not expected at " + j);
}
}
}

View File

@ -85,7 +85,9 @@ public class IncrementalActivityProfileUnitTest extends BaseTest {
public IncrementalActivityProfile makeProfile() {
switch ( type ) {
case Base: return new IncrementalActivityProfile(genomeLocParser);
case BandPass: //return new BandPassActivityProfile(genomeLocParser);
case BandPass:
// zero size => equivalent to IncrementalActivityProfile
return new BandPassIncrementalActivityProfile(genomeLocParser, 0);
default: throw new IllegalStateException(type.toString());
}
}
@ -111,13 +113,11 @@ public class IncrementalActivityProfileUnitTest extends BaseTest {
@DataProvider(name = "BasicActivityProfileTestProvider")
public Object[][] makeQualIntervalTestProvider() {
for ( final ProfileType type : ProfileType.values() ) {
if ( type != ProfileType.BandPass ) { // todo -- re-enable
new BasicActivityProfileTestProvider(type, Arrays.asList(1.0), true, 0, 1);
new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 0.0), true, 0, 1, 2);
new BasicActivityProfileTestProvider(type, Arrays.asList(0.0, 1.0), false, 0, 1, 2);
new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 0.0, 1.0), true, 0, 1, 2, 3);
new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 1.0, 1.0), true, 0, 3);
}
new BasicActivityProfileTestProvider(type, Arrays.asList(1.0), true, 0, 1);
new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 0.0), true, 0, 1, 2);
new BasicActivityProfileTestProvider(type, Arrays.asList(0.0, 1.0), false, 0, 1, 2);
new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 0.0, 1.0), true, 0, 1, 2, 3);
new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 1.0, 1.0), true, 0, 3);
}
return BasicActivityProfileTestProvider.getTests(BasicActivityProfileTestProvider.class);
@ -135,20 +135,15 @@ public class IncrementalActivityProfileUnitTest extends BaseTest {
double p = cfg.probs.get(i);
GenomeLoc loc = genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart() + i, cfg.regionStart.getStart() + i);
profile.add(new ActivityProfileState(loc, p));
Assert.assertFalse(profile.isEmpty());
Assert.assertFalse(profile.isEmpty(), "Profile shouldn't be empty after adding a state");
}
Assert.assertEquals(profile.regionStartLoc, genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart(), cfg.regionStart.getStart() ));
Assert.assertEquals(profile.regionStartLoc, genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart(), cfg.regionStart.getStart() ), "Start loc should be the start of the region");
Assert.assertEquals(profile.size(), cfg.probs.size());
Assert.assertEquals(profile.size(), cfg.probs.size(), "Should have exactly the number of states we expected to add");
assertProbsAreEqual(profile.stateList, cfg.probs);
// TODO -- reanble tests
//assertRegionsAreEqual(profile.createActiveRegions(0, 100), cfg.expectedRegions);
Assert.assertEquals(profile.createDerivedProfile(profile.stateList).getClass(), profile.getClass());
final List<ActivityProfileState> empty = new LinkedList<ActivityProfileState>();
Assert.assertEquals(profile.createDerivedProfile(empty).size(), 0);
}
private void assertRegionsAreEqual(List<ActiveRegion> actual, List<ActiveRegion> expected) {