IncrementalActivityProfile, complete with extensive unit tests

-- This is an activity profile compatible with fetching its implied active regions incrementally, as activity profile states are added
This commit is contained in:
Mark DePristo 2013-01-18 10:18:04 -05:00
parent 8d9b0f1bd5
commit e050f649fd
4 changed files with 735 additions and 2 deletions

View File

@ -61,7 +61,7 @@ public class ActivityProfile {
}
/**
* Create a profile of the same class as this object containing just the provided isActiveList
* Create a profile of the same class as this object containing just the provided stateList
*
* Used by clients to create derived activity profiles (such as ones without the starting X
* sites because they've been removed in an ActiveRegion) of the same class.

View File

@ -35,7 +35,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
* Date: 7/27/12
*/
public class ActivityProfileState {
private GenomeLoc loc;
final private GenomeLoc loc;
public double isActiveProb;
public Type resultState;
public Number resultValue;
@ -75,6 +75,16 @@ public class ActivityProfileState {
this.resultValue = resultValue;
}
/**
* The offset of state w.r.t. our current region's start location
* @param regionStartLoc the start of the region, as a genome loc
* @return the position of this profile relative to the start of this region
*/
public int getOffset(final GenomeLoc regionStartLoc) {
return getLoc().getStart() - regionStartLoc.getStart();
}
/**
* Get the genome loc associated with the ActivityProfileState
* @return the location of this result

View File

@ -0,0 +1,373 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.activeregion;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.util.*;
/**
* Class holding information about per-base activity scores for the
* active region traversal
*
* @author Mark DePristo
* @since Date created
*/
public class IncrementalActivityProfile {
private final static int MAX_PROB_PROPOGATION_DISTANCE = 10;
private final static double ACTIVE_PROB_THRESHOLD = 0.002; // TODO: needs to be set-able by the walker author
protected final List<ActivityProfileState> stateList;
protected final GenomeLocParser parser;
protected GenomeLoc regionStartLoc = null;
protected GenomeLoc regionStopLoc = null;
/**
* Create a new empty IncrementalActivityProfile
* @param parser the parser we can use to create genome locs
*/
public IncrementalActivityProfile(final GenomeLocParser parser) {
this(parser, new ArrayList<ActivityProfileState>(), null);
}
/**
* Create a new IncrementalActivityProfile using state list (not copied) and starting at regionStartLoc
* @param parser the parser we can use to create genome locs
*/
@Deprecated
protected IncrementalActivityProfile(final GenomeLocParser parser, final List<ActivityProfileState> stateList, final GenomeLoc regionStartLoc) {
this.parser = parser;
this.stateList = stateList;
this.regionStartLoc = regionStartLoc;
}
/**
* Create a profile of the same class as this object containing just the provided stateList
*
* Used by clients to create derived activity profiles (such as ones without the starting X
* sites because they've been removed in an ActiveRegion) of the same class.
*
* @param isActiveList the active results list to use in the derived instance
* @return a freshly allocated data set
*/
@Deprecated
protected IncrementalActivityProfile createDerivedProfile(final List<ActivityProfileState> isActiveList) {
return new IncrementalActivityProfile(parser, isActiveList, regionStartLoc);
}
@Override
public String toString() {
return "ActivityProfile{" +
"start=" + regionStartLoc +
", stop=" + regionStopLoc +
'}';
}
/**
* How far away can probability mass be moved around in this profile?
*
* This distance puts an upper limit on how far, in bp, we will ever propogate probability max around
* when adding a new ActivityProfileState. For example, if the value of this function is
* 10, and you are looking at a state at bp 5, and we know that no states beyond 5 + 10 will have
* their probability propograted back to that state.
*
* @return a positive integer distance in bp
*/
@Ensures("result >= 0")
public int getMaxProbPropogationDistance() {
return MAX_PROB_PROPOGATION_DISTANCE;
}
/**
* How many profile results are in this profile?
* @return the number of profile results
*/
@Ensures("result >= 0")
public int size() {
return stateList.size();
}
/**
* Is this profile empty?
* @return true if the profile is empty
*/
@Ensures("isEmpty() == (size() == 0)")
public boolean isEmpty() {
return stateList.isEmpty();
}
/**
* Get the list of active profile results in this object
* @return a non-null, ordered list of active profile results
*/
@Ensures("result != null")
protected List<ActivityProfileState> getStateList() {
return stateList;
}
/**
* Helper function that gets the genome loc for a site offset from relativeLoc, protecting ourselves from
* falling off the edge of the contig.
*
* @param relativeLoc the location offset is relative to
* @param offset the offset from relativeLoc where we'd like to create a GenomeLoc
* @return a genome loc with relativeLoc.start + offset, if this is on the contig, null otherwise
*/
@Requires("relativeLoc != null")
protected GenomeLoc getLocForOffset(final GenomeLoc relativeLoc, final int offset) {
final int start = relativeLoc.getStart() + offset;
if ( start < 0 || start > getCurrentContigLength() ) {
return null;
} else {
return parser.createGenomeLoc(regionStartLoc.getContig(), start);
}
}
/**
* Get the length of the current contig
* @return the length in bp
*/
@Requires("regionStartLoc != null")
@Ensures("result > 0")
private int getCurrentContigLength() {
// TODO -- fix performance problem with getContigInfo
return parser.getContigInfo(regionStartLoc.getContig()).getSequenceLength();
}
// --------------------------------------------------------------------------------
//
// routines to add states to a profile
//
// --------------------------------------------------------------------------------
/**
* Add the next ActivityProfileState to this profile.
*
* Must be contiguous with the previously added result, or an IllegalArgumentException will be thrown
*
* @param state a well-formed ActivityProfileState result to incorporate into this profile
*/
@Requires("state != null")
public void add(final ActivityProfileState state) {
final GenomeLoc loc = state.getLoc();
if ( regionStartLoc == null ) {
regionStartLoc = loc;
regionStopLoc = loc;
} else {
// TODO -- need to figure out where to add loc as the regions will be popping off the front
if ( regionStopLoc.getStart() != loc.getStart() - 1 )
throw new IllegalArgumentException("Bad add call to ActivityProfile: loc " + loc + " not immediate after last loc " + regionStopLoc );
regionStopLoc = loc;
}
final Collection<ActivityProfileState> processedStates = processState(state);
for ( final ActivityProfileState processedState : processedStates ) {
incorporateSingleState(processedState);
}
}
/**
* Incorporate a single activity profile state into the current list of states
*
* If state's position occurs immediately after the last position in this profile, then
* the state is appended to the state list. If it's within the existing states list,
* the prob of stateToAdd is added to its corresponding state in the list. If the
* position would be before the start of this profile, stateToAdd is simply ignored.
*
* @param stateToAdd the state we want to add to the states list
*/
@Requires("stateToAdd != null")
private void incorporateSingleState(final ActivityProfileState stateToAdd) {
final int position = stateToAdd.getOffset(regionStartLoc);
if ( position > size() )
// should we allow this? probably not
throw new IllegalArgumentException("Must add state contiguous to existing states");
if ( position >= 0 ) {
// ignore states starting before this regions start
if ( position < size() ) {
stateList.get(position).isActiveProb += stateToAdd.isActiveProb;
} else {
if ( position != size() ) throw new IllegalStateException("position == size but it wasn't");
stateList.add(stateToAdd);
}
}
}
/**
* Process justAddedState, returning a collection of derived states that actually be added to the stateList
*
* The purpose of this function is to transform justAddedStates, if needed, into a series of atomic states
* that we actually want to track. For example, if state is for soft clips, we transform that single
* state into a list of states that surround the state up to the distance of the soft clip.
*
* Can be overridden by subclasses to transform states in any way
*
* There's no particular contract for the output states, except that they can never refer to states
* beyond the current end of the stateList unless the explictly include preceding states before
* the reference. So for example if the current state list is [1, 2, 3] this function could return
* [1,2,3,4,5] but not [1,2,3,5].
*
* @param justAddedState the state our client provided to use to add to the list
* @return a list of derived states that should actually be added to this profile's state list
*/
protected Collection<ActivityProfileState> processState(final ActivityProfileState justAddedState) {
if ( justAddedState.resultState.equals(ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS) ) {
// special code to deal with the problem that high quality soft clipped bases aren't added to pileups
final List<ActivityProfileState> states = new LinkedList<ActivityProfileState>();
final int numHQClips = justAddedState.resultValue.intValue();
for( int jjj = - numHQClips; jjj <= numHQClips; jjj++ ) {
final GenomeLoc loc = getLocForOffset(justAddedState.getLoc(), jjj);
if ( loc != null )
states.add(new ActivityProfileState(loc, justAddedState.isActiveProb));
}
return states;
} else {
return Collections.singletonList(justAddedState);
}
}
// --------------------------------------------------------------------------------
//
// routines to get active regions from the profile
//
// --------------------------------------------------------------------------------
/**
* Get the next completed active regions from this profile, and remove all states supporting them from this profile
*
* Takes the current profile and finds all of the active / inactive from the start of the profile that are
* ready. By ready we mean unable to have their probability modified any longer by future additions to the
* profile. The regions that are popped off the profile take their states with them, so the start of this
* profile will always be after the end of the last region returned here.
*
* The regions are returned sorted by genomic position.
*
* This function may not return anything in the list, if no regions are ready
*
* No returned region will be larger than maxRegionSize.
*
* @param activeRegionExtension the extension value to provide to the constructed regions
* @param maxRegionSize the maximize size of the returned region
* @param forceConversion if true, we'll return a region whose end isn't sufficiently far from the end of the
* stateList. Used to close out the active region when we've hit some kind of end (such
* as the end of the contig)
* @return a non-null list of active regions
*/
@Ensures("result != null")
public List<ActiveRegion> popReadyActiveRegions(final int activeRegionExtension, final int maxRegionSize, final boolean forceConversion) {
if ( activeRegionExtension < 0 ) throw new IllegalArgumentException("activeRegionExtension must be >= 0 but got " + activeRegionExtension);
if ( maxRegionSize < 1 ) throw new IllegalArgumentException("maxRegionSize must be >= 1 but got " + maxRegionSize);
final LinkedList<ActiveRegion> regions = new LinkedList<ActiveRegion>();
while ( true ) {
final ActiveRegion nextRegion = popNextReadyActiveRegion(activeRegionExtension, maxRegionSize, forceConversion);
if ( nextRegion == null )
return regions;
else {
regions.add(nextRegion);
}
}
}
/**
* Helper function for popReadyActiveRegions that pops the first ready region off the front of this profile
*
* If a region is returned, modifies the state of this profile so that states used to make the region are
* no longer part of the profile. Associated information (like the region start position) of this profile
* are also updated.
*
* @param activeRegionExtension the extension value to provide to the constructed regions
* @param maxRegionSize the maximize size of the returned region
* @param forceConversion if true, we'll return a region whose end isn't sufficiently far from the end of the
* stateList. Used to close out the active region when we've hit some kind of end (such
* as the end of the contig)
* @return a fully formed active region, or null if none can be made
*/
private ActiveRegion popNextReadyActiveRegion(final int activeRegionExtension, final int maxRegionSize, final boolean forceConversion) {
if ( stateList.isEmpty() )
return null;
final ActivityProfileState first = stateList.get(0);
final boolean isActiveRegion = first.isActiveProb > ACTIVE_PROB_THRESHOLD;
final int offsetOfNextRegionEnd = findEndOfRegion(isActiveRegion, maxRegionSize, forceConversion);
if ( offsetOfNextRegionEnd == -1 )
// couldn't find a valid ending offset, so we return null
return null;
// we need to create the active region, and clip out the states we're extracting from this profile
stateList.subList(0, offsetOfNextRegionEnd + 1).clear();
// update the start and stop locations as necessary
if ( stateList.isEmpty() ) {
regionStartLoc = regionStopLoc = null;
} else {
regionStartLoc = stateList.get(0).getLoc();
}
final GenomeLoc regionLoc = parser.createGenomeLoc(first.getLoc().getContig(), first.getLoc().getStart(), first.getLoc().getStart() + offsetOfNextRegionEnd);
return new ActiveRegion(regionLoc, isActiveRegion, parser, activeRegionExtension);
}
/**
* Find the end of the current region, returning the index into the element isActive element, or -1 if the region isn't done
*
* The current region is defined from the start of the stateList, looking for elements that have the same isActiveRegion
* flag (i.e., if isActiveRegion is true we are looking for states with isActiveProb > threshold, or alternatively
* for states < threshold). The maximize size of the returned region is maxRegionSize. If forceConversion is
* true, then we'll return the region end even if this isn't safely beyond the max prob propogation distance.
*
* @param isActiveRegion is the region we're looking for an active region or inactive region?
* @param maxRegionSize the maximize size of the returned region
* @param forceConversion if true, we'll return a region whose end isn't sufficiently far from the end of the
* stateList. Used to close out the active region when we've hit some kind of end (such
* as the end of the contig)
* @return the index into stateList of the last element of this region, or -1 if it cannot be found
*/
@Ensures({
"result >= -1",
"result == -1 || result < maxRegionSize",
"! (result == -1 && forceConversion)"})
private int findEndOfRegion(final boolean isActiveRegion, final int maxRegionSize, final boolean forceConversion) {
int i = 0;
while ( i < stateList.size() && i < maxRegionSize ) {
if ( stateList.get(i).isActiveProb > ACTIVE_PROB_THRESHOLD != isActiveRegion ) {
break;
}
i++;
}
// we're one past the end, so i must be decremented
return forceConversion || i + getMaxProbPropogationDistance() < stateList.size() ? i - 1 : -1;
}
}

View File

@ -0,0 +1,350 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.activeregion;
// the imports for unit testing.
import net.sf.picard.reference.ReferenceSequenceFile;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
public class IncrementalActivityProfileUnitTest extends BaseTest {
private GenomeLocParser genomeLocParser;
private GenomeLoc startLoc;
@BeforeClass
public void init() throws FileNotFoundException {
// sequence
ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
genomeLocParser = new GenomeLocParser(seq);
startLoc = genomeLocParser.createGenomeLoc("chr1", 1, 1, 100);
}
// --------------------------------------------------------------------------------
//
// Basic tests Provider
//
// --------------------------------------------------------------------------------
private class BasicActivityProfileTestProvider extends TestDataProvider {
List<Double> probs;
List<ActiveRegion> expectedRegions;
int extension = 0;
GenomeLoc regionStart = startLoc;
final ProfileType type;
public BasicActivityProfileTestProvider(final ProfileType type, final List<Double> probs, boolean startActive, int ... startsAndStops) {
super(BasicActivityProfileTestProvider.class);
this.type = type;
this.probs = probs;
this.expectedRegions = toRegions(startActive, startsAndStops);
setName(getName());
}
private String getName() {
return String.format("type=%s probs=%s expectedRegions=%s", type, Utils.join(",", probs), Utils.join(",", expectedRegions));
}
public IncrementalActivityProfile makeProfile() {
switch ( type ) {
case Base: return new IncrementalActivityProfile(genomeLocParser);
case BandPass: //return new BandPassActivityProfile(genomeLocParser);
default: throw new IllegalStateException(type.toString());
}
}
private List<ActiveRegion> toRegions(boolean isActive, int[] startsAndStops) {
List<ActiveRegion> l = new ArrayList<ActiveRegion>();
for ( int i = 0; i < startsAndStops.length - 1; i++) {
int start = regionStart.getStart() + startsAndStops[i];
int end = regionStart.getStart() + startsAndStops[i+1] - 1;
GenomeLoc activeLoc = genomeLocParser.createGenomeLoc(regionStart.getContig(), start, end);
ActiveRegion r = new ActiveRegion(activeLoc, isActive, genomeLocParser, extension);
l.add(r);
isActive = ! isActive;
}
return l;
}
}
private enum ProfileType {
Base, BandPass
}
@DataProvider(name = "BasicActivityProfileTestProvider")
public Object[][] makeQualIntervalTestProvider() {
for ( final ProfileType type : ProfileType.values() ) {
if ( type != ProfileType.BandPass ) { // todo -- re-enable
new BasicActivityProfileTestProvider(type, Arrays.asList(1.0), true, 0, 1);
new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 0.0), true, 0, 1, 2);
new BasicActivityProfileTestProvider(type, Arrays.asList(0.0, 1.0), false, 0, 1, 2);
new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 0.0, 1.0), true, 0, 1, 2, 3);
new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 1.0, 1.0), true, 0, 3);
}
}
return BasicActivityProfileTestProvider.getTests(BasicActivityProfileTestProvider.class);
}
@Test(dataProvider = "BasicActivityProfileTestProvider")
public void testBasicActivityProfile(BasicActivityProfileTestProvider cfg) {
IncrementalActivityProfile profile = cfg.makeProfile();
Assert.assertTrue(profile.isEmpty());
Assert.assertEquals(profile.parser, genomeLocParser);
for ( int i = 0; i < cfg.probs.size(); i++ ) {
double p = cfg.probs.get(i);
GenomeLoc loc = genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart() + i, cfg.regionStart.getStart() + i);
profile.add(new ActivityProfileState(loc, p));
Assert.assertFalse(profile.isEmpty());
}
Assert.assertEquals(profile.regionStartLoc, genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart(), cfg.regionStart.getStart() ));
Assert.assertEquals(profile.size(), cfg.probs.size());
assertProbsAreEqual(profile.stateList, cfg.probs);
// TODO -- reanble tests
//assertRegionsAreEqual(profile.createActiveRegions(0, 100), cfg.expectedRegions);
Assert.assertEquals(profile.createDerivedProfile(profile.stateList).getClass(), profile.getClass());
final List<ActivityProfileState> empty = new LinkedList<ActivityProfileState>();
Assert.assertEquals(profile.createDerivedProfile(empty).size(), 0);
}
private void assertRegionsAreEqual(List<ActiveRegion> actual, List<ActiveRegion> expected) {
Assert.assertEquals(actual.size(), expected.size());
for ( int i = 0; i < actual.size(); i++ ) {
Assert.assertTrue(actual.get(i).equalExceptReads(expected.get(i)));
}
}
private void assertProbsAreEqual(List<ActivityProfileState> actual, List<Double> expected) {
Assert.assertEquals(actual.size(), expected.size());
for ( int i = 0; i < actual.size(); i++ ) {
Assert.assertEquals(actual.get(i).isActiveProb, expected.get(i));
}
}
// -------------------------------------------------------------------------------------
//
// Hardcore tests for adding to the profile and constructing active regions
//
// -------------------------------------------------------------------------------------
private static class SizeToStringList<T> extends ArrayList<T> {
@Override public String toString() { return "List[" + size() + "]"; }
}
@DataProvider(name = "RegionCreationTests")
public Object[][] makeRegionCreationTests() {
final List<Object[]> tests = new LinkedList<Object[]>();
final int contigLength = genomeLocParser.getContigs().getSequences().get(0).getSequenceLength();
for ( int start : Arrays.asList(1, 10, 100, contigLength - 100, contigLength - 10) ) {
for ( int regionSize : Arrays.asList(1, 10, 100, 1000, 10000) ) {
for ( int maxRegionSize : Arrays.asList(10, 50, 200) ) {
for ( final boolean waitUntilEnd : Arrays.asList(false, true) ) {
for ( final boolean forceConversion : Arrays.asList(false, true) ) {
// what do I really want to test here? I'd like to test a few cases:
// -- region is all active (1.0)
// -- region is all inactive (0.0)
// -- cut the interval into 1, 2, 3, 4, 5 ... 10 regions, each with alternating activity values
for ( final boolean startWithActive : Arrays.asList(true, false) ) {
for ( int nParts : Arrays.asList(1, 2, 3, 4, 5, 7, 10, 11, 13) ) {
// for ( int start : Arrays.asList(1) ) {
// for ( int regionSize : Arrays.asList(100) ) {
// for ( int maxRegionSize : Arrays.asList(10) ) {
// for ( final boolean waitUntilEnd : Arrays.asList(true) ) {
// for ( final boolean forceConversion : Arrays.asList(false) ) {
// for ( final boolean startWithActive : Arrays.asList(true) ) {
// for ( int nParts : Arrays.asList(3) ) {
regionSize = Math.min(regionSize, contigLength - start);
final List<Boolean> regions = makeRegions(regionSize, startWithActive, nParts);
tests.add(new Object[]{ start, regions, maxRegionSize, nParts, forceConversion, waitUntilEnd });
}
}
}
}
}
}
}
return tests.toArray(new Object[][]{});
}
private List<Boolean> makeRegions(final int totalRegionSize,
final boolean startWithActive,
final int nParts) {
final List<Boolean> regions = new SizeToStringList<Boolean>();
boolean isActive = startWithActive;
final int activeRegionSize = Math.max(totalRegionSize / nParts, 1);
for ( int i = 0; i < totalRegionSize; i += activeRegionSize ) {
for ( int j = 0; j < activeRegionSize && j + i < totalRegionSize; j++ ) {
regions.add(isActive);
}
isActive = ! isActive;
}
return regions;
}
@Test(enabled = true, dataProvider = "RegionCreationTests")
public void testRegionCreation(final int start, final List<Boolean> probs, int maxRegionSize, final int nParts, final boolean forceConversion, final boolean waitUntilEnd) {
final IncrementalActivityProfile profile = new IncrementalActivityProfile(genomeLocParser);
Assert.assertNotNull(profile.toString());
final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName();
final List<Boolean> seenSites = new ArrayList<Boolean>(Collections.nCopies(probs.size(), false));
ActiveRegion lastRegion = null;
for ( int i = 0; i < probs.size(); i++ ) {
final boolean isActive = probs.get(i);
final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, i + start);
final ActivityProfileState state = new ActivityProfileState(loc, isActive ? 1.0 : 0.0);
profile.add(state);
Assert.assertNotNull(profile.toString());
if ( ! waitUntilEnd ) {
final List<ActiveRegion> regions = profile.popReadyActiveRegions(0, maxRegionSize, false);
lastRegion = assertGoodRegions(start, regions, maxRegionSize, lastRegion, probs, seenSites);
}
}
if ( waitUntilEnd || forceConversion ) {
final List<ActiveRegion> regions = profile.popReadyActiveRegions(0, maxRegionSize, forceConversion);
lastRegion = assertGoodRegions(start, regions, maxRegionSize, lastRegion, probs, seenSites);
}
for ( int i = 0; i < probs.size(); i++ ) {
if ( forceConversion || (i + maxRegionSize + profile.getMaxProbPropogationDistance() < probs.size()))
// only require a site to be seen if we are forcing conversion or the site is more than maxRegionSize from the end
Assert.assertTrue(seenSites.get(i), "Missed site " + i);
}
Assert.assertNotNull(profile.toString());
}
private ActiveRegion assertGoodRegions(final int start, final List<ActiveRegion> regions, final int maxRegionSize, ActiveRegion lastRegion, final List<Boolean> probs, final List<Boolean> seenSites) {
for ( final ActiveRegion region : regions ) {
Assert.assertTrue(region.getLocation().size() > 0, "Region " + region + " has a bad size");
Assert.assertTrue(region.getLocation().size() <= maxRegionSize, "Region " + region + " has a bad size: it's big than the max region size " + maxRegionSize);
if ( lastRegion != null ) {
Assert.assertTrue(region.getLocation().getStart() == lastRegion.getLocation().getStop() + 1, "Region " + region + " doesn't start immediately after previous region" + lastRegion);
}
// check that all active bases are actually active
final int regionOffset = region.getLocation().getStart() - start;
Assert.assertTrue(regionOffset >= 0 && regionOffset < probs.size(), "Region " + region + " has a bad offset w.r.t. start");
for ( int j = 0; j < region.getLocation().size(); j++ ) {
final int siteOffset = j + regionOffset;
Assert.assertEquals(region.isActive, probs.get(siteOffset).booleanValue());
Assert.assertFalse(seenSites.get(siteOffset), "Site " + j + " in " + region + " was seen already");
seenSites.set(siteOffset, true);
}
lastRegion = region;
}
return lastRegion;
}
// -------------------------------------------------------------------------------------
//
// Hardcore tests for adding to the profile and constructing active regions
//
// -------------------------------------------------------------------------------------
@DataProvider(name = "SoftClipsTest")
public Object[][] makeSoftClipsTest() {
final List<Object[]> tests = new LinkedList<Object[]>();
final int contigLength = genomeLocParser.getContigs().getSequences().get(0).getSequenceLength();
for ( int start : Arrays.asList(1, 10, 100, contigLength - 100, contigLength - 10, contigLength - 1) ) {
for ( int precedingSites: Arrays.asList(0, 1, 10) ) {
if ( precedingSites + start < contigLength ) {
for ( int softClipSize : Arrays.asList(1, 2, 10, 100) ) {
// for ( int start : Arrays.asList(10) ) {
// for ( int precedingSites: Arrays.asList(10) ) {
// for ( int softClipSize : Arrays.asList(1) ) {
tests.add(new Object[]{ start, precedingSites, softClipSize });
}
}
}
}
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "SoftClipsTest")
public void testSoftClips(final int start, int nPrecedingSites, final int softClipSize) {
final IncrementalActivityProfile profile = new IncrementalActivityProfile(genomeLocParser);
final int contigLength = genomeLocParser.getContigs().getSequences().get(0).getSequenceLength();
final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName();
for ( int i = 0; i < nPrecedingSites; i++ ) {
final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, i + start);
final ActivityProfileState state = new ActivityProfileState(loc, 0.0);
profile.add(state);
}
final GenomeLoc softClipLoc = genomeLocParser.createGenomeLoc(contig, nPrecedingSites + start);
profile.add(new ActivityProfileState(softClipLoc, 1.0, ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS, softClipSize));
if ( nPrecedingSites == 0 ) {
final int profileSize = Math.min(start + softClipSize, contigLength) - start + 1;
Assert.assertEquals(profile.size(), profileSize, "Wrong number of states in the profile");
}
for ( int i = 0; i < profile.size(); i++ ) {
final ActivityProfileState state = profile.getStateList().get(i);
final boolean withinSCRange = state.getLoc().distance(softClipLoc) <= softClipSize;
if ( withinSCRange ) {
Assert.assertTrue(state.isActiveProb > 0.0, "active prob should be changed within soft clip size");
} else {
Assert.assertEquals(state.isActiveProb, 0.0, "active prob shouldn't be changed outside of clip size");
}
}
}
}