IncrementalActivityProfile, complete with extensive unit tests
-- This is an activity profile compatible with fetching its implied active regions incrementally, as activity profile states are added
This commit is contained in:
parent
8d9b0f1bd5
commit
e050f649fd
|
|
@ -61,7 +61,7 @@ public class ActivityProfile {
|
|||
}
|
||||
|
||||
/**
|
||||
* Create a profile of the same class as this object containing just the provided isActiveList
|
||||
* Create a profile of the same class as this object containing just the provided stateList
|
||||
*
|
||||
* Used by clients to create derived activity profiles (such as ones without the starting X
|
||||
* sites because they've been removed in an ActiveRegion) of the same class.
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
|
|||
* Date: 7/27/12
|
||||
*/
|
||||
public class ActivityProfileState {
|
||||
private GenomeLoc loc;
|
||||
final private GenomeLoc loc;
|
||||
public double isActiveProb;
|
||||
public Type resultState;
|
||||
public Number resultValue;
|
||||
|
|
@ -75,6 +75,16 @@ public class ActivityProfileState {
|
|||
this.resultValue = resultValue;
|
||||
}
|
||||
|
||||
/**
|
||||
* The offset of state w.r.t. our current region's start location
|
||||
* @param regionStartLoc the start of the region, as a genome loc
|
||||
* @return the position of this profile relative to the start of this region
|
||||
*/
|
||||
public int getOffset(final GenomeLoc regionStartLoc) {
|
||||
return getLoc().getStart() - regionStartLoc.getStart();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the genome loc associated with the ActivityProfileState
|
||||
* @return the location of this result
|
||||
|
|
|
|||
|
|
@ -0,0 +1,373 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.activeregion;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Class holding information about per-base activity scores for the
|
||||
* active region traversal
|
||||
*
|
||||
* @author Mark DePristo
|
||||
* @since Date created
|
||||
*/
|
||||
public class IncrementalActivityProfile {
|
||||
private final static int MAX_PROB_PROPOGATION_DISTANCE = 10;
|
||||
private final static double ACTIVE_PROB_THRESHOLD = 0.002; // TODO: needs to be set-able by the walker author
|
||||
|
||||
protected final List<ActivityProfileState> stateList;
|
||||
protected final GenomeLocParser parser;
|
||||
|
||||
protected GenomeLoc regionStartLoc = null;
|
||||
protected GenomeLoc regionStopLoc = null;
|
||||
|
||||
/**
|
||||
* Create a new empty IncrementalActivityProfile
|
||||
* @param parser the parser we can use to create genome locs
|
||||
*/
|
||||
public IncrementalActivityProfile(final GenomeLocParser parser) {
|
||||
this(parser, new ArrayList<ActivityProfileState>(), null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new IncrementalActivityProfile using state list (not copied) and starting at regionStartLoc
|
||||
* @param parser the parser we can use to create genome locs
|
||||
*/
|
||||
@Deprecated
|
||||
protected IncrementalActivityProfile(final GenomeLocParser parser, final List<ActivityProfileState> stateList, final GenomeLoc regionStartLoc) {
|
||||
this.parser = parser;
|
||||
this.stateList = stateList;
|
||||
this.regionStartLoc = regionStartLoc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a profile of the same class as this object containing just the provided stateList
|
||||
*
|
||||
* Used by clients to create derived activity profiles (such as ones without the starting X
|
||||
* sites because they've been removed in an ActiveRegion) of the same class.
|
||||
*
|
||||
* @param isActiveList the active results list to use in the derived instance
|
||||
* @return a freshly allocated data set
|
||||
*/
|
||||
@Deprecated
|
||||
protected IncrementalActivityProfile createDerivedProfile(final List<ActivityProfileState> isActiveList) {
|
||||
return new IncrementalActivityProfile(parser, isActiveList, regionStartLoc);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "ActivityProfile{" +
|
||||
"start=" + regionStartLoc +
|
||||
", stop=" + regionStopLoc +
|
||||
'}';
|
||||
}
|
||||
|
||||
/**
|
||||
* How far away can probability mass be moved around in this profile?
|
||||
*
|
||||
* This distance puts an upper limit on how far, in bp, we will ever propogate probability max around
|
||||
* when adding a new ActivityProfileState. For example, if the value of this function is
|
||||
* 10, and you are looking at a state at bp 5, and we know that no states beyond 5 + 10 will have
|
||||
* their probability propograted back to that state.
|
||||
*
|
||||
* @return a positive integer distance in bp
|
||||
*/
|
||||
@Ensures("result >= 0")
|
||||
public int getMaxProbPropogationDistance() {
|
||||
return MAX_PROB_PROPOGATION_DISTANCE;
|
||||
}
|
||||
|
||||
/**
|
||||
* How many profile results are in this profile?
|
||||
* @return the number of profile results
|
||||
*/
|
||||
@Ensures("result >= 0")
|
||||
public int size() {
|
||||
return stateList.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Is this profile empty?
|
||||
* @return true if the profile is empty
|
||||
*/
|
||||
@Ensures("isEmpty() == (size() == 0)")
|
||||
public boolean isEmpty() {
|
||||
return stateList.isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the list of active profile results in this object
|
||||
* @return a non-null, ordered list of active profile results
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
protected List<ActivityProfileState> getStateList() {
|
||||
return stateList;
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function that gets the genome loc for a site offset from relativeLoc, protecting ourselves from
|
||||
* falling off the edge of the contig.
|
||||
*
|
||||
* @param relativeLoc the location offset is relative to
|
||||
* @param offset the offset from relativeLoc where we'd like to create a GenomeLoc
|
||||
* @return a genome loc with relativeLoc.start + offset, if this is on the contig, null otherwise
|
||||
*/
|
||||
@Requires("relativeLoc != null")
|
||||
protected GenomeLoc getLocForOffset(final GenomeLoc relativeLoc, final int offset) {
|
||||
final int start = relativeLoc.getStart() + offset;
|
||||
if ( start < 0 || start > getCurrentContigLength() ) {
|
||||
return null;
|
||||
} else {
|
||||
return parser.createGenomeLoc(regionStartLoc.getContig(), start);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the length of the current contig
|
||||
* @return the length in bp
|
||||
*/
|
||||
@Requires("regionStartLoc != null")
|
||||
@Ensures("result > 0")
|
||||
private int getCurrentContigLength() {
|
||||
// TODO -- fix performance problem with getContigInfo
|
||||
return parser.getContigInfo(regionStartLoc.getContig()).getSequenceLength();
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// routines to add states to a profile
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Add the next ActivityProfileState to this profile.
|
||||
*
|
||||
* Must be contiguous with the previously added result, or an IllegalArgumentException will be thrown
|
||||
*
|
||||
* @param state a well-formed ActivityProfileState result to incorporate into this profile
|
||||
*/
|
||||
@Requires("state != null")
|
||||
public void add(final ActivityProfileState state) {
|
||||
final GenomeLoc loc = state.getLoc();
|
||||
|
||||
if ( regionStartLoc == null ) {
|
||||
regionStartLoc = loc;
|
||||
regionStopLoc = loc;
|
||||
} else {
|
||||
// TODO -- need to figure out where to add loc as the regions will be popping off the front
|
||||
if ( regionStopLoc.getStart() != loc.getStart() - 1 )
|
||||
throw new IllegalArgumentException("Bad add call to ActivityProfile: loc " + loc + " not immediate after last loc " + regionStopLoc );
|
||||
regionStopLoc = loc;
|
||||
}
|
||||
|
||||
final Collection<ActivityProfileState> processedStates = processState(state);
|
||||
for ( final ActivityProfileState processedState : processedStates ) {
|
||||
incorporateSingleState(processedState);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Incorporate a single activity profile state into the current list of states
|
||||
*
|
||||
* If state's position occurs immediately after the last position in this profile, then
|
||||
* the state is appended to the state list. If it's within the existing states list,
|
||||
* the prob of stateToAdd is added to its corresponding state in the list. If the
|
||||
* position would be before the start of this profile, stateToAdd is simply ignored.
|
||||
*
|
||||
* @param stateToAdd the state we want to add to the states list
|
||||
*/
|
||||
@Requires("stateToAdd != null")
|
||||
private void incorporateSingleState(final ActivityProfileState stateToAdd) {
|
||||
final int position = stateToAdd.getOffset(regionStartLoc);
|
||||
|
||||
if ( position > size() )
|
||||
// should we allow this? probably not
|
||||
throw new IllegalArgumentException("Must add state contiguous to existing states");
|
||||
|
||||
if ( position >= 0 ) {
|
||||
// ignore states starting before this regions start
|
||||
if ( position < size() ) {
|
||||
stateList.get(position).isActiveProb += stateToAdd.isActiveProb;
|
||||
} else {
|
||||
if ( position != size() ) throw new IllegalStateException("position == size but it wasn't");
|
||||
stateList.add(stateToAdd);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Process justAddedState, returning a collection of derived states that actually be added to the stateList
|
||||
*
|
||||
* The purpose of this function is to transform justAddedStates, if needed, into a series of atomic states
|
||||
* that we actually want to track. For example, if state is for soft clips, we transform that single
|
||||
* state into a list of states that surround the state up to the distance of the soft clip.
|
||||
*
|
||||
* Can be overridden by subclasses to transform states in any way
|
||||
*
|
||||
* There's no particular contract for the output states, except that they can never refer to states
|
||||
* beyond the current end of the stateList unless the explictly include preceding states before
|
||||
* the reference. So for example if the current state list is [1, 2, 3] this function could return
|
||||
* [1,2,3,4,5] but not [1,2,3,5].
|
||||
*
|
||||
* @param justAddedState the state our client provided to use to add to the list
|
||||
* @return a list of derived states that should actually be added to this profile's state list
|
||||
*/
|
||||
protected Collection<ActivityProfileState> processState(final ActivityProfileState justAddedState) {
|
||||
if ( justAddedState.resultState.equals(ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS) ) {
|
||||
// special code to deal with the problem that high quality soft clipped bases aren't added to pileups
|
||||
final List<ActivityProfileState> states = new LinkedList<ActivityProfileState>();
|
||||
final int numHQClips = justAddedState.resultValue.intValue();
|
||||
for( int jjj = - numHQClips; jjj <= numHQClips; jjj++ ) {
|
||||
final GenomeLoc loc = getLocForOffset(justAddedState.getLoc(), jjj);
|
||||
if ( loc != null )
|
||||
states.add(new ActivityProfileState(loc, justAddedState.isActiveProb));
|
||||
}
|
||||
|
||||
return states;
|
||||
} else {
|
||||
return Collections.singletonList(justAddedState);
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// routines to get active regions from the profile
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Get the next completed active regions from this profile, and remove all states supporting them from this profile
|
||||
*
|
||||
* Takes the current profile and finds all of the active / inactive from the start of the profile that are
|
||||
* ready. By ready we mean unable to have their probability modified any longer by future additions to the
|
||||
* profile. The regions that are popped off the profile take their states with them, so the start of this
|
||||
* profile will always be after the end of the last region returned here.
|
||||
*
|
||||
* The regions are returned sorted by genomic position.
|
||||
*
|
||||
* This function may not return anything in the list, if no regions are ready
|
||||
*
|
||||
* No returned region will be larger than maxRegionSize.
|
||||
*
|
||||
* @param activeRegionExtension the extension value to provide to the constructed regions
|
||||
* @param maxRegionSize the maximize size of the returned region
|
||||
* @param forceConversion if true, we'll return a region whose end isn't sufficiently far from the end of the
|
||||
* stateList. Used to close out the active region when we've hit some kind of end (such
|
||||
* as the end of the contig)
|
||||
* @return a non-null list of active regions
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public List<ActiveRegion> popReadyActiveRegions(final int activeRegionExtension, final int maxRegionSize, final boolean forceConversion) {
|
||||
if ( activeRegionExtension < 0 ) throw new IllegalArgumentException("activeRegionExtension must be >= 0 but got " + activeRegionExtension);
|
||||
if ( maxRegionSize < 1 ) throw new IllegalArgumentException("maxRegionSize must be >= 1 but got " + maxRegionSize);
|
||||
|
||||
final LinkedList<ActiveRegion> regions = new LinkedList<ActiveRegion>();
|
||||
|
||||
while ( true ) {
|
||||
final ActiveRegion nextRegion = popNextReadyActiveRegion(activeRegionExtension, maxRegionSize, forceConversion);
|
||||
if ( nextRegion == null )
|
||||
return regions;
|
||||
else {
|
||||
regions.add(nextRegion);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function for popReadyActiveRegions that pops the first ready region off the front of this profile
|
||||
*
|
||||
* If a region is returned, modifies the state of this profile so that states used to make the region are
|
||||
* no longer part of the profile. Associated information (like the region start position) of this profile
|
||||
* are also updated.
|
||||
*
|
||||
* @param activeRegionExtension the extension value to provide to the constructed regions
|
||||
* @param maxRegionSize the maximize size of the returned region
|
||||
* @param forceConversion if true, we'll return a region whose end isn't sufficiently far from the end of the
|
||||
* stateList. Used to close out the active region when we've hit some kind of end (such
|
||||
* as the end of the contig)
|
||||
* @return a fully formed active region, or null if none can be made
|
||||
*/
|
||||
private ActiveRegion popNextReadyActiveRegion(final int activeRegionExtension, final int maxRegionSize, final boolean forceConversion) {
|
||||
if ( stateList.isEmpty() )
|
||||
return null;
|
||||
|
||||
final ActivityProfileState first = stateList.get(0);
|
||||
final boolean isActiveRegion = first.isActiveProb > ACTIVE_PROB_THRESHOLD;
|
||||
final int offsetOfNextRegionEnd = findEndOfRegion(isActiveRegion, maxRegionSize, forceConversion);
|
||||
if ( offsetOfNextRegionEnd == -1 )
|
||||
// couldn't find a valid ending offset, so we return null
|
||||
return null;
|
||||
|
||||
// we need to create the active region, and clip out the states we're extracting from this profile
|
||||
stateList.subList(0, offsetOfNextRegionEnd + 1).clear();
|
||||
|
||||
// update the start and stop locations as necessary
|
||||
if ( stateList.isEmpty() ) {
|
||||
regionStartLoc = regionStopLoc = null;
|
||||
} else {
|
||||
regionStartLoc = stateList.get(0).getLoc();
|
||||
}
|
||||
final GenomeLoc regionLoc = parser.createGenomeLoc(first.getLoc().getContig(), first.getLoc().getStart(), first.getLoc().getStart() + offsetOfNextRegionEnd);
|
||||
return new ActiveRegion(regionLoc, isActiveRegion, parser, activeRegionExtension);
|
||||
}
|
||||
|
||||
/**
|
||||
* Find the end of the current region, returning the index into the element isActive element, or -1 if the region isn't done
|
||||
*
|
||||
* The current region is defined from the start of the stateList, looking for elements that have the same isActiveRegion
|
||||
* flag (i.e., if isActiveRegion is true we are looking for states with isActiveProb > threshold, or alternatively
|
||||
* for states < threshold). The maximize size of the returned region is maxRegionSize. If forceConversion is
|
||||
* true, then we'll return the region end even if this isn't safely beyond the max prob propogation distance.
|
||||
*
|
||||
* @param isActiveRegion is the region we're looking for an active region or inactive region?
|
||||
* @param maxRegionSize the maximize size of the returned region
|
||||
* @param forceConversion if true, we'll return a region whose end isn't sufficiently far from the end of the
|
||||
* stateList. Used to close out the active region when we've hit some kind of end (such
|
||||
* as the end of the contig)
|
||||
* @return the index into stateList of the last element of this region, or -1 if it cannot be found
|
||||
*/
|
||||
@Ensures({
|
||||
"result >= -1",
|
||||
"result == -1 || result < maxRegionSize",
|
||||
"! (result == -1 && forceConversion)"})
|
||||
private int findEndOfRegion(final boolean isActiveRegion, final int maxRegionSize, final boolean forceConversion) {
|
||||
int i = 0;
|
||||
while ( i < stateList.size() && i < maxRegionSize ) {
|
||||
if ( stateList.get(i).isActiveProb > ACTIVE_PROB_THRESHOLD != isActiveRegion ) {
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
// we're one past the end, so i must be decremented
|
||||
return forceConversion || i + getMaxProbPropogationDistance() < stateList.size() ? i - 1 : -1;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,350 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.activeregion;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
public class IncrementalActivityProfileUnitTest extends BaseTest {
|
||||
private GenomeLocParser genomeLocParser;
|
||||
private GenomeLoc startLoc;
|
||||
|
||||
@BeforeClass
|
||||
public void init() throws FileNotFoundException {
|
||||
// sequence
|
||||
ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
|
||||
genomeLocParser = new GenomeLocParser(seq);
|
||||
startLoc = genomeLocParser.createGenomeLoc("chr1", 1, 1, 100);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Basic tests Provider
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private class BasicActivityProfileTestProvider extends TestDataProvider {
|
||||
List<Double> probs;
|
||||
List<ActiveRegion> expectedRegions;
|
||||
int extension = 0;
|
||||
GenomeLoc regionStart = startLoc;
|
||||
final ProfileType type;
|
||||
|
||||
public BasicActivityProfileTestProvider(final ProfileType type, final List<Double> probs, boolean startActive, int ... startsAndStops) {
|
||||
super(BasicActivityProfileTestProvider.class);
|
||||
this.type = type;
|
||||
this.probs = probs;
|
||||
this.expectedRegions = toRegions(startActive, startsAndStops);
|
||||
setName(getName());
|
||||
}
|
||||
|
||||
private String getName() {
|
||||
return String.format("type=%s probs=%s expectedRegions=%s", type, Utils.join(",", probs), Utils.join(",", expectedRegions));
|
||||
}
|
||||
|
||||
public IncrementalActivityProfile makeProfile() {
|
||||
switch ( type ) {
|
||||
case Base: return new IncrementalActivityProfile(genomeLocParser);
|
||||
case BandPass: //return new BandPassActivityProfile(genomeLocParser);
|
||||
default: throw new IllegalStateException(type.toString());
|
||||
}
|
||||
}
|
||||
|
||||
private List<ActiveRegion> toRegions(boolean isActive, int[] startsAndStops) {
|
||||
List<ActiveRegion> l = new ArrayList<ActiveRegion>();
|
||||
for ( int i = 0; i < startsAndStops.length - 1; i++) {
|
||||
int start = regionStart.getStart() + startsAndStops[i];
|
||||
int end = regionStart.getStart() + startsAndStops[i+1] - 1;
|
||||
GenomeLoc activeLoc = genomeLocParser.createGenomeLoc(regionStart.getContig(), start, end);
|
||||
ActiveRegion r = new ActiveRegion(activeLoc, isActive, genomeLocParser, extension);
|
||||
l.add(r);
|
||||
isActive = ! isActive;
|
||||
}
|
||||
return l;
|
||||
}
|
||||
}
|
||||
|
||||
private enum ProfileType {
|
||||
Base, BandPass
|
||||
}
|
||||
|
||||
@DataProvider(name = "BasicActivityProfileTestProvider")
|
||||
public Object[][] makeQualIntervalTestProvider() {
|
||||
for ( final ProfileType type : ProfileType.values() ) {
|
||||
if ( type != ProfileType.BandPass ) { // todo -- re-enable
|
||||
new BasicActivityProfileTestProvider(type, Arrays.asList(1.0), true, 0, 1);
|
||||
new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 0.0), true, 0, 1, 2);
|
||||
new BasicActivityProfileTestProvider(type, Arrays.asList(0.0, 1.0), false, 0, 1, 2);
|
||||
new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 0.0, 1.0), true, 0, 1, 2, 3);
|
||||
new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 1.0, 1.0), true, 0, 3);
|
||||
}
|
||||
}
|
||||
|
||||
return BasicActivityProfileTestProvider.getTests(BasicActivityProfileTestProvider.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "BasicActivityProfileTestProvider")
|
||||
public void testBasicActivityProfile(BasicActivityProfileTestProvider cfg) {
|
||||
IncrementalActivityProfile profile = cfg.makeProfile();
|
||||
|
||||
Assert.assertTrue(profile.isEmpty());
|
||||
|
||||
Assert.assertEquals(profile.parser, genomeLocParser);
|
||||
|
||||
for ( int i = 0; i < cfg.probs.size(); i++ ) {
|
||||
double p = cfg.probs.get(i);
|
||||
GenomeLoc loc = genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart() + i, cfg.regionStart.getStart() + i);
|
||||
profile.add(new ActivityProfileState(loc, p));
|
||||
Assert.assertFalse(profile.isEmpty());
|
||||
}
|
||||
Assert.assertEquals(profile.regionStartLoc, genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart(), cfg.regionStart.getStart() ));
|
||||
|
||||
Assert.assertEquals(profile.size(), cfg.probs.size());
|
||||
assertProbsAreEqual(profile.stateList, cfg.probs);
|
||||
|
||||
// TODO -- reanble tests
|
||||
//assertRegionsAreEqual(profile.createActiveRegions(0, 100), cfg.expectedRegions);
|
||||
|
||||
Assert.assertEquals(profile.createDerivedProfile(profile.stateList).getClass(), profile.getClass());
|
||||
|
||||
final List<ActivityProfileState> empty = new LinkedList<ActivityProfileState>();
|
||||
Assert.assertEquals(profile.createDerivedProfile(empty).size(), 0);
|
||||
}
|
||||
|
||||
private void assertRegionsAreEqual(List<ActiveRegion> actual, List<ActiveRegion> expected) {
|
||||
Assert.assertEquals(actual.size(), expected.size());
|
||||
for ( int i = 0; i < actual.size(); i++ ) {
|
||||
Assert.assertTrue(actual.get(i).equalExceptReads(expected.get(i)));
|
||||
}
|
||||
}
|
||||
|
||||
private void assertProbsAreEqual(List<ActivityProfileState> actual, List<Double> expected) {
|
||||
Assert.assertEquals(actual.size(), expected.size());
|
||||
for ( int i = 0; i < actual.size(); i++ ) {
|
||||
Assert.assertEquals(actual.get(i).isActiveProb, expected.get(i));
|
||||
}
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// Hardcore tests for adding to the profile and constructing active regions
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
private static class SizeToStringList<T> extends ArrayList<T> {
|
||||
@Override public String toString() { return "List[" + size() + "]"; }
|
||||
}
|
||||
|
||||
@DataProvider(name = "RegionCreationTests")
|
||||
public Object[][] makeRegionCreationTests() {
|
||||
final List<Object[]> tests = new LinkedList<Object[]>();
|
||||
|
||||
final int contigLength = genomeLocParser.getContigs().getSequences().get(0).getSequenceLength();
|
||||
for ( int start : Arrays.asList(1, 10, 100, contigLength - 100, contigLength - 10) ) {
|
||||
for ( int regionSize : Arrays.asList(1, 10, 100, 1000, 10000) ) {
|
||||
for ( int maxRegionSize : Arrays.asList(10, 50, 200) ) {
|
||||
for ( final boolean waitUntilEnd : Arrays.asList(false, true) ) {
|
||||
for ( final boolean forceConversion : Arrays.asList(false, true) ) {
|
||||
// what do I really want to test here? I'd like to test a few cases:
|
||||
// -- region is all active (1.0)
|
||||
// -- region is all inactive (0.0)
|
||||
// -- cut the interval into 1, 2, 3, 4, 5 ... 10 regions, each with alternating activity values
|
||||
for ( final boolean startWithActive : Arrays.asList(true, false) ) {
|
||||
for ( int nParts : Arrays.asList(1, 2, 3, 4, 5, 7, 10, 11, 13) ) {
|
||||
|
||||
// for ( int start : Arrays.asList(1) ) {
|
||||
// for ( int regionSize : Arrays.asList(100) ) {
|
||||
// for ( int maxRegionSize : Arrays.asList(10) ) {
|
||||
// for ( final boolean waitUntilEnd : Arrays.asList(true) ) {
|
||||
// for ( final boolean forceConversion : Arrays.asList(false) ) {
|
||||
// for ( final boolean startWithActive : Arrays.asList(true) ) {
|
||||
// for ( int nParts : Arrays.asList(3) ) {
|
||||
regionSize = Math.min(regionSize, contigLength - start);
|
||||
final List<Boolean> regions = makeRegions(regionSize, startWithActive, nParts);
|
||||
tests.add(new Object[]{ start, regions, maxRegionSize, nParts, forceConversion, waitUntilEnd });
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
private List<Boolean> makeRegions(final int totalRegionSize,
|
||||
final boolean startWithActive,
|
||||
final int nParts) {
|
||||
final List<Boolean> regions = new SizeToStringList<Boolean>();
|
||||
|
||||
boolean isActive = startWithActive;
|
||||
final int activeRegionSize = Math.max(totalRegionSize / nParts, 1);
|
||||
for ( int i = 0; i < totalRegionSize; i += activeRegionSize ) {
|
||||
for ( int j = 0; j < activeRegionSize && j + i < totalRegionSize; j++ ) {
|
||||
regions.add(isActive);
|
||||
}
|
||||
isActive = ! isActive;
|
||||
}
|
||||
|
||||
return regions;
|
||||
}
|
||||
|
||||
|
||||
@Test(enabled = true, dataProvider = "RegionCreationTests")
|
||||
public void testRegionCreation(final int start, final List<Boolean> probs, int maxRegionSize, final int nParts, final boolean forceConversion, final boolean waitUntilEnd) {
|
||||
final IncrementalActivityProfile profile = new IncrementalActivityProfile(genomeLocParser);
|
||||
Assert.assertNotNull(profile.toString());
|
||||
|
||||
final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName();
|
||||
final List<Boolean> seenSites = new ArrayList<Boolean>(Collections.nCopies(probs.size(), false));
|
||||
ActiveRegion lastRegion = null;
|
||||
for ( int i = 0; i < probs.size(); i++ ) {
|
||||
final boolean isActive = probs.get(i);
|
||||
final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, i + start);
|
||||
final ActivityProfileState state = new ActivityProfileState(loc, isActive ? 1.0 : 0.0);
|
||||
profile.add(state);
|
||||
Assert.assertNotNull(profile.toString());
|
||||
|
||||
if ( ! waitUntilEnd ) {
|
||||
final List<ActiveRegion> regions = profile.popReadyActiveRegions(0, maxRegionSize, false);
|
||||
lastRegion = assertGoodRegions(start, regions, maxRegionSize, lastRegion, probs, seenSites);
|
||||
}
|
||||
}
|
||||
|
||||
if ( waitUntilEnd || forceConversion ) {
|
||||
final List<ActiveRegion> regions = profile.popReadyActiveRegions(0, maxRegionSize, forceConversion);
|
||||
lastRegion = assertGoodRegions(start, regions, maxRegionSize, lastRegion, probs, seenSites);
|
||||
}
|
||||
|
||||
for ( int i = 0; i < probs.size(); i++ ) {
|
||||
if ( forceConversion || (i + maxRegionSize + profile.getMaxProbPropogationDistance() < probs.size()))
|
||||
// only require a site to be seen if we are forcing conversion or the site is more than maxRegionSize from the end
|
||||
Assert.assertTrue(seenSites.get(i), "Missed site " + i);
|
||||
}
|
||||
|
||||
Assert.assertNotNull(profile.toString());
|
||||
}
|
||||
|
||||
private ActiveRegion assertGoodRegions(final int start, final List<ActiveRegion> regions, final int maxRegionSize, ActiveRegion lastRegion, final List<Boolean> probs, final List<Boolean> seenSites) {
|
||||
for ( final ActiveRegion region : regions ) {
|
||||
Assert.assertTrue(region.getLocation().size() > 0, "Region " + region + " has a bad size");
|
||||
Assert.assertTrue(region.getLocation().size() <= maxRegionSize, "Region " + region + " has a bad size: it's big than the max region size " + maxRegionSize);
|
||||
if ( lastRegion != null ) {
|
||||
Assert.assertTrue(region.getLocation().getStart() == lastRegion.getLocation().getStop() + 1, "Region " + region + " doesn't start immediately after previous region" + lastRegion);
|
||||
}
|
||||
|
||||
// check that all active bases are actually active
|
||||
final int regionOffset = region.getLocation().getStart() - start;
|
||||
Assert.assertTrue(regionOffset >= 0 && regionOffset < probs.size(), "Region " + region + " has a bad offset w.r.t. start");
|
||||
for ( int j = 0; j < region.getLocation().size(); j++ ) {
|
||||
final int siteOffset = j + regionOffset;
|
||||
Assert.assertEquals(region.isActive, probs.get(siteOffset).booleanValue());
|
||||
Assert.assertFalse(seenSites.get(siteOffset), "Site " + j + " in " + region + " was seen already");
|
||||
seenSites.set(siteOffset, true);
|
||||
}
|
||||
|
||||
lastRegion = region;
|
||||
}
|
||||
|
||||
return lastRegion;
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// Hardcore tests for adding to the profile and constructing active regions
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
@DataProvider(name = "SoftClipsTest")
|
||||
public Object[][] makeSoftClipsTest() {
|
||||
final List<Object[]> tests = new LinkedList<Object[]>();
|
||||
|
||||
final int contigLength = genomeLocParser.getContigs().getSequences().get(0).getSequenceLength();
|
||||
for ( int start : Arrays.asList(1, 10, 100, contigLength - 100, contigLength - 10, contigLength - 1) ) {
|
||||
for ( int precedingSites: Arrays.asList(0, 1, 10) ) {
|
||||
if ( precedingSites + start < contigLength ) {
|
||||
for ( int softClipSize : Arrays.asList(1, 2, 10, 100) ) {
|
||||
// for ( int start : Arrays.asList(10) ) {
|
||||
// for ( int precedingSites: Arrays.asList(10) ) {
|
||||
// for ( int softClipSize : Arrays.asList(1) ) {
|
||||
tests.add(new Object[]{ start, precedingSites, softClipSize });
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "SoftClipsTest")
|
||||
public void testSoftClips(final int start, int nPrecedingSites, final int softClipSize) {
|
||||
final IncrementalActivityProfile profile = new IncrementalActivityProfile(genomeLocParser);
|
||||
|
||||
final int contigLength = genomeLocParser.getContigs().getSequences().get(0).getSequenceLength();
|
||||
final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName();
|
||||
for ( int i = 0; i < nPrecedingSites; i++ ) {
|
||||
final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, i + start);
|
||||
final ActivityProfileState state = new ActivityProfileState(loc, 0.0);
|
||||
profile.add(state);
|
||||
}
|
||||
|
||||
final GenomeLoc softClipLoc = genomeLocParser.createGenomeLoc(contig, nPrecedingSites + start);
|
||||
profile.add(new ActivityProfileState(softClipLoc, 1.0, ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS, softClipSize));
|
||||
|
||||
if ( nPrecedingSites == 0 ) {
|
||||
final int profileSize = Math.min(start + softClipSize, contigLength) - start + 1;
|
||||
Assert.assertEquals(profile.size(), profileSize, "Wrong number of states in the profile");
|
||||
}
|
||||
|
||||
for ( int i = 0; i < profile.size(); i++ ) {
|
||||
final ActivityProfileState state = profile.getStateList().get(i);
|
||||
final boolean withinSCRange = state.getLoc().distance(softClipLoc) <= softClipSize;
|
||||
if ( withinSCRange ) {
|
||||
Assert.assertTrue(state.isActiveProb > 0.0, "active prob should be changed within soft clip size");
|
||||
} else {
|
||||
Assert.assertEquals(state.isActiveProb, 0.0, "active prob shouldn't be changed outside of clip size");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue