Merge pull request #742 from broadinstitute/rhl_prob_thresh_param_arg

Rhl prob thresh param arg
This commit is contained in:
jmthibault79 2014-10-10 12:55:01 -04:00
commit 22a3c6e556
7 changed files with 65 additions and 33 deletions

View File

@ -78,7 +78,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
final static String GGA_INTERVALS_FILE = privateTestDir + "haplotype-caller-reduced-test-interval.list";
private void HCTest(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE --maxReadsInRegionPerSample 1000 --minReadsPerAlignmentStart 5 -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3";
final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE --maxReadsInRegionPerSample 1000 --minReadsPerAlignmentStart 5 --maxProbPropagationDistance 50 --activeProbabilityThreshold 0.002 -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCaller: args=" + args, spec);
}

View File

@ -159,7 +159,8 @@ public final class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Activ
final double bandPassSigma = this.walker.bandPassSigma == null ? annotation.bandPassSigma() : this.walker.bandPassSigma;
walkerHasPresetRegions = this.walker.hasPresetActiveRegions();
activityProfile = new BandPassActivityProfile(engine.getGenomeLocParser(), engine.getIntervals(), BandPassActivityProfile.MAX_FILTER_SIZE, bandPassSigma);
activityProfile = new BandPassActivityProfile(engine.getGenomeLocParser(), engine.getIntervals(), this.walker.maxProbPropagationDistance, this.walker.activeProbThreshold,
BandPassActivityProfile.MAX_FILTER_SIZE, bandPassSigma);
final int maxReadsAcrossSamples = annotation.maxReadsToHoldInMemoryPerSample() * SampleUtils.getSAMFileSamples(engine).size();
final int maxReadsToHoldInMemory = Math.min(maxReadsAcrossSamples, annotation.maxReadsToHoldTotal());

View File

@ -107,6 +107,17 @@ public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<Map
@Argument(fullName="bandPassSigma", shortName="bandPassSigma", doc="The sigma of the band pass filter Gaussian kernel; if not provided defaults to Walker annotated default", required = false)
public Double bandPassSigma = null;
/*
* For active region limits in ActivityProfile
* */
@Hidden
@Argument(fullName = "maxProbPropagationDistance", shortName = "maxProbPropDist", minValue = 0, doc="Region probability propagation distance beyond it's maximum size.", required = false)
public Integer maxProbPropagationDistance = 50;
@Advanced
@Argument(fullName = "activeProbabilityThreshold", shortName = "ActProbThresh", minValue = 0.0, maxValue = 1.0, doc="Threshold for the probability of a profile state being active.", required = false)
public Double activeProbThreshold = 0.002;
private GenomeLocSortedSet presetActiveRegions = null;
@Override

View File

@ -41,13 +41,13 @@ import java.util.*;
* @since Date created
*/
public class ActivityProfile {
private final static int MAX_PROB_PROPAGATION_DISTANCE = 50;
protected final static double ACTIVE_PROB_THRESHOLD = 0.002; // TODO: needs to be set-able by the walker author
protected final List<ActivityProfileState> stateList;
protected final GenomeLocParser parser;
protected final GenomeLocSortedSet restrictToIntervals;
protected final int maxProbPropagationDistance;
protected final double activeProbThreshold;
protected GenomeLoc regionStartLoc = null;
protected GenomeLoc regionStopLoc = null;
@ -60,22 +60,28 @@ public class ActivityProfile {
/**
* Create a new empty ActivityProfile
* @param parser the parser we can use to create genome locs, cannot be null
* @param maxProbPropagationDistance region probability propagation distance beyond it's maximum size
* @param activeProbThreshold threshold for the probability of am active profile state being active
*/
public ActivityProfile(final GenomeLocParser parser) {
this(parser, null);
public ActivityProfile(final GenomeLocParser parser, final int maxProbPropagationDistance, final double activeProbThreshold) {
this(parser, maxProbPropagationDistance, activeProbThreshold, null);
}
/**
* Create a empty ActivityProfile, restricting output to profiles overlapping intervals, if not null
* @param parser the parser we can use to create genome locs, cannot be null
* @param maxProbPropagationDistance region probability propagation distance beyond it's maximum size
* @param activeProbThreshold threshold for the probability of a profile state being active
* @param intervals only include states that are within these intervals, if not null
*/
public ActivityProfile(final GenomeLocParser parser, final GenomeLocSortedSet intervals) {
public ActivityProfile(final GenomeLocParser parser, final int maxProbPropagationDistance, final double activeProbThreshold, final GenomeLocSortedSet intervals) {
if ( parser == null ) throw new IllegalArgumentException("parser cannot be null");
this.parser = parser;
this.stateList = new ArrayList<ActivityProfileState>();
this.restrictToIntervals = intervals;
this.maxProbPropagationDistance = maxProbPropagationDistance;
this.activeProbThreshold = activeProbThreshold;
}
@Override
@ -98,7 +104,7 @@ public class ActivityProfile {
*/
@Ensures("result >= 0")
public int getMaxProbPropagationDistance() {
return MAX_PROB_PROPAGATION_DISTANCE;
return maxProbPropagationDistance;
}
/**
@ -359,7 +365,7 @@ public class ActivityProfile {
}
final ActivityProfileState first = stateList.get(0);
final boolean isActiveRegion = first.isActiveProb > ACTIVE_PROB_THRESHOLD;
final boolean isActiveRegion = first.isActiveProb > activeProbThreshold;
final int offsetOfNextRegionEnd = findEndOfRegion(isActiveRegion, minRegionSize, maxRegionSize, forceConversion);
if ( offsetOfNextRegionEnd == -1 )
// couldn't find a valid ending offset, so we return null
@ -453,7 +459,7 @@ public class ActivityProfile {
* Find the first index into the state list where the state is considered ! isActiveRegion
*
* Note that each state has a probability of being active, and this function thresholds that
* value on ACTIVE_PROB_THRESHOLD, coloring each state as active or inactive. Finds the
* value on activeProbThreshold, coloring each state as active or inactive. Finds the
* largest contiguous stretch of states starting at the first state (index 0) with the same isActive
* state as isActiveRegion. If the entire state list has the same isActive value, then returns
* maxRegionSize
@ -470,7 +476,7 @@ public class ActivityProfile {
int endOfActiveRegion = 0;
while ( endOfActiveRegion < nStates && endOfActiveRegion < maxRegionSize ) {
if ( getProb(endOfActiveRegion) > ACTIVE_PROB_THRESHOLD != isActiveRegion ) {
if ( getProb(endOfActiveRegion) > activeProbThreshold != isActiveRegion ) {
break;
}
endOfActiveRegion++;

View File

@ -55,19 +55,22 @@ public class BandPassActivityProfile extends ActivityProfile {
/**
* Create a new BandPassActivityProfile with default sigma and filter sizes
*
* @see #BandPassActivityProfile(org.broadinstitute.gatk.utils.GenomeLocParser, org.broadinstitute.gatk.utils.GenomeLocSortedSet, int, double, boolean)
* @see #BandPassActivityProfile(org.broadinstitute.gatk.utils.GenomeLocParser, org.broadinstitute.gatk.utils.GenomeLocSortedSet, int, double, int, double, boolean)
*/
public BandPassActivityProfile(final GenomeLocParser parser, final GenomeLocSortedSet restrictToIntervals) {
this(parser, restrictToIntervals, MAX_FILTER_SIZE, DEFAULT_SIGMA, true);
public BandPassActivityProfile(final GenomeLocParser parser, final GenomeLocSortedSet restrictToIntervals,
final int maxProbPropagationDistance, final double activeProbThreshold) {
this(parser, restrictToIntervals, maxProbPropagationDistance, activeProbThreshold, MAX_FILTER_SIZE, DEFAULT_SIGMA);
}
/**
* @see #BandPassActivityProfile(org.broadinstitute.gatk.utils.GenomeLocParser, org.broadinstitute.gatk.utils.GenomeLocSortedSet, int, double, boolean)
* @see #BandPassActivityProfile(org.broadinstitute.gatk.utils.GenomeLocParser, org.broadinstitute.gatk.utils.GenomeLocSortedSet, int, double, int, double, boolean)
*
* sets adaptiveFilterSize to true
*/
public BandPassActivityProfile(final GenomeLocParser parser, final GenomeLocSortedSet restrictToIntervals, final int maxFilterSize, final double sigma) {
this(parser, restrictToIntervals, maxFilterSize, sigma, true);
public BandPassActivityProfile(final GenomeLocParser parser, final GenomeLocSortedSet restrictToIntervals,
final int maxProbPropagationDistance, final double activeProbThreshold,
final int maxFilterSize, final double sigma) {
this(parser, restrictToIntervals, maxProbPropagationDistance, activeProbThreshold, maxFilterSize, sigma, true);
}
/**
@ -75,12 +78,15 @@ public class BandPassActivityProfile extends ActivityProfile {
*
* @param parser our genome loc parser
* @param restrictToIntervals only include states that are within these intervals, if not null
* @param maxProbPropagationDistance region probability propagation distance beyond it's maximum size
* @param activeProbThreshold threshold for the probability of a profile state being active
* @param maxFilterSize the maximum size of the band pass filter we are allowed to create, regardless of sigma
* @param sigma the variance of the Gaussian kernel for this band pass filter
* @param adaptiveFilterSize if true, use the kernel itself to determine the best filter size
*/
public BandPassActivityProfile(final GenomeLocParser parser, final GenomeLocSortedSet restrictToIntervals, final int maxFilterSize, final double sigma, final boolean adaptiveFilterSize) {
super(parser, restrictToIntervals);
public BandPassActivityProfile(final GenomeLocParser parser, final GenomeLocSortedSet restrictToIntervals, final int maxProbPropagationDistance,
final double activeProbThreshold, final int maxFilterSize, final double sigma, final boolean adaptiveFilterSize) {
super(parser, maxProbPropagationDistance, activeProbThreshold, restrictToIntervals);
if ( sigma < 0 ) throw new IllegalArgumentException("Sigma must be greater than or equal to 0 but got " + sigma);

View File

@ -51,6 +51,9 @@ public class ActivityProfileUnitTest extends BaseTest {
private GenomeLocParser genomeLocParser;
private GenomeLoc startLoc;
private final static int MAX_PROB_PROPAGATION_DISTANCE = 50;
private final static double ACTIVE_PROB_THRESHOLD= 0.002;
@BeforeClass
public void init() throws FileNotFoundException {
// sequence
@ -86,10 +89,10 @@ public class ActivityProfileUnitTest extends BaseTest {
public ActivityProfile makeProfile() {
switch ( type ) {
case Base: return new ActivityProfile(genomeLocParser);
case Base: return new ActivityProfile(genomeLocParser, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD);
case BandPass:
// zero size => equivalent to ActivityProfile
return new BandPassActivityProfile(genomeLocParser, null, 0, 0.01, false);
return new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD, 0, 0.01, false);
default: throw new IllegalStateException(type.toString());
}
}
@ -230,7 +233,7 @@ public class ActivityProfileUnitTest extends BaseTest {
@Test(enabled = !DEBUG, dataProvider = "RegionCreationTests")
public void testRegionCreation(final int start, final List<Boolean> probs, int maxRegionSize, final int nParts, final boolean forceConversion, final boolean waitUntilEnd) {
final ActivityProfile profile = new ActivityProfile(genomeLocParser);
final ActivityProfile profile = new ActivityProfile(genomeLocParser, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD);
Assert.assertNotNull(profile.toString());
final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName();
@ -316,7 +319,7 @@ public class ActivityProfileUnitTest extends BaseTest {
@Test(enabled = ! DEBUG, dataProvider = "SoftClipsTest")
public void testSoftClips(final int start, int nPrecedingSites, final int softClipSize) {
final ActivityProfile profile = new ActivityProfile(genomeLocParser);
final ActivityProfile profile = new ActivityProfile(genomeLocParser, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD);
final int contigLength = genomeLocParser.getContigs().getSequences().get(0).getSequenceLength();
final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName();
@ -450,7 +453,7 @@ public class ActivityProfileUnitTest extends BaseTest {
private double[] makeGaussian(final int mean, final int range, final double sigma) {
final double[] gauss = new double[range];
for( int iii = 0; iii < range; iii++ ) {
gauss[iii] = MathUtils.normalDistribution(mean, sigma, iii) + ActivityProfile.ACTIVE_PROB_THRESHOLD;
gauss[iii] = MathUtils.normalDistribution(mean, sigma, iii) + ACTIVE_PROB_THRESHOLD;
}
return gauss;
}
@ -469,7 +472,7 @@ public class ActivityProfileUnitTest extends BaseTest {
@Test(dataProvider = "ActiveRegionCutTests")
public void testActiveRegionCutTests(final int minRegionSize, final int maxRegionSize, final int expectedRegionSize, final List<Double> probs) {
final ActivityProfile profile = new ActivityProfile(genomeLocParser);
final ActivityProfile profile = new ActivityProfile(genomeLocParser, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD);
final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName();
for ( int i = 0; i <= maxRegionSize + profile.getMaxProbPropagationDistance(); i++ ) {

View File

@ -59,6 +59,9 @@ public class BandPassActivityProfileUnitTest extends BaseTest {
private final static boolean DEBUG = false;
private GenomeLocParser genomeLocParser;
private final static int MAX_PROB_PROPAGATION_DISTANCE = 50;
private final static double ACTIVE_PROB_THRESHOLD= 0.002;
@BeforeClass
public void init() throws FileNotFoundException {
// sequence
@ -91,7 +94,7 @@ public class BandPassActivityProfileUnitTest extends BaseTest {
@Test(enabled = ! DEBUG, dataProvider = "BandPassBasicTest")
public void testBandPass(final int start, final boolean precedingIsActive, final int nPrecedingSites, final int bandPassSize, final double sigma) {
final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, bandPassSize, sigma, false);
final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD, bandPassSize, sigma, false);
final int expectedBandSize = bandPassSize * 2 + 1;
Assert.assertEquals(profile.getFilteredSize(), bandPassSize, "Wrong filter size");
@ -155,7 +158,8 @@ public class BandPassActivityProfileUnitTest extends BaseTest {
@Test( enabled = ! DEBUG, dataProvider = "BandPassComposition")
public void testBandPassComposition(final int bandPassSize, final int integrationLength) {
final int start = 1;
final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, bandPassSize, BandPassActivityProfile.DEFAULT_SIGMA);
final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE,
ACTIVE_PROB_THRESHOLD, bandPassSize, BandPassActivityProfile.DEFAULT_SIGMA);
final double[] rawActiveProbs = new double[integrationLength + bandPassSize * 2];
// add a buffer so that we can get all of the band pass values
@ -228,7 +232,8 @@ public class BandPassActivityProfileUnitTest extends BaseTest {
@Test( enabled = ! DEBUG, dataProvider = "KernelCreation")
public void testKernelCreation(final double sigma, final int maxSize, final double[] expectedKernel) {
final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, maxSize, sigma, true);
final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD,
maxSize, sigma, true);
final double[] kernel = profile.getKernel();
Assert.assertEquals(kernel.length, expectedKernel.length);
@ -268,8 +273,8 @@ public class BandPassActivityProfileUnitTest extends BaseTest {
final Pair<VCFHeader, GATKVCFUtils.VCIterable<LineIterator>> reader = GATKVCFUtils.readAllVCs(file, codec);
final List<ActiveRegion> incRegions = new ArrayList<ActiveRegion>();
final BandPassActivityProfile incProfile = new BandPassActivityProfile(genomeLocParser, null);
final BandPassActivityProfile fullProfile = new BandPassActivityProfile(genomeLocParser, null);
final BandPassActivityProfile incProfile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD);
final BandPassActivityProfile fullProfile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD);
int pos = start;
for ( final VariantContext vc : reader.getSecond() ) {
if ( vc == null ) continue;
@ -324,10 +329,10 @@ public class BandPassActivityProfileUnitTest extends BaseTest {
lastPosSeen = region.getLocation().getStop();
for ( final ActivityProfileState state : region.getSupportingStates() ) {
Assert.assertEquals(state.isActiveProb > ActivityProfile.ACTIVE_PROB_THRESHOLD, region.isActive(),
Assert.assertEquals(state.isActiveProb > ACTIVE_PROB_THRESHOLD, region.isActive(),
"Region is active=" + region.isActive() + " but contains a state " + state + " with prob "
+ state.isActiveProb + " not within expected values given threshold for activity of "
+ ActivityProfile.ACTIVE_PROB_THRESHOLD);
+ ACTIVE_PROB_THRESHOLD);
}
}
}