Add downsampling arguments to Mutect

- maxReadsInRegionPErSample, minReadsPerAlignment start are now M2 args
	- maxReadsInMemoryPerSample is an activeRegionWalker arg
This commit is contained in:
David Benjamin 2016-10-19 14:16:14 -04:00 committed by Geraldine Van der Auwera
parent 66aa735817
commit b1f2301dd3
4 changed files with 30 additions and 20 deletions

View File

@ -200,8 +200,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
private double log10GlobalReadMismappingRate;
private static final int REFERENCE_PADDING = 500;
private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6;
private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument
@ArgumentCollection
protected M2ArgumentCollection MTAC = new M2ArgumentCollection();
@ -293,6 +291,16 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
@Argument(fullName="doNotRunPhysicalPhasing", shortName="doNotRunPhysicalPhasing", doc="Disable physical phasing", required = false)
protected boolean doNotRunPhysicalPhasing = false;
/**
* When downsampling, level the coverage of the reads in each sample to no more than maxReadsInRegionPerSample reads,
* not reducing coverage at any read start to less than minReadsPerAlignmentStart
*/
@Argument(fullName = "maxReadsInRegionPerSample", shortName = "maxReadsInRegionPerSample", doc="Maximum reads in an active region", required = false)
protected int maxReadsInRegionPerSample = 1000;
@Argument(fullName = "minReadsPerAlignmentStart", shortName = "minReadsPerAlignStart", doc="Minimum number of reads sharing the same alignment start for each genomic location in an active region", required = false)
protected int minReadsPerAlignmentStart = 5;
public RodBinding<VariantContext> getDbsnpRodBinding() { return MTAC.dbsnp.dbsnp; }
@Override

View File

@ -162,8 +162,8 @@ public final class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Activ
activityProfile = new BandPassActivityProfile(engine.getGenomeLocParser(), engine.getIntervals(), this.walker.maxProbPropagationDistance, this.walker.activeProbThreshold,
BandPassActivityProfile.MAX_FILTER_SIZE, bandPassSigma);
final int maxReadsAcrossSamples = annotation.maxReadsToHoldInMemoryPerSample() * ReadUtils.getSAMFileSamples(engine.getSAMFileHeader()).size();
final int maxReadsToHoldInMemory = Math.min(maxReadsAcrossSamples, annotation.maxReadsToHoldTotal());
final int maxReadsAcrossSamples = this.walker.maxReadsInMemoryPerSample * ReadUtils.getSAMFileSamples(engine.getSAMFileHeader()).size();
final int maxReadsToHoldInMemory = Math.min(maxReadsAcrossSamples, this.walker.maxTotalReadsInMemory);
myReads = new TAROrderedReadCache(maxReadsToHoldInMemory);
}

View File

@ -78,20 +78,4 @@ public @interface ActiveRegionTraversalParameters {
* @return the breadth of the band pass gaussian kernel we want for our traversal
*/
public double bandPassSigma() default BandPassActivityProfile.DEFAULT_SIGMA;
/**
* What is the maximum number of reads we're willing to hold in memory per sample
* during the traversal? This limits our exposure to unusually large amounts
* of coverage in the engine.
* @return the maximum number of reads we're willing to hold in memory
*/
public int maxReadsToHoldInMemoryPerSample() default 30000;
/**
* No matter what the per sample value says, we will never hold more than this
* number of reads in memory at any time. Provides an upper bound on the total number
* of reads in the case where we have a lot of samples.
* @return the maximum number of reads to hold in memory
*/
public int maxReadsToHoldTotal() default 10000000;
}

View File

@ -108,6 +108,24 @@ public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<Map
@Argument(fullName="bandPassSigma", shortName="bandPassSigma", doc="The sigma of the band pass filter Gaussian kernel; if not provided defaults to Walker annotated default", required = false)
public Double bandPassSigma = null;
/**
* What is the maximum number of reads we're willing to hold in memory per sample
* during the traversal? This limits our exposure to unusually large amounts
* of coverage in the engine.
*/
@Advanced
@Argument(fullName="maxReadsInMemoryPerSample", shortName="maxReadsInMemoryPerSample", doc="Maximum reads per sample given to traversal map() function", required = false)
public int maxReadsInMemoryPerSample = 30000;
/**
* What is the maximum number of reads we're willing to hold in memory per sample
* during the traversal? This limits our exposure to unusually large amounts
* of coverage in the engine.
*/
@Advanced
@Argument(fullName="maxTotalReadsInMemory", shortName="maxTotalReadsInMemory", doc="Maximum total reads given to traversal map() function", required = false)
public int maxTotalReadsInMemory = 10000000;
/*
* For active region limits in ActivityProfile
* */