DoC now properly handles reference N bases + misc. additional cleanups
-- DoC now by default ignores bases with reference Ns, so these are not included in the coverage calculations at any stage. -- Added option --includeRefNSites that will include them in the calculation -- Added integration tests that ensures the per base tables (and so all subsequent calculations) work with and without reference N bases included -- Reorganized command line options, tagging advanced options with @Advanced
This commit is contained in:
parent
50de1a3eab
commit
c8a06e53c1
|
|
@ -26,6 +26,7 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.coverage;
|
package org.broadinstitute.sting.gatk.walkers.coverage;
|
||||||
|
|
||||||
import net.sf.samtools.SAMReadGroupRecord;
|
import net.sf.samtools.SAMReadGroupRecord;
|
||||||
|
import org.broadinstitute.sting.commandline.Advanced;
|
||||||
import org.broadinstitute.sting.commandline.Argument;
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
import org.broadinstitute.sting.commandline.Output;
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
|
@ -119,21 +120,6 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<DoCOutputType.Partiti
|
||||||
@Multiplex(value=DoCOutputMultiplexer.class,arguments={"partitionTypes","refSeqGeneList","omitDepthOutput","omitIntervals","omitSampleSummary","omitLocusTable"})
|
@Multiplex(value=DoCOutputMultiplexer.class,arguments={"partitionTypes","refSeqGeneList","omitDepthOutput","omitIntervals","omitSampleSummary","omitLocusTable"})
|
||||||
Map<DoCOutputType,PrintStream> out;
|
Map<DoCOutputType,PrintStream> out;
|
||||||
|
|
||||||
/**
|
|
||||||
* Sets the low-coverage cutoff for granular binning. All loci with depth < START are counted in the first bin.
|
|
||||||
*/
|
|
||||||
@Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false)
|
|
||||||
int start = 1;
|
|
||||||
/**
|
|
||||||
* Sets the high-coverage cutoff for granular binning. All loci with depth > END are counted in the last bin.
|
|
||||||
*/
|
|
||||||
@Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false)
|
|
||||||
int stop = 500;
|
|
||||||
/**
|
|
||||||
* Sets the number of bins for granular binning
|
|
||||||
*/
|
|
||||||
@Argument(fullName = "nBins", doc = "Number of bins to use for granular binning", required = false)
|
|
||||||
int nBins = 499;
|
|
||||||
@Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to -1.", required = false)
|
@Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to -1.", required = false)
|
||||||
int minMappingQuality = -1;
|
int minMappingQuality = -1;
|
||||||
@Argument(fullName = "maxMappingQuality", doc = "Maximum mapping quality of reads to count towards depth. Defaults to 2^31-1 (Integer.MAX_VALUE).", required = false)
|
@Argument(fullName = "maxMappingQuality", doc = "Maximum mapping quality of reads to count towards depth. Defaults to 2^31-1 (Integer.MAX_VALUE).", required = false)
|
||||||
|
|
@ -142,16 +128,19 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<DoCOutputType.Partiti
|
||||||
byte minBaseQuality = -1;
|
byte minBaseQuality = -1;
|
||||||
@Argument(fullName = "maxBaseQuality", doc = "Maximum quality of bases to count towards depth. Defaults to 127 (Byte.MAX_VALUE).", required = false)
|
@Argument(fullName = "maxBaseQuality", doc = "Maximum quality of bases to count towards depth. Defaults to 127 (Byte.MAX_VALUE).", required = false)
|
||||||
byte maxBaseQuality = Byte.MAX_VALUE;
|
byte maxBaseQuality = Byte.MAX_VALUE;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Instead of reporting depth, report the base pileup at each locus
|
* Instead of reporting depth, report the base pileup at each locus
|
||||||
*/
|
*/
|
||||||
@Argument(fullName = "printBaseCounts", shortName = "baseCounts", doc = "Will add base counts to per-locus output.", required = false)
|
@Argument(fullName = "printBaseCounts", shortName = "baseCounts", doc = "Will add base counts to per-locus output.", required = false)
|
||||||
boolean printBaseCounts = false;
|
boolean printBaseCounts = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Do not tabulate locus statistics (# loci covered by sample by coverage)
|
* Do not tabulate locus statistics (# loci covered by sample by coverage)
|
||||||
*/
|
*/
|
||||||
@Argument(fullName = "omitLocusTable", shortName = "omitLocusTable", doc = "Will not calculate the per-sample per-depth counts of loci, which should result in speedup", required = false)
|
@Argument(fullName = "omitLocusTable", shortName = "omitLocusTable", doc = "Will not calculate the per-sample per-depth counts of loci, which should result in speedup", required = false)
|
||||||
boolean omitLocusTable = false;
|
boolean omitLocusTable = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Do not tabulate interval statistics (mean, median, quartiles AND # intervals by sample by coverage)
|
* Do not tabulate interval statistics (mean, median, quartiles AND # intervals by sample by coverage)
|
||||||
*/
|
*/
|
||||||
|
|
@ -162,8 +151,52 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<DoCOutputType.Partiti
|
||||||
*/
|
*/
|
||||||
@Argument(fullName = "omitDepthOutputAtEachBase", shortName = "omitBaseOutput", doc = "Will omit the output of the depth of coverage at each base, which should result in speedup", required = false)
|
@Argument(fullName = "omitDepthOutputAtEachBase", shortName = "omitBaseOutput", doc = "Will omit the output of the depth of coverage at each base, which should result in speedup", required = false)
|
||||||
boolean omitDepthOutput = false;
|
boolean omitDepthOutput = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Path to the RefSeq file for use in aggregating coverage statistics over genes
|
||||||
|
*/
|
||||||
|
@Argument(fullName = "calculateCoverageOverGenes", shortName = "geneList", doc = "Calculate the coverage statistics over this list of genes. Currently accepts RefSeq.", required = false)
|
||||||
|
File refSeqGeneList = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The format of the output file
|
||||||
|
*/
|
||||||
|
@Argument(fullName = "outputFormat", doc = "the format of the output file (e.g. csv, table, rtable); defaults to r-readable table", required = false)
|
||||||
|
String outputFormat = "rtable";
|
||||||
|
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Advanced arguments
|
||||||
|
//
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
@Advanced
|
||||||
|
@Argument(fullName = "includeRefNSites", doc = "If provided, sites with reference N bases but with coverage from neighboring reads will be included in DoC calculations.", required = false)
|
||||||
|
boolean includeRefNBases = false;
|
||||||
|
|
||||||
|
@Advanced
|
||||||
@Argument(fullName = "printBinEndpointsAndExit", doc = "Prints the bin values and exits immediately. Use to calibrate what bins you want before running on data.", required = false)
|
@Argument(fullName = "printBinEndpointsAndExit", doc = "Prints the bin values and exits immediately. Use to calibrate what bins you want before running on data.", required = false)
|
||||||
boolean printBinEndpointsAndExit = false;
|
boolean printBinEndpointsAndExit = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the low-coverage cutoff for granular binning. All loci with depth < START are counted in the first bin.
|
||||||
|
*/
|
||||||
|
@Advanced
|
||||||
|
@Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false)
|
||||||
|
int start = 1;
|
||||||
|
/**
|
||||||
|
* Sets the high-coverage cutoff for granular binning. All loci with depth > END are counted in the last bin.
|
||||||
|
*/
|
||||||
|
@Advanced
|
||||||
|
@Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false)
|
||||||
|
int stop = 500;
|
||||||
|
/**
|
||||||
|
* Sets the number of bins for granular binning
|
||||||
|
*/
|
||||||
|
@Advanced
|
||||||
|
@Argument(fullName = "nBins", doc = "Number of bins to use for granular binning", required = false)
|
||||||
|
int nBins = 499;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Do not tabulate the sample summary statistics (total, mean, median, quartile coverage per sample)
|
* Do not tabulate the sample summary statistics (total, mean, median, quartile coverage per sample)
|
||||||
*/
|
*/
|
||||||
|
|
@ -174,27 +207,22 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<DoCOutputType.Partiti
|
||||||
*/
|
*/
|
||||||
@Argument(fullName = "partitionType", shortName = "pt", doc = "Partition type for depth of coverage. Defaults to sample. Can be any combination of sample, readgroup, library.", required = false)
|
@Argument(fullName = "partitionType", shortName = "pt", doc = "Partition type for depth of coverage. Defaults to sample. Can be any combination of sample, readgroup, library.", required = false)
|
||||||
Set<DoCOutputType.Partition> partitionTypes = EnumSet.of(DoCOutputType.Partition.sample);
|
Set<DoCOutputType.Partition> partitionTypes = EnumSet.of(DoCOutputType.Partition.sample);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Consider a spanning deletion as contributing to coverage. Also enables deletion counts in per-base output.
|
* Consider a spanning deletion as contributing to coverage. Also enables deletion counts in per-base output.
|
||||||
*/
|
*/
|
||||||
|
@Advanced
|
||||||
@Argument(fullName = "includeDeletions", shortName = "dels", doc = "Include information on deletions", required = false)
|
@Argument(fullName = "includeDeletions", shortName = "dels", doc = "Include information on deletions", required = false)
|
||||||
boolean includeDeletions = false;
|
boolean includeDeletions = false;
|
||||||
|
|
||||||
|
@Advanced
|
||||||
@Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false)
|
@Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false)
|
||||||
boolean ignoreDeletionSites = false;
|
boolean ignoreDeletionSites = false;
|
||||||
|
|
||||||
/**
|
|
||||||
* Path to the RefSeq file for use in aggregating coverage statistics over genes
|
|
||||||
*/
|
|
||||||
@Argument(fullName = "calculateCoverageOverGenes", shortName = "geneList", doc = "Calculate the coverage statistics over this list of genes. Currently accepts RefSeq.", required = false)
|
|
||||||
File refSeqGeneList = null;
|
|
||||||
/**
|
|
||||||
* The format of the output file
|
|
||||||
*/
|
|
||||||
@Argument(fullName = "outputFormat", doc = "the format of the output file (e.g. csv, table, rtable); defaults to r-readable table", required = false)
|
|
||||||
String outputFormat = "rtable";
|
|
||||||
/**
|
/**
|
||||||
* A coverage threshold for summarizing (e.g. % bases >= CT for each sample)
|
* A coverage threshold for summarizing (e.g. % bases >= CT for each sample)
|
||||||
*/
|
*/
|
||||||
|
@Advanced
|
||||||
@Argument(fullName = "summaryCoverageThreshold", shortName = "ct", doc = "for summary file outputs, report the % of bases coverd to >= this number. Defaults to 15; can take multiple arguments.", required = false)
|
@Argument(fullName = "summaryCoverageThreshold", shortName = "ct", doc = "for summary file outputs, report the % of bases coverd to >= this number. Defaults to 15; can take multiple arguments.", required = false)
|
||||||
int[] coverageThresholds = {15};
|
int[] coverageThresholds = {15};
|
||||||
|
|
||||||
|
|
@ -334,24 +362,29 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<DoCOutputType.Partiti
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<DoCOutputType.Partition,Map<String,int[]>> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
public Map<DoCOutputType.Partition,Map<String,int[]>> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
if (includeRefNBases || BaseUtils.isRegularBase(ref.getBase())) {
|
||||||
|
if ( ! omitDepthOutput ) {
|
||||||
|
getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary).printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives)
|
||||||
|
//System.out.printf("\t[log]\t%s",ref.getLocus());
|
||||||
|
}
|
||||||
|
|
||||||
if ( ! omitDepthOutput ) {
|
return CoverageUtils.getBaseCountsByPartition(context,minMappingQuality,maxMappingQuality,minBaseQuality,maxBaseQuality,partitionTypes);
|
||||||
getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary).printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives)
|
} else {
|
||||||
//System.out.printf("\t[log]\t%s",ref.getLocus());
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
return CoverageUtils.getBaseCountsByPartition(context,minMappingQuality,maxMappingQuality,minBaseQuality,maxBaseQuality,partitionTypes);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public CoveragePartitioner reduce(Map<DoCOutputType.Partition,Map<String,int[]>> thisMap, CoveragePartitioner prevReduce) {
|
public CoveragePartitioner reduce(Map<DoCOutputType.Partition,Map<String,int[]>> thisMap, CoveragePartitioner prevReduce) {
|
||||||
if ( ! omitDepthOutput ) {
|
if ( thisMap != null ) { // skip sites we didn't want to include in the calculation (ref Ns)
|
||||||
//checkOrder(prevReduce); // tests prevReduce.getIdentifiersByType().get(t) against the initialized header order
|
if ( ! omitDepthOutput ) {
|
||||||
printDepths(getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary),thisMap,prevReduce.getIdentifiersByType());
|
//checkOrder(prevReduce); // tests prevReduce.getIdentifiersByType().get(t) against the initialized header order
|
||||||
// this is an additional iteration through thisMap, plus dealing with IO, so should be much slower without
|
printDepths(getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary),thisMap,prevReduce.getIdentifiersByType());
|
||||||
// turning on omit
|
// this is an additional iteration through thisMap, plus dealing with IO, so should be much slower without
|
||||||
}
|
// turning on omit
|
||||||
|
}
|
||||||
|
|
||||||
prevReduce.update(thisMap); // note that in "useBoth" cases, this method alters the thisMap object
|
prevReduce.update(thisMap); // note that in "useBoth" cases, this method alters the thisMap object
|
||||||
|
}
|
||||||
|
|
||||||
return prevReduce;
|
return prevReduce;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -94,4 +94,14 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
execute("testNoCoverageDueToFiltering",spec);
|
execute("testNoCoverageDueToFiltering",spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testRefNHandling(boolean includeNs, final String md5) {
|
||||||
|
String command = "-R " + b37KGReference + " -L 20:26,319,565-26,319,575 -I " + validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -T DepthOfCoverage -baseCounts --omitIntervalStatistics --omitLocusTable --omitPerSampleStats -o %s";
|
||||||
|
if ( includeNs ) command += " --includeRefNSites";
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(command, 1, Arrays.asList(md5));
|
||||||
|
executeTest("Testing DoC " + (includeNs ? "with" : "without") + " reference Ns", spec);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testRefNWithNs() { testRefNHandling(true, "24cd2da2e4323ce6fd76217ba6dc2834"); }
|
||||||
|
@Test public void testRefNWithoutNs() { testRefNHandling(false, "4fc0f1a2e968f777d693abcefd4fb7af"); }
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue