From 2e35592295267f9832647a43658e6972a290289c Mon Sep 17 00:00:00 2001
From: Mark DePristo
Date: Wed, 17 Aug 2011 16:32:01 -0400
Subject: [PATCH] GATKDocs for CallableLoci
---
.../walkers/coverage/CallableLociWalker.java | 185 +++++++++++++++---
1 file changed, 163 insertions(+), 22 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java
index 90e6fcd77..3046ef925 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java
@@ -42,50 +42,190 @@ import java.io.PrintStream;
/**
* Emits a data file containing information about callable, uncallable, poorly mapped, and other parts of the genome
*
- * @Author depristo
- * @Date May 7, 2010
+ *
+ * A very common question about a NGS set of reads is what areas of the genome are considered callable. The system
+ * considers the coverage at each locus and emits either a per base state or a summary interval BED file that
+ * partitions the genomic intervals into the following callable states:
+ *
+ * - REF_N
+ * - the reference base was an N, which is not considered callable the GATK
+ * - CALLABLE
+ * - the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
+ * - NO_COVERAGE
+ * - absolutely no reads were seen at this locus, regardless of the filtering parameters
+ * - LOW_COVERAGE
+ * - there were less than min. depth bases at the locus, after applying filters
+ * - EXCESSIVE_COVERAGE
+ * - more than -maxDepth read at the locus, indicating some sort of mapping problem
+ * - POOR_MAPPING_QUALITY
+ * - more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
+ *
+ *
+ *
+ * Input
+ *
+ * A BAM file containing exactly one sample.
+ *
+ *
+ * Output
+ *
+ *
+ * - -o: a OutputFormatted (recommended BED) file with the callable status covering each base
+ * - -summary: a table of callable status x count of all examined bases
+ *
+ *
+ *
+ * Examples
+ *
+ * -T CallableLociWalker \
+ * -I my.bam \
+ * -summary my.summary \
+ * -o my.bed
+ *
+ *
+ * would produce a BED file (my.bed) that looks like:
+ *
+ *
+ * 20 10000000 10000864 CALLABLE
+ * 20 10000865 10000985 POOR_MAPPING_QUALITY
+ * 20 10000986 10001138 CALLABLE
+ * 20 10001139 10001254 POOR_MAPPING_QUALITY
+ * 20 10001255 10012255 CALLABLE
+ * 20 10012256 10012259 POOR_MAPPING_QUALITY
+ * 20 10012260 10012263 CALLABLE
+ * 20 10012264 10012328 POOR_MAPPING_QUALITY
+ * 20 10012329 10012550 CALLABLE
+ * 20 10012551 10012551 LOW_COVERAGE
+ * 20 10012552 10012554 CALLABLE
+ * 20 10012555 10012557 LOW_COVERAGE
+ * 20 10012558 10012558 CALLABLE
+ * et cetera...
+ *
+ * as well as a summary table that looks like:
+ *
+ *
+ * state nBases
+ * REF_N 0
+ * CALLABLE 996046
+ * NO_COVERAGE 121
+ * LOW_COVERAGE 928
+ * EXCESSIVE_COVERAGE 0
+ * POOR_MAPPING_QUALITY 2906
+ *
+ *
+ * @author Mark DePristo
+ * @since May 7, 2010
*/
@By(DataSource.REFERENCE)
public class CallableLociWalker extends LocusWalker {
@Output
PrintStream out;
- @Argument(fullName = "maxLowMAPQ", shortName = "mlmq", doc = "Maximum value for MAPQ to be considered a problematic mapped read. The gap between this value and mmq are reads that are not sufficiently well mapped for calling but aren't indicative of mapping problems.", required = false)
+ /**
+ * Callable loci summary counts (see outputs) will be written to this file.
+ */
+ @Output(fullName = "summary", shortName = "summary", doc = "Name of file for output summary", required = true)
+ File summaryFile;
+
+ /**
+ * The gap between this value and mmq are reads that are not sufficiently well mapped for calling but
+ * aren't indicative of mapping problems. For example, if maxLowMAPQ = 1 and mmq = 20, then reads with
+ * MAPQ == 0 are poorly mapped, MAPQ >= 20 are considered as contributing to calling, where
+ * reads with MAPQ >= 1 and < 20 are not bad in and of themselves but aren't sufficiently good to contribute to
+ * calling. In effect this reads are invisible, driving the base to the NO_ or LOW_COVERAGE states
+ */
+ @Argument(fullName = "maxLowMAPQ", shortName = "mlmq", doc = "Maximum value for MAPQ to be considered a problematic mapped read.", required = false)
byte maxLowMAPQ = 1;
- @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to 50.", required = false)
+ /**
+ * Reads with MAPQ > minMappingQuality are treated as usable for variation detection, contributing to the CALLABLE
+ * state.
+ */
+ @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth.", required = false)
byte minMappingQuality = 10;
- @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to 20.", required = false)
+ /**
+ * Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the CALLABLE state
+ */
+ @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth.", required = false)
byte minBaseQuality = 20;
+ /**
+ * If the number of QC+ bases (on reads with MAPQ > minMappingQuality and with base quality > minBaseQuality) exceeds this
+ * value and is less than maxDepth the site is considered CALLABLE.
+ */
@Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false)
int minDepth = 4;
+ /**
+ * If the QC+ depth exceeds this value the site is considered to have EXCESSIVE_DEPTH
+ */
@Argument(fullName = "maxDepth", shortName = "maxDepth", doc = "Maximum read depth before a locus is considered poorly mapped", required = false)
int maxDepth = -1;
+ /**
+ * We don't want to consider a site as POOR_MAPPING_QUALITY just because it has two reads, and one is MAPQ. We
+ * won't assign a site to the POOR_MAPPING_QUALITY state unless there are at least minDepthForLowMAPQ reads
+ * covering the site.
+ */
@Argument(fullName = "minDepthForLowMAPQ", shortName = "mdflmq", doc = "Minimum read depth before a locus is considered a potential candidate for poorly mapped", required = false)
int minDepthLowMAPQ = 10;
- @Argument(fullName = "maxFractionOfReadsWithLowMAPQ", shortName = "frlmq", doc = "Maximum read depth before a locus is considered poorly mapped", required = false)
+ /**
+ * If the number of reads at this site is greater than minDepthForLowMAPQ and the fraction of reads with low mapping quality
+ * exceeds this fraction then the site has POOR_MAPPING_QUALITY.
+ */
+ @Argument(fullName = "maxFractionOfReadsWithLowMAPQ", shortName = "frlmq", doc = "If the fraction of reads at a base with low mapping quality exceeds this value, the site may be poorly mapped", required = false)
double maxLowMAPQFraction = 0.1;
- @Argument(fullName = "format", shortName = "format", doc = "Output format for the system: either BED or STATE_PER_BASE", required = false)
+ /**
+ * The output of this walker will be written in this format. The recommended option is BED.
+ */
+ @Argument(fullName = "format", shortName = "format", doc = "Output format", required = false)
OutputFormat outputFormat;
- @Argument(fullName = "summary", shortName = "summary", doc = "Name of file for output summary", required = true)
- File summaryFile;
+ public enum OutputFormat {
+ /**
+ * The output will be written as a BED file. There's a BED element for each
+ * continuous run of callable states (i.e., CALLABLE, REF_N, etc). This is the recommended
+ * format
+ */
+ BED,
- public enum OutputFormat { BED, STATE_PER_BASE }
+ /**
+ * Emit chr start stop state quads for each base. Produces a potentially disasterously
+ * large amount of output.
+ */
+ STATE_PER_BASE
+ }
+
+ public enum CalledState {
+ /** the reference base was an N, which is not considered callable the GATK */
+ REF_N,
+ /** the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE */
+ CALLABLE,
+ /** absolutely no reads were seen at this locus, regardless of the filtering parameters */
+ NO_COVERAGE,
+ /** there were less than min. depth bases at the locus, after applying filters */
+ LOW_COVERAGE,
+ /** more than -maxDepth read at the locus, indicating some sort of mapping problem */
+ EXCESSIVE_COVERAGE,
+ /** more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads */
+ POOR_MAPPING_QUALITY
+ }
////////////////////////////////////////////////////////////////////////////////////
// STANDARD WALKER METHODS
////////////////////////////////////////////////////////////////////////////////////
+ @Override
public boolean includeReadsWithDeletionAtLoci() { return true; }
+ @Override
public void initialize() {
+ if ( getToolkit().getSamples().size() > 1 )
+ throw new UserException.BadArgumentValue("-I", "CallableLoci only works for a single sample, but multiple samples were found in the provided BAM files: " + getToolkit().getSamples());
+
try {
PrintStream summaryOut = new PrintStream(summaryFile);
summaryOut.close();
@@ -94,15 +234,15 @@ public class CallableLociWalker extends LocusWalker