diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java index 90e6fcd77..3046ef925 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java @@ -42,50 +42,190 @@ import java.io.PrintStream; /** * Emits a data file containing information about callable, uncallable, poorly mapped, and other parts of the genome * - * @Author depristo - * @Date May 7, 2010 + *

+ * A very common question about a NGS set of reads is what areas of the genome are considered callable. The system + * considers the coverage at each locus and emits either a per base state or a summary interval BED file that + * partitions the genomic intervals into the following callable states: + *

+ *
REF_N
+ *
the reference base was an N, which is not considered callable the GATK
+ *
CALLABLE
+ *
the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
+ *
NO_COVERAGE
+ *
absolutely no reads were seen at this locus, regardless of the filtering parameters
+ *
LOW_COVERAGE
+ *
there were less than min. depth bases at the locus, after applying filters
+ *
EXCESSIVE_COVERAGE
+ *
more than -maxDepth read at the locus, indicating some sort of mapping problem
+ *
POOR_MAPPING_QUALITY
+ *
more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
+ *
+ *

+ * + *

Input

+ *

+ * A BAM file containing exactly one sample. + *

+ * + *

Output

+ *

+ *

+ *

+ * + *

Examples

+ *
+ *     -T CallableLociWalker \
+ *     -I my.bam \
+ *     -summary my.summary \
+ *     -o my.bed
+ * 
+ * + * would produce a BED file (my.bed) that looks like: + * + *
+ *     20 10000000 10000864 CALLABLE
+ *     20 10000865 10000985 POOR_MAPPING_QUALITY
+ *     20 10000986 10001138 CALLABLE
+ *     20 10001139 10001254 POOR_MAPPING_QUALITY
+ *     20 10001255 10012255 CALLABLE
+ *     20 10012256 10012259 POOR_MAPPING_QUALITY
+ *     20 10012260 10012263 CALLABLE
+ *     20 10012264 10012328 POOR_MAPPING_QUALITY
+ *     20 10012329 10012550 CALLABLE
+ *     20 10012551 10012551 LOW_COVERAGE
+ *     20 10012552 10012554 CALLABLE
+ *     20 10012555 10012557 LOW_COVERAGE
+ *     20 10012558 10012558 CALLABLE
+ *     et cetera...
+ * 
+ * as well as a summary table that looks like: + * + *
+ *                        state nBases
+ *                        REF_N 0
+ *                     CALLABLE 996046
+ *                  NO_COVERAGE 121
+ *                 LOW_COVERAGE 928
+ *           EXCESSIVE_COVERAGE 0
+ *         POOR_MAPPING_QUALITY 2906
+ * 
+ * + * @author Mark DePristo + * @since May 7, 2010 */ @By(DataSource.REFERENCE) public class CallableLociWalker extends LocusWalker { @Output PrintStream out; - @Argument(fullName = "maxLowMAPQ", shortName = "mlmq", doc = "Maximum value for MAPQ to be considered a problematic mapped read. The gap between this value and mmq are reads that are not sufficiently well mapped for calling but aren't indicative of mapping problems.", required = false) + /** + * Callable loci summary counts (see outputs) will be written to this file. + */ + @Output(fullName = "summary", shortName = "summary", doc = "Name of file for output summary", required = true) + File summaryFile; + + /** + * The gap between this value and mmq are reads that are not sufficiently well mapped for calling but + * aren't indicative of mapping problems. For example, if maxLowMAPQ = 1 and mmq = 20, then reads with + * MAPQ == 0 are poorly mapped, MAPQ >= 20 are considered as contributing to calling, where + * reads with MAPQ >= 1 and < 20 are not bad in and of themselves but aren't sufficiently good to contribute to + * calling. In effect this reads are invisible, driving the base to the NO_ or LOW_COVERAGE states + */ + @Argument(fullName = "maxLowMAPQ", shortName = "mlmq", doc = "Maximum value for MAPQ to be considered a problematic mapped read.", required = false) byte maxLowMAPQ = 1; - @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to 50.", required = false) + /** + * Reads with MAPQ > minMappingQuality are treated as usable for variation detection, contributing to the CALLABLE + * state. + */ + @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth.", required = false) byte minMappingQuality = 10; - @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to 20.", required = false) + /** + * Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the CALLABLE state + */ + @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth.", required = false) byte minBaseQuality = 20; + /** + * If the number of QC+ bases (on reads with MAPQ > minMappingQuality and with base quality > minBaseQuality) exceeds this + * value and is less than maxDepth the site is considered CALLABLE. + */ @Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false) int minDepth = 4; + /** + * If the QC+ depth exceeds this value the site is considered to have EXCESSIVE_DEPTH + */ @Argument(fullName = "maxDepth", shortName = "maxDepth", doc = "Maximum read depth before a locus is considered poorly mapped", required = false) int maxDepth = -1; + /** + * We don't want to consider a site as POOR_MAPPING_QUALITY just because it has two reads, and one is MAPQ. We + * won't assign a site to the POOR_MAPPING_QUALITY state unless there are at least minDepthForLowMAPQ reads + * covering the site. + */ @Argument(fullName = "minDepthForLowMAPQ", shortName = "mdflmq", doc = "Minimum read depth before a locus is considered a potential candidate for poorly mapped", required = false) int minDepthLowMAPQ = 10; - @Argument(fullName = "maxFractionOfReadsWithLowMAPQ", shortName = "frlmq", doc = "Maximum read depth before a locus is considered poorly mapped", required = false) + /** + * If the number of reads at this site is greater than minDepthForLowMAPQ and the fraction of reads with low mapping quality + * exceeds this fraction then the site has POOR_MAPPING_QUALITY. + */ + @Argument(fullName = "maxFractionOfReadsWithLowMAPQ", shortName = "frlmq", doc = "If the fraction of reads at a base with low mapping quality exceeds this value, the site may be poorly mapped", required = false) double maxLowMAPQFraction = 0.1; - @Argument(fullName = "format", shortName = "format", doc = "Output format for the system: either BED or STATE_PER_BASE", required = false) + /** + * The output of this walker will be written in this format. The recommended option is BED. + */ + @Argument(fullName = "format", shortName = "format", doc = "Output format", required = false) OutputFormat outputFormat; - @Argument(fullName = "summary", shortName = "summary", doc = "Name of file for output summary", required = true) - File summaryFile; + public enum OutputFormat { + /** + * The output will be written as a BED file. There's a BED element for each + * continuous run of callable states (i.e., CALLABLE, REF_N, etc). This is the recommended + * format + */ + BED, - public enum OutputFormat { BED, STATE_PER_BASE } + /** + * Emit chr start stop state quads for each base. Produces a potentially disasterously + * large amount of output. + */ + STATE_PER_BASE + } + + public enum CalledState { + /** the reference base was an N, which is not considered callable the GATK */ + REF_N, + /** the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE */ + CALLABLE, + /** absolutely no reads were seen at this locus, regardless of the filtering parameters */ + NO_COVERAGE, + /** there were less than min. depth bases at the locus, after applying filters */ + LOW_COVERAGE, + /** more than -maxDepth read at the locus, indicating some sort of mapping problem */ + EXCESSIVE_COVERAGE, + /** more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads */ + POOR_MAPPING_QUALITY + } //////////////////////////////////////////////////////////////////////////////////// // STANDARD WALKER METHODS //////////////////////////////////////////////////////////////////////////////////// + @Override public boolean includeReadsWithDeletionAtLoci() { return true; } + @Override public void initialize() { + if ( getToolkit().getSamples().size() > 1 ) + throw new UserException.BadArgumentValue("-I", "CallableLoci only works for a single sample, but multiple samples were found in the provided BAM files: " + getToolkit().getSamples()); + try { PrintStream summaryOut = new PrintStream(summaryFile); summaryOut.close(); @@ -94,15 +234,15 @@ public class CallableLociWalker extends LocusWalker