GATKDocs for CallableLoci
This commit is contained in:
parent
fdd46a5e99
commit
2e35592295
|
|
@ -42,50 +42,190 @@ import java.io.PrintStream;
|
||||||
/**
|
/**
|
||||||
* Emits a data file containing information about callable, uncallable, poorly mapped, and other parts of the genome
|
* Emits a data file containing information about callable, uncallable, poorly mapped, and other parts of the genome
|
||||||
*
|
*
|
||||||
* @Author depristo
|
* <p>
|
||||||
* @Date May 7, 2010
|
* A very common question about a NGS set of reads is what areas of the genome are considered callable. The system
|
||||||
|
* considers the coverage at each locus and emits either a per base state or a summary interval BED file that
|
||||||
|
* partitions the genomic intervals into the following callable states:
|
||||||
|
* <dl>
|
||||||
|
* <dt>REF_N</dt>
|
||||||
|
* <dd>the reference base was an N, which is not considered callable the GATK</dd>
|
||||||
|
* <dt>CALLABLE</dt>
|
||||||
|
* <dd>the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE</dd>
|
||||||
|
* <dt>NO_COVERAGE</dt>
|
||||||
|
* <dd>absolutely no reads were seen at this locus, regardless of the filtering parameters</dd>
|
||||||
|
* <dt>LOW_COVERAGE</dt>
|
||||||
|
* <dd>there were less than min. depth bases at the locus, after applying filters</dd>
|
||||||
|
* <dt>EXCESSIVE_COVERAGE</dt>
|
||||||
|
* <dd>more than -maxDepth read at the locus, indicating some sort of mapping problem</dd>
|
||||||
|
* <dt>POOR_MAPPING_QUALITY</dt>
|
||||||
|
* <dd>more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads</dd>
|
||||||
|
* </dl>
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <h2>Input</h2>
|
||||||
|
* <p>
|
||||||
|
* A BAM file containing <b>exactly one sample</b>.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <h2>Output</h2>
|
||||||
|
* <p>
|
||||||
|
* <ul>
|
||||||
|
* <li>-o: a OutputFormatted (recommended BED) file with the callable status covering each base</li>
|
||||||
|
* <li>-summary: a table of callable status x count of all examined bases</li>
|
||||||
|
* </ul>
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <h2>Examples</h2>
|
||||||
|
* <pre>
|
||||||
|
* -T CallableLociWalker \
|
||||||
|
* -I my.bam \
|
||||||
|
* -summary my.summary \
|
||||||
|
* -o my.bed
|
||||||
|
* </pre>
|
||||||
|
*
|
||||||
|
* would produce a BED file (my.bed) that looks like:
|
||||||
|
*
|
||||||
|
* <pre>
|
||||||
|
* 20 10000000 10000864 CALLABLE
|
||||||
|
* 20 10000865 10000985 POOR_MAPPING_QUALITY
|
||||||
|
* 20 10000986 10001138 CALLABLE
|
||||||
|
* 20 10001139 10001254 POOR_MAPPING_QUALITY
|
||||||
|
* 20 10001255 10012255 CALLABLE
|
||||||
|
* 20 10012256 10012259 POOR_MAPPING_QUALITY
|
||||||
|
* 20 10012260 10012263 CALLABLE
|
||||||
|
* 20 10012264 10012328 POOR_MAPPING_QUALITY
|
||||||
|
* 20 10012329 10012550 CALLABLE
|
||||||
|
* 20 10012551 10012551 LOW_COVERAGE
|
||||||
|
* 20 10012552 10012554 CALLABLE
|
||||||
|
* 20 10012555 10012557 LOW_COVERAGE
|
||||||
|
* 20 10012558 10012558 CALLABLE
|
||||||
|
* et cetera...
|
||||||
|
* </pre>
|
||||||
|
* as well as a summary table that looks like:
|
||||||
|
*
|
||||||
|
* <pre>
|
||||||
|
* state nBases
|
||||||
|
* REF_N 0
|
||||||
|
* CALLABLE 996046
|
||||||
|
* NO_COVERAGE 121
|
||||||
|
* LOW_COVERAGE 928
|
||||||
|
* EXCESSIVE_COVERAGE 0
|
||||||
|
* POOR_MAPPING_QUALITY 2906
|
||||||
|
* </pre>
|
||||||
|
*
|
||||||
|
* @author Mark DePristo
|
||||||
|
* @since May 7, 2010
|
||||||
*/
|
*/
|
||||||
@By(DataSource.REFERENCE)
|
@By(DataSource.REFERENCE)
|
||||||
public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableBaseState, CallableLociWalker.Integrator> {
|
public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableBaseState, CallableLociWalker.Integrator> {
|
||||||
@Output
|
@Output
|
||||||
PrintStream out;
|
PrintStream out;
|
||||||
|
|
||||||
@Argument(fullName = "maxLowMAPQ", shortName = "mlmq", doc = "Maximum value for MAPQ to be considered a problematic mapped read. The gap between this value and mmq are reads that are not sufficiently well mapped for calling but aren't indicative of mapping problems.", required = false)
|
/**
|
||||||
|
* Callable loci summary counts (see outputs) will be written to this file.
|
||||||
|
*/
|
||||||
|
@Output(fullName = "summary", shortName = "summary", doc = "Name of file for output summary", required = true)
|
||||||
|
File summaryFile;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The gap between this value and mmq are reads that are not sufficiently well mapped for calling but
|
||||||
|
* aren't indicative of mapping problems. For example, if maxLowMAPQ = 1 and mmq = 20, then reads with
|
||||||
|
* MAPQ == 0 are poorly mapped, MAPQ >= 20 are considered as contributing to calling, where
|
||||||
|
* reads with MAPQ >= 1 and < 20 are not bad in and of themselves but aren't sufficiently good to contribute to
|
||||||
|
* calling. In effect this reads are invisible, driving the base to the NO_ or LOW_COVERAGE states
|
||||||
|
*/
|
||||||
|
@Argument(fullName = "maxLowMAPQ", shortName = "mlmq", doc = "Maximum value for MAPQ to be considered a problematic mapped read.", required = false)
|
||||||
byte maxLowMAPQ = 1;
|
byte maxLowMAPQ = 1;
|
||||||
|
|
||||||
@Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to 50.", required = false)
|
/**
|
||||||
|
* Reads with MAPQ > minMappingQuality are treated as usable for variation detection, contributing to the CALLABLE
|
||||||
|
* state.
|
||||||
|
*/
|
||||||
|
@Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth.", required = false)
|
||||||
byte minMappingQuality = 10;
|
byte minMappingQuality = 10;
|
||||||
|
|
||||||
@Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to 20.", required = false)
|
/**
|
||||||
|
* Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the CALLABLE state
|
||||||
|
*/
|
||||||
|
@Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth.", required = false)
|
||||||
byte minBaseQuality = 20;
|
byte minBaseQuality = 20;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If the number of QC+ bases (on reads with MAPQ > minMappingQuality and with base quality > minBaseQuality) exceeds this
|
||||||
|
* value and is less than maxDepth the site is considered CALLABLE.
|
||||||
|
*/
|
||||||
@Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false)
|
@Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false)
|
||||||
int minDepth = 4;
|
int minDepth = 4;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If the QC+ depth exceeds this value the site is considered to have EXCESSIVE_DEPTH
|
||||||
|
*/
|
||||||
@Argument(fullName = "maxDepth", shortName = "maxDepth", doc = "Maximum read depth before a locus is considered poorly mapped", required = false)
|
@Argument(fullName = "maxDepth", shortName = "maxDepth", doc = "Maximum read depth before a locus is considered poorly mapped", required = false)
|
||||||
int maxDepth = -1;
|
int maxDepth = -1;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* We don't want to consider a site as POOR_MAPPING_QUALITY just because it has two reads, and one is MAPQ. We
|
||||||
|
* won't assign a site to the POOR_MAPPING_QUALITY state unless there are at least minDepthForLowMAPQ reads
|
||||||
|
* covering the site.
|
||||||
|
*/
|
||||||
@Argument(fullName = "minDepthForLowMAPQ", shortName = "mdflmq", doc = "Minimum read depth before a locus is considered a potential candidate for poorly mapped", required = false)
|
@Argument(fullName = "minDepthForLowMAPQ", shortName = "mdflmq", doc = "Minimum read depth before a locus is considered a potential candidate for poorly mapped", required = false)
|
||||||
int minDepthLowMAPQ = 10;
|
int minDepthLowMAPQ = 10;
|
||||||
|
|
||||||
@Argument(fullName = "maxFractionOfReadsWithLowMAPQ", shortName = "frlmq", doc = "Maximum read depth before a locus is considered poorly mapped", required = false)
|
/**
|
||||||
|
* If the number of reads at this site is greater than minDepthForLowMAPQ and the fraction of reads with low mapping quality
|
||||||
|
* exceeds this fraction then the site has POOR_MAPPING_QUALITY.
|
||||||
|
*/
|
||||||
|
@Argument(fullName = "maxFractionOfReadsWithLowMAPQ", shortName = "frlmq", doc = "If the fraction of reads at a base with low mapping quality exceeds this value, the site may be poorly mapped", required = false)
|
||||||
double maxLowMAPQFraction = 0.1;
|
double maxLowMAPQFraction = 0.1;
|
||||||
|
|
||||||
@Argument(fullName = "format", shortName = "format", doc = "Output format for the system: either BED or STATE_PER_BASE", required = false)
|
/**
|
||||||
|
* The output of this walker will be written in this format. The recommended option is BED.
|
||||||
|
*/
|
||||||
|
@Argument(fullName = "format", shortName = "format", doc = "Output format", required = false)
|
||||||
OutputFormat outputFormat;
|
OutputFormat outputFormat;
|
||||||
|
|
||||||
@Argument(fullName = "summary", shortName = "summary", doc = "Name of file for output summary", required = true)
|
public enum OutputFormat {
|
||||||
File summaryFile;
|
/**
|
||||||
|
* The output will be written as a BED file. There's a BED element for each
|
||||||
|
* continuous run of callable states (i.e., CALLABLE, REF_N, etc). This is the recommended
|
||||||
|
* format
|
||||||
|
*/
|
||||||
|
BED,
|
||||||
|
|
||||||
public enum OutputFormat { BED, STATE_PER_BASE }
|
/**
|
||||||
|
* Emit chr start stop state quads for each base. Produces a potentially disasterously
|
||||||
|
* large amount of output.
|
||||||
|
*/
|
||||||
|
STATE_PER_BASE
|
||||||
|
}
|
||||||
|
|
||||||
|
public enum CalledState {
|
||||||
|
/** the reference base was an N, which is not considered callable the GATK */
|
||||||
|
REF_N,
|
||||||
|
/** the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE */
|
||||||
|
CALLABLE,
|
||||||
|
/** absolutely no reads were seen at this locus, regardless of the filtering parameters */
|
||||||
|
NO_COVERAGE,
|
||||||
|
/** there were less than min. depth bases at the locus, after applying filters */
|
||||||
|
LOW_COVERAGE,
|
||||||
|
/** more than -maxDepth read at the locus, indicating some sort of mapping problem */
|
||||||
|
EXCESSIVE_COVERAGE,
|
||||||
|
/** more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads */
|
||||||
|
POOR_MAPPING_QUALITY
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
// STANDARD WALKER METHODS
|
// STANDARD WALKER METHODS
|
||||||
////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
@Override
|
||||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||||
|
|
||||||
|
@Override
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
|
if ( getToolkit().getSamples().size() > 1 )
|
||||||
|
throw new UserException.BadArgumentValue("-I", "CallableLoci only works for a single sample, but multiple samples were found in the provided BAM files: " + getToolkit().getSamples());
|
||||||
|
|
||||||
try {
|
try {
|
||||||
PrintStream summaryOut = new PrintStream(summaryFile);
|
PrintStream summaryOut = new PrintStream(summaryFile);
|
||||||
summaryOut.close();
|
summaryOut.close();
|
||||||
|
|
@ -94,15 +234,15 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static class Integrator {
|
protected static class Integrator {
|
||||||
long counts[] = new long[CalledState.values().length];
|
final long counts[] = new long[CalledState.values().length];
|
||||||
CallableBaseState state = null;
|
CallableBaseState state = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static class CallableBaseState implements HasGenomeLocation {
|
protected static class CallableBaseState implements HasGenomeLocation {
|
||||||
public GenomeLocParser genomeLocParser;
|
final public GenomeLocParser genomeLocParser;
|
||||||
public GenomeLoc loc;
|
public GenomeLoc loc;
|
||||||
public CalledState state;
|
final public CalledState state;
|
||||||
|
|
||||||
public CallableBaseState(GenomeLocParser genomeLocParser,GenomeLoc loc, CalledState state) {
|
public CallableBaseState(GenomeLocParser genomeLocParser,GenomeLoc loc, CalledState state) {
|
||||||
this.genomeLocParser = genomeLocParser;
|
this.genomeLocParser = genomeLocParser;
|
||||||
|
|
@ -133,16 +273,10 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
|
||||||
|
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return String.format("%s %d %d %s", loc.getContig(), loc.getStart(), loc.getStop(), state);
|
return String.format("%s %d %d %s", loc.getContig(), loc.getStart(), loc.getStop(), state);
|
||||||
//return String.format("%s %d %d %d %s", loc.getContig(), loc.getStart(), loc.getStop(), loc.getStop() - loc.getStart() + 1, state);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public enum CalledState { REF_N, CALLABLE, NO_COVERAGE, LOW_COVERAGE, EXCESSIVE_COVERAGE, POOR_MAPPING_QUALITY }
|
@Override
|
||||||
|
|
||||||
public Integrator reduceInit() {
|
|
||||||
return new Integrator();
|
|
||||||
}
|
|
||||||
|
|
||||||
public CallableBaseState map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
public CallableBaseState map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
CalledState state;
|
CalledState state;
|
||||||
|
|
||||||
|
|
@ -179,6 +313,12 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
|
||||||
return new CallableBaseState(getToolkit().getGenomeLocParser(),context.getLocation(), state);
|
return new CallableBaseState(getToolkit().getGenomeLocParser(),context.getLocation(), state);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Integrator reduceInit() {
|
||||||
|
return new Integrator();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
public Integrator reduce(CallableBaseState state, Integrator integrator) {
|
public Integrator reduce(CallableBaseState state, Integrator integrator) {
|
||||||
// update counts
|
// update counts
|
||||||
integrator.counts[state.getState().ordinal()]++;
|
integrator.counts[state.getState().ordinal()]++;
|
||||||
|
|
@ -206,6 +346,7 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
|
||||||
// INTERVAL ON TRAVERSAL DONE
|
// INTERVAL ON TRAVERSAL DONE
|
||||||
////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
@Override
|
||||||
public void onTraversalDone(Integrator result) {
|
public void onTraversalDone(Integrator result) {
|
||||||
// print out the last state
|
// print out the last state
|
||||||
if ( result != null ) {
|
if ( result != null ) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue