DiagnoseTargets now outputs a VCF file

- refactored the statistics classes
 - concurrent callable statuses by sample are now available.

Signed-off-by: Mauricio Carneiro <carneiro@broadinstitute.org>
This commit is contained in:
Roger Zurawicki 2012-03-30 12:44:04 -04:00 committed by Mauricio Carneiro
parent 719ec9144a
commit 9ece93ae9c
7 changed files with 659 additions and 264 deletions

View File

@ -1,23 +1,25 @@
/* /*
* Copyright (c) 2009 The Broad Institute * Copyright (c) 2012, The Broad Institute
* Permission is hereby granted, free of charge, to any person *
* obtaining a copy of this software and associated documentation * Permission is hereby granted, free of charge, to any person
* files (the "Software"), to deal in the Software without * obtaining a copy of this software and associated documentation
* restriction, including without limitation the rights to use, * files (the "Software"), to deal in the Software without
* copy, modify, merge, publish, distribute, sublicense, and/or sell * restriction, including without limitation the rights to use,
* copies of the Software, and to permit persons to whom the * copy, modify, merge, publish, distribute, sublicense, and/or sell
* Software is furnished to do so, subject to the following * copies of the Software, and to permit persons to whom the
* conditions: * Software is furnished to do so, subject to the following
* The above copyright notice and this permission notice shall be * conditions:
* included in all copies or substantial portions of the Software. *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * The above copyright notice and this permission notice shall be
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * included in all copies or substantial portions of the Software.
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* * OTHER DEALINGS IN THE SOFTWARE. * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/ */
package org.broadinstitute.sting.gatk.walkers.coverage; package org.broadinstitute.sting.gatk.walkers.coverage;
@ -42,40 +44,40 @@ import java.io.PrintStream;
/** /**
* Emits a data file containing information about callable, uncallable, poorly mapped, and other parts of the genome * Emits a data file containing information about callable, uncallable, poorly mapped, and other parts of the genome
* * <p/>
* <p> * <p>
* A very common question about a NGS set of reads is what areas of the genome are considered callable. The system * A very common question about a NGS set of reads is what areas of the genome are considered callable. The system
* considers the coverage at each locus and emits either a per base state or a summary interval BED file that * considers the coverage at each locus and emits either a per base state or a summary interval BED file that
* partitions the genomic intervals into the following callable states: * partitions the genomic intervals into the following callable states:
* <dl> * <dl>
* <dt>REF_N</dt> * <dt>REF_N</dt>
* <dd>the reference base was an N, which is not considered callable the GATK</dd> * <dd>the reference base was an N, which is not considered callable the GATK</dd>
* <dt>CALLABLE</dt> * <dt>PASS</dt>
* <dd>the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE</dd> * <dd>the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE</dd>
* <dt>NO_COVERAGE</dt> * <dt>NO_COVERAGE</dt>
* <dd>absolutely no reads were seen at this locus, regardless of the filtering parameters</dd> * <dd>absolutely no reads were seen at this locus, regardless of the filtering parameters</dd>
* <dt>LOW_COVERAGE</dt> * <dt>LOW_COVERAGE</dt>
* <dd>there were less than min. depth bases at the locus, after applying filters</dd> * <dd>there were less than min. depth bases at the locus, after applying filters</dd>
* <dt>EXCESSIVE_COVERAGE</dt> * <dt>EXCESSIVE_COVERAGE</dt>
* <dd>more than -maxDepth read at the locus, indicating some sort of mapping problem</dd> * <dd>more than -maxDepth read at the locus, indicating some sort of mapping problem</dd>
* <dt>POOR_MAPPING_QUALITY</dt> * <dt>POOR_MAPPING_QUALITY</dt>
* <dd>more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads</dd> * <dd>more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads</dd>
* </dl> * </dl>
* </p> * </p>
* * <p/>
* <h2>Input</h2> * <h2>Input</h2>
* <p> * <p>
* A BAM file containing <b>exactly one sample</b>. * A BAM file containing <b>exactly one sample</b>.
* </p> * </p>
* * <p/>
* <h2>Output</h2> * <h2>Output</h2>
* <p> * <p>
* <ul> * <ul>
* <li>-o: a OutputFormatted (recommended BED) file with the callable status covering each base</li> * <li>-o: a OutputFormatted (recommended BED) file with the callable status covering each base</li>
* <li>-summary: a table of callable status x count of all examined bases</li> * <li>-summary: a table of callable status x count of all examined bases</li>
* </ul> * </ul>
* </p> * </p>
* * <p/>
* <h2>Examples</h2> * <h2>Examples</h2>
* <pre> * <pre>
* -T CallableLociWalker \ * -T CallableLociWalker \
@ -83,31 +85,31 @@ import java.io.PrintStream;
* -summary my.summary \ * -summary my.summary \
* -o my.bed * -o my.bed
* </pre> * </pre>
* * <p/>
* would produce a BED file (my.bed) that looks like: * would produce a BED file (my.bed) that looks like:
* * <p/>
* <pre> * <pre>
* 20 10000000 10000864 CALLABLE * 20 10000000 10000864 PASS
* 20 10000865 10000985 POOR_MAPPING_QUALITY * 20 10000865 10000985 POOR_MAPPING_QUALITY
* 20 10000986 10001138 CALLABLE * 20 10000986 10001138 PASS
* 20 10001139 10001254 POOR_MAPPING_QUALITY * 20 10001139 10001254 POOR_MAPPING_QUALITY
* 20 10001255 10012255 CALLABLE * 20 10001255 10012255 PASS
* 20 10012256 10012259 POOR_MAPPING_QUALITY * 20 10012256 10012259 POOR_MAPPING_QUALITY
* 20 10012260 10012263 CALLABLE * 20 10012260 10012263 PASS
* 20 10012264 10012328 POOR_MAPPING_QUALITY * 20 10012264 10012328 POOR_MAPPING_QUALITY
* 20 10012329 10012550 CALLABLE * 20 10012329 10012550 PASS
* 20 10012551 10012551 LOW_COVERAGE * 20 10012551 10012551 LOW_COVERAGE
* 20 10012552 10012554 CALLABLE * 20 10012552 10012554 PASS
* 20 10012555 10012557 LOW_COVERAGE * 20 10012555 10012557 LOW_COVERAGE
* 20 10012558 10012558 CALLABLE * 20 10012558 10012558 PASS
* et cetera... * et cetera...
* </pre> * </pre>
* as well as a summary table that looks like: * as well as a summary table that looks like:
* * <p/>
* <pre> * <pre>
* state nBases * state nBases
* REF_N 0 * REF_N 0
* CALLABLE 996046 * PASS 996046
* NO_COVERAGE 121 * NO_COVERAGE 121
* LOW_COVERAGE 928 * LOW_COVERAGE 928
* EXCESSIVE_COVERAGE 0 * EXCESSIVE_COVERAGE 0
@ -139,21 +141,21 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
byte maxLowMAPQ = 1; byte maxLowMAPQ = 1;
/** /**
* Reads with MAPQ > minMappingQuality are treated as usable for variation detection, contributing to the CALLABLE * Reads with MAPQ > minMappingQuality are treated as usable for variation detection, contributing to the PASS
* state. * state.
*/ */
@Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth.", required = false) @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth.", required = false)
byte minMappingQuality = 10; byte minMappingQuality = 10;
/** /**
* Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the CALLABLE state * Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the PASS state
*/ */
@Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth.", required = false) @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth.", required = false)
byte minBaseQuality = 20; byte minBaseQuality = 20;
/** /**
* If the number of QC+ bases (on reads with MAPQ > minMappingQuality and with base quality > minBaseQuality) exceeds this * If the number of QC+ bases (on reads with MAPQ > minMappingQuality and with base quality > minBaseQuality) exceeds this
* value and is less than maxDepth the site is considered CALLABLE. * value and is less than maxDepth the site is considered PASS.
*/ */
@Advanced @Advanced
@Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false) @Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false)
@ -191,7 +193,7 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
public enum OutputFormat { public enum OutputFormat {
/** /**
* The output will be written as a BED file. There's a BED element for each * The output will be written as a BED file. There's a BED element for each
* continuous run of callable states (i.e., CALLABLE, REF_N, etc). This is the recommended * continuous run of callable states (i.e., PASS, REF_N, etc). This is the recommended
* format * format
*/ */
BED, BED,
@ -204,17 +206,29 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
} }
public enum CalledState { public enum CalledState {
/** the reference base was an N, which is not considered callable the GATK */ /**
* the reference base was an N, which is not considered callable the GATK
*/
REF_N, REF_N,
/** the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE */ /**
* the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
*/
CALLABLE, CALLABLE,
/** absolutely no reads were seen at this locus, regardless of the filtering parameters */ /**
* absolutely no reads were seen at this locus, regardless of the filtering parameters
*/
NO_COVERAGE, NO_COVERAGE,
/** there were less than min. depth bases at the locus, after applying filters */ /**
* there were less than min. depth bases at the locus, after applying filters
*/
LOW_COVERAGE, LOW_COVERAGE,
/** more than -maxDepth read at the locus, indicating some sort of mapping problem */ /**
* more than -maxDepth read at the locus, indicating some sort of mapping problem
*/
EXCESSIVE_COVERAGE, EXCESSIVE_COVERAGE,
/** more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads */ /**
* more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
*/
POOR_MAPPING_QUALITY POOR_MAPPING_QUALITY
} }
@ -223,11 +237,13 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
@Override @Override
public boolean includeReadsWithDeletionAtLoci() { return true; } public boolean includeReadsWithDeletionAtLoci() {
return true;
}
@Override @Override
public void initialize() { public void initialize() {
if ( getSampleDB().getSamples().size() != 1 ) { if (getSampleDB().getSamples().size() != 1) {
throw new UserException.BadArgumentValue("-I", "CallableLoci only works for a single sample, but multiple samples were found in the provided BAM files: " + getSampleDB().getSamples()); throw new UserException.BadArgumentValue("-I", "CallableLoci only works for a single sample, but multiple samples were found in the provided BAM files: " + getSampleDB().getSamples());
} }
@ -249,7 +265,7 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
public GenomeLoc loc; public GenomeLoc loc;
final public CalledState state; final public CalledState state;
public CallableBaseState(GenomeLocParser genomeLocParser,GenomeLoc loc, CalledState state) { public CallableBaseState(GenomeLocParser genomeLocParser, GenomeLoc loc, CalledState state) {
this.genomeLocParser = genomeLocParser; this.genomeLocParser = genomeLocParser;
this.loc = loc; this.loc = loc;
this.state = state; this.state = state;
@ -264,12 +280,13 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
} }
// update routines // update routines
public boolean changingState( CalledState newState ) { public boolean changingState(CalledState newState) {
return state != newState; return state != newState;
} }
/** /**
* Updating the location of this CalledBaseState by the new stop location * Updating the location of this CalledBaseState by the new stop location
*
* @param newStop * @param newStop
*/ */
public void update(GenomeLoc newStop) { public void update(GenomeLoc newStop) {
@ -285,7 +302,7 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
public CallableBaseState map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { public CallableBaseState map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
CalledState state; CalledState state;
if ( BaseUtils.isNBase(ref.getBase()) ) { if (BaseUtils.isNBase(ref.getBase())) {
state = CalledState.REF_N; state = CalledState.REF_N;
} else { } else {
// count up the depths of all and QC+ bases // count up the depths of all and QC+ bases
@ -293,29 +310,29 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
for (PileupElement e : context.getBasePileup()) { for (PileupElement e : context.getBasePileup()) {
rawDepth++; rawDepth++;
if ( e.getMappingQual() <= maxLowMAPQ ) if (e.getMappingQual() <= maxLowMAPQ)
lowMAPQDepth++; lowMAPQDepth++;
if ( e.getMappingQual() >= minMappingQuality && ( e.getQual() >= minBaseQuality || e.isDeletion() ) ) { if (e.getMappingQual() >= minMappingQuality && (e.getQual() >= minBaseQuality || e.isDeletion())) {
QCDepth++; QCDepth++;
} }
} }
//System.out.printf("%s rawdepth = %d QCDepth = %d lowMAPQ = %d%n", context.getLocation(), rawDepth, QCDepth, lowMAPQDepth); //System.out.printf("%s rawdepth = %d QCDepth = %d lowMAPQ = %d%n", context.getLocation(), rawDepth, QCDepth, lowMAPQDepth);
if ( rawDepth == 0 ) { if (rawDepth == 0) {
state = CalledState.NO_COVERAGE; state = CalledState.NO_COVERAGE;
} else if ( rawDepth >= minDepthLowMAPQ && MathUtils.ratio( lowMAPQDepth, rawDepth ) >= maxLowMAPQFraction ) { } else if (rawDepth >= minDepthLowMAPQ && MathUtils.ratio(lowMAPQDepth, rawDepth) >= maxLowMAPQFraction) {
state = CalledState.POOR_MAPPING_QUALITY; state = CalledState.POOR_MAPPING_QUALITY;
} else if ( QCDepth < minDepth ) { } else if (QCDepth < minDepth) {
state = CalledState.LOW_COVERAGE; state = CalledState.LOW_COVERAGE;
} else if ( rawDepth >= maxDepth && maxDepth != -1 ) { } else if (rawDepth >= maxDepth && maxDepth != -1) {
state = CalledState.EXCESSIVE_COVERAGE; state = CalledState.EXCESSIVE_COVERAGE;
} else { } else {
state = CalledState.CALLABLE; state = CalledState.CALLABLE;
} }
} }
return new CallableBaseState(getToolkit().getGenomeLocParser(),context.getLocation(), state); return new CallableBaseState(getToolkit().getGenomeLocParser(), context.getLocation(), state);
} }
@Override @Override
@ -328,15 +345,15 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
// update counts // update counts
integrator.counts[state.getState().ordinal()]++; integrator.counts[state.getState().ordinal()]++;
if ( outputFormat == OutputFormat.STATE_PER_BASE ) { if (outputFormat == OutputFormat.STATE_PER_BASE) {
out.println(state.toString()); out.println(state.toString());
} }
// format is integrating // format is integrating
if ( integrator.state == null ) if (integrator.state == null)
integrator.state = state; integrator.state = state;
else if ( state.getLocation().getStart() != integrator.state.getLocation().getStop() + 1 || else if (state.getLocation().getStart() != integrator.state.getLocation().getStop() + 1 ||
integrator.state.changingState(state.getState()) ) { integrator.state.changingState(state.getState())) {
out.println(integrator.state.toString()); out.println(integrator.state.toString());
integrator.state = state; integrator.state = state;
} else { } else {
@ -354,14 +371,14 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
@Override @Override
public void onTraversalDone(Integrator result) { public void onTraversalDone(Integrator result) {
// print out the last state // print out the last state
if ( result != null ) { if (result != null) {
if ( outputFormat == OutputFormat.BED ) // get the last interval if (outputFormat == OutputFormat.BED) // get the last interval
out.println(result.state.toString()); out.println(result.state.toString());
try { try {
PrintStream summaryOut = new PrintStream(summaryFile); PrintStream summaryOut = new PrintStream(summaryFile);
summaryOut.printf("%30s %s%n", "state", "nBases"); summaryOut.printf("%30s %s%n", "state", "nBases");
for ( CalledState state : CalledState.values() ) { for (CalledState state : CalledState.values()) {
summaryOut.printf("%30s %d%n", state, result.counts[state.ordinal()]); summaryOut.printf("%30s %d%n", state, result.counts[state.ordinal()]);
} }
summaryOut.close(); summaryOut.close();

View File

@ -1,3 +1,27 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
/** /**
@ -7,16 +31,40 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
* @since 2/1/12 * @since 2/1/12
*/ */
public enum CallableStatus { public enum CallableStatus {
/** the reference base was an N, which is not considered callable the GATK */ /**
REF_N, * the reference base was an N, which is not considered callable the GATK
/** the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE */ */
CALLABLE, // todo -- implement this status
/** absolutely no reads were seen at this locus, regardless of the filtering parameters */ REF_N("the reference base was an N, which is not considered callable the GATK"),
NO_COVERAGE, /**
/** there were less than min. depth bases at the locus, after applying filters */ * the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
LOW_COVERAGE, */
/** more than -maxDepth read at the locus, indicating some sort of mapping problem */ PASS("the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE"),
EXCESSIVE_COVERAGE, /**
/** more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads */ * absolutely no reads were seen at this locus, regardless of the filtering parameters
POOR_QUALITY */
NO_COVERAGE("absolutely no reads were seen at this locus, regardless of the filtering parameters"),
/**
* there were less than min. depth bases at the locus, after applying filters
*/
LOW_COVERAGE("there were less than min. depth bases at the locus, after applying filters"),
/**
* more than -maxDepth read at the locus, indicating some sort of mapping problem
*/
EXCESSIVE_COVERAGE("more than -maxDepth read at the locus, indicating some sort of mapping problem"),
/**
* more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
*/
POOR_QUALITY("more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads"),
BAD_MATE(""),
INCONSISTENT_COVERAGE("");
public String description;
private CallableStatus(String description) {
this.description = description;
}
} }

View File

@ -1,45 +1,67 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import org.broad.tribble.Feature; import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.IntervalBinding;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.By; import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocComparator; import org.broadinstitute.sting.utils.GenomeLocComparator;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import java.io.PrintStream; import java.util.*;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
/** /**
* Short one line description of the walker. * Short one line description of the walker.
* * <p/>
* <p> * <p>
* [Long description of the walker] * [Long description of the walker]
* </p> * </p>
* * <p/>
* * <p/>
* <h2>Input</h2> * <h2>Input</h2>
* <p> * <p>
* [Description of the Input] * [Description of the Input]
* </p> * </p>
* * <p/>
* <h2>Output</h2> * <h2>Output</h2>
* <p> * <p>
* [Description of the Output] * [Description of the Output]
* </p> * </p>
* * <p/>
* <h2>Examples</h2> * <h2>Examples</h2>
* <pre> * <pre>
* java * java
@ -51,12 +73,13 @@ import java.util.TreeSet;
* @since 2/1/12 * @since 2/1/12
*/ */
@By(value = DataSource.READS) @By(value = DataSource.READS)
public class DiagnoseTargets extends LocusWalker<Long, Long> { @PartitionBy(PartitionType.INTERVAL)
public class DiagnoseTargets extends LocusWalker<Long, Long> implements AnnotatorCompatibleWalker {
@Input(fullName = "interval_track", shortName = "int", doc = "", required = true) @Input(fullName = "interval_track", shortName = "int", doc = "", required = true)
private IntervalBinding<Feature> intervalTrack = null; private IntervalBinding<Feature> intervalTrack = null;
@Output @Output(doc = "File to which variants should be written", required = true)
private PrintStream out = System.out; protected VCFWriter vcfWriter = null;
@Argument(fullName = "expand_interval", shortName = "exp", doc = "", required = false) @Argument(fullName = "expand_interval", shortName = "exp", doc = "", required = false)
private int expandInterval = 50; private int expandInterval = 50;
@ -73,13 +96,13 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
@Argument(fullName = "maximum_coverage", shortName = "maxcov", doc = "", required = false) @Argument(fullName = "maximum_coverage", shortName = "maxcov", doc = "", required = false)
private int maximumCoverage = 700; private int maximumCoverage = 700;
private TreeSet<GenomeLoc> intervalList = null; // The list of intervals of interest (plus expanded intervals if user wants them) private TreeSet<GenomeLoc> intervalList = null; // The list of intervals of interest (plus expanded intervals if user wants them)
private HashMap<GenomeLoc, IntervalStatistics> intervalMap = null; // interval => statistics private HashMap<GenomeLoc, IntervalStatistics> intervalMap = null; // interval => statistics
private Iterator<GenomeLoc> intervalListIterator; // An iterator to go over all the intervals provided as we traverse the genome private Iterator<GenomeLoc> intervalListIterator; // An iterator to go over all the intervals provided as we traverse the genome
private GenomeLoc currentInterval = null; // The "current" interval loaded and being filled with statistics private GenomeLoc currentInterval = null; // The "current" interval loaded
private IntervalStatistics currentIntervalStatistics = null; // The "current" interval loaded and being filled with statistics private IntervalStatistics currentIntervalStatistics = null; // The "current" interval being filled with statistics
private Set<String> samples = null; // All the samples being processed
private GenomeLocParser parser; // just an object to allow us to create genome locs (for the expanded intervals) private GenomeLocParser parser; // just an object to allow us to create genome locs (for the expanded intervals)
@Override @Override
public void initialize() { public void initialize() {
@ -88,38 +111,72 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
if (intervalTrack == null) if (intervalTrack == null)
throw new UserException("This tool currently only works if you provide an interval track"); throw new UserException("This tool currently only works if you provide an interval track");
parser = new GenomeLocParser(getToolkit().getMasterSequenceDictionary()); // Important to initialize the parser before creating the intervals below parser = new GenomeLocParser(getToolkit().getMasterSequenceDictionary()); // Important to initialize the parser before creating the intervals below
List<GenomeLoc> originalList = intervalTrack.getIntervals(getToolkit()); // The original list of targets provided by the user that will be expanded or not depending on the options provided List<GenomeLoc> originalList = intervalTrack.getIntervals(getToolkit()); // The original list of targets provided by the user that will be expanded or not depending on the options provided
intervalList = new TreeSet<GenomeLoc>(new GenomeLocComparator()); intervalList = new TreeSet<GenomeLoc>(new GenomeLocComparator());
intervalMap = new HashMap<GenomeLoc, IntervalStatistics>(originalList.size() * 2); intervalMap = new HashMap<GenomeLoc, IntervalStatistics>();
for (GenomeLoc interval : originalList) for (GenomeLoc interval : originalList)
addAndExpandIntervalToLists(interval); intervalList.add(interval);
//addAndExpandIntervalToMap(interval);
intervalListIterator = intervalList.iterator(); intervalListIterator = intervalList.iterator();
// get all of the unique sample names
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
// initialize the header
Set<VCFHeaderLine> headerInfo = getHeaderInfo();
vcfWriter.writeHeader(new VCFHeader(headerInfo, samples));
}
/**
* Gets the header lines for the VCF writer
*
* @return A set of VCF header lines
*/
private Set<VCFHeaderLine> getHeaderInfo() {
Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>();
// INFO fields for overall data
headerLines.add(new VCFInfoHeaderLine("END", 1, VCFHeaderLineType.Integer, "Stop position of the interval"));
headerLines.add(new VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Total depth in the site. Sum of the depth of all pools"));
headerLines.add(new VCFInfoHeaderLine("AD", 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size."));
headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode"));
// FORMAT fields for each genotype
headerLines.add(new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Total depth in the site. Sum of the depth of all pools"));
headerLines.add(new VCFFormatHeaderLine("AD", 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size."));
// FILTER fields
for (CallableStatus stat : CallableStatus.values()) {
headerLines.add(new VCFHeaderLine(stat.name(), stat.description));
}
return headerLines;
} }
@Override @Override
public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
GenomeLoc refLocus = ref.getLocus(); GenomeLoc refLocus = ref.getLocus();
while (currentInterval == null || currentInterval.isBefore(refLocus)) { while (currentInterval == null || currentInterval.isBefore(refLocus)) { // do this for first time and while currentInterval is behind current locus
if (!intervalListIterator.hasNext()) if (!intervalListIterator.hasNext())
return 0L; return 0L;
if (currentInterval != null)
processIntervalStats(currentInterval, Allele.create(ref.getBase(), true));
currentInterval = intervalListIterator.next(); currentInterval = intervalListIterator.next();
addAndExpandIntervalToMap(currentInterval);
currentIntervalStatistics = intervalMap.get(currentInterval); currentIntervalStatistics = intervalMap.get(currentInterval);
} }
if (currentInterval.isPast(refLocus)) if (currentInterval.isPast(refLocus)) // skip if we are behind the current interval
return 0L; return 0L;
byte[] mappingQualities = context.getBasePileup().getMappingQuals(); currentIntervalStatistics.addLocus(context); // Add current locus to stats
byte[] baseQualities = context.getBasePileup().getQuals();
int coverage = context.getBasePileup().getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage();
int rawCoverage = context.size();
IntervalStatisticLocus locusData = new IntervalStatisticLocus(mappingQualities, baseQualities, coverage, rawCoverage);
currentIntervalStatistics.addLocus(refLocus, locusData);
return 1L; return 1L;
} }
@ -129,6 +186,13 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
return 0L; return 0L;
} }
/**
* Not sure what we are going to do here
*
* @param value result of the map.
* @param sum accumulator for the reduce.
* @return a long
*/
@Override @Override
public Long reduce(Long value, Long sum) { public Long reduce(Long value, Long sum) {
return sum + value; return sum + value;
@ -136,14 +200,25 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
@Override @Override
public void onTraversalDone(Long result) { public void onTraversalDone(Long result) {
super.onTraversalDone(result); for (GenomeLoc interval : intervalMap.keySet())
out.println("Interval\tCallStatus\tCOV\tAVG"); processIntervalStats(interval, Allele.create("<DT>", true));
for (GenomeLoc interval : intervalList) {
IntervalStatistics stats = intervalMap.get(interval);
out.println(String.format("%s\t%s\t%d\t%f", interval, stats.callableStatus(), stats.totalCoverage(), stats.averageCoverage()));
}
} }
@Override
public RodBinding<VariantContext> getSnpEffRodBinding() {return null;}
@Override
public RodBinding<VariantContext> getDbsnpRodBinding() {return null;}
@Override
public List<RodBinding<VariantContext>> getCompRodBindings() {return null;}
@Override
public List<RodBinding<VariantContext>> getResourceRodBindings() {return null;}
@Override
public boolean alwaysAppendDbsnpId() {return false;}
private GenomeLoc createIntervalBefore(GenomeLoc interval) { private GenomeLoc createIntervalBefore(GenomeLoc interval) {
int start = Math.max(interval.getStart() - expandInterval, 0); int start = Math.max(interval.getStart() - expandInterval, 0);
int stop = Math.max(interval.getStart() - 1, 0); int stop = Math.max(interval.getStart() - 1, 0);
@ -157,16 +232,75 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop); return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop);
} }
private void addAndExpandIntervalToLists(GenomeLoc interval) { /**
* Takes an interval and commits it to memory.
* It will expand it if so told by the -exp command line argument
*
* @param interval The new interval to process
*/
private void addAndExpandIntervalToMap(GenomeLoc interval) {
if (expandInterval > 0) { if (expandInterval > 0) {
GenomeLoc before = createIntervalBefore(interval); GenomeLoc before = createIntervalBefore(interval);
GenomeLoc after = createIntervalAfter(interval); GenomeLoc after = createIntervalAfter(interval);
intervalList.add(before); intervalList.add(before);
intervalList.add(after); intervalList.add(after);
intervalMap.put(before, new IntervalStatistics(before, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); intervalMap.put(before, new IntervalStatistics(samples, before, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality));
intervalMap.put(after, new IntervalStatistics(after, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); intervalMap.put(after, new IntervalStatistics(samples, after, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality));
} }
intervalList.add(interval); if (!intervalList.contains(interval))
intervalMap.put(interval, new IntervalStatistics(interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); intervalList.add(interval);
intervalMap.put(interval, new IntervalStatistics(samples, interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality));
}
/**
* Takes the interval, finds it in the stash, prints it to the VCF, and removes it
*
* @param interval The interval in memory that you want to write out and clear
* @param allele the allele
*/
private void processIntervalStats(GenomeLoc interval, Allele allele) {
IntervalStatistics stats = intervalMap.get(interval);
List<Allele> alleles = new ArrayList<Allele>();
Map<String, Object> attributes = new HashMap<String, Object>();
ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
alleles.add(allele);
VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles);
vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF
vcb.filters(statusesToStrings(stats.callableStatuses()));
attributes.put(VCFConstants.END_KEY, interval.getStop());
attributes.put(VCFConstants.DEPTH_KEY, stats.totalCoverage());
attributes.put("AV", stats.averageCoverage());
vcb = vcb.attributes(attributes);
for (String sample : samples) {
Map<String, Object> infos = new HashMap<String, Object>();
infos.put("DP", stats.getSample(sample).totalCoverage());
infos.put("AV", stats.getSample(sample).averageCoverage());
Set<String> filters = new HashSet<String>();
filters.addAll(statusesToStrings(stats.getSample(sample).getCallableStatuses()));
genotypes.add(new Genotype(sample, alleles, VariantContext.NO_LOG10_PERROR, filters, infos, false));
}
vcb = vcb.genotypes(genotypes);
vcfWriter.add(vcb.make());
intervalMap.remove(interval);
}
private static Set<String> statusesToStrings(Set<CallableStatus> statuses) {
Set<String> output = new HashSet<String>(statuses.size());
for (CallableStatus status : statuses)
output.add(status.name());
return output;
} }
} }

View File

@ -1,34 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
/**
* The definition of a locus for the DiagnoseTargets walker statistics calculation
*
* @author Mauricio Carneiro
* @since 2/3/12
*/
class IntervalStatisticLocus {
private final byte[] mappingQuality;
private final byte[] baseQuality;
private final int coverage;
private final int rawCoverage;
public IntervalStatisticLocus(byte[] mappingQuality, byte[] baseQuality, int coverage, int rawCoverage) {
this.mappingQuality = mappingQuality;
this.baseQuality = baseQuality;
this.coverage = coverage;
this.rawCoverage = rawCoverage;
}
public IntervalStatisticLocus() {
this(new byte[1], new byte[1], 0, 0);
}
public int getCoverage() {
return coverage;
}
public int getRawCoverage() {
return rawCoverage;
}
}

View File

@ -1,44 +1,62 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/** public class IntervalStatistics {
* Short one line description of the walker.
* private final Map<String, SampleStatistics> samples;
* @author Mauricio Carneiro
* @since 2/1/12
*/
class IntervalStatistics {
private final GenomeLoc interval; private final GenomeLoc interval;
private final ArrayList<IntervalStatisticLocus> loci;
private final int minimumCoverageThreshold; private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet)
private final int maximumCoverageThreshold;
private final int minimumMappingQuality;
private final int minimumBaseQuality;
private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet)
private IntervalStatistics(GenomeLoc interval, ArrayList<IntervalStatisticLocus> loci, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { public IntervalStatistics(Set<String> samples, GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) {
this.interval = interval; this.interval = interval;
this.loci = loci; this.samples = new HashMap<String, SampleStatistics>(samples.size());
this.minimumCoverageThreshold = minimumCoverageThreshold; for (String sample : samples)
this.maximumCoverageThreshold = maximumCoverageThreshold; this.samples.put(sample, new SampleStatistics(interval, minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality));
this.minimumMappingQuality = minimumMappingQuality;
this.minimumBaseQuality = minimumBaseQuality;
} }
public IntervalStatistics(GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { public SampleStatistics getSample(String sample) {
this(interval, new ArrayList<IntervalStatisticLocus>(interval.size()), minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality); return samples.get(sample);
}
// Initialize every loci (this way we don't have to worry about non-existent loci in the object public void addLocus(AlignmentContext context) {
for (int i = 0; i < interval.size(); i++) ReadBackedPileup pileup = context.getBasePileup();
this.loci.add(i, new IntervalStatisticLocus());
for (String sample : samples.keySet())
getSample(sample).addLocus(context.getLocation(), pileup.getPileupForSample(sample));
} }
public long totalCoverage() { public long totalCoverage() {
@ -50,73 +68,27 @@ class IntervalStatistics {
public double averageCoverage() { public double averageCoverage() {
if (preComputedTotalCoverage < 0) if (preComputedTotalCoverage < 0)
calculateTotalCoverage(); calculateTotalCoverage();
return (double) preComputedTotalCoverage / loci.size(); return (double) preComputedTotalCoverage / interval.size();
}
/**
* Calculates the callable status of the entire interval
*
* @return the callable status of the entire interval
*/
public CallableStatus callableStatus() {
long max = -1;
CallableStatus maxCallableStatus = null;
HashMap<CallableStatus, Integer> statusCounts = new HashMap<CallableStatus, Integer>(CallableStatus.values().length);
// initialize the statusCounts with all callable states
for (CallableStatus key : CallableStatus.values())
statusCounts.put(key, 0);
// calculate the callable status for each locus
for (int i = 0; i < loci.size(); i++) {
CallableStatus status = callableStatus(i);
int count = statusCounts.get(status) + 1;
statusCounts.put(status, count);
if (count > max) {
max = count;
maxCallableStatus = status;
}
}
return maxCallableStatus;
}
public void addLocus(GenomeLoc locus, IntervalStatisticLocus locusData) {
if (!interval.containsP(locus))
throw new ReviewedStingException(String.format("Locus %s is not part of the Interval", locus));
int locusIndex = locus.getStart() - interval.getStart();
loci.add(locusIndex, locusData);
}
/**
* returns the callable status of this locus without taking the reference base into account.
*
* @param locusIndex location in the genome to inquire (only one locus)
* @return the callable status of a locus
*/
private CallableStatus callableStatus(int locusIndex) {
if (loci.get(locusIndex).getCoverage() > maximumCoverageThreshold)
return CallableStatus.EXCESSIVE_COVERAGE;
if (loci.get(locusIndex).getCoverage() >= minimumCoverageThreshold)
return CallableStatus.CALLABLE;
if (loci.get(locusIndex).getRawCoverage() >= minimumCoverageThreshold)
return CallableStatus.POOR_QUALITY;
if (loci.get(locusIndex).getRawCoverage() > 0)
return CallableStatus.LOW_COVERAGE;
return CallableStatus.NO_COVERAGE;
} }
private void calculateTotalCoverage() { private void calculateTotalCoverage() {
preComputedTotalCoverage = 0; preComputedTotalCoverage = 0;
for (IntervalStatisticLocus locus : loci) for (SampleStatistics sample : samples.values())
preComputedTotalCoverage += locus.getCoverage(); preComputedTotalCoverage += sample.totalCoverage();
} }
/**
* Return the Callable statuses for the interval as a whole
* todo -- add a voting system for sample flags and add interval specific statuses
*
* @return the callable status(es) for the whole interval
*/
public Set<CallableStatus> callableStatuses() {
Set<CallableStatus> output = new HashSet<CallableStatus>();
for (SampleStatistics sample : samples.values())
output.addAll(sample.getCallableStatuses());
return output;
}
} }

View File

@ -0,0 +1,83 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import java.util.HashSet;
import java.util.Set;
public class LocusStatistics {
final int coverage;
final int rawCoverage;
public LocusStatistics() {
this.coverage = 0;
this.rawCoverage = 0;
}
public LocusStatistics(int coverage, int rawCoverage) {
this.coverage = coverage;
this.rawCoverage = rawCoverage;
}
public int getCoverage() {
return coverage;
}
public int getRawCoverage() {
return rawCoverage;
}
/**
* Generates all applicable statuses from the coverages in this locus
*
* @param minimumCoverageThreshold the minimum threshold for determining low coverage/poor quality
* @param maximumCoverageThreshold the maximum threshold for determining excessive coverage
* @return a set of all statuses that apply
*/
public Set<CallableStatus> callableStatuses(int minimumCoverageThreshold, int maximumCoverageThreshold) {
Set<CallableStatus> output = new HashSet<CallableStatus>();
// if too much coverage
if (getCoverage() > maximumCoverageThreshold)
output.add(CallableStatus.EXCESSIVE_COVERAGE);
// if not enough coverage
if (getCoverage() < minimumCoverageThreshold) {
// was there a lot of low Qual coverage?
if (getRawCoverage() >= minimumCoverageThreshold)
output.add(CallableStatus.POOR_QUALITY);
// no?
else {
// is there any coverage?
if (getRawCoverage() > 0)
output.add(CallableStatus.LOW_COVERAGE);
else
output.add(CallableStatus.NO_COVERAGE);
}
}
return output;
}
}

View File

@ -0,0 +1,175 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.*;
/**
* Short one line description of the walker.
*
* @author Mauricio Carneiro
* @since 2/1/12
*/
class SampleStatistics {
private final GenomeLoc interval;
private final ArrayList<LocusStatistics> loci;
private final int minimumCoverageThreshold;
private final int maximumCoverageThreshold;
private final int minimumMappingQuality;
private final int minimumBaseQuality;
private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet)
private SampleStatistics(GenomeLoc interval, ArrayList<LocusStatistics> loci, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) {
this.interval = interval;
this.loci = loci;
this.minimumCoverageThreshold = minimumCoverageThreshold;
this.maximumCoverageThreshold = maximumCoverageThreshold;
this.minimumMappingQuality = minimumMappingQuality;
this.minimumBaseQuality = minimumBaseQuality;
}
public SampleStatistics(GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) {
this(interval, new ArrayList<LocusStatistics>(interval.size()), minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality);
// Initialize every loci (this way we don't have to worry about non-existent loci in the object
for (int i = 0; i < interval.size(); i++)
this.loci.add(i, new LocusStatistics());
}
public long totalCoverage() {
if (preComputedTotalCoverage < 0)
calculateTotalCoverage();
return preComputedTotalCoverage;
}
public double averageCoverage() {
if (preComputedTotalCoverage < 0)
calculateTotalCoverage();
return (double) preComputedTotalCoverage / loci.size();
}
/**
* Calculates the callable statuses of the entire interval
*
* @return the callable statuses of the entire interval
*/
public Set<CallableStatus> getCallableStatuses() {
Map<CallableStatus, Integer> totals = new HashMap<CallableStatus, Integer>(CallableStatus.values().length);
// initialize map
for (CallableStatus status : CallableStatus.values())
totals.put(status, 0);
// sum up all the callable statuses for each locus
for (int i = 0; i < interval.size(); i++) {
for (CallableStatus status : callableStatus(i)) {
int count = totals.get(status);
totals.put(status, count + 1);
}
}
Set<CallableStatus> output = new HashSet<CallableStatus>();
// double to avoid type casting
double intervalSize = interval.size();
double coverageStatusThreshold = 0.20;
if ((totals.get(CallableStatus.NO_COVERAGE) / intervalSize) > coverageStatusThreshold)
output.add(CallableStatus.NO_COVERAGE);
if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) > coverageStatusThreshold)
output.add(CallableStatus.LOW_COVERAGE);
double excessiveCoverageThreshold = 0.20;
if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) > excessiveCoverageThreshold)
output.add(CallableStatus.EXCESSIVE_COVERAGE);
double qualityStatusThreshold = 0.50;
if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) > qualityStatusThreshold)
output.add(CallableStatus.POOR_QUALITY);
if (totals.get(CallableStatus.REF_N) > 0)
output.add(CallableStatus.REF_N);
if (output.isEmpty()) {
output.add(CallableStatus.PASS);
}
return output;
}
/**
* Adds a locus to the interval wide stats
*
* @param locus The locus given as a GenomeLoc
* @param pileup The pileup of that locus
*/
public void addLocus(GenomeLoc locus, ReadBackedPileup pileup) {
if (!interval.containsP(locus))
throw new ReviewedStingException(String.format("Locus %s is not part of the Interval", locus));
// a null pileup means there nothing ot add
if (pileup != null) {
int locusIndex = locus.getStart() - interval.getStart();
int rawCoverage = pileup.depthOfCoverage();
int coverage = pileup.getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage();
LocusStatistics locusData = new LocusStatistics(coverage, rawCoverage);
loci.add(locusIndex, locusData);
}
}
/**
* returns the callable status of this locus without taking the reference base into account.
*
* @param locusIndex location in the genome to inquire (only one locus)
* @return the callable status of a locus
*/
private Set<CallableStatus> callableStatus(int locusIndex) {
LocusStatistics locus = loci.get(locusIndex);
return locus.callableStatuses(minimumCoverageThreshold, maximumCoverageThreshold);
}
private void calculateTotalCoverage() {
preComputedTotalCoverage = 0;
for (LocusStatistics locus : loci)
preComputedTotalCoverage += locus.getCoverage();
}
}