diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java index 1dfc6fea0..2a8940de0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java @@ -1,23 +1,25 @@ /* - * Copyright (c) 2009 The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * * OTHER DEALINGS IN THE SOFTWARE. + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. */ package org.broadinstitute.sting.gatk.walkers.coverage; @@ -42,40 +44,40 @@ import java.io.PrintStream; /** * Emits a data file containing information about callable, uncallable, poorly mapped, and other parts of the genome - * + *

*

* A very common question about a NGS set of reads is what areas of the genome are considered callable. The system * considers the coverage at each locus and emits either a per base state or a summary interval BED file that * partitions the genomic intervals into the following callable states: *

- *
REF_N
- *
the reference base was an N, which is not considered callable the GATK
- *
CALLABLE
- *
the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
- *
NO_COVERAGE
- *
absolutely no reads were seen at this locus, regardless of the filtering parameters
- *
LOW_COVERAGE
- *
there were less than min. depth bases at the locus, after applying filters
- *
EXCESSIVE_COVERAGE
- *
more than -maxDepth read at the locus, indicating some sort of mapping problem
- *
POOR_MAPPING_QUALITY
- *
more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
+ *
REF_N
+ *
the reference base was an N, which is not considered callable the GATK
+ *
PASS
+ *
the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
+ *
NO_COVERAGE
+ *
absolutely no reads were seen at this locus, regardless of the filtering parameters
+ *
LOW_COVERAGE
+ *
there were less than min. depth bases at the locus, after applying filters
+ *
EXCESSIVE_COVERAGE
+ *
more than -maxDepth read at the locus, indicating some sort of mapping problem
+ *
POOR_MAPPING_QUALITY
+ *
more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
*
*

- * + *

*

Input

*

- * A BAM file containing exactly one sample. + * A BAM file containing exactly one sample. *

- * + *

*

Output

*

*

*

- * + *

*

Examples

*
  *     -T CallableLociWalker \
@@ -83,31 +85,31 @@ import java.io.PrintStream;
  *     -summary my.summary \
  *     -o my.bed
  * 
- * + *

* would produce a BED file (my.bed) that looks like: - * + *

*

- *     20 10000000 10000864 CALLABLE
+ *     20 10000000 10000864 PASS
  *     20 10000865 10000985 POOR_MAPPING_QUALITY
- *     20 10000986 10001138 CALLABLE
+ *     20 10000986 10001138 PASS
  *     20 10001139 10001254 POOR_MAPPING_QUALITY
- *     20 10001255 10012255 CALLABLE
+ *     20 10001255 10012255 PASS
  *     20 10012256 10012259 POOR_MAPPING_QUALITY
- *     20 10012260 10012263 CALLABLE
+ *     20 10012260 10012263 PASS
  *     20 10012264 10012328 POOR_MAPPING_QUALITY
- *     20 10012329 10012550 CALLABLE
+ *     20 10012329 10012550 PASS
  *     20 10012551 10012551 LOW_COVERAGE
- *     20 10012552 10012554 CALLABLE
+ *     20 10012552 10012554 PASS
  *     20 10012555 10012557 LOW_COVERAGE
- *     20 10012558 10012558 CALLABLE
+ *     20 10012558 10012558 PASS
  *     et cetera...
  * 
* as well as a summary table that looks like: - * + *

*

  *                        state nBases
  *                        REF_N 0
- *                     CALLABLE 996046
+ *                     PASS 996046
  *                  NO_COVERAGE 121
  *                 LOW_COVERAGE 928
  *           EXCESSIVE_COVERAGE 0
@@ -139,21 +141,21 @@ public class CallableLociWalker extends LocusWalker minMappingQuality are treated as usable for variation detection, contributing to the CALLABLE
+     * Reads with MAPQ > minMappingQuality are treated as usable for variation detection, contributing to the PASS
      * state.
      */
     @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth.", required = false)
     byte minMappingQuality = 10;
 
     /**
-     * Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the CALLABLE state
+     * Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the PASS state
      */
     @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth.", required = false)
     byte minBaseQuality = 20;
 
     /**
      * If the number of QC+ bases (on reads with MAPQ > minMappingQuality and with base quality > minBaseQuality) exceeds this
-     * value and is less than maxDepth the site is considered CALLABLE.
+     * value and is less than maxDepth the site is considered PASS.
      */
     @Advanced
     @Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false)
@@ -191,7 +193,7 @@ public class CallableLociWalker extends LocusWalker= minMappingQuality && ( e.getQual() >= minBaseQuality || e.isDeletion() ) ) {
+                if (e.getMappingQual() >= minMappingQuality && (e.getQual() >= minBaseQuality || e.isDeletion())) {
                     QCDepth++;
                 }
             }
 
             //System.out.printf("%s rawdepth = %d QCDepth = %d lowMAPQ = %d%n", context.getLocation(), rawDepth, QCDepth, lowMAPQDepth);
-            if ( rawDepth == 0 ) {
+            if (rawDepth == 0) {
                 state = CalledState.NO_COVERAGE;
-            } else if ( rawDepth >= minDepthLowMAPQ && MathUtils.ratio( lowMAPQDepth, rawDepth ) >= maxLowMAPQFraction ) {
+            } else if (rawDepth >= minDepthLowMAPQ && MathUtils.ratio(lowMAPQDepth, rawDepth) >= maxLowMAPQFraction) {
                 state = CalledState.POOR_MAPPING_QUALITY;
-            } else if ( QCDepth < minDepth ) {
+            } else if (QCDepth < minDepth) {
                 state = CalledState.LOW_COVERAGE;
-            } else if ( rawDepth >= maxDepth && maxDepth != -1 ) {
+            } else if (rawDepth >= maxDepth && maxDepth != -1) {
                 state = CalledState.EXCESSIVE_COVERAGE;
             } else {
                 state = CalledState.CALLABLE;
             }
         }
 
-        return new CallableBaseState(getToolkit().getGenomeLocParser(),context.getLocation(), state);
+        return new CallableBaseState(getToolkit().getGenomeLocParser(), context.getLocation(), state);
     }
 
     @Override
@@ -328,15 +345,15 @@ public class CallableLociWalker extends LocusWalker
  * 

* [Long description of the walker] *

- * - * + *

+ *

*

Input

*

* [Description of the Input] *

- * + *

*

Output

*

* [Description of the Output] *

- * + *

*

Examples

*
  *    java
@@ -51,12 +73,13 @@ import java.util.TreeSet;
  * @since 2/1/12
  */
 @By(value = DataSource.READS)
-public class DiagnoseTargets extends LocusWalker {
+@PartitionBy(PartitionType.INTERVAL)
+public class DiagnoseTargets extends LocusWalker implements AnnotatorCompatibleWalker {
     @Input(fullName = "interval_track", shortName = "int", doc = "", required = true)
     private IntervalBinding intervalTrack = null;
 
-    @Output
-    private PrintStream out = System.out;
+    @Output(doc = "File to which variants should be written", required = true)
+    protected VCFWriter vcfWriter = null;
 
     @Argument(fullName = "expand_interval", shortName = "exp", doc = "", required = false)
     private int expandInterval = 50;
@@ -73,13 +96,13 @@ public class DiagnoseTargets extends LocusWalker {
     @Argument(fullName = "maximum_coverage", shortName = "maxcov", doc = "", required = false)
     private int maximumCoverage = 700;
 
-    private TreeSet intervalList = null;                     // The list of intervals of interest (plus expanded intervals if user wants them)
-    private HashMap intervalMap = null;  // interval => statistics
-    private Iterator intervalListIterator;                   // An iterator to go over all the intervals provided as we traverse the genome
-    private GenomeLoc currentInterval = null;                           // The "current" interval loaded and being filled with statistics
-    private IntervalStatistics currentIntervalStatistics = null;                 // The "current" interval loaded and being filled with statistics
-
-    private GenomeLocParser parser;                                     // just an object to allow us to create genome locs (for the expanded intervals)
+    private TreeSet intervalList = null;                                                                     // The list of intervals of interest (plus expanded intervals if user wants them)
+    private HashMap intervalMap = null;                                                  // interval => statistics
+    private Iterator intervalListIterator;                                                                   // An iterator to go over all the intervals provided as we traverse the genome
+    private GenomeLoc currentInterval = null;                                                                           // The "current" interval loaded
+    private IntervalStatistics currentIntervalStatistics = null;                                                        // The "current" interval being filled with statistics
+    private Set samples = null;                                                                                 // All the samples being processed
+    private GenomeLocParser parser;                                                                                     // just an object to allow us to create genome locs (for the expanded intervals)
 
     @Override
     public void initialize() {
@@ -88,38 +111,72 @@ public class DiagnoseTargets extends LocusWalker {
         if (intervalTrack == null)
             throw new UserException("This tool currently only works if you provide an interval track");
 
-        parser = new GenomeLocParser(getToolkit().getMasterSequenceDictionary());       // Important to initialize the parser before creating the intervals below
+        parser = new GenomeLocParser(getToolkit().getMasterSequenceDictionary());                                       // Important to initialize the parser before creating the intervals below
 
-        List originalList = intervalTrack.getIntervals(getToolkit());        // The original list of targets provided by the user that will be expanded or not depending on the options provided
+        List originalList = intervalTrack.getIntervals(getToolkit());                                        // The original list of targets provided by the user that will be expanded or not depending on the options provided
         intervalList = new TreeSet(new GenomeLocComparator());
-        intervalMap = new HashMap(originalList.size() * 2);
+        intervalMap = new HashMap();
         for (GenomeLoc interval : originalList)
-            addAndExpandIntervalToLists(interval);
+            intervalList.add(interval);
+        //addAndExpandIntervalToMap(interval);
 
         intervalListIterator = intervalList.iterator();
+
+        // get all of the unique sample names
+        samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
+
+        // initialize the header
+        Set headerInfo = getHeaderInfo();
+
+        vcfWriter.writeHeader(new VCFHeader(headerInfo, samples));
+    }
+
+    /**
+     * Gets the header lines for the VCF writer
+     *
+     * @return A set of VCF header lines
+     */
+    private Set getHeaderInfo() {
+        Set headerLines = new HashSet();
+
+        // INFO fields for overall data
+        headerLines.add(new VCFInfoHeaderLine("END", 1, VCFHeaderLineType.Integer, "Stop position of the interval"));
+        headerLines.add(new VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Total depth in the site. Sum of the depth of all pools"));
+        headerLines.add(new VCFInfoHeaderLine("AD", 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size."));
+        headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode"));
+
+        // FORMAT fields for each genotype
+        headerLines.add(new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Total depth in the site. Sum of the depth of all pools"));
+        headerLines.add(new VCFFormatHeaderLine("AD", 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size."));
+
+        // FILTER fields
+
+        for (CallableStatus stat : CallableStatus.values()) {
+            headerLines.add(new VCFHeaderLine(stat.name(), stat.description));
+        }
+
+        return headerLines;
     }
 
     @Override
     public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
         GenomeLoc refLocus = ref.getLocus();
-        while (currentInterval == null || currentInterval.isBefore(refLocus)) {
+        while (currentInterval == null || currentInterval.isBefore(refLocus)) {                                         // do this for first time and while currentInterval is behind current locus
             if (!intervalListIterator.hasNext())
                 return 0L;
 
+            if (currentInterval != null)
+                processIntervalStats(currentInterval, Allele.create(ref.getBase(), true));
+
             currentInterval = intervalListIterator.next();
+            addAndExpandIntervalToMap(currentInterval);
             currentIntervalStatistics = intervalMap.get(currentInterval);
         }
 
-        if (currentInterval.isPast(refLocus))
+        if (currentInterval.isPast(refLocus))                                                                           // skip if we are behind the current interval
             return 0L;
 
-        byte[] mappingQualities = context.getBasePileup().getMappingQuals();
-        byte[] baseQualities = context.getBasePileup().getQuals();
-        int coverage = context.getBasePileup().getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage();
-        int rawCoverage = context.size();
-
-        IntervalStatisticLocus locusData = new IntervalStatisticLocus(mappingQualities, baseQualities, coverage, rawCoverage);
-        currentIntervalStatistics.addLocus(refLocus, locusData);
+        currentIntervalStatistics.addLocus(context);                                                                    // Add current locus to stats
 
         return 1L;
     }
@@ -129,6 +186,13 @@ public class DiagnoseTargets extends LocusWalker {
         return 0L;
     }
 
+    /**
+     * Not sure what we are going to do here
+     *
+     * @param value result of the map.
+     * @param sum   accumulator for the reduce.
+     * @return a long
+     */
     @Override
     public Long reduce(Long value, Long sum) {
         return sum + value;
@@ -136,14 +200,25 @@ public class DiagnoseTargets extends LocusWalker {
 
     @Override
     public void onTraversalDone(Long result) {
-        super.onTraversalDone(result);
-        out.println("Interval\tCallStatus\tCOV\tAVG");
-        for (GenomeLoc interval : intervalList) {
-            IntervalStatistics stats = intervalMap.get(interval);
-            out.println(String.format("%s\t%s\t%d\t%f", interval, stats.callableStatus(), stats.totalCoverage(), stats.averageCoverage()));
-        }
+        for (GenomeLoc interval : intervalMap.keySet()) 
+            processIntervalStats(interval, Allele.create("
", true)); } + @Override + public RodBinding getSnpEffRodBinding() {return null;} + + @Override + public RodBinding getDbsnpRodBinding() {return null;} + + @Override + public List> getCompRodBindings() {return null;} + + @Override + public List> getResourceRodBindings() {return null;} + + @Override + public boolean alwaysAppendDbsnpId() {return false;} + private GenomeLoc createIntervalBefore(GenomeLoc interval) { int start = Math.max(interval.getStart() - expandInterval, 0); int stop = Math.max(interval.getStart() - 1, 0); @@ -157,16 +232,75 @@ public class DiagnoseTargets extends LocusWalker { return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop); } - private void addAndExpandIntervalToLists(GenomeLoc interval) { + /** + * Takes an interval and commits it to memory. + * It will expand it if so told by the -exp command line argument + * + * @param interval The new interval to process + */ + private void addAndExpandIntervalToMap(GenomeLoc interval) { if (expandInterval > 0) { GenomeLoc before = createIntervalBefore(interval); GenomeLoc after = createIntervalAfter(interval); intervalList.add(before); intervalList.add(after); - intervalMap.put(before, new IntervalStatistics(before, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); - intervalMap.put(after, new IntervalStatistics(after, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); + intervalMap.put(before, new IntervalStatistics(samples, before, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); + intervalMap.put(after, new IntervalStatistics(samples, after, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); } - intervalList.add(interval); - intervalMap.put(interval, new IntervalStatistics(interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); + if (!intervalList.contains(interval)) + intervalList.add(interval); + intervalMap.put(interval, new IntervalStatistics(samples, interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); + } + + /** + * Takes the interval, finds it in the stash, prints it to the VCF, and removes it + * + * @param interval The interval in memory that you want to write out and clear + * @param allele the allele + */ + private void processIntervalStats(GenomeLoc interval, Allele allele) { + IntervalStatistics stats = intervalMap.get(interval); + + List alleles = new ArrayList(); + Map attributes = new HashMap(); + ArrayList genotypes = new ArrayList(); + + alleles.add(allele); + VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles); + + vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF + vcb.filters(statusesToStrings(stats.callableStatuses())); + + attributes.put(VCFConstants.END_KEY, interval.getStop()); + attributes.put(VCFConstants.DEPTH_KEY, stats.totalCoverage()); + attributes.put("AV", stats.averageCoverage()); + + vcb = vcb.attributes(attributes); + + for (String sample : samples) { + Map infos = new HashMap(); + infos.put("DP", stats.getSample(sample).totalCoverage()); + infos.put("AV", stats.getSample(sample).averageCoverage()); + + Set filters = new HashSet(); + filters.addAll(statusesToStrings(stats.getSample(sample).getCallableStatuses())); + + + genotypes.add(new Genotype(sample, alleles, VariantContext.NO_LOG10_PERROR, filters, infos, false)); + } + vcb = vcb.genotypes(genotypes); + + vcfWriter.add(vcb.make()); + + intervalMap.remove(interval); + } + + private static Set statusesToStrings(Set statuses) { + Set output = new HashSet(statuses.size()); + + for (CallableStatus status : statuses) + output.add(status.name()); + + return output; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatisticLocus.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatisticLocus.java deleted file mode 100644 index 5620c3902..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatisticLocus.java +++ /dev/null @@ -1,34 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; - -/** - * The definition of a locus for the DiagnoseTargets walker statistics calculation - * - * @author Mauricio Carneiro - * @since 2/3/12 - */ -class IntervalStatisticLocus { - private final byte[] mappingQuality; - private final byte[] baseQuality; - private final int coverage; - private final int rawCoverage; - - public IntervalStatisticLocus(byte[] mappingQuality, byte[] baseQuality, int coverage, int rawCoverage) { - this.mappingQuality = mappingQuality; - this.baseQuality = baseQuality; - this.coverage = coverage; - this.rawCoverage = rawCoverage; - } - - public IntervalStatisticLocus() { - this(new byte[1], new byte[1], 0, 0); - } - - public int getCoverage() { - return coverage; - } - - public int getRawCoverage() { - return rawCoverage; - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java index 8ee5f76fb..75f56808f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java @@ -1,44 +1,62 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; -/** - * Short one line description of the walker. - * - * @author Mauricio Carneiro - * @since 2/1/12 - */ -class IntervalStatistics { +public class IntervalStatistics { + + private final Map samples; private final GenomeLoc interval; - private final ArrayList loci; - private final int minimumCoverageThreshold; - private final int maximumCoverageThreshold; - private final int minimumMappingQuality; - private final int minimumBaseQuality; + private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet) - private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet) - private IntervalStatistics(GenomeLoc interval, ArrayList loci, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { + public IntervalStatistics(Set samples, GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { this.interval = interval; - this.loci = loci; - this.minimumCoverageThreshold = minimumCoverageThreshold; - this.maximumCoverageThreshold = maximumCoverageThreshold; - this.minimumMappingQuality = minimumMappingQuality; - this.minimumBaseQuality = minimumBaseQuality; + this.samples = new HashMap(samples.size()); + for (String sample : samples) + this.samples.put(sample, new SampleStatistics(interval, minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality)); } - public IntervalStatistics(GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { - this(interval, new ArrayList(interval.size()), minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality); + public SampleStatistics getSample(String sample) { + return samples.get(sample); + } - // Initialize every loci (this way we don't have to worry about non-existent loci in the object - for (int i = 0; i < interval.size(); i++) - this.loci.add(i, new IntervalStatisticLocus()); + public void addLocus(AlignmentContext context) { + ReadBackedPileup pileup = context.getBasePileup(); + for (String sample : samples.keySet()) + getSample(sample).addLocus(context.getLocation(), pileup.getPileupForSample(sample)); } public long totalCoverage() { @@ -50,73 +68,27 @@ class IntervalStatistics { public double averageCoverage() { if (preComputedTotalCoverage < 0) calculateTotalCoverage(); - return (double) preComputedTotalCoverage / loci.size(); - } - - /** - * Calculates the callable status of the entire interval - * - * @return the callable status of the entire interval - */ - public CallableStatus callableStatus() { - long max = -1; - CallableStatus maxCallableStatus = null; - HashMap statusCounts = new HashMap(CallableStatus.values().length); - - // initialize the statusCounts with all callable states - for (CallableStatus key : CallableStatus.values()) - statusCounts.put(key, 0); - - // calculate the callable status for each locus - for (int i = 0; i < loci.size(); i++) { - CallableStatus status = callableStatus(i); - int count = statusCounts.get(status) + 1; - statusCounts.put(status, count); - - if (count > max) { - max = count; - maxCallableStatus = status; - } - } - - return maxCallableStatus; - } - - public void addLocus(GenomeLoc locus, IntervalStatisticLocus locusData) { - if (!interval.containsP(locus)) - throw new ReviewedStingException(String.format("Locus %s is not part of the Interval", locus)); - - int locusIndex = locus.getStart() - interval.getStart(); - - loci.add(locusIndex, locusData); - } - - /** - * returns the callable status of this locus without taking the reference base into account. - * - * @param locusIndex location in the genome to inquire (only one locus) - * @return the callable status of a locus - */ - private CallableStatus callableStatus(int locusIndex) { - if (loci.get(locusIndex).getCoverage() > maximumCoverageThreshold) - return CallableStatus.EXCESSIVE_COVERAGE; - - if (loci.get(locusIndex).getCoverage() >= minimumCoverageThreshold) - return CallableStatus.CALLABLE; - - if (loci.get(locusIndex).getRawCoverage() >= minimumCoverageThreshold) - return CallableStatus.POOR_QUALITY; - - if (loci.get(locusIndex).getRawCoverage() > 0) - return CallableStatus.LOW_COVERAGE; - - return CallableStatus.NO_COVERAGE; + return (double) preComputedTotalCoverage / interval.size(); } private void calculateTotalCoverage() { preComputedTotalCoverage = 0; - for (IntervalStatisticLocus locus : loci) - preComputedTotalCoverage += locus.getCoverage(); + for (SampleStatistics sample : samples.values()) + preComputedTotalCoverage += sample.totalCoverage(); } + /** + * Return the Callable statuses for the interval as a whole + * todo -- add a voting system for sample flags and add interval specific statuses + * + * @return the callable status(es) for the whole interval + */ + public Set callableStatuses() { + Set output = new HashSet(); + + for (SampleStatistics sample : samples.values()) + output.addAll(sample.getCallableStatuses()); + + return output; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java new file mode 100644 index 000000000..237ca1b1c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; + +import java.util.HashSet; +import java.util.Set; + +public class LocusStatistics { + final int coverage; + final int rawCoverage; + + public LocusStatistics() { + this.coverage = 0; + this.rawCoverage = 0; + } + + public LocusStatistics(int coverage, int rawCoverage) { + this.coverage = coverage; + this.rawCoverage = rawCoverage; + } + + public int getCoverage() { + return coverage; + } + + public int getRawCoverage() { + return rawCoverage; + } + + /** + * Generates all applicable statuses from the coverages in this locus + * + * @param minimumCoverageThreshold the minimum threshold for determining low coverage/poor quality + * @param maximumCoverageThreshold the maximum threshold for determining excessive coverage + * @return a set of all statuses that apply + */ + public Set callableStatuses(int minimumCoverageThreshold, int maximumCoverageThreshold) { + Set output = new HashSet(); + + // if too much coverage + if (getCoverage() > maximumCoverageThreshold) + output.add(CallableStatus.EXCESSIVE_COVERAGE); + + // if not enough coverage + if (getCoverage() < minimumCoverageThreshold) { + // was there a lot of low Qual coverage? + if (getRawCoverage() >= minimumCoverageThreshold) + output.add(CallableStatus.POOR_QUALITY); + // no? + else { + // is there any coverage? + if (getRawCoverage() > 0) + output.add(CallableStatus.LOW_COVERAGE); + else + output.add(CallableStatus.NO_COVERAGE); + } + } + + return output; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java new file mode 100644 index 000000000..9e4993853 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.util.*; + +/** + * Short one line description of the walker. + * + * @author Mauricio Carneiro + * @since 2/1/12 + */ +class SampleStatistics { + private final GenomeLoc interval; + private final ArrayList loci; + + private final int minimumCoverageThreshold; + private final int maximumCoverageThreshold; + private final int minimumMappingQuality; + private final int minimumBaseQuality; + + private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet) + + private SampleStatistics(GenomeLoc interval, ArrayList loci, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { + this.interval = interval; + this.loci = loci; + this.minimumCoverageThreshold = minimumCoverageThreshold; + this.maximumCoverageThreshold = maximumCoverageThreshold; + this.minimumMappingQuality = minimumMappingQuality; + this.minimumBaseQuality = minimumBaseQuality; + } + + public SampleStatistics(GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { + this(interval, new ArrayList(interval.size()), minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality); + + // Initialize every loci (this way we don't have to worry about non-existent loci in the object + for (int i = 0; i < interval.size(); i++) + this.loci.add(i, new LocusStatistics()); + + } + + public long totalCoverage() { + if (preComputedTotalCoverage < 0) + calculateTotalCoverage(); + return preComputedTotalCoverage; + } + + public double averageCoverage() { + if (preComputedTotalCoverage < 0) + calculateTotalCoverage(); + return (double) preComputedTotalCoverage / loci.size(); + } + + /** + * Calculates the callable statuses of the entire interval + * + * @return the callable statuses of the entire interval + */ + public Set getCallableStatuses() { + + Map totals = new HashMap(CallableStatus.values().length); + + // initialize map + for (CallableStatus status : CallableStatus.values()) + totals.put(status, 0); + + // sum up all the callable statuses for each locus + for (int i = 0; i < interval.size(); i++) { + for (CallableStatus status : callableStatus(i)) { + int count = totals.get(status); + + totals.put(status, count + 1); + } + } + + + Set output = new HashSet(); + + // double to avoid type casting + double intervalSize = interval.size(); + + double coverageStatusThreshold = 0.20; + if ((totals.get(CallableStatus.NO_COVERAGE) / intervalSize) > coverageStatusThreshold) + output.add(CallableStatus.NO_COVERAGE); + + if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) > coverageStatusThreshold) + output.add(CallableStatus.LOW_COVERAGE); + + double excessiveCoverageThreshold = 0.20; + if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) > excessiveCoverageThreshold) + output.add(CallableStatus.EXCESSIVE_COVERAGE); + + double qualityStatusThreshold = 0.50; + if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) > qualityStatusThreshold) + output.add(CallableStatus.POOR_QUALITY); + + if (totals.get(CallableStatus.REF_N) > 0) + output.add(CallableStatus.REF_N); + + if (output.isEmpty()) { + output.add(CallableStatus.PASS); + } + return output; + } + + /** + * Adds a locus to the interval wide stats + * + * @param locus The locus given as a GenomeLoc + * @param pileup The pileup of that locus + */ + public void addLocus(GenomeLoc locus, ReadBackedPileup pileup) { + if (!interval.containsP(locus)) + throw new ReviewedStingException(String.format("Locus %s is not part of the Interval", locus)); + + // a null pileup means there nothing ot add + if (pileup != null) { + + int locusIndex = locus.getStart() - interval.getStart(); + + int rawCoverage = pileup.depthOfCoverage(); + int coverage = pileup.getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage(); + + LocusStatistics locusData = new LocusStatistics(coverage, rawCoverage); + + loci.add(locusIndex, locusData); + } + } + + /** + * returns the callable status of this locus without taking the reference base into account. + * + * @param locusIndex location in the genome to inquire (only one locus) + * @return the callable status of a locus + */ + private Set callableStatus(int locusIndex) { + LocusStatistics locus = loci.get(locusIndex); + + return locus.callableStatuses(minimumCoverageThreshold, maximumCoverageThreshold); + } + + + private void calculateTotalCoverage() { + preComputedTotalCoverage = 0; + for (LocusStatistics locus : loci) + preComputedTotalCoverage += locus.getCoverage(); + } + +}