CoverageStatistics now fully implements DepthOfCoverage functionality, including the ability to print base counts. Minor changes to BaseUtils to support 'N' and 'D' characters. PickSequenomProbes now has the option to omit the whole window from the probe name (e.g. you just see PROJECT_NAME|CHR_POS and not PROJECT_NAME|CHR_POS_CHR_PROBESTART-PROBEND). Full integration tests for CoverageStatistics are forthcoming.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2924 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-03-04 15:00:02 +00:00
parent 023654696e
commit 6759acbdef
5 changed files with 181 additions and 38 deletions

View File

@ -26,6 +26,9 @@ public class PickSequenomProbes extends RefWalker<String, String> {
protected String SNP_MASK = null;
@Argument(required=false, shortName="project_id",doc="If specified, all probenames will be prepended with 'project_id|'")
String project_id = null;
@Argument(required = false, shortName="omitWindow", doc = "If specified, the window appender will be omitted from the design files (e.g. \"_chr:start-stop\")")
boolean omitWindow = false;
private byte [] maskFlags = new byte[401];
private LocationAwareSeekableRODIterator snpMaskIterator=null;
@ -116,11 +119,14 @@ public class PickSequenomProbes extends RefWalker<String, String> {
assay_id.append('|');
}
assay_id.append(context.getLocation().toString().replace(':','_'));
assay_id.append('_');
if ( variant.isInsertion() ) assay_id.append("gI_");
else if ( variant.isDeletion()) assay_id.append("gD_");
if ( variant.isInsertion() ) assay_id.append("_gI");
else if ( variant.isDeletion()) assay_id.append("_gD");
assay_id.append(ref.getWindow().toString().replace(':', '_'));
if ( ! omitWindow ) {
assay_id.append("_");
assay_id.append(ref.getWindow().toString().replace(':', '_'));
}
return assay_id.toString() + "\t" + assay_sequence + "\n";
}

View File

@ -8,6 +8,8 @@ import org.broadinstitute.sting.gatk.walkers.By;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.gatk.walkers.coverage.DepthOfCoverageWalker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.StingException;
@ -33,15 +35,16 @@ import java.util.*;
// todo [DONE] -- add ability to print out the calculated bins and quit (for pre-analysis bin size selection)
// todo [DONE] -- refactor the location of the ALL_SAMPLE metrics [keep out of the per-sample HashMaps]
// todo [DONE] -- per locus output through -o
// todo -- support for using read groups instead of samples
// todo -- coverage without deletions
// todo -- base counts
// todo -- support for aggregate (ignoring sample IDs) granular histograms; maybe n*[start,stop], bins*sqrt(n)
// todo [DONE] -- support for using read groups instead of samples
// todo [DONE] -- coverage including deletions
// todo [DONE] -- base counts
// todo -- cache the map from sample names to means in the print functions, rather than regenerating each time
// todo -- support for granular histograms for total depth; maybe n*[start,stop], bins*sqrt(n)
// todo -- alter logarithmic scaling to spread out bins more
// todo -- allow for user to set linear binning (default is logarithmic)
// todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now
@By(DataSource.REFERENCE)
public class CoverageStatistics extends LocusWalker<Map<String,Integer>, DepthOfCoverageStats> implements TreeReducible<DepthOfCoverageStats> {
public class CoverageStatistics extends LocusWalker<Map<String,int[]>, DepthOfCoverageStats> implements TreeReducible<DepthOfCoverageStats> {
@Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false)
int start = 1;
@Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false)
@ -52,6 +55,8 @@ public class CoverageStatistics extends LocusWalker<Map<String,Integer>, DepthOf
byte minMappingQuality = 50;
@Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to 20.", required = false)
byte minBaseQuality = 20;
@Argument(fullName = "printBaseCounts", shortName = "baseCounts", doc = "Will add base counts to per-locus output.", required = false)
boolean printBaseCounts = false;
@Argument(fullName = "omitLocusTable", shortName = "omitLocus", doc = "Will not calculate the per-sample per-depth counts of loci, which should result in speedup", required = false)
boolean omitLocusTable = false;
@Argument(fullName = "omitIntervalStatistics", shortName = "omitIntervals", doc = "Will omit the per-interval statistics section, which should result in speedup", required = false)
@ -64,11 +69,17 @@ public class CoverageStatistics extends LocusWalker<Map<String,Integer>, DepthOf
boolean omitSampleSummary = false;
@Argument(fullName = "useReadGroups", shortName = "rg", doc = "Split depth of coverage output by read group rather than by sample", required = false)
boolean useReadGroup = false;
@Argument(fullName = "includeDeletions", shortName = "dels", doc = "Include information on deletions", required = false)
boolean includeDeletions = false;
@Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false)
boolean ignoreDeletionSites = false;
////////////////////////////////////////////////////////////////////////////////////
// STANDARD WALKER METHODS
////////////////////////////////////////////////////////////////////////////////////
public boolean includeReadsWithDeletionAtLoci() { return includeDeletions && ! ignoreDeletionSites; }
public void initialize() {
if ( printBinEndpointsAndExit ) {
@ -92,6 +103,9 @@ public class CoverageStatistics extends LocusWalker<Map<String,Integer>, DepthOf
for ( String s : allSamples) {
out.printf("\t%s_%s","Depth_for",s);
if ( printBaseCounts ) {
out.printf("\t%s_%s",s,"_base_counts");
}
}
out.printf("%n");
@ -137,45 +151,33 @@ public class CoverageStatistics extends LocusWalker<Map<String,Integer>, DepthOf
stats.initializeLocusCounts();
}
if ( includeDeletions ) {
stats.initializeDeletions();
}
return stats;
}
public Map<String,Integer> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
Map<String,StratifiedAlignmentContext> contexts;
if ( useReadGroup ) {
contexts = StratifiedAlignmentContext.splitContextByReadGroup(context.getBasePileup());
} else {
contexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup());
}
HashMap<String,Integer> depthBySample = new HashMap<String,Integer>();
for ( String sample : contexts.keySet() ) {
AlignmentContext sampleContext = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
int properDepth = 0;
for ( PileupElement e : sampleContext.getBasePileup() ) {
if ( e.getQual() >= minBaseQuality && e.getMappingQual() >= minMappingQuality ) {
properDepth++;
}
}
depthBySample.put(sample,properDepth);
}
public Map<String,int[]> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( ! omitDepthOutput ) {
out.printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives)
}
return depthBySample;
Map<String,int[]> countsBySample = CoverageUtils.getBaseCountsBySample(context,minMappingQuality,minBaseQuality,
useReadGroup ? CoverageUtils.PartitionType.BY_READ_GROUP : CoverageUtils.PartitionType.BY_SAMPLE);
return countsBySample;
}
public DepthOfCoverageStats reduce(Map<String,Integer> thisMap, DepthOfCoverageStats prevReduce) {
public DepthOfCoverageStats reduce(Map<String,int[]> thisMap, DepthOfCoverageStats prevReduce) {
prevReduce.update(thisMap);
if ( ! omitDepthOutput ) {
printDepths(out,thisMap, prevReduce.getAllSamples());
// this is an additional iteration through thisMap, plus dealing with IO, so should be much slower without
// turning on omit
}
return prevReduce;
}
@ -216,6 +218,9 @@ public class CoverageStatistics extends LocusWalker<Map<String,Integer>, DepthOf
summaryHeader.append("\taverage_coverage");
for ( String s : firstStats.getAllSamples() ) {
summaryHeader.append("\t");
summaryHeader.append(s);
summaryHeader.append("_total_cvg");
summaryHeader.append("\t");
summaryHeader.append(s);
summaryHeader.append("_mean_cvg");
@ -265,11 +270,12 @@ public class CoverageStatistics extends LocusWalker<Map<String,Integer>, DepthOf
targetSummary.append(intervalStats.first.toString());
targetSummary.append("\t");
targetSummary.append(stats.getTotalCoverage());
// TODO: change this to use the raw counts directly rather than re-estimating from mean*nloci
targetSummary.append("\t");
targetSummary.append(stats.getTotalMeanCoverage());
targetSummary.append(String.format("%.2f",stats.getTotalMeanCoverage()));
for ( String s : stats.getAllSamples() ) {
targetSummary.append("\t");
targetSummary.append(stats.getTotals().get(s));
targetSummary.append("\t");
targetSummary.append(String.format("%.2f", stats.getMeans().get(s)));
targetSummary.append("\t");
@ -344,7 +350,7 @@ public class CoverageStatistics extends LocusWalker<Map<String,Integer>, DepthOf
///////////////////
// OPTIONAL OUTPUTS
//////////////////
if ( ! omitSampleSummary ) {
logger.info("Printing sample summary");
File summaryStatisticsFile = deriveFromStream("summary_statistics");
@ -484,18 +490,52 @@ public class CoverageStatistics extends LocusWalker<Map<String,Integer>, DepthOf
return bin;
}
private void printDepths(PrintStream stream, Map<String,Integer> depthBySample, Set<String> allSamples) {
private void printDepths(PrintStream stream, Map<String,int[]> countsBySample, Set<String> allSamples) {
// get the depths per sample and build up the output string while tabulating total and average coverage
// todo -- update me to deal with base counts/indels
StringBuilder perSampleOutput = new StringBuilder();
int tDepth = 0;
for ( String s : allSamples ) {
perSampleOutput.append("\t");
int dp = depthBySample.keySet().contains(s) ? depthBySample.get(s) : 0;
long dp = countsBySample.keySet().contains(s) ? sumArray(countsBySample.get(s)) : 0;
perSampleOutput.append(dp);
if ( printBaseCounts ) {
perSampleOutput.append("\t");
perSampleOutput.append(baseCounts(countsBySample.get(s)));
}
tDepth += dp;
}
// remember -- genome locus was printed in map()
stream.printf("\t%d\t%.2f\t%s%n",tDepth,( (double) tDepth/ (double) allSamples.size()), perSampleOutput);
}
private long sumArray(int[] array) {
long i = 0;
for ( int j : array ) {
i += j;
}
return i;
}
private String baseCounts(int[] counts) {
if ( counts == null ) {
counts = new int[6];
}
StringBuilder s = new StringBuilder();
int nbases = 0;
for ( char b : BaseUtils.EXTENDED_BASES ) {
nbases++;
if ( includeDeletions || b != 'D' ) {
s.append(b);
s.append(":");
s.append(counts[BaseUtils.extendedBaseToBaseIndex(b)]);
if ( nbases < 6 ) {
s.append(" ");
}
}
}
return s.toString();
}
}

View File

@ -0,0 +1,53 @@
package org.broadinstitute.sting.oneoffprojects.walkers.coverage;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import java.util.HashMap;
import java.util.Map;
/**
 * Utility methods for tabulating per-sample (or per-read-group) base counts
 * from an alignment pileup.
 *
 * @author chartl
 * @since Mar 3, 2010
 */
public class CoverageUtils {
public enum PartitionType { BY_READ_GROUP, BY_SAMPLE }
public static Map<String,int[]> getBaseCountsBySample(AlignmentContext context, int minMapQ, int minBaseQ, PartitionType type) {
Map<String,int[]> samplesToCounts = new HashMap<String,int[]>();
for (PileupElement e : context.getBasePileup()) {
if ( e.getMappingQual() >= minMapQ && ( e.getQual() >= minBaseQ || e.isDeletion() ) ) {
String sample = type == PartitionType.BY_SAMPLE ? e.getRead().getReadGroup().getSample() : e.getRead().getReadGroup().getReadGroupId();
if ( samplesToCounts.keySet().contains(sample) ) {
updateCounts(samplesToCounts.get(sample),e);
} else {
samplesToCounts.put(sample,new int[6]);
updateCounts(samplesToCounts.get(sample),e);
}
}
}
return samplesToCounts;
}
private static void updateCounts(int[] counts, PileupElement e) {
if ( e.isDeletion() ) {
counts[BaseUtils.DELETION_INDEX]++;
} else if ( BaseUtils.basesAreEqual((byte) 'N', e.getBase()) ) {
counts[BaseUtils.NO_CALL_INDEX]++;
} else {
try {
counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())]++;
} catch (ArrayIndexOutOfBoundsException exc) {
throw new StingException("Expected a simple base, but actually received"+(char)e.getBase());
}
}
}
}

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.oneoffprojects.walkers.coverage;
import org.broadinstitute.sting.utils.BaseUtils;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
@ -28,6 +30,7 @@ public class DepthOfCoverageStats {
private boolean tabulateLocusCounts = false;
private long nLoci; // number of loci seen
private long totalDepthOfCoverage;
private boolean includeDeletions = false;
////////////////////////////////////////////////////////////////////////////////////
// TEMPORARY DATA ( not worth re-instantiating )
@ -108,11 +111,15 @@ public class DepthOfCoverageStats {
tabulateLocusCounts = true;
}
public void initializeDeletions() {
includeDeletions = true;
}
////////////////////////////////////////////////////////////////////////////////////
// UPDATE METHODS
////////////////////////////////////////////////////////////////////////////////////
public void update(Map<String,Integer> depthBySample) {
public void updateDepths(Map<String,Integer> depthBySample) {
int b;
for ( String sample : granularHistogramBySample.keySet() ) {
if ( depthBySample.containsKey(sample) ) {
@ -135,6 +142,24 @@ public class DepthOfCoverageStats {
totalLocusDepth = 0;
}
public void update(Map<String,int[]> countsBySample) {
// todo -- do we want to do anything special regarding base count or deletion statistics?
HashMap<String,Integer> depthBySample = new HashMap<String,Integer>();
// todo -- needs fixing with advent of new baseutils functionality using ENUMS and handling N,D
for ( String s : countsBySample.keySet() ) {
int total = 0;
int[] counts = countsBySample.get(s);
for ( char base : BaseUtils.EXTENDED_BASES ) {
if ( includeDeletions || ! ( base == 'D') ) { // note basesAreEqual assigns TRUE to (N,D) as both have simple index -1
total += counts[BaseUtils.extendedBaseToBaseIndex(base)];
}
}
depthBySample.put(s,total);
}
this.updateDepths(depthBySample);
}
private int updateSample(String sample, int depth) {
totalCoverages.put(sample,totalCoverages.get(sample)+depth);
@ -229,6 +254,10 @@ public class DepthOfCoverageStats {
return means;
}
public Map<String,Long> getTotals() {
return totalCoverages;
}
public long getTotalLoci() {
return nLoci;
}

View File

@ -9,6 +9,10 @@ import java.util.Random;
*/
public class BaseUtils {
public final static char[] BASES = { 'A', 'C', 'G', 'T' };
public final static char[] EXTENDED_BASES = { 'A', 'C', 'G', 'T', 'N', 'D' };
// todo -- fix me (enums?)
public static final byte DELETION_INDEX = 4;
public static final byte NO_CALL_INDEX = 5; // (this is 'N')
/// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or
// a pyrimidine to another pyrimidine nucleotide (C <-> T).
@ -135,6 +139,17 @@ public class BaseUtils {
}
}
static public int extendedBaseToBaseIndex(char base) {
switch (base) {
case 'd':
case 'D': return DELETION_INDEX;
case 'n':
case 'N': return NO_CALL_INDEX;
default: return simpleBaseToBaseIndex(base);
}
}
static public int simpleBaseToBaseIndex(byte base) {
return simpleBaseToBaseIndex((char)base);
}