From 6759acbdefe678a03d42cc3cb52596238eb0ca09 Mon Sep 17 00:00:00 2001 From: chartl Date: Thu, 4 Mar 2010 15:00:02 +0000 Subject: [PATCH] Coverage statistics now fully implements DepthOfCoverage functionality, including the ability to print base counts. Minor changes to BaseUtils to support 'N' and 'D' characters. PickSequenomProbes now has the option to not print the whole window as part of the probe name (e.g. you just see PROJECT_NAME|CHR_POS and not PROJECT_NAME|CHR_POS_CHR_PROBESTART-PROBEND). Full integration tests for CoverageStatistics are forthcoming. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2924 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/fasta/PickSequenomProbes.java | 14 ++- .../walkers/coverage/CoverageStatistics.java | 106 ++++++++++++------ .../walkers/coverage/CoverageUtils.java | 53 +++++++++ .../coverage/DepthOfCoverageStats.java | 31 ++++- .../broadinstitute/sting/utils/BaseUtils.java | 15 +++ 5 files changed, 181 insertions(+), 38 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageUtils.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/fasta/PickSequenomProbes.java b/java/src/org/broadinstitute/sting/gatk/walkers/fasta/PickSequenomProbes.java index 57de80817..b7d8137bc 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/fasta/PickSequenomProbes.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/fasta/PickSequenomProbes.java @@ -26,6 +26,9 @@ public class PickSequenomProbes extends RefWalker { protected String SNP_MASK = null; @Argument(required=false, shortName="project_id",doc="If specified, all probenames will be prepended with 'project_id|'") String project_id = null; + @Argument(required = false, shortName="omitWindow", doc = "If specified, the window appender will be omitted from the design files (e.g. \"_chr:start-stop\")") + boolean omitWindow = false; + private byte [] maskFlags = new byte[401]; private LocationAwareSeekableRODIterator snpMaskIterator=null; @@ -116,11 +119,14 @@ public class PickSequenomProbes extends RefWalker { assay_id.append('|'); } assay_id.append(context.getLocation().toString().replace(':','_')); - assay_id.append('_'); - if ( variant.isInsertion() ) assay_id.append("gI_"); - else if ( variant.isDeletion()) assay_id.append("gD_"); + if ( variant.isInsertion() ) assay_id.append("_gI"); + else if ( variant.isDeletion()) assay_id.append("_gD"); - assay_id.append(ref.getWindow().toString().replace(':', '_')); + if ( ! omitWindow ) { + assay_id.append("_"); + assay_id.append(ref.getWindow().toString().replace(':', '_')); + } + return assay_id.toString() + "\t" + assay_sequence + "\n"; } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatistics.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatistics.java index 6340a4f25..403e7e6b8 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatistics.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatistics.java @@ -8,6 +8,8 @@ import org.broadinstitute.sting.gatk.walkers.By; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.gatk.walkers.coverage.DepthOfCoverageWalker; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.StingException; @@ -33,15 +35,16 @@ import java.util.*; // todo [DONE] -- add ability to print out the calculated bins and quit (for pre-analysis bin size selection) // todo [DONE] -- refactor the location of the ALL_SAMPLE metrics [keep out of the per-sample HashMaps] // todo [DONE] -- per locus output through -o -// todo -- support for using read groups instead of samples -// todo -- coverage without deletions -// todo -- base counts -// todo -- support for aggregate (ignoring sample IDs) granular histograms; maybe n*[start,stop], bins*sqrt(n) +// todo [DONE] -- support for using read groups instead of samples +// todo [DONE] -- coverage including deletions +// todo [DONE] -- base counts +// todo -- cache the map from sample names to means in the print functions, rather than regenerating each time +// todo -- support for granular histograms for total depth; maybe n*[start,stop], bins*sqrt(n) // todo -- alter logarithmic scaling to spread out bins more // todo -- allow for user to set linear binning (default is logarithmic) // todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now @By(DataSource.REFERENCE) -public class CoverageStatistics extends LocusWalker, DepthOfCoverageStats> implements TreeReducible { +public class CoverageStatistics extends LocusWalker, DepthOfCoverageStats> implements TreeReducible { @Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false) int start = 1; @Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false) @@ -52,6 +55,8 @@ public class CoverageStatistics extends LocusWalker, DepthOf byte minMappingQuality = 50; @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to 20.", required = false) byte minBaseQuality = 20; + @Argument(fullName = "printBaseCounts", shortName = "baseCounts", doc = "Will add base counts to per-locus output.", required = false) + boolean printBaseCounts = false; @Argument(fullName = "omitLocusTable", shortName = "omitLocus", doc = "Will not calculate the per-sample per-depth counts of loci, which should result in speedup", required = false) boolean omitLocusTable = false; @Argument(fullName = "omitIntervalStatistics", shortName = "omitIntervals", doc = "Will omit the per-interval statistics section, which should result in speedup", required = false) @@ -64,11 +69,17 @@ public class CoverageStatistics extends LocusWalker, DepthOf boolean omitSampleSummary = false; @Argument(fullName = "useReadGroups", shortName = "rg", doc = "Split depth of coverage output by read group rather than by sample", required = false) boolean useReadGroup = false; + @Argument(fullName = "includeDeletions", shortName = "dels", doc = "Include information on deletions", required = false) + boolean includeDeletions = false; + @Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false) + boolean ignoreDeletionSites = false; //////////////////////////////////////////////////////////////////////////////////// // STANDARD WALKER METHODS //////////////////////////////////////////////////////////////////////////////////// + public boolean includeReadsWithDeletionAtLoci() { return includeDeletions && ! ignoreDeletionSites; } + public void initialize() { if ( printBinEndpointsAndExit ) { @@ -92,6 +103,9 @@ public class CoverageStatistics extends LocusWalker, DepthOf for ( String s : allSamples) { out.printf("\t%s_%s","Depth_for",s); + if ( printBaseCounts ) { + out.printf("\t%s_%s",s,"_base_counts"); + } } out.printf("%n"); @@ -137,45 +151,33 @@ public class CoverageStatistics extends LocusWalker, DepthOf stats.initializeLocusCounts(); } + if ( includeDeletions ) { + stats.initializeDeletions(); + } + return stats; } - public Map map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - Map contexts; - if ( useReadGroup ) { - contexts = StratifiedAlignmentContext.splitContextByReadGroup(context.getBasePileup()); - } else { - contexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup()); - } - - HashMap depthBySample = new HashMap(); - - for ( String sample : contexts.keySet() ) { - AlignmentContext sampleContext = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE); - int properDepth = 0; - for ( PileupElement e : sampleContext.getBasePileup() ) { - if ( e.getQual() >= minBaseQuality && e.getMappingQual() >= minMappingQuality ) { - properDepth++; - } - } - - depthBySample.put(sample,properDepth); - } + public Map map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( ! omitDepthOutput ) { out.printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives) } - return depthBySample; + Map countsBySample = CoverageUtils.getBaseCountsBySample(context,minMappingQuality,minBaseQuality, + useReadGroup ? CoverageUtils.PartitionType.BY_READ_GROUP : CoverageUtils.PartitionType.BY_SAMPLE); + + return countsBySample; } - public DepthOfCoverageStats reduce(Map thisMap, DepthOfCoverageStats prevReduce) { + public DepthOfCoverageStats reduce(Map thisMap, DepthOfCoverageStats prevReduce) { prevReduce.update(thisMap); if ( ! omitDepthOutput ) { printDepths(out,thisMap, prevReduce.getAllSamples()); // this is an additional iteration through thisMap, plus dealing with IO, so should be much slower without // turning on omit } + return prevReduce; } @@ -216,6 +218,9 @@ public class CoverageStatistics extends LocusWalker, DepthOf summaryHeader.append("\taverage_coverage"); for ( String s : firstStats.getAllSamples() ) { + summaryHeader.append("\t"); + summaryHeader.append(s); + summaryHeader.append("_total_cvg"); summaryHeader.append("\t"); summaryHeader.append(s); summaryHeader.append("_mean_cvg"); @@ -265,11 +270,12 @@ public class CoverageStatistics extends LocusWalker, DepthOf targetSummary.append(intervalStats.first.toString()); targetSummary.append("\t"); targetSummary.append(stats.getTotalCoverage()); - // TODO: change this to use the raw counts directly rather than re-estimating from mean*nloci targetSummary.append("\t"); - targetSummary.append(stats.getTotalMeanCoverage()); + targetSummary.append(String.format("%.2f",stats.getTotalMeanCoverage())); for ( String s : stats.getAllSamples() ) { + targetSummary.append("\t"); + targetSummary.append(stats.getTotals().get(s)); targetSummary.append("\t"); targetSummary.append(String.format("%.2f", stats.getMeans().get(s))); targetSummary.append("\t"); @@ -344,7 +350,7 @@ public class CoverageStatistics extends LocusWalker, DepthOf /////////////////// // OPTIONAL OUTPUTS ////////////////// - + if ( ! omitSampleSummary ) { logger.info("Printing sample summary"); File summaryStatisticsFile = deriveFromStream("summary_statistics"); @@ -484,18 +490,52 @@ public class CoverageStatistics extends LocusWalker, DepthOf return bin; } - private void printDepths(PrintStream stream, Map depthBySample, Set allSamples) { + private void printDepths(PrintStream stream, Map countsBySample, Set allSamples) { // get the depths per sample and build up the output string while tabulating total and average coverage + // todo -- update me to deal with base counts/indels StringBuilder perSampleOutput = new StringBuilder(); int tDepth = 0; for ( String s : allSamples ) { perSampleOutput.append("\t"); - int dp = depthBySample.keySet().contains(s) ? depthBySample.get(s) : 0; + long dp = countsBySample.keySet().contains(s) ? sumArray(countsBySample.get(s)) : 0; perSampleOutput.append(dp); + if ( printBaseCounts ) { + perSampleOutput.append("\t"); + perSampleOutput.append(baseCounts(countsBySample.get(s))); + } tDepth += dp; } // remember -- genome locus was printed in map() stream.printf("\t%d\t%.2f\t%s%n",tDepth,( (double) tDepth/ (double) allSamples.size()), perSampleOutput); } + + private long sumArray(int[] array) { + long i = 0; + for ( int j : array ) { + i += j; + } + return i; + } + + private String baseCounts(int[] counts) { + if ( counts == null ) { + counts = new int[6]; + } + StringBuilder s = new StringBuilder(); + int nbases = 0; + for ( char b : BaseUtils.EXTENDED_BASES ) { + nbases++; + if ( includeDeletions || b != 'D' ) { + s.append(b); + s.append(":"); + s.append(counts[BaseUtils.extendedBaseToBaseIndex(b)]); + if ( nbases < 6 ) { + s.append(" "); + } + } + } + + return s.toString(); + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageUtils.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageUtils.java new file mode 100644 index 000000000..388c5573d --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageUtils.java @@ -0,0 +1,53 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.coverage; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; +import org.broadinstitute.sting.utils.pileup.PileupElement; + +import java.util.HashMap; +import java.util.Map; + +/** + * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl + * + * @Author chartl + * @Date Mar 3, 2010 + */ +public class CoverageUtils { + + public enum PartitionType { BY_READ_GROUP, BY_SAMPLE } + + public static Map getBaseCountsBySample(AlignmentContext context, int minMapQ, int minBaseQ, PartitionType type) { + Map samplesToCounts = new HashMap(); + + for (PileupElement e : context.getBasePileup()) { + if ( e.getMappingQual() >= minMapQ && ( e.getQual() >= minBaseQ || e.isDeletion() ) ) { + String sample = type == PartitionType.BY_SAMPLE ? e.getRead().getReadGroup().getSample() : e.getRead().getReadGroup().getReadGroupId(); + if ( samplesToCounts.keySet().contains(sample) ) { + updateCounts(samplesToCounts.get(sample),e); + } else { + samplesToCounts.put(sample,new int[6]); + updateCounts(samplesToCounts.get(sample),e); + } + } + } + + return samplesToCounts; + } + + private static void updateCounts(int[] counts, PileupElement e) { + if ( e.isDeletion() ) { + counts[BaseUtils.DELETION_INDEX]++; + } else if ( BaseUtils.basesAreEqual((byte) 'N', e.getBase()) ) { + counts[BaseUtils.NO_CALL_INDEX]++; + } else { + try { + counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())]++; + } catch (ArrayIndexOutOfBoundsException exc) { + throw new StingException("Expected a simple base, but actually received"+(char)e.getBase()); + } + } + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/DepthOfCoverageStats.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/DepthOfCoverageStats.java index 69e692d58..60add05f1 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/DepthOfCoverageStats.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/DepthOfCoverageStats.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.oneoffprojects.walkers.coverage; +import org.broadinstitute.sting.utils.BaseUtils; + import java.util.HashMap; import java.util.Map; import java.util.Set; @@ -28,6 +30,7 @@ public class DepthOfCoverageStats { private boolean tabulateLocusCounts = false; private long nLoci; // number of loci seen private long totalDepthOfCoverage; + private boolean includeDeletions = false; //////////////////////////////////////////////////////////////////////////////////// // TEMPORARY DATA ( not worth re-instantiating ) @@ -108,11 +111,15 @@ public class DepthOfCoverageStats { tabulateLocusCounts = true; } + public void initializeDeletions() { + includeDeletions = true; + } + //////////////////////////////////////////////////////////////////////////////////// // UPDATE METHODS //////////////////////////////////////////////////////////////////////////////////// - public void update(Map depthBySample) { + public void updateDepths(Map depthBySample) { int b; for ( String sample : granularHistogramBySample.keySet() ) { if ( depthBySample.containsKey(sample) ) { @@ -135,6 +142,24 @@ public class DepthOfCoverageStats { totalLocusDepth = 0; } + public void update(Map countsBySample) { + // todo -- do we want to do anything special regarding base count or deletion statistics? + HashMap depthBySample = new HashMap(); + // todo -- needs fixing with advent of new baseutils functionality using ENUMS and handling N,D + for ( String s : countsBySample.keySet() ) { + int total = 0; + int[] counts = countsBySample.get(s); + for ( char base : BaseUtils.EXTENDED_BASES ) { + if ( includeDeletions || ! ( base == 'D') ) { // note basesAreEqual assigns TRUE to (N,D) as both have simple index -1 + total += counts[BaseUtils.extendedBaseToBaseIndex(base)]; + } + } + depthBySample.put(s,total); + } + + this.updateDepths(depthBySample); + } + private int updateSample(String sample, int depth) { totalCoverages.put(sample,totalCoverages.get(sample)+depth); @@ -229,6 +254,10 @@ public class DepthOfCoverageStats { return means; } + public Map getTotals() { + return totalCoverages; + } + public long getTotalLoci() { return nLoci; } diff --git a/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 0532c10f6..4d7f1ac41 100644 --- a/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -9,6 +9,10 @@ import java.util.Random; */ public class BaseUtils { public final static char[] BASES = { 'A', 'C', 'G', 'T' }; + public final static char[] EXTENDED_BASES = { 'A', 'C', 'G', 'T', 'N', 'D' }; + // todo -- fix me (enums?) + public static final byte DELETION_INDEX = 4; + public static final byte NO_CALL_INDEX = 5; // (this is 'N') /// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or // a pyrimidine to another pyrimidine nucleotide (C <-> T). @@ -135,6 +139,17 @@ public class BaseUtils { } } + static public int extendedBaseToBaseIndex(char base) { + switch (base) { + case 'd': + case 'D': return DELETION_INDEX; + case 'n': + case 'N': return NO_CALL_INDEX; + + default: return simpleBaseToBaseIndex(base); + } + } + static public int simpleBaseToBaseIndex(byte base) { return simpleBaseToBaseIndex((char)base); }