diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/fasta/PickSequenomProbes.java b/java/src/org/broadinstitute/sting/gatk/walkers/fasta/PickSequenomProbes.java index 57de80817..b7d8137bc 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/fasta/PickSequenomProbes.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/fasta/PickSequenomProbes.java @@ -26,6 +26,9 @@ public class PickSequenomProbes extends RefWalker { protected String SNP_MASK = null; @Argument(required=false, shortName="project_id",doc="If specified, all probenames will be prepended with 'project_id|'") String project_id = null; + @Argument(required = false, shortName="omitWindow", doc = "If specified, the window appender will be omitted from the design files (e.g. \"_chr:start-stop\")") + boolean omitWindow = false; + private byte [] maskFlags = new byte[401]; private LocationAwareSeekableRODIterator snpMaskIterator=null; @@ -116,11 +119,14 @@ public class PickSequenomProbes extends RefWalker { assay_id.append('|'); } assay_id.append(context.getLocation().toString().replace(':','_')); - assay_id.append('_'); - if ( variant.isInsertion() ) assay_id.append("gI_"); - else if ( variant.isDeletion()) assay_id.append("gD_"); + if ( variant.isInsertion() ) assay_id.append("_gI"); + else if ( variant.isDeletion()) assay_id.append("_gD"); - assay_id.append(ref.getWindow().toString().replace(':', '_')); + if ( ! omitWindow ) { + assay_id.append("_"); + assay_id.append(ref.getWindow().toString().replace(':', '_')); + } + return assay_id.toString() + "\t" + assay_sequence + "\n"; } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatistics.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatistics.java index 6340a4f25..403e7e6b8 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatistics.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatistics.java @@ -8,6 +8,8 @@ import org.broadinstitute.sting.gatk.walkers.By; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.gatk.walkers.coverage.DepthOfCoverageWalker; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.StingException; @@ -33,15 +35,16 @@ import java.util.*; // todo [DONE] -- add ability to print out the calculated bins and quit (for pre-analysis bin size selection) // todo [DONE] -- refactor the location of the ALL_SAMPLE metrics [keep out of the per-sample HashMaps] // todo [DONE] -- per locus output through -o -// todo -- support for using read groups instead of samples -// todo -- coverage without deletions -// todo -- base counts -// todo -- support for aggregate (ignoring sample IDs) granular histograms; maybe n*[start,stop], bins*sqrt(n) +// todo [DONE] -- support for using read groups instead of samples +// todo [DONE] -- coverage including deletions +// todo [DONE] -- base counts +// todo -- cache the map from sample names to means in the print functions, rather than regenerating each time +// todo -- support for granular histograms for total depth; maybe n*[start,stop], bins*sqrt(n) // todo -- alter logarithmic scaling to spread out bins more // todo -- allow for user to set linear binning (default is logarithmic) // todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now @By(DataSource.REFERENCE) -public class CoverageStatistics extends LocusWalker, DepthOfCoverageStats> implements TreeReducible { +public class CoverageStatistics extends LocusWalker, DepthOfCoverageStats> implements TreeReducible { @Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false) int start = 1; @Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false) @@ -52,6 +55,8 @@ public class CoverageStatistics extends LocusWalker, DepthOf byte minMappingQuality = 50; @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to 20.", required = false) byte minBaseQuality = 20; + @Argument(fullName = "printBaseCounts", shortName = "baseCounts", doc = "Will add base counts to per-locus output.", required = false) + boolean printBaseCounts = false; @Argument(fullName = "omitLocusTable", shortName = "omitLocus", doc = "Will not calculate the per-sample per-depth counts of loci, which should result in speedup", required = false) boolean omitLocusTable = false; @Argument(fullName = "omitIntervalStatistics", shortName = "omitIntervals", doc = "Will omit the per-interval statistics section, which should result in speedup", required = false) @@ -64,11 +69,17 @@ public class CoverageStatistics extends LocusWalker, DepthOf boolean omitSampleSummary = false; @Argument(fullName = "useReadGroups", shortName = "rg", doc = "Split depth of coverage output by read group rather than by sample", required = false) boolean useReadGroup = false; + @Argument(fullName = "includeDeletions", shortName = "dels", doc = "Include information on deletions", required = false) + boolean includeDeletions = false; + @Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false) + boolean ignoreDeletionSites = false; //////////////////////////////////////////////////////////////////////////////////// // STANDARD WALKER METHODS //////////////////////////////////////////////////////////////////////////////////// + public boolean includeReadsWithDeletionAtLoci() { return includeDeletions && ! ignoreDeletionSites; } + public void initialize() { if ( printBinEndpointsAndExit ) { @@ -92,6 +103,9 @@ public class CoverageStatistics extends LocusWalker, DepthOf for ( String s : allSamples) { out.printf("\t%s_%s","Depth_for",s); + if ( printBaseCounts ) { + out.printf("\t%s_%s",s,"_base_counts"); + } } out.printf("%n"); @@ -137,45 +151,33 @@ public class CoverageStatistics extends LocusWalker, DepthOf stats.initializeLocusCounts(); } + if ( includeDeletions ) { + stats.initializeDeletions(); + } + return stats; } - public Map map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - Map contexts; - if ( useReadGroup ) { - contexts = StratifiedAlignmentContext.splitContextByReadGroup(context.getBasePileup()); - } else { - contexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup()); - } - - HashMap depthBySample = new HashMap(); - - for ( String sample : contexts.keySet() ) { - AlignmentContext sampleContext = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE); - int properDepth = 0; - for ( PileupElement e : sampleContext.getBasePileup() ) { - if ( e.getQual() >= minBaseQuality && e.getMappingQual() >= minMappingQuality ) { - properDepth++; - } - } - - depthBySample.put(sample,properDepth); - } + public Map map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( ! omitDepthOutput ) { out.printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives) } - return depthBySample; + Map countsBySample = CoverageUtils.getBaseCountsBySample(context,minMappingQuality,minBaseQuality, + useReadGroup ? CoverageUtils.PartitionType.BY_READ_GROUP : CoverageUtils.PartitionType.BY_SAMPLE); + + return countsBySample; } - public DepthOfCoverageStats reduce(Map thisMap, DepthOfCoverageStats prevReduce) { + public DepthOfCoverageStats reduce(Map thisMap, DepthOfCoverageStats prevReduce) { prevReduce.update(thisMap); if ( ! omitDepthOutput ) { printDepths(out,thisMap, prevReduce.getAllSamples()); // this is an additional iteration through thisMap, plus dealing with IO, so should be much slower without // turning on omit } + return prevReduce; } @@ -216,6 +218,9 @@ public class CoverageStatistics extends LocusWalker, DepthOf summaryHeader.append("\taverage_coverage"); for ( String s : firstStats.getAllSamples() ) { + summaryHeader.append("\t"); + summaryHeader.append(s); + summaryHeader.append("_total_cvg"); summaryHeader.append("\t"); summaryHeader.append(s); summaryHeader.append("_mean_cvg"); @@ -265,11 +270,12 @@ public class CoverageStatistics extends LocusWalker, DepthOf targetSummary.append(intervalStats.first.toString()); targetSummary.append("\t"); targetSummary.append(stats.getTotalCoverage()); - // TODO: change this to use the raw counts directly rather than re-estimating from mean*nloci targetSummary.append("\t"); - targetSummary.append(stats.getTotalMeanCoverage()); + targetSummary.append(String.format("%.2f",stats.getTotalMeanCoverage())); for ( String s : stats.getAllSamples() ) { + targetSummary.append("\t"); + targetSummary.append(stats.getTotals().get(s)); targetSummary.append("\t"); targetSummary.append(String.format("%.2f", stats.getMeans().get(s))); targetSummary.append("\t"); @@ -344,7 +350,7 @@ public class CoverageStatistics extends LocusWalker, DepthOf /////////////////// // OPTIONAL OUTPUTS ////////////////// - + if ( ! omitSampleSummary ) { logger.info("Printing sample summary"); File summaryStatisticsFile = deriveFromStream("summary_statistics"); @@ -484,18 +490,52 @@ public class CoverageStatistics extends LocusWalker, DepthOf return bin; } - private void printDepths(PrintStream stream, Map depthBySample, Set allSamples) { + private void printDepths(PrintStream stream, Map countsBySample, Set allSamples) { // get the depths per sample and build up the output string while tabulating total and average coverage + // todo -- update me to deal with base counts/indels StringBuilder perSampleOutput = new StringBuilder(); int tDepth = 0; for ( String s : allSamples ) { perSampleOutput.append("\t"); - int dp = depthBySample.keySet().contains(s) ? depthBySample.get(s) : 0; + long dp = countsBySample.keySet().contains(s) ? sumArray(countsBySample.get(s)) : 0; perSampleOutput.append(dp); + if ( printBaseCounts ) { + perSampleOutput.append("\t"); + perSampleOutput.append(baseCounts(countsBySample.get(s))); + } tDepth += dp; } // remember -- genome locus was printed in map() stream.printf("\t%d\t%.2f\t%s%n",tDepth,( (double) tDepth/ (double) allSamples.size()), perSampleOutput); } + + private long sumArray(int[] array) { + long i = 0; + for ( int j : array ) { + i += j; + } + return i; + } + + private String baseCounts(int[] counts) { + if ( counts == null ) { + counts = new int[6]; + } + StringBuilder s = new StringBuilder(); + int nbases = 0; + for ( char b : BaseUtils.EXTENDED_BASES ) { + nbases++; + if ( includeDeletions || b != 'D' ) { + s.append(b); + s.append(":"); + s.append(counts[BaseUtils.extendedBaseToBaseIndex(b)]); + if ( nbases < 6 ) { + s.append(" "); + } + } + } + + return s.toString(); + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageUtils.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageUtils.java new file mode 100644 index 000000000..388c5573d --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageUtils.java @@ -0,0 +1,53 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.coverage; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; +import org.broadinstitute.sting.utils.pileup.PileupElement; + +import java.util.HashMap; +import java.util.Map; + +/** + * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl + * + * @Author chartl + * @Date Mar 3, 2010 + */ +public class CoverageUtils { + + public enum PartitionType { BY_READ_GROUP, BY_SAMPLE } + + public static Map getBaseCountsBySample(AlignmentContext context, int minMapQ, int minBaseQ, PartitionType type) { + Map samplesToCounts = new HashMap(); + + for (PileupElement e : context.getBasePileup()) { + if ( e.getMappingQual() >= minMapQ && ( e.getQual() >= minBaseQ || e.isDeletion() ) ) { + String sample = type == PartitionType.BY_SAMPLE ? e.getRead().getReadGroup().getSample() : e.getRead().getReadGroup().getReadGroupId(); + if ( samplesToCounts.keySet().contains(sample) ) { + updateCounts(samplesToCounts.get(sample),e); + } else { + samplesToCounts.put(sample,new int[6]); + updateCounts(samplesToCounts.get(sample),e); + } + } + } + + return samplesToCounts; + } + + private static void updateCounts(int[] counts, PileupElement e) { + if ( e.isDeletion() ) { + counts[BaseUtils.DELETION_INDEX]++; + } else if ( BaseUtils.basesAreEqual((byte) 'N', e.getBase()) ) { + counts[BaseUtils.NO_CALL_INDEX]++; + } else { + try { + counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())]++; + } catch (ArrayIndexOutOfBoundsException exc) { + throw new StingException("Expected a simple base, but actually received"+(char)e.getBase()); + } + } + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/DepthOfCoverageStats.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/DepthOfCoverageStats.java index 69e692d58..60add05f1 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/DepthOfCoverageStats.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/DepthOfCoverageStats.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.oneoffprojects.walkers.coverage; +import org.broadinstitute.sting.utils.BaseUtils; + import java.util.HashMap; import java.util.Map; import java.util.Set; @@ -28,6 +30,7 @@ public class DepthOfCoverageStats { private boolean tabulateLocusCounts = false; private long nLoci; // number of loci seen private long totalDepthOfCoverage; + private boolean includeDeletions = false; //////////////////////////////////////////////////////////////////////////////////// // TEMPORARY DATA ( not worth re-instantiating ) @@ -108,11 +111,15 @@ public class DepthOfCoverageStats { tabulateLocusCounts = true; } + public void initializeDeletions() { + includeDeletions = true; + } + //////////////////////////////////////////////////////////////////////////////////// // UPDATE METHODS //////////////////////////////////////////////////////////////////////////////////// - public void update(Map depthBySample) { + public void updateDepths(Map depthBySample) { int b; for ( String sample : granularHistogramBySample.keySet() ) { if ( depthBySample.containsKey(sample) ) { @@ -135,6 +142,24 @@ public class DepthOfCoverageStats { totalLocusDepth = 0; } + public void update(Map countsBySample) { + // todo -- do we want to do anything special regarding base count or deletion statistics? + HashMap depthBySample = new HashMap(); + // todo -- needs fixing with advent of new baseutils functionality using ENUMS and handling N,D + for ( String s : countsBySample.keySet() ) { + int total = 0; + int[] counts = countsBySample.get(s); + for ( char base : BaseUtils.EXTENDED_BASES ) { + if ( includeDeletions || ! ( base == 'D') ) { // note basesAreEqual assigns TRUE to (N,D) as both have simple index -1 + total += counts[BaseUtils.extendedBaseToBaseIndex(base)]; + } + } + depthBySample.put(s,total); + } + + this.updateDepths(depthBySample); + } + private int updateSample(String sample, int depth) { totalCoverages.put(sample,totalCoverages.get(sample)+depth); @@ -229,6 +254,10 @@ public class DepthOfCoverageStats { return means; } + public Map getTotals() { + return totalCoverages; + } + public long getTotalLoci() { return nLoci; } diff --git a/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 0532c10f6..4d7f1ac41 100644 --- a/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -9,6 +9,10 @@ import java.util.Random; */ public class BaseUtils { public final static char[] BASES = { 'A', 'C', 'G', 'T' }; + public final static char[] EXTENDED_BASES = { 'A', 'C', 'G', 'T', 'N', 'D' }; + // todo -- fix me (enums?) + public static final byte DELETION_INDEX = 4; + public static final byte NO_CALL_INDEX = 5; // (this is 'N') /// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or // a pyrimidine to another pyrimidine nucleotide (C <-> T). @@ -135,6 +139,17 @@ public class BaseUtils { } } + static public int extendedBaseToBaseIndex(char base) { + switch (base) { + case 'd': + case 'D': return DELETION_INDEX; + case 'n': + case 'N': return NO_CALL_INDEX; + + default: return simpleBaseToBaseIndex(base); + } + } + static public int simpleBaseToBaseIndex(byte base) { return simpleBaseToBaseIndex((char)base); }