Quick commit in advance of Aaron's. Just a bunch of refactoring (private classes separated out, put in proper package). Also support added for coverage by read group rather than sample.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2897 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-02-26 16:39:47 +00:00
parent 622554d7bd
commit 87f8fb7282
3 changed files with 279 additions and 254 deletions

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
package org.broadinstitute.sting.oneoffprojects.walkers.coverage;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -31,7 +31,7 @@ import java.util.*;
*/
// todo [DONE] -- add per target (e.g. regional) aggregation
// todo [DONE] -- add ability to print out the calculated bins and quit (for pre-analysis bin size selection)
// todo -- refactor the location of the ALL_SAMPLE metrics [keep out of the per-sample HashMaps]
// todo [DONE] -- refactor the location of the ALL_SAMPLE metrics [keep out of the per-sample HashMaps]
// todo [DONE] -- per locus output through -o
// todo -- support for using read groups instead of samples
// todo -- coverage without deletions
@ -62,6 +62,8 @@ public class CoverageStatistics extends LocusWalker<Map<String,Integer>, DepthOf
boolean printBinEndpointsAndExit = false;
@Argument(fullName = "omitPerSampleStats", shortName = "omitSampleSummary", doc = "Omits the summary files per-sample. These statistics are still calculated, so this argument will not improve runtime.", required = false)
boolean omitSampleSummary = false;
@Argument(fullName = "useReadGroups", shortName = "rg", doc = "Split depth of coverage output by read group rather than by sample", required = false)
boolean useReadGroup = false;
////////////////////////////////////////////////////////////////////////////////////
// STANDARD WALKER METHODS
@ -86,13 +88,7 @@ public class CoverageStatistics extends LocusWalker<Map<String,Integer>, DepthOf
if ( ! omitDepthOutput ) { // print header
out.printf("%s\t%s\t%s","Locus","Total_Depth","Average_Depth");
// get all the samples
HashSet<String> allSamples = new HashSet<String>(); // since the DOCS object uses a HashMap, this will be in the same order
for ( Set<String> sampleSet : getToolkit().getSamplesByReaders()) {
for (String s : sampleSet) {
allSamples.add(s);
}
}
HashSet<String> allSamples = getSamplesFromToolKit(useReadGroup);
for ( String s : allSamples) {
out.printf("\t%s_%s","Depth_for",s);
@ -104,21 +100,39 @@ public class CoverageStatistics extends LocusWalker<Map<String,Integer>, DepthOf
out.printf("Per-Locus Depth of Coverage output was omitted");
}
}
private HashSet<String> getSamplesFromToolKit( boolean getReadGroupsInstead ) {
HashSet<String> partitions = new HashSet<String>(); // since the DOCS object uses a HashMap, this will be in the same order
if ( getReadGroupsInstead ) {
for ( Set<String> rgSet : getToolkit().getMergedReadGroupsByReaders() ) {
for ( String rg : rgSet ) {
partitions.add(rg);
}
}
} else {
for ( Set<String> sampleSet : getToolkit().getSamplesByReaders()) {
for (String s : sampleSet) {
partitions.add(s);
}
}
}
return partitions;
}
public boolean isReduceByInterval() {
return ( ! omitIntervals );
}
public DepthOfCoverageStats reduceInit() {
List<Set<String>> samplesByReaders = getToolkit().getSamplesByReaders();
DepthOfCoverageStats stats = new DepthOfCoverageStats(DepthOfCoverageStats.calculateBinEndpoints(start,stop,nBins));
for ( Set<String> sampleSet : samplesByReaders ) {
for ( String sample : sampleSet ) {
stats.addSample(sample);
}
for ( String sample : getSamplesFromToolKit(useReadGroup) ) {
stats.addSample(sample);
}
if ( ! omitLocusTable ) {
stats.initializeLocusCounts();
}
@ -127,7 +141,7 @@ public class CoverageStatistics extends LocusWalker<Map<String,Integer>, DepthOf
}
public Map<String,Integer> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
Map<String,StratifiedAlignmentContext> contextsBySample =
Map<String, StratifiedAlignmentContext> contextsBySample =
StratifiedAlignmentContext.splitContextBySample(context.getBasePileup());
HashMap<String,Integer> depthBySample = new HashMap<String,Integer>();
@ -479,241 +493,4 @@ public class CoverageStatistics extends LocusWalker<Map<String,Integer>, DepthOf
stream.printf("\t%d\t%.2f\t%s%n",tDepth,( (double) tDepth/ (double) allSamples.size()), perSampleOutput);
}
}
class DepthOfCoverageStats {
////////////////////////////////////////////////////////////////////////////////////
// STATIC DATA
////////////////////////////////////////////////////////////////////////////////////
/* none so far */
////////////////////////////////////////////////////////////////////////////////////
// STANDARD DATA
////////////////////////////////////////////////////////////////////////////////////
private Map<String,int[]> granularHistogramBySample; // holds the counts per each bin
private Map<String,Long> totalCoverages; // holds total coverage per sample
private int[] binLeftEndpoints; // describes the left endpoint for each bin
private int[][] locusCoverageCounts; // holds counts of number of bases with >=X samples at >=Y coverage
private boolean tabulateLocusCounts = false;
private long nLoci; // number of loci seen
private long totalDepthOfCoverage;
////////////////////////////////////////////////////////////////////////////////////
// TEMPORARY DATA ( not worth re-instantiating )
////////////////////////////////////////////////////////////////////////////////////
private int[] locusHistogram; // holds a histogram for each locus; reset after each update() call
private int totalLocusDepth; // holds the total depth of coverage for each locus; reset after each update() call
////////////////////////////////////////////////////////////////////////////////////
// STATIC METHODS
////////////////////////////////////////////////////////////////////////////////////
public static int[] calculateBinEndpoints(int lower, int upper, int bins) {
if ( bins > upper - lower || lower < 1 ) {
throw new IllegalArgumentException("Illegal argument to calculateBinEndpoints; "+
"lower bound must be at least 1, and number of bins may not exceed stop - start");
}
int[] binLeftEndpoints = new int[bins+1];
binLeftEndpoints[0] = lower;
int length = upper - lower;
double scale = Math.log10((double) length)/bins;
for ( int b = 1; b < bins ; b++ ) {
int leftEnd = lower + (int) Math.floor(Math.pow(10.0,(b-1.0)*scale));
// todo -- simplify to length^(scale/bins); make non-constant to put bin ends in more "useful"
// todo -- positions on the number line
while ( leftEnd <= binLeftEndpoints[b-1] ) {
leftEnd++;
}
binLeftEndpoints[b] = leftEnd;
}
binLeftEndpoints[binLeftEndpoints.length-1] = upper;
return binLeftEndpoints;
}
////////////////////////////////////////////////////////////////////////////////////
// INITIALIZATION METHODS
////////////////////////////////////////////////////////////////////////////////////
public DepthOfCoverageStats(int[] leftEndpoints) {
this.binLeftEndpoints = leftEndpoints;
granularHistogramBySample = new HashMap<String,int[]>();
totalCoverages = new HashMap<String,Long>();
nLoci = 0;
totalLocusDepth = 0;
totalDepthOfCoverage = 0;
}
public void addSample(String sample) {
if ( granularHistogramBySample.containsKey(sample) ) {
return;
}
int[] binCounts = new int[this.binLeftEndpoints.length+1];
for ( int b = 0; b < binCounts.length; b ++ ) {
binCounts[b] = 0;
}
granularHistogramBySample.put(sample,binCounts);
totalCoverages.put(sample,0l);
}
public void initializeLocusCounts() {
locusCoverageCounts = new int[granularHistogramBySample.size()][binLeftEndpoints.length+1];
locusHistogram = new int[binLeftEndpoints.length+1];
for ( int b = 0; b < binLeftEndpoints.length+1; b ++ ) {
for ( int a = 0; a < granularHistogramBySample.size(); a ++ ) {
locusCoverageCounts[a][b] = 0;
}
locusHistogram[b] = 0;
}
tabulateLocusCounts = true;
}
////////////////////////////////////////////////////////////////////////////////////
// UPDATE METHODS
////////////////////////////////////////////////////////////////////////////////////
public void update(Map<String,Integer> depthBySample) {
int b;
for ( String sample : granularHistogramBySample.keySet() ) {
if ( depthBySample.containsKey(sample) ) {
b = updateSample(sample,depthBySample.get(sample));
totalLocusDepth += depthBySample.get(sample);
} else {
b = updateSample(sample,0);
}
if ( tabulateLocusCounts ) {
for ( int i = 0; i <= b; i ++ ) {
locusHistogram[i]++;
}
}
}
updateLocusCounts(locusHistogram);
nLoci++;
totalDepthOfCoverage += totalLocusDepth;
totalLocusDepth = 0;
}
private int updateSample(String sample, int depth) {
totalCoverages.put(sample,totalCoverages.get(sample)+depth);
int[] granularBins = granularHistogramBySample.get(sample);
for ( int b = 0; b < binLeftEndpoints.length; b ++ ) {
if ( depth < binLeftEndpoints[b] ) {
granularBins[b]++;
return b;
}
}
granularBins[binLeftEndpoints.length]++; // greater than all left-endpoints
return binLeftEndpoints.length;
}
public void merge(DepthOfCoverageStats newStats) {
this.mergeSamples(newStats);
if ( this.tabulateLocusCounts && newStats.tabulateLocusCounts ) {
this.mergeLocusCounts(newStats.getLocusCounts());
}
nLoci += newStats.getTotalLoci();
totalDepthOfCoverage += newStats.getTotalCoverage();
}
private void mergeSamples(DepthOfCoverageStats otherStats) {
Map<String,int[]> otherHistogram = otherStats.getHistograms();
Map<String,Double> otherMeans = otherStats.getMeans();
for ( String s : this.getAllSamples() ) {
int[] internalCounts = granularHistogramBySample.get(s);
int[] externalCounts = otherHistogram.get(s);
for ( int b = 0; b < internalCounts.length; b++ ) {
internalCounts[b] += externalCounts[b];
}
this.totalCoverages.put(s, this.totalCoverages.get(s) + otherStats.totalCoverages.get(s));
}
}
private void mergeLocusCounts( int[][] otherCounts ) {
for ( int a = 0; a < locusCoverageCounts.length; a ++ ) {
for ( int b = 0; b < locusCoverageCounts[0].length; b ++ ) {
locusCoverageCounts[a][b] += otherCounts[a][b];
}
}
}
/*
* Update locus counts -- takes an array in which the number of samples
* with depth ABOVE [i] is held. So if the bin left endpoints were 2, 5, 10
* then we'd have an array that represented:
* [# samples with depth 0 - inf], [# samples with depth 2 - inf],
* [# samples with depth 5 - inf], [# samples with depth 10-inf];
*
* this is
* @argument cumulativeSamplesByDepthBin - see above
*/
private void updateLocusCounts(int[] cumulativeSamplesByDepthBin) {
if ( tabulateLocusCounts ) {
for ( int bin = 0; bin < cumulativeSamplesByDepthBin.length; bin ++ ) {
int numSamples = cumulativeSamplesByDepthBin[bin];
for ( int i = 0; i < numSamples; i ++ ) {
locusCoverageCounts[i][bin]++;
}
cumulativeSamplesByDepthBin[bin] = 0; // reset counts in advance of next update()
}
}
}
////////////////////////////////////////////////////////////////////////////////////
// ACCESSOR METHODS
////////////////////////////////////////////////////////////////////////////////////
public Map<String,int[]> getHistograms() {
return granularHistogramBySample;
}
public int[][] getLocusCounts() {
return locusCoverageCounts;
}
public int[] getEndpoints() {
return binLeftEndpoints;
}
public Map<String,Double> getMeans() {
HashMap<String,Double> means = new HashMap<String,Double>();
for ( String s : getAllSamples() ) {
means.put(s,( (double)totalCoverages.get(s))/( (double) nLoci ));
}
return means;
}
public long getTotalLoci() {
return nLoci;
}
public Set<String> getAllSamples() {
return granularHistogramBySample.keySet();
}
public double getTotalMeanCoverage() {
return ( (double) totalDepthOfCoverage )/ ( (double) nLoci );
}
public long getTotalCoverage() {
return totalDepthOfCoverage;
}
}
}

View File

@ -0,0 +1,248 @@
package org.broadinstitute.sting.oneoffprojects.walkers.coverage;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
/**
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
*
* @Author chartl
* @Date Feb 26, 2010
*/
public class DepthOfCoverageStats {
////////////////////////////////////////////////////////////////////////////////////
// STATIC DATA
////////////////////////////////////////////////////////////////////////////////////
/* none so far */
////////////////////////////////////////////////////////////////////////////////////
// STANDARD DATA
////////////////////////////////////////////////////////////////////////////////////
private Map<String,int[]> granularHistogramBySample; // holds the counts per each bin
private Map<String,Long> totalCoverages; // holds total coverage per sample
private int[] binLeftEndpoints; // describes the left endpoint for each bin
private int[][] locusCoverageCounts; // holds counts of number of bases with >=X samples at >=Y coverage
private boolean tabulateLocusCounts = false;
private long nLoci; // number of loci seen
private long totalDepthOfCoverage;
////////////////////////////////////////////////////////////////////////////////////
// TEMPORARY DATA ( not worth re-instantiating )
////////////////////////////////////////////////////////////////////////////////////
private int[] locusHistogram; // holds a histogram for each locus; reset after each update() call
private int totalLocusDepth; // holds the total depth of coverage for each locus; reset after each update() call
////////////////////////////////////////////////////////////////////////////////////
// STATIC METHODS
////////////////////////////////////////////////////////////////////////////////////
public static int[] calculateBinEndpoints(int lower, int upper, int bins) {
if ( bins > upper - lower || lower < 1 ) {
throw new IllegalArgumentException("Illegal argument to calculateBinEndpoints; "+
"lower bound must be at least 1, and number of bins may not exceed stop - start");
}
int[] binLeftEndpoints = new int[bins+1];
binLeftEndpoints[0] = lower;
int length = upper - lower;
double scale = Math.log10((double) length)/bins;
for ( int b = 1; b < bins ; b++ ) {
int leftEnd = lower + (int) Math.floor(Math.pow(10.0,(b-1.0)*scale));
// todo -- simplify to length^(scale/bins); make non-constant to put bin ends in more "useful"
// todo -- positions on the number line
while ( leftEnd <= binLeftEndpoints[b-1] ) {
leftEnd++;
}
binLeftEndpoints[b] = leftEnd;
}
binLeftEndpoints[binLeftEndpoints.length-1] = upper;
return binLeftEndpoints;
}
////////////////////////////////////////////////////////////////////////////////////
// INITIALIZATION METHODS
////////////////////////////////////////////////////////////////////////////////////
public DepthOfCoverageStats(int[] leftEndpoints) {
this.binLeftEndpoints = leftEndpoints;
granularHistogramBySample = new HashMap<String,int[]>();
totalCoverages = new HashMap<String,Long>();
nLoci = 0;
totalLocusDepth = 0;
totalDepthOfCoverage = 0;
}
public void addSample(String sample) {
if ( granularHistogramBySample.containsKey(sample) ) {
return;
}
int[] binCounts = new int[this.binLeftEndpoints.length+1];
for ( int b = 0; b < binCounts.length; b ++ ) {
binCounts[b] = 0;
}
granularHistogramBySample.put(sample,binCounts);
totalCoverages.put(sample,0l);
}
public void initializeLocusCounts() {
locusCoverageCounts = new int[granularHistogramBySample.size()][binLeftEndpoints.length+1];
locusHistogram = new int[binLeftEndpoints.length+1];
for ( int b = 0; b < binLeftEndpoints.length+1; b ++ ) {
for ( int a = 0; a < granularHistogramBySample.size(); a ++ ) {
locusCoverageCounts[a][b] = 0;
}
locusHistogram[b] = 0;
}
tabulateLocusCounts = true;
}
////////////////////////////////////////////////////////////////////////////////////
// UPDATE METHODS
////////////////////////////////////////////////////////////////////////////////////
public void update(Map<String,Integer> depthBySample) {
int b;
for ( String sample : granularHistogramBySample.keySet() ) {
if ( depthBySample.containsKey(sample) ) {
b = updateSample(sample,depthBySample.get(sample));
totalLocusDepth += depthBySample.get(sample);
} else {
b = updateSample(sample,0);
}
if ( tabulateLocusCounts ) {
for ( int i = 0; i <= b; i ++ ) {
locusHistogram[i]++;
}
}
}
updateLocusCounts(locusHistogram);
nLoci++;
totalDepthOfCoverage += totalLocusDepth;
totalLocusDepth = 0;
}
private int updateSample(String sample, int depth) {
totalCoverages.put(sample,totalCoverages.get(sample)+depth);
int[] granularBins = granularHistogramBySample.get(sample);
for ( int b = 0; b < binLeftEndpoints.length; b ++ ) {
if ( depth < binLeftEndpoints[b] ) {
granularBins[b]++;
return b;
}
}
granularBins[binLeftEndpoints.length]++; // greater than all left-endpoints
return binLeftEndpoints.length;
}
public void merge(DepthOfCoverageStats newStats) {
this.mergeSamples(newStats);
if ( this.tabulateLocusCounts && newStats.tabulateLocusCounts ) {
this.mergeLocusCounts(newStats.getLocusCounts());
}
nLoci += newStats.getTotalLoci();
totalDepthOfCoverage += newStats.getTotalCoverage();
}
private void mergeSamples(DepthOfCoverageStats otherStats) {
Map<String,int[]> otherHistogram = otherStats.getHistograms();
Map<String,Double> otherMeans = otherStats.getMeans();
for ( String s : this.getAllSamples() ) {
int[] internalCounts = granularHistogramBySample.get(s);
int[] externalCounts = otherHistogram.get(s);
for ( int b = 0; b < internalCounts.length; b++ ) {
internalCounts[b] += externalCounts[b];
}
this.totalCoverages.put(s, this.totalCoverages.get(s) + otherStats.totalCoverages.get(s));
}
}
private void mergeLocusCounts( int[][] otherCounts ) {
for ( int a = 0; a < locusCoverageCounts.length; a ++ ) {
for ( int b = 0; b < locusCoverageCounts[0].length; b ++ ) {
locusCoverageCounts[a][b] += otherCounts[a][b];
}
}
}
/*
* Update locus counts -- takes an array in which the number of samples
* with depth ABOVE [i] is held. So if the bin left endpoints were 2, 5, 10
* then we'd have an array that represented:
* [# samples with depth 0 - inf], [# samples with depth 2 - inf],
* [# samples with depth 5 - inf], [# samples with depth 10-inf];
*
* this is
* @argument cumulativeSamplesByDepthBin - see above
*/
private void updateLocusCounts(int[] cumulativeSamplesByDepthBin) {
if ( tabulateLocusCounts ) {
for ( int bin = 0; bin < cumulativeSamplesByDepthBin.length; bin ++ ) {
int numSamples = cumulativeSamplesByDepthBin[bin];
for ( int i = 0; i < numSamples; i ++ ) {
locusCoverageCounts[i][bin]++;
}
cumulativeSamplesByDepthBin[bin] = 0; // reset counts in advance of next update()
}
}
}
////////////////////////////////////////////////////////////////////////////////////
// ACCESSOR METHODS
////////////////////////////////////////////////////////////////////////////////////
public Map<String,int[]> getHistograms() {
return granularHistogramBySample;
}
public int[][] getLocusCounts() {
return locusCoverageCounts;
}
public int[] getEndpoints() {
return binLeftEndpoints;
}
public Map<String,Double> getMeans() {
HashMap<String,Double> means = new HashMap<String,Double>();
for ( String s : getAllSamples() ) {
means.put(s,( (double)totalCoverages.get(s))/( (double) nLoci ));
}
return means;
}
public long getTotalLoci() {
return nLoci;
}
public Set<String> getAllSamples() {
return granularHistogramBySample.keySet();
}
public double getTotalMeanCoverage() {
return ( (double) totalDepthOfCoverage )/ ( (double) nLoci );
}
public long getTotalCoverage() {
return totalDepthOfCoverage;
}
}

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
package org.broadinstitute.sting.oneoffprojects.walkers.coverage;
import org.broadinstitute.sting.WalkerTest;
import org.junit.Test;