2010-02-24 02:41:02 +08:00
package org.broadinstitute.sting.oneoffprojects.walkers ;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext ;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext ;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext ;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker ;
import org.broadinstitute.sting.gatk.walkers.By ;
import org.broadinstitute.sting.gatk.walkers.DataSource ;
import org.broadinstitute.sting.gatk.walkers.LocusWalker ;
import org.broadinstitute.sting.gatk.walkers.TreeReducible ;
2010-02-26 04:25:07 +08:00
import org.broadinstitute.sting.utils.GenomeLoc ;
import org.broadinstitute.sting.utils.Pair ;
2010-02-24 02:41:02 +08:00
import org.broadinstitute.sting.utils.StingException ;
import org.broadinstitute.sting.utils.cmdLine.Argument ;
import org.broadinstitute.sting.utils.pileup.PileupElement ;
import java.io.File ;
import java.io.IOException ;
import java.io.PrintStream ;
import java.util.HashMap ;
import java.util.List ;
import java.util.Map ;
import java.util.Set ;
/ * *
* A parallelizable walker designed to quickly aggregate relevant coverage statistics across samples in the input
* file . Assesses the mean and median granular coverages of each sample , and generates part of a cumulative
* distribution of % bases and % targets covered for certain depths . The granularity of DOC can be set by command
* line arguments .
*
*
* @Author chartl
* @Date Feb 22 , 2010
* /
2010-02-26 04:25:07 +08:00
// todo [DONE] -- add per target (e.g. regional) aggregation
// todo [DONE] -- add ability to print out the calculated bins and quit (for pre-analysis bin size selection)
// todo -- refactor the location of the ALL_SAMPLE metrics [keep out of the per-sample HashMaps]
// todo [DONE] -- per locus output through -o
// todo -- support for using read groups instead of samples
// todo -- coverage without deletions
// todo -- base counts
// todo -- support for aggregate (ignoring sample IDs) granular histograms; maybe n*[start,stop], bins*sqrt(n)
// todo -- alter logarithmic scaling to spread out bins more
// todo -- allow for user to set linear binning (default is logarithmic)
// todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now
2010-02-24 02:41:02 +08:00
@By ( DataSource . REFERENCE )
public class CoverageStatistics extends LocusWalker < Map < String , Integer > , DepthOfCoverageStats > implements TreeReducible < DepthOfCoverageStats > {
@Argument ( fullName = "start" , doc = "Starting (left endpoint) for granular binning" , required = false )
int start = 1 ;
@Argument ( fullName = "stop" , doc = "Ending (right endpoint) for granular binning" , required = false )
2010-02-24 04:43:15 +08:00
int stop = 1000 ;
2010-02-24 02:41:02 +08:00
@Argument ( fullName = "nBins" , doc = "Number of bins to use for granular binning" , required = false )
int nBins = 20 ;
@Argument ( fullName = "minMappingQuality" , shortName = "mmq" , doc = "Minimum mapping quality of reads to count towards depth. Defaults to 50." , required = false )
byte minMappingQuality = 50 ;
@Argument ( fullName = "minBaseQuality" , shortName = "mbq" , doc = "Minimum quality of bases to count towards depth. Defaults to 20." , required = false )
byte minBaseQuality = 20 ;
2010-02-26 04:25:07 +08:00
@Argument ( fullName = "omitLocusTable" , shortName = "omitLocus" , doc = "Will not calculate the per-sample per-depth counts of loci, which should result in speedup" , required = false )
boolean omitLocusTable = false ;
@Argument ( fullName = "omitIntervalStatistics" , shortName = "omitIntervals" , doc = "Will omit the per-interval statistics section, which should result in speedup" , required = false )
boolean omitIntervals = false ;
@Argument ( fullName = "omitDepthOutputAtEachBase" , shortName = "omitBaseOutput" , doc = "Will omit the output of the depth of coverage at each base, which should result in speedup" , required = false )
boolean omitDepthOutput = false ;
@Argument ( fullName = "printBinEndpointsAndExit" , doc = "Prints the bin values and exits immediately. Use to calibrate what bins you want before running on data." , required = false )
boolean printBinEndpointsAndExit = false ;
@Argument ( fullName = "omitPerSampleStats" , shortName = "omitSampleSummary" , doc = "Omits the summary files per-sample. These statistics are still calculated, so this argument will not improve runtime." , required = false )
boolean omitSampleSummary = false ;
2010-02-24 02:41:02 +08:00
////////////////////////////////////////////////////////////////////////////////////
// STANDARD WALKER METHODS
////////////////////////////////////////////////////////////////////////////////////
2010-02-26 04:25:07 +08:00
public void initialize ( ) {
if ( printBinEndpointsAndExit ) {
int [ ] endpoints = DepthOfCoverageStats . calculateBinEndpoints ( start , stop , nBins ) ;
System . out . print ( "[ " ) ;
for ( int e : endpoints ) {
System . out . print ( e + " " ) ;
}
System . out . println ( "]" ) ;
System . exit ( 0 ) ;
}
if ( getToolkit ( ) . getArguments ( ) . outFileName = = null ) {
throw new StingException ( "This walker requires that you specify an output file (-o)" ) ;
}
}
public boolean isReduceByInterval ( ) {
return ( ! omitIntervals ) ;
}
2010-02-24 02:41:02 +08:00
public DepthOfCoverageStats reduceInit ( ) {
List < Set < String > > samplesByReaders = getToolkit ( ) . getSamplesByReaders ( ) ;
DepthOfCoverageStats stats = new DepthOfCoverageStats ( DepthOfCoverageStats . calculateBinEndpoints ( start , stop , nBins ) ) ;
for ( Set < String > sampleSet : samplesByReaders ) {
for ( String sample : sampleSet ) {
stats . addSample ( sample ) ;
}
}
2010-02-26 04:25:07 +08:00
if ( ! omitLocusTable ) {
2010-02-24 02:41:02 +08:00
stats . initializeLocusCounts ( ) ;
}
2010-02-26 04:25:07 +08:00
if ( ! omitDepthOutput ) { // print header
out . printf ( "%s\t%s\t%s" , "Locus" , "Total_Depth" , "Average_Depth" ) ;
for ( String s : stats . getAllSamples ( ) ) {
out . printf ( "\t%s_%s" , "Depth_for" , s ) ;
}
out . printf ( "%n" ) ;
} else {
out . printf ( "Per-Locus Depth of Coverage output was omitted" ) ;
}
2010-02-24 02:41:02 +08:00
return stats ;
}
public Map < String , Integer > map ( RefMetaDataTracker tracker , ReferenceContext ref , AlignmentContext context ) {
Map < String , StratifiedAlignmentContext > contextsBySample =
StratifiedAlignmentContext . splitContextBySample ( context . getBasePileup ( ) ) ;
HashMap < String , Integer > depthBySample = new HashMap < String , Integer > ( ) ;
for ( String sample : contextsBySample . keySet ( ) ) {
AlignmentContext sampleContext = contextsBySample . get ( sample ) . getContext ( StratifiedAlignmentContext . StratifiedContextType . COMPLETE ) ;
int properDepth = 0 ;
for ( PileupElement e : sampleContext . getBasePileup ( ) ) {
if ( e . getQual ( ) > = minBaseQuality & & e . getMappingQual ( ) > = minMappingQuality ) {
properDepth + + ;
}
}
depthBySample . put ( sample , properDepth ) ;
}
2010-02-26 04:25:07 +08:00
if ( ! omitDepthOutput ) {
out . printf ( "%s" , ref . getLocus ( ) ) ; // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives)
}
2010-02-24 02:41:02 +08:00
return depthBySample ;
}
public DepthOfCoverageStats reduce ( Map < String , Integer > thisMap , DepthOfCoverageStats prevReduce ) {
prevReduce . update ( thisMap ) ;
2010-02-26 04:25:07 +08:00
if ( ! omitDepthOutput ) {
printDepths ( out , thisMap , prevReduce . getAllSamples ( ) ) ;
// this is an additional iteration through thisMap, plus dealing with IO, so should be much slower without
// turning on omit
}
2010-02-24 02:41:02 +08:00
return prevReduce ;
}
public DepthOfCoverageStats treeReduce ( DepthOfCoverageStats left , DepthOfCoverageStats right ) {
left . merge ( right ) ;
return left ;
}
2010-02-26 04:25:07 +08:00
////////////////////////////////////////////////////////////////////////////////////
// INTERVAL ON TRAVERSAL DONE
////////////////////////////////////////////////////////////////////////////////////
public void onTraversalDone ( List < Pair < GenomeLoc , DepthOfCoverageStats > > statsByInterval ) {
File intervalStatisticsFile = deriveFromStream ( "interval_statistics" ) ;
File intervalSummaryFile = deriveFromStream ( "interval_summary" ) ;
DepthOfCoverageStats mergedStats = printIntervalStatsAndMerge ( statsByInterval , intervalSummaryFile , intervalStatisticsFile ) ;
this . onTraversalDone ( mergedStats ) ;
}
private DepthOfCoverageStats printIntervalStatsAndMerge ( List < Pair < GenomeLoc , DepthOfCoverageStats > > statsByInterval , File summaryFile , File statsFile ) {
PrintStream summaryOut ;
PrintStream statsOut ;
try {
summaryOut = summaryFile = = null ? out : new PrintStream ( summaryFile ) ;
statsOut = statsFile = = null ? out : new PrintStream ( statsFile ) ;
} catch ( IOException e ) {
throw new StingException ( "Unable to open interval file on reduce" , e ) ;
}
Pair < GenomeLoc , DepthOfCoverageStats > firstPair = statsByInterval . remove ( 0 ) ;
DepthOfCoverageStats firstStats = firstPair . second ;
StringBuilder summaryHeader = new StringBuilder ( ) ;
summaryHeader . append ( "Target" ) ;
summaryHeader . append ( "\ttotal_coverage" ) ;
summaryHeader . append ( "\taverage_coverage" ) ;
for ( String s : firstStats . getAllSamples ( ) ) {
summaryHeader . append ( "\t" ) ;
summaryHeader . append ( s ) ;
summaryHeader . append ( "_mean_cvg" ) ;
summaryHeader . append ( "\t" ) ;
summaryHeader . append ( s ) ;
summaryHeader . append ( "_granular_Q1" ) ;
summaryHeader . append ( "\t" ) ;
summaryHeader . append ( s ) ;
summaryHeader . append ( "_granular_median" ) ;
summaryHeader . append ( "\t" ) ;
summaryHeader . append ( s ) ;
summaryHeader . append ( "_granular_Q3" ) ;
}
summaryOut . printf ( "%s%n" , summaryHeader ) ;
int [ ] [ ] nTargetsByAvgCvgBySample = new int [ firstStats . getHistograms ( ) . size ( ) ] [ firstStats . getEndpoints ( ) . length + 1 ] ;
for ( int i = 0 ; i < nTargetsByAvgCvgBySample . length ; i + + ) {
for ( int b = 0 ; b < nTargetsByAvgCvgBySample [ 0 ] . length ; b + + ) {
nTargetsByAvgCvgBySample [ i ] [ b ] = 0 ;
}
}
printTargetSummary ( summaryOut , firstPair ) ;
updateTargetTable ( nTargetsByAvgCvgBySample , firstStats ) ;
for ( Pair < GenomeLoc , DepthOfCoverageStats > targetStats : statsByInterval ) {
printTargetSummary ( summaryOut , targetStats ) ;
updateTargetTable ( nTargetsByAvgCvgBySample , targetStats . second ) ;
firstStats = this . treeReduce ( firstStats , targetStats . second ) ;
}
printIntervalTable ( statsOut , nTargetsByAvgCvgBySample , firstStats . getEndpoints ( ) ) ;
summaryOut . close ( ) ;
statsOut . close ( ) ;
return firstStats ;
}
private void printTargetSummary ( PrintStream output , Pair < GenomeLoc , DepthOfCoverageStats > intervalStats ) {
DepthOfCoverageStats stats = intervalStats . second ;
int [ ] bins = stats . getEndpoints ( ) ;
StringBuilder targetSummary = new StringBuilder ( ) ;
targetSummary . append ( intervalStats . first . toString ( ) ) ;
targetSummary . append ( "\t" ) ;
targetSummary . append ( stats . getTotalLoci ( ) * stats . getMeans ( ) . get ( DepthOfCoverageStats . ALL_SAMPLES ) ) ;
// TODO: change this to use the raw counts directly rather than re-estimating from mean*nloci
targetSummary . append ( "\t" ) ;
targetSummary . append ( stats . getMeans ( ) . get ( DepthOfCoverageStats . ALL_SAMPLES ) ) ;
for ( String s : stats . getAllSamples ( ) ) {
targetSummary . append ( "\t" ) ;
targetSummary . append ( String . format ( "%.2f" , stats . getMeans ( ) . get ( s ) ) ) ;
targetSummary . append ( "\t" ) ;
int median = getQuantile ( stats . getHistograms ( ) . get ( s ) , 0.5 ) ;
int q1 = getQuantile ( stats . getHistograms ( ) . get ( s ) , 0.25 ) ;
int q3 = getQuantile ( stats . getHistograms ( ) . get ( s ) , 0.75 ) ;
targetSummary . append ( bins [ q1 ] ) ;
targetSummary . append ( "\t" ) ;
targetSummary . append ( bins [ median ] ) ;
targetSummary . append ( "\t" ) ;
targetSummary . append ( bins [ q3 ] ) ;
}
output . printf ( "%s%n" , targetSummary ) ;
}
private void printIntervalTable ( PrintStream output , int [ ] [ ] intervalTable , int [ ] cutoffs ) {
output . printf ( "\tdepth>=%d" , 0 ) ;
for ( int col = 0 ; col < intervalTable [ 0 ] . length - 1 ; col + + ) {
output . printf ( "\tdepth>=%d" , cutoffs [ col ] ) ;
}
output . printf ( String . format ( "%n" ) ) ;
for ( int row = 0 ; row < intervalTable . length ; row + + ) {
output . printf ( "At_least_%d_samples" , row + 1 ) ;
for ( int col = 0 ; col < intervalTable [ 0 ] . length ; col + + ) {
output . printf ( "\t%d" , intervalTable [ row ] [ col ] ) ;
}
output . printf ( String . format ( "%n" ) ) ;
}
}
/ *
* @updateTargetTable
* The idea is to have counts for how many * targets * have at least K samples with
* median coverage of at least X .
* To that end :
* Iterate over the samples the DOCS object , determine how many there are with
* median coverage > leftEnds [ 0 ] ; how many with median coverage > leftEnds [ 1 ]
* and so on . Then this target has at least N , N - 1 , N - 2 , . . . 1 , 0 samples covered
* to leftEnds [ 0 ] and at least M , M - 1 , M - 2 , . . .1 , 0 samples covered to leftEnds [ 1 ]
* and so on .
* /
private void updateTargetTable ( int [ ] [ ] table , DepthOfCoverageStats stats ) {
int [ ] cutoffs = stats . getEndpoints ( ) ;
int [ ] countsOfMediansAboveCutoffs = new int [ cutoffs . length + 1 ] ; // 0 bin to catch everything
for ( int i = 0 ; i < countsOfMediansAboveCutoffs . length ; i + + ) {
countsOfMediansAboveCutoffs [ i ] = 0 ;
}
for ( String s : stats . getAllSamples ( ) ) {
int medianBin = getQuantile ( stats . getHistograms ( ) . get ( s ) , 0.5 ) ;
for ( int i = 0 ; i < = medianBin ; i + + ) {
countsOfMediansAboveCutoffs [ i ] + + ;
}
}
for ( int medianBin = 0 ; medianBin < countsOfMediansAboveCutoffs . length ; medianBin + + ) {
for ( ; countsOfMediansAboveCutoffs [ medianBin ] > 0 ; countsOfMediansAboveCutoffs [ medianBin ] - - ) {
table [ countsOfMediansAboveCutoffs [ medianBin ] - 1 ] [ medianBin ] + + ;
// the -1 is due to counts being 1-based and offsets being 0-based
}
}
}
////////////////////////////////////////////////////////////////////////////////////
// FINAL ON TRAVERSAL DONE
////////////////////////////////////////////////////////////////////////////////////
2010-02-24 02:41:02 +08:00
public void onTraversalDone ( DepthOfCoverageStats coverageProfiles ) {
2010-02-26 04:25:07 +08:00
///////////////////
// OPTIONAL OUTPUTS
//////////////////
if ( ! omitSampleSummary ) {
File summaryStatisticsFile = deriveFromStream ( "summary_statistics" ) ;
File perSampleStatisticsFile = deriveFromStream ( "sample_statistics" ) ;
printSummary ( out , summaryStatisticsFile , coverageProfiles ) ;
printPerSample ( out , perSampleStatisticsFile , coverageProfiles ) ;
}
if ( ! omitLocusTable ) {
File perLocusStatisticsFile = deriveFromStream ( "locus_statistics" ) ;
printPerLocus ( perLocusStatisticsFile , coverageProfiles ) ;
}
}
public File deriveFromStream ( String append ) {
String name = getToolkit ( ) . getArguments ( ) . outFileName ;
if ( name . contains ( "stdout" ) | | name . contains ( "Stdout" ) | | name . contains ( "STDOUT" ) ) {
return null ;
} else {
return new File ( name + "." + append ) ;
}
2010-02-24 02:41:02 +08:00
}
////////////////////////////////////////////////////////////////////////////////////
// HELPER OUTPUT METHODS
////////////////////////////////////////////////////////////////////////////////////
private void printPerSample ( PrintStream out , File optionalFile , DepthOfCoverageStats stats ) {
PrintStream output = getCorrectStream ( out , optionalFile ) ;
int [ ] leftEnds = stats . getEndpoints ( ) ;
StringBuilder hBuilder = new StringBuilder ( ) ;
hBuilder . append ( "\t" ) ;
hBuilder . append ( String . format ( "[0,%d)\t" , leftEnds [ 0 ] ) ) ;
for ( int i = 1 ; i < leftEnds . length ; i + + )
hBuilder . append ( String . format ( "[%d,%d)\t" , leftEnds [ i - 1 ] , leftEnds [ i ] ) ) ;
hBuilder . append ( String . format ( "[%d,inf)%n" , leftEnds [ leftEnds . length - 1 ] ) ) ;
output . print ( hBuilder . toString ( ) ) ;
Map < String , int [ ] > histograms = stats . getHistograms ( ) ;
for ( String s : histograms . keySet ( ) ) {
StringBuilder sBuilder = new StringBuilder ( ) ;
sBuilder . append ( String . format ( "%s" , s ) ) ;
for ( int count : histograms . get ( s ) ) {
sBuilder . append ( String . format ( "\t%d" , count ) ) ;
}
sBuilder . append ( String . format ( "%n" ) ) ;
output . print ( sBuilder . toString ( ) ) ;
}
}
private void printPerLocus ( File locusFile , DepthOfCoverageStats stats ) {
PrintStream output = getCorrectStream ( null , locusFile ) ;
if ( output = = null ) {
return ;
}
int [ ] endpoints = stats . getEndpoints ( ) ;
int samples = stats . getHistograms ( ) . size ( ) ;
int [ ] [ ] baseCoverageCumDist = stats . getLocusCounts ( ) ;
// rows - # of samples
// columns - depth of coverage
StringBuilder header = new StringBuilder ( ) ;
2010-02-24 13:20:18 +08:00
header . append ( String . format ( "\t>=0" ) ) ;
2010-02-24 02:41:02 +08:00
for ( int d : endpoints ) {
2010-02-24 13:20:18 +08:00
header . append ( String . format ( "\t>=%d" , d ) ) ;
2010-02-24 02:41:02 +08:00
}
header . append ( String . format ( "%n" ) ) ;
output . print ( header ) ;
2010-02-24 13:20:18 +08:00
for ( int row = 0 ; row < samples ; row + + ) {
output . printf ( "%s_%d\t" , "NSamples" , row + 1 ) ;
2010-02-24 02:41:02 +08:00
for ( int depthBin = 0 ; depthBin < baseCoverageCumDist [ 0 ] . length ; depthBin + + ) {
output . printf ( "%d\t" , baseCoverageCumDist [ row ] [ depthBin ] ) ;
}
output . printf ( "%n" ) ;
}
}
private PrintStream getCorrectStream ( PrintStream out , File optionalFile ) {
PrintStream output ;
if ( optionalFile = = null ) {
output = out ;
} else {
try {
output = new PrintStream ( optionalFile ) ;
} catch ( IOException e ) {
logger . warn ( "Error opening the output file " + optionalFile . getAbsolutePath ( ) + ". Defaulting to stdout" ) ;
output = out ;
}
}
return output ;
}
private void printSummary ( PrintStream out , File optionalFile , DepthOfCoverageStats stats ) {
PrintStream output = getCorrectStream ( out , optionalFile ) ;
output . printf ( "%s\t%s\t%s\t%s\t%s%n" , "sample_id" , "mean" , "granular_third_quartile" , "granular_median" , "granular_first_quartile" ) ;
Map < String , int [ ] > histograms = stats . getHistograms ( ) ;
Map < String , Double > means = stats . getMeans ( ) ;
int [ ] leftEnds = stats . getEndpoints ( ) ;
for ( String s : histograms . keySet ( ) ) {
2010-02-26 04:25:07 +08:00
int [ ] histogram = histograms . get ( s ) ;
int median = getQuantile ( histogram , 0.5 ) ;
int q1 = getQuantile ( histogram , 0.25 ) ;
int q3 = getQuantile ( histogram , 0.75 ) ;
// if any of these are larger than the higest bin, put the median as in the largest bin
median = median = = histogram . length - 1 ? histogram . length - 2 : median ;
q1 = q1 = = histogram . length - 1 ? histogram . length - 2 : q1 ;
q3 = q3 = = histogram . length - 1 ? histogram . length - 2 : q3 ;
2010-02-24 02:41:02 +08:00
output . printf ( "%s\t%.2f\t%d\t%d\t%d%n" , s , means . get ( s ) , leftEnds [ q3 ] , leftEnds [ median ] , leftEnds [ q1 ] ) ;
}
output . printf ( "%s\t%.2f\t%s\t%s\t%s%n" , "Total" , means . get ( DepthOfCoverageStats . ALL_SAMPLES ) , "N/A" , "N/A" , "N/A" ) ;
}
private int getQuantile ( int [ ] histogram , double prop ) {
int total = 0 ;
for ( int i = 0 ; i < histogram . length ; i + + ) {
total + = histogram [ i ] ;
}
int counts = 0 ;
int bin = - 1 ;
while ( counts < prop * total ) {
counts + = histogram [ bin + 1 ] ;
bin + + ;
}
return bin ;
}
2010-02-26 04:25:07 +08:00
private void printDepths ( PrintStream stream , Map < String , Integer > depthBySample , Set < String > allSamples ) {
// get the depths per sample and build up the output string while tabulating total and average coverage
StringBuilder perSampleOutput = new StringBuilder ( ) ;
int tDepth = 0 ;
for ( String s : allSamples ) {
perSampleOutput . append ( "\t" ) ;
int dp = depthBySample . keySet ( ) . contains ( s ) ? depthBySample . get ( s ) : 0 ;
perSampleOutput . append ( dp ) ;
tDepth + = dp ;
}
// remember -- genome locus was printed in map()
stream . printf ( "\t%d\t%.2f\t%s%n" , tDepth , ( ( double ) tDepth / ( double ) allSamples . size ( ) ) , perSampleOutput ) ;
}
2010-02-24 02:41:02 +08:00
}
class DepthOfCoverageStats {
2010-02-26 04:25:07 +08:00
////////////////////////////////////////////////////////////////////////////////////
// STATIC DATA
////////////////////////////////////////////////////////////////////////////////////
2010-02-24 02:41:02 +08:00
public static String ALL_SAMPLES = "ALL_COMBINED_SAMPLES" ;
2010-02-26 04:25:07 +08:00
////////////////////////////////////////////////////////////////////////////////////
// STANDARD DATA
////////////////////////////////////////////////////////////////////////////////////
2010-02-24 02:41:02 +08:00
private Map < String , int [ ] > granularHistogramBySample ; // holds the counts per each bin
private Map < String , Double > meanCoverages ; // holds mean coverage per sample
private int [ ] binLeftEndpoints ; // describes the left endpoint for each bin
private int [ ] [ ] locusCoverageCounts ; // holds counts of number of bases with >=X samples at >=Y coverage
private boolean tabulateLocusCounts = false ;
private int nLoci ; // number of loci seen
2010-02-26 04:25:07 +08:00
////////////////////////////////////////////////////////////////////////////////////
// TEMPORARY DATA ( not worth re-instantiating )
////////////////////////////////////////////////////////////////////////////////////
2010-02-24 02:41:02 +08:00
private int [ ] locusHistogram ; // holds a histogram for each locus; reset after each update() call
private int totalDepth ; // holds the total depth of coverage for each locus; reset after each update() call
////////////////////////////////////////////////////////////////////////////////////
// STATIC METHODS
////////////////////////////////////////////////////////////////////////////////////
public static int [ ] calculateBinEndpoints ( int lower , int upper , int bins ) {
if ( bins > upper - lower | | lower < 1 ) {
throw new IllegalArgumentException ( "Illegal argument to calculateBinEndpoints; " +
"lower bound must be at least 1, and number of bins may not exceed stop - start" ) ;
}
int [ ] binLeftEndpoints = new int [ bins + 1 ] ;
2010-02-24 04:48:53 +08:00
binLeftEndpoints [ 0 ] = lower ;
2010-02-24 02:41:02 +08:00
int length = upper - lower ;
double scale = Math . log10 ( ( double ) length ) / bins ;
for ( int b = 1 ; b < bins ; b + + ) {
2010-02-24 04:48:53 +08:00
int leftEnd = lower + ( int ) Math . floor ( Math . pow ( 10.0 , ( b - 1.0 ) * scale ) ) ;
2010-02-24 13:20:18 +08:00
// todo -- simplify to length^(scale/bins); make non-constant to put bin ends in more "useful"
// todo -- positions on the number line
2010-02-24 02:41:02 +08:00
while ( leftEnd < = binLeftEndpoints [ b - 1 ] ) {
leftEnd + + ;
}
binLeftEndpoints [ b ] = leftEnd ;
}
2010-02-24 04:43:15 +08:00
binLeftEndpoints [ binLeftEndpoints . length - 1 ] = upper ;
2010-02-24 02:41:02 +08:00
return binLeftEndpoints ;
}
////////////////////////////////////////////////////////////////////////////////////
// INITIALIZATION METHODS
////////////////////////////////////////////////////////////////////////////////////
public DepthOfCoverageStats ( int [ ] leftEndpoints ) {
this . binLeftEndpoints = leftEndpoints ;
granularHistogramBySample = new HashMap < String , int [ ] > ( ) ;
meanCoverages = new HashMap < String , Double > ( ) ;
2010-02-24 04:43:15 +08:00
meanCoverages . put ( DepthOfCoverageStats . ALL_SAMPLES , 0.0 ) ;
2010-02-24 02:41:02 +08:00
nLoci = 0 ;
totalDepth = 0 ;
}
public void addSample ( String sample ) {
if ( granularHistogramBySample . containsKey ( sample ) ) {
return ;
}
2010-02-24 13:20:18 +08:00
int [ ] binCounts = new int [ this . binLeftEndpoints . length + 1 ] ;
2010-02-24 02:41:02 +08:00
for ( int b = 0 ; b < binCounts . length ; b + + ) {
binCounts [ b ] = 0 ;
}
granularHistogramBySample . put ( sample , binCounts ) ;
meanCoverages . put ( sample , 0.0 ) ;
}
public void initializeLocusCounts ( ) {
2010-02-24 13:20:18 +08:00
locusCoverageCounts = new int [ granularHistogramBySample . size ( ) ] [ binLeftEndpoints . length + 1 ] ;
locusHistogram = new int [ binLeftEndpoints . length + 1 ] ;
for ( int b = 0 ; b < binLeftEndpoints . length + 1 ; b + + ) {
2010-02-24 02:41:02 +08:00
for ( int a = 0 ; a < granularHistogramBySample . size ( ) ; a + + ) {
locusCoverageCounts [ a ] [ b ] = 0 ;
}
locusHistogram [ b ] = 0 ;
}
tabulateLocusCounts = true ;
}
////////////////////////////////////////////////////////////////////////////////////
// UPDATE METHODS
////////////////////////////////////////////////////////////////////////////////////
public void update ( Map < String , Integer > depthBySample ) {
int b ;
for ( String sample : granularHistogramBySample . keySet ( ) ) {
if ( depthBySample . containsKey ( sample ) ) {
b = updateSample ( sample , depthBySample . get ( sample ) ) ;
totalDepth + = depthBySample . get ( sample ) ;
} else {
b = updateSample ( sample , 0 ) ;
}
if ( tabulateLocusCounts ) {
for ( int i = 0 ; i < = b ; i + + ) {
locusHistogram [ i ] + + ;
}
}
}
double meanDepth = meanCoverages . get ( DepthOfCoverageStats . ALL_SAMPLES ) ;
double newMean = ( meanDepth * nLoci + ( double ) totalDepth ) / ( nLoci + 1 ) ;
meanCoverages . put ( DepthOfCoverageStats . ALL_SAMPLES , newMean ) ;
updateLocusCounts ( locusHistogram ) ;
nLoci + + ;
totalDepth = 0 ;
}
private int updateSample ( String sample , int depth ) {
double mean = meanCoverages . get ( sample ) ;
double newMean = ( nLoci * mean + ( double ) depth ) / ( nLoci + 1.0 ) ;
meanCoverages . put ( sample , newMean ) ;
int [ ] granularBins = granularHistogramBySample . get ( sample ) ;
2010-02-24 13:20:18 +08:00
for ( int b = 0 ; b < binLeftEndpoints . length ; b + + ) {
2010-02-24 02:41:02 +08:00
if ( depth < binLeftEndpoints [ b ] ) {
2010-02-24 13:20:18 +08:00
granularBins [ b ] + + ;
return b ;
2010-02-24 02:41:02 +08:00
}
}
2010-02-24 13:20:18 +08:00
granularBins [ binLeftEndpoints . length ] + + ; // greater than all left-endpoints
return binLeftEndpoints . length ;
2010-02-24 02:41:02 +08:00
}
public void merge ( DepthOfCoverageStats newStats ) {
this . mergeSamples ( newStats ) ;
if ( this . tabulateLocusCounts & & newStats . tabulateLocusCounts ) {
this . mergeLocusCounts ( newStats . getLocusCounts ( ) ) ;
}
double totalMean = ( meanCoverages . get ( DepthOfCoverageStats . ALL_SAMPLES ) * nLoci +
newStats . getMeans ( ) . get ( DepthOfCoverageStats . ALL_SAMPLES ) * newStats . getTotalLoci ( ) ) /
( nLoci + newStats . getTotalLoci ( ) ) ;
meanCoverages . put ( DepthOfCoverageStats . ALL_SAMPLES , totalMean ) ;
nLoci + = newStats . getTotalLoci ( ) ;
}
private void mergeSamples ( DepthOfCoverageStats otherStats ) {
Map < String , int [ ] > otherHistogram = otherStats . getHistograms ( ) ;
Map < String , Double > otherMeans = otherStats . getMeans ( ) ;
for ( String s : granularHistogramBySample . keySet ( ) ) {
int [ ] internalCounts = granularHistogramBySample . get ( s ) ;
int [ ] externalCounts = otherHistogram . get ( s ) ;
for ( int b = 0 ; b < internalCounts . length ; b + + ) {
internalCounts [ b ] + = externalCounts [ b ] ;
}
double internalMean = meanCoverages . get ( s ) ;
double externalMean = otherMeans . get ( s ) ;
double newMean = ( internalMean * nLoci + externalMean * otherStats . getTotalLoci ( ) ) / ( nLoci + otherStats . getTotalLoci ( ) ) ;
meanCoverages . put ( s , newMean ) ;
}
}
private void mergeLocusCounts ( int [ ] [ ] otherCounts ) {
for ( int a = 0 ; a < locusCoverageCounts . length ; a + + ) {
for ( int b = 0 ; b < locusCoverageCounts [ 0 ] . length ; b + + ) {
locusCoverageCounts [ a ] [ b ] + = otherCounts [ a ] [ b ] ;
}
}
}
/ *
* Update locus counts - - takes an array in which the number of samples
* with depth ABOVE [ i ] is held . So if the bin left endpoints were 2 , 5 , 10
* then we ' d have an array that represented :
* [ # samples with depth 0 - inf ] , [ # samples with depth 2 - inf ] ,
* [ # samples with depth 5 - inf ] , [ # samples with depth 10 - inf ] ;
*
* this is
* @argument cumulativeSamplesByDepthBin - see above
* /
private void updateLocusCounts ( int [ ] cumulativeSamplesByDepthBin ) {
if ( tabulateLocusCounts ) {
for ( int bin = 0 ; bin < cumulativeSamplesByDepthBin . length ; bin + + ) {
int numSamples = cumulativeSamplesByDepthBin [ bin ] ;
for ( int i = 0 ; i < numSamples ; i + + ) {
locusCoverageCounts [ i ] [ bin ] + + ;
}
cumulativeSamplesByDepthBin [ bin ] = 0 ; // reset counts in advance of next update()
}
}
}
////////////////////////////////////////////////////////////////////////////////////
// ACCESSOR METHODS
////////////////////////////////////////////////////////////////////////////////////
public Map < String , int [ ] > getHistograms ( ) {
return granularHistogramBySample ;
}
public int [ ] [ ] getLocusCounts ( ) {
return locusCoverageCounts ;
}
public int [ ] getEndpoints ( ) {
return binLeftEndpoints ;
}
public Map < String , Double > getMeans ( ) {
return meanCoverages ;
}
public int getTotalLoci ( ) {
return nLoci ;
}
2010-02-26 04:25:07 +08:00
public Set < String > getAllSamples ( ) {
return granularHistogramBySample . keySet ( ) ;
}
2010-02-24 02:41:02 +08:00
}