Cut the memory footprint of the RecalDatum in half to improve performance of CountCovariates when run with many covariates.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2520 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-01-06 16:12:27 +00:00
parent 370a365147
commit e011a1b6f8
3 changed files with 147 additions and 10 deletions

View File

@ -103,7 +103,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base
private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted.
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
private static final String versionString = "v2.2.1"; // Major version, minor version, and build number
private static final String versionString = "v2.2.2"; // Major version, minor version, and build number
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
@ -157,7 +157,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
Utils.warnUser("This calculation is critically dependent on being able to skip over known variant sites. Are you sure you want to be running without a dbSNP rod specified?");
}
// Initialize the requested covariates by parsing the -cov argument
requestedCovariates = new ArrayList<Covariate>();
// First add the required covariates
@ -324,7 +323,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
}
counts.second++;
}
}
}
@ -368,9 +366,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
}
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
RecalDatum datum = (RecalDatum) dataManager.data.get( key );
RecalDatumOptimized datum = (RecalDatumOptimized) dataManager.data.get( key );
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method
datum = new RecalDatumOptimized(); // initialized with zeros, will be incremented at end of method
dataManager.data.put( datum, (Object[])key );
}
@ -469,14 +467,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
for( Comparable comp : keyList ) {
key[curPos] = comp;
final Object val = data.get(comp);
if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps
if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps
// For each Covariate in the key
for( Object compToPrint : key ) {
// Output the Covariate's value
recalTableStream.print( compToPrint + "," );
}
// Output the RecalDatum entry
recalTableStream.println( ((RecalDatum)val).outputToCSV() );
recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() );
} else { // Another layer in the nested hash map
printMappingsSorted( recalTableStream, curPos + 1, key, (Map) val);
}
@ -487,14 +485,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
for( Object comp : data.keySet() ) {
key[curPos] = comp;
final Object val = data.get(comp);
if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps
if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps
// For each Covariate in the key
for( Object compToPrint : key ) {
// Output the Covariate's value
recalTableStream.print( compToPrint + "," );
}
// Output the RecalDatum entry
recalTableStream.println( ((RecalDatum)val).outputToCSV() );
recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() );
} else { // Another layer in the nested hash map
printMappings( recalTableStream, curPos + 1, key, (Map) val);
}

View File

@ -0,0 +1,139 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.*;
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Jan 6, 2010
*
* An individual piece of recalibration data. Optimized for CountCovariates. Extras added to make TableRecalibration fast have been removed.
* Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates.
*/
public class RecalDatumOptimized {
private long numObservations; // number of bases seen in total
private long numMismatches; // number of bases seen that didn't match the reference
//---------------------------------------------------------------------------------------------------------------
//
// constructors
//
//---------------------------------------------------------------------------------------------------------------
public RecalDatumOptimized() {
numObservations = 0L;
numMismatches = 0L;
}
public RecalDatumOptimized( final long _numObservations, final long _numMismatches) {
numObservations = _numObservations;
numMismatches = _numMismatches;
}
public RecalDatumOptimized( final RecalDatumOptimized copy ) {
this.numObservations = copy.numObservations;
this.numMismatches = copy.numMismatches;
}
//---------------------------------------------------------------------------------------------------------------
//
// increment methods
//
//---------------------------------------------------------------------------------------------------------------
public final void increment( final long incObservations, final long incMismatches ) {
numObservations += incObservations;
numMismatches += incMismatches;
}
public final void increment( final RecalDatumOptimized other ) {
increment( other.numObservations, other.numMismatches );
}
public final void increment( final List<RecalDatumOptimized> data ) {
for ( RecalDatumOptimized other : data ) {
this.increment( other );
}
}
public final void increment( final char curBase, final char refBase ) {
increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(refBase) ? 0 : 1 ); // increment takes num observations, then num mismatches
}
//---------------------------------------------------------------------------------------------------------------
//
// methods to derive empirical quality score
//
//---------------------------------------------------------------------------------------------------------------
public final double empiricalQualDouble( final int smoothing ) {
final double doubleMismatches = (double) ( numMismatches + smoothing );
final double doubleObservations = (double) ( numObservations + smoothing );
double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) { empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE; }
return empiricalQual;
}
public final double empiricalQualDouble() { return empiricalQualDouble( 0 ); } // 'default' behavior is to use smoothing value of zero
public final byte empiricalQualByte( final int smoothing ) {
final double doubleMismatches = (double) ( numMismatches + smoothing );
final double doubleObservations = (double) ( numObservations + smoothing );
return QualityUtils.probToQual( 1.0 - doubleMismatches / doubleObservations );
}
public final byte empiricalQualByte() { return empiricalQualByte( 0 ); } // 'default' behavior is to use smoothing value of zero
//---------------------------------------------------------------------------------------------------------------
//
// misc. methods
//
//---------------------------------------------------------------------------------------------------------------
public final String outputToCSV( ) {
return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte() );
}
public final String outputToCSV( final int smoothing ) {
return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte( smoothing ) );
}
public final long getNumObservations() {
return numObservations;
}
public final long getNumMismatches() {
return numMismatches;
}
public String toString() {
return String.format( "RecalDatum: %d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte() );
}
}

View File

@ -92,7 +92,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
private static final String versionString = "v2.2.2"; // Major version, minor version, and build number
private static final String versionString = "v2.2.3"; // Major version, minor version, and build number
private SAMFileWriter OUTPUT_BAM = null;// The File Writer that will write out the recalibrated bam
private Random coinFlip; // Random number generator is used to remove reference bias in solid bams
private static final long RANDOM_SEED = 1032861495;