Cut the memory footprint of the RecalDatum in half to improve performance of CountCovariates when run with many covariates.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2520 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
370a365147
commit
e011a1b6f8
|
|
@ -103,7 +103,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base
|
||||
private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted.
|
||||
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
|
||||
private static final String versionString = "v2.2.1"; // Major version, minor version, and build number
|
||||
private static final String versionString = "v2.2.2"; // Major version, minor version, and build number
|
||||
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
|
||||
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
|
||||
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
|
||||
|
|
@ -157,7 +157,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
Utils.warnUser("This calculation is critically dependent on being able to skip over known variant sites. Are you sure you want to be running without a dbSNP rod specified?");
|
||||
}
|
||||
|
||||
|
||||
// Initialize the requested covariates by parsing the -cov argument
|
||||
requestedCovariates = new ArrayList<Covariate>();
|
||||
// First add the required covariates
|
||||
|
|
@ -324,7 +323,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
}
|
||||
counts.second++;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -368,9 +366,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
}
|
||||
|
||||
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
|
||||
RecalDatum datum = (RecalDatum) dataManager.data.get( key );
|
||||
RecalDatumOptimized datum = (RecalDatumOptimized) dataManager.data.get( key );
|
||||
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
|
||||
datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method
|
||||
datum = new RecalDatumOptimized(); // initialized with zeros, will be incremented at end of method
|
||||
dataManager.data.put( datum, (Object[])key );
|
||||
}
|
||||
|
||||
|
|
@ -469,14 +467,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
for( Comparable comp : keyList ) {
|
||||
key[curPos] = comp;
|
||||
final Object val = data.get(comp);
|
||||
if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps
|
||||
if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps
|
||||
// For each Covariate in the key
|
||||
for( Object compToPrint : key ) {
|
||||
// Output the Covariate's value
|
||||
recalTableStream.print( compToPrint + "," );
|
||||
}
|
||||
// Output the RecalDatum entry
|
||||
recalTableStream.println( ((RecalDatum)val).outputToCSV() );
|
||||
recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() );
|
||||
} else { // Another layer in the nested hash map
|
||||
printMappingsSorted( recalTableStream, curPos + 1, key, (Map) val);
|
||||
}
|
||||
|
|
@ -487,14 +485,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
for( Object comp : data.keySet() ) {
|
||||
key[curPos] = comp;
|
||||
final Object val = data.get(comp);
|
||||
if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps
|
||||
if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps
|
||||
// For each Covariate in the key
|
||||
for( Object compToPrint : key ) {
|
||||
// Output the Covariate's value
|
||||
recalTableStream.print( compToPrint + "," );
|
||||
}
|
||||
// Output the RecalDatum entry
|
||||
recalTableStream.println( ((RecalDatum)val).outputToCSV() );
|
||||
recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() );
|
||||
} else { // Another layer in the nested hash map
|
||||
printMappings( recalTableStream, curPos + 1, key, (Map) val);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,139 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Jan 6, 2010
|
||||
*
|
||||
* An individual piece of recalibration data. Optimized for CountCovariates. Extras added to make TableRecalibration fast have been removed.
|
||||
* Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates.
|
||||
*/
|
||||
|
||||
public class RecalDatumOptimized {
|
||||
|
||||
private long numObservations; // number of bases seen in total
|
||||
private long numMismatches; // number of bases seen that didn't match the reference
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// constructors
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
public RecalDatumOptimized() {
|
||||
numObservations = 0L;
|
||||
numMismatches = 0L;
|
||||
}
|
||||
|
||||
public RecalDatumOptimized( final long _numObservations, final long _numMismatches) {
|
||||
numObservations = _numObservations;
|
||||
numMismatches = _numMismatches;
|
||||
}
|
||||
|
||||
public RecalDatumOptimized( final RecalDatumOptimized copy ) {
|
||||
this.numObservations = copy.numObservations;
|
||||
this.numMismatches = copy.numMismatches;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// increment methods
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
public final void increment( final long incObservations, final long incMismatches ) {
|
||||
numObservations += incObservations;
|
||||
numMismatches += incMismatches;
|
||||
}
|
||||
|
||||
public final void increment( final RecalDatumOptimized other ) {
|
||||
increment( other.numObservations, other.numMismatches );
|
||||
}
|
||||
|
||||
public final void increment( final List<RecalDatumOptimized> data ) {
|
||||
for ( RecalDatumOptimized other : data ) {
|
||||
this.increment( other );
|
||||
}
|
||||
}
|
||||
|
||||
public final void increment( final char curBase, final char refBase ) {
|
||||
increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(refBase) ? 0 : 1 ); // increment takes num observations, then num mismatches
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// methods to derive empirical quality score
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public final double empiricalQualDouble( final int smoothing ) {
|
||||
final double doubleMismatches = (double) ( numMismatches + smoothing );
|
||||
final double doubleObservations = (double) ( numObservations + smoothing );
|
||||
double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
|
||||
if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) { empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE; }
|
||||
return empiricalQual;
|
||||
}
|
||||
public final double empiricalQualDouble() { return empiricalQualDouble( 0 ); } // 'default' behavior is to use smoothing value of zero
|
||||
|
||||
public final byte empiricalQualByte( final int smoothing ) {
|
||||
final double doubleMismatches = (double) ( numMismatches + smoothing );
|
||||
final double doubleObservations = (double) ( numObservations + smoothing );
|
||||
return QualityUtils.probToQual( 1.0 - doubleMismatches / doubleObservations );
|
||||
}
|
||||
public final byte empiricalQualByte() { return empiricalQualByte( 0 ); } // 'default' behavior is to use smoothing value of zero
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// misc. methods
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public final String outputToCSV( ) {
|
||||
return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte() );
|
||||
}
|
||||
public final String outputToCSV( final int smoothing ) {
|
||||
return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte( smoothing ) );
|
||||
}
|
||||
|
||||
public final long getNumObservations() {
|
||||
return numObservations;
|
||||
}
|
||||
|
||||
public final long getNumMismatches() {
|
||||
return numMismatches;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format( "RecalDatum: %d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte() );
|
||||
}
|
||||
}
|
||||
|
|
@ -92,7 +92,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||
private static final String versionString = "v2.2.2"; // Major version, minor version, and build number
|
||||
private static final String versionString = "v2.2.3"; // Major version, minor version, and build number
|
||||
private SAMFileWriter OUTPUT_BAM = null;// The File Writer that will write out the recalibrated bam
|
||||
private Random coinFlip; // Random number generator is used to remove reference bias in solid bams
|
||||
private static final long RANDOM_SEED = 1032861495;
|
||||
|
|
|
|||
Loading…
Reference in New Issue