Cut the memory footprint of the RecalDatum in half to improve performance of CountCovariates when run with many covariates.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2520 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
370a365147
commit
e011a1b6f8
|
|
@ -103,7 +103,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base
|
private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base
|
||||||
private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted.
|
private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted.
|
||||||
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
|
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
|
||||||
private static final String versionString = "v2.2.1"; // Major version, minor version, and build number
|
private static final String versionString = "v2.2.2"; // Major version, minor version, and build number
|
||||||
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
|
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
|
||||||
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
|
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
|
||||||
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
|
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
|
||||||
|
|
@ -157,7 +157,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
Utils.warnUser("This calculation is critically dependent on being able to skip over known variant sites. Are you sure you want to be running without a dbSNP rod specified?");
|
Utils.warnUser("This calculation is critically dependent on being able to skip over known variant sites. Are you sure you want to be running without a dbSNP rod specified?");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Initialize the requested covariates by parsing the -cov argument
|
// Initialize the requested covariates by parsing the -cov argument
|
||||||
requestedCovariates = new ArrayList<Covariate>();
|
requestedCovariates = new ArrayList<Covariate>();
|
||||||
// First add the required covariates
|
// First add the required covariates
|
||||||
|
|
@ -324,7 +323,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
}
|
}
|
||||||
counts.second++;
|
counts.second++;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -368,9 +366,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
|
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
|
||||||
RecalDatum datum = (RecalDatum) dataManager.data.get( key );
|
RecalDatumOptimized datum = (RecalDatumOptimized) dataManager.data.get( key );
|
||||||
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
|
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
|
||||||
datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method
|
datum = new RecalDatumOptimized(); // initialized with zeros, will be incremented at end of method
|
||||||
dataManager.data.put( datum, (Object[])key );
|
dataManager.data.put( datum, (Object[])key );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -469,14 +467,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
for( Comparable comp : keyList ) {
|
for( Comparable comp : keyList ) {
|
||||||
key[curPos] = comp;
|
key[curPos] = comp;
|
||||||
final Object val = data.get(comp);
|
final Object val = data.get(comp);
|
||||||
if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps
|
if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps
|
||||||
// For each Covariate in the key
|
// For each Covariate in the key
|
||||||
for( Object compToPrint : key ) {
|
for( Object compToPrint : key ) {
|
||||||
// Output the Covariate's value
|
// Output the Covariate's value
|
||||||
recalTableStream.print( compToPrint + "," );
|
recalTableStream.print( compToPrint + "," );
|
||||||
}
|
}
|
||||||
// Output the RecalDatum entry
|
// Output the RecalDatum entry
|
||||||
recalTableStream.println( ((RecalDatum)val).outputToCSV() );
|
recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() );
|
||||||
} else { // Another layer in the nested hash map
|
} else { // Another layer in the nested hash map
|
||||||
printMappingsSorted( recalTableStream, curPos + 1, key, (Map) val);
|
printMappingsSorted( recalTableStream, curPos + 1, key, (Map) val);
|
||||||
}
|
}
|
||||||
|
|
@ -487,14 +485,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
for( Object comp : data.keySet() ) {
|
for( Object comp : data.keySet() ) {
|
||||||
key[curPos] = comp;
|
key[curPos] = comp;
|
||||||
final Object val = data.get(comp);
|
final Object val = data.get(comp);
|
||||||
if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps
|
if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps
|
||||||
// For each Covariate in the key
|
// For each Covariate in the key
|
||||||
for( Object compToPrint : key ) {
|
for( Object compToPrint : key ) {
|
||||||
// Output the Covariate's value
|
// Output the Covariate's value
|
||||||
recalTableStream.print( compToPrint + "," );
|
recalTableStream.print( compToPrint + "," );
|
||||||
}
|
}
|
||||||
// Output the RecalDatum entry
|
// Output the RecalDatum entry
|
||||||
recalTableStream.println( ((RecalDatum)val).outputToCSV() );
|
recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() );
|
||||||
} else { // Another layer in the nested hash map
|
} else { // Another layer in the nested hash map
|
||||||
printMappings( recalTableStream, curPos + 1, key, (Map) val);
|
printMappings( recalTableStream, curPos + 1, key, (Map) val);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,139 @@
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: rpoplin
|
||||||
|
* Date: Jan 6, 2010
|
||||||
|
*
|
||||||
|
* An individual piece of recalibration data. Optimized for CountCovariates. Extras added to make TableRecalibration fast have been removed.
|
||||||
|
* Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates.
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class RecalDatumOptimized {
|
||||||
|
|
||||||
|
private long numObservations; // number of bases seen in total
|
||||||
|
private long numMismatches; // number of bases seen that didn't match the reference
|
||||||
|
|
||||||
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// constructors
|
||||||
|
//
|
||||||
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
public RecalDatumOptimized() {
|
||||||
|
numObservations = 0L;
|
||||||
|
numMismatches = 0L;
|
||||||
|
}
|
||||||
|
|
||||||
|
public RecalDatumOptimized( final long _numObservations, final long _numMismatches) {
|
||||||
|
numObservations = _numObservations;
|
||||||
|
numMismatches = _numMismatches;
|
||||||
|
}
|
||||||
|
|
||||||
|
public RecalDatumOptimized( final RecalDatumOptimized copy ) {
|
||||||
|
this.numObservations = copy.numObservations;
|
||||||
|
this.numMismatches = copy.numMismatches;
|
||||||
|
}
|
||||||
|
|
||||||
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// increment methods
|
||||||
|
//
|
||||||
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
public final void increment( final long incObservations, final long incMismatches ) {
|
||||||
|
numObservations += incObservations;
|
||||||
|
numMismatches += incMismatches;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final void increment( final RecalDatumOptimized other ) {
|
||||||
|
increment( other.numObservations, other.numMismatches );
|
||||||
|
}
|
||||||
|
|
||||||
|
public final void increment( final List<RecalDatumOptimized> data ) {
|
||||||
|
for ( RecalDatumOptimized other : data ) {
|
||||||
|
this.increment( other );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public final void increment( final char curBase, final char refBase ) {
|
||||||
|
increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(refBase) ? 0 : 1 ); // increment takes num observations, then num mismatches
|
||||||
|
}
|
||||||
|
|
||||||
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// methods to derive empirical quality score
|
||||||
|
//
|
||||||
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
public final double empiricalQualDouble( final int smoothing ) {
|
||||||
|
final double doubleMismatches = (double) ( numMismatches + smoothing );
|
||||||
|
final double doubleObservations = (double) ( numObservations + smoothing );
|
||||||
|
double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
|
||||||
|
if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) { empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE; }
|
||||||
|
return empiricalQual;
|
||||||
|
}
|
||||||
|
public final double empiricalQualDouble() { return empiricalQualDouble( 0 ); } // 'default' behavior is to use smoothing value of zero
|
||||||
|
|
||||||
|
public final byte empiricalQualByte( final int smoothing ) {
|
||||||
|
final double doubleMismatches = (double) ( numMismatches + smoothing );
|
||||||
|
final double doubleObservations = (double) ( numObservations + smoothing );
|
||||||
|
return QualityUtils.probToQual( 1.0 - doubleMismatches / doubleObservations );
|
||||||
|
}
|
||||||
|
public final byte empiricalQualByte() { return empiricalQualByte( 0 ); } // 'default' behavior is to use smoothing value of zero
|
||||||
|
|
||||||
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// misc. methods
|
||||||
|
//
|
||||||
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
public final String outputToCSV( ) {
|
||||||
|
return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte() );
|
||||||
|
}
|
||||||
|
public final String outputToCSV( final int smoothing ) {
|
||||||
|
return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte( smoothing ) );
|
||||||
|
}
|
||||||
|
|
||||||
|
public final long getNumObservations() {
|
||||||
|
return numObservations;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final long getNumMismatches() {
|
||||||
|
return numMismatches;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return String.format( "RecalDatum: %d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte() );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -92,7 +92,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||||
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||||
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||||
private static final String versionString = "v2.2.2"; // Major version, minor version, and build number
|
private static final String versionString = "v2.2.3"; // Major version, minor version, and build number
|
||||||
private SAMFileWriter OUTPUT_BAM = null;// The File Writer that will write out the recalibrated bam
|
private SAMFileWriter OUTPUT_BAM = null;// The File Writer that will write out the recalibrated bam
|
||||||
private Random coinFlip; // Random number generator is used to remove reference bias in solid bams
|
private Random coinFlip; // Random number generator is used to remove reference bias in solid bams
|
||||||
private static final long RANDOM_SEED = 1032861495;
|
private static final long RANDOM_SEED = 1032861495;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue