TableRecalibration is now much smarter about hashing calculations, taking advantage of the sequential recalibration formulation. Instead of hashing RecalDatums it hashes the empirical quality score itself. This cuts the runtime by 20 percent. TableRecalibration also now skips over reads with zero mapping quality (outputs them to the new bam but doesn't touch their base quality scores).

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2069 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2009-11-18 16:47:44 +00:00
parent be31d7f4cc
commit f0a234ab29
4 changed files with 169 additions and 123 deletions

View File

@ -51,7 +51,7 @@ public class DinucCovariate implements Covariate {
dinucHashMap = new HashMap<Integer, Dinuc>();
for(byte byte1 : BASES) {
for(byte byte2: BASES) {
dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) );
dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); // This might seem silly, but Strings are too slow
}
}
}

View File

@ -1,6 +1,5 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.*;
@ -43,98 +42,134 @@ public class RecalDataManager {
private NHashMap<RecalDatum> dataCollapsedReadGroup; // table where everything except read group has been collapsed
private NHashMap<RecalDatum> dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed
private ArrayList<NHashMap<RecalDatum>> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed
private boolean collapsedTablesCreated;
public NHashMap<Double> dataSumExpectedErrors; // table used to calculate the overall aggregate quality score in which everything except read group is collapsed
private NHashMap<Double> dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed
private NHashMap<Double> dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed
private ArrayList<NHashMap<Double>> dataCollapsedByCovariateDouble; // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // the tag that holds the original quality scores
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // the tag that holds the color space quality scores for SOLID bams
/**
 * Default constructor: allocates only the full (uncollapsed) data table.
 * Collapsed tables are not created here, so collapsedTablesCreated starts false.
 */
RecalDataManager() {
data = new NHashMap<RecalDatum>();
collapsedTablesCreated = false;
}
/**
 * Constructor used when reading the recalibration table back in.
 * @param estimatedCapacity Estimated number of entries, used to presize the full data table
 * @param createCollapsedTables If true, allocate only the collapsed tables used by the
 *        sequential calculation; the full data table is intentionally left null in that
 *        case (the full dataset is never consulted by the sequential mode)
 * @param numCovariates Total number of covariates; readGroup and qualityScore get their
 *        own dedicated tables, so only numCovariates - 2 per-covariate tables are created
 */
RecalDataManager( final int estimatedCapacity, final boolean createCollapsedTables, final int numCovariates ) {
if( createCollapsedTables ) { // initialize all the collapsed tables
dataCollapsedReadGroup = new NHashMap<RecalDatum>();
dataCollapsedQualityScore = new NHashMap<RecalDatum>();
dataCollapsedByCovariate = new ArrayList<NHashMap<RecalDatum>>();
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
dataCollapsedByCovariate.add( new NHashMap<RecalDatum>() );
}
dataSumExpectedErrors = new NHashMap<Double>();
} else {
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f); // 0.8f is the HashMap load factor
}
}
RecalDataManager( int estimatedCapacity ) {
RecalDataManager( final int estimatedCapacity ) {
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f ); // second arg is the 'loading factor',
// a number to monkey around with when optimizing performance of the HashMap
collapsedTablesCreated = false;
}
// BUGBUG: A lot going on in this method, doing a lot of pre-calculations for use in the sequential mode calculation later in TableRecalibrationWalker
/**
* Create all the collapsed tables that will be used in the sequential calculation in TableRecalibrationWalker
* @param numCovariates The number of covariates you have determines the number of tables to create
* Add the given mapping to all of the collapsed hash tables
* @param key The list of comparables that is the key for this mapping
* @param fullDatum The RecalDatum which is the data for this mapping
*/
public final void createCollapsedTables( final int numCovariates ) {
dataCollapsedReadGroup = new NHashMap<RecalDatum>();
dataCollapsedQualityScore = new NHashMap<RecalDatum>();
dataCollapsedByCovariate = new ArrayList<NHashMap<RecalDatum>>();
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
dataCollapsedByCovariate.add( new NHashMap<RecalDatum>() );
public final void addToAllTables( final List<? extends Comparable> key, final RecalDatum fullDatum ) {
// The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around
//data.put(key, thisDatum); // add the mapping to the main table
// create dataCollapsedReadGroup, the table where everything except read group has been collapsed
ArrayList<Comparable> newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // make a new key with just the read group
RecalDatum collapsedDatum = dataCollapsedReadGroup.get( newKey );
if( collapsedDatum == null ) {
dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) );
} else {
collapsedDatum.increment(fullDatum);
}
dataSumExpectedErrors = new NHashMap<Double>();
// preallocate for use in for loops below
RecalDatum thisDatum;
RecalDatum collapsedDatum;
List<? extends Comparable> key;
ArrayList<Comparable> newKey;
Double sumExpectedErrors;
// create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // make a new key with just the read group
Double sumExpectedErrors = dataSumExpectedErrors.get( newKey );
if( sumExpectedErrors == null ) {
dataSumExpectedErrors.put( newKey, 0.0 );
} else {
dataSumExpectedErrors.remove( newKey );
sumExpectedErrors += QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * fullDatum.getNumObservations();
dataSumExpectedErrors.put( newKey, sumExpectedErrors );
}
// for every data point in the map
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : data.entrySet() ) {
thisDatum = entry.getValue();
key = entry.getKey();
// create dataCollapsedReadGroup, the table where everything except read group has been collapsed
newKey = new ArrayList<Comparable>();
// create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
newKey.add( key.get(0) ); // make a new key with the read group ...
newKey.add( key.get(1) ); // and quality score
collapsedDatum = dataCollapsedQualityScore.get( newKey );
if( collapsedDatum == null ) {
dataCollapsedQualityScore.put( newKey, new RecalDatum(fullDatum) );
} else {
collapsedDatum.increment(fullDatum);
}
// create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { // readGroup and QualityScore aren't counted
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // make a new key with just the read group
collapsedDatum = dataCollapsedReadGroup.get( newKey );
if( collapsedDatum == null ) {
dataCollapsedReadGroup.put( newKey, new RecalDatum( thisDatum ) );
} else {
collapsedDatum.increment( thisDatum );
}
// create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // make a new key with just the read group
sumExpectedErrors = dataSumExpectedErrors.get( newKey );
if( sumExpectedErrors == null ) {
dataSumExpectedErrors.put( newKey, 0.0 );
} else {
dataSumExpectedErrors.remove( newKey );
sumExpectedErrors += QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * thisDatum.getNumObservations();
dataSumExpectedErrors.put( newKey, sumExpectedErrors );
}
newKey = new ArrayList<Comparable>();
// create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
newKey.add( key.get(0) ); // make a new key with the read group ...
newKey.add( key.get(1) ); // and quality score
collapsedDatum = dataCollapsedQualityScore.get( newKey );
newKey.add( key.get(1) ); // and quality score ...
newKey.add( key.get(iii + 2) ); // and the given covariate
collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey );
if( collapsedDatum == null ) {
dataCollapsedQualityScore.put( newKey, new RecalDatum( thisDatum ) );
dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) );
} else {
collapsedDatum.increment( thisDatum );
collapsedDatum.increment(fullDatum);
}
}
}
// create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // make a new key with the read group ...
newKey.add( key.get(1) ); // and quality score ...
newKey.add( key.get(iii + 2) ); // and the given covariate
collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey );
if( collapsedDatum == null ) {
dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum( thisDatum ) );
} else {
collapsedDatum.increment( thisDatum );
}
/**
 * Loop over all the collapsed tables and turn the RecalDatums found there into an empirical quality score
 * that will be used in the sequential calculation in TableRecalibrationWalker. Pre-computing the
 * empirical qualities here means Math.log doesn't have to be called at every base of every read later.
 * @param numCovariates The number of covariates determines the number of per-covariate tables to convert
 * @param smoothing The smoothing parameter that goes into the empirical quality score calculation
 */
public final void generateEmpiricalQualities( final int numCovariates, final int smoothing ) {
dataCollapsedReadGroupDouble = new NHashMap<Double>();
dataCollapsedQualityScoreDouble = new NHashMap<Double>();
dataCollapsedByCovariateDouble = new ArrayList<NHashMap<Double>>();
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
dataCollapsedByCovariateDouble.add( new NHashMap<Double>() );
}
// Hash the empirical quality scores so we don't have to call Math.log at every base for every read
// Looping over the entrySet is really expensive but worth it
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedReadGroup.entrySet() ) {
dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
}
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedQualityScore.entrySet() ) {
dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
}
for( int iii = 0; iii < numCovariates - 2; iii++ ) {
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedByCovariate.get(iii).entrySet() ) {
dataCollapsedByCovariateDouble.get(iii).put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
}
}
collapsedTablesCreated = true;
// Free the RecalDatum tables that are no longer needed now the Double tables exist.
// NOTE(review): dataCollapsedReadGroup is deliberately NOT nulled here — it appears to still be
// read elsewhere (e.g. via getCollapsedTable(0)); confirm before freeing it too.
dataCollapsedQualityScore.clear();
dataCollapsedByCovariate.clear();
dataCollapsedQualityScore = null; // will never need this again
dataCollapsedByCovariate = null; // will never need this again
if( data!=null ) {
data.clear();
data = null; // will never need this again
}
}
/**
@ -143,10 +178,6 @@ public class RecalDataManager {
* @return The desired collapsed HashMap
*/
public final NHashMap<RecalDatum> getCollapsedTable( final int covariate ) {
if( !collapsedTablesCreated ) {
throw new StingException("Trying to get collapsed tables before they have been populated. Null pointers abound.");
}
if( covariate == 0) {
return dataCollapsedReadGroup; // table where everything except read group has been collapsed
} else if( covariate == 1 ) {
@ -155,4 +186,19 @@ public class RecalDataManager {
return dataCollapsedByCovariate.get( covariate - 2 ); // table where everything except read group, quality score, and given covariate has been collapsed
}
}
/**
 * Get the appropriate collapsed table of empirical qualities out of the set of tables held by this object.
 * @param covariate Which covariate indexes the desired collapsed NHashMap&lt;Double&gt;
 *                  (0 = read group only, 1 = read group + quality score, &gt;=2 = additional covariates)
 * @return The desired collapsed NHashMap&lt;Double&gt; of empirical quality scores
 */
public final NHashMap<Double> getCollapsedDoubleTable( final int covariate ) {
    if( covariate == 0 ) {
        // table of empirical qualities where everything except read group has been collapsed
        return dataCollapsedReadGroupDouble;
    }
    if( covariate == 1 ) {
        // table of empirical qualities where everything except read group and quality score has been collapsed
        return dataCollapsedQualityScoreDouble;
    }
    // table of empirical qualities where everything except read group, quality score, and the given covariate has been collapsed
    return dataCollapsedByCovariateDouble.get( covariate - 2 );
}
}

View File

@ -86,8 +86,8 @@ public class RecalDatum {
}
}
public final void increment( final char curBase, final char ref ) {
increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1 ); // increment takes num observations, then num mismatches
public final void increment( final char curBase, final char refBase ) {
increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(refBase) ? 0 : 1 ); // increment takes num observations, then num mismatches
}
//---------------------------------------------------------------------------------------------------------------
@ -100,7 +100,7 @@ public class RecalDatum {
double doubleMismatches = (double) ( numMismatches + smoothing );
double doubleObservations = (double) ( numObservations + smoothing );
double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE;
if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) { empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE; }
return empiricalQual;
}
public final double empiricalQualDouble() { return empiricalQualDouble( 0 ); } // 'default' behavior is to use smoothing value of zero

View File

@ -74,7 +74,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
private int WINDOW_SIZE = 3;
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
public int SMOOTHING = 1;
private int SMOOTHING = 1;
//public enum RecalibrationMode {
// COMBINATORIAL,
@ -83,11 +83,11 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
//}
//@Argument(fullName="recalibrationMode", shortName="mode", doc="Which calculation to use when recalibrating, default is SEQUENTIAL", required=false)
public String MODE_STRING = "SEQUENTIAL";
private String MODE_STRING = "SEQUENTIAL";
//public RecalibrationMode MODE = RecalibrationMode.SEQUENTIAL; //BUGBUG: do we need to support the other modes?
protected RecalDataManager dataManager;
protected ArrayList<Covariate> requestedCovariates;
private RecalDataManager dataManager;
private ArrayList<Covariate> requestedCovariates;
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*");
@ -162,10 +162,10 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
} else { // found some data
if( !foundAllCovariates ) {
foundAllCovariates = true;
logger.info( "The covariates being used here: " );
logger.info( requestedCovariates );
if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception
dataManager = new RecalDataManager( estimatedCapacity );
final boolean createCollapsedTables = true;
// Initialize the data hashMaps
dataManager = new RecalDataManager( estimatedCapacity, createCollapsedTables, requestedCovariates.size() );
}
addCSVData(line); // parse the line and add the data to the HashMap
@ -179,10 +179,13 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
}
logger.info( "...done!" );
logger.info( "The covariates being used here: " );
logger.info( requestedCovariates );
// Create the collapsed tables that are used in the sequential calculation
if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) {
logger.info( "Creating collapsed tables for use in sequential calculation..." );
dataManager.createCollapsedTables( requestedCovariates.size() );
logger.info( "Generating tables of empirical qualities for use in sequential calculation..." );
dataManager.generateEmpiricalQualities( requestedCovariates.size(), SMOOTHING );
logger.info( "...done!" );
}
}
@ -201,7 +204,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
key.add( cov.getValue( vals[iii] ) );
}
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ) );
dataManager.data.put( key, datum );
dataManager.addToAllTables( key, datum );
}
//---------------------------------------------------------------------------------------------------------------
@ -218,12 +222,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
*/
public SAMRecord map( char[] refBases, SAMRecord read ) {
// WARNING: refBases is always null because this walker doesn't have @REQUIRES({DataSource.REFERENCE_BASES})
// WARNING: refBases is always null because this walker doesn't have @Requires({DataSource.REFERENCE_BASES})
// This is done in order to speed up the code
// if( read.getMappingQuality() <= 0 ) {
// return read; // early return here, unmapped reads and mapping quality zero reads should be left alone
// }
if( read.getMappingQuality() <= 0 ) {
return read; // early return here, unmapped reads and mapping quality zero reads should be left alone
}
byte[] originalQuals = read.getBaseQualities();
// Check if we need to use the original quality scores instead
@ -253,21 +257,19 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
key.add( covariate.getValue( read, iii, readGroup, originalQuals, bases ) ); // offset is zero based so passing iii is correct here
}
if( MODE_STRING.equalsIgnoreCase("COMBINATORIAL") ) {
RecalDatum datum = dataManager.data.get( key );
if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing
recalQuals[iii] = datum.empiricalQualByte( SMOOTHING );
}
} else if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) {
recalQuals[iii] = performSequentialQualityCalculation( key );
} else {
throw new StingException( "Specified RecalibrationMode is not supported: " + MODE_STRING );
}
recalQuals[iii] = performSequentialQualityCalculation( key );
// Do some error checking on the new quality score
if ( recalQuals[iii] <= 0 || recalQuals[iii] > QualityUtils.MAX_REASONABLE_Q_SCORE ) {
throw new StingException( "Assigning bad quality score " + key + " => " + recalQuals[iii] );
}
//if( MODE_STRING.equalsIgnoreCase("COMBINATORIAL") ) { // BUGBUG: This isn't supported. No need to keep the full data hashmap around so it was removed for major speed up
// //RecalDatum datum = dataManager.data.get( key );
// //if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing
// // recalQuals[iii] = datum.empiricalQualByte( SMOOTHING );
// //}
// throw new StingException("The Combinatorial mode isn't supported.");
//} else if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) {
//
//} else {
// throw new StingException( "Specified RecalibrationMode is not supported: " + MODE_STRING );
//}
}
preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low
@ -303,42 +305,40 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
*/
private byte performSequentialQualityCalculation( List<? extends Comparable> key ) {
byte qualFromRead = Byte.parseByte(key.get(1).toString());
ArrayList<Comparable> newKey;
String readGroupKeyElement = key.get(0).toString();
int qualityScoreKeyElement = Integer.parseInt(key.get(1).toString());
byte qualFromRead = (byte)qualityScoreKeyElement;
ArrayList<Comparable> newKey = new ArrayList<Comparable>();
// The global quality shift (over the read group only)
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // read group
newKey.add( readGroupKeyElement );
RecalDatum globalDeltaQDatum = dataManager.getCollapsedTable(0).get( newKey );
Double globalDeltaQEmpirical = dataManager.getCollapsedDoubleTable(0).get( newKey );
double globalDeltaQ = 0.0;
double aggregrateQreported = 0.0;
if( globalDeltaQDatum != null ) {
aggregrateQreported = QualityUtils.phredScaleErrorRate( dataManager.dataSumExpectedErrors.get( newKey ) / ((double) globalDeltaQDatum.getNumObservations()) );
globalDeltaQ = globalDeltaQDatum.empiricalQualDouble( SMOOTHING ) - aggregrateQreported;
aggregrateQreported = QualityUtils.phredScaleErrorRate( dataManager.dataSumExpectedErrors.get( newKey ) / ((double) globalDeltaQDatum.getNumObservations()) );
globalDeltaQ = globalDeltaQEmpirical - aggregrateQreported;
}
// The shift in quality between reported and empirical
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // read group
newKey.add( key.get(1) ); // quality score
RecalDatum deltaQReportedDatum = dataManager.getCollapsedTable(1).get( newKey );
newKey.add( qualityScoreKeyElement );
Double deltaQReportedEmpirical = dataManager.getCollapsedDoubleTable(1).get( newKey );
double deltaQReported = 0.0;
if( deltaQReportedDatum != null ) {
deltaQReported = deltaQReportedDatum.empiricalQualDouble( SMOOTHING ) - qualFromRead - globalDeltaQ;
if( deltaQReportedEmpirical != null ) {
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
}
// The shift in quality due to each covariate by itself in turn
double deltaQCovariates = 0.0;
RecalDatum deltaQCovariateDatum;
Double deltaQCovariateEmpirical;
for( int iii = 2; iii < key.size(); iii++ ) {
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // read group
newKey.add( key.get(1) ); // quality score
newKey.add( key.get(iii) ); // given covariate
deltaQCovariateDatum = dataManager.getCollapsedTable(iii).get( newKey );
if( deltaQCovariateDatum != null ) {
deltaQCovariates += ( deltaQCovariateDatum.empiricalQualDouble( SMOOTHING ) - qualFromRead - (globalDeltaQ + deltaQReported) );
newKey.add( key.get(iii) ); // the given covariate
deltaQCovariateEmpirical = dataManager.getCollapsedDoubleTable(iii).get( newKey );
if( deltaQCovariateEmpirical != null ) {
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
}
newKey.remove( 2 ); // this new covariate is always added in at position 2 in the newKey list
}
double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;