From f0a234ab29c014225e5e210575626827df12a1a4 Mon Sep 17 00:00:00 2001 From: rpoplin Date: Wed, 18 Nov 2009 16:47:44 +0000 Subject: [PATCH] TableRecalibration is now much smarter about hashing calculations, taking advantage of the sequential recalibration formulation. Instead of hashing RecalDatums it hashes the empirical quality score itself. This cuts the runtime by 20 percent. TableRecalibration also now skips over reads with zero mapping quality (outputs them to the new bam but doesn't touch their base quality scores). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2069 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/Recalibration/DinucCovariate.java | 2 +- .../Recalibration/RecalDataManager.java | 186 +++++++++++------- .../walkers/Recalibration/RecalDatum.java | 6 +- .../TableRecalibrationWalker.java | 98 ++++----- 4 files changed, 169 insertions(+), 123 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java index 4a7045563..02c912468 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java @@ -51,7 +51,7 @@ public class DinucCovariate implements Covariate { dinucHashMap = new HashMap(); for(byte byte1 : BASES) { for(byte byte2: BASES) { - dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); + dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); // This might seem silly, but Strings are too slow } } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java index 5d03c2972..693fedad9 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java @@ -1,6 +1,5 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; -import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.QualityUtils; import java.util.*; @@ -43,98 +42,134 @@ public class RecalDataManager { private NHashMap dataCollapsedReadGroup; // table where everything except read group has been collapsed private NHashMap dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed private ArrayList> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed - private boolean collapsedTablesCreated; public NHashMap dataSumExpectedErrors; // table used to calculate the overall aggregate quality score in which everything except read group is collapsed + private NHashMap dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed + private NHashMap dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed + private ArrayList> dataCollapsedByCovariateDouble; // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed + + public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // the tag that holds the original quality scores public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // the tag that holds the color space quality scores for SOLID bams RecalDataManager() { data = new NHashMap(); - collapsedTablesCreated = false; + } + + RecalDataManager( final int estimatedCapacity, final boolean createCollapsedTables, final int numCovariates ) { + if( createCollapsedTables ) { // initialize all the collapsed tables + dataCollapsedReadGroup = new NHashMap(); + dataCollapsedQualityScore = new NHashMap(); + dataCollapsedByCovariate = new ArrayList>(); + for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate + dataCollapsedByCovariate.add( new NHashMap() ); + } + dataSumExpectedErrors = new NHashMap(); + } else { + data = new NHashMap( estimatedCapacity, 0.8f); + } } - RecalDataManager( int estimatedCapacity ) { + RecalDataManager( final int estimatedCapacity ) { data = new NHashMap( estimatedCapacity, 0.8f ); // second arg is the 'loading factor', // a number to monkey around with when optimizing performace of the HashMap - collapsedTablesCreated = false; } - // BUGBUG: A lot going on in this method, doing a lot of pre-calculations for use in the sequential mode calculation later in TableRecalibrationWalker /** - * Create all the collapsed tables that will be used in the sequential calculation in TableRecalibrationWalker - * @param numCovariates The number of covariates you have determines the number of tables to create + * Add the given mapping to all of the collapsed hash tables + * @param key The list of comparables that is the key for this mapping + * @param fullDatum The RecalDatum which is the data for this mapping */ - public final void createCollapsedTables( final int numCovariates ) { - dataCollapsedReadGroup = new NHashMap(); - dataCollapsedQualityScore = new NHashMap(); - dataCollapsedByCovariate = new ArrayList>(); - for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate - dataCollapsedByCovariate.add( new NHashMap() ); + public final void addToAllTables( final List key, final RecalDatum fullDatum ) { + + // The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around + //data.put(key, thisDatum); // add the mapping to the main table + + // create dataCollapsedReadGroup, the table where everything except read group has been collapsed + ArrayList newKey = new ArrayList(); + newKey.add( key.get(0) ); // make a new key with just the read group + RecalDatum collapsedDatum = dataCollapsedReadGroup.get( newKey ); + if( collapsedDatum == null ) { + dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) ); + } else { + collapsedDatum.increment(fullDatum); } - dataSumExpectedErrors = new NHashMap(); - // preallocate for use in for loops below - RecalDatum thisDatum; - RecalDatum collapsedDatum; - List key; - ArrayList newKey; - Double sumExpectedErrors; + // create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed + newKey = new ArrayList(); + newKey.add( key.get(0) ); // make a new key with just the read group + Double sumExpectedErrors = dataSumExpectedErrors.get( newKey ); + if( sumExpectedErrors == null ) { + dataSumExpectedErrors.put( newKey, 0.0 ); + } else { + dataSumExpectedErrors.remove( newKey ); + sumExpectedErrors += QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * fullDatum.getNumObservations(); + dataSumExpectedErrors.put( newKey, sumExpectedErrors ); + } - // for every data point in the map - for( Map.Entry,RecalDatum> entry : data.entrySet() ) { - thisDatum = entry.getValue(); - key = entry.getKey(); - - // create dataCollapsedReadGroup, the table where everything except read group has been collapsed + newKey = new ArrayList(); + // create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed + newKey.add( key.get(0) ); // make a new key with the read group ... + newKey.add( key.get(1) ); // and quality score + collapsedDatum = dataCollapsedQualityScore.get( newKey ); + if( collapsedDatum == null ) { + dataCollapsedQualityScore.put( newKey, new RecalDatum(fullDatum) ); + } else { + collapsedDatum.increment(fullDatum); + } + + // create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed + for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { // readGroup and QualityScore aren't counted newKey = new ArrayList(); - newKey.add( key.get(0) ); // make a new key with just the read group - collapsedDatum = dataCollapsedReadGroup.get( newKey ); - if( collapsedDatum == null ) { - dataCollapsedReadGroup.put( newKey, new RecalDatum( thisDatum ) ); - } else { - collapsedDatum.increment( thisDatum ); - } - - // create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed - newKey = new ArrayList(); - newKey.add( key.get(0) ); // make a new key with just the read group - sumExpectedErrors = dataSumExpectedErrors.get( newKey ); - if( sumExpectedErrors == null ) { - dataSumExpectedErrors.put( newKey, 0.0 ); - } else { - dataSumExpectedErrors.remove( newKey ); - sumExpectedErrors += QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * thisDatum.getNumObservations(); - dataSumExpectedErrors.put( newKey, sumExpectedErrors ); - } - - newKey = new ArrayList(); - // create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed newKey.add( key.get(0) ); // make a new key with the read group ... - newKey.add( key.get(1) ); // and quality score - collapsedDatum = dataCollapsedQualityScore.get( newKey ); + newKey.add( key.get(1) ); // and quality score ... + newKey.add( key.get(iii + 2) ); // and the given covariate + collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey ); if( collapsedDatum == null ) { - dataCollapsedQualityScore.put( newKey, new RecalDatum( thisDatum ) ); + dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) ); } else { - collapsedDatum.increment( thisDatum ); + collapsedDatum.increment(fullDatum); } + } + } - // create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed - for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted - newKey = new ArrayList(); - newKey.add( key.get(0) ); // make a new key with the read group ... - newKey.add( key.get(1) ); // and quality score ... - newKey.add( key.get(iii + 2) ); // and the given covariate - collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey ); - if( collapsedDatum == null ) { - dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum( thisDatum ) ); - } else { - collapsedDatum.increment( thisDatum ); - } + /** + * Loop over all the collapsed tables and turn the recalDatums found there into an empricial quality score + * that will be used in the sequential calculation in TableRecalibrationWalker + * @param numCovariates The number of covariates you have determines the number of tables to create + * @param smoothing The smoothing paramter that goes into empirical quality score calculation + */ + public final void generateEmpiricalQualities( final int numCovariates, final int smoothing ) { + + dataCollapsedReadGroupDouble = new NHashMap(); + dataCollapsedQualityScoreDouble = new NHashMap(); + dataCollapsedByCovariateDouble = new ArrayList>(); + for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate + dataCollapsedByCovariateDouble.add( new NHashMap() ); + } + + // Hash the empirical quality scores so we don't have to call Math.log at every base for every read + // Looping over the entrySet is really expensive but worth it + for( Map.Entry,RecalDatum> entry : dataCollapsedReadGroup.entrySet() ) { + dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing )); + } + for( Map.Entry,RecalDatum> entry : dataCollapsedQualityScore.entrySet() ) { + dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing )); + } + for( int iii = 0; iii < numCovariates - 2; iii++ ) { + for( Map.Entry,RecalDatum> entry : dataCollapsedByCovariate.get(iii).entrySet() ) { + dataCollapsedByCovariateDouble.get(iii).put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing )); } } - collapsedTablesCreated = true; + dataCollapsedQualityScore.clear(); + dataCollapsedByCovariate.clear(); + dataCollapsedQualityScore = null; // will never need this again + dataCollapsedByCovariate = null; // will never need this again + if( data!=null ) { + data.clear(); + data = null; // will never need this again + } } /** @@ -143,10 +178,6 @@ public class RecalDataManager { * @return The desired collapsed HashMap */ public final NHashMap getCollapsedTable( final int covariate ) { - if( !collapsedTablesCreated ) { - throw new StingException("Trying to get collapsed tables before they have been populated. Null pointers abound."); - } - if( covariate == 0) { return dataCollapsedReadGroup; // table where everything except read group has been collapsed } else if( covariate == 1 ) { @@ -155,4 +186,19 @@ public class RecalDataManager { return dataCollapsedByCovariate.get( covariate - 2 ); // table where everything except read group, quality score, and given covariate has been collapsed } } + + /** + * Get the appropriate collapsed table of emprical quality out of the set of all the tables held by this Object + * @param covariate Which covariate indexes the desired collapsed NHashMap + * @return The desired collapsed NHashMap + */ + public final NHashMap getCollapsedDoubleTable( final int covariate ) { + if( covariate == 0) { + return dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed + } else if( covariate == 1 ) { + return dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed + } else { + return dataCollapsedByCovariateDouble.get( covariate - 2 ); // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed + } + } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDatum.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDatum.java index 4904deacc..da07c331e 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDatum.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDatum.java @@ -86,8 +86,8 @@ public class RecalDatum { } } - public final void increment( final char curBase, final char ref ) { - increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1 ); // increment takes num observations, then num mismatches + public final void increment( final char curBase, final char refBase ) { + increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(refBase) ? 0 : 1 ); // increment takes num observations, then num mismatches } //--------------------------------------------------------------------------------------------------------------- @@ -100,7 +100,7 @@ public class RecalDatum { double doubleMismatches = (double) ( numMismatches + smoothing ); double doubleObservations = (double) ( numObservations + smoothing ); double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); - if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE; + if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) { empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE; } return empiricalQual; } public final double empiricalQualDouble() { return empiricalQualDouble( 0 ); } // 'default' behavior is to use smoothing value of zero diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java index c0cbd8c69..4e25bd98d 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java @@ -74,7 +74,7 @@ public class TableRecalibrationWalker extends ReadWalker requestedCovariates; + private RecalDataManager dataManager; + private ArrayList requestedCovariates; private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*"); @@ -162,10 +162,10 @@ public class TableRecalibrationWalker extends ReadWalker 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception - dataManager = new RecalDataManager( estimatedCapacity ); + final boolean createCollapsedTables = true; + // Initialize the data hashMaps + dataManager = new RecalDataManager( estimatedCapacity, createCollapsedTables, requestedCovariates.size() ); } addCSVData(line); // parse the line and add the data to the HashMap @@ -179,10 +179,13 @@ public class TableRecalibrationWalker extends ReadWalker QualityUtils.MAX_REASONABLE_Q_SCORE ) { - throw new StingException( "Assigning bad quality score " + key + " => " + recalQuals[iii] ); - } + //if( MODE_STRING.equalsIgnoreCase("COMBINATORIAL") ) { // BUGBUG: This isn't supported. No need to keep the full data hashmap around so it was removed for major speed up + // //RecalDatum datum = dataManager.data.get( key ); + // //if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing + // // recalQuals[iii] = datum.empiricalQualByte( SMOOTHING ); + // //} + // throw new StingException("The Combinatorial mode isn't supported."); + //} else if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) { + // + //} else { + // throw new StingException( "Specified RecalibrationMode is not supported: " + MODE_STRING ); + //} } preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low @@ -303,42 +305,40 @@ public class TableRecalibrationWalker extends ReadWalker key ) { - byte qualFromRead = Byte.parseByte(key.get(1).toString()); - ArrayList newKey; + String readGroupKeyElement = key.get(0).toString(); + int qualityScoreKeyElement = Integer.parseInt(key.get(1).toString()); + byte qualFromRead = (byte)qualityScoreKeyElement; + ArrayList newKey = new ArrayList(); // The global quality shift (over the read group only) - newKey = new ArrayList(); - newKey.add( key.get(0) ); // read group + newKey.add( readGroupKeyElement ); RecalDatum globalDeltaQDatum = dataManager.getCollapsedTable(0).get( newKey ); + Double globalDeltaQEmpirical = dataManager.getCollapsedDoubleTable(0).get( newKey ); double globalDeltaQ = 0.0; double aggregrateQreported = 0.0; if( globalDeltaQDatum != null ) { - aggregrateQreported = QualityUtils.phredScaleErrorRate( dataManager.dataSumExpectedErrors.get( newKey ) / ((double) globalDeltaQDatum.getNumObservations()) ); - globalDeltaQ = globalDeltaQDatum.empiricalQualDouble( SMOOTHING ) - aggregrateQreported; + aggregrateQreported = QualityUtils.phredScaleErrorRate( dataManager.dataSumExpectedErrors.get( newKey ) / ((double) globalDeltaQDatum.getNumObservations()) ); + globalDeltaQ = globalDeltaQEmpirical - aggregrateQreported; } - + // The shift in quality between reported and empirical - newKey = new ArrayList(); - newKey.add( key.get(0) ); // read group - newKey.add( key.get(1) ); // quality score - RecalDatum deltaQReportedDatum = dataManager.getCollapsedTable(1).get( newKey ); + newKey.add( qualityScoreKeyElement ); + Double deltaQReportedEmpirical = dataManager.getCollapsedDoubleTable(1).get( newKey ); double deltaQReported = 0.0; - if( deltaQReportedDatum != null ) { - deltaQReported = deltaQReportedDatum.empiricalQualDouble( SMOOTHING ) - qualFromRead - globalDeltaQ; + if( deltaQReportedEmpirical != null ) { + deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; } // The shift in quality due to each covariate by itself in turn double deltaQCovariates = 0.0; - RecalDatum deltaQCovariateDatum; + Double deltaQCovariateEmpirical; for( int iii = 2; iii < key.size(); iii++ ) { - newKey = new ArrayList(); - newKey.add( key.get(0) ); // read group - newKey.add( key.get(1) ); // quality score - newKey.add( key.get(iii) ); // given covariate - deltaQCovariateDatum = dataManager.getCollapsedTable(iii).get( newKey ); - if( deltaQCovariateDatum != null ) { - deltaQCovariates += ( deltaQCovariateDatum.empiricalQualDouble( SMOOTHING ) - qualFromRead - (globalDeltaQ + deltaQReported) ); + newKey.add( key.get(iii) ); // the given covariate + deltaQCovariateEmpirical = dataManager.getCollapsedDoubleTable(iii).get( newKey ); + if( deltaQCovariateEmpirical != null ) { + deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) ); } + newKey.remove( 2 ); // this new covariate is always added in at position 2 in the newKey list } double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;