diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java index 4a7045563..02c912468 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java @@ -51,7 +51,7 @@ public class DinucCovariate implements Covariate { dinucHashMap = new HashMap(); for(byte byte1 : BASES) { for(byte byte2: BASES) { - dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); + dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); // This might seem silly, but Strings are too slow } } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java index 5d03c2972..693fedad9 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java @@ -1,6 +1,5 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; -import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.QualityUtils; import java.util.*; @@ -43,98 +42,134 @@ public class RecalDataManager { private NHashMap dataCollapsedReadGroup; // table where everything except read group has been collapsed private NHashMap dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed private ArrayList> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed - private boolean collapsedTablesCreated; public NHashMap dataSumExpectedErrors; // table used to calculate the overall 
aggregate quality score in which everything except read group is collapsed + private NHashMap dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed + private NHashMap dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed + private ArrayList> dataCollapsedByCovariateDouble; // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed + + public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // the tag that holds the original quality scores public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // the tag that holds the color space quality scores for SOLID bams RecalDataManager() { data = new NHashMap(); - collapsedTablesCreated = false; + } + + RecalDataManager( final int estimatedCapacity, final boolean createCollapsedTables, final int numCovariates ) { + if( createCollapsedTables ) { // initialize all the collapsed tables + dataCollapsedReadGroup = new NHashMap(); + dataCollapsedQualityScore = new NHashMap(); + dataCollapsedByCovariate = new ArrayList>(); + for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate + dataCollapsedByCovariate.add( new NHashMap() ); + } + dataSumExpectedErrors = new NHashMap(); + } else { + data = new NHashMap( estimatedCapacity, 0.8f); + } } - RecalDataManager( int estimatedCapacity ) { + RecalDataManager( final int estimatedCapacity ) { data = new NHashMap( estimatedCapacity, 0.8f ); // second arg is the 'loading factor', // a number to monkey around with when optimizing performace of the HashMap - collapsedTablesCreated = false; } - // BUGBUG: A lot going on in this method, doing a lot of pre-calculations for use in the sequential mode calculation later in TableRecalibrationWalker /** - * Create all the collapsed tables that will be used 
in the sequential calculation in TableRecalibrationWalker - * @param numCovariates The number of covariates you have determines the number of tables to create + * Add the given mapping to all of the collapsed hash tables + * @param key The list of comparables that is the key for this mapping + * @param fullDatum The RecalDatum which is the data for this mapping */ - public final void createCollapsedTables( final int numCovariates ) { - dataCollapsedReadGroup = new NHashMap(); - dataCollapsedQualityScore = new NHashMap(); - dataCollapsedByCovariate = new ArrayList>(); - for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate - dataCollapsedByCovariate.add( new NHashMap() ); + public final void addToAllTables( final List key, final RecalDatum fullDatum ) { + + // The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around + //data.put(key, thisDatum); // add the mapping to the main table + + // create dataCollapsedReadGroup, the table where everything except read group has been collapsed + ArrayList newKey = new ArrayList(); + newKey.add( key.get(0) ); // make a new key with just the read group + RecalDatum collapsedDatum = dataCollapsedReadGroup.get( newKey ); + if( collapsedDatum == null ) { + dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) ); + } else { + collapsedDatum.increment(fullDatum); } - dataSumExpectedErrors = new NHashMap(); - // preallocate for use in for loops below - RecalDatum thisDatum; - RecalDatum collapsedDatum; - List key; - ArrayList newKey; - Double sumExpectedErrors; + // create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed + newKey = new ArrayList(); + newKey.add( key.get(0) ); // make a new key with just the read group + Double sumExpectedErrors = dataSumExpectedErrors.get( newKey ); 
+ if( sumExpectedErrors == null ) { + dataSumExpectedErrors.put( newKey, 0.0 ); + } else { + dataSumExpectedErrors.remove( newKey ); + sumExpectedErrors += QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * fullDatum.getNumObservations(); + dataSumExpectedErrors.put( newKey, sumExpectedErrors ); + } - // for every data point in the map - for( Map.Entry,RecalDatum> entry : data.entrySet() ) { - thisDatum = entry.getValue(); - key = entry.getKey(); - - // create dataCollapsedReadGroup, the table where everything except read group has been collapsed + newKey = new ArrayList(); + // create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed + newKey.add( key.get(0) ); // make a new key with the read group ... + newKey.add( key.get(1) ); // and quality score + collapsedDatum = dataCollapsedQualityScore.get( newKey ); + if( collapsedDatum == null ) { + dataCollapsedQualityScore.put( newKey, new RecalDatum(fullDatum) ); + } else { + collapsedDatum.increment(fullDatum); + } + + // create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed + for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { // readGroup and QualityScore aren't counted newKey = new ArrayList(); - newKey.add( key.get(0) ); // make a new key with just the read group - collapsedDatum = dataCollapsedReadGroup.get( newKey ); - if( collapsedDatum == null ) { - dataCollapsedReadGroup.put( newKey, new RecalDatum( thisDatum ) ); - } else { - collapsedDatum.increment( thisDatum ); - } - - // create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed - newKey = new ArrayList(); - newKey.add( key.get(0) ); // make a new key with just the read group - sumExpectedErrors = dataSumExpectedErrors.get( newKey ); - if( sumExpectedErrors == null ) { - dataSumExpectedErrors.put( 
newKey, 0.0 ); - } else { - dataSumExpectedErrors.remove( newKey ); - sumExpectedErrors += QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * thisDatum.getNumObservations(); - dataSumExpectedErrors.put( newKey, sumExpectedErrors ); - } - - newKey = new ArrayList(); - // create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed newKey.add( key.get(0) ); // make a new key with the read group ... - newKey.add( key.get(1) ); // and quality score - collapsedDatum = dataCollapsedQualityScore.get( newKey ); + newKey.add( key.get(1) ); // and quality score ... + newKey.add( key.get(iii + 2) ); // and the given covariate + collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey ); if( collapsedDatum == null ) { - dataCollapsedQualityScore.put( newKey, new RecalDatum( thisDatum ) ); + dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) ); } else { - collapsedDatum.increment( thisDatum ); + collapsedDatum.increment(fullDatum); } + } + } - // create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed - for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted - newKey = new ArrayList(); - newKey.add( key.get(0) ); // make a new key with the read group ... - newKey.add( key.get(1) ); // and quality score ... 
- newKey.add( key.get(iii + 2) ); // and the given covariate - collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey ); - if( collapsedDatum == null ) { - dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum( thisDatum ) ); - } else { - collapsedDatum.increment( thisDatum ); - } + /** + * Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score + * that will be used in the sequential calculation in TableRecalibrationWalker + * @param numCovariates The number of covariates you have determines the number of tables to create + * @param smoothing The smoothing parameter that goes into empirical quality score calculation + */ + public final void generateEmpiricalQualities( final int numCovariates, final int smoothing ) { + + dataCollapsedReadGroupDouble = new NHashMap(); + dataCollapsedQualityScoreDouble = new NHashMap(); + dataCollapsedByCovariateDouble = new ArrayList>(); + for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate + dataCollapsedByCovariateDouble.add( new NHashMap() ); + } + + // Hash the empirical quality scores so we don't have to call Math.log at every base for every read + // Looping over the entrySet is really expensive but worth it + for( Map.Entry,RecalDatum> entry : dataCollapsedReadGroup.entrySet() ) { + dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing )); + } + for( Map.Entry,RecalDatum> entry : dataCollapsedQualityScore.entrySet() ) { + dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing )); + } + for( int iii = 0; iii < numCovariates - 2; iii++ ) { + for( Map.Entry,RecalDatum> entry : dataCollapsedByCovariate.get(iii).entrySet() ) { + dataCollapsedByCovariateDouble.get(iii).put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing )); } } - collapsedTablesCreated = true; + 
dataCollapsedQualityScore.clear(); + dataCollapsedByCovariate.clear(); + dataCollapsedQualityScore = null; // will never need this again + dataCollapsedByCovariate = null; // will never need this again + if( data!=null ) { + data.clear(); + data = null; // will never need this again + } } /** @@ -143,10 +178,6 @@ public class RecalDataManager { * @return The desired collapsed HashMap */ public final NHashMap getCollapsedTable( final int covariate ) { - if( !collapsedTablesCreated ) { - throw new StingException("Trying to get collapsed tables before they have been populated. Null pointers abound."); - } - if( covariate == 0) { return dataCollapsedReadGroup; // table where everything except read group has been collapsed } else if( covariate == 1 ) { @@ -155,4 +186,19 @@ public class RecalDataManager { return dataCollapsedByCovariate.get( covariate - 2 ); // table where everything except read group, quality score, and given covariate has been collapsed } } + + /** + * Get the appropriate collapsed table of empirical quality out of the set of all the tables held by this Object + * @param covariate Which covariate indexes the desired collapsed NHashMap + * @return The desired collapsed NHashMap + */ + public final NHashMap getCollapsedDoubleTable( final int covariate ) { + if( covariate == 0) { + return dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed + } else if( covariate == 1 ) { + return dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed + } else { + return dataCollapsedByCovariateDouble.get( covariate - 2 ); // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed + } + } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDatum.java 
b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDatum.java index 4904deacc..da07c331e 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDatum.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDatum.java @@ -86,8 +86,8 @@ public class RecalDatum { } } - public final void increment( final char curBase, final char ref ) { - increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1 ); // increment takes num observations, then num mismatches + public final void increment( final char curBase, final char refBase ) { + increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(refBase) ? 0 : 1 ); // increment takes num observations, then num mismatches } //--------------------------------------------------------------------------------------------------------------- @@ -100,7 +100,7 @@ public class RecalDatum { double doubleMismatches = (double) ( numMismatches + smoothing ); double doubleObservations = (double) ( numObservations + smoothing ); double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); - if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE; + if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) { empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE; } return empiricalQual; } public final double empiricalQualDouble() { return empiricalQualDouble( 0 ); } // 'default' behavior is to use smoothing value of zero diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java index c0cbd8c69..4e25bd98d 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java +++ 
b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java @@ -74,7 +74,7 @@ public class TableRecalibrationWalker extends ReadWalker requestedCovariates; + private RecalDataManager dataManager; + private ArrayList requestedCovariates; private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*"); @@ -162,10 +162,10 @@ public class TableRecalibrationWalker extends ReadWalker 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception - dataManager = new RecalDataManager( estimatedCapacity ); + final boolean createCollapsedTables = true; + // Initialize the data hashMaps + dataManager = new RecalDataManager( estimatedCapacity, createCollapsedTables, requestedCovariates.size() ); } addCSVData(line); // parse the line and add the data to the HashMap @@ -179,10 +179,13 @@ public class TableRecalibrationWalker extends ReadWalker QualityUtils.MAX_REASONABLE_Q_SCORE ) { - throw new StingException( "Assigning bad quality score " + key + " => " + recalQuals[iii] ); - } + //if( MODE_STRING.equalsIgnoreCase("COMBINATORIAL") ) { // BUGBUG: This isn't supported. 
No need to keep the full data hashmap around so it was removed for major speed up + // //RecalDatum datum = dataManager.data.get( key ); + // //if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing + // // recalQuals[iii] = datum.empiricalQualByte( SMOOTHING ); + // //} + // throw new StingException("The Combinatorial mode isn't supported."); + //} else if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) { + // + //} else { + // throw new StingException( "Specified RecalibrationMode is not supported: " + MODE_STRING ); + //} } preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low @@ -303,42 +305,40 @@ public class TableRecalibrationWalker extends ReadWalker key ) { - byte qualFromRead = Byte.parseByte(key.get(1).toString()); - ArrayList newKey; + String readGroupKeyElement = key.get(0).toString(); + int qualityScoreKeyElement = Integer.parseInt(key.get(1).toString()); + byte qualFromRead = (byte)qualityScoreKeyElement; + ArrayList newKey = new ArrayList(); // The global quality shift (over the read group only) - newKey = new ArrayList(); - newKey.add( key.get(0) ); // read group + newKey.add( readGroupKeyElement ); RecalDatum globalDeltaQDatum = dataManager.getCollapsedTable(0).get( newKey ); + Double globalDeltaQEmpirical = dataManager.getCollapsedDoubleTable(0).get( newKey ); double globalDeltaQ = 0.0; double aggregrateQreported = 0.0; if( globalDeltaQDatum != null ) { - aggregrateQreported = QualityUtils.phredScaleErrorRate( dataManager.dataSumExpectedErrors.get( newKey ) / ((double) globalDeltaQDatum.getNumObservations()) ); - globalDeltaQ = globalDeltaQDatum.empiricalQualDouble( SMOOTHING ) - aggregrateQreported; + aggregrateQreported = QualityUtils.phredScaleErrorRate( dataManager.dataSumExpectedErrors.get( newKey ) / ((double) globalDeltaQDatum.getNumObservations()) ); + globalDeltaQ = globalDeltaQEmpirical - aggregrateQreported; 
} - + // The shift in quality between reported and empirical - newKey = new ArrayList(); - newKey.add( key.get(0) ); // read group - newKey.add( key.get(1) ); // quality score - RecalDatum deltaQReportedDatum = dataManager.getCollapsedTable(1).get( newKey ); + newKey.add( qualityScoreKeyElement ); + Double deltaQReportedEmpirical = dataManager.getCollapsedDoubleTable(1).get( newKey ); double deltaQReported = 0.0; - if( deltaQReportedDatum != null ) { - deltaQReported = deltaQReportedDatum.empiricalQualDouble( SMOOTHING ) - qualFromRead - globalDeltaQ; + if( deltaQReportedEmpirical != null ) { + deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; } // The shift in quality due to each covariate by itself in turn double deltaQCovariates = 0.0; - RecalDatum deltaQCovariateDatum; + Double deltaQCovariateEmpirical; for( int iii = 2; iii < key.size(); iii++ ) { - newKey = new ArrayList(); - newKey.add( key.get(0) ); // read group - newKey.add( key.get(1) ); // quality score - newKey.add( key.get(iii) ); // given covariate - deltaQCovariateDatum = dataManager.getCollapsedTable(iii).get( newKey ); - if( deltaQCovariateDatum != null ) { - deltaQCovariates += ( deltaQCovariateDatum.empiricalQualDouble( SMOOTHING ) - qualFromRead - (globalDeltaQ + deltaQReported) ); + newKey.add( key.get(iii) ); // the given covariate + deltaQCovariateEmpirical = dataManager.getCollapsedDoubleTable(iii).get( newKey ); + if( deltaQCovariateEmpirical != null ) { + deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) ); } + newKey.remove( 2 ); // this new covariate is always added in at position 2 in the newKey list } double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;