TableRecalibration is now much smarter about hashing calculations, taking advantage of the sequential recalibration formulation. Instead of hashing RecalDatums it hashes the empirical quality score itself. This cuts the runtime by 20 percent. TableRecalibration also now skips over reads with zero mapping quality (outputs them to the new bam but doesn't touch their base quality scores).

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2069 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2009-11-18 16:47:44 +00:00
parent be31d7f4cc
commit f0a234ab29
4 changed files with 169 additions and 123 deletions

View File

@ -51,7 +51,7 @@ public class DinucCovariate implements Covariate {
dinucHashMap = new HashMap<Integer, Dinuc>(); dinucHashMap = new HashMap<Integer, Dinuc>();
for(byte byte1 : BASES) { for(byte byte1 : BASES) {
for(byte byte2: BASES) { for(byte byte2: BASES) {
dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); // This might seem silly, but Strings are too slow
} }
} }
} }

View File

@ -1,6 +1,5 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.QualityUtils;
import java.util.*; import java.util.*;
@ -43,98 +42,134 @@ public class RecalDataManager {
private NHashMap<RecalDatum> dataCollapsedReadGroup; // table where everything except read group has been collapsed private NHashMap<RecalDatum> dataCollapsedReadGroup; // table where everything except read group has been collapsed
private NHashMap<RecalDatum> dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed private NHashMap<RecalDatum> dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed
private ArrayList<NHashMap<RecalDatum>> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed private ArrayList<NHashMap<RecalDatum>> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed
private boolean collapsedTablesCreated;
public NHashMap<Double> dataSumExpectedErrors; // table used to calculate the overall aggregate quality score in which everything except read group is collapsed public NHashMap<Double> dataSumExpectedErrors; // table used to calculate the overall aggregate quality score in which everything except read group is collapsed
private NHashMap<Double> dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed
private NHashMap<Double> dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed
private ArrayList<NHashMap<Double>> dataCollapsedByCovariateDouble; // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // the tag that holds the original quality scores public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // the tag that holds the original quality scores
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // the tag that holds the color space quality scores for SOLID bams public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // the tag that holds the color space quality scores for SOLID bams
RecalDataManager() { RecalDataManager() {
data = new NHashMap<RecalDatum>(); data = new NHashMap<RecalDatum>();
collapsedTablesCreated = false; }
RecalDataManager( final int estimatedCapacity, final boolean createCollapsedTables, final int numCovariates ) {
if( createCollapsedTables ) { // initialize all the collapsed tables
dataCollapsedReadGroup = new NHashMap<RecalDatum>();
dataCollapsedQualityScore = new NHashMap<RecalDatum>();
dataCollapsedByCovariate = new ArrayList<NHashMap<RecalDatum>>();
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
dataCollapsedByCovariate.add( new NHashMap<RecalDatum>() );
}
dataSumExpectedErrors = new NHashMap<Double>();
} else {
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f);
}
} }
RecalDataManager( int estimatedCapacity ) { RecalDataManager( final int estimatedCapacity ) {
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f ); // second arg is the 'loading factor', data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f ); // second arg is the 'loading factor',
// a number to monkey around with when optimizing performace of the HashMap // a number to monkey around with when optimizing performace of the HashMap
collapsedTablesCreated = false;
} }
// BUGBUG: A lot going on in this method, doing a lot of pre-calculations for use in the sequential mode calculation later in TableRecalibrationWalker
/** /**
* Create all the collapsed tables that will be used in the sequential calculation in TableRecalibrationWalker * Add the given mapping to all of the collapsed hash tables
* @param numCovariates The number of covariates you have determines the number of tables to create * @param key The list of comparables that is the key for this mapping
* @param fullDatum The RecalDatum which is the data for this mapping
*/ */
public final void createCollapsedTables( final int numCovariates ) { public final void addToAllTables( final List<? extends Comparable> key, final RecalDatum fullDatum ) {
dataCollapsedReadGroup = new NHashMap<RecalDatum>();
dataCollapsedQualityScore = new NHashMap<RecalDatum>(); // The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around
dataCollapsedByCovariate = new ArrayList<NHashMap<RecalDatum>>(); //data.put(key, thisDatum); // add the mapping to the main table
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
dataCollapsedByCovariate.add( new NHashMap<RecalDatum>() ); // create dataCollapsedReadGroup, the table where everything except read group has been collapsed
ArrayList<Comparable> newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // make a new key with just the read group
RecalDatum collapsedDatum = dataCollapsedReadGroup.get( newKey );
if( collapsedDatum == null ) {
dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) );
} else {
collapsedDatum.increment(fullDatum);
} }
dataSumExpectedErrors = new NHashMap<Double>();
// preallocate for use in for loops below // create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed
RecalDatum thisDatum; newKey = new ArrayList<Comparable>();
RecalDatum collapsedDatum; newKey.add( key.get(0) ); // make a new key with just the read group
List<? extends Comparable> key; Double sumExpectedErrors = dataSumExpectedErrors.get( newKey );
ArrayList<Comparable> newKey; if( sumExpectedErrors == null ) {
Double sumExpectedErrors; dataSumExpectedErrors.put( newKey, 0.0 );
} else {
dataSumExpectedErrors.remove( newKey );
sumExpectedErrors += QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * fullDatum.getNumObservations();
dataSumExpectedErrors.put( newKey, sumExpectedErrors );
}
// for every data point in the map newKey = new ArrayList<Comparable>();
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : data.entrySet() ) { // create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
thisDatum = entry.getValue(); newKey.add( key.get(0) ); // make a new key with the read group ...
key = entry.getKey(); newKey.add( key.get(1) ); // and quality score
collapsedDatum = dataCollapsedQualityScore.get( newKey );
// create dataCollapsedReadGroup, the table where everything except read group has been collapsed if( collapsedDatum == null ) {
dataCollapsedQualityScore.put( newKey, new RecalDatum(fullDatum) );
} else {
collapsedDatum.increment(fullDatum);
}
// create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { // readGroup and QualityScore aren't counted
newKey = new ArrayList<Comparable>(); newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // make a new key with just the read group
collapsedDatum = dataCollapsedReadGroup.get( newKey );
if( collapsedDatum == null ) {
dataCollapsedReadGroup.put( newKey, new RecalDatum( thisDatum ) );
} else {
collapsedDatum.increment( thisDatum );
}
// create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // make a new key with just the read group
sumExpectedErrors = dataSumExpectedErrors.get( newKey );
if( sumExpectedErrors == null ) {
dataSumExpectedErrors.put( newKey, 0.0 );
} else {
dataSumExpectedErrors.remove( newKey );
sumExpectedErrors += QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * thisDatum.getNumObservations();
dataSumExpectedErrors.put( newKey, sumExpectedErrors );
}
newKey = new ArrayList<Comparable>();
// create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
newKey.add( key.get(0) ); // make a new key with the read group ... newKey.add( key.get(0) ); // make a new key with the read group ...
newKey.add( key.get(1) ); // and quality score newKey.add( key.get(1) ); // and quality score ...
collapsedDatum = dataCollapsedQualityScore.get( newKey ); newKey.add( key.get(iii + 2) ); // and the given covariate
collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey );
if( collapsedDatum == null ) { if( collapsedDatum == null ) {
dataCollapsedQualityScore.put( newKey, new RecalDatum( thisDatum ) ); dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) );
} else { } else {
collapsedDatum.increment( thisDatum ); collapsedDatum.increment(fullDatum);
} }
}
}
// create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed /**
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted * Loop over all the collapsed tables and turn the recalDatums found there into an empricial quality score
newKey = new ArrayList<Comparable>(); * that will be used in the sequential calculation in TableRecalibrationWalker
newKey.add( key.get(0) ); // make a new key with the read group ... * @param numCovariates The number of covariates you have determines the number of tables to create
newKey.add( key.get(1) ); // and quality score ... * @param smoothing The smoothing paramter that goes into empirical quality score calculation
newKey.add( key.get(iii + 2) ); // and the given covariate */
collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey ); public final void generateEmpiricalQualities( final int numCovariates, final int smoothing ) {
if( collapsedDatum == null ) {
dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum( thisDatum ) ); dataCollapsedReadGroupDouble = new NHashMap<Double>();
} else { dataCollapsedQualityScoreDouble = new NHashMap<Double>();
collapsedDatum.increment( thisDatum ); dataCollapsedByCovariateDouble = new ArrayList<NHashMap<Double>>();
} for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
dataCollapsedByCovariateDouble.add( new NHashMap<Double>() );
}
// Hash the empirical quality scores so we don't have to call Math.log at every base for every read
// Looping over the entrySet is really expensive but worth it
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedReadGroup.entrySet() ) {
dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
}
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedQualityScore.entrySet() ) {
dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
}
for( int iii = 0; iii < numCovariates - 2; iii++ ) {
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedByCovariate.get(iii).entrySet() ) {
dataCollapsedByCovariateDouble.get(iii).put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
} }
} }
collapsedTablesCreated = true; dataCollapsedQualityScore.clear();
dataCollapsedByCovariate.clear();
dataCollapsedQualityScore = null; // will never need this again
dataCollapsedByCovariate = null; // will never need this again
if( data!=null ) {
data.clear();
data = null; // will never need this again
}
} }
/** /**
@ -143,10 +178,6 @@ public class RecalDataManager {
* @return The desired collapsed HashMap * @return The desired collapsed HashMap
*/ */
public final NHashMap<RecalDatum> getCollapsedTable( final int covariate ) { public final NHashMap<RecalDatum> getCollapsedTable( final int covariate ) {
if( !collapsedTablesCreated ) {
throw new StingException("Trying to get collapsed tables before they have been populated. Null pointers abound.");
}
if( covariate == 0) { if( covariate == 0) {
return dataCollapsedReadGroup; // table where everything except read group has been collapsed return dataCollapsedReadGroup; // table where everything except read group has been collapsed
} else if( covariate == 1 ) { } else if( covariate == 1 ) {
@ -155,4 +186,19 @@ public class RecalDataManager {
return dataCollapsedByCovariate.get( covariate - 2 ); // table where everything except read group, quality score, and given covariate has been collapsed return dataCollapsedByCovariate.get( covariate - 2 ); // table where everything except read group, quality score, and given covariate has been collapsed
} }
} }
/**
* Get the appropriate collapsed table of emprical quality out of the set of all the tables held by this Object
* @param covariate Which covariate indexes the desired collapsed NHashMap<Double>
* @return The desired collapsed NHashMap<Double>
*/
public final NHashMap<Double> getCollapsedDoubleTable( final int covariate ) {
if( covariate == 0) {
return dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed
} else if( covariate == 1 ) {
return dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed
} else {
return dataCollapsedByCovariateDouble.get( covariate - 2 ); // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
}
}
} }

View File

@ -86,8 +86,8 @@ public class RecalDatum {
} }
} }
public final void increment( final char curBase, final char ref ) { public final void increment( final char curBase, final char refBase ) {
increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1 ); // increment takes num observations, then num mismatches increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(refBase) ? 0 : 1 ); // increment takes num observations, then num mismatches
} }
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
@ -100,7 +100,7 @@ public class RecalDatum {
double doubleMismatches = (double) ( numMismatches + smoothing ); double doubleMismatches = (double) ( numMismatches + smoothing );
double doubleObservations = (double) ( numObservations + smoothing ); double doubleObservations = (double) ( numObservations + smoothing );
double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE; if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) { empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE; }
return empiricalQual; return empiricalQual;
} }
public final double empiricalQualDouble() { return empiricalQualDouble( 0 ); } // 'default' behavior is to use smoothing value of zero public final double empiricalQualDouble() { return empiricalQualDouble( 0 ); } // 'default' behavior is to use smoothing value of zero

View File

@ -74,7 +74,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false) @Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
private int WINDOW_SIZE = 3; private int WINDOW_SIZE = 3;
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points") @Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
public int SMOOTHING = 1; private int SMOOTHING = 1;
//public enum RecalibrationMode { //public enum RecalibrationMode {
// COMBINATORIAL, // COMBINATORIAL,
@ -83,11 +83,11 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
//} //}
//@Argument(fullName="recalibrationMode", shortName="mode", doc="Which calculation to use when recalibrating, default is SEQUENTIAL", required=false) //@Argument(fullName="recalibrationMode", shortName="mode", doc="Which calculation to use when recalibrating, default is SEQUENTIAL", required=false)
public String MODE_STRING = "SEQUENTIAL"; private String MODE_STRING = "SEQUENTIAL";
//public RecalibrationMode MODE = RecalibrationMode.SEQUENTIAL; //BUGBUG: do we need to support the other modes? //public RecalibrationMode MODE = RecalibrationMode.SEQUENTIAL; //BUGBUG: do we need to support the other modes?
protected RecalDataManager dataManager; private RecalDataManager dataManager;
protected ArrayList<Covariate> requestedCovariates; private ArrayList<Covariate> requestedCovariates;
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*"); private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*");
@ -162,10 +162,10 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
} else { // found some data } else { // found some data
if( !foundAllCovariates ) { if( !foundAllCovariates ) {
foundAllCovariates = true; foundAllCovariates = true;
logger.info( "The covariates being used here: " );
logger.info( requestedCovariates );
if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception
dataManager = new RecalDataManager( estimatedCapacity ); final boolean createCollapsedTables = true;
// Initialize the data hashMaps
dataManager = new RecalDataManager( estimatedCapacity, createCollapsedTables, requestedCovariates.size() );
} }
addCSVData(line); // parse the line and add the data to the HashMap addCSVData(line); // parse the line and add the data to the HashMap
@ -179,10 +179,13 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
} }
logger.info( "...done!" ); logger.info( "...done!" );
logger.info( "The covariates being used here: " );
logger.info( requestedCovariates );
// Create the collapsed tables that are used in the sequential calculation // Create the collapsed tables that are used in the sequential calculation
if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) { if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) {
logger.info( "Creating collapsed tables for use in sequential calculation..." ); logger.info( "Generating tables of empirical qualities for use in sequential calculation..." );
dataManager.createCollapsedTables( requestedCovariates.size() ); dataManager.generateEmpiricalQualities( requestedCovariates.size(), SMOOTHING );
logger.info( "...done!" ); logger.info( "...done!" );
} }
} }
@ -201,7 +204,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
key.add( cov.getValue( vals[iii] ) ); key.add( cov.getValue( vals[iii] ) );
} }
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ) ); RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ) );
dataManager.data.put( key, datum ); dataManager.addToAllTables( key, datum );
} }
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
@ -218,12 +222,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
*/ */
public SAMRecord map( char[] refBases, SAMRecord read ) { public SAMRecord map( char[] refBases, SAMRecord read ) {
// WARNING: refBases is always null because this walker doesn't have @REQUIRES({DataSource.REFERENCE_BASES}) // WARNING: refBases is always null because this walker doesn't have @Requires({DataSource.REFERENCE_BASES})
// This is done in order to speed up the code // This is done in order to speed up the code
// if( read.getMappingQuality() <= 0 ) { if( read.getMappingQuality() <= 0 ) {
// return read; // early return here, unmapped reads and mapping quality zero reads should be left alone return read; // early return here, unmapped reads and mapping quality zero reads should be left alone
// } }
byte[] originalQuals = read.getBaseQualities(); byte[] originalQuals = read.getBaseQualities();
// Check if we need to use the original quality scores instead // Check if we need to use the original quality scores instead
@ -253,21 +257,19 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
key.add( covariate.getValue( read, iii, readGroup, originalQuals, bases ) ); // offset is zero based so passing iii is correct here key.add( covariate.getValue( read, iii, readGroup, originalQuals, bases ) ); // offset is zero based so passing iii is correct here
} }
if( MODE_STRING.equalsIgnoreCase("COMBINATORIAL") ) { recalQuals[iii] = performSequentialQualityCalculation( key );
RecalDatum datum = dataManager.data.get( key );
if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing
recalQuals[iii] = datum.empiricalQualByte( SMOOTHING );
}
} else if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) {
recalQuals[iii] = performSequentialQualityCalculation( key );
} else {
throw new StingException( "Specified RecalibrationMode is not supported: " + MODE_STRING );
}
// Do some error checking on the new quality score //if( MODE_STRING.equalsIgnoreCase("COMBINATORIAL") ) { // BUGBUG: This isn't supported. No need to keep the full data hashmap around so it was removed for major speed up
if ( recalQuals[iii] <= 0 || recalQuals[iii] > QualityUtils.MAX_REASONABLE_Q_SCORE ) { // //RecalDatum datum = dataManager.data.get( key );
throw new StingException( "Assigning bad quality score " + key + " => " + recalQuals[iii] ); // //if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing
} // // recalQuals[iii] = datum.empiricalQualByte( SMOOTHING );
// //}
// throw new StingException("The Combinatorial mode isn't supported.");
//} else if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) {
//
//} else {
// throw new StingException( "Specified RecalibrationMode is not supported: " + MODE_STRING );
//}
} }
preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low
@ -303,42 +305,40 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
*/ */
private byte performSequentialQualityCalculation( List<? extends Comparable> key ) { private byte performSequentialQualityCalculation( List<? extends Comparable> key ) {
byte qualFromRead = Byte.parseByte(key.get(1).toString()); String readGroupKeyElement = key.get(0).toString();
ArrayList<Comparable> newKey; int qualityScoreKeyElement = Integer.parseInt(key.get(1).toString());
byte qualFromRead = (byte)qualityScoreKeyElement;
ArrayList<Comparable> newKey = new ArrayList<Comparable>();
// The global quality shift (over the read group only) // The global quality shift (over the read group only)
newKey = new ArrayList<Comparable>(); newKey.add( readGroupKeyElement );
newKey.add( key.get(0) ); // read group
RecalDatum globalDeltaQDatum = dataManager.getCollapsedTable(0).get( newKey ); RecalDatum globalDeltaQDatum = dataManager.getCollapsedTable(0).get( newKey );
Double globalDeltaQEmpirical = dataManager.getCollapsedDoubleTable(0).get( newKey );
double globalDeltaQ = 0.0; double globalDeltaQ = 0.0;
double aggregrateQreported = 0.0; double aggregrateQreported = 0.0;
if( globalDeltaQDatum != null ) { if( globalDeltaQDatum != null ) {
aggregrateQreported = QualityUtils.phredScaleErrorRate( dataManager.dataSumExpectedErrors.get( newKey ) / ((double) globalDeltaQDatum.getNumObservations()) ); aggregrateQreported = QualityUtils.phredScaleErrorRate( dataManager.dataSumExpectedErrors.get( newKey ) / ((double) globalDeltaQDatum.getNumObservations()) );
globalDeltaQ = globalDeltaQDatum.empiricalQualDouble( SMOOTHING ) - aggregrateQreported; globalDeltaQ = globalDeltaQEmpirical - aggregrateQreported;
} }
// The shift in quality between reported and empirical // The shift in quality between reported and empirical
newKey = new ArrayList<Comparable>(); newKey.add( qualityScoreKeyElement );
newKey.add( key.get(0) ); // read group Double deltaQReportedEmpirical = dataManager.getCollapsedDoubleTable(1).get( newKey );
newKey.add( key.get(1) ); // quality score
RecalDatum deltaQReportedDatum = dataManager.getCollapsedTable(1).get( newKey );
double deltaQReported = 0.0; double deltaQReported = 0.0;
if( deltaQReportedDatum != null ) { if( deltaQReportedEmpirical != null ) {
deltaQReported = deltaQReportedDatum.empiricalQualDouble( SMOOTHING ) - qualFromRead - globalDeltaQ; deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
} }
// The shift in quality due to each covariate by itself in turn // The shift in quality due to each covariate by itself in turn
double deltaQCovariates = 0.0; double deltaQCovariates = 0.0;
RecalDatum deltaQCovariateDatum; Double deltaQCovariateEmpirical;
for( int iii = 2; iii < key.size(); iii++ ) { for( int iii = 2; iii < key.size(); iii++ ) {
newKey = new ArrayList<Comparable>(); newKey.add( key.get(iii) ); // the given covariate
newKey.add( key.get(0) ); // read group deltaQCovariateEmpirical = dataManager.getCollapsedDoubleTable(iii).get( newKey );
newKey.add( key.get(1) ); // quality score if( deltaQCovariateEmpirical != null ) {
newKey.add( key.get(iii) ); // given covariate deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
deltaQCovariateDatum = dataManager.getCollapsedTable(iii).get( newKey );
if( deltaQCovariateDatum != null ) {
deltaQCovariates += ( deltaQCovariateDatum.empiricalQualDouble( SMOOTHING ) - qualFromRead - (globalDeltaQ + deltaQReported) );
} }
newKey.remove( 2 ); // this new covariate is always added in at position 2 in the newKey list
} }
double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;