TableRecalibration is now much smarter about hashing calculations, taking advantage of the sequential recalibration formulation. Instead of hashing RecalDatums it hashes the empirical quality score itself. This cuts the runtime by 20 percent. TableRecalibration also now skips over reads with zero mapping quality (outputs them to the new bam but doesn't touch their base quality scores).
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2069 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
be31d7f4cc
commit
f0a234ab29
|
|
@ -51,7 +51,7 @@ public class DinucCovariate implements Covariate {
|
||||||
dinucHashMap = new HashMap<Integer, Dinuc>();
|
dinucHashMap = new HashMap<Integer, Dinuc>();
|
||||||
for(byte byte1 : BASES) {
|
for(byte byte1 : BASES) {
|
||||||
for(byte byte2: BASES) {
|
for(byte byte2: BASES) {
|
||||||
dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) );
|
dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); // This might seem silly, but Strings are too slow
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
|
||||||
import org.broadinstitute.sting.utils.QualityUtils;
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
@ -43,98 +42,134 @@ public class RecalDataManager {
|
||||||
private NHashMap<RecalDatum> dataCollapsedReadGroup; // table where everything except read group has been collapsed
|
private NHashMap<RecalDatum> dataCollapsedReadGroup; // table where everything except read group has been collapsed
|
||||||
private NHashMap<RecalDatum> dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed
|
private NHashMap<RecalDatum> dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed
|
||||||
private ArrayList<NHashMap<RecalDatum>> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed
|
private ArrayList<NHashMap<RecalDatum>> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed
|
||||||
private boolean collapsedTablesCreated;
|
|
||||||
public NHashMap<Double> dataSumExpectedErrors; // table used to calculate the overall aggregate quality score in which everything except read group is collapsed
|
public NHashMap<Double> dataSumExpectedErrors; // table used to calculate the overall aggregate quality score in which everything except read group is collapsed
|
||||||
|
|
||||||
|
private NHashMap<Double> dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed
|
||||||
|
private NHashMap<Double> dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed
|
||||||
|
private ArrayList<NHashMap<Double>> dataCollapsedByCovariateDouble; // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
|
||||||
|
|
||||||
|
|
||||||
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // the tag that holds the original quality scores
|
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // the tag that holds the original quality scores
|
||||||
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // the tag that holds the color space quality scores for SOLID bams
|
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // the tag that holds the color space quality scores for SOLID bams
|
||||||
|
|
||||||
RecalDataManager() {
|
RecalDataManager() {
|
||||||
data = new NHashMap<RecalDatum>();
|
data = new NHashMap<RecalDatum>();
|
||||||
collapsedTablesCreated = false;
|
}
|
||||||
|
|
||||||
|
RecalDataManager( final int estimatedCapacity, final boolean createCollapsedTables, final int numCovariates ) {
|
||||||
|
if( createCollapsedTables ) { // initialize all the collapsed tables
|
||||||
|
dataCollapsedReadGroup = new NHashMap<RecalDatum>();
|
||||||
|
dataCollapsedQualityScore = new NHashMap<RecalDatum>();
|
||||||
|
dataCollapsedByCovariate = new ArrayList<NHashMap<RecalDatum>>();
|
||||||
|
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
|
||||||
|
dataCollapsedByCovariate.add( new NHashMap<RecalDatum>() );
|
||||||
|
}
|
||||||
|
dataSumExpectedErrors = new NHashMap<Double>();
|
||||||
|
} else {
|
||||||
|
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
RecalDataManager( int estimatedCapacity ) {
|
RecalDataManager( final int estimatedCapacity ) {
|
||||||
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f ); // second arg is the 'loading factor',
|
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f ); // second arg is the 'loading factor',
|
||||||
// a number to monkey around with when optimizing performance of the HashMap
|
// a number to monkey around with when optimizing performance of the HashMap
|
||||||
collapsedTablesCreated = false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// BUGBUG: A lot going on in this method, doing a lot of pre-calculations for use in the sequential mode calculation later in TableRecalibrationWalker
|
|
||||||
/**
|
/**
|
||||||
* Create all the collapsed tables that will be used in the sequential calculation in TableRecalibrationWalker
|
* Add the given mapping to all of the collapsed hash tables
|
||||||
* @param numCovariates The number of covariates you have determines the number of tables to create
|
* @param key The list of comparables that is the key for this mapping
|
||||||
|
* @param fullDatum The RecalDatum which is the data for this mapping
|
||||||
*/
|
*/
|
||||||
public final void createCollapsedTables( final int numCovariates ) {
|
public final void addToAllTables( final List<? extends Comparable> key, final RecalDatum fullDatum ) {
|
||||||
dataCollapsedReadGroup = new NHashMap<RecalDatum>();
|
|
||||||
dataCollapsedQualityScore = new NHashMap<RecalDatum>();
|
// The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around
|
||||||
dataCollapsedByCovariate = new ArrayList<NHashMap<RecalDatum>>();
|
//data.put(key, thisDatum); // add the mapping to the main table
|
||||||
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
|
|
||||||
dataCollapsedByCovariate.add( new NHashMap<RecalDatum>() );
|
// create dataCollapsedReadGroup, the table where everything except read group has been collapsed
|
||||||
|
ArrayList<Comparable> newKey = new ArrayList<Comparable>();
|
||||||
|
newKey.add( key.get(0) ); // make a new key with just the read group
|
||||||
|
RecalDatum collapsedDatum = dataCollapsedReadGroup.get( newKey );
|
||||||
|
if( collapsedDatum == null ) {
|
||||||
|
dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) );
|
||||||
|
} else {
|
||||||
|
collapsedDatum.increment(fullDatum);
|
||||||
}
|
}
|
||||||
dataSumExpectedErrors = new NHashMap<Double>();
|
|
||||||
|
|
||||||
// preallocate for use in for loops below
|
// create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed
|
||||||
RecalDatum thisDatum;
|
newKey = new ArrayList<Comparable>();
|
||||||
RecalDatum collapsedDatum;
|
newKey.add( key.get(0) ); // make a new key with just the read group
|
||||||
List<? extends Comparable> key;
|
Double sumExpectedErrors = dataSumExpectedErrors.get( newKey );
|
||||||
ArrayList<Comparable> newKey;
|
if( sumExpectedErrors == null ) {
|
||||||
Double sumExpectedErrors;
|
dataSumExpectedErrors.put( newKey, 0.0 );
|
||||||
|
} else {
|
||||||
|
dataSumExpectedErrors.remove( newKey );
|
||||||
|
sumExpectedErrors += QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * fullDatum.getNumObservations();
|
||||||
|
dataSumExpectedErrors.put( newKey, sumExpectedErrors );
|
||||||
|
}
|
||||||
|
|
||||||
// for every data point in the map
|
newKey = new ArrayList<Comparable>();
|
||||||
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : data.entrySet() ) {
|
// create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
|
||||||
thisDatum = entry.getValue();
|
newKey.add( key.get(0) ); // make a new key with the read group ...
|
||||||
key = entry.getKey();
|
newKey.add( key.get(1) ); // and quality score
|
||||||
|
collapsedDatum = dataCollapsedQualityScore.get( newKey );
|
||||||
// create dataCollapsedReadGroup, the table where everything except read group has been collapsed
|
if( collapsedDatum == null ) {
|
||||||
|
dataCollapsedQualityScore.put( newKey, new RecalDatum(fullDatum) );
|
||||||
|
} else {
|
||||||
|
collapsedDatum.increment(fullDatum);
|
||||||
|
}
|
||||||
|
|
||||||
|
// create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
|
||||||
|
for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { // readGroup and QualityScore aren't counted
|
||||||
newKey = new ArrayList<Comparable>();
|
newKey = new ArrayList<Comparable>();
|
||||||
newKey.add( key.get(0) ); // make a new key with just the read group
|
|
||||||
collapsedDatum = dataCollapsedReadGroup.get( newKey );
|
|
||||||
if( collapsedDatum == null ) {
|
|
||||||
dataCollapsedReadGroup.put( newKey, new RecalDatum( thisDatum ) );
|
|
||||||
} else {
|
|
||||||
collapsedDatum.increment( thisDatum );
|
|
||||||
}
|
|
||||||
|
|
||||||
// create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed
|
|
||||||
newKey = new ArrayList<Comparable>();
|
|
||||||
newKey.add( key.get(0) ); // make a new key with just the read group
|
|
||||||
sumExpectedErrors = dataSumExpectedErrors.get( newKey );
|
|
||||||
if( sumExpectedErrors == null ) {
|
|
||||||
dataSumExpectedErrors.put( newKey, 0.0 );
|
|
||||||
} else {
|
|
||||||
dataSumExpectedErrors.remove( newKey );
|
|
||||||
sumExpectedErrors += QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * thisDatum.getNumObservations();
|
|
||||||
dataSumExpectedErrors.put( newKey, sumExpectedErrors );
|
|
||||||
}
|
|
||||||
|
|
||||||
newKey = new ArrayList<Comparable>();
|
|
||||||
// create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
|
|
||||||
newKey.add( key.get(0) ); // make a new key with the read group ...
|
newKey.add( key.get(0) ); // make a new key with the read group ...
|
||||||
newKey.add( key.get(1) ); // and quality score
|
newKey.add( key.get(1) ); // and quality score ...
|
||||||
collapsedDatum = dataCollapsedQualityScore.get( newKey );
|
newKey.add( key.get(iii + 2) ); // and the given covariate
|
||||||
|
collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey );
|
||||||
if( collapsedDatum == null ) {
|
if( collapsedDatum == null ) {
|
||||||
dataCollapsedQualityScore.put( newKey, new RecalDatum( thisDatum ) );
|
dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) );
|
||||||
} else {
|
} else {
|
||||||
collapsedDatum.increment( thisDatum );
|
collapsedDatum.increment(fullDatum);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
|
/**
|
||||||
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted
|
* Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score
|
||||||
newKey = new ArrayList<Comparable>();
|
* that will be used in the sequential calculation in TableRecalibrationWalker
|
||||||
newKey.add( key.get(0) ); // make a new key with the read group ...
|
* @param numCovariates The number of covariates you have determines the number of tables to create
|
||||||
newKey.add( key.get(1) ); // and quality score ...
|
* @param smoothing The smoothing parameter that goes into the empirical quality score calculation
|
||||||
newKey.add( key.get(iii + 2) ); // and the given covariate
|
*/
|
||||||
collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey );
|
public final void generateEmpiricalQualities( final int numCovariates, final int smoothing ) {
|
||||||
if( collapsedDatum == null ) {
|
|
||||||
dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum( thisDatum ) );
|
dataCollapsedReadGroupDouble = new NHashMap<Double>();
|
||||||
} else {
|
dataCollapsedQualityScoreDouble = new NHashMap<Double>();
|
||||||
collapsedDatum.increment( thisDatum );
|
dataCollapsedByCovariateDouble = new ArrayList<NHashMap<Double>>();
|
||||||
}
|
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
|
||||||
|
dataCollapsedByCovariateDouble.add( new NHashMap<Double>() );
|
||||||
|
}
|
||||||
|
|
||||||
|
// Hash the empirical quality scores so we don't have to call Math.log at every base for every read
|
||||||
|
// Looping over the entrySet is really expensive but worth it
|
||||||
|
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedReadGroup.entrySet() ) {
|
||||||
|
dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
|
||||||
|
}
|
||||||
|
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedQualityScore.entrySet() ) {
|
||||||
|
dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
|
||||||
|
}
|
||||||
|
for( int iii = 0; iii < numCovariates - 2; iii++ ) {
|
||||||
|
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedByCovariate.get(iii).entrySet() ) {
|
||||||
|
dataCollapsedByCovariateDouble.get(iii).put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
collapsedTablesCreated = true;
|
dataCollapsedQualityScore.clear();
|
||||||
|
dataCollapsedByCovariate.clear();
|
||||||
|
dataCollapsedQualityScore = null; // will never need this again
|
||||||
|
dataCollapsedByCovariate = null; // will never need this again
|
||||||
|
if( data!=null ) {
|
||||||
|
data.clear();
|
||||||
|
data = null; // will never need this again
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -143,10 +178,6 @@ public class RecalDataManager {
|
||||||
* @return The desired collapsed HashMap
|
* @return The desired collapsed HashMap
|
||||||
*/
|
*/
|
||||||
public final NHashMap<RecalDatum> getCollapsedTable( final int covariate ) {
|
public final NHashMap<RecalDatum> getCollapsedTable( final int covariate ) {
|
||||||
if( !collapsedTablesCreated ) {
|
|
||||||
throw new StingException("Trying to get collapsed tables before they have been populated. Null pointers abound.");
|
|
||||||
}
|
|
||||||
|
|
||||||
if( covariate == 0) {
|
if( covariate == 0) {
|
||||||
return dataCollapsedReadGroup; // table where everything except read group has been collapsed
|
return dataCollapsedReadGroup; // table where everything except read group has been collapsed
|
||||||
} else if( covariate == 1 ) {
|
} else if( covariate == 1 ) {
|
||||||
|
|
@ -155,4 +186,19 @@ public class RecalDataManager {
|
||||||
return dataCollapsedByCovariate.get( covariate - 2 ); // table where everything except read group, quality score, and given covariate has been collapsed
|
return dataCollapsedByCovariate.get( covariate - 2 ); // table where everything except read group, quality score, and given covariate has been collapsed
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the appropriate collapsed table of empirical qualities out of the set of all the tables held by this Object
|
||||||
|
* @param covariate Which covariate indexes the desired collapsed NHashMap<Double>
|
||||||
|
* @return The desired collapsed NHashMap<Double>
|
||||||
|
*/
|
||||||
|
public final NHashMap<Double> getCollapsedDoubleTable( final int covariate ) {
|
||||||
|
if( covariate == 0) {
|
||||||
|
return dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed
|
||||||
|
} else if( covariate == 1 ) {
|
||||||
|
return dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed
|
||||||
|
} else {
|
||||||
|
return dataCollapsedByCovariateDouble.get( covariate - 2 ); // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -86,8 +86,8 @@ public class RecalDatum {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public final void increment( final char curBase, final char ref ) {
|
public final void increment( final char curBase, final char refBase ) {
|
||||||
increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1 ); // increment takes num observations, then num mismatches
|
increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(refBase) ? 0 : 1 ); // increment takes num observations, then num mismatches
|
||||||
}
|
}
|
||||||
|
|
||||||
//---------------------------------------------------------------------------------------------------------------
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
|
@ -100,7 +100,7 @@ public class RecalDatum {
|
||||||
double doubleMismatches = (double) ( numMismatches + smoothing );
|
double doubleMismatches = (double) ( numMismatches + smoothing );
|
||||||
double doubleObservations = (double) ( numObservations + smoothing );
|
double doubleObservations = (double) ( numObservations + smoothing );
|
||||||
double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
|
double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
|
||||||
if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE;
|
if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) { empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE; }
|
||||||
return empiricalQual;
|
return empiricalQual;
|
||||||
}
|
}
|
||||||
public final double empiricalQualDouble() { return empiricalQualDouble( 0 ); } // 'default' behavior is to use smoothing value of zero
|
public final double empiricalQualDouble() { return empiricalQualDouble( 0 ); } // 'default' behavior is to use smoothing value of zero
|
||||||
|
|
|
||||||
|
|
@ -74,7 +74,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
|
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
|
||||||
private int WINDOW_SIZE = 3;
|
private int WINDOW_SIZE = 3;
|
||||||
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
|
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
|
||||||
public int SMOOTHING = 1;
|
private int SMOOTHING = 1;
|
||||||
|
|
||||||
//public enum RecalibrationMode {
|
//public enum RecalibrationMode {
|
||||||
// COMBINATORIAL,
|
// COMBINATORIAL,
|
||||||
|
|
@ -83,11 +83,11 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
//}
|
//}
|
||||||
|
|
||||||
//@Argument(fullName="recalibrationMode", shortName="mode", doc="Which calculation to use when recalibrating, default is SEQUENTIAL", required=false)
|
//@Argument(fullName="recalibrationMode", shortName="mode", doc="Which calculation to use when recalibrating, default is SEQUENTIAL", required=false)
|
||||||
public String MODE_STRING = "SEQUENTIAL";
|
private String MODE_STRING = "SEQUENTIAL";
|
||||||
//public RecalibrationMode MODE = RecalibrationMode.SEQUENTIAL; //BUGBUG: do we need to support the other modes?
|
//public RecalibrationMode MODE = RecalibrationMode.SEQUENTIAL; //BUGBUG: do we need to support the other modes?
|
||||||
|
|
||||||
protected RecalDataManager dataManager;
|
private RecalDataManager dataManager;
|
||||||
protected ArrayList<Covariate> requestedCovariates;
|
private ArrayList<Covariate> requestedCovariates;
|
||||||
|
|
||||||
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||||
private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*");
|
private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*");
|
||||||
|
|
@ -162,10 +162,10 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
} else { // found some data
|
} else { // found some data
|
||||||
if( !foundAllCovariates ) {
|
if( !foundAllCovariates ) {
|
||||||
foundAllCovariates = true;
|
foundAllCovariates = true;
|
||||||
logger.info( "The covariates being used here: " );
|
|
||||||
logger.info( requestedCovariates );
|
|
||||||
if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception
|
if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception
|
||||||
dataManager = new RecalDataManager( estimatedCapacity );
|
final boolean createCollapsedTables = true;
|
||||||
|
// Initialize the data hashMaps
|
||||||
|
dataManager = new RecalDataManager( estimatedCapacity, createCollapsedTables, requestedCovariates.size() );
|
||||||
|
|
||||||
}
|
}
|
||||||
addCSVData(line); // parse the line and add the data to the HashMap
|
addCSVData(line); // parse the line and add the data to the HashMap
|
||||||
|
|
@ -179,10 +179,13 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
}
|
}
|
||||||
logger.info( "...done!" );
|
logger.info( "...done!" );
|
||||||
|
|
||||||
|
logger.info( "The covariates being used here: " );
|
||||||
|
logger.info( requestedCovariates );
|
||||||
|
|
||||||
// Create the collapsed tables that are used in the sequential calculation
|
// Create the collapsed tables that are used in the sequential calculation
|
||||||
if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) {
|
if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) {
|
||||||
logger.info( "Creating collapsed tables for use in sequential calculation..." );
|
logger.info( "Generating tables of empirical qualities for use in sequential calculation..." );
|
||||||
dataManager.createCollapsedTables( requestedCovariates.size() );
|
dataManager.generateEmpiricalQualities( requestedCovariates.size(), SMOOTHING );
|
||||||
logger.info( "...done!" );
|
logger.info( "...done!" );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -201,7 +204,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
key.add( cov.getValue( vals[iii] ) );
|
key.add( cov.getValue( vals[iii] ) );
|
||||||
}
|
}
|
||||||
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ) );
|
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ) );
|
||||||
dataManager.data.put( key, datum );
|
dataManager.addToAllTables( key, datum );
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//---------------------------------------------------------------------------------------------------------------
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
|
@ -218,12 +222,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
*/
|
*/
|
||||||
public SAMRecord map( char[] refBases, SAMRecord read ) {
|
public SAMRecord map( char[] refBases, SAMRecord read ) {
|
||||||
|
|
||||||
// WARNING: refBases is always null because this walker doesn't have @REQUIRES({DataSource.REFERENCE_BASES})
|
// WARNING: refBases is always null because this walker doesn't have @Requires({DataSource.REFERENCE_BASES})
|
||||||
// This is done in order to speed up the code
|
// This is done in order to speed up the code
|
||||||
|
|
||||||
// if( read.getMappingQuality() <= 0 ) {
|
if( read.getMappingQuality() <= 0 ) {
|
||||||
// return read; // early return here, unmapped reads and mapping quality zero reads should be left alone
|
return read; // early return here, unmapped reads and mapping quality zero reads should be left alone
|
||||||
// }
|
}
|
||||||
|
|
||||||
byte[] originalQuals = read.getBaseQualities();
|
byte[] originalQuals = read.getBaseQualities();
|
||||||
// Check if we need to use the original quality scores instead
|
// Check if we need to use the original quality scores instead
|
||||||
|
|
@ -253,21 +257,19 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
key.add( covariate.getValue( read, iii, readGroup, originalQuals, bases ) ); // offset is zero based so passing iii is correct here
|
key.add( covariate.getValue( read, iii, readGroup, originalQuals, bases ) ); // offset is zero based so passing iii is correct here
|
||||||
}
|
}
|
||||||
|
|
||||||
if( MODE_STRING.equalsIgnoreCase("COMBINATORIAL") ) {
|
recalQuals[iii] = performSequentialQualityCalculation( key );
|
||||||
RecalDatum datum = dataManager.data.get( key );
|
|
||||||
if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing
|
|
||||||
recalQuals[iii] = datum.empiricalQualByte( SMOOTHING );
|
|
||||||
}
|
|
||||||
} else if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) {
|
|
||||||
recalQuals[iii] = performSequentialQualityCalculation( key );
|
|
||||||
} else {
|
|
||||||
throw new StingException( "Specified RecalibrationMode is not supported: " + MODE_STRING );
|
|
||||||
}
|
|
||||||
|
|
||||||
// Do some error checking on the new quality score
|
//if( MODE_STRING.equalsIgnoreCase("COMBINATORIAL") ) { // BUGBUG: This isn't supported. No need to keep the full data hashmap around so it was removed for major speed up
|
||||||
if ( recalQuals[iii] <= 0 || recalQuals[iii] > QualityUtils.MAX_REASONABLE_Q_SCORE ) {
|
// //RecalDatum datum = dataManager.data.get( key );
|
||||||
throw new StingException( "Assigning bad quality score " + key + " => " + recalQuals[iii] );
|
// //if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing
|
||||||
}
|
// // recalQuals[iii] = datum.empiricalQualByte( SMOOTHING );
|
||||||
|
// //}
|
||||||
|
// throw new StingException("The Combinatorial mode isn't supported.");
|
||||||
|
//} else if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) {
|
||||||
|
//
|
||||||
|
//} else {
|
||||||
|
// throw new StingException( "Specified RecalibrationMode is not supported: " + MODE_STRING );
|
||||||
|
//}
|
||||||
}
|
}
|
||||||
|
|
||||||
preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low
|
preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low
|
||||||
|
|
@ -303,42 +305,40 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
*/
|
*/
|
||||||
private byte performSequentialQualityCalculation( List<? extends Comparable> key ) {
|
private byte performSequentialQualityCalculation( List<? extends Comparable> key ) {
|
||||||
|
|
||||||
byte qualFromRead = Byte.parseByte(key.get(1).toString());
|
String readGroupKeyElement = key.get(0).toString();
|
||||||
ArrayList<Comparable> newKey;
|
int qualityScoreKeyElement = Integer.parseInt(key.get(1).toString());
|
||||||
|
byte qualFromRead = (byte)qualityScoreKeyElement;
|
||||||
|
ArrayList<Comparable> newKey = new ArrayList<Comparable>();
|
||||||
|
|
||||||
// The global quality shift (over the read group only)
|
// The global quality shift (over the read group only)
|
||||||
newKey = new ArrayList<Comparable>();
|
newKey.add( readGroupKeyElement );
|
||||||
newKey.add( key.get(0) ); // read group
|
|
||||||
RecalDatum globalDeltaQDatum = dataManager.getCollapsedTable(0).get( newKey );
|
RecalDatum globalDeltaQDatum = dataManager.getCollapsedTable(0).get( newKey );
|
||||||
|
Double globalDeltaQEmpirical = dataManager.getCollapsedDoubleTable(0).get( newKey );
|
||||||
double globalDeltaQ = 0.0;
|
double globalDeltaQ = 0.0;
|
||||||
double aggregrateQreported = 0.0;
|
double aggregrateQreported = 0.0;
|
||||||
if( globalDeltaQDatum != null ) {
|
if( globalDeltaQDatum != null ) {
|
||||||
aggregrateQreported = QualityUtils.phredScaleErrorRate( dataManager.dataSumExpectedErrors.get( newKey ) / ((double) globalDeltaQDatum.getNumObservations()) );
|
aggregrateQreported = QualityUtils.phredScaleErrorRate( dataManager.dataSumExpectedErrors.get( newKey ) / ((double) globalDeltaQDatum.getNumObservations()) );
|
||||||
globalDeltaQ = globalDeltaQDatum.empiricalQualDouble( SMOOTHING ) - aggregrateQreported;
|
globalDeltaQ = globalDeltaQEmpirical - aggregrateQreported;
|
||||||
}
|
}
|
||||||
|
|
||||||
// The shift in quality between reported and empirical
|
// The shift in quality between reported and empirical
|
||||||
newKey = new ArrayList<Comparable>();
|
newKey.add( qualityScoreKeyElement );
|
||||||
newKey.add( key.get(0) ); // read group
|
Double deltaQReportedEmpirical = dataManager.getCollapsedDoubleTable(1).get( newKey );
|
||||||
newKey.add( key.get(1) ); // quality score
|
|
||||||
RecalDatum deltaQReportedDatum = dataManager.getCollapsedTable(1).get( newKey );
|
|
||||||
double deltaQReported = 0.0;
|
double deltaQReported = 0.0;
|
||||||
if( deltaQReportedDatum != null ) {
|
if( deltaQReportedEmpirical != null ) {
|
||||||
deltaQReported = deltaQReportedDatum.empiricalQualDouble( SMOOTHING ) - qualFromRead - globalDeltaQ;
|
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
|
||||||
}
|
}
|
||||||
|
|
||||||
// The shift in quality due to each covariate by itself in turn
|
// The shift in quality due to each covariate by itself in turn
|
||||||
double deltaQCovariates = 0.0;
|
double deltaQCovariates = 0.0;
|
||||||
RecalDatum deltaQCovariateDatum;
|
Double deltaQCovariateEmpirical;
|
||||||
for( int iii = 2; iii < key.size(); iii++ ) {
|
for( int iii = 2; iii < key.size(); iii++ ) {
|
||||||
newKey = new ArrayList<Comparable>();
|
newKey.add( key.get(iii) ); // the given covariate
|
||||||
newKey.add( key.get(0) ); // read group
|
deltaQCovariateEmpirical = dataManager.getCollapsedDoubleTable(iii).get( newKey );
|
||||||
newKey.add( key.get(1) ); // quality score
|
if( deltaQCovariateEmpirical != null ) {
|
||||||
newKey.add( key.get(iii) ); // given covariate
|
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
|
||||||
deltaQCovariateDatum = dataManager.getCollapsedTable(iii).get( newKey );
|
|
||||||
if( deltaQCovariateDatum != null ) {
|
|
||||||
deltaQCovariates += ( deltaQCovariateDatum.empiricalQualDouble( SMOOTHING ) - qualFromRead - (globalDeltaQ + deltaQReported) );
|
|
||||||
}
|
}
|
||||||
|
newKey.remove( 2 ); // this new covariate is always added in at position 2 in the newKey list
|
||||||
}
|
}
|
||||||
|
|
||||||
double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
|
double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue