BQSR bug triage

* fixed bug where some keys were using the same recal datum objects
    * fixed quantization qual calculations when combining multiple reports
    * fixed rounding error with empirical quality reported when combining reports
    * fixed combine routine in the gatk reports due to the primary keys being out of order
    * added auto-recalibration option to BQSR scala script
    * reduced the size of the recalibration report by ~15%
    * updated md5's
This commit is contained in:
Mauricio Carneiro 2012-04-04 16:40:55 -04:00
parent 1e65474fec
commit 7c3b3650bb
5 changed files with 1475 additions and 1462 deletions

View File

@ -224,22 +224,23 @@ public class RecalDataManager {
public static List<GATKReportTable> generateReportTables(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap) {
List<GATKReportTable> result = new LinkedList<GATKReportTable>();
int tableIndex = 0;
final Pair<String, String> covariateValue = new Pair<String, String>(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME, "%s");
final Pair<String, String> covariateName = new Pair<String, String>(RecalDataManager.COVARIATE_NAME_COLUMN_NAME, "%s");
final Pair<String, String> eventType = new Pair<String, String>(RecalDataManager.EVENT_TYPE_COLUMN_NAME, "%s");
final Pair<String, String> empiricalQuality = new Pair<String, String>(RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f");
final Pair<String, String> estimatedQReported = new Pair<String, String>(RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f");
final Pair<String, String> nObservations = new Pair<String, String>(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d");
final Pair<String, String> nErrors = new Pair<String, String>(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME, "%d");
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> entry : keysAndTablesMap.entrySet()) {
BQSRKeyManager keyManager = entry.getKey();
Map<BitSet, RecalDatum> recalTable = entry.getValue();
boolean isReadGroupTable = tableIndex == 0; // special case for the read group table so we can print the extra column it needs.
GATKReportTable reportTable = new GATKReportTable("RecalTable" + tableIndex++, "");
final Pair<String, String> covariateValue = new Pair<String, String>(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME, "%s");
final Pair<String, String> covariateName = new Pair<String, String>(RecalDataManager.COVARIATE_NAME_COLUMN_NAME, "%s");
final Pair<String, String> eventType = new Pair<String, String>(RecalDataManager.EVENT_TYPE_COLUMN_NAME, "%s");
final Pair<String, String> empiricalQuality = new Pair<String, String>(RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME, "%.2f");
final Pair<String, String> estimatedQReported = new Pair<String, String>(RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.2f");
final Pair<String, String> nObservations = new Pair<String, String>(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d");
final Pair<String, String> nErrors = new Pair<String, String>(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME, "%d");
long primaryKey = 0L;
List<Covariate> requiredList = keyManager.getRequiredCovariates(); // ask the key manager what required covariates were used in this recal table
List<Covariate> requiredList = keyManager.getRequiredCovariates(); // ask the key manager what required covariates were used in this recal table
List<Covariate> optionalList = keyManager.getOptionalCovariates(); // ask the key manager what optional covariates were used in this recal table
ArrayList<Pair<String, String>> columnNames = new ArrayList<Pair<String, String>>(); // initialize the array to hold the column names
@ -256,15 +257,17 @@ public class RecalDataManager {
columnNames.add(eventType); // the order of these column names is important here
columnNames.add(empiricalQuality);
columnNames.add(estimatedQReported);
if (isReadGroupTable)
columnNames.add(estimatedQReported); // only the read group table needs the estimated Q reported
columnNames.add(nObservations);
columnNames.add(nErrors);
reportTable.addPrimaryKey("PrimaryKey", false); // every table must have a primary key (hidden)
for (Pair<String, String> columnName : columnNames)
reportTable.addColumn(columnName.getFirst(), true, columnName.getSecond()); // every table must have the event type
long primaryKey = 0L;
for (Map.Entry<BitSet, RecalDatum> recalTableEntry : recalTable.entrySet()) { // create a map with column name => key value for all covariate keys
BitSet bitSetKey = recalTableEntry.getKey();
Map<String, Object> columnData = new HashMap<String, Object>(columnNames.size());
@ -274,8 +277,9 @@ public class RecalDataManager {
columnData.put(columnName, key);
}
RecalDatum datum = recalTableEntry.getValue();
columnData.put(iterator.next().getFirst(), datum.getEmpiricalQuality()); // iterator.next() gives the column name for Empirical Quality
columnData.put(iterator.next().getFirst(), Math.round(datum.getEstimatedQReported())); // iterator.next() gives the column name for EstimatedQReported
columnData.put(iterator.next().getFirst(), datum.getEmpiricalQuality());
if (isReadGroupTable)
columnData.put(iterator.next().getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table
columnData.put(iterator.next().getFirst(), datum.numObservations);
columnData.put(iterator.next().getFirst(), datum.numMismatches);

View File

@ -25,6 +25,8 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
* OTHER DEALINGS IN THE SOFTWARE.
*/
import org.broadinstitute.sting.utils.QualityUtils;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
@ -76,7 +78,7 @@ public class RecalDatum extends RecalDatumOptimized {
public final void combine(final RecalDatum other) {
final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors();
this.increment(other.numObservations, other.numMismatches);
this.estimatedQReported = -10 * Math.log10(sumErrors / (double) this.numObservations);
this.estimatedQReported = -10 * Math.log10(sumErrors / this.numObservations);
}
//---------------------------------------------------------------------------------------------------------------
@ -90,7 +92,7 @@ public class RecalDatum extends RecalDatumOptimized {
}
public final void calcEstimatedReportedQuality() {
this.estimatedQReported = -10 * Math.log10(calcExpectedErrors() / (double) numObservations);
this.estimatedQReported = -10 * Math.log10(calcExpectedErrors() / numObservations);
}
//---------------------------------------------------------------------------------------------------------------
@ -107,15 +109,16 @@ public class RecalDatum extends RecalDatumOptimized {
return empiricalQuality;
}
public final void resetCalculatedQualities() {
empiricalQuality = 0.0;
}
private double calcExpectedErrors() {
return (double) this.numObservations * qualToErrorProb(estimatedQReported);
return (double) this.numObservations * QualityUtils.qualToProb(estimatedQReported);
}
private double qualToErrorProb(final double qual) {
return Math.pow(10.0, qual / -10.0);
/**
* Makes a hard copy of the recal datum element
*
* @return a new recal datum object with the same contents of this datum.
*/
protected RecalDatum copy() {
return new RecalDatum(numObservations, numMismatches, estimatedQReported, empiricalQuality);
}
}

View File

@ -88,23 +88,31 @@ public class RecalibrationReport {
* @param other the recalibration report to combine with this one
*/
public void combine(RecalibrationReport other) {
Iterator<Map<BitSet, RecalDatum>> tableIterator = keysAndTablesMap.values().iterator(); // because these are ordered (linked hashmaps) we can iterate over the 'this' and do a for loop on the 'other' tables and be sure that we are looking at the equivalent tables on both objects
for (Map<BitSet, RecalDatum> otherTable : other.getKeysAndTablesMap().values()) { // iterate over all tables for 'other'
Map<BitSet, RecalDatum> thisTable = tableIterator.next(); // iterate over all tables for 'this'
for (Map.Entry<BitSet, RecalDatum> entry : otherTable.entrySet()) { // for each table, go through all the entries in the 'other' dataset to update 'this' dataset
BitSet key = entry.getKey();
RecalDatum otherDatum = entry.getValue();
RecalDatum thisDatum = thisTable.get(key);
Iterator<Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>>> thisIterator = keysAndTablesMap.entrySet().iterator();
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> otherEntry : other.getKeysAndTablesMap().entrySet()) {
Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> thisEntry = thisIterator.next();
Map<BitSet, RecalDatum> thisTable = thisEntry.getValue();
BQSRKeyManager thisKeyManager = thisEntry.getKey();
BQSRKeyManager otherKeyManager = otherEntry.getKey();
for (Map.Entry<BitSet, RecalDatum> otherTableEntry : otherEntry.getValue().entrySet()) {
RecalDatum otherDatum = otherTableEntry.getValue();
BitSet otherBitKey = otherTableEntry.getKey();
List<Object> otherObjectKey = otherKeyManager.keySetFrom(otherBitKey);
BitSet thisBitKey = thisKeyManager.bitSetFromKey(otherObjectKey.toArray());
RecalDatum thisDatum = thisTable.get(thisBitKey);
if (thisDatum == null)
thisDatum = otherDatum; // sometimes the datum in other won't be present in 'this'. So just assign it!
thisTable.put(thisBitKey, otherDatum);
else
thisDatum.combine(otherDatum); // add the two datum objects into 'this'
thisDatum.resetCalculatedQualities(); // reset the empirical quality to make sure the user doesn't forget to recalculate it
thisDatum.combine(otherDatum);
}
}
}
public QuantizationInfo getQuantizationInfo() {
return quantizationInfo;
}
@ -279,13 +287,11 @@ public class RecalibrationReport {
* and quantization of the quality scores during every call of combine(). Very useful for the BQSRGatherer.
*/
public void calculateEmpiricalAndQuantizedQualities() {
quantizationInfo.quantizeQualityScores(RAC.QUANTIZING_LEVELS);
for (Map<BitSet, RecalDatum> table : keysAndTablesMap.values()) {
for (RecalDatum datum : table.values()) {
for (Map<BitSet, RecalDatum> table : keysAndTablesMap.values())
for (RecalDatum datum : table.values())
datum.calcCombinedEmpiricalQuality(QualityUtils.MAX_QUAL_SCORE);
datum.calcEstimatedReportedQuality();
}
}
quantizationInfo = new QuantizationInfo(keysAndTablesMap, RAC.QUANTIZING_LEVELS);
}
public void output(PrintStream output) {

View File

@ -33,6 +33,8 @@ public class BQSRGathererUnitTest {
for (GATKReportTable originalTable : originalReport.getTables()) {
GATKReportTable calculatedTable = calculatedReport.getTable(originalTable.getTableName());
List<String> columnsToTest = new LinkedList<String>();
columnsToTest.add(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME);
columnsToTest.add(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME);
if (originalTable.getTableName().equals(RecalDataManager.ARGUMENT_REPORT_TABLE_TITLE)) { // these tables must be IDENTICAL
columnsToTest.add(RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME);
testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 1);
@ -44,8 +46,6 @@ public class BQSRGathererUnitTest {
}
else if (originalTable.getTableName().startsWith("RecalTable")) {
columnsToTest.add(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME);
columnsToTest.add(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME);
testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 2);
}
}

File diff suppressed because it is too large Load Diff