BQSR bug triage
* fixed bug where some keys were using the same recal datum objects
* fixed quantization qual calculations when combining multiple reports
* fixed rounding error with empirical quality reported when combining reports
* fixed combine routine in the gatk reports due to the primary keys being out of order
* added auto-recalibration option to BQSR scala script
* reduced the size of the recalibration report by ~15%
* updated md5's
This commit is contained in:
parent
1e65474fec
commit
7c3b3650bb
|
|
@ -224,22 +224,23 @@ public class RecalDataManager {
|
|||
public static List<GATKReportTable> generateReportTables(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap) {
|
||||
List<GATKReportTable> result = new LinkedList<GATKReportTable>();
|
||||
int tableIndex = 0;
|
||||
|
||||
final Pair<String, String> covariateValue = new Pair<String, String>(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME, "%s");
|
||||
final Pair<String, String> covariateName = new Pair<String, String>(RecalDataManager.COVARIATE_NAME_COLUMN_NAME, "%s");
|
||||
final Pair<String, String> eventType = new Pair<String, String>(RecalDataManager.EVENT_TYPE_COLUMN_NAME, "%s");
|
||||
final Pair<String, String> empiricalQuality = new Pair<String, String>(RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f");
|
||||
final Pair<String, String> estimatedQReported = new Pair<String, String>(RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f");
|
||||
final Pair<String, String> nObservations = new Pair<String, String>(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d");
|
||||
final Pair<String, String> nErrors = new Pair<String, String>(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME, "%d");
|
||||
|
||||
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> entry : keysAndTablesMap.entrySet()) {
|
||||
BQSRKeyManager keyManager = entry.getKey();
|
||||
Map<BitSet, RecalDatum> recalTable = entry.getValue();
|
||||
|
||||
boolean isReadGroupTable = tableIndex == 0; // special case for the read group table so we can print the extra column it needs.
|
||||
GATKReportTable reportTable = new GATKReportTable("RecalTable" + tableIndex++, "");
|
||||
final Pair<String, String> covariateValue = new Pair<String, String>(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME, "%s");
|
||||
final Pair<String, String> covariateName = new Pair<String, String>(RecalDataManager.COVARIATE_NAME_COLUMN_NAME, "%s");
|
||||
final Pair<String, String> eventType = new Pair<String, String>(RecalDataManager.EVENT_TYPE_COLUMN_NAME, "%s");
|
||||
final Pair<String, String> empiricalQuality = new Pair<String, String>(RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME, "%.2f");
|
||||
final Pair<String, String> estimatedQReported = new Pair<String, String>(RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.2f");
|
||||
final Pair<String, String> nObservations = new Pair<String, String>(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d");
|
||||
final Pair<String, String> nErrors = new Pair<String, String>(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME, "%d");
|
||||
|
||||
long primaryKey = 0L;
|
||||
|
||||
List<Covariate> requiredList = keyManager.getRequiredCovariates(); // ask the key manager what required covariates were used in this recal table
|
||||
List<Covariate> requiredList = keyManager.getRequiredCovariates(); // ask the key manager what required covariates were used in this recal table
|
||||
List<Covariate> optionalList = keyManager.getOptionalCovariates(); // ask the key manager what optional covariates were used in this recal table
|
||||
|
||||
ArrayList<Pair<String, String>> columnNames = new ArrayList<Pair<String, String>>(); // initialize the array to hold the column names
|
||||
|
|
@ -256,15 +257,17 @@ public class RecalDataManager {
|
|||
|
||||
columnNames.add(eventType); // the order of these column names is important here
|
||||
columnNames.add(empiricalQuality);
|
||||
columnNames.add(estimatedQReported);
|
||||
if (isReadGroupTable)
|
||||
columnNames.add(estimatedQReported); // only the read group table needs the estimated Q reported
|
||||
columnNames.add(nObservations);
|
||||
columnNames.add(nErrors);
|
||||
|
||||
|
||||
reportTable.addPrimaryKey("PrimaryKey", false); // every table must have a primary key (hidden)
|
||||
for (Pair<String, String> columnName : columnNames)
|
||||
reportTable.addColumn(columnName.getFirst(), true, columnName.getSecond()); // every table must have the event type
|
||||
|
||||
long primaryKey = 0L;
|
||||
|
||||
for (Map.Entry<BitSet, RecalDatum> recalTableEntry : recalTable.entrySet()) { // create a map with column name => key value for all covariate keys
|
||||
BitSet bitSetKey = recalTableEntry.getKey();
|
||||
Map<String, Object> columnData = new HashMap<String, Object>(columnNames.size());
|
||||
|
|
@ -274,8 +277,9 @@ public class RecalDataManager {
|
|||
columnData.put(columnName, key);
|
||||
}
|
||||
RecalDatum datum = recalTableEntry.getValue();
|
||||
columnData.put(iterator.next().getFirst(), datum.getEmpiricalQuality()); // iterator.next() gives the column name for Empirical Quality
|
||||
columnData.put(iterator.next().getFirst(), Math.round(datum.getEstimatedQReported())); // iterator.next() gives the column name for EstimatedQReported
|
||||
columnData.put(iterator.next().getFirst(), datum.getEmpiricalQuality());
|
||||
if (isReadGroupTable)
|
||||
columnData.put(iterator.next().getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table
|
||||
columnData.put(iterator.next().getFirst(), datum.numObservations);
|
||||
columnData.put(iterator.next().getFirst(), datum.numMismatches);
|
||||
|
||||
|
|
|
|||
|
|
@ -25,6 +25,8 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
|
|||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
|
|
@ -76,7 +78,7 @@ public class RecalDatum extends RecalDatumOptimized {
|
|||
public final void combine(final RecalDatum other) {
|
||||
final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors();
|
||||
this.increment(other.numObservations, other.numMismatches);
|
||||
this.estimatedQReported = -10 * Math.log10(sumErrors / (double) this.numObservations);
|
||||
this.estimatedQReported = -10 * Math.log10(sumErrors / this.numObservations);
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -90,7 +92,7 @@ public class RecalDatum extends RecalDatumOptimized {
|
|||
}
|
||||
|
||||
public final void calcEstimatedReportedQuality() {
|
||||
this.estimatedQReported = -10 * Math.log10(calcExpectedErrors() / (double) numObservations);
|
||||
this.estimatedQReported = -10 * Math.log10(calcExpectedErrors() / numObservations);
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -107,15 +109,16 @@ public class RecalDatum extends RecalDatumOptimized {
|
|||
return empiricalQuality;
|
||||
}
|
||||
|
||||
public final void resetCalculatedQualities() {
|
||||
empiricalQuality = 0.0;
|
||||
}
|
||||
|
||||
private double calcExpectedErrors() {
|
||||
return (double) this.numObservations * qualToErrorProb(estimatedQReported);
|
||||
return (double) this.numObservations * QualityUtils.qualToProb(estimatedQReported);
|
||||
}
|
||||
|
||||
private double qualToErrorProb(final double qual) {
|
||||
return Math.pow(10.0, qual / -10.0);
|
||||
/**
|
||||
* Makes a hard copy of the recal datum element
|
||||
*
|
||||
* @return a new recal datum object with the same contents of this datum.
|
||||
*/
|
||||
protected RecalDatum copy() {
|
||||
return new RecalDatum(numObservations, numMismatches, estimatedQReported, empiricalQuality);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -88,23 +88,31 @@ public class RecalibrationReport {
|
|||
* @param other the recalibration report to combine with this one
|
||||
*/
|
||||
public void combine(RecalibrationReport other) {
|
||||
Iterator<Map<BitSet, RecalDatum>> tableIterator = keysAndTablesMap.values().iterator(); // because these are ordered (linked hashmaps) we can iterate over the 'this' and do a for loop on the 'other' tables and be sure that we are looking at the equivalent tables on both objects
|
||||
for (Map<BitSet, RecalDatum> otherTable : other.getKeysAndTablesMap().values()) { // iterate over all tables for 'other'
|
||||
Map<BitSet, RecalDatum> thisTable = tableIterator.next(); // iterate over all tables for 'this'
|
||||
for (Map.Entry<BitSet, RecalDatum> entry : otherTable.entrySet()) { // for each table, go through all the entries in the 'other' dataset to update 'this' dataset
|
||||
BitSet key = entry.getKey();
|
||||
RecalDatum otherDatum = entry.getValue();
|
||||
RecalDatum thisDatum = thisTable.get(key);
|
||||
Iterator<Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>>> thisIterator = keysAndTablesMap.entrySet().iterator();
|
||||
|
||||
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> otherEntry : other.getKeysAndTablesMap().entrySet()) {
|
||||
Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> thisEntry = thisIterator.next();
|
||||
|
||||
Map<BitSet, RecalDatum> thisTable = thisEntry.getValue();
|
||||
BQSRKeyManager thisKeyManager = thisEntry.getKey();
|
||||
BQSRKeyManager otherKeyManager = otherEntry.getKey();
|
||||
|
||||
for (Map.Entry<BitSet, RecalDatum> otherTableEntry : otherEntry.getValue().entrySet()) {
|
||||
RecalDatum otherDatum = otherTableEntry.getValue();
|
||||
BitSet otherBitKey = otherTableEntry.getKey();
|
||||
List<Object> otherObjectKey = otherKeyManager.keySetFrom(otherBitKey);
|
||||
|
||||
BitSet thisBitKey = thisKeyManager.bitSetFromKey(otherObjectKey.toArray());
|
||||
RecalDatum thisDatum = thisTable.get(thisBitKey);
|
||||
|
||||
if (thisDatum == null)
|
||||
thisDatum = otherDatum; // sometimes the datum in other won't be present in 'this'. So just assign it!
|
||||
thisTable.put(thisBitKey, otherDatum);
|
||||
else
|
||||
thisDatum.combine(otherDatum); // add the two datum objects into 'this'
|
||||
thisDatum.resetCalculatedQualities(); // reset the empirical quality to make sure the user doesn't forget to recalculate it
|
||||
thisDatum.combine(otherDatum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public QuantizationInfo getQuantizationInfo() {
|
||||
return quantizationInfo;
|
||||
}
|
||||
|
|
@ -279,13 +287,11 @@ public class RecalibrationReport {
|
|||
* and quantization of the quality scores during every call of combine(). Very useful for the BQSRGatherer.
|
||||
*/
|
||||
public void calculateEmpiricalAndQuantizedQualities() {
|
||||
quantizationInfo.quantizeQualityScores(RAC.QUANTIZING_LEVELS);
|
||||
for (Map<BitSet, RecalDatum> table : keysAndTablesMap.values()) {
|
||||
for (RecalDatum datum : table.values()) {
|
||||
for (Map<BitSet, RecalDatum> table : keysAndTablesMap.values())
|
||||
for (RecalDatum datum : table.values())
|
||||
datum.calcCombinedEmpiricalQuality(QualityUtils.MAX_QUAL_SCORE);
|
||||
datum.calcEstimatedReportedQuality();
|
||||
}
|
||||
}
|
||||
|
||||
quantizationInfo = new QuantizationInfo(keysAndTablesMap, RAC.QUANTIZING_LEVELS);
|
||||
}
|
||||
|
||||
public void output(PrintStream output) {
|
||||
|
|
|
|||
|
|
@ -33,6 +33,8 @@ public class BQSRGathererUnitTest {
|
|||
for (GATKReportTable originalTable : originalReport.getTables()) {
|
||||
GATKReportTable calculatedTable = calculatedReport.getTable(originalTable.getTableName());
|
||||
List<String> columnsToTest = new LinkedList<String>();
|
||||
columnsToTest.add(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME);
|
||||
columnsToTest.add(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME);
|
||||
if (originalTable.getTableName().equals(RecalDataManager.ARGUMENT_REPORT_TABLE_TITLE)) { // these tables must be IDENTICAL
|
||||
columnsToTest.add(RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME);
|
||||
testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 1);
|
||||
|
|
@ -44,8 +46,6 @@ public class BQSRGathererUnitTest {
|
|||
}
|
||||
|
||||
else if (originalTable.getTableName().startsWith("RecalTable")) {
|
||||
columnsToTest.add(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME);
|
||||
columnsToTest.add(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME);
|
||||
testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 2);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue