Bad news folks: BQSR scatter-gather was totally busted; you absolutely cannot trust any BQSR table that was a product of SG (for any version of BQSR). I fixed BQSR-gathering, rewrote (and enabled) the unit test, and confirmed that outputs are now identical whether or not SG is used to create the table.

This commit is contained in:
Eric Banks 2012-09-20 14:14:34 -04:00
parent 2e6f533996
commit 1316b579f0
9 changed files with 79 additions and 65 deletions

View File

@ -77,7 +77,7 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp
final byte qual = tempQualArray[eventIndex];
final boolean isError = tempErrorArray[eventIndex];
final NestedIntegerArray<RecalDatum> rgRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE);
final NestedIntegerArray<RecalDatum> rgRecalTable = recalibrationTables.getReadGroupTable();
final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex);
final RecalDatum rgThisDatum = createDatumObject(qual, isError);
if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it
@ -85,7 +85,7 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp
else
rgPreviousDatum.combine(rgThisDatum);
final NestedIntegerArray<RecalDatum> qualRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE);
final NestedIntegerArray<RecalDatum> qualRecalTable = recalibrationTables.getQualityScoreTable();
final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex);
if (qualPreviousDatum == null)
qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex);
@ -124,7 +124,7 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp
final byte qual = tempQualArray[eventIndex];
final double isError = tempFractionalErrorArray[eventIndex];
final NestedIntegerArray<RecalDatum> rgRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE);
final NestedIntegerArray<RecalDatum> rgRecalTable = recalibrationTables.getReadGroupTable();
final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex);
final RecalDatum rgThisDatum = createDatumObject(qual, isError);
if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it
@ -132,7 +132,7 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp
else
rgPreviousDatum.combine(rgThisDatum);
final NestedIntegerArray<RecalDatum> qualRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE);
final NestedIntegerArray<RecalDatum> qualRecalTable = recalibrationTables.getQualityScoreTable();
final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex);
if (qualPreviousDatum == null)
qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex);

View File

@ -65,7 +65,7 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP
final int[] keys = readCovariates.getKeySet(offset, EventType.BASE_SUBSTITUTION);
final int eventIndex = EventType.BASE_SUBSTITUTION.index;
final NestedIntegerArray<RecalDatum> rgRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE);
final NestedIntegerArray<RecalDatum> rgRecalTable = recalibrationTables.getReadGroupTable();
final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex);
final RecalDatum rgThisDatum = createDatumObject(qual, isError);
if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it
@ -73,7 +73,7 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP
else
rgPreviousDatum.combine(rgThisDatum);
final NestedIntegerArray<RecalDatum> qualRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE);
final NestedIntegerArray<RecalDatum> qualRecalTable = recalibrationTables.getQualityScoreTable();
final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex);
if (qualPreviousDatum == null)
qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex);

View File

@ -166,8 +166,8 @@ public class BaseRecalibration {
private byte performSequentialQualityCalculation(final int[] key, final EventType errorModel) {
final byte qualFromRead = (byte)(long)key[1];
final double globalDeltaQ = calculateGlobalDeltaQ(recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE), key, errorModel);
final double deltaQReported = calculateDeltaQReported(recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE), key, errorModel, globalDeltaQ, qualFromRead);
final double globalDeltaQ = calculateGlobalDeltaQ(recalibrationTables.getReadGroupTable(), key, errorModel);
final double deltaQReported = calculateDeltaQReported(recalibrationTables.getQualityScoreTable(), key, errorModel, globalDeltaQ, qualFromRead);
final double deltaQCovariates = calculateDeltaQCovariates(recalibrationTables, key, errorModel, globalDeltaQ, deltaQReported, qualFromRead);
double recalibratedQual = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; // calculate the recalibrated qual using the BQSR formula

View File

@ -34,7 +34,7 @@ public class QuantizationInfo {
for (int i = 0; i < qualHistogram.length; i++)
qualHistogram[i] = 0L;
final NestedIntegerArray<RecalDatum> qualTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE); // get the quality score table
final NestedIntegerArray<RecalDatum> qualTable = recalibrationTables.getQualityScoreTable(); // get the quality score table
for (final RecalDatum value : qualTable.getAllValues()) {
final RecalDatum datum = value;

View File

@ -68,6 +68,7 @@ public class RecalUtils {
public final static String QUALITY_SCORE_REPORT_TABLE_TITLE = "RecalTable1";
public final static String ALL_COVARIATES_REPORT_TABLE_TITLE = "RecalTable2";
public final static String ARGUMENT_COLUMN_NAME = "Argument";
public final static String ARGUMENT_VALUE_COLUMN_NAME = "Value";
public final static String QUANTIZED_VALUE_COLUMN_NAME = "QuantizedScore";
public static final String QUANTIZED_COUNT_COLUMN_NAME = "Count";
@ -399,7 +400,7 @@ public class RecalUtils {
final NestedHashMap deltaTable = new NestedHashMap();
// add the quality score table to the delta table
final NestedIntegerArray<RecalDatum> qualTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE);
final NestedIntegerArray<RecalDatum> qualTable = recalibrationTables.getQualityScoreTable();
for (final NestedIntegerArray.Leaf leaf : qualTable.getAllLeaves()) { // go through every element in the covariates table to create the delta table
final int[] newCovs = new int[4];
newCovs[0] = leaf.keys[0];

View File

@ -61,9 +61,9 @@ public class RecalibrationReport {
recalibrationTables = new RecalibrationTables(requestedCovariates, countReadGroups(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE)));
parseReadGroupTable(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE), recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE));
parseReadGroupTable(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE), recalibrationTables.getReadGroupTable());
parseQualityScoreTable(report.getTable(RecalUtils.QUALITY_SCORE_REPORT_TABLE_TITLE), recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE));
parseQualityScoreTable(report.getTable(RecalUtils.QUALITY_SCORE_REPORT_TABLE_TITLE), recalibrationTables.getQualityScoreTable());
parseAllCovariatesTable(report.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE), recalibrationTables);
@ -106,9 +106,9 @@ public class RecalibrationReport {
*/
public void combine(final RecalibrationReport other) {
for (RecalibrationTables.TableType type : RecalibrationTables.TableType.values()) {
final NestedIntegerArray<RecalDatum> myTable = recalibrationTables.getTable(type);
final NestedIntegerArray<RecalDatum> otherTable = other.recalibrationTables.getTable(type);
for ( int tableIndex = 0; tableIndex < recalibrationTables.numTables(); tableIndex++ ) {
final NestedIntegerArray<RecalDatum> myTable = recalibrationTables.getTable(tableIndex);
final NestedIntegerArray<RecalDatum> otherTable = other.recalibrationTables.getTable(tableIndex);
for (final NestedIntegerArray.Leaf row : otherTable.getAllLeaves()) {
final RecalDatum myDatum = myTable.get(row.keys);

View File

@ -67,8 +67,12 @@ public class RecalibrationTables {
tables[i] = new NestedIntegerArray<RecalDatum>(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension);
}
public NestedIntegerArray<RecalDatum> getTable(final TableType type) {
return (NestedIntegerArray<RecalDatum>)tables[type.index];
public NestedIntegerArray<RecalDatum> getReadGroupTable() {
return (NestedIntegerArray<RecalDatum>)tables[TableType.READ_GROUP_TABLE.index];
}
public NestedIntegerArray<RecalDatum> getQualityScoreTable() {
return (NestedIntegerArray<RecalDatum>)tables[TableType.QUALITY_SCORE_TABLE.index];
}
public NestedIntegerArray<RecalDatum> getTable(final int index) {

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.recalibration.RecalUtils;
@ -7,49 +8,70 @@ import org.testng.Assert;
import org.testng.annotations.Test;
import java.io.File;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
/**
* @author Mauricio Carneiro
* @since 3/7/12
* @author Eric Banks
* @since 9/20/12
*/
public class BQSRGathererUnitTest {
RecalibrationArgumentCollection RAC;
public class BQSRGathererUnitTest extends BaseTest {
private static File recal = new File("public/testdata/exampleGRP.grp");
private static File recal1 = new File(privateTestDir + "HiSeq.1mb.1RG.sg1.table");
private static File recal2 = new File(privateTestDir + "HiSeq.1mb.1RG.sg2.table");
private static File recal3 = new File(privateTestDir + "HiSeq.1mb.1RG.sg3.table");
private static File recal4 = new File(privateTestDir + "HiSeq.1mb.1RG.sg4.table");
private static File recal5 = new File(privateTestDir + "HiSeq.1mb.1RG.sg5.table");
//todo -- this test doesnt work because the primary keys in different tables are not the same. Need to either implement "sort" for testing purposes on GATKReport or have a sophisticated comparison measure
@Test(enabled = false)
public void testCombineSimilarFiles() {
private static File recal_original = new File(privateTestDir + "HiSeq.1mb.1RG.noSG.table");
@Test(enabled = true)
public void testGatherBQSR() {
BQSRGatherer gatherer = new BQSRGatherer();
List<File> recalFiles = new LinkedList<File> ();
File output = new File("foo.grp");
recalFiles.add(recal);
recalFiles.add(recal);
final File output = BaseTest.createTempFile("BQSRgathererTest", ".table");
recalFiles.add(recal1);
recalFiles.add(recal2);
recalFiles.add(recal3);
recalFiles.add(recal4);
recalFiles.add(recal5);
gatherer.gather(recalFiles, output);
GATKReport originalReport = new GATKReport(recal);
GATKReport calculatedReport = new GATKReport(output);
for (GATKReportTable originalTable : originalReport.getTables()) {
GATKReportTable calculatedTable = calculatedReport.getTable(originalTable.getTableName());
List<String> columnsToTest = new LinkedList<String>();
columnsToTest.add(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME);
columnsToTest.add(RecalUtils.NUMBER_ERRORS_COLUMN_NAME);
if (originalTable.getTableName().equals(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE)) { // these tables must be IDENTICAL
columnsToTest.add(RecalUtils.ARGUMENT_VALUE_COLUMN_NAME);
testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 1);
}
else if (originalTable.getTableName().equals(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE)) {
columnsToTest.add(RecalUtils.QUANTIZED_COUNT_COLUMN_NAME);
testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 2);
}
else if (originalTable.getTableName().startsWith("RecalTable")) {
testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 2);
}
}
GATKReport originalReport = new GATKReport(recal_original);
GATKReport calculatedReport = new GATKReport(output);
// test the Arguments table
List<String> columnsToTest = Arrays.asList(RecalUtils.ARGUMENT_COLUMN_NAME, RecalUtils.ARGUMENT_VALUE_COLUMN_NAME);
GATKReportTable originalTable = originalReport.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE);
GATKReportTable calculatedTable = calculatedReport.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE);
testTablesWithColumns(originalTable, calculatedTable, columnsToTest);
// test the Quantized table
columnsToTest = Arrays.asList(RecalUtils.QUALITY_SCORE_COLUMN_NAME, RecalUtils.QUANTIZED_COUNT_COLUMN_NAME, RecalUtils.QUANTIZED_VALUE_COLUMN_NAME);
originalTable = originalReport.getTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE);
calculatedTable = calculatedReport.getTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE);
testTablesWithColumns(originalTable, calculatedTable, columnsToTest);
// test the RecalTable0 table
columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME);
originalTable = originalReport.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE);
calculatedTable = calculatedReport.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE);
testTablesWithColumns(originalTable, calculatedTable, columnsToTest);
// test the RecalTable1 table
columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.QUALITY_SCORE_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME);
originalTable = originalReport.getTable(RecalUtils.QUALITY_SCORE_REPORT_TABLE_TITLE);
calculatedTable = calculatedReport.getTable(RecalUtils.QUALITY_SCORE_REPORT_TABLE_TITLE);
testTablesWithColumns(originalTable, calculatedTable, columnsToTest);
// test the RecalTable2 table
columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.QUALITY_SCORE_COLUMN_NAME, RecalUtils.COVARIATE_VALUE_COLUMN_NAME, RecalUtils.COVARIATE_NAME_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME);
originalTable = originalReport.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE);
calculatedTable = calculatedReport.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE);
testTablesWithColumns(originalTable, calculatedTable, columnsToTest);
}
/**
@ -58,25 +80,12 @@ public class BQSRGathererUnitTest {
* @param original the original table
* @param calculated the calculated table
* @param columnsToTest list of columns to test. All columns will be tested with the same criteria (equality given factor)
* @param factor 1 to test for equality, any other value to multiply the original value and match with the calculated
*/
private void testTablesWithColumnsAndFactor(GATKReportTable original, GATKReportTable calculated, List<String> columnsToTest, int factor) {
private void testTablesWithColumns(GATKReportTable original, GATKReportTable calculated, List<String> columnsToTest) {
for (int row = 0; row < original.getNumRows(); row++ ) {
for (String column : columnsToTest) {
Object actual = calculated.get(new Integer(row), column);
Object expected = original.get(row, column);
if (factor != 1) {
if (expected instanceof Double)
expected = (Double) expected * factor;
else if (expected instanceof Long)
expected = (Long) expected * factor;
else if (expected instanceof Integer)
expected = (Integer) expected * factor;
else if (expected instanceof Byte) {
expected = (Byte) expected * factor;
}
}
Assert.assertEquals(actual, expected, "Row: " + row + " Original Table: " + original.getTableName() + " Calc Table: " + calculated.getTableName());
}
}

View File

@ -76,8 +76,8 @@ public class RecalibrationReportUnitTest {
final ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates);
final RecalibrationTables recalibrationTables = new RecalibrationTables(requestedCovariates);
final NestedIntegerArray<RecalDatum> rgTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE);
final NestedIntegerArray<RecalDatum> qualTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE);
final NestedIntegerArray<RecalDatum> rgTable = recalibrationTables.getReadGroupTable();
final NestedIntegerArray<RecalDatum> qualTable = recalibrationTables.getQualityScoreTable();
for (int offset = 0; offset < length; offset++) {