From dcc4871468dfa3ef29b1eeb65bb93fa259249ba0 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 18 Apr 2012 15:02:26 -0400 Subject: [PATCH 1/6] minor misc optimizations to PairHMM --- .../org/broadinstitute/sting/utils/MathUtils.java | 4 ++++ .../sting/utils/MathUtilsUnitTest.java | 13 +++++++++++++ 2 files changed, 17 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 5e3160452..29d47cf3c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -106,6 +106,10 @@ public class MathUtils { return approxSum; } + public static double approximateLog10SumLog10(double a, double b, double c) { + return approximateLog10SumLog10(a, approximateLog10SumLog10(b, c)); + } + public static double approximateLog10SumLog10(double small, double big) { // make sure small is really the smaller value if (small > big) { diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index 5327d4cf2..04b0199d8 100755 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -271,6 +271,19 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), 1e-3); Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), 1e-3); Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), 1e-3); + + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0, 0.0), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0, -2.5), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-2.2, -3.5, -1.1), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, -7.1, 0.5), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(5.0, 6.2, 1.3), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(38.1, 16.2, 18.1), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-38.1, 6.2, 26.6), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-19.1, -37.1, -45.1), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6, -26.2), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456, -0.34567), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101, -17.9341), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), 1e-3); } @Test From 68d0211fa1cf8e0f9803e64d60672b800716a20c Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 18 Apr 2012 13:02:41 -0400 Subject: [PATCH 2/6] Improved BQSR plotting and some new parameters * Refactored CycleCovariate to be a fragment covariate instead of a per read covariate * Refactored the CycleCovariateUnitTest to test the pairing information * Updated BQSR Integration tests accordingly * Made quantization levels parameter not hidden anymore * Added hidden option to keep intermediate plotting files for debug purposes (they're automatically deleted) * Added hidden option not to generate the plots automatically (important for scatter/gathering) --- .../gatk/walkers/bqsr/CycleCovariate.java | 14 +++++++------- .../bqsr/RecalibrationArgumentCollection.java | 18 ++++++++++++++++-- .../gatk/walkers/bqsr/RecalibrationReport.java | 6 ++++++ .../walkers/bqsr/CycleCovariateUnitTest.java | 10 +++++++++- 4 files changed, 38 insertions(+), 10 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java index 7bc6cd754..54a90a959 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java @@ -66,18 +66,18 @@ public class CycleCovariate implements StandardCovariate { // Discrete cycle platforms if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) { - final short init; + final short readOrderFactor = read.getReadPairedFlag() && read.getSecondOfPairFlag() ? (short) -1 : 1; final short increment; - if (!read.getReadNegativeStrandFlag()) { - init = 1; - increment = 1; + short cycle; + if (read.getReadNegativeStrandFlag()) { + cycle = (short) (read.getReadLength() * readOrderFactor); + increment = (short) (-1 * readOrderFactor); } else { - init = (short) read.getReadLength(); - increment = -1; + cycle = readOrderFactor; + increment = readOrderFactor; } - short cycle = init; for (int i = 0; i < read.getReadLength(); i++) { cycles[i] = BitSetUtils.bitSetFrom(cycle); cycle += increment; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index 4a695ecb6..b5768eedd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -143,9 +143,18 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false) public byte DELETIONS_DEFAULT_QUALITY = 45; + /** + * Reads with low quality bases on either tail (beginning or end) will not be considered in the context. This parameter defines the quality below which (inclusive) a tail is considered low quality + */ @Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false) public byte LOW_QUAL_TAIL = 2; + /** + * BQSR generates a quantization table for quick quantization later by subsequent tools. BQSR does not quantize the base qualities, this is done by the engine with the -qq or -BQSR options. + * This parameter tells BQSR the number of levels of quantization to use to build the quantization table. + */ + @Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output") + public int QUANTIZING_LEVELS = 16; @Hidden @@ -155,8 +164,11 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; @Hidden - @Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output") - public int QUANTIZING_LEVELS = 16; + @Argument(fullName = "keep_intermediate_files", shortName = "k", required = false, doc ="does not remove the temporary csv file created to generate the plots") + public boolean KEEP_INTERMEDIATE_FILES = false; + @Hidden + @Argument(fullName = "no_plots", shortName = "np", required = false, doc = "does not generate any plots -- useful for queue scatter/gathering") + public boolean NO_PLOTS = false; public GATKReportTable generateReportTable() { GATKReportTable argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run"); @@ -176,6 +188,8 @@ public class RecalibrationArgumentCollection { argumentsTable.set("default_platform", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, DEFAULT_PLATFORM); argumentsTable.set("force_platform", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM); argumentsTable.set("quantizing_levels", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS); + argumentsTable.set("keep_intermediate_files", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, KEEP_INTERMEDIATE_FILES); + argumentsTable.set("no_plots", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, NO_PLOTS); return argumentsTable; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java index 19c04361b..2962c4674 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java @@ -279,6 +279,12 @@ public class RecalibrationReport { else if (primaryKey.equals("quantizing_levels")) RAC.QUANTIZING_LEVELS = Integer.parseInt((String) value); + + else if (primaryKey.equals("keep_intermediate_files")) + RAC.KEEP_INTERMEDIATE_FILES = Boolean.parseBoolean((String) value); + + else if (primaryKey.equals("no_plots")) + RAC.NO_PLOTS = Boolean.parseBoolean((String) value); } return RAC; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java index d80cddd3e..cec541a97 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java @@ -26,8 +26,9 @@ public class CycleCovariateUnitTest { @Test(enabled = true) public void testSimpleCycles() { - short readLength = 10; + short readLength = 10; GATKSAMRecord read = ReadUtils.createRandomRead(readLength); + read.setReadPairedFlag(true); read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); read.getReadGroup().setPlatform("illumina"); @@ -38,6 +39,13 @@ public class CycleCovariateUnitTest { values = covariate.getValues(read); verifyCovariateArray(values.getMismatches(), readLength, (short) -1); + read.setSecondOfPairFlag(true); + values = covariate.getValues(read); + verifyCovariateArray(values.getMismatches(), (short) -readLength, (short) 1); + + read.setReadNegativeStrandFlag(false); + values = covariate.getValues(read); + verifyCovariateArray(values.getMismatches(), (short) -1, (short) -1); } private void verifyCovariateArray(BitSet[] values, short init, short increment) { From eb22cd7222bf28facdd894e5065819c67a5b9d0d Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 18 Apr 2012 23:02:10 -0400 Subject: [PATCH 3/6] Unit test to guarantee BQSR sequential calculation accuracy This test brings together the old and the new BQSR, building a recalibration table using the two separate frameworks and performing the recalibration calculation using the two different frameworks for 10,000+ bases and asserting that the calculations match in every case. --- .../sting/gatk/walkers/bqsr/RecalDatum.java | 2 +- .../walkers/recalibration/RecalDatum.java | 6 + .../recalibration/BaseRecalibration.java | 15 +- .../BaseRecalibrationUnitTest.java | 287 +++++++++++++++++- 4 files changed, 297 insertions(+), 13 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java index d232fde81..c71a00a3a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java @@ -102,7 +102,7 @@ public class RecalDatum extends Datum { @Override public String toString() { - return String.format("%d,%d,%d", numObservations, numMismatches, (byte) Math.floor(getEmpiricalQuality())); + return String.format("%d,%d,%d,%d", numObservations, numMismatches, (byte) Math.floor(getEmpiricalQuality()), (byte) Math.floor(getEstimatedQReported())); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java index adc352b1b..aa9098549 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java @@ -109,4 +109,10 @@ public class RecalDatum extends RecalDatumOptimized { private double qualToErrorProb( final double qual ) { return Math.pow(10.0, qual / -10.0); } + + + @Override + public String toString() { + return String.format("%d,%d,%d,%d", numObservations, numMismatches, (byte) Math.floor(getEmpiricalQuality()), (byte) Math.floor(getEstimatedQReported())); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 2badca44c..70eb9426b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -65,6 +65,19 @@ public class BaseRecalibration { quantizationInfo.quantizeQualityScores(quantizationLevels); } + /** + * This constructor only exists for testing purposes. + * + * @param quantizationInfo + * @param keysAndTablesMap + * @param requestedCovariates + */ + protected BaseRecalibration(QuantizationInfo quantizationInfo, LinkedHashMap> keysAndTablesMap, ArrayList requestedCovariates) { + this.quantizationInfo = quantizationInfo; + this.keysAndTablesMap = keysAndTablesMap; + this.requestedCovariates = requestedCovariates; + } + /** * Recalibrates the base qualities of a read * @@ -110,7 +123,7 @@ public class BaseRecalibration { * @param errorModel the event type * @return A recalibrated quality score as a byte */ - private byte performSequentialQualityCalculation(BitSet[] key, EventType errorModel) { + protected byte performSequentialQualityCalculation(BitSet[] key, EventType errorModel) { final String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code"; final String TOO_MANY_KEYS_EXCEPTION = "There should only be one key for the RG collapsed table, something went wrong here"; diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java index f8f1ead9b..4f0d39991 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java @@ -1,13 +1,17 @@ package org.broadinstitute.sting.utils.recalibration; -import net.sf.samtools.SAMReadGroupRecord; -import org.broadinstitute.sting.utils.NGSPlatform; +import org.broadinstitute.sting.gatk.walkers.bqsr.*; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; -import java.io.File; +import java.util.*; /** * Unit tests for on-the-fly recalibration. @@ -17,13 +21,274 @@ import java.io.File; */ public class BaseRecalibrationUnitTest { - @Test(enabled=false) - public void testReadingReport() { - File csv = new File("public/testdata/exampleGATKREPORT.grp"); - BaseRecalibration baseRecalibration = new BaseRecalibration(csv, -1); - GATKSAMRecord read = ReadUtils.createRandomRead(1000); - read.setReadGroup(new GATKSAMReadGroupRecord(new SAMReadGroupRecord("exampleBAM.bam.bam"), NGSPlatform.ILLUMINA)); - baseRecalibration.recalibrateRead(read); - System.out.println("Success"); + private org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager dataManager; + private LinkedHashMap> keysAndTablesMap; + + private BQSRKeyManager rgKeyManager; + private BQSRKeyManager qsKeyManager; + private BQSRKeyManager cvKeyManager; + + private ReadGroupCovariate rgCovariate; + private QualityScoreCovariate qsCovariate; + private ContextCovariate cxCovariate; + private CycleCovariate cyCovariate; + + private GATKSAMRecord read = ReadUtils.createRandomRead(10000); + private BaseRecalibration baseRecalibration; + private ReadCovariates readCovariates; + + + @BeforeClass + public void init() { + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg"); + rg.setPlatform("illumina"); + read.setReadGroup(rg); + + byte[] quals = new byte[read.getReadLength()]; + for (int i = 0; i < read.getReadLength(); i++) + quals[i] = 20; + read.setBaseQualities(quals); + + RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + List requiredCovariates = new ArrayList(); + List optionalCovariates = new ArrayList(); + ArrayList requestedCovariates = new ArrayList(); + + dataManager = new org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager(true, 4); + keysAndTablesMap = new LinkedHashMap>(); + + rgCovariate = new ReadGroupCovariate(); + rgCovariate.initialize(RAC); + requiredCovariates.add(rgCovariate); + rgKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); + keysAndTablesMap.put(rgKeyManager, new HashMap()); + + qsCovariate = new QualityScoreCovariate(); + qsCovariate.initialize(RAC); + requiredCovariates.add(qsCovariate); + qsKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); + keysAndTablesMap.put(qsKeyManager, new HashMap()); + + cxCovariate = new ContextCovariate(); + cxCovariate.initialize(RAC); + optionalCovariates.add(cxCovariate); + cyCovariate = new CycleCovariate(); + cyCovariate.initialize(RAC); + optionalCovariates.add(cyCovariate); + cvKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); + keysAndTablesMap.put(cvKeyManager, new HashMap()); + + + for (Covariate cov : requiredCovariates) + requestedCovariates.add(cov); + for (Covariate cov : optionalCovariates) + requestedCovariates.add(cov); + + readCovariates = RecalDataManager.computeCovariates(read, requestedCovariates); + + for (int i=0; i> mapEntry : keysAndTablesMap.entrySet()) { + List keys = mapEntry.getKey().bitSetsFromAllKeys(bitKeys, EventType.BASE_SUBSTITUTION); + for (BitSet key : keys) + updateCovariateWithKeySet(mapEntry.getValue(), key, newDatum); + } + } + dataManager.generateEmpiricalQualities(1, QualityUtils.MAX_QUAL_SCORE); + + List quantizedQuals = new ArrayList(); + List qualCounts = new ArrayList(); + for (byte i = 0; i <= QualityUtils.MAX_QUAL_SCORE; i++) { + quantizedQuals.add(i); + qualCounts.add(1L); + } + QuantizationInfo quantizationInfo = new QuantizationInfo(quantizedQuals, qualCounts); + quantizationInfo.noQuantization(); + baseRecalibration = new BaseRecalibration(quantizationInfo, keysAndTablesMap, requestedCovariates); + + } + + + @Test(enabled=true) + public void testGoldStandardComparison() { + debugTables(); + for (int i = 0; i < read.getReadLength(); i++) { + BitSet [] bitKey = readCovariates.getKeySet(i, EventType.BASE_SUBSTITUTION); + Object [] objKey = buildObjectKey(bitKey); + byte v2 = baseRecalibration.performSequentialQualityCalculation(bitKey, EventType.BASE_SUBSTITUTION); + byte v1 = goldStandardSequentialCalculation(objKey); + Assert.assertEquals(v2, v1); + } + } + + private Object[] buildObjectKey(BitSet[] bitKey) { + Object[] key = new Object[bitKey.length]; + key[0] = rgCovariate.keyFromBitSet(bitKey[0]); + key[1] = qsCovariate.keyFromBitSet(bitKey[1]); + key[2] = cxCovariate.keyFromBitSet(bitKey[2]); + key[3] = cyCovariate.keyFromBitSet(bitKey[3]); + return key; + } + + private void debugTables() { + System.out.println("\nV1 Table\n"); + System.out.println("ReadGroup Table:"); + NestedHashMap nestedTable = dataManager.getCollapsedTable(0); + printNestedHashMap(nestedTable.data, ""); + System.out.println("\nQualityScore Table:"); + nestedTable = dataManager.getCollapsedTable(1); + printNestedHashMap(nestedTable.data, ""); + System.out.println("\nCovariates Table:"); + nestedTable = dataManager.getCollapsedTable(2); + printNestedHashMap(nestedTable.data, ""); + nestedTable = dataManager.getCollapsedTable(3); + printNestedHashMap(nestedTable.data, ""); + + + int i = 0; + System.out.println("\nV2 Table\n"); + for (Map.Entry> mapEntry : keysAndTablesMap.entrySet()) { + BQSRKeyManager keyManager = mapEntry.getKey(); + Map table = mapEntry.getValue(); + switch(i++) { + case 0 : + System.out.println("ReadGroup Table:"); + break; + case 1 : + System.out.println("QualityScore Table:"); + break; + case 2 : + System.out.println("Covariates Table:"); + break; + } + for (Map.Entry entry : table.entrySet()) { + BitSet key = entry.getKey(); + RecalDatum datum = entry.getValue(); + List keySet = keyManager.keySetFrom(key); + System.out.println(String.format("%s => %s", Utils.join(",", keySet), datum)); + } + System.out.println(); + } + + + } + + private static void printNestedHashMap(Map table, String output) { + for (Object key : table.keySet()) { + String ret = ""; + if (output.isEmpty()) + ret = "" + key; + else + ret = output + "," + key; + + Object next = table.get(key); + if (next instanceof org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum) + System.out.println(ret + " => " + next); + else + printNestedHashMap((Map) next, "" + ret); + } + } + + private void updateCovariateWithKeySet(final Map recalTable, final BitSet hashKey, final RecalDatum datum) { + RecalDatum previousDatum = recalTable.get(hashKey); // using the list of covariate values as a key, pick out the RecalDatum from the data HashMap + if (previousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it + recalTable.put(hashKey, datum.copy()); + else + previousDatum.combine(datum); // add one to the number of observations and potentially one to the number of mismatches + } + + /** + * Implements a serial recalibration of the reads using the combinational table. + * First, we perform a positional recalibration, and then a subsequent dinuc correction. + * + * Given the full recalibration table, we perform the following preprocessing steps: + * + * - calculate the global quality score shift across all data [DeltaQ] + * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift + * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual + * - The final shift equation is: + * + * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... ) + * + * @param key The list of Comparables that were calculated from the covariates + * @return A recalibrated quality score as a byte + */ + private byte goldStandardSequentialCalculation(final Object... key) { + + final byte qualFromRead = (byte) Integer.parseInt(key[1].toString()); + final Object[] readGroupCollapsedKey = new Object[1]; + final Object[] qualityScoreCollapsedKey = new Object[2]; + final Object[] covariateCollapsedKey = new Object[3]; + + // The global quality shift (over the read group only) + readGroupCollapsedKey[0] = key[0]; + final org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum globalRecalDatum = ((org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum) dataManager.getCollapsedTable(0).get(readGroupCollapsedKey)); + double globalDeltaQ = 0.0; + if (globalRecalDatum != null) { + final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality(); + final double aggregrateQReported = globalRecalDatum.getEstimatedQReported(); + globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported; + } + + // The shift in quality between reported and empirical + qualityScoreCollapsedKey[0] = key[0]; + qualityScoreCollapsedKey[1] = key[1]; + final org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum qReportedRecalDatum = ((org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum) dataManager.getCollapsedTable(1).get(qualityScoreCollapsedKey)); + double deltaQReported = 0.0; + if (qReportedRecalDatum != null) { + final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality(); + deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; + } + + // The shift in quality due to each covariate by itself in turn + double deltaQCovariates = 0.0; + double deltaQCovariateEmpirical; + covariateCollapsedKey[0] = key[0]; + covariateCollapsedKey[1] = key[1]; + for (int iii = 2; iii < key.length; iii++) { + covariateCollapsedKey[2] = key[iii]; // The given covariate + final org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum covariateRecalDatum = ((org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum) dataManager.getCollapsedTable(iii).get(covariateCollapsedKey)); + if (covariateRecalDatum != null) { + deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality(); + deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported)); + } + } + + final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; + return QualityUtils.boundQual((int) Math.round(newQuality), QualityUtils.MAX_QUAL_SCORE); + + // Verbose printouts used to validate with old recalibrator + //if(key.contains(null)) { + // System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d", + // qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte)); + //} + //else { + // System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d", + // key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) ); + //} + + //return newQualityByte; + } + + public static double calcEmpiricalQual(final int observations, final int errors) { + final int smoothing = 1; + final double doubleMismatches = (double) (errors + smoothing); + final double doubleObservations = (double) ( observations + smoothing ); + double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); + return Math.min(QualityUtils.MAX_QUAL_SCORE, empiricalQual); } } From 11001ab9a2fe7815ad514634ad298184bd735340 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 19 Apr 2012 11:32:10 -0400 Subject: [PATCH 4/6] Adding option to HaplotypeCaller to genotype the events on the chosen haplotypes as independent events. The filtered reads are now kept around so they can be passed to the variant annotations. Unfortunately the filtered reads aren't assigned a likelihood yet so they are all thrown in the Allele.NO_CALL bin. --- .../sting/gatk/walkers/annotator/RMSMappingQuality.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java index 97c15e747..ea7d6ae33 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java @@ -80,9 +80,8 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn } } - double rms = MathUtils.rms(qualities); - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.2f", rms)); + final Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%.2f", MathUtils.rms(qualities))); return map; } From 76a6e37f4f374e20a40e72199dd38a26348ff5b9 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 19 Apr 2012 11:45:56 -0400 Subject: [PATCH 5/6] Don't output callability metrics by default anymore; one can still have them output to the 'metrics' file (which is now @Hidden because they are really for GSA use). Added a TODO to move UG from @By reference to reads and rods once LIBS is cleaned up. --- .../gatk/walkers/genotyper/UnifiedGenotyper.java | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 9036e3a62..3cec931d0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -116,6 +116,8 @@ import java.util.*; @ReadFilters( {BadMateFilter.class, MappingQualityUnavailableFilter.class} ) @Reference(window=@Window(start=-200,stop=200)) @By(DataSource.REFERENCE) +// TODO -- When LocusIteratorByState gets cleaned up, we should enable multiple @By sources: +// TODO -- @By( {DataSource.READS, DataSource.REFERENCE_ORDERED_DATA} ) @Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250) public class UnifiedGenotyper extends LocusWalker, UnifiedGenotyper.UGStatistics> implements TreeReducible, AnnotatorCompatibleWalker { @@ -155,6 +157,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif @Argument(fullName = "debug_file", shortName = "debug_file", doc = "File to print all of the annotated and detailed debugging output", required = false) protected PrintStream verboseWriter = null; + @Hidden @Argument(fullName = "metrics_file", shortName = "metrics", doc = "File to print any relevant callability metrics output", required = false) protected PrintStream metricsWriter = null; @@ -347,14 +350,6 @@ public class UnifiedGenotyper extends LocusWalker, Unif } public void onTraversalDone(UGStatistics sum) { - logger.info(String.format("Visited bases %d", sum.nBasesVisited)); - logger.info(String.format("Callable bases %d", sum.nBasesCallable)); - logger.info(String.format("Confidently called bases %d", sum.nBasesCalledConfidently)); - logger.info(String.format("%% callable bases of all loci %3.3f", sum.percentCallableOfAll())); - logger.info(String.format("%% confidently called bases of all loci %3.3f", sum.percentCalledOfAll())); - logger.info(String.format("%% confidently called bases of callable loci %3.3f", sum.percentCalledOfCallable())); - logger.info(String.format("Actual calls made %d", sum.nCallsMade)); - if ( metricsWriter != null ) { metricsWriter.println(String.format("Visited bases %d", sum.nBasesVisited)); metricsWriter.println(String.format("Callable bases %d", sum.nBasesCallable)); From 79272c5e1523f1fd11fca1cce7f8617890cde3d3 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 19 Apr 2012 12:48:09 -0400 Subject: [PATCH 6/6] Thanks to Menachem for pointing out that the docs for genotyping_mode and output_mode were the same (and unclear). Fixed. --- .../gatk/walkers/genotyper/UnifiedArgumentCollection.java | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index d7174536e..f4ffbad91 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -55,13 +55,10 @@ public class UnifiedArgumentCollection { @Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false) public Double PCR_error = DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.DEFAULT_PCR_ERROR_RATE; - /** - * Specifies how to determine the alternate allele to use for genotyping - */ - @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false) + @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false) public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; - @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false) + @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false) public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; /**