From e440d0ce69ad9146d78f836348089e3980dff6d7 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 23 Apr 2012 14:43:42 -0400 Subject: [PATCH] BQSR triage #4 * fixed queue script plot file names * updated the ReadGroupCovariate to use the platform unit instead of sample + lane. * fixed plotting of marginalized reported qualities --- .../gatk/walkers/bqsr/AccuracyDatum.java | 52 -------------- .../sting/gatk/walkers/bqsr/BQSRGatherer.java | 20 +++--- .../gatk/walkers/bqsr/BQSRKeyManager.java | 2 +- .../gatk/walkers/bqsr/ReadGroupCovariate.java | 30 ++------ .../gatk/walkers/bqsr/RecalDataManager.java | 71 +++++++++++++------ .../sting/gatk/walkers/bqsr/RecalDatum.java | 4 ++ .../walkers/bqsr/RecalibrationReport.java | 3 - .../sting/utils/QualityUtils.java | 16 ++--- .../bqsr/ReadGroupCovariateUnitTest.java | 20 ++---- 9 files changed, 79 insertions(+), 139 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AccuracyDatum.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AccuracyDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AccuracyDatum.java deleted file mode 100644 index b66a81f34..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AccuracyDatum.java +++ /dev/null @@ -1,52 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; - -import org.broadinstitute.sting.utils.MathUtils; - -import java.util.LinkedList; -import java.util.List; - -/** - * Short one line description of the walker. - * - *

[Long description of the walker]

- * - * - *

Input

[Description of the Input]

- * - *

Output

[Description of the Output]

- * - *

Examples

- *
- *    java
- *      -jar GenomeAnalysisTK.jar
- *      -T [walker name]
- *  
- * - * @author Mauricio Carneiro - * @since 4/17/12 - */ -public class AccuracyDatum extends RecalDatum { - private final List accuracy = new LinkedList(); - private final List reportedQualities = new LinkedList(); - - public AccuracyDatum(final RecalDatum recalDatum, final byte originalQuality) { - super(recalDatum); - accuracy.add(calculateAccuracy(recalDatum, originalQuality)); - reportedQualities.add(originalQuality); - } - - public void combine(final RecalDatum recalDatum, final byte originalQuality) { - this.combine(recalDatum); - accuracy.add(calculateAccuracy(recalDatum, originalQuality)); - reportedQualities.add(originalQuality); - } - - @Override - public String toString() { - return String.format("%s,%.2f,%.2f", super.toString(), MathUtils.average(reportedQualities), MathUtils.average(accuracy)); - } - - private static double calculateAccuracy(final RecalDatum recalDatum, final byte originalQuality) { - return recalDatum.getEmpiricalQuality() - originalQuality; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java index d3be2d888..d91ddd221 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java @@ -65,15 +65,19 @@ public class BQSRGatherer extends Gatherer { if (generalReport == null) throw new ReviewedStingException(EMPTY_INPUT_LIST); - RecalibrationArgumentCollection RAC = generalReport.getRAC(); - if (RAC.recalibrationReport != null) { - RecalibrationReport originalReport = new RecalibrationReport(RAC.recalibrationReport); - RecalDataManager.generateRecalibrationPlot(RAC.RECAL_FILE, originalReport.getKeysAndTablesMap(), generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES); - } - else - RecalDataManager.generateRecalibrationPlot(RAC.RECAL_FILE, generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES); - generalReport.calculateEmpiricalAndQuantizedQualities(); + + RecalibrationArgumentCollection RAC = generalReport.getRAC(); + if (RAC.recalibrationReport != null && !RAC.NO_PLOTS) { + File recal_out = new File(output.getName() + ".original"); + RecalibrationReport originalReport = new RecalibrationReport(RAC.recalibrationReport); + RecalDataManager.generateRecalibrationPlot(recal_out, originalReport.getKeysAndTablesMap(), generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES); + } + else if (!RAC.NO_PLOTS) { + File recal_out = new File(output.getName() + ".recal"); + RecalDataManager.generateRecalibrationPlot(recal_out, generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES); + } + generalReport.output(outputFile); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java index 3ef25f9b8..1cb02f1c1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java @@ -184,7 +184,7 @@ public class BQSRKeyManager { * @return an object array with the values for each key */ public List keySetFrom(BitSet key) { - List objectKeys = new LinkedList(); + List objectKeys = new ArrayList(); for (RequiredCovariateInfo info : requiredCovariates) { BitSet covariateBitSet = extractBitSetFromKey(key, info.mask, info.bitsBefore); // get the covariate's bitset objectKeys.add(info.covariate.keyFromBitSet(covariateBitSet)); // convert the bitset to object using covariate's interface diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java index eb20f7779..579643f56 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java @@ -7,7 +7,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Arrays; import java.util.BitSet; import java.util.HashMap; -import java.util.regex.Pattern; /* * Copyright (c) 2009 The Broad Institute @@ -47,10 +46,6 @@ public class ReadGroupCovariate implements RequiredCovariate { private final HashMap readGroupLookupTable = new HashMap(); private final HashMap readGroupReverseLookupTable = new HashMap(); private short nextId = 0; - - private static final String LANE_TAG = "LN"; - private static final String SAMPLE_TAG = "SM"; - // Initialize any member variables using the command-line arguments passed to the walkers @Override @@ -105,31 +100,14 @@ public class ReadGroupCovariate implements RequiredCovariate { } /** - * Gather the sample and lane information from the read group record and return sample.lane - * - * If the bam file is missing the lane information, it tries to use the id regex standardized - * by the Broad Institute to extract the lane information - * - * If it fails to find either of the two pieces of information, will return the read group id instead. + * If the sample has a PU tag annotation, return that. If not, return the read group id. * * @param rg the read group record - * @return sample.lane or id if information is missing. + * @return platform unit or readgroup id */ private String readGroupValueFromRG(GATKSAMReadGroupRecord rg) { - String lane = rg.getLane(); // take the sample's lane from the read group lane tag - String sample = rg.getSample(); // take the sample's name from the read group sample tag - String value = rg.getId(); // initialize the return value with the read group ID in case we can't find the sample or the lane - - if (lane == null) { // if this bam doesn't have the lane annotation in the read group try to take it from the read group id - String [] splitID = rg.getId().split(Pattern.quote(".")); - if (splitID.length > 1) // if the id doesn't follow the BROAD defined regex (PU.LANE), fall back to the read group id - lane = splitID[splitID.length - 1]; // take the lane from the readgroup id - } - - if (sample != null && lane != null) - value = sample + "." + lane; // the read group covariate is sample.lane (where the inforamtion is available) - - return value; + String platformUnit = rg.getPlatformUnit(); + return platformUnit == null ? rg.getId() : platformUnit; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java index 177177e97..53e7c3f35 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java @@ -356,39 +356,65 @@ public class RecalDataManager { private static void writeCSV(PrintStream deltaTableFile, LinkedHashMap> map, String recalibrationMode, boolean printHeader) { final int QUALITY_SCORE_COVARIATE_INDEX = 1; - final Map deltaTable = new HashMap(); + final Map deltaTable = new HashMap(); + BQSRKeyManager deltaKeyManager = null; for (Map.Entry> tableEntry : map.entrySet()) { BQSRKeyManager keyManager = tableEntry.getKey(); - if (keyManager.getOptionalCovariates().size() > 0) { // only need the 'all covariates' table - Map table = tableEntry.getValue(); - + if (keyManager.getOptionalCovariates().size() > 0) { // initialize with the 'all covariates' table // create a key manager for the delta table List requiredCovariates = keyManager.getRequiredCovariates().subList(0, 1); // include the read group covariate as the only required covariate List optionalCovariates = keyManager.getRequiredCovariates().subList(1, 2); // include the quality score covariate as an optional covariate optionalCovariates.addAll(keyManager.getOptionalCovariates()); // include all optional covariates - BQSRKeyManager deltaKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initialize the key manager + deltaKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initialize the key manager + } + } + if (deltaKeyManager == null) + throw new ReviewedStingException ("Couldn't find the covariates table"); - // create delta table + boolean readyToPrint = false; + for (Map.Entry> tableEntry : map.entrySet()) { + BQSRKeyManager keyManager = tableEntry.getKey(); + + if (keyManager.getRequiredCovariates().size() == 2 && keyManager.getOptionalCovariates().isEmpty()) { // look for the QualityScore table + Map table = tableEntry.getValue(); + + // add the quality score table to the delta table + for (Map.Entry entry : table.entrySet()) { // go through every element in the covariates table to create the delta table + RecalDatum recalDatum = entry.getValue(); // the current element (recal datum) + + List covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key + List newCovs = new ArrayList(4); + newCovs.add(0, covs.get(0)); // replace the covariate value with the quality score + newCovs.add(1, covs.get(1)); + newCovs.add(2, "QualityScore"); // replace the covariate name with QualityScore (for the QualityScore covariate) + newCovs.add(3, covs.get(2)); + BitSet deltaKey = deltaKeyManager.bitSetFromKey(newCovs.toArray()); // create a new bitset key for the delta table + addToDeltaTable(deltaTable, deltaKey, recalDatum); // add this covariate to the delta table + } + } + + else if (keyManager.getOptionalCovariates().size() > 0) { // look for the optional covariates table + Map table = tableEntry.getValue(); + + // add the optional covariates to the delta table for (Map.Entry entry : table.entrySet()) { // go through every element in the covariates table to create the delta table RecalDatum recalDatum = entry.getValue(); // the current element (recal datum) List covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key - byte originalQuality = Byte.parseByte((String) covs.get(QUALITY_SCORE_COVARIATE_INDEX)); // save the original quality for accuracy calculation later on covs.remove(QUALITY_SCORE_COVARIATE_INDEX); // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS) BitSet deltaKey = deltaKeyManager.bitSetFromKey(covs.toArray()); // create a new bitset key for the delta table - addToDeltaTable(deltaTable, deltaKey, recalDatum, originalQuality); // add this covariate to the delta table - - covs.set(1, originalQuality); // replace the covariate value with the quality score - covs.set(2, "QualityScore"); // replace the covariate name with QualityScore (for the QualityScore covariate) - deltaKey = deltaKeyManager.bitSetFromKey(covs.toArray()); // create a new bitset key for the delta table - addToDeltaTable(deltaTable, deltaKey, recalDatum, originalQuality); // add this covariate to the delta table + addToDeltaTable(deltaTable, deltaKey, recalDatum); // add this covariate to the delta table } + readyToPrint = true; + } + + // output the csv file + if (readyToPrint) { - // print header if (printHeader) { List header = new LinkedList(); header.add("ReadGroup"); @@ -405,11 +431,11 @@ public class RecalDataManager { } // print each data line - for(Map.Entry deltaEntry : deltaTable.entrySet()) { + for(Map.Entry deltaEntry : deltaTable.entrySet()) { List deltaKeys = deltaKeyManager.keySetFrom(deltaEntry.getKey()); RecalDatum deltaDatum = deltaEntry.getValue(); deltaTableFile.print(Utils.join(",", deltaKeys)); - deltaTableFile.print("," + deltaDatum.toString()); + deltaTableFile.print("," + deltaDatum.stringForCSV()); deltaTableFile.println("," + recalibrationMode); } @@ -419,21 +445,20 @@ public class RecalDataManager { } /** - * Updates the current AccuracyDatum element in the delta table. + * Updates the current RecalDatum element in the delta table. * - * If it doesn't have an element yet, it creates an AccuracyDatum element and adds it to the delta table. + * If it doesn't have an element yet, it creates an RecalDatum element and adds it to the delta table. * * @param deltaTable the delta table * @param deltaKey the key to the table * @param recalDatum the recal datum to combine with the accuracyDatum element in the table - * @param originalQuality the quality score to we can calculate the accuracy for the accuracyDatum element */ - private static void addToDeltaTable(Map deltaTable, BitSet deltaKey, RecalDatum recalDatum, byte originalQuality) { - AccuracyDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key + private static void addToDeltaTable(Map deltaTable, BitSet deltaKey, RecalDatum recalDatum) { + RecalDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key if (deltaDatum == null) - deltaTable.put(deltaKey, new AccuracyDatum(recalDatum, originalQuality)); // if we don't have a key yet, create a new one with the same values as the curent datum + deltaTable.put(deltaKey, new RecalDatum(recalDatum)); // if we don't have a key yet, create a new one with the same values as the curent datum else - deltaDatum.combine(recalDatum, originalQuality); // if we do have a datum, combine it with this one. + deltaDatum.combine(recalDatum); // if we do have a datum, combine it with this one. } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java index ded3e619b..3eb3a3981 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java @@ -109,6 +109,10 @@ public class RecalDatum extends Datum { return String.format("%d,%d,%d", numObservations, numMismatches, (byte) Math.floor(getEmpiricalQuality())); } + public String stringForCSV() { + return String.format("%s,%d,%.2f", toString(), (byte) Math.floor(getEstimatedQReported()), getEmpiricalQuality() - getEstimatedQReported()); + } + private double calcExpectedErrors() { return (double) this.numObservations * qualToErrorProb(estimatedQReported); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java index bd1ba112b..febbc1280 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java @@ -293,9 +293,6 @@ public class RecalibrationReport { else if (primaryKey.equals("no_plots")) RAC.NO_PLOTS = Boolean.parseBoolean((String) value); - else if (primaryKey.equals("no_plots")) - RAC.NO_PLOTS = Boolean.parseBoolean((String) value); - else if (primaryKey.equals("recalibration_report")) RAC.recalibrationReport = (value == null) ? null : new File((String) value); } diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index f53b439da..4acc0e2c3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -9,7 +9,7 @@ import net.sf.samtools.SAMUtils; * @author Kiran Garimella */ public class QualityUtils { - public final static byte MAX_RECALIBRATED_Q_SCORE = 50; + public final static byte MAX_RECALIBRATED_Q_SCORE = 93; public final static byte MAX_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE; public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE); @@ -104,9 +104,8 @@ public class QualityUtils { */ static public byte probToQual(double prob, double eps) { double lp = Math.round(-10.0*Math.log10(1.0 - prob + eps)); - byte b = boundQual((int)lp); //System.out.printf("LP is %f, byte is %d%n", lp, b); - return b; + return boundQual((int)lp); } static public double phredScaleCorrectRate(double trueRate) { @@ -117,10 +116,6 @@ public class QualityUtils { return Math.abs(-10.0*Math.log10(errorRate)); } - static public double lodToPhredScaleErrorRate(double lod) { - return phredScaleErrorRate(1.0 / (Math.pow(10.0, lod) + 1.0)); - } - /** * Return a quality score, capped at max qual. * @@ -134,12 +129,11 @@ public class QualityUtils { /** * Returns an integer quality score bounded by 1 - maxQual. * - * @param qual - * @param maxQual - * @return + * @param qual the quality score + * @param maxQual the maximum quality + * @return the integer betwen 1 and maxqual. */ static public byte boundQual(int qual, byte maxQual) { - //return (byte) Math.min(qual, maxQual); return (byte) Math.max(Math.min(qual, maxQual), 1); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java index 6276022d1..f087ef0dd 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java @@ -28,27 +28,17 @@ public class ReadGroupCovariateUnitTest { public void testSingleRecord() { final String expected = "SAMPLE.1"; GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID"); - rg.setSample("SAMPLE"); - rg.setLane("1"); + rg.setPlatformUnit(expected); runTest(rg, expected); } @Test(enabled = true) - public void testMissingLane() { - final String expected = "SAMPLE.7"; - GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.7"); - rg.setSample("SAMPLE"); + public void testMissingPlatformUnit() { + final String expected = "MY.7"; + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(expected); runTest(rg, expected); } - - @Test(enabled = true) - public void testMissingSample() { - final String expected = "MY.ID"; - GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID"); - rg.setLane("1"); - runTest(rg, expected); - } - + private void runTest(GATKSAMReadGroupRecord rg, String expected) { GATKSAMRecord read = ReadUtils.createRandomRead(10); read.setReadGroup(rg);