BQSR triage #4

* fixed queue script plot file names
   * updated the ReadGroupCovariate to use the platform unit instead of sample + lane.
   * fixed plotting of marginalized reported qualities
This commit is contained in:
Mauricio Carneiro 2012-04-23 14:43:42 -04:00
parent d6277b70d8
commit e440d0ce69
9 changed files with 79 additions and 139 deletions

View File

@ -1,52 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.MathUtils;
import java.util.LinkedList;
import java.util.List;
/**
* Short one line description of the walker.
*
* <p> [Long description of the walker] </p>
*
*
* <h2>Input</h2> <p> [Description of the Input] </p>
*
* <h2>Output</h2> <p> [Description of the Output] </p>
*
* <h2>Examples</h2>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* -T [walker name]
* </pre>
*
* @author Mauricio Carneiro
* @since 4/17/12
*/
public class AccuracyDatum extends RecalDatum {
private final List<Double> accuracy = new LinkedList<Double>();
private final List<Byte> reportedQualities = new LinkedList<Byte>();
public AccuracyDatum(final RecalDatum recalDatum, final byte originalQuality) {
super(recalDatum);
accuracy.add(calculateAccuracy(recalDatum, originalQuality));
reportedQualities.add(originalQuality);
}
public void combine(final RecalDatum recalDatum, final byte originalQuality) {
this.combine(recalDatum);
accuracy.add(calculateAccuracy(recalDatum, originalQuality));
reportedQualities.add(originalQuality);
}
@Override
public String toString() {
return String.format("%s,%.2f,%.2f", super.toString(), MathUtils.average(reportedQualities), MathUtils.average(accuracy));
}
private static double calculateAccuracy(final RecalDatum recalDatum, final byte originalQuality) {
return recalDatum.getEmpiricalQuality() - originalQuality;
}
}

View File

@ -65,15 +65,19 @@ public class BQSRGatherer extends Gatherer {
if (generalReport == null)
throw new ReviewedStingException(EMPTY_INPUT_LIST);
RecalibrationArgumentCollection RAC = generalReport.getRAC();
if (RAC.recalibrationReport != null) {
RecalibrationReport originalReport = new RecalibrationReport(RAC.recalibrationReport);
RecalDataManager.generateRecalibrationPlot(RAC.RECAL_FILE, originalReport.getKeysAndTablesMap(), generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES);
}
else
RecalDataManager.generateRecalibrationPlot(RAC.RECAL_FILE, generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES);
generalReport.calculateEmpiricalAndQuantizedQualities();
RecalibrationArgumentCollection RAC = generalReport.getRAC();
if (RAC.recalibrationReport != null && !RAC.NO_PLOTS) {
File recal_out = new File(output.getName() + ".original");
RecalibrationReport originalReport = new RecalibrationReport(RAC.recalibrationReport);
RecalDataManager.generateRecalibrationPlot(recal_out, originalReport.getKeysAndTablesMap(), generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES);
}
else if (!RAC.NO_PLOTS) {
File recal_out = new File(output.getName() + ".recal");
RecalDataManager.generateRecalibrationPlot(recal_out, generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES);
}
generalReport.output(outputFile);
}
}

View File

@ -184,7 +184,7 @@ public class BQSRKeyManager {
* @return an object array with the values for each key
*/
public List<Object> keySetFrom(BitSet key) {
List<Object> objectKeys = new LinkedList<Object>();
List<Object> objectKeys = new ArrayList<Object>();
for (RequiredCovariateInfo info : requiredCovariates) {
BitSet covariateBitSet = extractBitSetFromKey(key, info.mask, info.bitsBefore); // get the covariate's bitset
objectKeys.add(info.covariate.keyFromBitSet(covariateBitSet)); // convert the bitset to object using covariate's interface

View File

@ -7,7 +7,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap;
import java.util.regex.Pattern;
/*
* Copyright (c) 2009 The Broad Institute
@ -47,10 +46,6 @@ public class ReadGroupCovariate implements RequiredCovariate {
private final HashMap<String, Short> readGroupLookupTable = new HashMap<String, Short>();
private final HashMap<Short, String> readGroupReverseLookupTable = new HashMap<Short, String>();
private short nextId = 0;
private static final String LANE_TAG = "LN";
private static final String SAMPLE_TAG = "SM";
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
@ -105,31 +100,14 @@ public class ReadGroupCovariate implements RequiredCovariate {
}
/**
* Gather the sample and lane information from the read group record and return sample.lane
*
* If the bam file is missing the lane information, it tries to use the id regex standardized
* by the Broad Institute to extract the lane information
*
* If it fails to find either of the two pieces of information, will return the read group id instead.
* If the sample has a PU tag annotation, return that. If not, return the read group id.
*
* @param rg the read group record
* @return sample.lane or id if information is missing.
* @return platform unit or readgroup id
*/
private String readGroupValueFromRG(GATKSAMReadGroupRecord rg) {
String lane = rg.getLane(); // take the sample's lane from the read group lane tag
String sample = rg.getSample(); // take the sample's name from the read group sample tag
String value = rg.getId(); // initialize the return value with the read group ID in case we can't find the sample or the lane
if (lane == null) { // if this bam doesn't have the lane annotation in the read group try to take it from the read group id
String [] splitID = rg.getId().split(Pattern.quote("."));
if (splitID.length > 1) // if the id doesn't follow the BROAD defined regex (PU.LANE), fall back to the read group id
lane = splitID[splitID.length - 1]; // take the lane from the readgroup id
}
if (sample != null && lane != null)
value = sample + "." + lane; // the read group covariate is sample.lane (where the inforamtion is available)
return value;
String platformUnit = rg.getPlatformUnit();
return platformUnit == null ? rg.getId() : platformUnit;
}
}

View File

@ -356,39 +356,65 @@ public class RecalDataManager {
private static void writeCSV(PrintStream deltaTableFile, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> map, String recalibrationMode, boolean printHeader) {
final int QUALITY_SCORE_COVARIATE_INDEX = 1;
final Map<BitSet, AccuracyDatum> deltaTable = new HashMap<BitSet, AccuracyDatum>();
final Map<BitSet, RecalDatum> deltaTable = new HashMap<BitSet, RecalDatum>();
BQSRKeyManager deltaKeyManager = null;
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> tableEntry : map.entrySet()) {
BQSRKeyManager keyManager = tableEntry.getKey();
if (keyManager.getOptionalCovariates().size() > 0) { // only need the 'all covariates' table
Map<BitSet, RecalDatum> table = tableEntry.getValue();
if (keyManager.getOptionalCovariates().size() > 0) { // initialize with the 'all covariates' table
// create a key manager for the delta table
List<Covariate> requiredCovariates = keyManager.getRequiredCovariates().subList(0, 1); // include the read group covariate as the only required covariate
List<Covariate> optionalCovariates = keyManager.getRequiredCovariates().subList(1, 2); // include the quality score covariate as an optional covariate
optionalCovariates.addAll(keyManager.getOptionalCovariates()); // include all optional covariates
BQSRKeyManager deltaKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initialize the key manager
deltaKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initialize the key manager
}
}
if (deltaKeyManager == null)
throw new ReviewedStingException ("Couldn't find the covariates table");
// create delta table
boolean readyToPrint = false;
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> tableEntry : map.entrySet()) {
BQSRKeyManager keyManager = tableEntry.getKey();
if (keyManager.getRequiredCovariates().size() == 2 && keyManager.getOptionalCovariates().isEmpty()) { // look for the QualityScore table
Map<BitSet, RecalDatum> table = tableEntry.getValue();
// add the quality score table to the delta table
for (Map.Entry<BitSet, RecalDatum> entry : table.entrySet()) { // go through every element in the covariates table to create the delta table
RecalDatum recalDatum = entry.getValue(); // the current element (recal datum)
List<Object> covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key
List<Object> newCovs = new ArrayList<Object>(4);
newCovs.add(0, covs.get(0)); // replace the covariate value with the quality score
newCovs.add(1, covs.get(1));
newCovs.add(2, "QualityScore"); // replace the covariate name with QualityScore (for the QualityScore covariate)
newCovs.add(3, covs.get(2));
BitSet deltaKey = deltaKeyManager.bitSetFromKey(newCovs.toArray()); // create a new bitset key for the delta table
addToDeltaTable(deltaTable, deltaKey, recalDatum); // add this covariate to the delta table
}
}
else if (keyManager.getOptionalCovariates().size() > 0) { // look for the optional covariates table
Map<BitSet, RecalDatum> table = tableEntry.getValue();
// add the optional covariates to the delta table
for (Map.Entry<BitSet, RecalDatum> entry : table.entrySet()) { // go through every element in the covariates table to create the delta table
RecalDatum recalDatum = entry.getValue(); // the current element (recal datum)
List<Object> covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key
byte originalQuality = Byte.parseByte((String) covs.get(QUALITY_SCORE_COVARIATE_INDEX)); // save the original quality for accuracy calculation later on
covs.remove(QUALITY_SCORE_COVARIATE_INDEX); // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS)
BitSet deltaKey = deltaKeyManager.bitSetFromKey(covs.toArray()); // create a new bitset key for the delta table
addToDeltaTable(deltaTable, deltaKey, recalDatum, originalQuality); // add this covariate to the delta table
covs.set(1, originalQuality); // replace the covariate value with the quality score
covs.set(2, "QualityScore"); // replace the covariate name with QualityScore (for the QualityScore covariate)
deltaKey = deltaKeyManager.bitSetFromKey(covs.toArray()); // create a new bitset key for the delta table
addToDeltaTable(deltaTable, deltaKey, recalDatum, originalQuality); // add this covariate to the delta table
addToDeltaTable(deltaTable, deltaKey, recalDatum); // add this covariate to the delta table
}
readyToPrint = true;
}
// output the csv file
if (readyToPrint) {
// print header
if (printHeader) {
List<String> header = new LinkedList<String>();
header.add("ReadGroup");
@ -405,11 +431,11 @@ public class RecalDataManager {
}
// print each data line
for(Map.Entry<BitSet, AccuracyDatum> deltaEntry : deltaTable.entrySet()) {
for(Map.Entry<BitSet, RecalDatum> deltaEntry : deltaTable.entrySet()) {
List<Object> deltaKeys = deltaKeyManager.keySetFrom(deltaEntry.getKey());
RecalDatum deltaDatum = deltaEntry.getValue();
deltaTableFile.print(Utils.join(",", deltaKeys));
deltaTableFile.print("," + deltaDatum.toString());
deltaTableFile.print("," + deltaDatum.stringForCSV());
deltaTableFile.println("," + recalibrationMode);
}
@ -419,21 +445,20 @@ public class RecalDataManager {
}
/**
* Updates the current AccuracyDatum element in the delta table.
* Updates the current RecalDatum element in the delta table.
*
* If it doesn't have an element yet, it creates an AccuracyDatum element and adds it to the delta table.
* If it doesn't have an element yet, it creates an RecalDatum element and adds it to the delta table.
*
* @param deltaTable the delta table
* @param deltaKey the key to the table
* @param recalDatum the recal datum to combine with the accuracyDatum element in the table
* @param originalQuality the quality score to we can calculate the accuracy for the accuracyDatum element
*/
private static void addToDeltaTable(Map<BitSet, AccuracyDatum> deltaTable, BitSet deltaKey, RecalDatum recalDatum, byte originalQuality) {
AccuracyDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key
private static void addToDeltaTable(Map<BitSet, RecalDatum> deltaTable, BitSet deltaKey, RecalDatum recalDatum) {
RecalDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key
if (deltaDatum == null)
deltaTable.put(deltaKey, new AccuracyDatum(recalDatum, originalQuality)); // if we don't have a key yet, create a new one with the same values as the curent datum
deltaTable.put(deltaKey, new RecalDatum(recalDatum)); // if we don't have a key yet, create a new one with the same values as the curent datum
else
deltaDatum.combine(recalDatum, originalQuality); // if we do have a datum, combine it with this one.
deltaDatum.combine(recalDatum); // if we do have a datum, combine it with this one.
}

View File

@ -109,6 +109,10 @@ public class RecalDatum extends Datum {
return String.format("%d,%d,%d", numObservations, numMismatches, (byte) Math.floor(getEmpiricalQuality()));
}
public String stringForCSV() {
return String.format("%s,%d,%.2f", toString(), (byte) Math.floor(getEstimatedQReported()), getEmpiricalQuality() - getEstimatedQReported());
}
private double calcExpectedErrors() {
return (double) this.numObservations * qualToErrorProb(estimatedQReported);

View File

@ -293,9 +293,6 @@ public class RecalibrationReport {
else if (primaryKey.equals("no_plots"))
RAC.NO_PLOTS = Boolean.parseBoolean((String) value);
else if (primaryKey.equals("no_plots"))
RAC.NO_PLOTS = Boolean.parseBoolean((String) value);
else if (primaryKey.equals("recalibration_report"))
RAC.recalibrationReport = (value == null) ? null : new File((String) value);
}

View File

@ -9,7 +9,7 @@ import net.sf.samtools.SAMUtils;
* @author Kiran Garimella
*/
public class QualityUtils {
public final static byte MAX_RECALIBRATED_Q_SCORE = 50;
public final static byte MAX_RECALIBRATED_Q_SCORE = 93;
public final static byte MAX_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE;
public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE);
@ -104,9 +104,8 @@ public class QualityUtils {
*/
static public byte probToQual(double prob, double eps) {
double lp = Math.round(-10.0*Math.log10(1.0 - prob + eps));
byte b = boundQual((int)lp);
//System.out.printf("LP is %f, byte is %d%n", lp, b);
return b;
return boundQual((int)lp);
}
static public double phredScaleCorrectRate(double trueRate) {
@ -117,10 +116,6 @@ public class QualityUtils {
return Math.abs(-10.0*Math.log10(errorRate));
}
static public double lodToPhredScaleErrorRate(double lod) {
return phredScaleErrorRate(1.0 / (Math.pow(10.0, lod) + 1.0));
}
/**
* Return a quality score, capped at max qual.
*
@ -134,12 +129,11 @@ public class QualityUtils {
/**
* Returns an integer quality score bounded by 1 - maxQual.
*
* @param qual
* @param maxQual
* @return
* @param qual the quality score
* @param maxQual the maximum quality
* @return the integer betwen 1 and maxqual.
*/
static public byte boundQual(int qual, byte maxQual) {
//return (byte) Math.min(qual, maxQual);
return (byte) Math.max(Math.min(qual, maxQual), 1);
}
}

View File

@ -28,27 +28,17 @@ public class ReadGroupCovariateUnitTest {
public void testSingleRecord() {
final String expected = "SAMPLE.1";
GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID");
rg.setSample("SAMPLE");
rg.setLane("1");
rg.setPlatformUnit(expected);
runTest(rg, expected);
}
@Test(enabled = true)
public void testMissingLane() {
final String expected = "SAMPLE.7";
GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.7");
rg.setSample("SAMPLE");
public void testMissingPlatformUnit() {
final String expected = "MY.7";
GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(expected);
runTest(rg, expected);
}
@Test(enabled = true)
public void testMissingSample() {
final String expected = "MY.ID";
GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID");
rg.setLane("1");
runTest(rg, expected);
}
private void runTest(GATKSAMReadGroupRecord rg, String expected) {
GATKSAMRecord read = ReadUtils.createRandomRead(10);
read.setReadGroup(rg);