Adding engine capability to quantize qualities.

* Added parameter -qq to quantize qualities using a recalibration report
   * Added options to quantize using the recalibration report quantization levels, new nLevels and no quantization.
   * Updated BQSR scripts to make use of the new parameters
This commit is contained in:
Mauricio Carneiro 2012-04-08 20:44:39 -04:00
parent c055752583
commit 87e6bea6c1
5 changed files with 66 additions and 20 deletions

View File

@ -26,7 +26,9 @@ package org.broadinstitute.sting.gatk;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.*;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceDictionary;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
@ -35,8 +37,6 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.*;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.filters.FilterManager;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
@ -45,6 +45,8 @@ import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.io.stubs.Stub;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.samples.SampleDBBuilder;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*;
@ -190,7 +192,7 @@ public class GenomeAnalysisEngine {
private BaseRecalibration baseRecalibration = null;
public BaseRecalibration getBaseRecalibration() { return baseRecalibration; }
public boolean hasBaseRecalibration() { return baseRecalibration != null; }
public void setBaseRecalibration(File recalFile) { baseRecalibration = new BaseRecalibration(recalFile); }
public void setBaseRecalibration(File recalFile, int quantizationLevels) { baseRecalibration = new BaseRecalibration(recalFile, quantizationLevels); }
/**
* Actually run the GATK with the specified walker.
@ -216,7 +218,7 @@ public class GenomeAnalysisEngine {
// if the use specified an input BQSR recalibration table then enable on the fly recalibration
if (this.getArguments().BQSR_RECAL_FILE != null)
setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE);
setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE, this.getArguments().quantizationLevels);
// Determine how the threads should be divided between CPU vs. IO.
determineThreadAllocation();

View File

@ -193,6 +193,16 @@ public class GATKArgumentCollection {
@Input(fullName="BQSR", shortName="BQSR", required=false, doc="Filename for the input covariates table recalibration .csv file which enables on the fly base quality score recalibration")
public File BQSR_RECAL_FILE = null; // BUGBUG: need a better argument name once we decide how BQSRs v1 and v2 will live in the code base simultaneously
/**
* Turns on the base quantization module. It requires a recalibration report (-BQSR).
*
* A value of 0 here means "do not quantize".
* Any value greater than zero will be used to recalculate the quantization using this many levels.
* Negative values do nothing (i.e. quantize using the recalibration report's quantization level -- same as not providing this parameter at all)
*/
@Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels.", required=false)
public int quantizationLevels = -1;
@Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false)
public byte defaultBaseQualities = -1;

View File

@ -19,13 +19,19 @@ import java.util.Map;
public class QuantizationInfo {
private List<Byte> quantizedQuals;
private List<Long> empiricalQualCounts;
int quantizationLevels;
public QuantizationInfo(List<Byte> quantizedQuals, List<Long> empiricalQualCounts) {
public QuantizationInfo(List<Byte> quantizedQuals, List<Long> empiricalQualCounts, int quantizationLevels) {
this.quantizedQuals = quantizedQuals;
this.empiricalQualCounts = empiricalQualCounts;
this.quantizationLevels = quantizationLevels;
}
public QuantizationInfo(List<Byte> quantizedQuals, List<Long> empiricalQualCounts) {
this(quantizedQuals, empiricalQualCounts, calculateQuantizationLevels(quantizedQuals));
}
public QuantizationInfo(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, int nLevels) {
public QuantizationInfo(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, int quantizationLevels) {
final Long [] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE+1]; // create a histogram with the empirical quality distribution
for (int i = 0; i < qualHistogram.length; i++)
qualHistogram[i] = 0L;
@ -46,7 +52,9 @@ public class QuantizationInfo {
qualHistogram[empiricalQual] += nObservations; // add the number of observations for every key
}
empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities
quantizeQualityScores(nLevels);
quantizeQualityScores(quantizationLevels);
this.quantizationLevels = quantizationLevels;
}
@ -55,10 +63,20 @@ public class QuantizationInfo {
quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC)
}
public void noQuantization() {
this.quantizationLevels = QualityUtils.MAX_QUAL_SCORE;
for (int i = 0; i < this.quantizationLevels; i++)
quantizedQuals.set(i, (byte) i);
}
public List<Byte> getQuantizedQuals() {
return quantizedQuals;
}
public int getQuantizationLevels() {
return quantizationLevels;
}
public GATKReportTable generateReportTable() {
GATKReportTable quantizedTable = new GATKReportTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map");
quantizedTable.addPrimaryKey(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
@ -71,4 +89,16 @@ public class QuantizationInfo {
}
return quantizedTable;
}
private static int calculateQuantizationLevels(List<Byte> quantizedQuals) {
byte lastByte = -1;
int quantizationLevels = 0;
for (byte q : quantizedQuals) {
if (q != lastByte) {
quantizationLevels++;
lastByte = q;
}
}
return quantizationLevels;
}
}

View File

@ -44,23 +44,24 @@ import java.util.*;
public class BaseRecalibration {
private QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done)
private LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap; // quick access reference to the read group table and its key manager
private ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // list of all covariates to be used in this calculation
private static String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code";
private static String TOO_MANY_KEYS_EXCEPTION = "There should only be one key for the RG collapsed table, something went wrong here";
/**
* Constructor using a GATK Report file
*
* @param RECAL_FILE a GATK Report file containing the recalibration information
*/
public BaseRecalibration(final File RECAL_FILE) {
public BaseRecalibration(final File RECAL_FILE, int quantizationLevels) {
RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE);
quantizationInfo = recalibrationReport.getQuantizationInfo();
keysAndTablesMap = recalibrationReport.getKeysAndTablesMap();
requestedCovariates = recalibrationReport.getRequestedCovariates();
quantizationInfo = recalibrationReport.getQuantizationInfo();
if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores
quantizationInfo.noQuantization();
else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wnats to use what's in the report.
quantizationInfo.quantizeQualityScores(quantizationLevels);
}
/**
@ -71,17 +72,17 @@ public class BaseRecalibration {
* @param read the read to recalibrate
*/
public void recalibrateRead(final GATKSAMRecord read) {
final ReadCovariates readCovariates = RecalDataManager.computeCovariates(read, requestedCovariates); // compute all covariates for the read
for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings
final ReadCovariates readCovariates = RecalDataManager.computeCovariates(read, requestedCovariates); // compute all covariates for the read
for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings
final byte[] originalQuals = read.getBaseQualities(errorModel);
final byte[] recalQuals = originalQuals.clone();
for (int offset = 0; offset < read.getReadLength(); offset++) { // recalibrate all bases in the read
for (int offset = 0; offset < read.getReadLength(); offset++) { // recalibrate all bases in the read
byte qualityScore = originalQuals[offset];
if (qualityScore > QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality)
final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel); // get the keyset for this base using the error model
qualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base
if (qualityScore > QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality)
final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel); // get the keyset for this base using the error model
qualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base
}
recalQuals[offset] = qualityScore;
}
@ -109,6 +110,9 @@ public class BaseRecalibration {
* @return A recalibrated quality score as a byte
*/
private byte performSequentialQualityCalculation(BitSet[] key, EventType errorModel) {
final String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code";
final String TOO_MANY_KEYS_EXCEPTION = "There should only be one key for the RG collapsed table, something went wrong here";
final byte qualFromRead = (byte) BitSetUtils.shortFrom(key[1]);
double globalDeltaQ = 0.0;

View File

@ -20,7 +20,7 @@ public class BaseRecalibrationUnitTest {
@Test(enabled=false)
public void testReadingReport() {
File csv = new File("public/testdata/exampleGATKREPORT.grp");
BaseRecalibration baseRecalibration = new BaseRecalibration(csv);
BaseRecalibration baseRecalibration = new BaseRecalibration(csv, -1);
GATKSAMRecord read = ReadUtils.createRandomRead(1000);
read.setReadGroup(new GATKSAMReadGroupRecord(new SAMReadGroupRecord("exampleBAM.bam.bam"), NGSPlatform.ILLUMINA));
baseRecalibration.recalibrateRead(read);