Adding engine capability to quantize qualities.

* Added parameter -qq to quantize qualities using a recalibration report
   * Added options to quantize using the recalibration report quantization levels, new nLevels and no quantization.
   * Updated BQSR scripts to make use of the new parameters
This commit is contained in:
Mauricio Carneiro 2012-04-08 20:44:39 -04:00
parent c055752583
commit 87e6bea6c1
5 changed files with 66 additions and 20 deletions

View File

@ -26,7 +26,9 @@ package org.broadinstitute.sting.gatk;
import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.*; import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceDictionary;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broad.tribble.Feature; import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.commandline.*;
@ -35,8 +37,6 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.datasources.reads.*;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.executive.MicroScheduler; import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.gatk.filters.FilterManager;
import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.ReadFilter;
@ -45,6 +45,8 @@ import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.io.stubs.Stub; import org.broadinstitute.sting.gatk.io.stubs.Stub;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.samples.SampleDBBuilder; import org.broadinstitute.sting.gatk.samples.SampleDBBuilder;
import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.*;
@ -190,7 +192,7 @@ public class GenomeAnalysisEngine {
private BaseRecalibration baseRecalibration = null; private BaseRecalibration baseRecalibration = null;
public BaseRecalibration getBaseRecalibration() { return baseRecalibration; } public BaseRecalibration getBaseRecalibration() { return baseRecalibration; }
public boolean hasBaseRecalibration() { return baseRecalibration != null; } public boolean hasBaseRecalibration() { return baseRecalibration != null; }
public void setBaseRecalibration(File recalFile) { baseRecalibration = new BaseRecalibration(recalFile); } public void setBaseRecalibration(File recalFile, int quantizationLevels) { baseRecalibration = new BaseRecalibration(recalFile, quantizationLevels); }
/** /**
* Actually run the GATK with the specified walker. * Actually run the GATK with the specified walker.
@ -216,7 +218,7 @@ public class GenomeAnalysisEngine {
// if the use specified an input BQSR recalibration table then enable on the fly recalibration // if the use specified an input BQSR recalibration table then enable on the fly recalibration
if (this.getArguments().BQSR_RECAL_FILE != null) if (this.getArguments().BQSR_RECAL_FILE != null)
setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE); setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE, this.getArguments().quantizationLevels);
// Determine how the threads should be divided between CPU vs. IO. // Determine how the threads should be divided between CPU vs. IO.
determineThreadAllocation(); determineThreadAllocation();

View File

@ -193,6 +193,16 @@ public class GATKArgumentCollection {
@Input(fullName="BQSR", shortName="BQSR", required=false, doc="Filename for the input covariates table recalibration .csv file which enables on the fly base quality score recalibration") @Input(fullName="BQSR", shortName="BQSR", required=false, doc="Filename for the input covariates table recalibration .csv file which enables on the fly base quality score recalibration")
public File BQSR_RECAL_FILE = null; // BUGBUG: need a better argument name once we decide how BQSRs v1 and v2 will live in the code base simultaneously public File BQSR_RECAL_FILE = null; // BUGBUG: need a better argument name once we decide how BQSRs v1 and v2 will live in the code base simultaneously
/**
* Turns on the base quantization module. It requires a recalibration report (-BQSR).
*
* A value of 0 here means "do not quantize".
* Any value greater than zero will be used to recalculate the quantization using this many levels.
* Negative values do nothing (i.e. quantize using the recalibration report's quantization level -- same as not providing this parameter at all)
*/
@Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels.", required=false)
public int quantizationLevels = -1;
@Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false) @Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false)
public byte defaultBaseQualities = -1; public byte defaultBaseQualities = -1;

View File

@ -19,13 +19,19 @@ import java.util.Map;
public class QuantizationInfo { public class QuantizationInfo {
private List<Byte> quantizedQuals; private List<Byte> quantizedQuals;
private List<Long> empiricalQualCounts; private List<Long> empiricalQualCounts;
int quantizationLevels;
public QuantizationInfo(List<Byte> quantizedQuals, List<Long> empiricalQualCounts) { public QuantizationInfo(List<Byte> quantizedQuals, List<Long> empiricalQualCounts, int quantizationLevels) {
this.quantizedQuals = quantizedQuals; this.quantizedQuals = quantizedQuals;
this.empiricalQualCounts = empiricalQualCounts; this.empiricalQualCounts = empiricalQualCounts;
this.quantizationLevels = quantizationLevels;
}
public QuantizationInfo(List<Byte> quantizedQuals, List<Long> empiricalQualCounts) {
this(quantizedQuals, empiricalQualCounts, calculateQuantizationLevels(quantizedQuals));
} }
public QuantizationInfo(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, int nLevels) { public QuantizationInfo(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, int quantizationLevels) {
final Long [] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE+1]; // create a histogram with the empirical quality distribution final Long [] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE+1]; // create a histogram with the empirical quality distribution
for (int i = 0; i < qualHistogram.length; i++) for (int i = 0; i < qualHistogram.length; i++)
qualHistogram[i] = 0L; qualHistogram[i] = 0L;
@ -46,7 +52,9 @@ public class QuantizationInfo {
qualHistogram[empiricalQual] += nObservations; // add the number of observations for every key qualHistogram[empiricalQual] += nObservations; // add the number of observations for every key
} }
empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities
quantizeQualityScores(nLevels); quantizeQualityScores(quantizationLevels);
this.quantizationLevels = quantizationLevels;
} }
@ -55,10 +63,20 @@ public class QuantizationInfo {
quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC) quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC)
} }
public void noQuantization() {
this.quantizationLevels = QualityUtils.MAX_QUAL_SCORE;
for (int i = 0; i < this.quantizationLevels; i++)
quantizedQuals.set(i, (byte) i);
}
public List<Byte> getQuantizedQuals() { public List<Byte> getQuantizedQuals() {
return quantizedQuals; return quantizedQuals;
} }
public int getQuantizationLevels() {
return quantizationLevels;
}
public GATKReportTable generateReportTable() { public GATKReportTable generateReportTable() {
GATKReportTable quantizedTable = new GATKReportTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map"); GATKReportTable quantizedTable = new GATKReportTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map");
quantizedTable.addPrimaryKey(RecalDataManager.QUALITY_SCORE_COLUMN_NAME); quantizedTable.addPrimaryKey(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
@ -71,4 +89,16 @@ public class QuantizationInfo {
} }
return quantizedTable; return quantizedTable;
} }
private static int calculateQuantizationLevels(List<Byte> quantizedQuals) {
byte lastByte = -1;
int quantizationLevels = 0;
for (byte q : quantizedQuals) {
if (q != lastByte) {
quantizationLevels++;
lastByte = q;
}
}
return quantizationLevels;
}
} }

View File

@ -44,23 +44,24 @@ import java.util.*;
public class BaseRecalibration { public class BaseRecalibration {
private QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done) private QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done)
private LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap; // quick access reference to the read group table and its key manager private LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap; // quick access reference to the read group table and its key manager
private ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // list of all covariates to be used in this calculation private ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // list of all covariates to be used in this calculation
private static String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code";
private static String TOO_MANY_KEYS_EXCEPTION = "There should only be one key for the RG collapsed table, something went wrong here";
/** /**
* Constructor using a GATK Report file * Constructor using a GATK Report file
* *
* @param RECAL_FILE a GATK Report file containing the recalibration information * @param RECAL_FILE a GATK Report file containing the recalibration information
*/ */
public BaseRecalibration(final File RECAL_FILE) { public BaseRecalibration(final File RECAL_FILE, int quantizationLevels) {
RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE); RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE);
quantizationInfo = recalibrationReport.getQuantizationInfo();
keysAndTablesMap = recalibrationReport.getKeysAndTablesMap(); keysAndTablesMap = recalibrationReport.getKeysAndTablesMap();
requestedCovariates = recalibrationReport.getRequestedCovariates(); requestedCovariates = recalibrationReport.getRequestedCovariates();
quantizationInfo = recalibrationReport.getQuantizationInfo();
if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores
quantizationInfo.noQuantization();
else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wnats to use what's in the report.
quantizationInfo.quantizeQualityScores(quantizationLevels);
} }
/** /**
@ -71,17 +72,17 @@ public class BaseRecalibration {
* @param read the read to recalibrate * @param read the read to recalibrate
*/ */
public void recalibrateRead(final GATKSAMRecord read) { public void recalibrateRead(final GATKSAMRecord read) {
final ReadCovariates readCovariates = RecalDataManager.computeCovariates(read, requestedCovariates); // compute all covariates for the read final ReadCovariates readCovariates = RecalDataManager.computeCovariates(read, requestedCovariates); // compute all covariates for the read
for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings
final byte[] originalQuals = read.getBaseQualities(errorModel); final byte[] originalQuals = read.getBaseQualities(errorModel);
final byte[] recalQuals = originalQuals.clone(); final byte[] recalQuals = originalQuals.clone();
for (int offset = 0; offset < read.getReadLength(); offset++) { // recalibrate all bases in the read for (int offset = 0; offset < read.getReadLength(); offset++) { // recalibrate all bases in the read
byte qualityScore = originalQuals[offset]; byte qualityScore = originalQuals[offset];
if (qualityScore > QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) if (qualityScore > QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality)
final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel); // get the keyset for this base using the error model final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel); // get the keyset for this base using the error model
qualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base qualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base
} }
recalQuals[offset] = qualityScore; recalQuals[offset] = qualityScore;
} }
@ -109,6 +110,9 @@ public class BaseRecalibration {
* @return A recalibrated quality score as a byte * @return A recalibrated quality score as a byte
*/ */
private byte performSequentialQualityCalculation(BitSet[] key, EventType errorModel) { private byte performSequentialQualityCalculation(BitSet[] key, EventType errorModel) {
final String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code";
final String TOO_MANY_KEYS_EXCEPTION = "There should only be one key for the RG collapsed table, something went wrong here";
final byte qualFromRead = (byte) BitSetUtils.shortFrom(key[1]); final byte qualFromRead = (byte) BitSetUtils.shortFrom(key[1]);
double globalDeltaQ = 0.0; double globalDeltaQ = 0.0;

View File

@ -20,7 +20,7 @@ public class BaseRecalibrationUnitTest {
@Test(enabled=false) @Test(enabled=false)
public void testReadingReport() { public void testReadingReport() {
File csv = new File("public/testdata/exampleGATKREPORT.grp"); File csv = new File("public/testdata/exampleGATKREPORT.grp");
BaseRecalibration baseRecalibration = new BaseRecalibration(csv); BaseRecalibration baseRecalibration = new BaseRecalibration(csv, -1);
GATKSAMRecord read = ReadUtils.createRandomRead(1000); GATKSAMRecord read = ReadUtils.createRandomRead(1000);
read.setReadGroup(new GATKSAMReadGroupRecord(new SAMReadGroupRecord("exampleBAM.bam.bam"), NGSPlatform.ILLUMINA)); read.setReadGroup(new GATKSAMReadGroupRecord(new SAMReadGroupRecord("exampleBAM.bam.bam"), NGSPlatform.ILLUMINA));
baseRecalibration.recalibrateRead(read); baseRecalibration.recalibrateRead(read);