diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index aaf7d1e6e..039ca565a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -26,7 +26,9 @@ package org.broadinstitute.sting.gatk; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.samtools.*; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMSequenceDictionary; import org.apache.log4j.Logger; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; @@ -35,8 +37,6 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.executive.MicroScheduler; import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.gatk.filters.ReadFilter; @@ -45,6 +45,8 @@ import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.stubs.Stub; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; +import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.samples.SampleDBBuilder; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.*; @@ -190,7 +192,7 @@ public class GenomeAnalysisEngine { private BaseRecalibration baseRecalibration = null; public BaseRecalibration getBaseRecalibration() { return baseRecalibration; } public boolean hasBaseRecalibration() { return baseRecalibration != null; } - public void setBaseRecalibration(File recalFile) { baseRecalibration = new BaseRecalibration(recalFile); } + public void setBaseRecalibration(File recalFile, int quantizationLevels) { baseRecalibration = new BaseRecalibration(recalFile, quantizationLevels); } /** * Actually run the GATK with the specified walker. @@ -216,7 +218,7 @@ public class GenomeAnalysisEngine { // if the use specified an input BQSR recalibration table then enable on the fly recalibration if (this.getArguments().BQSR_RECAL_FILE != null) - setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE); + setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE, this.getArguments().quantizationLevels); // Determine how the threads should be divided between CPU vs. IO. determineThreadAllocation(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 670f04bda..3a1408d59 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -193,6 +193,16 @@ public class GATKArgumentCollection { @Input(fullName="BQSR", shortName="BQSR", required=false, doc="Filename for the input covariates table recalibration .csv file which enables on the fly base quality score recalibration") public File BQSR_RECAL_FILE = null; // BUGBUG: need a better argument name once we decide how BQSRs v1 and v2 will live in the code base simultaneously + /** + * Turns on the base quantization module. It requires a recalibration report (-BQSR). + * + * A value of 0 here means "do not quantize". + * Any value greater than zero will be used to recalculate the quantization using this many levels. + * Negative values do nothing (i.e. quantize using the recalibration report's quantization level -- same as not providing this parameter at all) + */ + @Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels.", required=false) + public int quantizationLevels = -1; + @Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false) public byte defaultBaseQualities = -1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java index 393230ee4..afe847583 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java @@ -19,13 +19,19 @@ import java.util.Map; public class QuantizationInfo { private List quantizedQuals; private List empiricalQualCounts; + int quantizationLevels; - public QuantizationInfo(List quantizedQuals, List empiricalQualCounts) { + public QuantizationInfo(List quantizedQuals, List empiricalQualCounts, int quantizationLevels) { this.quantizedQuals = quantizedQuals; this.empiricalQualCounts = empiricalQualCounts; + this.quantizationLevels = quantizationLevels; + } + + public QuantizationInfo(List quantizedQuals, List empiricalQualCounts) { + this(quantizedQuals, empiricalQualCounts, calculateQuantizationLevels(quantizedQuals)); } - public QuantizationInfo(Map> keysAndTablesMap, int nLevels) { + public QuantizationInfo(Map> keysAndTablesMap, int quantizationLevels) { final Long [] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE+1]; // create a histogram with the empirical quality distribution for (int i = 0; i < qualHistogram.length; i++) qualHistogram[i] = 0L; @@ -46,7 +52,9 @@ public class QuantizationInfo { qualHistogram[empiricalQual] += nObservations; // add the number of observations for every key } empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities - quantizeQualityScores(nLevels); + quantizeQualityScores(quantizationLevels); + + this.quantizationLevels = quantizationLevels; } @@ -55,10 +63,20 @@ public class QuantizationInfo { quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC) } + public void noQuantization() { + this.quantizationLevels = QualityUtils.MAX_QUAL_SCORE; + for (int i = 0; i < this.quantizationLevels; i++) + quantizedQuals.set(i, (byte) i); + } + public List getQuantizedQuals() { return quantizedQuals; } + public int getQuantizationLevels() { + return quantizationLevels; + } + public GATKReportTable generateReportTable() { GATKReportTable quantizedTable = new GATKReportTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map"); quantizedTable.addPrimaryKey(RecalDataManager.QUALITY_SCORE_COLUMN_NAME); @@ -71,4 +89,16 @@ public class QuantizationInfo { } return quantizedTable; } + + private static int calculateQuantizationLevels(List quantizedQuals) { + byte lastByte = -1; + int quantizationLevels = 0; + for (byte q : quantizedQuals) { + if (q != lastByte) { + quantizationLevels++; + lastByte = q; + } + } + return quantizationLevels; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 2411a7d04..3a5b07e58 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -44,23 +44,24 @@ import java.util.*; public class BaseRecalibration { private QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done) private LinkedHashMap> keysAndTablesMap; // quick access reference to the read group table and its key manager - private ArrayList requestedCovariates = new ArrayList(); // list of all covariates to be used in this calculation - private static String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code"; - private static String TOO_MANY_KEYS_EXCEPTION = "There should only be one key for the RG collapsed table, something went wrong here"; /** * Constructor using a GATK Report file * * @param RECAL_FILE a GATK Report file containing the recalibration information */ - public BaseRecalibration(final File RECAL_FILE) { + public BaseRecalibration(final File RECAL_FILE, int quantizationLevels) { RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE); - quantizationInfo = recalibrationReport.getQuantizationInfo(); keysAndTablesMap = recalibrationReport.getKeysAndTablesMap(); requestedCovariates = recalibrationReport.getRequestedCovariates(); + quantizationInfo = recalibrationReport.getQuantizationInfo(); + if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores + quantizationInfo.noQuantization(); + else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wnats to use what's in the report. + quantizationInfo.quantizeQualityScores(quantizationLevels); } /** @@ -71,17 +72,17 @@ public class BaseRecalibration { * @param read the read to recalibrate */ public void recalibrateRead(final GATKSAMRecord read) { - final ReadCovariates readCovariates = RecalDataManager.computeCovariates(read, requestedCovariates); // compute all covariates for the read - for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings + final ReadCovariates readCovariates = RecalDataManager.computeCovariates(read, requestedCovariates); // compute all covariates for the read + for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings final byte[] originalQuals = read.getBaseQualities(errorModel); final byte[] recalQuals = originalQuals.clone(); - for (int offset = 0; offset < read.getReadLength(); offset++) { // recalibrate all bases in the read + for (int offset = 0; offset < read.getReadLength(); offset++) { // recalibrate all bases in the read byte qualityScore = originalQuals[offset]; - if (qualityScore > QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) - final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel); // get the keyset for this base using the error model - qualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base + if (qualityScore > QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) + final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel); // get the keyset for this base using the error model + qualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base } recalQuals[offset] = qualityScore; } @@ -109,6 +110,9 @@ public class BaseRecalibration { * @return A recalibrated quality score as a byte */ private byte performSequentialQualityCalculation(BitSet[] key, EventType errorModel) { + final String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code"; + final String TOO_MANY_KEYS_EXCEPTION = "There should only be one key for the RG collapsed table, something went wrong here"; + final byte qualFromRead = (byte) BitSetUtils.shortFrom(key[1]); double globalDeltaQ = 0.0; diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java index a372ef3f0..f8f1ead9b 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java @@ -20,7 +20,7 @@ public class BaseRecalibrationUnitTest { @Test(enabled=false) public void testReadingReport() { File csv = new File("public/testdata/exampleGATKREPORT.grp"); - BaseRecalibration baseRecalibration = new BaseRecalibration(csv); + BaseRecalibration baseRecalibration = new BaseRecalibration(csv, -1); GATKSAMRecord read = ReadUtils.createRandomRead(1000); read.setReadGroup(new GATKSAMReadGroupRecord(new SAMReadGroupRecord("exampleBAM.bam.bam"), NGSPlatform.ILLUMINA)); baseRecalibration.recalibrateRead(read);