Adding engine capability to quantize qualities.
* Added parameter -qq to quantize qualities using a recalibration report * Added options to quantize using the recalibration report quantization levels, new nLevels and no quantization. * Updated BQSR scripts to make use of the new parameters
This commit is contained in:
parent
c055752583
commit
87e6bea6c1
|
|
@ -26,7 +26,9 @@ package org.broadinstitute.sting.gatk;
|
|||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.samtools.*;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
|
|
@ -35,8 +37,6 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
|||
import org.broadinstitute.sting.gatk.datasources.reads.*;
|
||||
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||
import org.broadinstitute.sting.gatk.samples.SampleDB;
|
||||
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
|
||||
import org.broadinstitute.sting.gatk.filters.FilterManager;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
||||
|
|
@ -45,6 +45,8 @@ import org.broadinstitute.sting.gatk.io.OutputTracker;
|
|||
import org.broadinstitute.sting.gatk.io.stubs.Stub;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
|
||||
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||
import org.broadinstitute.sting.gatk.samples.SampleDB;
|
||||
import org.broadinstitute.sting.gatk.samples.SampleDBBuilder;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
|
|
@ -190,7 +192,7 @@ public class GenomeAnalysisEngine {
|
|||
private BaseRecalibration baseRecalibration = null;
|
||||
public BaseRecalibration getBaseRecalibration() { return baseRecalibration; }
|
||||
public boolean hasBaseRecalibration() { return baseRecalibration != null; }
|
||||
public void setBaseRecalibration(File recalFile) { baseRecalibration = new BaseRecalibration(recalFile); }
|
||||
public void setBaseRecalibration(File recalFile, int quantizationLevels) { baseRecalibration = new BaseRecalibration(recalFile, quantizationLevels); }
|
||||
|
||||
/**
|
||||
* Actually run the GATK with the specified walker.
|
||||
|
|
@ -216,7 +218,7 @@ public class GenomeAnalysisEngine {
|
|||
|
||||
// if the use specified an input BQSR recalibration table then enable on the fly recalibration
|
||||
if (this.getArguments().BQSR_RECAL_FILE != null)
|
||||
setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE);
|
||||
setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE, this.getArguments().quantizationLevels);
|
||||
|
||||
// Determine how the threads should be divided between CPU vs. IO.
|
||||
determineThreadAllocation();
|
||||
|
|
|
|||
|
|
@ -193,6 +193,16 @@ public class GATKArgumentCollection {
|
|||
@Input(fullName="BQSR", shortName="BQSR", required=false, doc="Filename for the input covariates table recalibration .csv file which enables on the fly base quality score recalibration")
|
||||
public File BQSR_RECAL_FILE = null; // BUGBUG: need a better argument name once we decide how BQSRs v1 and v2 will live in the code base simultaneously
|
||||
|
||||
/**
|
||||
* Turns on the base quantization module. It requires a recalibration report (-BQSR).
|
||||
*
|
||||
* A value of 0 here means "do not quantize".
|
||||
* Any value greater than zero will be used to recalculate the quantization using this many levels.
|
||||
* Negative values do nothing (i.e. quantize using the recalibration report's quantization level -- same as not providing this parameter at all)
|
||||
*/
|
||||
@Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels.", required=false)
|
||||
public int quantizationLevels = -1;
|
||||
|
||||
@Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false)
|
||||
public byte defaultBaseQualities = -1;
|
||||
|
||||
|
|
|
|||
|
|
@ -19,13 +19,19 @@ import java.util.Map;
|
|||
public class QuantizationInfo {
|
||||
private List<Byte> quantizedQuals;
|
||||
private List<Long> empiricalQualCounts;
|
||||
int quantizationLevels;
|
||||
|
||||
public QuantizationInfo(List<Byte> quantizedQuals, List<Long> empiricalQualCounts) {
|
||||
public QuantizationInfo(List<Byte> quantizedQuals, List<Long> empiricalQualCounts, int quantizationLevels) {
|
||||
this.quantizedQuals = quantizedQuals;
|
||||
this.empiricalQualCounts = empiricalQualCounts;
|
||||
this.quantizationLevels = quantizationLevels;
|
||||
}
|
||||
|
||||
public QuantizationInfo(List<Byte> quantizedQuals, List<Long> empiricalQualCounts) {
|
||||
this(quantizedQuals, empiricalQualCounts, calculateQuantizationLevels(quantizedQuals));
|
||||
}
|
||||
|
||||
public QuantizationInfo(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, int nLevels) {
|
||||
public QuantizationInfo(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, int quantizationLevels) {
|
||||
final Long [] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE+1]; // create a histogram with the empirical quality distribution
|
||||
for (int i = 0; i < qualHistogram.length; i++)
|
||||
qualHistogram[i] = 0L;
|
||||
|
|
@ -46,7 +52,9 @@ public class QuantizationInfo {
|
|||
qualHistogram[empiricalQual] += nObservations; // add the number of observations for every key
|
||||
}
|
||||
empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities
|
||||
quantizeQualityScores(nLevels);
|
||||
quantizeQualityScores(quantizationLevels);
|
||||
|
||||
this.quantizationLevels = quantizationLevels;
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -55,10 +63,20 @@ public class QuantizationInfo {
|
|||
quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC)
|
||||
}
|
||||
|
||||
public void noQuantization() {
|
||||
this.quantizationLevels = QualityUtils.MAX_QUAL_SCORE;
|
||||
for (int i = 0; i < this.quantizationLevels; i++)
|
||||
quantizedQuals.set(i, (byte) i);
|
||||
}
|
||||
|
||||
public List<Byte> getQuantizedQuals() {
|
||||
return quantizedQuals;
|
||||
}
|
||||
|
||||
public int getQuantizationLevels() {
|
||||
return quantizationLevels;
|
||||
}
|
||||
|
||||
public GATKReportTable generateReportTable() {
|
||||
GATKReportTable quantizedTable = new GATKReportTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map");
|
||||
quantizedTable.addPrimaryKey(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
|
||||
|
|
@ -71,4 +89,16 @@ public class QuantizationInfo {
|
|||
}
|
||||
return quantizedTable;
|
||||
}
|
||||
|
||||
private static int calculateQuantizationLevels(List<Byte> quantizedQuals) {
|
||||
byte lastByte = -1;
|
||||
int quantizationLevels = 0;
|
||||
for (byte q : quantizedQuals) {
|
||||
if (q != lastByte) {
|
||||
quantizationLevels++;
|
||||
lastByte = q;
|
||||
}
|
||||
}
|
||||
return quantizationLevels;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -44,23 +44,24 @@ import java.util.*;
|
|||
public class BaseRecalibration {
|
||||
private QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done)
|
||||
private LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap; // quick access reference to the read group table and its key manager
|
||||
|
||||
private ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // list of all covariates to be used in this calculation
|
||||
|
||||
private static String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code";
|
||||
private static String TOO_MANY_KEYS_EXCEPTION = "There should only be one key for the RG collapsed table, something went wrong here";
|
||||
|
||||
/**
|
||||
* Constructor using a GATK Report file
|
||||
*
|
||||
* @param RECAL_FILE a GATK Report file containing the recalibration information
|
||||
*/
|
||||
public BaseRecalibration(final File RECAL_FILE) {
|
||||
public BaseRecalibration(final File RECAL_FILE, int quantizationLevels) {
|
||||
RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE);
|
||||
|
||||
quantizationInfo = recalibrationReport.getQuantizationInfo();
|
||||
keysAndTablesMap = recalibrationReport.getKeysAndTablesMap();
|
||||
requestedCovariates = recalibrationReport.getRequestedCovariates();
|
||||
quantizationInfo = recalibrationReport.getQuantizationInfo();
|
||||
if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores
|
||||
quantizationInfo.noQuantization();
|
||||
else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wnats to use what's in the report.
|
||||
quantizationInfo.quantizeQualityScores(quantizationLevels);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -71,17 +72,17 @@ public class BaseRecalibration {
|
|||
* @param read the read to recalibrate
|
||||
*/
|
||||
public void recalibrateRead(final GATKSAMRecord read) {
|
||||
final ReadCovariates readCovariates = RecalDataManager.computeCovariates(read, requestedCovariates); // compute all covariates for the read
|
||||
for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings
|
||||
final ReadCovariates readCovariates = RecalDataManager.computeCovariates(read, requestedCovariates); // compute all covariates for the read
|
||||
for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings
|
||||
final byte[] originalQuals = read.getBaseQualities(errorModel);
|
||||
final byte[] recalQuals = originalQuals.clone();
|
||||
|
||||
for (int offset = 0; offset < read.getReadLength(); offset++) { // recalibrate all bases in the read
|
||||
for (int offset = 0; offset < read.getReadLength(); offset++) { // recalibrate all bases in the read
|
||||
byte qualityScore = originalQuals[offset];
|
||||
|
||||
if (qualityScore > QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality)
|
||||
final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel); // get the keyset for this base using the error model
|
||||
qualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base
|
||||
if (qualityScore > QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality)
|
||||
final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel); // get the keyset for this base using the error model
|
||||
qualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base
|
||||
}
|
||||
recalQuals[offset] = qualityScore;
|
||||
}
|
||||
|
|
@ -109,6 +110,9 @@ public class BaseRecalibration {
|
|||
* @return A recalibrated quality score as a byte
|
||||
*/
|
||||
private byte performSequentialQualityCalculation(BitSet[] key, EventType errorModel) {
|
||||
final String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code";
|
||||
final String TOO_MANY_KEYS_EXCEPTION = "There should only be one key for the RG collapsed table, something went wrong here";
|
||||
|
||||
final byte qualFromRead = (byte) BitSetUtils.shortFrom(key[1]);
|
||||
|
||||
double globalDeltaQ = 0.0;
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ public class BaseRecalibrationUnitTest {
|
|||
@Test(enabled=false)
|
||||
public void testReadingReport() {
|
||||
File csv = new File("public/testdata/exampleGATKREPORT.grp");
|
||||
BaseRecalibration baseRecalibration = new BaseRecalibration(csv);
|
||||
BaseRecalibration baseRecalibration = new BaseRecalibration(csv, -1);
|
||||
GATKSAMRecord read = ReadUtils.createRandomRead(1000);
|
||||
read.setReadGroup(new GATKSAMReadGroupRecord(new SAMReadGroupRecord("exampleBAM.bam.bam"), NGSPlatform.ILLUMINA));
|
||||
baseRecalibration.recalibrateRead(read);
|
||||
|
|
|
|||
Loading…
Reference in New Issue