Initial version of on-the-fly, lazy loading base quality score recalibration. It isn't completely hooked up yet but I'm committing so Mauricio and Mark can see how I envision it will fit together. Look it over and give any feedback. With the exception of the Solid specific code we are very very close to being able to remove TableRecalibrationWalker from the code base and just replace it with PrintReads -BQSR recal.csv
This commit is contained in:
parent
2cd33b2f1f
commit
5343f8ba67
|
|
@ -53,6 +53,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
|
@ -179,10 +180,18 @@ public class GenomeAnalysisEngine {
|
|||
*/
|
||||
private static final long GATK_RANDOM_SEED = 47382911L;
|
||||
private static Random randomGenerator = new Random(GATK_RANDOM_SEED);
|
||||
|
||||
public static Random getRandomGenerator() { return randomGenerator; }
|
||||
public static void resetRandomGenerator() { randomGenerator.setSeed(GATK_RANDOM_SEED); }
|
||||
public static void resetRandomGenerator(long seed) { randomGenerator.setSeed(seed); }
|
||||
|
||||
/**
|
||||
* Static base quality score recalibration helper object
|
||||
*/
|
||||
private static BaseRecalibration baseRecalibration = null;
|
||||
public static BaseRecalibration getBaseRecalibration() { return baseRecalibration; }
|
||||
public static boolean hasBaseRecalibration() { return baseRecalibration != null; }
|
||||
public static void setBaseRecalibration(File recalFile) { baseRecalibration = new BaseRecalibration(recalFile); }
|
||||
|
||||
/**
|
||||
* Actually run the GATK with the specified walker.
|
||||
*
|
||||
|
|
@ -205,6 +214,10 @@ public class GenomeAnalysisEngine {
|
|||
if (this.getArguments().nonDeterministicRandomSeed)
|
||||
resetRandomGenerator(System.currentTimeMillis());
|
||||
|
||||
// if the use specified an input BQSR recalibration table then enable on the fly recalibration
|
||||
if (this.getArguments().RECAL_FILE != null)
|
||||
setBaseRecalibration(this.getArguments().RECAL_FILE);
|
||||
|
||||
// Determine how the threads should be divided between CPU vs. IO.
|
||||
determineThreadAllocation();
|
||||
|
||||
|
|
@ -224,7 +237,7 @@ public class GenomeAnalysisEngine {
|
|||
// create temp directories as necessary
|
||||
initializeTempDirectory();
|
||||
|
||||
// create the output streams "
|
||||
// create the output streams
|
||||
initializeOutputStreams(microScheduler.getOutputTracker());
|
||||
|
||||
Iterable<Shard> shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
|
||||
|
|
|
|||
|
|
@ -185,6 +185,15 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false)
|
||||
public Boolean useOriginalBaseQualities = false;
|
||||
|
||||
/**
|
||||
* After the header, data records occur one per line until the end of the file. The first several items on a line are the
|
||||
* values of the individual covariates and will change depending on which covariates were specified at runtime. The last
|
||||
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
|
||||
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
||||
*/
|
||||
@Input(fullName="BQSR", shortName="BQSR", required=false, doc="Filename for the input covariates table recalibration .csv file which enables on the fly base quality score recalibration")
|
||||
public File RECAL_FILE = null; // BUGBUG: need a better argument name once we decide how BQSRs v1 and v2 will live in the code base simultaneously
|
||||
|
||||
@Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false)
|
||||
public byte defaultBaseQualities = -1;
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,62 @@
|
|||
/*
|
||||
* Copyright (c) 2011 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 9/26/11
|
||||
*/
|
||||
|
||||
public class ContextCovariate implements Covariate {
|
||||
|
||||
final int CONTEXT_SIZE = 8;
|
||||
String allN = "";
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
for( int iii = 0; iii < CONTEXT_SIZE; iii++ ) {
|
||||
allN += "N";
|
||||
}
|
||||
}
|
||||
|
||||
public void getValues(SAMRecord read, Comparable[] comparable) {
|
||||
byte[] bases = read.getReadBases();
|
||||
for(int i = 0; i < read.getReadLength(); i++) {
|
||||
comparable[i] = ( i-CONTEXT_SIZE < 0 ? allN : new String(Arrays.copyOfRange(bases,i-CONTEXT_SIZE,i)) );
|
||||
}
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
return str;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -25,7 +25,6 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
|
|
@ -86,14 +85,14 @@ public class RecalDataManager {
|
|||
PURGE_READ
|
||||
}
|
||||
|
||||
RecalDataManager() {
|
||||
public RecalDataManager() {
|
||||
data = new NestedHashMap();
|
||||
dataCollapsedReadGroup = null;
|
||||
dataCollapsedQualityScore = null;
|
||||
dataCollapsedByCovariate = null;
|
||||
}
|
||||
|
||||
RecalDataManager( final boolean createCollapsedTables, final int numCovariates ) {
|
||||
public RecalDataManager( final boolean createCollapsedTables, final int numCovariates ) {
|
||||
if( createCollapsedTables ) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker
|
||||
data = null;
|
||||
dataCollapsedReadGroup = new NestedHashMap();
|
||||
|
|
|
|||
|
|
@ -181,7 +181,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
/////////////////////////////
|
||||
private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values.
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
|
|
|
|||
|
|
@ -55,6 +55,14 @@ public class QualityUtils {
|
|||
return qualToErrorProbCache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc.
|
||||
}
|
||||
|
||||
static public double[] qualArrayToLog10ErrorProb(byte[] quals) {
|
||||
double[] returnArray = new double[quals.length];
|
||||
for( int iii = 0; iii < quals.length; iii++ ) {
|
||||
returnArray[iii] = ((double) quals[iii])/-10.0;
|
||||
}
|
||||
return returnArray;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a probability to a quality score. Note, this is capped at Q40.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -27,7 +27,6 @@ public class PileupElement implements Comparable<PileupElement> {
|
|||
protected final boolean isBeforeInsertion;
|
||||
protected final boolean isNextToSoftClip;
|
||||
|
||||
|
||||
/**
|
||||
* Creates a new pileup element.
|
||||
*
|
||||
|
|
@ -89,6 +88,14 @@ public class PileupElement implements Comparable<PileupElement> {
|
|||
public byte getQual() {
|
||||
return getQual(offset);
|
||||
}
|
||||
|
||||
public byte getBaseInsertionQual() {
|
||||
return getBaseInsertionQual(offset);
|
||||
}
|
||||
|
||||
public byte getBaseDeletionQual() {
|
||||
return getBaseDeletionQual(offset);
|
||||
}
|
||||
|
||||
public int getMappingQual() {
|
||||
return read.getMappingQuality();
|
||||
|
|
@ -111,6 +118,14 @@ public class PileupElement implements Comparable<PileupElement> {
|
|||
return (isDeletion() || isInsertionAtBeginningOfRead()) ? DELETION_QUAL : read.getBaseQualities()[offset];
|
||||
}
|
||||
|
||||
protected byte getBaseInsertionQual(final int offset) {
|
||||
return (isDeletion() || isInsertionAtBeginningOfRead()) ? DELETION_QUAL : read.getBaseInsertionQualities()[offset];
|
||||
}
|
||||
|
||||
protected byte getBaseDeletionQual(final int offset) {
|
||||
return (isDeletion() || isInsertionAtBeginningOfRead()) ? DELETION_QUAL : read.getBaseDeletionQualities()[offset];
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(final PileupElement pileupElement) {
|
||||
if (offset < pileupElement.offset)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,293 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
||||
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Utility methods to facilitate on-the-fly base quality score recalibration.
|
||||
*
|
||||
* User: rpoplin
|
||||
* Date: 2/4/12
|
||||
*/
|
||||
|
||||
public class BaseRecalibration {
|
||||
|
||||
public enum BaseRecalibrationType {
|
||||
BASE_SUBSTITUTION,
|
||||
BASE_INSERTION,
|
||||
BASE_DELETION
|
||||
}
|
||||
|
||||
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // List of covariates to be used in this calculation
|
||||
public static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
public static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||
public static final String EOF_MARKER = "EOF";
|
||||
private static final int MAX_QUALITY_SCORE = 65; //BUGBUG: what value to use here?
|
||||
private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values.
|
||||
|
||||
public BaseRecalibration( final File RECAL_FILE ) {
|
||||
// Get a list of all available covariates
|
||||
final List<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins();
|
||||
|
||||
int lineNumber = 0;
|
||||
boolean foundAllCovariates = false;
|
||||
|
||||
// Read in the data from the csv file and populate the data map and covariates list
|
||||
boolean sawEOF = false;
|
||||
try {
|
||||
for ( String line : new XReadLines(RECAL_FILE) ) {
|
||||
lineNumber++;
|
||||
if ( EOF_MARKER.equals(line) ) {
|
||||
sawEOF = true;
|
||||
} else if( COMMENT_PATTERN.matcher(line).matches() ) {
|
||||
; // Skip over the comment lines, (which start with '#')
|
||||
}
|
||||
// Read in the covariates that were used from the input file
|
||||
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data
|
||||
if( foundAllCovariates ) {
|
||||
throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE );
|
||||
} else { // Found the covariate list in input file, loop through all of them and instantiate them
|
||||
String[] vals = line.split(",");
|
||||
for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
|
||||
boolean foundClass = false;
|
||||
for( Class<?> covClass : classes ) {
|
||||
if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) {
|
||||
foundClass = true;
|
||||
try {
|
||||
Covariate covariate = (Covariate)covClass.newInstance();
|
||||
requestedCovariates.add( covariate );
|
||||
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(covClass, e);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if( !foundClass ) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} else { // Found a line of data
|
||||
if( !foundAllCovariates ) {
|
||||
foundAllCovariates = true;
|
||||
|
||||
// At this point all the covariates should have been found and initialized
|
||||
if( requestedCovariates.size() < 2 ) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE );
|
||||
}
|
||||
|
||||
final boolean createCollapsedTables = true;
|
||||
|
||||
// Initialize any covariate member variables using the shared argument collection
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
cov.initialize( null ); // BUGBUG: do any of the used covariates actually need the RecalibrationArgumentCollection?
|
||||
}
|
||||
// Initialize the data hashMaps
|
||||
dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() );
|
||||
|
||||
}
|
||||
addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap
|
||||
}
|
||||
}
|
||||
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
|
||||
} catch ( NumberFormatException e ) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
|
||||
}
|
||||
|
||||
if ( !sawEOF ) {
|
||||
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool.";
|
||||
throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
|
||||
}
|
||||
|
||||
if( dataManager == null ) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?");
|
||||
}
|
||||
|
||||
dataManager.generateEmpiricalQualities( 1, MAX_QUALITY_SCORE );
|
||||
}
|
||||
|
||||
/**
|
||||
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
|
||||
* @param line A line of CSV data read from the recalibration table data file
|
||||
*/
|
||||
private void addCSVData(final File file, final String line) {
|
||||
final String[] vals = line.split(",");
|
||||
|
||||
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
|
||||
if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical
|
||||
throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line +
|
||||
" --Perhaps the read group string contains a comma and isn't being parsed correctly.");
|
||||
}
|
||||
|
||||
final Object[] key = new Object[requestedCovariates.size()];
|
||||
Covariate cov;
|
||||
int iii;
|
||||
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
|
||||
cov = requestedCovariates.get( iii );
|
||||
key[iii] = cov.getValue( vals[iii] );
|
||||
}
|
||||
|
||||
// Create a new datum using the number of observations, number of mismatches, and reported quality score
|
||||
final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
|
||||
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||
dataManager.addToAllTables( key, datum, QualityUtils.MIN_USABLE_Q_SCORE ); //BUGBUG: used to be Q5 now is Q6, probably doesn't matter
|
||||
}
|
||||
|
||||
public byte[] recalibrateRead( final GATKSAMRecord read, final byte[] originalQuals ) {
|
||||
|
||||
final byte[] recalQuals = originalQuals.clone();
|
||||
|
||||
//compute all covariate values for this read
|
||||
final Comparable[][] covariateValues_offset_x_covar =
|
||||
RecalDataManager.computeCovariates(read, requestedCovariates);
|
||||
|
||||
// For each base in the read
|
||||
for( int offset = 0; offset < read.getReadLength(); offset++ ) {
|
||||
|
||||
final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
|
||||
|
||||
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
|
||||
if(qualityScore == null)
|
||||
{
|
||||
qualityScore = performSequentialQualityCalculation( fullCovariateKey );
|
||||
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
|
||||
}
|
||||
|
||||
recalQuals[offset] = qualityScore;
|
||||
}
|
||||
|
||||
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low
|
||||
|
||||
return recalQuals;
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements a serial recalibration of the reads using the combinational table.
|
||||
* First, we perform a positional recalibration, and then a subsequent dinuc correction.
|
||||
*
|
||||
* Given the full recalibration table, we perform the following preprocessing steps:
|
||||
*
|
||||
* - calculate the global quality score shift across all data [DeltaQ]
|
||||
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
|
||||
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
|
||||
* - The final shift equation is:
|
||||
*
|
||||
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
|
||||
* @param key The list of Comparables that were calculated from the covariates
|
||||
* @return A recalibrated quality score as a byte
|
||||
*/
|
||||
private byte performSequentialQualityCalculation( final Object... key ) {
|
||||
|
||||
final byte qualFromRead = (byte)Integer.parseInt(key[1].toString());
|
||||
final Object[] readGroupCollapsedKey = new Object[1];
|
||||
final Object[] qualityScoreCollapsedKey = new Object[2];
|
||||
final Object[] covariateCollapsedKey = new Object[3];
|
||||
|
||||
// The global quality shift (over the read group only)
|
||||
readGroupCollapsedKey[0] = key[0];
|
||||
final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey ));
|
||||
double globalDeltaQ = 0.0;
|
||||
if( globalRecalDatum != null ) {
|
||||
final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
|
||||
final double aggregrateQReported = globalRecalDatum.getEstimatedQReported();
|
||||
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
|
||||
}
|
||||
|
||||
// The shift in quality between reported and empirical
|
||||
qualityScoreCollapsedKey[0] = key[0];
|
||||
qualityScoreCollapsedKey[1] = key[1];
|
||||
final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey ));
|
||||
double deltaQReported = 0.0;
|
||||
if( qReportedRecalDatum != null ) {
|
||||
final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
|
||||
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
|
||||
}
|
||||
|
||||
// The shift in quality due to each covariate by itself in turn
|
||||
double deltaQCovariates = 0.0;
|
||||
double deltaQCovariateEmpirical;
|
||||
covariateCollapsedKey[0] = key[0];
|
||||
covariateCollapsedKey[1] = key[1];
|
||||
for( int iii = 2; iii < key.length; iii++ ) {
|
||||
covariateCollapsedKey[2] = key[iii]; // The given covariate
|
||||
final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey ));
|
||||
if( covariateRecalDatum != null ) {
|
||||
deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
|
||||
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
|
||||
}
|
||||
}
|
||||
|
||||
final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
|
||||
return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE );
|
||||
|
||||
// Verbose printouts used to validate with old recalibrator
|
||||
//if(key.contains(null)) {
|
||||
// System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d",
|
||||
// qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte));
|
||||
//}
|
||||
//else {
|
||||
// System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d",
|
||||
// key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) );
|
||||
//}
|
||||
|
||||
//return newQualityByte;
|
||||
}
|
||||
|
||||
/**
|
||||
* Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
|
||||
* @param originalQuals The list of original base quality scores
|
||||
* @param recalQuals A list of the new recalibrated quality scores
|
||||
*/
|
||||
private void preserveQScores( final byte[] originalQuals, final byte[] recalQuals ) {
|
||||
for( int iii = 0; iii < recalQuals.length; iii++ ) {
|
||||
if( originalQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ) { //BUGBUG: used to be Q5 now is Q6, probably doesn't matter
|
||||
recalQuals[iii] = originalQuals[iii];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -25,8 +25,10 @@
|
|||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.NGSPlatform;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
|
|
@ -48,6 +50,11 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT = "OP"; // reads that are clipped may use this attribute to keep track of their original alignment start
|
||||
public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT = "OE"; // reads that are clipped may use this attribute to keep track of their original alignment end
|
||||
|
||||
// Base Quality Score Recalibrator specific attribute tags
|
||||
public static final String BQSR_BASE_INSERTION_QUALITIES = "BI";
|
||||
public static final String BQSR_BASE_DELETION_QUALITIES = "BD";
|
||||
public static final String BQSR_BASES_HAVE_BEEN_RECALIBRATED_TAG = "BR";
|
||||
|
||||
// the SAMRecord data we're caching
|
||||
private String mReadString = null;
|
||||
private GATKSAMReadGroupRecord mReadGroup = null;
|
||||
|
|
@ -155,6 +162,60 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
return super.equals(o);
|
||||
}
|
||||
|
||||
/*
|
||||
@Override
|
||||
public byte[] getBaseQualities() {
|
||||
if( getAttribute( BQSR_BASES_HAVE_BEEN_RECALIBRATED_TAG ) != null ) {
|
||||
return super.getBaseQualities();
|
||||
} else {
|
||||
// if the recal data was populated in the engine then recalibrate the quality scores on the fly
|
||||
if( GenomeAnalysisEngine.hasBaseRecalibration() ) {
|
||||
final byte[] quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, super.getBaseQualities() );
|
||||
setBaseQualities(quals);
|
||||
setAttribute( BQSR_BASES_HAVE_BEEN_RECALIBRATED_TAG, true );
|
||||
return quals;
|
||||
} else { // just use the qualities that are in the read since we don't have the sufficient information to recalibrate on the fly
|
||||
return super.getBaseQualities();
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Accessors for base insertion and base deletion quality scores
|
||||
*/
|
||||
public byte[] getBaseInsertionQualities() {
|
||||
byte[] quals = getByteArrayAttribute( BQSR_BASE_INSERTION_QUALITIES );
|
||||
if( quals == null ) {
|
||||
quals = new byte[getBaseQualities().length];
|
||||
Arrays.fill(quals, (byte) 45); // allow for differing default values between BaseInsertions and BaseDeletions
|
||||
// if the recal data was populated in the engine then recalibrate the quality scores on the fly
|
||||
// else give default values which are flat Q45
|
||||
if( GenomeAnalysisEngine.hasBaseRecalibration() ) {
|
||||
quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, quals ); // the original quals here are the flat base insertion/deletion quals, NOT the original base qualities
|
||||
}
|
||||
// add the qual array to the read so that we don't have to do the recalibration work again
|
||||
setAttribute( BQSR_BASE_INSERTION_QUALITIES, quals );
|
||||
}
|
||||
return quals;
|
||||
}
|
||||
|
||||
public byte[] getBaseDeletionQualities() {
|
||||
byte[] quals = getByteArrayAttribute( BQSR_BASE_DELETION_QUALITIES );
|
||||
if( quals == null ) {
|
||||
quals = new byte[getBaseQualities().length];
|
||||
Arrays.fill(quals, (byte) 45);
|
||||
// if the recal data was populated in the engine then recalibrate the quality scores on the fly
|
||||
// else give default values which are flat Q45
|
||||
if( GenomeAnalysisEngine.hasBaseRecalibration() ) {
|
||||
quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, quals ); // the original quals here are the flat base insertion/deletion quals, NOT the original base qualities
|
||||
}
|
||||
// add the qual array to the read so that we don't have to do the recalibration work again
|
||||
setAttribute( BQSR_BASE_DELETION_QUALITIES, quals );
|
||||
}
|
||||
return quals;
|
||||
}
|
||||
|
||||
/**
|
||||
* Efficient caching accessor that returns the GATK NGSPlatform of this read
|
||||
* @return
|
||||
|
|
|
|||
Loading…
Reference in New Issue