diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 6140d543a..97d1de1fa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -53,6 +53,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.interval.IntervalSetRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; +import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; import java.io.File; import java.util.*; @@ -179,10 +180,18 @@ public class GenomeAnalysisEngine { */ private static final long GATK_RANDOM_SEED = 47382911L; private static Random randomGenerator = new Random(GATK_RANDOM_SEED); - public static Random getRandomGenerator() { return randomGenerator; } public static void resetRandomGenerator() { randomGenerator.setSeed(GATK_RANDOM_SEED); } public static void resetRandomGenerator(long seed) { randomGenerator.setSeed(seed); } + + /** + * Static base quality score recalibration helper object + */ + private static BaseRecalibration baseRecalibration = null; + public static BaseRecalibration getBaseRecalibration() { return baseRecalibration; } + public static boolean hasBaseRecalibration() { return baseRecalibration != null; } + public static void setBaseRecalibration(File recalFile) { baseRecalibration = new BaseRecalibration(recalFile); } + /** * Actually run the GATK with the specified walker. * @@ -205,6 +214,10 @@ public class GenomeAnalysisEngine { if (this.getArguments().nonDeterministicRandomSeed) resetRandomGenerator(System.currentTimeMillis()); + // if the use specified an input BQSR recalibration table then enable on the fly recalibration + if (this.getArguments().RECAL_FILE != null) + setBaseRecalibration(this.getArguments().RECAL_FILE); + // Determine how the threads should be divided between CPU vs. IO. determineThreadAllocation(); @@ -224,7 +237,7 @@ public class GenomeAnalysisEngine { // create temp directories as necessary initializeTempDirectory(); - // create the output streams " + // create the output streams initializeOutputStreams(microScheduler.getOutputTracker()); Iterable shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals); diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 08d2c1ad1..206fa5765 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -185,6 +185,15 @@ public class GATKArgumentCollection { @Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false) public Boolean useOriginalBaseQualities = false; + /** + * After the header, data records occur one per line until the end of the file. The first several items on a line are the + * values of the individual covariates and will change depending on which covariates were specified at runtime. The last + * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches, + * and the raw empirical quality score calculated by phred-scaling the mismatch rate. + */ + @Input(fullName="BQSR", shortName="BQSR", required=false, doc="Filename for the input covariates table recalibration .csv file which enables on the fly base quality score recalibration") + public File RECAL_FILE = null; // BUGBUG: need a better argument name once we decide how BQSRs v1 and v2 will live in the code base simultaneously + @Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false) public byte defaultBaseQualities = -1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ContextCovariate.java new file mode 100644 index 000000000..837062dd2 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ContextCovariate.java @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.recalibration; + +import net.sf.samtools.SAMRecord; + +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: 9/26/11 + */ + +public class ContextCovariate implements Covariate { + + final int CONTEXT_SIZE = 8; + String allN = ""; + + // Initialize any member variables using the command-line arguments passed to the walkers + public void initialize( final RecalibrationArgumentCollection RAC ) { + for( int iii = 0; iii < CONTEXT_SIZE; iii++ ) { + allN += "N"; + } + } + + public void getValues(SAMRecord read, Comparable[] comparable) { + byte[] bases = read.getReadBases(); + for(int i = 0; i < read.getReadLength(); i++) { + comparable[i] = ( i-CONTEXT_SIZE < 0 ? allN : new String(Arrays.copyOfRange(bases,i-CONTEXT_SIZE,i)) ); + } + } + + // Used to get the covariate's value from input csv file in TableRecalibrationWalker + public final Comparable getValue( final String str ) { + return str; + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index a0c928afa..66ad1fb9c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMUtils; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -86,14 +85,14 @@ public class RecalDataManager { PURGE_READ } - RecalDataManager() { + public RecalDataManager() { data = new NestedHashMap(); dataCollapsedReadGroup = null; dataCollapsedQualityScore = null; dataCollapsedByCovariate = null; } - RecalDataManager( final boolean createCollapsedTables, final int numCovariates ) { + public RecalDataManager( final boolean createCollapsedTables, final int numCovariates ) { if( createCollapsedTables ) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker data = null; dataCollapsedReadGroup = new NestedHashMap(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 6e214c6bb..a569aefd2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -181,7 +181,6 @@ public class TableRecalibrationWalker extends ReadWalker 127; -128 -> 128; -1 -> 255; etc. } + static public double[] qualArrayToLog10ErrorProb(byte[] quals) { + double[] returnArray = new double[quals.length]; + for( int iii = 0; iii < quals.length; iii++ ) { + returnArray[iii] = ((double) quals[iii])/-10.0; + } + return returnArray; + } + /** * Convert a probability to a quality score. Note, this is capped at Q40. * diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index 9e2a66f6e..a4830223e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -27,7 +27,6 @@ public class PileupElement implements Comparable { protected final boolean isBeforeInsertion; protected final boolean isNextToSoftClip; - /** * Creates a new pileup element. * @@ -89,6 +88,14 @@ public class PileupElement implements Comparable { public byte getQual() { return getQual(offset); } + + public byte getBaseInsertionQual() { + return getBaseInsertionQual(offset); + } + + public byte getBaseDeletionQual() { + return getBaseDeletionQual(offset); + } public int getMappingQual() { return read.getMappingQuality(); @@ -111,6 +118,14 @@ public class PileupElement implements Comparable { return (isDeletion() || isInsertionAtBeginningOfRead()) ? DELETION_QUAL : read.getBaseQualities()[offset]; } + protected byte getBaseInsertionQual(final int offset) { + return (isDeletion() || isInsertionAtBeginningOfRead()) ? DELETION_QUAL : read.getBaseInsertionQualities()[offset]; + } + + protected byte getBaseDeletionQual(final int offset) { + return (isDeletion() || isInsertionAtBeginningOfRead()) ? DELETION_QUAL : read.getBaseDeletionQualities()[offset]; + } + @Override public int compareTo(final PileupElement pileupElement) { if (offset < pileupElement.offset) diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java new file mode 100644 index 000000000..2e785043d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate; +import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager; +import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.collections.NestedHashMap; +import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.text.XReadLines; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +/** + * Utility methods to facilitate on-the-fly base quality score recalibration. + * + * User: rpoplin + * Date: 2/4/12 + */ + +public class BaseRecalibration { + + public enum BaseRecalibrationType { + BASE_SUBSTITUTION, + BASE_INSERTION, + BASE_DELETION + } + + private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps + private final ArrayList requestedCovariates = new ArrayList(); // List of covariates to be used in this calculation + public static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); + public static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*"); + public static final String EOF_MARKER = "EOF"; + private static final int MAX_QUALITY_SCORE = 65; //BUGBUG: what value to use here? + private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values. + + public BaseRecalibration( final File RECAL_FILE ) { + // Get a list of all available covariates + final List> classes = new PluginManager(Covariate.class).getPlugins(); + + int lineNumber = 0; + boolean foundAllCovariates = false; + + // Read in the data from the csv file and populate the data map and covariates list + boolean sawEOF = false; + try { + for ( String line : new XReadLines(RECAL_FILE) ) { + lineNumber++; + if ( EOF_MARKER.equals(line) ) { + sawEOF = true; + } else if( COMMENT_PATTERN.matcher(line).matches() ) { + ; // Skip over the comment lines, (which start with '#') + } + // Read in the covariates that were used from the input file + else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data + if( foundAllCovariates ) { + throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE ); + } else { // Found the covariate list in input file, loop through all of them and instantiate them + String[] vals = line.split(","); + for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical + boolean foundClass = false; + for( Class covClass : classes ) { + if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) { + foundClass = true; + try { + Covariate covariate = (Covariate)covClass.newInstance(); + requestedCovariates.add( covariate ); + } catch (Exception e) { + throw new DynamicClassResolutionException(covClass, e); + } + + } + } + + if( !foundClass ) { + throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." ); + } + } + } + + } else { // Found a line of data + if( !foundAllCovariates ) { + foundAllCovariates = true; + + // At this point all the covariates should have been found and initialized + if( requestedCovariates.size() < 2 ) { + throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE ); + } + + final boolean createCollapsedTables = true; + + // Initialize any covariate member variables using the shared argument collection + for( Covariate cov : requestedCovariates ) { + cov.initialize( null ); // BUGBUG: do any of the used covariates actually need the RecalibrationArgumentCollection? + } + // Initialize the data hashMaps + dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() ); + + } + addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap + } + } + + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e); + } catch ( NumberFormatException e ) { + throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker."); + } + + if ( !sawEOF ) { + final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool."; + throw new UserException.MalformedFile(RECAL_FILE, errorMessage); + } + + if( dataManager == null ) { + throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?"); + } + + dataManager.generateEmpiricalQualities( 1, MAX_QUALITY_SCORE ); + } + + /** + * For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches) + * @param line A line of CSV data read from the recalibration table data file + */ + private void addCSVData(final File file, final String line) { + final String[] vals = line.split(","); + + // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly + if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical + throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line + + " --Perhaps the read group string contains a comma and isn't being parsed correctly."); + } + + final Object[] key = new Object[requestedCovariates.size()]; + Covariate cov; + int iii; + for( iii = 0; iii < requestedCovariates.size(); iii++ ) { + cov = requestedCovariates.get( iii ); + key[iii] = cov.getValue( vals[iii] ); + } + + // Create a new datum using the number of observations, number of mismatches, and reported quality score + final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 ); + // Add that datum to all the collapsed tables which will be used in the sequential calculation + dataManager.addToAllTables( key, datum, QualityUtils.MIN_USABLE_Q_SCORE ); //BUGBUG: used to be Q5 now is Q6, probably doesn't matter + } + + public byte[] recalibrateRead( final GATKSAMRecord read, final byte[] originalQuals ) { + + final byte[] recalQuals = originalQuals.clone(); + + //compute all covariate values for this read + final Comparable[][] covariateValues_offset_x_covar = + RecalDataManager.computeCovariates(read, requestedCovariates); + + // For each base in the read + for( int offset = 0; offset < read.getReadLength(); offset++ ) { + + final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset]; + + Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey); + if(qualityScore == null) + { + qualityScore = performSequentialQualityCalculation( fullCovariateKey ); + qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey); + } + + recalQuals[offset] = qualityScore; + } + + preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low + + return recalQuals; + } + + /** + * Implements a serial recalibration of the reads using the combinational table. + * First, we perform a positional recalibration, and then a subsequent dinuc correction. + * + * Given the full recalibration table, we perform the following preprocessing steps: + * + * - calculate the global quality score shift across all data [DeltaQ] + * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift + * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual + * - The final shift equation is: + * + * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... ) + * @param key The list of Comparables that were calculated from the covariates + * @return A recalibrated quality score as a byte + */ + private byte performSequentialQualityCalculation( final Object... key ) { + + final byte qualFromRead = (byte)Integer.parseInt(key[1].toString()); + final Object[] readGroupCollapsedKey = new Object[1]; + final Object[] qualityScoreCollapsedKey = new Object[2]; + final Object[] covariateCollapsedKey = new Object[3]; + + // The global quality shift (over the read group only) + readGroupCollapsedKey[0] = key[0]; + final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey )); + double globalDeltaQ = 0.0; + if( globalRecalDatum != null ) { + final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality(); + final double aggregrateQReported = globalRecalDatum.getEstimatedQReported(); + globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported; + } + + // The shift in quality between reported and empirical + qualityScoreCollapsedKey[0] = key[0]; + qualityScoreCollapsedKey[1] = key[1]; + final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey )); + double deltaQReported = 0.0; + if( qReportedRecalDatum != null ) { + final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality(); + deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; + } + + // The shift in quality due to each covariate by itself in turn + double deltaQCovariates = 0.0; + double deltaQCovariateEmpirical; + covariateCollapsedKey[0] = key[0]; + covariateCollapsedKey[1] = key[1]; + for( int iii = 2; iii < key.length; iii++ ) { + covariateCollapsedKey[2] = key[iii]; // The given covariate + final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey )); + if( covariateRecalDatum != null ) { + deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality(); + deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) ); + } + } + + final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; + return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE ); + + // Verbose printouts used to validate with old recalibrator + //if(key.contains(null)) { + // System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d", + // qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte)); + //} + //else { + // System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d", + // key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) ); + //} + + //return newQualityByte; + } + + /** + * Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold + * @param originalQuals The list of original base quality scores + * @param recalQuals A list of the new recalibrated quality scores + */ + private void preserveQScores( final byte[] originalQuals, final byte[] recalQuals ) { + for( int iii = 0; iii < recalQuals.length; iii++ ) { + if( originalQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ) { //BUGBUG: used to be Q5 now is Q6, probably doesn't matter + recalQuals[iii] = originalQuals[iii]; + } + } + } + +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 03b794ae3..e9b46ac24 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -25,8 +25,10 @@ package org.broadinstitute.sting.utils.sam; import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.NGSPlatform; +import java.util.Arrays; import java.util.HashMap; import java.util.Map; @@ -48,6 +50,11 @@ public class GATKSAMRecord extends BAMRecord { public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT = "OP"; // reads that are clipped may use this attribute to keep track of their original alignment start public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT = "OE"; // reads that are clipped may use this attribute to keep track of their original alignment end + // Base Quality Score Recalibrator specific attribute tags + public static final String BQSR_BASE_INSERTION_QUALITIES = "BI"; + public static final String BQSR_BASE_DELETION_QUALITIES = "BD"; + public static final String BQSR_BASES_HAVE_BEEN_RECALIBRATED_TAG = "BR"; + // the SAMRecord data we're caching private String mReadString = null; private GATKSAMReadGroupRecord mReadGroup = null; @@ -155,6 +162,60 @@ public class GATKSAMRecord extends BAMRecord { return super.equals(o); } + /* + @Override + public byte[] getBaseQualities() { + if( getAttribute( BQSR_BASES_HAVE_BEEN_RECALIBRATED_TAG ) != null ) { + return super.getBaseQualities(); + } else { + // if the recal data was populated in the engine then recalibrate the quality scores on the fly + if( GenomeAnalysisEngine.hasBaseRecalibration() ) { + final byte[] quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, super.getBaseQualities() ); + setBaseQualities(quals); + setAttribute( BQSR_BASES_HAVE_BEEN_RECALIBRATED_TAG, true ); + return quals; + } else { // just use the qualities that are in the read since we don't have the sufficient information to recalibrate on the fly + return super.getBaseQualities(); + } + } + } + */ + + /** + * Accessors for base insertion and base deletion quality scores + */ + public byte[] getBaseInsertionQualities() { + byte[] quals = getByteArrayAttribute( BQSR_BASE_INSERTION_QUALITIES ); + if( quals == null ) { + quals = new byte[getBaseQualities().length]; + Arrays.fill(quals, (byte) 45); // allow for differing default values between BaseInsertions and BaseDeletions + // if the recal data was populated in the engine then recalibrate the quality scores on the fly + // else give default values which are flat Q45 + if( GenomeAnalysisEngine.hasBaseRecalibration() ) { + quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, quals ); // the original quals here are the flat base insertion/deletion quals, NOT the original base qualities + } + // add the qual array to the read so that we don't have to do the recalibration work again + setAttribute( BQSR_BASE_INSERTION_QUALITIES, quals ); + } + return quals; + } + + public byte[] getBaseDeletionQualities() { + byte[] quals = getByteArrayAttribute( BQSR_BASE_DELETION_QUALITIES ); + if( quals == null ) { + quals = new byte[getBaseQualities().length]; + Arrays.fill(quals, (byte) 45); + // if the recal data was populated in the engine then recalibrate the quality scores on the fly + // else give default values which are flat Q45 + if( GenomeAnalysisEngine.hasBaseRecalibration() ) { + quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, quals ); // the original quals here are the flat base insertion/deletion quals, NOT the original base qualities + } + // add the qual array to the read so that we don't have to do the recalibration work again + setAttribute( BQSR_BASE_DELETION_QUALITIES, quals ); + } + return quals; + } + /** * Efficient caching accessor that returns the GATK NGSPlatform of this read * @return