diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java index 0702b08c1..cb2944d31 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers; import net.sf.samtools.SAMFileWriter; import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -91,7 +90,7 @@ import java.util.TreeSet; */ @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) @Requires({DataSource.READS, DataSource.REFERENCE}) -public class PrintReadsWalker extends ReadWalker { +public class PrintReadsWalker extends ReadWalker { @Output(doc="Write output to this BAM filename instead of STDOUT") SAMFileWriter out; @@ -129,6 +128,13 @@ public class PrintReadsWalker extends ReadWalker { @Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false) public Set sampleNames = new TreeSet(); + /** + * Erase all extra attributes in the read but keep the read group information + */ + @Argument(fullName="simplify", shortName="s", doc="Simplify all reads.", required=false) + public boolean simplifyReads = false; + + private TreeSet samplesToChoose = new TreeSet(); private boolean SAMPLES_SPECIFIED = false; @@ -162,7 +168,7 @@ public class PrintReadsWalker extends ReadWalker { * The reads filter function. * * @param ref the reference bases that correspond to our read, if a reference was provided - * @param read the read itself, as a SAMRecord + * @param read the read itself, as a GATKSAMRecord * @return true if the read passes the filter, false if it doesn't */ public boolean filter(ReferenceContext ref, GATKSAMRecord read) { @@ -208,11 +214,11 @@ public class PrintReadsWalker extends ReadWalker { * The reads map function. * * @param ref the reference bases that correspond to our read, if a reference was provided - * @param read the read itself, as a SAMRecord + * @param read the read itself, as a GATKSAMRecord * @return the read itself */ - public SAMRecord map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) { - return read; + public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) { + return simplifyReads ? read.simplify() : read; } /** @@ -232,7 +238,7 @@ public class PrintReadsWalker extends ReadWalker { * @param output the output source * @return the SAMFileWriter, so that the next reduce can emit to the same source */ - public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) { + public SAMFileWriter reduce( GATKSAMRecord read, SAMFileWriter output ) { output.addAlignment(read); return output; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AccuracyDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AccuracyDatum.java new file mode 100644 index 000000000..b66a81f34 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AccuracyDatum.java @@ -0,0 +1,52 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.utils.MathUtils; + +import java.util.LinkedList; +import java.util.List; + +/** + * Short one line description of the walker. + * + *

[Long description of the walker]

+ * + * + *

Input

[Description of the Input]

+ * + *

Output

[Description of the Output]

+ * + *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T [walker name]
+ *  
+ * + * @author Mauricio Carneiro + * @since 4/17/12 + */ +public class AccuracyDatum extends RecalDatum { + private final List accuracy = new LinkedList(); + private final List reportedQualities = new LinkedList(); + + public AccuracyDatum(final RecalDatum recalDatum, final byte originalQuality) { + super(recalDatum); + accuracy.add(calculateAccuracy(recalDatum, originalQuality)); + reportedQualities.add(originalQuality); + } + + public void combine(final RecalDatum recalDatum, final byte originalQuality) { + this.combine(recalDatum); + accuracy.add(calculateAccuracy(recalDatum, originalQuality)); + reportedQualities.add(originalQuality); + } + + @Override + public String toString() { + return String.format("%s,%.2f,%.2f", super.toString(), MathUtils.average(reportedQualities), MathUtils.average(accuracy)); + } + + private static double calculateAccuracy(final RecalDatum recalDatum, final byte originalQuality) { + return recalDatum.getEmpiricalQuality() - originalQuality; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java index bcbda4b20..2b48e5871 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java @@ -52,7 +52,7 @@ public class BQSRKeyManager { for (Covariate required : requiredCovariates) { // create a list of required covariates with the extra information for key management int nBits = required.numberOfBits(); // number of bits used by this covariate BitSet mask = genericMask(nRequiredBits, nBits); // create a mask for this covariate - this.requiredCovariates.add(new RequiredCovariateInfo(nRequiredBits, nBits, mask, required)); // Create an object for this required covariate + this.requiredCovariates.add(new RequiredCovariateInfo(nRequiredBits, mask, required)); // Create an object for this required covariate nRequiredBits += nBits; } @@ -184,7 +184,7 @@ public class BQSRKeyManager { * @return an object array with the values for each key */ public List keySetFrom(BitSet key) { - List objectKeys = new ArrayList(); + List objectKeys = new LinkedList(); for (RequiredCovariateInfo info : requiredCovariates) { BitSet covariateBitSet = extractBitSetFromKey(key, info.mask, info.bitsBefore); // get the covariate's bitset objectKeys.add(info.covariate.keyFromBitSet(covariateBitSet)); // convert the bitset to object using covariate's interface @@ -286,7 +286,7 @@ public class BQSRKeyManager { public final BitSet mask; // the mask to pull out this covariate from the combined bitset key ( a mask made from bitsBefore and nBits ) public final Covariate covariate; // this allows reverse lookup of the Covariates in order - RequiredCovariateInfo(int bitsBefore, int nBits, BitSet mask, Covariate covariate) { + RequiredCovariateInfo(int bitsBefore, BitSet mask, Covariate covariate) { this.bitsBefore = bitsBefore; this.mask = mask; this.covariate = covariate; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java index 69461ed0e..c7c281943 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java @@ -47,9 +47,6 @@ public class ContextCovariate implements StandardCovariate { private int insertionsContextSize; private int deletionsContextSize; - private final BitSet NO_CONTEXT_BITSET = BitSetUtils.bitSetFrom(-1L); -// protected final String NO_CONTEXT_VALUE = "N"; // protected so we can UNIT TEST it - private byte LOW_QUAL_TAIL; // Initialize any member variables using the command-line arguments passed to the walkers diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateValues.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateValues.java index 00d3b650c..ebf90ebfd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateValues.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateValues.java @@ -14,9 +14,9 @@ import java.util.BitSet; * @since 2/8/12 */ public class CovariateValues { - private BitSet[] mismatches; - private BitSet[] insertions; - private BitSet[] deletions; + private final BitSet[] mismatches; + private final BitSet[] insertions; + private final BitSet[] deletions; public CovariateValues(BitSet[] mismatch, BitSet[] insertion, BitSet[] deletion) { this.mismatches = mismatch; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatumOptimized.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java similarity index 65% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatumOptimized.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java index 39807283a..b3ea88d58 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatumOptimized.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java @@ -2,8 +2,6 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.QualityUtils; -import java.util.List; - /* * Copyright (c) 2010 The Broad Institute * @@ -38,10 +36,13 @@ import java.util.List; * Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates. */ -public class RecalDatumOptimized { +public class Datum { + + long numObservations; // number of bases seen in total + long numMismatches; // number of bases seen that didn't match the reference + + private static final int SMOOTHING_CONSTANT = 1; // used when calculating empirical qualities to avoid division by zero - protected long numObservations; // number of bases seen in total - protected long numMismatches; // number of bases seen that didn't match the reference //--------------------------------------------------------------------------------------------------------------- // @@ -49,67 +50,43 @@ public class RecalDatumOptimized { // //--------------------------------------------------------------------------------------------------------------- - public RecalDatumOptimized() { + public Datum() { numObservations = 0L; numMismatches = 0L; } - public RecalDatumOptimized(final long _numObservations, final long _numMismatches) { - numObservations = _numObservations; - numMismatches = _numMismatches; - } - - public RecalDatumOptimized(final RecalDatumOptimized copy) { - this.numObservations = copy.numObservations; - this.numMismatches = copy.numMismatches; - } - //--------------------------------------------------------------------------------------------------------------- // // increment methods // //--------------------------------------------------------------------------------------------------------------- - public synchronized final void increment(final long incObservations, final long incMismatches) { + synchronized void increment(final long incObservations, final long incMismatches) { numObservations += incObservations; numMismatches += incMismatches; } - public synchronized final void increment(final RecalDatumOptimized other) { - increment(other.numObservations, other.numMismatches); - } - - public synchronized final void increment(final List data) { - for (RecalDatumOptimized other : data) { - this.increment(other); - } - } - //--------------------------------------------------------------------------------------------------------------- // // methods to derive empirical quality score // //--------------------------------------------------------------------------------------------------------------- - public final double empiricalQualDouble(final int smoothing, final double maxQual) { - final double doubleMismatches = (double) (numMismatches + smoothing); - final double doubleObservations = (double) (numObservations + smoothing); + double empiricalQualDouble() { + final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT); + final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT); double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); - return Math.min(empiricalQual, maxQual); + return Math.min(empiricalQual, (double) QualityUtils.MAX_QUAL_SCORE); } - public final byte empiricalQualByte(final int smoothing) { - final double doubleMismatches = (double) (numMismatches + smoothing); - final double doubleObservations = (double) (numObservations + smoothing); - return QualityUtils.probToQual(1.0 - doubleMismatches / doubleObservations); // This is capped at Q40 - } - - public final byte empiricalQualByte() { - return empiricalQualByte(0); // 'default' behavior is to use smoothing value of zero + byte empiricalQualByte() { + final double doubleMismatches = (double) (numMismatches); + final double doubleObservations = (double) (numObservations); + return QualityUtils.probToQual(1.0 - doubleMismatches / doubleObservations); // This is capped at Q40 } @Override - public final String toString() { + public String toString() { return String.format("%d,%d,%d", numObservations, numMismatches, (int) empiricalQualByte()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/EventType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/EventType.java index 4c53dcca5..6d004edb1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/EventType.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/EventType.java @@ -7,8 +7,8 @@ public enum EventType { BASE_INSERTION(1, "I"), BASE_DELETION(2, "D"); - public int index; - public String representation; + public final int index; + private final String representation; private EventType(int index, String representation) { this.index = index; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java index afe847583..9c91a1874 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java @@ -19,9 +19,9 @@ import java.util.Map; public class QuantizationInfo { private List quantizedQuals; private List empiricalQualCounts; - int quantizationLevels; + private int quantizationLevels; - public QuantizationInfo(List quantizedQuals, List empiricalQualCounts, int quantizationLevels) { + private QuantizationInfo(List quantizedQuals, List empiricalQualCounts, int quantizationLevels) { this.quantizedQuals = quantizedQuals; this.empiricalQualCounts = empiricalQualCounts; this.quantizationLevels = quantizationLevels; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java index f87986b47..fc4445b22 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java @@ -13,9 +13,9 @@ import java.util.BitSet; * @since 2/8/12 */ public class ReadCovariates { - private BitSet[][] mismatchesKeySet; - private BitSet[][] insertionsKeySet; - private BitSet[][] deletionsKeySet; + private final BitSet[][] mismatchesKeySet; + private final BitSet[][] insertionsKeySet; + private final BitSet[][] deletionsKeySet; private int nextCovariateIndex; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java index ac80e2017..cedff0a80 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java @@ -38,7 +38,6 @@ import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -221,7 +220,7 @@ public class RecalDataManager { logger.info(""); } - public static List generateReportTables(Map> keysAndTablesMap) { + private static List generateReportTables(Map> keysAndTablesMap) { List result = new LinkedList(); int tableIndex = 0; @@ -349,7 +348,7 @@ public class RecalDataManager { * @param read The SAMRecord to parse * @return whether or not this read should be skipped */ - public static boolean checkColorSpace(final SOLID_NOCALL_STRATEGY strategy, final GATKSAMRecord read) { + public static boolean isColorSpaceConsistent(final SOLID_NOCALL_STRATEGY strategy, final GATKSAMRecord read) { if (ReadUtils.isSOLiDRead(read)) { // If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); @@ -378,99 +377,10 @@ public class RecalDataManager { throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); else - return false; // otherwise, just skip the read + return true; // otherwise, just skip the read } } - return true; - } - - /** - * Parse through the color space of the read and apply the desired --solid_recal_mode correction to the bases - * This method doesn't add the inconsistent tag to the read like parseColorSpace does - * - * @param read The SAMRecord to parse - * @param originalQualScores The array of original quality scores to modify during the correction - * @param solidRecalMode Which mode of solid recalibration to apply - * @param refBases The reference for this read - * @return A new array of quality scores that have been ref bias corrected - */ - public static byte[] calcColorSpace(final GATKSAMRecord read, byte[] originalQualScores, final SOLID_RECAL_MODE solidRecalMode, final byte[] refBases) { - - final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); - if (attr != null) { - byte[] colorSpace; - if (attr instanceof String) { - colorSpace = ((String) attr).getBytes(); - } - else { - throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); - } - - // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read - byte[] readBases = read.getReadBases(); - final byte[] colorImpliedBases = readBases.clone(); - byte[] refBasesDirRead = AlignmentUtils.alignmentToByteArray(read.getCigar(), read.getReadBases(), refBases); //BUGBUG: This needs to change when read walkers are changed to give the aligned refBases - if (read.getReadNegativeStrandFlag()) { - readBases = BaseUtils.simpleReverseComplement(read.getReadBases()); - refBasesDirRead = BaseUtils.simpleReverseComplement(refBasesDirRead.clone()); - } - final int[] inconsistency = new int[readBases.length]; - byte prevBase = colorSpace[0]; // The sentinel - for (int iii = 0; iii < readBases.length; iii++) { - final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[iii + 1]); - colorImpliedBases[iii] = thisBase; - inconsistency[iii] = (thisBase == readBases[iii] ? 0 : 1); - prevBase = readBases[iii]; - } - - // Now that we have the inconsistency array apply the desired correction to the inconsistent bases - if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO) { // Set inconsistent bases and the one before it to Q0 - final boolean setBaseN = false; - originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN); - } - else if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N) { - final boolean setBaseN = true; - originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN); - } - else if (solidRecalMode == SOLID_RECAL_MODE.REMOVE_REF_BIAS) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases - solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead); - } - - } - else { - throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + - " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); - } - - return originalQualScores; - } - - public static boolean checkNoCallColorSpace(final GATKSAMRecord read) { - if (ReadUtils.isSOLiDRead(read)) { - final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); - if (attr != null) { - byte[] colorSpace; - if (attr instanceof String) { - colorSpace = ((String) attr).substring(1).getBytes(); // trim off the Sentinel - } - else { - throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); - } - - for (byte color : colorSpace) { - if (color != (byte) '0' && color != (byte) '1' && color != (byte) '2' && color != (byte) '3') { - return true; // There is a bad color in this SOLiD read and the user wants to skip over it - } - } - - } - else { - throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + - " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); - } - } - - return false; // There aren't any color no calls in this SOLiD read + return false; } /** @@ -625,16 +535,16 @@ public class RecalDataManager { * @param offset The offset in the read at which to check * @return Returns true if the base was inconsistent with the color space */ - public static boolean isInconsistentColorSpace(final GATKSAMRecord read, final int offset) { + public static boolean isColorSpaceConsistent(final GATKSAMRecord read, final int offset) { final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG); if (attr != null) { final byte[] inconsistency = (byte[]) attr; // NOTE: The inconsistency array is in the direction of the read, not aligned to the reference! if (read.getReadNegativeStrandFlag()) { // Negative direction - return inconsistency[inconsistency.length - offset - 1] != (byte) 0; + return inconsistency[inconsistency.length - offset - 1] == (byte) 0; } else { // Forward direction - return inconsistency[offset] != (byte) 0; + return inconsistency[offset] == (byte) 0; } // This block of code is for if you want to check both the offset and the next base for color space inconsistency @@ -654,7 +564,7 @@ public class RecalDataManager { } else { // No inconsistency array, so nothing is inconsistent - return false; + return true; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java index 0b66bb182..d232fde81 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java @@ -33,12 +33,11 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; * An individual piece of recalibration data. Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates. */ -public class RecalDatum extends RecalDatumOptimized { +public class RecalDatum extends Datum { private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example) - private static final int SMOOTHING_CONSTANT = 1; // used when calculating empirical qualities to avoid division by zero //--------------------------------------------------------------------------------------------------------------- // @@ -50,7 +49,7 @@ public class RecalDatum extends RecalDatumOptimized { numObservations = 0L; numMismatches = 0L; estimatedQReported = 0.0; - empiricalQuality = 0.0; + empiricalQuality = -1.0; } public RecalDatum(final long _numObservations, final long _numMismatches, final double _estimatedQReported, final double _empiricalQuality) { @@ -67,60 +66,52 @@ public class RecalDatum extends RecalDatumOptimized { this.empiricalQuality = copy.empiricalQuality; } - //--------------------------------------------------------------------------------------------------------------- - // - // increment methods - // - //--------------------------------------------------------------------------------------------------------------- - - public final void combine(final RecalDatum other) { + public void combine(final RecalDatum other) { final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors(); this.increment(other.numObservations, other.numMismatches); this.estimatedQReported = -10 * Math.log10(sumErrors / this.numObservations); + this.empiricalQuality = -1.0; // reset the empirical quality calculation so we never have a wrongly calculated empirical quality stored } - //--------------------------------------------------------------------------------------------------------------- - // - // methods to derive empirical quality score - // - //--------------------------------------------------------------------------------------------------------------- - - public final void calcCombinedEmpiricalQuality(final int maxQual) { - this.empiricalQuality = empiricalQualDouble(SMOOTHING_CONSTANT, maxQual); // cache the value so we don't call log over and over again + public final void calcCombinedEmpiricalQuality() { + this.empiricalQuality = empiricalQualDouble(); // cache the value so we don't call log over and over again } public final void calcEstimatedReportedQuality() { this.estimatedQReported = -10 * Math.log10(calcExpectedErrors() / numObservations); } - //--------------------------------------------------------------------------------------------------------------- - // - // misc. methods - // - //--------------------------------------------------------------------------------------------------------------- - public final double getEstimatedQReported() { return estimatedQReported; } public final double getEmpiricalQuality() { + if (empiricalQuality < 0) + calcCombinedEmpiricalQuality(); return empiricalQuality; } - private double calcExpectedErrors() { - return (double) this.numObservations * qualToErrorProb(estimatedQReported); - } - - private double qualToErrorProb(final double qual) { - return Math.pow(10.0, qual / -10.0); - } - /** * Makes a hard copy of the recal datum element * * @return a new recal datum object with the same contents of this datum. */ - protected RecalDatum copy() { + public RecalDatum copy() { return new RecalDatum(numObservations, numMismatches, estimatedQReported, empiricalQuality); } + + @Override + public String toString() { + return String.format("%d,%d,%d", numObservations, numMismatches, (byte) Math.floor(getEmpiricalQuality())); + } + + + private double calcExpectedErrors() { + return (double) this.numObservations * qualToErrorProb(estimatedQReported); + } + + private double qualToErrorProb(final double qual) { + return Math.pow(10.0, qual / -10.0); + } + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index 07cb8d7a8..4a695ecb6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -30,7 +30,7 @@ import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.utils.Utils; -import java.io.PrintStream; +import java.io.File; import java.util.Collections; import java.util.List; @@ -62,7 +62,7 @@ public class RecalibrationArgumentCollection { */ @Gather(BQSRGatherer.class) @Output - public PrintStream RECAL_FILE; + public File RECAL_FILE; /** * List all implemented covariates. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java index c434cc96b..19c04361b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java @@ -18,13 +18,11 @@ import java.util.*; */ public class RecalibrationReport { private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done) - private LinkedHashMap> keysAndTablesMap; // quick access reference to the read group table and its key manager - private ArrayList requestedCovariates = new ArrayList(); // list of all covariates to be used in this calculation + private final LinkedHashMap> keysAndTablesMap; // quick access reference to the read group table and its key manager + private final ArrayList requestedCovariates = new ArrayList(); // list of all covariates to be used in this calculation - GATKReportTable argumentTable; // keep the argument table untouched just for output purposes - RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter | todo -- this should be a new parameter, not necessarily coming from the original table parameter list - - private static String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check."; + private final GATKReportTable argumentTable; // keep the argument table untouched just for output purposes + private final RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter public RecalibrationReport(final File RECAL_FILE) { GATKReport report = new GATKReport(RECAL_FILE); @@ -53,6 +51,7 @@ public class RecalibrationReport { final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager int nRequiredCovariates = requiredCovariatesToAdd.size(); // the number of required covariates defines which table we are looking at (RG, QUAL or ALL_COVARIATES) + final String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check."; if (nRequiredCovariates == 1) { // if there is only one required covariate, this is the read group table final GATKReportTable reportTable = report.getTable(RecalDataManager.READGROUP_REPORT_TABLE_TITLE); table = parseReadGroupTable(keyManager, reportTable); @@ -292,7 +291,7 @@ public class RecalibrationReport { public void calculateEmpiricalAndQuantizedQualities() { for (Map table : keysAndTablesMap.values()) for (RecalDatum datum : table.values()) - datum.calcCombinedEmpiricalQuality(QualityUtils.MAX_QUAL_SCORE); + datum.calcCombinedEmpiricalQuality(); quantizationInfo = new QuantizationInfo(keysAndTablesMap, RAC.QUANTIZING_LEVELS); } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 3a5b07e58..2badca44c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -51,6 +51,7 @@ public class BaseRecalibration { * Constructor using a GATK Report file * * @param RECAL_FILE a GATK Report file containing the recalibration information + * @param quantizationLevels number of bins to quantize the quality scores */ public BaseRecalibration(final File RECAL_FILE, int quantizationLevels) { RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE); @@ -80,7 +81,7 @@ public class BaseRecalibration { for (int offset = 0; offset < read.getReadLength(); offset++) { // recalibrate all bases in the read byte qualityScore = originalQuals[offset]; - if (qualityScore > QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) + if (qualityScore >= QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel); // get the keyset for this base using the error model qualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 51c3715f3..7d3477a7b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -335,10 +335,11 @@ public class GATKSAMRecord extends BAMRecord { /** * Clears all attributes except ReadGroup of the read. */ - public void simplify () { + public GATKSAMRecord simplify () { GATKSAMReadGroupRecord rg = getReadGroup(); this.clearAttributes(); setReadGroup(rg); + return this; } /** diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManagerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManagerUnitTest.java index 636d4ffb8..286b08a2c 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManagerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManagerUnitTest.java @@ -53,6 +53,17 @@ public class BQSRKeyManagerUnitTest { createReadAndTest(covariates, nRequired); } + @Test(enabled = true) + public void testOneCovariateWithOptionalCovariates() { + final int nRequired = 1; + final ArrayList covariates = new ArrayList(4); + covariates.add(new ReadGroupCovariate()); + covariates.add(new QualityScoreCovariate()); + covariates.add(new CycleCovariate()); + covariates.add(new ContextCovariate()); + createReadAndTest(covariates, nRequired); + } + private void createReadAndTest(List covariates, int nRequired) { int readLength = 1000; GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(ReadUtils.createRandomReadBases(readLength, true), ReadUtils.createRandomReadQuals(readLength), readLength + "M");