From ca11ab39e7fb0821129360d301dfcf2906ed78dd Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 5 Mar 2012 17:37:34 -0500 Subject: [PATCH] BitSets keys to lower BQSR's memory footprint Infrastructure: * Generic BitSet implementation with any precision (up to long) * Two's complement implementation of the bit set handles negative numbers (cycle covariate) * Memoized implementation of the BitSet utils for better performance. * All exponents are now calculated with bit shifts, fixing numerical precision issues with the double Math.pow. * Replace log/sqrt with bitwise logic to get rid of numerical issues BQSR: * All covariates output BitSets and have the functionality to decode them back into Object values. * Covariates are responsible for determining the size of the key they will use (number of bits). * Generalized KeyManager implementation combines any arbitrary number of covariates into one bitset key with event type * No more NestedHashMaps. Single key system now fits in one hash to reduce hash table objects overhead Tests: * Unit tests added to every method of BitSetUtils * Unit tests added to the generalized key system infrastructure of BQSRv2 (KeyManager) * Unit tests added to the cycle and context covariates (will add unit tests to all covariates) --- .../gatk/walkers/bqsr/ContextCovariate.java | 52 ++-- .../sting/gatk/walkers/bqsr/Covariate.java | 26 +- .../gatk/walkers/bqsr/CovariateKeySet.java | 100 +++--- .../gatk/walkers/bqsr/CovariateValues.java | 16 +- .../gatk/walkers/bqsr/CycleCovariate.java | 50 +-- .../walkers/bqsr/QualityScoreCovariate.java | 35 ++- .../gatk/walkers/bqsr/ReadGroupCovariate.java | 23 +- .../gatk/walkers/bqsr/RecalDataManager.java | 60 +++- .../walkers/bqsr/RecalDatumOptimized.java | 7 +- .../bqsr/RecalibrationArgumentCollection.java | 13 - .../sting/utils/BitSetUtils.java | 284 ++++++++++++++++++ .../broadinstitute/sting/utils/MathUtils.java | 121 -------- .../recalibration/BaseRecalibration.java | 148 ++++----- .../sting/utils/sam/GATKSAMRecord.java | 4 +- .../sting/utils/sam/ReadUtils.java | 68 ++++- .../bqsr/ContextCovariateUnitTest.java | 76 ++--- .../walkers/bqsr/CycleCovariateUnitTest.java | 68 +++++ .../sting/utils/BitSetUtilsUnitTest.java | 75 +++++ .../sting/utils/MathUtilsUnitTest.java | 99 +++--- 19 files changed, 868 insertions(+), 457 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/BitSetUtilsUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java index a1ab73341..acbe69248 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java @@ -26,7 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.BitSetUtils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -43,7 +43,10 @@ public class ContextCovariate implements StandardCovariate { private int mismatchesContextSize; private int insertionsContextSize; - private int deletionsContextSize; + private int deletionsContextSize; + + private final BitSet NO_CONTEXT_BITSET = BitSetUtils.bitSetFrom(-1L); + protected final String NO_CONTEXT_VALUE = "N"; // protected so we can UNIT TEST it // Initialize any member variables using the command-line arguments passed to the walkers @Override @@ -62,7 +65,7 @@ public class ContextCovariate implements StandardCovariate { int l = read.getReadLength(); BitSet[] mismatches = new BitSet[l]; BitSet[] insertions = new BitSet[l]; - BitSet[] deletions = new BitSet[l]; + BitSet[] deletions = new BitSet[l]; final boolean negativeStrand = read.getReadNegativeStrandFlag(); byte[] bases = read.getReadBases(); @@ -72,7 +75,7 @@ public class ContextCovariate implements StandardCovariate { for (int i = 0; i < read.getReadLength(); i++) { mismatches[i] = contextWith(bases, i, mismatchesContextSize); insertions[i] = contextWith(bases, i, insertionsContextSize); - deletions[i] = contextWith(bases, i, deletionsContextSize); + deletions[i] = contextWith(bases, i, deletionsContextSize); } if (negativeStrand) { @@ -89,24 +92,35 @@ public class ContextCovariate implements StandardCovariate { return str; } + @Override + public String keyFromBitSet(BitSet key) { + if (key.equals(NO_CONTEXT_BITSET)) + return NO_CONTEXT_VALUE; + return BitSetUtils.dnaFrom(key); + } + + @Override + public int numberOfBits() { + return Long.bitCount(-1L); + } + /** - * calculates the context of a base independent of the covariate mode + * calculates the context of a base independent of the covariate mode (mismatch, insertion or deletion) * - * @param bases the bases in the read to build the context from - * @param offset the position in the read to calculate the context for - * @param contextSize context size to use building the context - * @return + * @param bases the bases in the read to build the context from + * @param offset the position in the read to calculate the context for + * @param contextSize context size to use building the context + * @return the bitSet representing the Context */ - private BitSet contextWith(byte [] bases, int offset, int contextSize) { - if (offset < contextSize) - return null; - - String context = new String(Arrays.copyOfRange(bases, offset - contextSize, offset)); - if (context.contains("N")) - return null; - - return MathUtils.bitSetFrom(context); - } + private BitSet contextWith(byte[] bases, int offset, int contextSize) { + BitSet result = NO_CONTEXT_BITSET; + if (offset >= contextSize) { + String context = new String(Arrays.copyOfRange(bases, offset - contextSize, offset)); + if (!context.contains("N")) + result = BitSetUtils.bitSetFrom(context); + } + return result; + } /** * Reverses the given array in place. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java index 80d8cff5d..341b9e7af 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java @@ -2,6 +2,8 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.util.BitSet; + /* * Copyright (c) 2009 The Broad Institute * @@ -53,7 +55,29 @@ public interface Covariate { */ public CovariateValues getValues(GATKSAMRecord read); - public Object getValue(String str); // Used to get the covariate's value from input csv file during on-the-fly recalibration + /** + * Used to get the covariate's value from input csv file during on-the-fly recalibration + * + * @param str the key in string type (read from the csv) + * @return the key in it's correct type. + */ + public Object getValue(String str); + + /** + * Converts the bitset representation of the key (used internally for table indexing) to String format for file output. + * + * @param key the bitset representation of the key + * @return a string representation of the key + */ + public String keyFromBitSet(BitSet key); + + /** + * Each covariate should determine how many bits are necessary to encode it's data + * + * @return The number of bits used to represent the values of this covariate. + */ + public int numberOfBits(); + } interface RequiredCovariate extends Covariate {} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateKeySet.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateKeySet.java index 1b62160a3..19a8aab07 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateKeySet.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateKeySet.java @@ -2,87 +2,107 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import java.util.BitSet; +import java.util.HashMap; + /** - * The object temporarily held by a read that describes all of it's covariates. - * + * The object temporarily held by a read that describes all of it's covariates. + * * In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap * * @author Mauricio Carneiro * @since 2/8/12 */ public class CovariateKeySet { - private Object[][] mismatchesKeySet; - private Object[][] insertionsKeySet; - private Object[][] deletionsKeySet; + private BitSet[][] mismatchesKeySet; + private BitSet[][] insertionsKeySet; + private BitSet[][] deletionsKeySet; private int nextCovariateIndex; - - private static String mismatchesCovariateName = "M"; - private static String insertionsCovariateName = "I"; - private static String deletionsCovariateName = "D"; + + // private static String mismatchesCovariateName = "M"; + // private static String insertionsCovariateName = "I"; + // private static String deletionsCovariateName = "D"; + // + // private static BitSet mismatchesCovariateBitSet = BitSetUtils.bitSetFrom(0); + // private static BitSet insertionsCovariateBitSet = BitSetUtils.bitSetFrom(1); + // private static BitSet deletionsCovariateBitSet = BitSetUtils.bitSetFrom(2); + + private static HashMap nameToType = new HashMap(); + private static HashMap bitSetToName = new HashMap(); public CovariateKeySet(int readLength, int numberOfCovariates) { - numberOfCovariates++; // +1 because we are adding the mismatch covariate (to comply with the molten table format) - this.mismatchesKeySet = new Object[readLength][numberOfCovariates]; - this.insertionsKeySet = new Object[readLength][numberOfCovariates]; - this.deletionsKeySet = new Object[readLength][numberOfCovariates]; - initializeCovariateKeySet(this.mismatchesKeySet, mismatchesCovariateName); - initializeCovariateKeySet(this.insertionsKeySet, insertionsCovariateName); - initializeCovariateKeySet(this.deletionsKeySet, deletionsCovariateName); + // numberOfCovariates++; // +1 because we are adding the mismatch covariate (to comply with the molten table format) + this.mismatchesKeySet = new BitSet[readLength][numberOfCovariates]; + this.insertionsKeySet = new BitSet[readLength][numberOfCovariates]; + this.deletionsKeySet = new BitSet[readLength][numberOfCovariates]; + // initializeCovariateKeySet(this.mismatchesKeySet, mismatchesCovariateBitSet); + // initializeCovariateKeySet(this.insertionsKeySet, insertionsCovariateBitSet); + // initializeCovariateKeySet(this.deletionsKeySet, deletionsCovariateBitSet); this.nextCovariateIndex = 0; + + // nameToType.put(mismatchesCovariateName, RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION); + // nameToType.put(insertionsCovariateName, RecalDataManager.BaseRecalibrationType.BASE_INSERTION); + // nameToType.put(deletionsCovariateName, RecalDataManager.BaseRecalibrationType.BASE_DELETION); + // + // bitSetToName.put(BitSetUtils.bitSetFrom(0), mismatchesCovariateName); + // bitSetToName.put(BitSetUtils.bitSetFrom(1), insertionsCovariateName); + // bitSetToName.put(BitSetUtils.bitSetFrom(2), deletionsCovariateName); } - + public void addCovariate(CovariateValues covariate) { transposeCovariateValues(mismatchesKeySet, covariate.getMismatches()); transposeCovariateValues(insertionsKeySet, covariate.getInsertions()); - transposeCovariateValues(deletionsKeySet, covariate.getDeletions()); + transposeCovariateValues(deletionsKeySet, covariate.getDeletions()); nextCovariateIndex++; } - public static RecalDataManager.BaseRecalibrationType getErrorModelFromString(final String modelString) { - if (modelString.equals(mismatchesCovariateName)) - return RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION; - else if (modelString.equals(insertionsCovariateName)) - return RecalDataManager.BaseRecalibrationType.BASE_INSERTION; - else if (modelString.equals(deletionsCovariateName)) - return RecalDataManager.BaseRecalibrationType.BASE_DELETION; - throw new ReviewedStingException("Unrecognized Base Recalibration model string: " + modelString); + public static RecalDataManager.BaseRecalibrationType errorModelFrom(final String modelString) { + if (!nameToType.containsKey(modelString)) + throw new ReviewedStingException("Unrecognized Base Recalibration model string: " + modelString); + return nameToType.get(modelString); } - public Object[] getKeySet(final int readPosition, final RecalDataManager.BaseRecalibrationType errorModel) { + public static String eventNameFrom(final BitSet bitSet) { + if (!bitSetToName.containsKey(bitSet)) + throw new ReviewedStingException("Unrecognized Event Type BitSet: " + bitSet); + return bitSetToName.get(bitSet); + } + + public BitSet[] getKeySet(final int readPosition, final RecalDataManager.BaseRecalibrationType errorModel) { switch (errorModel) { case BASE_SUBSTITUTION: - return getMismatchesKeySet(readPosition); + return getMismatchesKeySet(readPosition); case BASE_INSERTION: - return getInsertionsKeySet(readPosition); + return getInsertionsKeySet(readPosition); case BASE_DELETION: - return getDeletionsKeySet(readPosition); + return getDeletionsKeySet(readPosition); default: - throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel ); + throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel); } } - public Object[] getMismatchesKeySet(int readPosition) { + public BitSet[] getMismatchesKeySet(int readPosition) { return mismatchesKeySet[readPosition]; } - public Object[] getInsertionsKeySet(int readPosition) { + public BitSet[] getInsertionsKeySet(int readPosition) { return insertionsKeySet[readPosition]; } - public Object[] getDeletionsKeySet(int readPosition) { + public BitSet[] getDeletionsKeySet(int readPosition) { return deletionsKeySet[readPosition]; } - private void transposeCovariateValues (Object [][] keySet, Object [] covariateValues) { - for (int i=0; i= 0) { while (iii >= 0 && bases[iii] == (byte) 'T') { - cycles[iii] = cycle; + cycles[iii] = BitSetUtils.bitSetFrom(cycle); iii--; } while (iii >= 0 && bases[iii] == (byte) 'A') { - cycles[iii] = cycle; + cycles[iii] = BitSetUtils.bitSetFrom(cycle); iii--; } while (iii >= 0 && bases[iii] == (byte) 'C') { - cycles[iii] = cycle; + cycles[iii] = BitSetUtils.bitSetFrom(cycle); iii--; } while (iii >= 0 && bases[iii] == (byte) 'G') { - cycles[iii] = cycle; + cycles[iii] = BitSetUtils.bitSetFrom(cycle); iii--; } if (iii >= 0) { @@ -181,7 +183,7 @@ public class CycleCovariate implements StandardCovariate { cycle++; } if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) { - cycles[iii] = cycle; + cycles[iii] = BitSetUtils.bitSetFrom(cycle); iii--; } } @@ -192,7 +194,7 @@ public class CycleCovariate implements StandardCovariate { else { throw new UserException("The platform (" + read.getReadGroup().getPlatform() + ") associated with read group " + read.getReadGroup() + " is not a recognized platform. Implemented options are e.g. illumina, 454, and solid"); } - + return new CovariateValues(cycles, cycles, cycles); } @@ -201,4 +203,14 @@ public class CycleCovariate implements StandardCovariate { public final Object getValue(final String str) { return Integer.parseInt(str); } + + @Override + public String keyFromBitSet(BitSet key) { + return String.format("%d", BitSetUtils.shortFrom(key)); + } + + @Override + public int numberOfBits() { + return BitSetUtils.numberOfBitsToRepresent(2 * Short.MAX_VALUE); // positive and negative + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java index 373210bdb..4f92b7fbc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java @@ -1,7 +1,10 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; +import org.broadinstitute.sting.utils.BitSetUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.util.BitSet; + /* * Copyright (c) 2009 The Broad Institute * @@ -37,6 +40,8 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; public class QualityScoreCovariate implements RequiredCovariate { + private final int MAX_QUAL = 50; + // Initialize any member variables using the command-line arguments passed to the walkers @Override public void initialize(final RecalibrationArgumentCollection RAC) { @@ -46,18 +51,18 @@ public class QualityScoreCovariate implements RequiredCovariate { public CovariateValues getValues(final GATKSAMRecord read) { int readLength = read.getReadLength(); - Integer [] mismatches = new Integer[readLength]; - Integer [] insertions = new Integer[readLength]; - Integer [] deletions = new Integer[readLength]; + BitSet[] mismatches = new BitSet[readLength]; + BitSet[] insertions = new BitSet[readLength]; + BitSet[] deletions = new BitSet[readLength]; - byte [] baseQualities = read.getBaseQualities(); - byte [] baseInsertionQualities = read.getBaseInsertionQualities(); - byte [] baseDeletionQualities = read.getBaseDeletionQualities(); + byte[] baseQualities = read.getBaseQualities(); + byte[] baseInsertionQualities = read.getBaseInsertionQualities(); + byte[] baseDeletionQualities = read.getBaseDeletionQualities(); - for (int i=0; i readGroupLookupTable = new HashMap(); private final HashMap readGroupReverseLookupTable = new HashMap(); private short nextId = 0; @@ -54,7 +56,7 @@ public class ReadGroupCovariate implements RequiredCovariate { final int l = read.getReadLength(); final String readGroupId = read.getReadGroup().getReadGroupId(); short shortId; - if (readGroupLookupTable.containsKey(readGroupId)) + if (readGroupLookupTable.containsKey(readGroupId)) shortId = readGroupLookupTable.get(readGroupId); else { shortId = nextId; @@ -62,8 +64,9 @@ public class ReadGroupCovariate implements RequiredCovariate { readGroupReverseLookupTable.put(nextId, readGroupId); nextId++; } - Short [] readGroups = new Short[l]; - Arrays.fill(readGroups, shortId); + BitSet rg = BitSetUtils.bitSetFrom(shortId); // All objects must output a BitSet, so we convert the "compressed" representation of the Read Group into a bitset + BitSet[] readGroups = new BitSet[l]; + Arrays.fill(readGroups, rg); return new CovariateValues(readGroups, readGroups, readGroups); } @@ -72,10 +75,20 @@ public class ReadGroupCovariate implements RequiredCovariate { public final Object getValue(final String str) { return str; } - + + @Override + public String keyFromBitSet(BitSet key) { + return decodeReadGroup((short) BitSetUtils.longFrom(key)); + } + public final String decodeReadGroup(final short id) { return readGroupReverseLookupTable.get(id); } + + @Override + public int numberOfBits() { + return BitSetUtils.numberOfBitsToRepresent(Short.MAX_VALUE); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java index cc60ac010..47284b098 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java @@ -58,18 +58,44 @@ public class RecalDataManager { private final HashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed private final HashMap> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed - public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores - public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams - public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams - public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color + public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores + public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams + public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams + public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color private static boolean warnUserNullPlatform = false; - private static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\ + private static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\ public enum BaseRecalibrationType { - BASE_SUBSTITUTION, - BASE_INSERTION, - BASE_DELETION + BASE_SUBSTITUTION(0, "M"), + BASE_INSERTION(1, "I"), + BASE_DELETION(2, "D"); + + public int index; + public String representation; + + private BaseRecalibrationType(int index, String representation) { + this.index = index; + this.representation = representation; + } + + public static BaseRecalibrationType eventFrom(int index) { + switch (index) { + case 0: + return BASE_SUBSTITUTION; + case 1: + return BASE_INSERTION; + case 2: + return BASE_DELETION; + default: + throw new ReviewedStingException(String.format("Event %d does not exist.", index)); + } + } + + @Override + public String toString() { + return representation; + } } public enum SOLID_RECAL_MODE { @@ -119,7 +145,7 @@ public class RecalDataManager { dataCollapsedReadGroup = new HashMap(); dataCollapsedQualityScore = new HashMap(); dataCollapsedByCovariate = new HashMap>(); - for ( final BaseRecalibrationType errorModel : BaseRecalibrationType.values() ) { + for (final BaseRecalibrationType errorModel : BaseRecalibrationType.values()) { dataCollapsedReadGroup.put(errorModel, new NestedHashMap()); dataCollapsedQualityScore.put(errorModel, new NestedHashMap()); dataCollapsedByCovariate.put(errorModel, new ArrayList()); @@ -136,10 +162,10 @@ public class RecalDataManager { } } - public static CovariateKeySet getAllCovariateValuesFor(GATKSAMRecord read) { + public static CovariateKeySet covariateKeySetFrom(GATKSAMRecord read) { return (CovariateKeySet) read.getTemporaryAttribute(COVARS_ATTRIBUTE); } - + /** * Add the given mapping to all of the collapsed hash tables * @@ -147,7 +173,7 @@ public class RecalDataManager { * @param fullDatum The RecalDatum which is the data for this mapping * @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table */ - public final void addToAllTables(final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN, final BaseRecalibrationType errorModel ) { + public final void addToAllTables(final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN, final BaseRecalibrationType errorModel) { // The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around //data.put(key, thisDatum); // add the mapping to the main table @@ -208,7 +234,7 @@ public class RecalDataManager { */ public final void generateEmpiricalQualities(final int smoothing, final int maxQual) { - for( final BaseRecalibrationType errorModel : BaseRecalibrationType.values() ) { + for (final BaseRecalibrationType errorModel : BaseRecalibrationType.values()) { recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.get(errorModel).data, smoothing, maxQual); recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.get(errorModel).data, smoothing, maxQual); for (NestedHashMap map : dataCollapsedByCovariate.get(errorModel)) { @@ -551,6 +577,7 @@ public class RecalDataManager { /** * Given the base and the color calculate the next base in the sequence * + * @param read the read * @param prevBase The base * @param color The color * @return The next base in the sequence @@ -615,11 +642,12 @@ public class RecalDataManager { * Computes all requested covariates for every offset in the given read * by calling covariate.getValues(..). * + * It populates an array of covariate values where result[i][j] is the covariate + * value for the ith position in the read and the jth covariate in + * reqeustedCovariates list. + * * @param read The read for which to compute covariate values. * @param requestedCovariates The list of requested covariates. - * @return An array of covariate values where result[i][j] is the covariate - * value for the ith position in the read and the jth covariate in - * reqeustedCovariates list. */ public static void computeCovariates(final GATKSAMRecord read, final List requestedCovariates) { final int numRequestedCovariates = requestedCovariates.size(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatumOptimized.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatumOptimized.java index 233380820..39807283a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatumOptimized.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatumOptimized.java @@ -94,7 +94,7 @@ public class RecalDatumOptimized { public final double empiricalQualDouble(final int smoothing, final double maxQual) { final double doubleMismatches = (double) (numMismatches + smoothing); final double doubleObservations = (double) (numObservations + smoothing); - double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); + double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); return Math.min(empiricalQual, maxQual); } @@ -106,9 +106,10 @@ public class RecalDatumOptimized { public final byte empiricalQualByte() { return empiricalQualByte(0); // 'default' behavior is to use smoothing value of zero - } + } - public final String outputToCSV() { + @Override + public final String toString() { return String.format("%d,%d,%d", numObservations, numMismatches, (int) empiricalQualByte()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index cc6f67cc9..7ef402083 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -30,7 +30,6 @@ import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.walkers.recalibration.CountCovariatesGatherer; import java.io.PrintStream; -import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -92,16 +91,6 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.") protected boolean RUN_WITHOUT_DBSNP = false; - ///////////////////////////// - // protected Member Variables - ///////////////////////////// - protected final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap used to create collapsed data hashmaps (delta delta tables) - protected final ArrayList requestedCovariates = new ArrayList();// A list to hold the covariate objects that were requested - - protected final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped. - protected final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed. - - /** * CountCovariates and TableRecalibration accept a --solid_recal_mode flag which governs how the recalibrator handles the * reads which have had the reference inserted because of color space inconsistencies. @@ -153,7 +142,6 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false) public byte DELETIONS_DEFAULT_QUALITY = 45; - @Hidden @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") public String DEFAULT_PLATFORM = null; @@ -161,5 +149,4 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; - } diff --git a/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java b/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java new file mode 100644 index 000000000..6d3493211 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java @@ -0,0 +1,284 @@ +package org.broadinstitute.sting.utils; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.ByteArrayOutputStream; +import java.io.ObjectOutputStream; +import java.util.BitSet; + +/** + * Utilities for bitset conversion + * + * @author Mauricio Carneiro + * @since 3/5/12 + */ +public class BitSetUtils { + + static final private int MAX_DNA_CONTEXT = 31; // the maximum context size (number of bases) permitted in the "long bitset" implementation of the DNA <=> BitSet conversion. + static final private byte NBITS_LONG_REPRESENTATION = 64; // the number of bits used in the long version to represent the bit set (necessary for the two's complement representation of negative numbers) + static final private byte NBITS_SHORT_REPRESENTATION = 16; // the number of bits used in the short version to represent the bit set (necessary for the two's complement representation of negative numbers) + static final long[] combinationsPerLength = new long[MAX_DNA_CONTEXT + 1]; // keeps the memoized table with the number of combinations for each given DNA context length + + /** + * Creates an long out of a bitset + * + * @param bitSet the bitset + * @return a long from the bitset representation + */ + public static long longFrom(final BitSet bitSet) { + return longFrom(bitSet, NBITS_LONG_REPRESENTATION); + } + + /** + * Creates a short integer from a bitset + * + * @param bitSet the bitset + * @return a short from the bitset representation + */ + public static short shortFrom(final BitSet bitSet) { + return (short) longFrom(bitSet, NBITS_SHORT_REPRESENTATION); + } + + /** + * Cretes an integer with any number of bits (up to 64 -- long precision) from a bitset + * + * @param bitSet the bitset + * @param nBits the number of bits to be used for this representation + * @return an integer with nBits from the bitset representation + */ + public static long longFrom(final BitSet bitSet, final int nBits) { + long number = 0; + for (int bitIndex = bitSet.nextSetBit(0); bitIndex >= 0 && bitIndex <= nBits; bitIndex = bitSet.nextSetBit(bitIndex + 1)) + number |= 1L << bitIndex; + + return number; + } + + /** + * Creates a BitSet representation of a given long + * + * @param number the number to turn into a bitset + * @return a bitset representation of the long + */ + public static BitSet bitSetFrom(long number) { + return bitSetFrom(number, NBITS_LONG_REPRESENTATION); + } + + /** + * Creates a BitSet representation of a given short + * + * @param number the number to turn into a bitset + * @return a bitset representation of the short + */ + public static BitSet bitSetFrom(short number) { + return bitSetFrom(number, NBITS_SHORT_REPRESENTATION); + } + + /** + * Creates a BitSet representation of an arbitrary integer (number of bits capped at 64 -- long precision) + * + * @param number the number to turn into a bitset + * @param nBits the number of bits to use as precision for this conversion + * @return a bitset representation of the integer + */ + public static BitSet bitSetFrom(long number, int nBits) { + BitSet bitSet = new BitSet(); + boolean isNegative = number < 0; + int bitIndex = 0; + while (number != 0) { + if (number % 2 != 0) + bitSet.set(bitIndex); + bitIndex++; + number /= 2; + } + if (isNegative) { + boolean foundFirstSetBit = false; + for (int i = bitSet.nextSetBit(0); i < nBits && i >= 0; i++) { + boolean bit = bitSet.get(i); + if (!foundFirstSetBit && bit) + foundFirstSetBit = true; // maintain all bits until the first 1 is found (inclusive) + else if (foundFirstSetBit) + bitSet.flip(i); // flip every other bit up to NBITS_REPRESENTATION + } + } + return bitSet; + } + + /** + * Converts a BitSet into the dna string representation. + * + * Warning: This conversion is limited to long precision, therefore the dna sequence cannot + * be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create + * a bitSetFrom(BigNumber) method. + * + * We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the + * base_10 representation of the sequence. This is important for us to know how to bring the number + * to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented + * as 0's and leading 0's are omitted). + * + * quasi-canonical because A is represented by a 0, therefore, + * instead of : 0, 1, 2, 3, 10, 11, 12, ... + * we have : 0, 1, 2, 3, 00, 01, 02, ... + * + * but we can correctly decode it because we know the final length. + * + * @param bitSet the bitset representation of the dna sequence + * @return the dna sequence represented by the bitset + */ + public static String dnaFrom(final BitSet bitSet) { + long number = longFrom(bitSet); // the base_10 representation of the bit set + if (number < 0) + throw new ReviewedStingException("dna conversion cannot handle negative numbers. Possible overflow?"); + + int length = contextLengthFor(number); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls) + number -= combinationsFor(length - 1); // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation + + String dna = ""; + while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical) + byte base = (byte) (number % 4); + switch (base) { + case 0: + dna = "A" + dna; + break; + case 1: + dna = "C" + dna; + break; + case 2: + dna = "G" + dna; + break; + case 3: + dna = "T" + dna; + break; + } + number /= 4; + } + for (int j = dna.length(); j < length; j++) + dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above) + + return dna; + } + + /** + * Creates a BitSet representation of a given dna string. + * + * Warning: This conversion is limited to long precision, therefore the dna sequence cannot + * be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create + * a bitSetFrom(BigNumber) method. + * + * The bit representation of a dna string is the simple: + * 0 A 4 AA 8 CA + * 1 C 5 AC ... + * 2 G 6 AG 1343 TTGGT + * 3 T 7 AT 1364 TTTTT + * + * To convert from dna to number, we convert the dna string to base10 and add all combinations that + * preceded the string (with smaller lengths). + * + * @param dna the dna sequence + * @return the bitset representing the dna sequence + */ + public static BitSet bitSetFrom(String dna) { + if (dna.length() > MAX_DNA_CONTEXT) + throw new ReviewedStingException(String.format("DNA Length cannot be bigger than %d. dna: %s (%d)", MAX_DNA_CONTEXT, dna, dna.length())); + + long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set + long preContext = combinationsFor(dna.length() - 1); // the sum of all combinations that preceded the length of the dna string + for (int i = 0; i < dna.length(); i++) { + baseTen *= 4; + switch (dna.charAt(i)) { + case 'A': + baseTen += 0; + break; + case 'C': + baseTen += 1; + break; + case 'G': + baseTen += 2; + break; + case 'T': + baseTen += 3; + break; + } + } + return bitSetFrom(baseTen + preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length. + } + + /** + * Calculates the number of bits necessary to represent a given number of elements + * + * @param numberOfElements the number of elements to represent (must be positive) + * @return the number of bits necessary to represent this many elements + */ + public static int numberOfBitsToRepresent(long numberOfElements) { + if (numberOfElements < 0) + throw new ReviewedStingException("Number of elements must be positive: " + numberOfElements); + + if (numberOfElements == 1L) + return 1; // special case + + int n = 0; + numberOfElements--; + while (numberOfElements > 0) { + numberOfElements = numberOfElements >> 1; + n++; + } + return n; + } + + /** + * Calculates the length of the DNA context for a given base 10 number + * + * It is important to know the length given the base 10 number to calculate the number of combinations + * and to disambiguate the "quasi-canonical" state. + * + * This method also calculates the number of combinations as a by-product, but since it memoizes the + * results, a subsequent call to combinationsFor(length) is O(1). + * + * @param number the base 10 representation of the bitset + * @return the length of the DNA context represented by this number + */ + private static int contextLengthFor(long number) { + int length = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet. + long combinations = combinationsFor(length); // the next context (we advance it so we know which one was preceding it). + while (combinations <= number) { // find the length of the dna string (length) + length++; + combinations = combinationsFor(length); // calculate the next context + } + return length; + } + + /** + * The sum of all combinations of a context of a given length from length = 0 to length. + * + * Memoized implementation of sum(4^i) , where i=[0,length] + * + * @param length the length of the DNA context + * @return the sum of all combinations leading up to this context length. + */ + private static long combinationsFor(int length) { + if (length > MAX_DNA_CONTEXT) + throw new ReviewedStingException(String.format("Context cannot be longer than %d bases but requested %d.", MAX_DNA_CONTEXT, length)); + + // only calculate the number of combinations if the table hasn't already cached the value + if (length > 0 && combinationsPerLength[length] == 0) { + long combinations = 0L; + for (int i = 1; i <= length; i++) + combinations += (1L << 2 * i); // add all combinations with 4^i ( 4^i is the same as 2^(2*i) ) + combinationsPerLength[length] = combinations; + } + return combinationsPerLength[length]; + } + + + public static byte[] sizeOf(Object obj) throws java.io.IOException + { + ByteArrayOutputStream byteObject = new ByteArrayOutputStream(); + ObjectOutputStream objectOutputStream = new ObjectOutputStream(byteObject); + objectOutputStream.writeObject(obj); + objectOutputStream.flush(); + objectOutputStream.close(); + byteObject.close(); + + return byteObject.toByteArray(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 90b5630b6..bfc326d2d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -29,7 +29,6 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import java.math.BigDecimal; @@ -1527,124 +1526,4 @@ public class MathUtils { } - /** - * Creates an integer out of a bitset - * - * @param bitSet the bitset - * @return an integer with the bitset representation - */ - public static long intFrom(final BitSet bitSet) { - long number = 0; - for (int bitIndex = bitSet.nextSetBit(0); bitIndex >= 0; bitIndex = bitSet.nextSetBit(bitIndex+1)) - number |= 1L << bitIndex; - - return number; - } - - /** - * Creates a BitSet representation of a given integer - * - * @param number the number to turn into a bitset - * @return a bitset representation of the integer - */ - public static BitSet bitSetFrom(long number) { - BitSet bitSet = new BitSet(); - int bitIndex = 0; - while (number > 0) { - if (number%2 > 0) - bitSet.set(bitIndex); - bitIndex++; - number /= 2; - } - return bitSet; - } - - /** - * Converts a BitSet into the dna string representation. - * - * Warning: This conversion is limited to long precision, therefore the dna sequence cannot - * be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create - * a bitSetFrom(BigNumber) method. - * - * We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the - * base_10 representation of the sequence. This is important for us to know how to bring the number - * to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented - * as 0's and leading 0's are omitted). - * - * quasi-canonical because A is represented by a 0, therefore, - * instead of : 0, 1, 2, 3, 10, 11, 12, ... - * we have : 0, 1, 2, 3, 00, 01, 02, ... - * - * but we can correctly decode it because we know the final length. - * - * @param bitSet the bitset representation of the dna sequence - * @return the dna sequence represented by the bitset - */ - public static String dnaFrom(final BitSet bitSet) { - long number = intFrom(bitSet); // the base_10 representation of the bit set - long preContext = 0; // the number of combinations skipped to get to the quasi-canonical representation (we keep it to subtract later) - long nextContext = 4; // the next context (we advance it so we know which one was preceding it). - int i = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet. - while (nextContext <= number) { // find the length of the dna string (i) - preContext = nextContext; // keep track of the number of combinations in the preceding context - nextContext += Math.pow(4, ++i);// calculate the next context - } - number -= preContext; // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation - - String dna = ""; - while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical) - byte base = (byte) (number % 4); - switch (base) { - case 0 : dna = "A" + dna; break; - case 1 : dna = "C" + dna; break; - case 2 : dna = "G" + dna; break; - case 3 : dna = "T" + dna; break; - } - number /= 4; - } - for (int j = dna.length(); j < i; j++) - dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above) - - return dna; - } - - /** - * Creates a BitSet representation of a given dna string. - * - * Warning: This conversion is limited to long precision, therefore the dna sequence cannot - * be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create - * a bitSetFrom(BigNumber) method. - * - * The bit representation of a dna string is the simple: - * 0 A 4 AA 8 CA - * 1 C 5 AC ... - * 2 G 6 AG 1343 TTGGT - * 3 T 7 AT 1364 TTTTT - * - * To convert from dna to number, we convert the dna string to base10 and add all combinations that - * preceded the string (with smaller lengths). - * - * @param dna the dna sequence - * @return the bitset representing the dna sequence - */ - public static BitSet bitSetFrom(String dna) { - if (dna.length() > 31) - throw new ReviewedStingException(String.format("DNA Length cannot be bigger than 31. dna: %s (%d)", dna, dna.length())); - - long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set - long preContext = 0; // the sum of all combinations that preceded the length of the dna string - for (int i=0; i0) - preContext += Math.pow(4, i); // each length will have 4^i combinations (e.g 1 = 4, 2 = 16, 3 = 64, ...) - } - - return bitSetFrom(baseTen+preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length. - } } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 74083ced2..be50d3174 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -43,7 +43,7 @@ import java.util.regex.Pattern; /** * Utility methods to facilitate on-the-fly base quality score recalibration. - * + * * User: rpoplin * Date: 2/4/12 */ @@ -58,7 +58,7 @@ public class BaseRecalibration { private static final int MAX_QUALITY_SCORE = 65; //BUGBUG: what value to use here? private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(...) for all sets of covariate values. - public BaseRecalibration( final File RECAL_FILE ) { + public BaseRecalibration(final File RECAL_FILE) { // Get a list of all available covariates final List> classes = new PluginManager(Covariate.class).getPlugins(); @@ -68,27 +68,29 @@ public class BaseRecalibration { // Read in the data from the csv file and populate the data map and covariates list boolean sawEOF = false; try { - for ( String line : new XReadLines(RECAL_FILE) ) { + for (String line : new XReadLines(RECAL_FILE)) { lineNumber++; - if ( EOF_MARKER.equals(line) ) { + if (EOF_MARKER.equals(line)) { sawEOF = true; - } else if( COMMENT_PATTERN.matcher(line).matches() ) { + } + else if (COMMENT_PATTERN.matcher(line).matches()) { ; // Skip over the comment lines, (which start with '#') } // Read in the covariates that were used from the input file - else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data - if( foundAllCovariates ) { - throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE ); - } else { // Found the covariate list in input file, loop through all of them and instantiate them + else if (COVARIATE_PATTERN.matcher(line).matches()) { // The line string is either specifying a covariate or is giving csv data + if (foundAllCovariates) { + throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE); + } + else { // Found the covariate list in input file, loop through all of them and instantiate them String[] vals = line.split(","); - for( int iii = 0; iii < vals.length - 4; iii++ ) { // There are n-4 covariates. The last four items are ErrorModel, nObservations, nMismatch, and Qempirical + for (int iii = 0; iii < vals.length - 4; iii++) { // There are n-4 covariates. The last four items are ErrorModel, nObservations, nMismatch, and Qempirical boolean foundClass = false; - for( Class covClass : classes ) { - if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) { + for (Class covClass : classes) { + if ((vals[iii] + "Covariate").equalsIgnoreCase(covClass.getSimpleName())) { foundClass = true; try { - Covariate covariate = (Covariate)covClass.newInstance(); - requestedCovariates.add( covariate ); + Covariate covariate = (Covariate) covClass.newInstance(); + requestedCovariates.add(covariate); } catch (Exception e) { throw new DynamicClassResolutionException(covClass, e); } @@ -96,63 +98,65 @@ public class BaseRecalibration { } } - if( !foundClass ) { - throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." ); + if (!foundClass) { + throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option."); } } } - } else { // Found a line of data - if( !foundAllCovariates ) { + } + else { // Found a line of data + if (!foundAllCovariates) { foundAllCovariates = true; // At this point all the covariates should have been found and initialized - if( requestedCovariates.size() < 2 ) { - throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE ); + if (requestedCovariates.size() < 2) { + throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE); } final boolean createCollapsedTables = true; // Initialize any covariate member variables using the shared argument collection RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - for( Covariate cov : requestedCovariates ) { - cov.initialize( RAC ); + for (Covariate cov : requestedCovariates) { + cov.initialize(RAC); } // Initialize the data hashMaps - dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() ); + dataManager = new RecalDataManager(createCollapsedTables, requestedCovariates.size()); } addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap } } - } catch ( FileNotFoundException e ) { + } catch (FileNotFoundException e) { throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e); - } catch ( NumberFormatException e ) { + } catch (NumberFormatException e) { throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker."); } - if ( !sawEOF ) { + if (!sawEOF) { final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool."; throw new UserException.MalformedFile(RECAL_FILE, errorMessage); } - if( dataManager == null ) { + if (dataManager == null) { throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?"); } - dataManager.generateEmpiricalQualities( 1, MAX_QUALITY_SCORE ); + dataManager.generateEmpiricalQualities(1, MAX_QUALITY_SCORE); } - + /** * For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches) + * * @param line A line of CSV data read from the recalibration table data file */ private void addCSVData(final File file, final String line) { final String[] vals = line.split(","); // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly - if( vals.length != requestedCovariates.size() + 4 ) { // +4 because of ErrorModel, nObservations, nMismatch, and Qempirical + if (vals.length != requestedCovariates.size() + 4) { // +4 because of ErrorModel, nObservations, nMismatch, and Qempirical throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line + " --Perhaps the read group string contains a comma and isn't being parsed correctly."); } @@ -160,48 +164,48 @@ public class BaseRecalibration { final Object[] key = new Object[requestedCovariates.size()]; Covariate cov; int iii; - for( iii = 0; iii < requestedCovariates.size(); iii++ ) { - cov = requestedCovariates.get( iii ); - key[iii] = cov.getValue( vals[iii] ); + for (iii = 0; iii < requestedCovariates.size(); iii++) { + cov = requestedCovariates.get(iii); + key[iii] = cov.getValue(vals[iii]); } final String modelString = vals[iii++]; - final RecalDataManager.BaseRecalibrationType errorModel = CovariateKeySet.getErrorModelFromString(modelString); + final RecalDataManager.BaseRecalibrationType errorModel = CovariateKeySet.errorModelFrom(modelString); // Create a new datum using the number of observations, number of mismatches, and reported quality score - final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 ); + final RecalDatum datum = new RecalDatum(Long.parseLong(vals[iii]), Long.parseLong(vals[iii + 1]), Double.parseDouble(vals[1]), 0.0); // Add that datum to all the collapsed tables which will be used in the sequential calculation - - dataManager.addToAllTables( key, datum, QualityUtils.MIN_USABLE_Q_SCORE, errorModel ); //BUGBUG: used to be Q5 now is Q6, probably doesn't matter + + dataManager.addToAllTables(key, datum, QualityUtils.MIN_USABLE_Q_SCORE, errorModel); //BUGBUG: used to be Q5 now is Q6, probably doesn't matter } - - public void recalibrateRead( final GATKSAMRecord read ) { + + public void recalibrateRead(final GATKSAMRecord read) { //compute all covariate values for this read RecalDataManager.computeCovariates(read, requestedCovariates); - final CovariateKeySet covariateKeySet = RecalDataManager.getAllCovariateValuesFor( read ); + final CovariateKeySet covariateKeySet = RecalDataManager.covariateKeySetFrom(read); - for( final RecalDataManager.BaseRecalibrationType errorModel : RecalDataManager.BaseRecalibrationType.values() ) { - final byte[] originalQuals = read.getBaseQualities( errorModel ); + for (final RecalDataManager.BaseRecalibrationType errorModel : RecalDataManager.BaseRecalibrationType.values()) { + final byte[] originalQuals = read.getBaseQualities(errorModel); final byte[] recalQuals = originalQuals.clone(); // For each base in the read - for( int offset = 0; offset < read.getReadLength(); offset++ ) { - + for (int offset = 0; offset < read.getReadLength(); offset++) { + final Object[] fullCovariateKeyWithErrorMode = covariateKeySet.getKeySet(offset, errorModel); - final Object[] fullCovariateKey = Arrays.copyOfRange(fullCovariateKeyWithErrorMode, 0, fullCovariateKeyWithErrorMode.length-1); // need to strip off the error mode which was appended to the list of covariates + final Object[] fullCovariateKey = Arrays.copyOfRange(fullCovariateKeyWithErrorMode, 0, fullCovariateKeyWithErrorMode.length - 1); // need to strip off the error mode which was appended to the list of covariates // BUGBUG: This caching seems to put the entire key set into memory which negates the benefits of storing the delta delta tables? //Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKeyWithErrorMode); //if( qualityScore == null ) { - final byte qualityScore = performSequentialQualityCalculation( errorModel, fullCovariateKey ); + final byte qualityScore = performSequentialQualityCalculation(errorModel, fullCovariateKey); // qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKeyWithErrorMode); //} - + recalQuals[offset] = qualityScore; } - - preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low - read.setBaseQualities( recalQuals, errorModel ); + + preserveQScores(originalQuals, recalQuals); // Overwrite the work done if original quality score is too low + read.setBaseQualities(recalQuals, errorModel); } } @@ -211,27 +215,28 @@ public class BaseRecalibration { * * Given the full recalibration table, we perform the following preprocessing steps: * - * - calculate the global quality score shift across all data [DeltaQ] - * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift - * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual - * - The final shift equation is: + * - calculate the global quality score shift across all data [DeltaQ] + * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift + * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual + * - The final shift equation is: + * + * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... ) * - * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... ) * @param key The list of Comparables that were calculated from the covariates * @return A recalibrated quality score as a byte */ - private byte performSequentialQualityCalculation( final RecalDataManager.BaseRecalibrationType errorModel, final Object... key ) { + private byte performSequentialQualityCalculation(final RecalDataManager.BaseRecalibrationType errorModel, final Object... key) { - final byte qualFromRead = (byte)Integer.parseInt(key[1].toString()); + final byte qualFromRead = (byte) Integer.parseInt(key[1].toString()); final Object[] readGroupCollapsedKey = new Object[1]; final Object[] qualityScoreCollapsedKey = new Object[2]; final Object[] covariateCollapsedKey = new Object[3]; // The global quality shift (over the read group only) readGroupCollapsedKey[0] = key[0]; - final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0, errorModel).get( readGroupCollapsedKey )); + final RecalDatum globalRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(0, errorModel).get(readGroupCollapsedKey)); double globalDeltaQ = 0.0; - if( globalRecalDatum != null ) { + if (globalRecalDatum != null) { final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality(); final double aggregrateQReported = globalRecalDatum.getEstimatedQReported(); globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported; @@ -240,9 +245,9 @@ public class BaseRecalibration { // The shift in quality between reported and empirical qualityScoreCollapsedKey[0] = key[0]; qualityScoreCollapsedKey[1] = key[1]; - final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1, errorModel).get( qualityScoreCollapsedKey )); + final RecalDatum qReportedRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(1, errorModel).get(qualityScoreCollapsedKey)); double deltaQReported = 0.0; - if( qReportedRecalDatum != null ) { + if (qReportedRecalDatum != null) { final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality(); deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; } @@ -252,27 +257,28 @@ public class BaseRecalibration { double deltaQCovariateEmpirical; covariateCollapsedKey[0] = key[0]; covariateCollapsedKey[1] = key[1]; - for( int iii = 2; iii < key.length; iii++ ) { - covariateCollapsedKey[2] = key[iii]; // The given covariate - final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii, errorModel).get( covariateCollapsedKey )); - if( covariateRecalDatum != null ) { + for (int iii = 2; iii < key.length; iii++) { + covariateCollapsedKey[2] = key[iii]; // The given covariate + final RecalDatum covariateRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(iii, errorModel).get(covariateCollapsedKey)); + if (covariateRecalDatum != null) { deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality(); - deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) ); + deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported)); } } final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; - return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE ); + return QualityUtils.boundQual((int) Math.round(newQuality), (byte) MAX_QUALITY_SCORE); } /** * Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold + * * @param originalQuals The list of original base quality scores - * @param recalQuals A list of the new recalibrated quality scores + * @param recalQuals A list of the new recalibrated quality scores */ - private void preserveQScores( final byte[] originalQuals, final byte[] recalQuals ) { - for( int iii = 0; iii < recalQuals.length; iii++ ) { - if( originalQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ) { //BUGBUG: used to be Q5 now is Q6, probably doesn't matter + private void preserveQScores(final byte[] originalQuals, final byte[] recalQuals) { + for (int iii = 0; iii < recalQuals.length; iii++) { + if (originalQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE) { //BUGBUG: used to be Q5 now is Q6, probably doesn't matter recalQuals[iii] = originalQuals[iii]; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 648dafb81..41bf74e4b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -213,8 +213,8 @@ public class GATKSAMRecord extends BAMRecord { byte[] quals = SAMUtils.fastqToPhred( getStringAttribute( BQSR_BASE_DELETION_QUALITIES ) ); if( quals == null ) { quals = new byte[getBaseQualities().length]; - Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will - // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 + Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will + // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 setBaseQualities(quals, RecalDataManager.BaseRecalibrationType.BASE_DELETION); } return quals; diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 91389f0bf..9d731e489 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.sam; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.collections.Pair; @@ -495,7 +496,7 @@ public class ReadUtils { /** * Is a base inside a read? * - * @param read the read to evaluate + * @param read the read to evaluate * @param referenceCoordinate the reference coordinate of the base to test * @return true if it is inside the read, false otherwise. */ @@ -541,9 +542,9 @@ public class ReadUtils { * * See getCoverageDistributionOfRead for information on how the coverage is calculated. * - * @param list the list of reads covering the region + * @param list the list of reads covering the region * @param startLocation the first reference coordinate of the region (inclusive) - * @param stopLocation the last reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) * @return an array with the coverage of each position from startLocation to stopLocation */ public static int [] getCoverageDistributionOfReads(List list, int startLocation, int stopLocation) { @@ -563,9 +564,9 @@ public class ReadUtils { * Note: This function counts DELETIONS as coverage (since the main purpose is to downsample * reads for variant regions, and deletions count as variants) * - * @param read the read to get the coverage distribution of + * @param read the read to get the coverage distribution of * @param startLocation the first reference coordinate of the region (inclusive) - * @param stopLocation the last reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) * @return an array with the coverage of each position from startLocation to stopLocation */ public static int [] getCoverageDistributionOfRead(GATKSAMRecord read, int startLocation, int stopLocation) { @@ -611,9 +612,9 @@ public class ReadUtils { * Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with value==true meaning it contributes to the coverage. * Example: Read => {true, true, false, ... false} * - * @param readList the list of reads to generate the association mappings + * @param readList the list of reads to generate the association mappings * @param startLocation the first reference coordinate of the region (inclusive) - * @param stopLocation the last reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) * @return the two hashmaps described above */ public static Pair> , HashMap> getBothReadToLociMappings (List readList, int startLocation, int stopLocation) { @@ -622,7 +623,6 @@ public class ReadUtils { HashMap> locusToReadMap = new HashMap>(2*(stopLocation - startLocation + 1), 0.5f); HashMap readToLocusMap = new HashMap(2*readList.size(), 0.5f); - for (int i = startLocation; i <= stopLocation; i++) locusToReadMap.put(i, new HashSet()); // Initialize the locusToRead map with empty lists @@ -631,7 +631,7 @@ public class ReadUtils { int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation); - for (int i=0; i 0) { // Update the hash for this locus @@ -649,6 +649,55 @@ public class ReadUtils { return new Pair>, HashMap>(locusToReadMap, readToLocusMap); } + /** + * Create random read qualities + * + * @param length the length of the read + * @return an array with randomized base qualities between 0 and 50 + */ + public static byte[] createRandomReadQuals(int length) { + Random random = GenomeAnalysisEngine.getRandomGenerator(); + byte[] quals = new byte[length]; + for (int i = 0; i < length; i++) + quals[i] = (byte) random.nextInt(50); + return quals; + } + + /** + * Create random read qualities + * + * @param length the length of the read + * @param allowNs whether or not to allow N's in the read + * @return an array with randomized bases (A-N) with equal probability + */ + public static byte[] createRandomReadBases(int length, boolean allowNs) { + Random random = GenomeAnalysisEngine.getRandomGenerator(); + int numberOfBases = allowNs ? 5 : 4; + byte[] bases = new byte[length]; + for (int i = 0; i < length; i++) { + switch (random.nextInt(numberOfBases)) { + case 0: + bases[i] = 'A'; + break; + case 1: + bases[i] = 'C'; + break; + case 2: + bases[i] = 'G'; + break; + case 3: + bases[i] = 'T'; + break; + case 4: + bases[i] = 'N'; + break; + default: + throw new ReviewedStingException("Something went wrong, this is just impossible"); + } + } + return bases; + } + public static String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) { String[] sequenceRecordNames = new String[sequenceDictionary.size()]; int sequenceRecordIndex = 0; @@ -656,4 +705,5 @@ public class ReadUtils { sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName(); return Arrays.deepToString(sequenceRecordNames); } + } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java index aa6a72ef9..312ad252e 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java @@ -1,9 +1,9 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -12,37 +12,13 @@ import java.util.BitSet; import java.util.Random; /** - * Short one line description of the walker. - * - *

- * [Long description of the walker] - *

- * - * - *

Input

- *

- * [Description of the Input] - *

- * - *

Output

- *

- * [Description of the Output] - *

- * - *

Examples

- *
- *    java
- *      -jar GenomeAnalysisTK.jar
- *      -T [walker name]
- *  
- * * @author Mauricio Carneiro * @since 3/1/12 */ public class ContextCovariateUnitTest { ContextCovariate covariate; RecalibrationArgumentCollection RAC; - Random random; + Random random; @BeforeClass public void init() { @@ -55,49 +31,35 @@ public class ContextCovariateUnitTest { @Test(enabled = true) public void testSimpleContexts() { - byte [] quals = createRandomReadQuals(101); - byte [] bbases = createRandomReadBases(101); + byte[] quals = ReadUtils.createRandomReadQuals(10000); + byte[] bbases = ReadUtils.createRandomReadBases(10000, true); String bases = stringFrom(bbases); + // System.out.println("Read: " + bases); GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M"); CovariateValues values = covariate.getValues(read); - verifyCovariateArray((BitSet []) values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, bases); - verifyCovariateArray((BitSet []) values.getInsertions(), RAC.INSERTIONS_CONTEXT_SIZE, bases); - verifyCovariateArray((BitSet []) values.getDeletions(), RAC.DELETIONS_CONTEXT_SIZE, bases); + verifyCovariateArray(values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, bases); + verifyCovariateArray(values.getInsertions(), RAC.INSERTIONS_CONTEXT_SIZE, bases); + verifyCovariateArray(values.getDeletions(), RAC.DELETIONS_CONTEXT_SIZE, bases); } - + private void verifyCovariateArray(BitSet[] values, int contextSize, String bases) { - for (int i=0; i= contextSize) - Assert.assertEquals(MathUtils.dnaFrom(values[i]), bases.substring(i-contextSize, i)); - else - Assert.assertNull(values[i]); + for (int i = 0; i < values.length; i++) { + String expectedContext = covariate.NO_CONTEXT_VALUE; + if (i >= contextSize) { + String context = bases.substring(i - contextSize, i); + if (!context.contains("N")) + expectedContext = context; + } + // System.out.println(String.format("Context [%d]:\n%s\n%s\n", i, covariate.keyFromBitSet(values[i]), expectedContext)); + Assert.assertEquals(covariate.keyFromBitSet(values[i]), expectedContext); } } - private String stringFrom(byte [] array) { + private String stringFrom(byte[] array) { String s = ""; for (byte value : array) s += (char) value; return s; } - private byte [] createRandomReadQuals(int length) { - byte [] quals = new byte[length]; - for (int i=0; i