BitSets keys to lower BQSR's memory footprint

Infrastructure:
	* Generic BitSet implementation with any precision (up to long)
	* Two's complement implementation of the bit set handles negative numbers (cycle covariate)
	* Memoized implementation of the BitSet utils for better performance.
	* All exponents are now calculated with bit shifts, fixing numerical precision issues with the double Math.pow.
	* Replace log/sqrt with bitwise logic to get rid of numerical issues

 BQSR:
	* All covariates output BitSets and have the functionality to decode them back into Object values.
	* Covariates are responsible for determining the size of the key they will use (number of bits).
	* Generalized KeyManager implementation combines any arbitrary number of covariates into one bitset key with event type
	* No more NestedHashMaps. Single key system now fits in one hash to reduce hash table objects overhead

 Tests:
	* Unit tests added to every method of BitSetUtils
	* Unit tests added to the generalized key system infrastructure of BQSRv2 (KeyManager)
	* Unit tests added to the cycle and context covariates (will add unit tests to all covariates)
This commit is contained in:
Mauricio Carneiro 2012-03-05 17:37:34 -05:00
parent e86ce8f3d6
commit ca11ab39e7
19 changed files with 868 additions and 457 deletions

View File

@ -26,7 +26,7 @@
package org.broadinstitute.sting.gatk.walkers.bqsr; package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.BitSetUtils;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -45,6 +45,9 @@ public class ContextCovariate implements StandardCovariate {
private int insertionsContextSize; private int insertionsContextSize;
private int deletionsContextSize; private int deletionsContextSize;
private final BitSet NO_CONTEXT_BITSET = BitSetUtils.bitSetFrom(-1L);
protected final String NO_CONTEXT_VALUE = "N"; // protected so we can UNIT TEST it
// Initialize any member variables using the command-line arguments passed to the walkers // Initialize any member variables using the command-line arguments passed to the walkers
@Override @Override
public void initialize(final RecalibrationArgumentCollection RAC) { public void initialize(final RecalibrationArgumentCollection RAC) {
@ -89,23 +92,34 @@ public class ContextCovariate implements StandardCovariate {
return str; return str;
} }
@Override
public String keyFromBitSet(BitSet key) {
if (key.equals(NO_CONTEXT_BITSET))
return NO_CONTEXT_VALUE;
return BitSetUtils.dnaFrom(key);
}
@Override
public int numberOfBits() {
return Long.bitCount(-1L);
}
/** /**
* calculates the context of a base independent of the covariate mode * calculates the context of a base independent of the covariate mode (mismatch, insertion or deletion)
* *
* @param bases the bases in the read to build the context from * @param bases the bases in the read to build the context from
* @param offset the position in the read to calculate the context for * @param offset the position in the read to calculate the context for
* @param contextSize context size to use building the context * @param contextSize context size to use building the context
* @return * @return the bitSet representing the Context
*/ */
private BitSet contextWith(byte[] bases, int offset, int contextSize) { private BitSet contextWith(byte[] bases, int offset, int contextSize) {
if (offset < contextSize) BitSet result = NO_CONTEXT_BITSET;
return null; if (offset >= contextSize) {
String context = new String(Arrays.copyOfRange(bases, offset - contextSize, offset)); String context = new String(Arrays.copyOfRange(bases, offset - contextSize, offset));
if (context.contains("N")) if (!context.contains("N"))
return null; result = BitSetUtils.bitSetFrom(context);
}
return MathUtils.bitSetFrom(context); return result;
} }
/** /**

View File

@ -2,6 +2,8 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.BitSet;
/* /*
* Copyright (c) 2009 The Broad Institute * Copyright (c) 2009 The Broad Institute
* *
@ -53,7 +55,29 @@ public interface Covariate {
*/ */
public CovariateValues getValues(GATKSAMRecord read); public CovariateValues getValues(GATKSAMRecord read);
public Object getValue(String str); // Used to get the covariate's value from input csv file during on-the-fly recalibration /**
* Used to get the covariate's value from input csv file during on-the-fly recalibration
*
* @param str the key in string type (read from the csv)
* @return the key in it's correct type.
*/
public Object getValue(String str);
/**
* Converts the bitset representation of the key (used internally for table indexing) to String format for file output.
*
* @param key the bitset representation of the key
* @return a string representation of the key
*/
public String keyFromBitSet(BitSet key);
/**
* Each covariate should determine how many bits are necessary to encode it's data
*
* @return The number of bits used to represent the values of this covariate.
*/
public int numberOfBits();
} }
interface RequiredCovariate extends Covariate {} interface RequiredCovariate extends Covariate {}

View File

@ -2,6 +2,9 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.BitSet;
import java.util.HashMap;
/** /**
* The object temporarily held by a read that describes all of it's covariates. * The object temporarily held by a read that describes all of it's covariates.
* *
@ -11,25 +14,40 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
* @since 2/8/12 * @since 2/8/12
*/ */
public class CovariateKeySet { public class CovariateKeySet {
private Object[][] mismatchesKeySet; private BitSet[][] mismatchesKeySet;
private Object[][] insertionsKeySet; private BitSet[][] insertionsKeySet;
private Object[][] deletionsKeySet; private BitSet[][] deletionsKeySet;
private int nextCovariateIndex; private int nextCovariateIndex;
private static String mismatchesCovariateName = "M"; // private static String mismatchesCovariateName = "M";
private static String insertionsCovariateName = "I"; // private static String insertionsCovariateName = "I";
private static String deletionsCovariateName = "D"; // private static String deletionsCovariateName = "D";
//
// private static BitSet mismatchesCovariateBitSet = BitSetUtils.bitSetFrom(0);
// private static BitSet insertionsCovariateBitSet = BitSetUtils.bitSetFrom(1);
// private static BitSet deletionsCovariateBitSet = BitSetUtils.bitSetFrom(2);
private static HashMap<String, RecalDataManager.BaseRecalibrationType> nameToType = new HashMap<String, RecalDataManager.BaseRecalibrationType>();
private static HashMap<BitSet, String> bitSetToName = new HashMap<BitSet, String>();
public CovariateKeySet(int readLength, int numberOfCovariates) { public CovariateKeySet(int readLength, int numberOfCovariates) {
numberOfCovariates++; // +1 because we are adding the mismatch covariate (to comply with the molten table format) // numberOfCovariates++; // +1 because we are adding the mismatch covariate (to comply with the molten table format)
this.mismatchesKeySet = new Object[readLength][numberOfCovariates]; this.mismatchesKeySet = new BitSet[readLength][numberOfCovariates];
this.insertionsKeySet = new Object[readLength][numberOfCovariates]; this.insertionsKeySet = new BitSet[readLength][numberOfCovariates];
this.deletionsKeySet = new Object[readLength][numberOfCovariates]; this.deletionsKeySet = new BitSet[readLength][numberOfCovariates];
initializeCovariateKeySet(this.mismatchesKeySet, mismatchesCovariateName); // initializeCovariateKeySet(this.mismatchesKeySet, mismatchesCovariateBitSet);
initializeCovariateKeySet(this.insertionsKeySet, insertionsCovariateName); // initializeCovariateKeySet(this.insertionsKeySet, insertionsCovariateBitSet);
initializeCovariateKeySet(this.deletionsKeySet, deletionsCovariateName); // initializeCovariateKeySet(this.deletionsKeySet, deletionsCovariateBitSet);
this.nextCovariateIndex = 0; this.nextCovariateIndex = 0;
// nameToType.put(mismatchesCovariateName, RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION);
// nameToType.put(insertionsCovariateName, RecalDataManager.BaseRecalibrationType.BASE_INSERTION);
// nameToType.put(deletionsCovariateName, RecalDataManager.BaseRecalibrationType.BASE_DELETION);
//
// bitSetToName.put(BitSetUtils.bitSetFrom(0), mismatchesCovariateName);
// bitSetToName.put(BitSetUtils.bitSetFrom(1), insertionsCovariateName);
// bitSetToName.put(BitSetUtils.bitSetFrom(2), deletionsCovariateName);
} }
public void addCovariate(CovariateValues covariate) { public void addCovariate(CovariateValues covariate) {
@ -39,17 +57,19 @@ public class CovariateKeySet {
nextCovariateIndex++; nextCovariateIndex++;
} }
public static RecalDataManager.BaseRecalibrationType getErrorModelFromString(final String modelString) { public static RecalDataManager.BaseRecalibrationType errorModelFrom(final String modelString) {
if (modelString.equals(mismatchesCovariateName)) if (!nameToType.containsKey(modelString))
return RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION;
else if (modelString.equals(insertionsCovariateName))
return RecalDataManager.BaseRecalibrationType.BASE_INSERTION;
else if (modelString.equals(deletionsCovariateName))
return RecalDataManager.BaseRecalibrationType.BASE_DELETION;
throw new ReviewedStingException("Unrecognized Base Recalibration model string: " + modelString); throw new ReviewedStingException("Unrecognized Base Recalibration model string: " + modelString);
return nameToType.get(modelString);
} }
public Object[] getKeySet(final int readPosition, final RecalDataManager.BaseRecalibrationType errorModel) { public static String eventNameFrom(final BitSet bitSet) {
if (!bitSetToName.containsKey(bitSet))
throw new ReviewedStingException("Unrecognized Event Type BitSet: " + bitSet);
return bitSetToName.get(bitSet);
}
public BitSet[] getKeySet(final int readPosition, final RecalDataManager.BaseRecalibrationType errorModel) {
switch (errorModel) { switch (errorModel) {
case BASE_SUBSTITUTION: case BASE_SUBSTITUTION:
return getMismatchesKeySet(readPosition); return getMismatchesKeySet(readPosition);
@ -62,24 +82,24 @@ public class CovariateKeySet {
} }
} }
public Object[] getMismatchesKeySet(int readPosition) { public BitSet[] getMismatchesKeySet(int readPosition) {
return mismatchesKeySet[readPosition]; return mismatchesKeySet[readPosition];
} }
public Object[] getInsertionsKeySet(int readPosition) { public BitSet[] getInsertionsKeySet(int readPosition) {
return insertionsKeySet[readPosition]; return insertionsKeySet[readPosition];
} }
public Object[] getDeletionsKeySet(int readPosition) { public BitSet[] getDeletionsKeySet(int readPosition) {
return deletionsKeySet[readPosition]; return deletionsKeySet[readPosition];
} }
private void transposeCovariateValues (Object [][] keySet, Object [] covariateValues) { private void transposeCovariateValues(BitSet[][] keySet, BitSet[] covariateValues) {
for (int i = 0; i < covariateValues.length; i++) for (int i = 0; i < covariateValues.length; i++)
keySet[i][nextCovariateIndex] = covariateValues[i]; keySet[i][nextCovariateIndex] = covariateValues[i];
} }
private void initializeCovariateKeySet (Object[][] keySet, String covariateName) { private void initializeCovariateKeySet(BitSet[][] keySet, BitSet covariateName) {
int readLength = keySet.length; int readLength = keySet.length;
int lastCovariateIndex = keySet[0].length - 1; int lastCovariateIndex = keySet[0].length - 1;
for (int i = 0; i < readLength; i++) for (int i = 0; i < readLength; i++)

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.bqsr; package org.broadinstitute.sting.gatk.walkers.bqsr;
import java.util.BitSet;
/** /**
* An object to hold the different covariate values for all bases in the read. * An object to hold the different covariate values for all bases in the read.
* *
@ -12,25 +14,25 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
* @since 2/8/12 * @since 2/8/12
*/ */
public class CovariateValues { public class CovariateValues {
private Object[] mismatches; private BitSet[] mismatches;
private Object[] insertions; private BitSet[] insertions;
private Object[] deletions; private BitSet[] deletions;
public CovariateValues(Object[] mismatch, Object[] insertion, Object[] deletion) { public CovariateValues(BitSet[] mismatch, BitSet[] insertion, BitSet[] deletion) {
this.mismatches = mismatch; this.mismatches = mismatch;
this.insertions = insertion; this.insertions = insertion;
this.deletions = deletion; this.deletions = deletion;
} }
public Object[] getMismatches() { public BitSet[] getMismatches() {
return mismatches; return mismatches;
} }
public Object[] getInsertions() { public BitSet[] getInsertions() {
return insertions; return insertions;
} }
public Object[] getDeletions() { public BitSet[] getDeletions() {
return deletions; return deletions;
} }

View File

@ -1,10 +1,12 @@
package org.broadinstitute.sting.gatk.walkers.bqsr; package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.BitSetUtils;
import org.broadinstitute.sting.utils.NGSPlatform; import org.broadinstitute.sting.utils.NGSPlatform;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.BitSet;
import java.util.EnumSet; import java.util.EnumSet;
/* /*
@ -59,13 +61,13 @@ public class CycleCovariate implements StandardCovariate {
// Used to pick out the covariate's value from attributes of the read // Used to pick out the covariate's value from attributes of the read
@Override @Override
public CovariateValues getValues(final GATKSAMRecord read) { public CovariateValues getValues(final GATKSAMRecord read) {
Integer [] cycles = new Integer[read.getReadLength()]; BitSet[] cycles = new BitSet[read.getReadLength()];
final NGSPlatform ngsPlatform = read.getNGSPlatform(); final NGSPlatform ngsPlatform = read.getNGSPlatform();
// Discrete cycle platforms // Discrete cycle platforms
if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) { if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) {
final int init; final short init;
final int increment; final short increment;
if (!read.getReadNegativeStrandFlag()) { if (!read.getReadNegativeStrandFlag()) {
// Differentiate between first and second of pair. // Differentiate between first and second of pair.
// The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group
@ -88,19 +90,19 @@ public class CycleCovariate implements StandardCovariate {
else { else {
if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) { if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) {
//second of pair, negative strand //second of pair, negative strand
init = -read.getReadLength(); init = (short) -read.getReadLength();
increment = 1; increment = 1;
} }
else { else {
//first of pair, negative strand //first of pair, negative strand
init = read.getReadLength(); init = (short) read.getReadLength();
increment = -1; increment = -1;
} }
} }
int cycle = init; short cycle = init;
for (int i = 0; i < read.getReadLength(); i++) { for (int i = 0; i < read.getReadLength(); i++) {
cycles[i] = cycle; cycles[i] = BitSetUtils.bitSetFrom(cycle);
cycle += increment; cycle += increment;
} }
} }
@ -119,7 +121,7 @@ public class CycleCovariate implements StandardCovariate {
// the current sequential model would consider the effects independently instead of jointly. // the current sequential model would consider the effects independently instead of jointly.
final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag(); final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag();
int cycle = multiplyByNegative1 ? -1 : 1; short cycle = multiplyByNegative1 ? (short) -1 : 1;
// BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change
// For example, AAAAAAA was probably read in two flow cycles but here we count it as one // For example, AAAAAAA was probably read in two flow cycles but here we count it as one
@ -127,19 +129,19 @@ public class CycleCovariate implements StandardCovariate {
int iii = 0; int iii = 0;
while (iii < readLength) { while (iii < readLength) {
while (iii < readLength && bases[iii] == (byte) 'T') { while (iii < readLength && bases[iii] == (byte) 'T') {
cycles[iii] = cycle; cycles[iii] = BitSetUtils.bitSetFrom(cycle);
iii++; iii++;
} }
while (iii < readLength && bases[iii] == (byte) 'A') { while (iii < readLength && bases[iii] == (byte) 'A') {
cycles[iii] = cycle; cycles[iii] = BitSetUtils.bitSetFrom(cycle);
iii++; iii++;
} }
while (iii < readLength && bases[iii] == (byte) 'C') { while (iii < readLength && bases[iii] == (byte) 'C') {
cycles[iii] = cycle; cycles[iii] = BitSetUtils.bitSetFrom(cycle);
iii++; iii++;
} }
while (iii < readLength && bases[iii] == (byte) 'G') { while (iii < readLength && bases[iii] == (byte) 'G') {
cycles[iii] = cycle; cycles[iii] = BitSetUtils.bitSetFrom(cycle);
iii++; iii++;
} }
if (iii < readLength) { if (iii < readLength) {
@ -149,7 +151,7 @@ public class CycleCovariate implements StandardCovariate {
cycle++; cycle++;
} }
if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) { if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) {
cycles[iii] = cycle; cycles[iii] = BitSetUtils.bitSetFrom(cycle);
iii++; iii++;
} }
@ -159,19 +161,19 @@ public class CycleCovariate implements StandardCovariate {
int iii = readLength - 1; int iii = readLength - 1;
while (iii >= 0) { while (iii >= 0) {
while (iii >= 0 && bases[iii] == (byte) 'T') { while (iii >= 0 && bases[iii] == (byte) 'T') {
cycles[iii] = cycle; cycles[iii] = BitSetUtils.bitSetFrom(cycle);
iii--; iii--;
} }
while (iii >= 0 && bases[iii] == (byte) 'A') { while (iii >= 0 && bases[iii] == (byte) 'A') {
cycles[iii] = cycle; cycles[iii] = BitSetUtils.bitSetFrom(cycle);
iii--; iii--;
} }
while (iii >= 0 && bases[iii] == (byte) 'C') { while (iii >= 0 && bases[iii] == (byte) 'C') {
cycles[iii] = cycle; cycles[iii] = BitSetUtils.bitSetFrom(cycle);
iii--; iii--;
} }
while (iii >= 0 && bases[iii] == (byte) 'G') { while (iii >= 0 && bases[iii] == (byte) 'G') {
cycles[iii] = cycle; cycles[iii] = BitSetUtils.bitSetFrom(cycle);
iii--; iii--;
} }
if (iii >= 0) { if (iii >= 0) {
@ -181,7 +183,7 @@ public class CycleCovariate implements StandardCovariate {
cycle++; cycle++;
} }
if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) { if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) {
cycles[iii] = cycle; cycles[iii] = BitSetUtils.bitSetFrom(cycle);
iii--; iii--;
} }
} }
@ -201,4 +203,14 @@ public class CycleCovariate implements StandardCovariate {
public final Object getValue(final String str) { public final Object getValue(final String str) {
return Integer.parseInt(str); return Integer.parseInt(str);
} }
@Override
public String keyFromBitSet(BitSet key) {
return String.format("%d", BitSetUtils.shortFrom(key));
}
@Override
public int numberOfBits() {
return BitSetUtils.numberOfBitsToRepresent(2 * Short.MAX_VALUE); // positive and negative
}
} }

View File

@ -1,7 +1,10 @@
package org.broadinstitute.sting.gatk.walkers.bqsr; package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BitSetUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.BitSet;
/* /*
* Copyright (c) 2009 The Broad Institute * Copyright (c) 2009 The Broad Institute
* *
@ -37,6 +40,8 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
public class QualityScoreCovariate implements RequiredCovariate { public class QualityScoreCovariate implements RequiredCovariate {
private final int MAX_QUAL = 50;
// Initialize any member variables using the command-line arguments passed to the walkers // Initialize any member variables using the command-line arguments passed to the walkers
@Override @Override
public void initialize(final RecalibrationArgumentCollection RAC) { public void initialize(final RecalibrationArgumentCollection RAC) {
@ -46,18 +51,18 @@ public class QualityScoreCovariate implements RequiredCovariate {
public CovariateValues getValues(final GATKSAMRecord read) { public CovariateValues getValues(final GATKSAMRecord read) {
int readLength = read.getReadLength(); int readLength = read.getReadLength();
Integer [] mismatches = new Integer[readLength]; BitSet[] mismatches = new BitSet[readLength];
Integer [] insertions = new Integer[readLength]; BitSet[] insertions = new BitSet[readLength];
Integer [] deletions = new Integer[readLength]; BitSet[] deletions = new BitSet[readLength];
byte[] baseQualities = read.getBaseQualities(); byte[] baseQualities = read.getBaseQualities();
byte[] baseInsertionQualities = read.getBaseInsertionQualities(); byte[] baseInsertionQualities = read.getBaseInsertionQualities();
byte[] baseDeletionQualities = read.getBaseDeletionQualities(); byte[] baseDeletionQualities = read.getBaseDeletionQualities();
for (int i = 0; i < baseQualities.length; i++) { for (int i = 0; i < baseQualities.length; i++) {
mismatches[i] = (int) baseQualities[i]; mismatches[i] = BitSetUtils.bitSetFrom(baseQualities[i]);
insertions[i] = (int) baseInsertionQualities[i]; insertions[i] = BitSetUtils.bitSetFrom(baseInsertionQualities[i]);
deletions[i] = (int) baseDeletionQualities[i]; deletions[i] = BitSetUtils.bitSetFrom(baseDeletionQualities[i]);
} }
return new CovariateValues(mismatches, insertions, deletions); return new CovariateValues(mismatches, insertions, deletions);
@ -68,4 +73,14 @@ public class QualityScoreCovariate implements RequiredCovariate {
public final Object getValue(final String str) { public final Object getValue(final String str) {
return Integer.parseInt(str); return Integer.parseInt(str);
} }
@Override
public String keyFromBitSet(BitSet key) {
return String.format("%d", BitSetUtils.longFrom(key));
}
@Override
public int numberOfBits() {
return BitSetUtils.numberOfBitsToRepresent(MAX_QUAL);
}
} }

View File

@ -1,8 +1,10 @@
package org.broadinstitute.sting.gatk.walkers.bqsr; package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BitSetUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Arrays; import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap; import java.util.HashMap;
/* /*
@ -62,8 +64,9 @@ public class ReadGroupCovariate implements RequiredCovariate {
readGroupReverseLookupTable.put(nextId, readGroupId); readGroupReverseLookupTable.put(nextId, readGroupId);
nextId++; nextId++;
} }
Short [] readGroups = new Short[l]; BitSet rg = BitSetUtils.bitSetFrom(shortId); // All objects must output a BitSet, so we convert the "compressed" representation of the Read Group into a bitset
Arrays.fill(readGroups, shortId); BitSet[] readGroups = new BitSet[l];
Arrays.fill(readGroups, rg);
return new CovariateValues(readGroups, readGroups, readGroups); return new CovariateValues(readGroups, readGroups, readGroups);
} }
@ -73,9 +76,19 @@ public class ReadGroupCovariate implements RequiredCovariate {
return str; return str;
} }
@Override
public String keyFromBitSet(BitSet key) {
return decodeReadGroup((short) BitSetUtils.longFrom(key));
}
public final String decodeReadGroup(final short id) { public final String decodeReadGroup(final short id) {
return readGroupReverseLookupTable.get(id); return readGroupReverseLookupTable.get(id);
} }
@Override
public int numberOfBits() {
return BitSetUtils.numberOfBitsToRepresent(Short.MAX_VALUE);
}
} }

View File

@ -67,9 +67,35 @@ public class RecalDataManager {
private static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\ private static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\
public enum BaseRecalibrationType { public enum BaseRecalibrationType {
BASE_SUBSTITUTION, BASE_SUBSTITUTION(0, "M"),
BASE_INSERTION, BASE_INSERTION(1, "I"),
BASE_DELETION BASE_DELETION(2, "D");
public int index;
public String representation;
private BaseRecalibrationType(int index, String representation) {
this.index = index;
this.representation = representation;
}
public static BaseRecalibrationType eventFrom(int index) {
switch (index) {
case 0:
return BASE_SUBSTITUTION;
case 1:
return BASE_INSERTION;
case 2:
return BASE_DELETION;
default:
throw new ReviewedStingException(String.format("Event %d does not exist.", index));
}
}
@Override
public String toString() {
return representation;
}
} }
public enum SOLID_RECAL_MODE { public enum SOLID_RECAL_MODE {
@ -136,7 +162,7 @@ public class RecalDataManager {
} }
} }
public static CovariateKeySet getAllCovariateValuesFor(GATKSAMRecord read) { public static CovariateKeySet covariateKeySetFrom(GATKSAMRecord read) {
return (CovariateKeySet) read.getTemporaryAttribute(COVARS_ATTRIBUTE); return (CovariateKeySet) read.getTemporaryAttribute(COVARS_ATTRIBUTE);
} }
@ -551,6 +577,7 @@ public class RecalDataManager {
/** /**
* Given the base and the color calculate the next base in the sequence * Given the base and the color calculate the next base in the sequence
* *
* @param read the read
* @param prevBase The base * @param prevBase The base
* @param color The color * @param color The color
* @return The next base in the sequence * @return The next base in the sequence
@ -615,11 +642,12 @@ public class RecalDataManager {
* Computes all requested covariates for every offset in the given read * Computes all requested covariates for every offset in the given read
* by calling covariate.getValues(..). * by calling covariate.getValues(..).
* *
* @param read The read for which to compute covariate values. * It populates an array of covariate values where result[i][j] is the covariate
* @param requestedCovariates The list of requested covariates.
* @return An array of covariate values where result[i][j] is the covariate
* value for the ith position in the read and the jth covariate in * value for the ith position in the read and the jth covariate in
* reqeustedCovariates list. * reqeustedCovariates list.
*
* @param read The read for which to compute covariate values.
* @param requestedCovariates The list of requested covariates.
*/ */
public static void computeCovariates(final GATKSAMRecord read, final List<Covariate> requestedCovariates) { public static void computeCovariates(final GATKSAMRecord read, final List<Covariate> requestedCovariates) {
final int numRequestedCovariates = requestedCovariates.size(); final int numRequestedCovariates = requestedCovariates.size();

View File

@ -108,7 +108,8 @@ public class RecalDatumOptimized {
return empiricalQualByte(0); // 'default' behavior is to use smoothing value of zero return empiricalQualByte(0); // 'default' behavior is to use smoothing value of zero
} }
public final String outputToCSV() { @Override
public final String toString() {
return String.format("%d,%d,%d", numObservations, numMismatches, (int) empiricalQualByte()); return String.format("%d,%d,%d", numObservations, numMismatches, (int) empiricalQualByte());
} }

View File

@ -30,7 +30,6 @@ import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.walkers.recalibration.CountCovariatesGatherer; import org.broadinstitute.sting.gatk.walkers.recalibration.CountCovariatesGatherer;
import java.io.PrintStream; import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
@ -92,16 +91,6 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.") @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
protected boolean RUN_WITHOUT_DBSNP = false; protected boolean RUN_WITHOUT_DBSNP = false;
/////////////////////////////
// protected Member Variables
/////////////////////////////
protected final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap used to create collapsed data hashmaps (delta delta tables)
protected final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>();// A list to hold the covariate objects that were requested
protected final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped.
protected final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed.
/** /**
* CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the * CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
* reads which have had the reference inserted because of color space inconsistencies. * reads which have had the reference inserted because of color space inconsistencies.
@ -153,7 +142,6 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false) @Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false)
public byte DELETIONS_DEFAULT_QUALITY = 45; public byte DELETIONS_DEFAULT_QUALITY = 45;
@Hidden @Hidden
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
public String DEFAULT_PLATFORM = null; public String DEFAULT_PLATFORM = null;
@ -161,5 +149,4 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
public String FORCE_PLATFORM = null; public String FORCE_PLATFORM = null;
} }

View File

@ -0,0 +1,284 @@
package org.broadinstitute.sting.utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.ByteArrayOutputStream;
import java.io.ObjectOutputStream;
import java.util.BitSet;
/**
* Utilities for bitset conversion
*
* @author Mauricio Carneiro
* @since 3/5/12
*/
public class BitSetUtils {
static final private int MAX_DNA_CONTEXT = 31; // the maximum context size (number of bases) permitted in the "long bitset" implementation of the DNA <=> BitSet conversion.
static final private byte NBITS_LONG_REPRESENTATION = 64; // the number of bits used in the long version to represent the bit set (necessary for the two's complement representation of negative numbers)
static final private byte NBITS_SHORT_REPRESENTATION = 16; // the number of bits used in the short version to represent the bit set (necessary for the two's complement representation of negative numbers)
static final long[] combinationsPerLength = new long[MAX_DNA_CONTEXT + 1]; // keeps the memoized table with the number of combinations for each given DNA context length
/**
* Creates an long out of a bitset
*
* @param bitSet the bitset
* @return a long from the bitset representation
*/
public static long longFrom(final BitSet bitSet) {
return longFrom(bitSet, NBITS_LONG_REPRESENTATION);
}
/**
* Creates a short integer from a bitset
*
* @param bitSet the bitset
* @return a short from the bitset representation
*/
public static short shortFrom(final BitSet bitSet) {
return (short) longFrom(bitSet, NBITS_SHORT_REPRESENTATION);
}
/**
* Cretes an integer with any number of bits (up to 64 -- long precision) from a bitset
*
* @param bitSet the bitset
* @param nBits the number of bits to be used for this representation
* @return an integer with nBits from the bitset representation
*/
public static long longFrom(final BitSet bitSet, final int nBits) {
long number = 0;
for (int bitIndex = bitSet.nextSetBit(0); bitIndex >= 0 && bitIndex <= nBits; bitIndex = bitSet.nextSetBit(bitIndex + 1))
number |= 1L << bitIndex;
return number;
}
/**
* Creates a BitSet representation of a given long
*
* @param number the number to turn into a bitset
* @return a bitset representation of the long
*/
public static BitSet bitSetFrom(long number) {
return bitSetFrom(number, NBITS_LONG_REPRESENTATION);
}
/**
* Creates a BitSet representation of a given short
*
* @param number the number to turn into a bitset
* @return a bitset representation of the short
*/
public static BitSet bitSetFrom(short number) {
return bitSetFrom(number, NBITS_SHORT_REPRESENTATION);
}
/**
* Creates a BitSet representation of an arbitrary integer (number of bits capped at 64 -- long precision)
*
* @param number the number to turn into a bitset
* @param nBits the number of bits to use as precision for this conversion
* @return a bitset representation of the integer
*/
public static BitSet bitSetFrom(long number, int nBits) {
BitSet bitSet = new BitSet();
boolean isNegative = number < 0;
int bitIndex = 0;
while (number != 0) {
if (number % 2 != 0)
bitSet.set(bitIndex);
bitIndex++;
number /= 2;
}
if (isNegative) {
boolean foundFirstSetBit = false;
for (int i = bitSet.nextSetBit(0); i < nBits && i >= 0; i++) {
boolean bit = bitSet.get(i);
if (!foundFirstSetBit && bit)
foundFirstSetBit = true; // maintain all bits until the first 1 is found (inclusive)
else if (foundFirstSetBit)
bitSet.flip(i); // flip every other bit up to NBITS_REPRESENTATION
}
}
return bitSet;
}
/**
* Converts a BitSet into the dna string representation.
*
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
* be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create
* a bitSetFrom(BigNumber) method.
*
* We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the
* base_10 representation of the sequence. This is important for us to know how to bring the number
* to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented
* as 0's and leading 0's are omitted).
*
* quasi-canonical because A is represented by a 0, therefore,
* instead of : 0, 1, 2, 3, 10, 11, 12, ...
* we have : 0, 1, 2, 3, 00, 01, 02, ...
*
* but we can correctly decode it because we know the final length.
*
* @param bitSet the bitset representation of the dna sequence
* @return the dna sequence represented by the bitset
*/
public static String dnaFrom(final BitSet bitSet) {
long number = longFrom(bitSet); // the base_10 representation of the bit set
if (number < 0)
throw new ReviewedStingException("dna conversion cannot handle negative numbers. Possible overflow?");
int length = contextLengthFor(number); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls)
number -= combinationsFor(length - 1); // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation
String dna = "";
while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical)
byte base = (byte) (number % 4);
switch (base) {
case 0:
dna = "A" + dna;
break;
case 1:
dna = "C" + dna;
break;
case 2:
dna = "G" + dna;
break;
case 3:
dna = "T" + dna;
break;
}
number /= 4;
}
for (int j = dna.length(); j < length; j++)
dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above)
return dna;
}
/**
* Creates a BitSet representation of a given dna string.
*
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
* be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create
* a bitSetFrom(BigNumber) method.
*
* The bit representation of a dna string is the simple:
* 0 A 4 AA 8 CA
* 1 C 5 AC ...
* 2 G 6 AG 1343 TTGGT
* 3 T 7 AT 1364 TTTTT
*
* To convert from dna to number, we convert the dna string to base10 and add all combinations that
* preceded the string (with smaller lengths).
*
* @param dna the dna sequence
* @return the bitset representing the dna sequence
*/
public static BitSet bitSetFrom(String dna) {
if (dna.length() > MAX_DNA_CONTEXT)
throw new ReviewedStingException(String.format("DNA Length cannot be bigger than %d. dna: %s (%d)", MAX_DNA_CONTEXT, dna, dna.length()));
long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set
long preContext = combinationsFor(dna.length() - 1); // the sum of all combinations that preceded the length of the dna string
for (int i = 0; i < dna.length(); i++) {
baseTen *= 4;
switch (dna.charAt(i)) {
case 'A':
baseTen += 0;
break;
case 'C':
baseTen += 1;
break;
case 'G':
baseTen += 2;
break;
case 'T':
baseTen += 3;
break;
}
}
return bitSetFrom(baseTen + preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length.
}
/**
* Calculates the number of bits necessary to represent a given number of elements
*
* @param numberOfElements the number of elements to represent (must be positive)
* @return the number of bits necessary to represent this many elements
*/
public static int numberOfBitsToRepresent(long numberOfElements) {
if (numberOfElements < 0)
throw new ReviewedStingException("Number of elements must be positive: " + numberOfElements);
if (numberOfElements == 1L)
return 1; // special case
int n = 0;
numberOfElements--;
while (numberOfElements > 0) {
numberOfElements = numberOfElements >> 1;
n++;
}
return n;
}
/**
* Calculates the length of the DNA context for a given base 10 number
*
* It is important to know the length given the base 10 number to calculate the number of combinations
* and to disambiguate the "quasi-canonical" state.
*
* This method also calculates the number of combinations as a by-product, but since it memoizes the
* results, a subsequent call to combinationsFor(length) is O(1).
*
* @param number the base 10 representation of the bitset
* @return the length of the DNA context represented by this number
*/
private static int contextLengthFor(long number) {
int length = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet.
long combinations = combinationsFor(length); // the next context (we advance it so we know which one was preceding it).
while (combinations <= number) { // find the length of the dna string (length)
length++;
combinations = combinationsFor(length); // calculate the next context
}
return length;
}
/**
* The sum of all combinations of a context of a given length from length = 0 to length.
*
* Memoized implementation of sum(4^i) , where i=[0,length]
*
* @param length the length of the DNA context
* @return the sum of all combinations leading up to this context length.
*/
private static long combinationsFor(int length) {
if (length > MAX_DNA_CONTEXT)
throw new ReviewedStingException(String.format("Context cannot be longer than %d bases but requested %d.", MAX_DNA_CONTEXT, length));
// only calculate the number of combinations if the table hasn't already cached the value
if (length > 0 && combinationsPerLength[length] == 0) {
long combinations = 0L;
for (int i = 1; i <= length; i++)
combinations += (1L << 2 * i); // add all combinations with 4^i ( 4^i is the same as 2^(2*i) )
combinationsPerLength[length] = combinations;
}
return combinationsPerLength[length];
}
public static byte[] sizeOf(Object obj) throws java.io.IOException
{
ByteArrayOutputStream byteObject = new ByteArrayOutputStream();
ObjectOutputStream objectOutputStream = new ObjectOutputStream(byteObject);
objectOutputStream.writeObject(obj);
objectOutputStream.flush();
objectOutputStream.close();
byteObject.close();
return byteObject.toByteArray();
}
}

View File

@ -29,7 +29,6 @@ import com.google.java.contract.Ensures;
import com.google.java.contract.Requires; import com.google.java.contract.Requires;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import java.math.BigDecimal; import java.math.BigDecimal;
@ -1527,124 +1526,4 @@ public class MathUtils {
} }
/**
* Creates an integer out of a bitset
*
* @param bitSet the bitset
* @return an integer with the bitset representation
*/
public static long intFrom(final BitSet bitSet) {
long number = 0;
for (int bitIndex = bitSet.nextSetBit(0); bitIndex >= 0; bitIndex = bitSet.nextSetBit(bitIndex+1))
number |= 1L << bitIndex;
return number;
}
/**
* Creates a BitSet representation of a given integer
*
* @param number the number to turn into a bitset
* @return a bitset representation of the integer
*/
public static BitSet bitSetFrom(long number) {
BitSet bitSet = new BitSet();
int bitIndex = 0;
while (number > 0) {
if (number%2 > 0)
bitSet.set(bitIndex);
bitIndex++;
number /= 2;
}
return bitSet;
}
/**
* Converts a BitSet into the dna string representation.
*
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
* be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create
* a bitSetFrom(BigNumber) method.
*
* We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the
* base_10 representation of the sequence. This is important for us to know how to bring the number
* to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented
* as 0's and leading 0's are omitted).
*
* quasi-canonical because A is represented by a 0, therefore,
* instead of : 0, 1, 2, 3, 10, 11, 12, ...
* we have : 0, 1, 2, 3, 00, 01, 02, ...
*
* but we can correctly decode it because we know the final length.
*
* @param bitSet the bitset representation of the dna sequence
* @return the dna sequence represented by the bitset
*/
public static String dnaFrom(final BitSet bitSet) {
long number = intFrom(bitSet); // the base_10 representation of the bit set
long preContext = 0; // the number of combinations skipped to get to the quasi-canonical representation (we keep it to subtract later)
long nextContext = 4; // the next context (we advance it so we know which one was preceding it).
int i = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet.
while (nextContext <= number) { // find the length of the dna string (i)
preContext = nextContext; // keep track of the number of combinations in the preceding context
nextContext += Math.pow(4, ++i);// calculate the next context
}
number -= preContext; // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation
String dna = "";
while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical)
byte base = (byte) (number % 4);
switch (base) {
case 0 : dna = "A" + dna; break;
case 1 : dna = "C" + dna; break;
case 2 : dna = "G" + dna; break;
case 3 : dna = "T" + dna; break;
}
number /= 4;
}
for (int j = dna.length(); j < i; j++)
dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above)
return dna;
}
/**
* Creates a BitSet representation of a given dna string.
*
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
* be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create
* a bitSetFrom(BigNumber) method.
*
* The bit representation of a dna string is the simple:
* 0 A 4 AA 8 CA
* 1 C 5 AC ...
* 2 G 6 AG 1343 TTGGT
* 3 T 7 AT 1364 TTTTT
*
* To convert from dna to number, we convert the dna string to base10 and add all combinations that
* preceded the string (with smaller lengths).
*
* @param dna the dna sequence
* @return the bitset representing the dna sequence
*/
public static BitSet bitSetFrom(String dna) {
if (dna.length() > 31)
throw new ReviewedStingException(String.format("DNA Length cannot be bigger than 31. dna: %s (%d)", dna, dna.length()));
long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set
long preContext = 0; // the sum of all combinations that preceded the length of the dna string
for (int i=0; i<dna.length(); i++) {
baseTen *= 4;
switch(dna.charAt(i)) {
case 'A': baseTen += 0; break;
case 'C': baseTen += 1; break;
case 'G': baseTen += 2; break;
case 'T': baseTen += 3; break;
}
if (i>0)
preContext += Math.pow(4, i); // each length will have 4^i combinations (e.g 1 = 4, 2 = 16, 3 = 64, ...)
}
return bitSetFrom(baseTen+preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length.
}
} }

View File

@ -72,14 +72,16 @@ public class BaseRecalibration {
lineNumber++; lineNumber++;
if (EOF_MARKER.equals(line)) { if (EOF_MARKER.equals(line)) {
sawEOF = true; sawEOF = true;
} else if( COMMENT_PATTERN.matcher(line).matches() ) { }
else if (COMMENT_PATTERN.matcher(line).matches()) {
; // Skip over the comment lines, (which start with '#') ; // Skip over the comment lines, (which start with '#')
} }
// Read in the covariates that were used from the input file // Read in the covariates that were used from the input file
else if (COVARIATE_PATTERN.matcher(line).matches()) { // The line string is either specifying a covariate or is giving csv data else if (COVARIATE_PATTERN.matcher(line).matches()) { // The line string is either specifying a covariate or is giving csv data
if (foundAllCovariates) { if (foundAllCovariates) {
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE); throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE);
} else { // Found the covariate list in input file, loop through all of them and instantiate them }
else { // Found the covariate list in input file, loop through all of them and instantiate them
String[] vals = line.split(","); String[] vals = line.split(",");
for (int iii = 0; iii < vals.length - 4; iii++) { // There are n-4 covariates. The last four items are ErrorModel, nObservations, nMismatch, and Qempirical for (int iii = 0; iii < vals.length - 4; iii++) { // There are n-4 covariates. The last four items are ErrorModel, nObservations, nMismatch, and Qempirical
boolean foundClass = false; boolean foundClass = false;
@ -102,7 +104,8 @@ public class BaseRecalibration {
} }
} }
} else { // Found a line of data }
else { // Found a line of data
if (!foundAllCovariates) { if (!foundAllCovariates) {
foundAllCovariates = true; foundAllCovariates = true;
@ -146,6 +149,7 @@ public class BaseRecalibration {
/** /**
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches) * For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
*
* @param line A line of CSV data read from the recalibration table data file * @param line A line of CSV data read from the recalibration table data file
*/ */
private void addCSVData(final File file, final String line) { private void addCSVData(final File file, final String line) {
@ -165,7 +169,7 @@ public class BaseRecalibration {
key[iii] = cov.getValue(vals[iii]); key[iii] = cov.getValue(vals[iii]);
} }
final String modelString = vals[iii++]; final String modelString = vals[iii++];
final RecalDataManager.BaseRecalibrationType errorModel = CovariateKeySet.getErrorModelFromString(modelString); final RecalDataManager.BaseRecalibrationType errorModel = CovariateKeySet.errorModelFrom(modelString);
// Create a new datum using the number of observations, number of mismatches, and reported quality score // Create a new datum using the number of observations, number of mismatches, and reported quality score
final RecalDatum datum = new RecalDatum(Long.parseLong(vals[iii]), Long.parseLong(vals[iii + 1]), Double.parseDouble(vals[1]), 0.0); final RecalDatum datum = new RecalDatum(Long.parseLong(vals[iii]), Long.parseLong(vals[iii + 1]), Double.parseDouble(vals[1]), 0.0);
@ -178,7 +182,7 @@ public class BaseRecalibration {
//compute all covariate values for this read //compute all covariate values for this read
RecalDataManager.computeCovariates(read, requestedCovariates); RecalDataManager.computeCovariates(read, requestedCovariates);
final CovariateKeySet covariateKeySet = RecalDataManager.getAllCovariateValuesFor( read ); final CovariateKeySet covariateKeySet = RecalDataManager.covariateKeySetFrom(read);
for (final RecalDataManager.BaseRecalibrationType errorModel : RecalDataManager.BaseRecalibrationType.values()) { for (final RecalDataManager.BaseRecalibrationType errorModel : RecalDataManager.BaseRecalibrationType.values()) {
final byte[] originalQuals = read.getBaseQualities(errorModel); final byte[] originalQuals = read.getBaseQualities(errorModel);
@ -217,6 +221,7 @@ public class BaseRecalibration {
* - The final shift equation is: * - The final shift equation is:
* *
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... ) * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
*
* @param key The list of Comparables that were calculated from the covariates * @param key The list of Comparables that were calculated from the covariates
* @return A recalibrated quality score as a byte * @return A recalibrated quality score as a byte
*/ */
@ -267,6 +272,7 @@ public class BaseRecalibration {
/** /**
* Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold * Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
*
* @param originalQuals The list of original base quality scores * @param originalQuals The list of original base quality scores
* @param recalQuals A list of the new recalibrated quality scores * @param recalQuals A list of the new recalibrated quality scores
*/ */

View File

@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.sam;
import com.google.java.contract.Ensures; import com.google.java.contract.Ensures;
import com.google.java.contract.Requires; import com.google.java.contract.Requires;
import net.sf.samtools.*; import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.collections.Pair;
@ -622,7 +623,6 @@ public class ReadUtils {
HashMap<Integer, HashSet<GATKSAMRecord>> locusToReadMap = new HashMap<Integer, HashSet<GATKSAMRecord>>(2*(stopLocation - startLocation + 1), 0.5f); HashMap<Integer, HashSet<GATKSAMRecord>> locusToReadMap = new HashMap<Integer, HashSet<GATKSAMRecord>>(2*(stopLocation - startLocation + 1), 0.5f);
HashMap<GATKSAMRecord, Boolean[]> readToLocusMap = new HashMap<GATKSAMRecord, Boolean[]>(2*readList.size(), 0.5f); HashMap<GATKSAMRecord, Boolean[]> readToLocusMap = new HashMap<GATKSAMRecord, Boolean[]>(2*readList.size(), 0.5f);
for (int i = startLocation; i <= stopLocation; i++) for (int i = startLocation; i <= stopLocation; i++)
locusToReadMap.put(i, new HashSet<GATKSAMRecord>()); // Initialize the locusToRead map with empty lists locusToReadMap.put(i, new HashSet<GATKSAMRecord>()); // Initialize the locusToRead map with empty lists
@ -649,6 +649,55 @@ public class ReadUtils {
return new Pair<HashMap<Integer, HashSet<GATKSAMRecord>>, HashMap<GATKSAMRecord, Boolean[]>>(locusToReadMap, readToLocusMap); return new Pair<HashMap<Integer, HashSet<GATKSAMRecord>>, HashMap<GATKSAMRecord, Boolean[]>>(locusToReadMap, readToLocusMap);
} }
/**
* Create random read qualities
*
* @param length the length of the read
* @return an array with randomized base qualities between 0 and 50
*/
public static byte[] createRandomReadQuals(int length) {
Random random = GenomeAnalysisEngine.getRandomGenerator();
byte[] quals = new byte[length];
for (int i = 0; i < length; i++)
quals[i] = (byte) random.nextInt(50);
return quals;
}
/**
* Create random read qualities
*
* @param length the length of the read
* @param allowNs whether or not to allow N's in the read
* @return an array with randomized bases (A-N) with equal probability
*/
public static byte[] createRandomReadBases(int length, boolean allowNs) {
Random random = GenomeAnalysisEngine.getRandomGenerator();
int numberOfBases = allowNs ? 5 : 4;
byte[] bases = new byte[length];
for (int i = 0; i < length; i++) {
switch (random.nextInt(numberOfBases)) {
case 0:
bases[i] = 'A';
break;
case 1:
bases[i] = 'C';
break;
case 2:
bases[i] = 'G';
break;
case 3:
bases[i] = 'T';
break;
case 4:
bases[i] = 'N';
break;
default:
throw new ReviewedStingException("Something went wrong, this is just impossible");
}
}
return bases;
}
public static String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) { public static String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) {
String[] sequenceRecordNames = new String[sequenceDictionary.size()]; String[] sequenceRecordNames = new String[sequenceDictionary.size()];
int sequenceRecordIndex = 0; int sequenceRecordIndex = 0;
@ -656,4 +705,5 @@ public class ReadUtils {
sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName(); sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName();
return Arrays.deepToString(sequenceRecordNames); return Arrays.deepToString(sequenceRecordNames);
} }
} }

View File

@ -1,9 +1,9 @@
package org.broadinstitute.sting.gatk.walkers.bqsr; package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.testng.Assert; import org.testng.Assert;
import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test; import org.testng.annotations.Test;
@ -12,30 +12,6 @@ import java.util.BitSet;
import java.util.Random; import java.util.Random;
/** /**
* Short one line description of the walker.
*
* <p>
* [Long description of the walker]
* </p>
*
*
* <h2>Input</h2>
* <p>
* [Description of the Input]
* </p>
*
* <h2>Output</h2>
* <p>
* [Description of the Output]
* </p>
*
* <h2>Examples</h2>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* -T [walker name]
* </pre>
*
* @author Mauricio Carneiro * @author Mauricio Carneiro
* @since 3/1/12 * @since 3/1/12
*/ */
@ -55,22 +31,27 @@ public class ContextCovariateUnitTest {
@Test(enabled = true) @Test(enabled = true)
public void testSimpleContexts() { public void testSimpleContexts() {
byte [] quals = createRandomReadQuals(101); byte[] quals = ReadUtils.createRandomReadQuals(10000);
byte [] bbases = createRandomReadBases(101); byte[] bbases = ReadUtils.createRandomReadBases(10000, true);
String bases = stringFrom(bbases); String bases = stringFrom(bbases);
// System.out.println("Read: " + bases);
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M"); GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M");
CovariateValues values = covariate.getValues(read); CovariateValues values = covariate.getValues(read);
verifyCovariateArray((BitSet []) values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, bases); verifyCovariateArray(values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, bases);
verifyCovariateArray((BitSet []) values.getInsertions(), RAC.INSERTIONS_CONTEXT_SIZE, bases); verifyCovariateArray(values.getInsertions(), RAC.INSERTIONS_CONTEXT_SIZE, bases);
verifyCovariateArray((BitSet []) values.getDeletions(), RAC.DELETIONS_CONTEXT_SIZE, bases); verifyCovariateArray(values.getDeletions(), RAC.DELETIONS_CONTEXT_SIZE, bases);
} }
private void verifyCovariateArray(BitSet[] values, int contextSize, String bases) { private void verifyCovariateArray(BitSet[] values, int contextSize, String bases) {
for (int i = 0; i < values.length; i++) { for (int i = 0; i < values.length; i++) {
if (i >= contextSize) String expectedContext = covariate.NO_CONTEXT_VALUE;
Assert.assertEquals(MathUtils.dnaFrom(values[i]), bases.substring(i-contextSize, i)); if (i >= contextSize) {
else String context = bases.substring(i - contextSize, i);
Assert.assertNull(values[i]); if (!context.contains("N"))
expectedContext = context;
}
// System.out.println(String.format("Context [%d]:\n%s\n%s\n", i, covariate.keyFromBitSet(values[i]), expectedContext));
Assert.assertEquals(covariate.keyFromBitSet(values[i]), expectedContext);
} }
} }
@ -81,23 +62,4 @@ public class ContextCovariateUnitTest {
return s; return s;
} }
private byte [] createRandomReadQuals(int length) {
byte [] quals = new byte[length];
for (int i=0; i<length; i++)
quals[i] = (byte) random.nextInt(50);
return quals;
}
private byte [] createRandomReadBases(int length) {
byte [] bases = new byte[length];
for (int i=0; i<length; i++) {
switch(random.nextInt(4)) {
case 0: bases[i] = 'A'; break;
case 1: bases[i] = 'C'; break;
case 2: bases[i] = 'G'; break;
case 3: bases[i] = 'T'; break;
}
}
return bases;
}
} }

View File

@ -0,0 +1,68 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.util.BitSet;
import java.util.Random;
/**
* @author Mauricio Carneiro
* @since 3/1/12
*/
public class CycleCovariateUnitTest {
CycleCovariate covariate;
RecalibrationArgumentCollection RAC;
Random random;
@BeforeClass
public void init() {
RAC = new RecalibrationArgumentCollection();
covariate = new CycleCovariate();
random = GenomeAnalysisEngine.getRandomGenerator();
covariate.initialize(RAC);
}
@Test(enabled = true)
public void testSimpleCycles() {
short readLength = 10;
byte[] quals = ReadUtils.createRandomReadQuals(readLength);
byte[] bbases = ReadUtils.createRandomReadBases(readLength, true);
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M");
read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID"));
read.getReadGroup().setPlatform("illumina");
CovariateValues values = covariate.getValues(read);
verifyCovariateArray(values.getMismatches(), (short) 1, (short) 1);
read.setReadNegativeStrandFlag(true);
values = covariate.getValues(read);
verifyCovariateArray(values.getMismatches(), readLength, (short) -1);
read.setReadPairedFlag(true);
read.setSecondOfPairFlag(true);
values = covariate.getValues(read);
verifyCovariateArray(values.getMismatches(), (short) -readLength, (short) 1);
read.setReadNegativeStrandFlag(false);
values = covariate.getValues(read);
verifyCovariateArray(values.getMismatches(), (short) -1, (short) -1);
}
private void verifyCovariateArray(BitSet[] values, short init, short increment) {
for (short i = 0; i < values.length; i++) {
short actual = Short.decode(covariate.keyFromBitSet(values[i]));
int expected = init + (increment * i);
// System.out.println(String.format("%d: %d, %d", i, actual, expected));
Assert.assertEquals(actual, expected);
}
}
}

View File

@ -0,0 +1,75 @@
package org.broadinstitute.sting.utils;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.util.Random;
/**
* @author Mauricio Carneiro
* @since 3/5/12
*/
public class BitSetUtilsUnitTest {
private static int RANDOM_NUMBERS_TO_TRY = 87380;
private static Random random;
@BeforeClass
public void init() {
random = GenomeAnalysisEngine.getRandomGenerator();
}
@Test(enabled = true)
public void testLongBitSet() {
long[] numbers = {0L, 1L, 428L, 65536L, 239847L, 4611686018427387903L, Long.MAX_VALUE, Long.MIN_VALUE, -1L, -2L, -7L, -128L, -65536L, -100000L};
for (long n : numbers)
Assert.assertEquals(BitSetUtils.longFrom(BitSetUtils.bitSetFrom(n)), n);
for (int i = 0; i < RANDOM_NUMBERS_TO_TRY; i++) {
long n = random.nextLong();
Assert.assertEquals(BitSetUtils.longFrom(BitSetUtils.bitSetFrom(n)), n); // Because class Random uses a seed with only 48 bits, this algorithm will not return all possible long values.
}
}
@Test(enabled = true)
public void testShortBitSet() {
short[] numbers = {0, 1, 428, 25934, 23847, 16168, Short.MAX_VALUE, Short.MIN_VALUE, -1, -2, -7, -128, -12312, -31432};
for (long n : numbers)
Assert.assertEquals(BitSetUtils.shortFrom(BitSetUtils.bitSetFrom(n)), n);
for (int i = 0; i < RANDOM_NUMBERS_TO_TRY; i++) {
short n = (short) random.nextInt();
Assert.assertEquals(BitSetUtils.shortFrom(BitSetUtils.bitSetFrom(n)), n);
}
}
@Test(enabled = true)
public void testDNAAndBitSetConversion() {
String[] dna = {"AGGTGTTGT", "CCCCCCCCCCCCCC", "GGGGGGGGGGGGGG", "TTTTTTTTTTTTTT", "GTAGACCGATCTCAGCTAGT", "AACGTCAATGCAGTCAAGTCAGACGTGGGTT", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTT", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"};
// Test all contexts of size 1-8.
for (long n = 0; n < RANDOM_NUMBERS_TO_TRY; n++)
Assert.assertEquals(BitSetUtils.longFrom(BitSetUtils.bitSetFrom(BitSetUtils.dnaFrom(BitSetUtils.bitSetFrom(n)))), n);
// Test the special cases listed in the dna array
for (String d : dna)
Assert.assertEquals(BitSetUtils.dnaFrom(BitSetUtils.bitSetFrom(d)), d);
}
@Test(enabled = true)
public void testNumberOfBitsToRepresent() {
Assert.assertEquals(BitSetUtils.numberOfBitsToRepresent(0), 0); // Make sure 0 elements need 0 bits to be represented
Assert.assertEquals(BitSetUtils.numberOfBitsToRepresent(1), 1); // Make sure 1 element needs 1 bit to be represented
Assert.assertEquals(BitSetUtils.numberOfBitsToRepresent(3), 2); // Make sure 3 elements need 2 bit to be represented
for (int i = 1; i < 63; i++) { // Can't test i == 63 because n1 is a negative number
long n1 = 1L << i;
long n2 = Math.abs(random.nextLong()) % n1;
long n3 = n1 | n2;
Assert.assertEquals(BitSetUtils.numberOfBitsToRepresent(n3), (n3 == n1) ? i : i + 1);
Assert.assertEquals(BitSetUtils.numberOfBitsToRepresent(n1), i);
}
}
}

View File

@ -25,7 +25,6 @@
package org.broadinstitute.sting.utils; package org.broadinstitute.sting.utils;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.testng.Assert; import org.testng.Assert;
import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeClass;
@ -131,7 +130,8 @@ public class MathUtilsUnitTest extends BaseTest {
int[] numbers = {1, 2, 4, 5, 3, 128, 25678, -24}; int[] numbers = {1, 2, 4, 5, 3, 128, 25678, -24};
MathUtils.RunningAverage r = new MathUtils.RunningAverage(); MathUtils.RunningAverage r = new MathUtils.RunningAverage();
for (int i = 0; i < numbers.length; i++) r.add((double) numbers[i]); for (int i = 0; i < numbers.length; i++)
r.add((double) numbers[i]);
Assert.assertEquals((long) numbers.length, r.observationCount()); Assert.assertEquals((long) numbers.length, r.observationCount());
Assert.assertTrue(r.mean() - 3224.625 < 2e-10); Assert.assertTrue(r.mean() - 3224.625 < 2e-10);
@ -223,35 +223,6 @@ public class MathUtilsUnitTest extends BaseTest {
return set.isEmpty(); return set.isEmpty();
} }
@Test(enabled = true)
public void testIntAndBitSetConversion() {
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(428)), 428);
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(239847)), 239847);
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(12726)), 12726);
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(0)), 0);
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(1)), 1);
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(65536)), 65536);
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(Long.MAX_VALUE)), Long.MAX_VALUE);
}
@Test(enabled = true)
public void testDNAAndBitSetConversion() {
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("ACGT")), "ACGT");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AGGTGTTGT")), "AGGTGTTGT");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("A")), "A");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("C")), "C");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("G")), "G");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("T")), "T");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("CC")), "CC");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AA")), "AA");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AAAA")), "AAAA");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("CCCCCCCCCCCCCC")), "CCCCCCCCCCCCCC");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("GGGGGGGGGGGGGG")), "GGGGGGGGGGGGGG");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("TTTTTTTTTTTTTT")), "TTTTTTTTTTTTTT");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("GTAGACCGATCTCAGCTAGT")), "GTAGACCGATCTCAGCTAGT");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AACGTCAATGCAGTCAAGTCAGACGTGGGTT")), "AACGTCAATGCAGTCAAGTCAGACGTGGGTT"); // testing max precision (length == 31)
}
@Test @Test
public void testApproximateLog10SumLog10() { public void testApproximateLog10SumLog10() {
Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3);