diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java index 89a30e4f5..a1ab73341 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java @@ -26,10 +26,12 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Arrays; +import java.util.BitSet; /** * Created by IntelliJ IDEA. @@ -43,10 +45,6 @@ public class ContextCovariate implements StandardCovariate { private int insertionsContextSize; private int deletionsContextSize; - private String mismatchesNoContext = ""; - private String insertionsNoContext = ""; - private String deletionsNoContext = ""; - // Initialize any member variables using the command-line arguments passed to the walkers @Override public void initialize(final RecalibrationArgumentCollection RAC) { @@ -57,29 +55,26 @@ public class ContextCovariate implements StandardCovariate { if (mismatchesContextSize <= 0 || insertionsContextSize <= 0 || deletionsContextSize <= 0) throw new UserException(String.format("Context Size must be positive, if you don't want to use the context covariate, just turn it off instead. Mismatches: %d Insertions: %d Deletions:%d", mismatchesContextSize, insertionsContextSize, deletionsContextSize)); - // initialize no context strings given the size of the context for each covariate type - mismatchesNoContext = makeAllNStringWithLength(mismatchesContextSize); - insertionsNoContext = makeAllNStringWithLength(insertionsContextSize); - deletionsNoContext = makeAllNStringWithLength( deletionsContextSize); } @Override public CovariateValues getValues(final GATKSAMRecord read) { int l = read.getReadLength(); - String[] mismatches = new String [l]; - String[] insertions = new String [l]; - String[] deletions = new String [l]; + BitSet[] mismatches = new BitSet[l]; + BitSet[] insertions = new BitSet[l]; + BitSet[] deletions = new BitSet[l]; final boolean negativeStrand = read.getReadNegativeStrandFlag(); byte[] bases = read.getReadBases(); - if (negativeStrand) { - bases = BaseUtils.simpleReverseComplement(bases); //this is NOT in-place - } + if (negativeStrand) + bases = BaseUtils.simpleReverseComplement(bases); + for (int i = 0; i < read.getReadLength(); i++) { - mismatches[i] = contextWith(bases, i, mismatchesContextSize, mismatchesNoContext); - insertions[i] = contextWith(bases, i, insertionsContextSize, insertionsNoContext); - deletions[i] = contextWith(bases, i, deletionsContextSize, deletionsNoContext); + mismatches[i] = contextWith(bases, i, mismatchesContextSize); + insertions[i] = contextWith(bases, i, insertionsContextSize); + deletions[i] = contextWith(bases, i, deletionsContextSize); } + if (negativeStrand) { reverse(mismatches); reverse(insertions); @@ -90,7 +85,7 @@ public class ContextCovariate implements StandardCovariate { // Used to get the covariate's value from input csv file during on-the-fly recalibration @Override - public final Comparable getValue(final String str) { + public final Object getValue(final String str) { return str; } @@ -100,29 +95,28 @@ public class ContextCovariate implements StandardCovariate { * @param bases the bases in the read to build the context from * @param offset the position in the read to calculate the context for * @param contextSize context size to use building the context - * @param noContextString string to return if the position is not far enough in the read to have a full context before. * @return */ - private String contextWith(byte [] bases, int offset, int contextSize, String noContextString) { - return (offset < contextSize) ? noContextString : new String(Arrays.copyOfRange(bases, offset - contextSize, offset)); + private BitSet contextWith(byte [] bases, int offset, int contextSize) { + if (offset < contextSize) + return null; + + String context = new String(Arrays.copyOfRange(bases, offset - contextSize, offset)); + if (context.contains("N")) + return null; + + return MathUtils.bitSetFrom(context); } - - private String makeAllNStringWithLength(int length) { - String s = ""; - for (int i=0; i= 0; bitIndex = bitSet.nextSetBit(bitIndex+1)) - integer |= 1 << bitIndex; + number |= 1L << bitIndex; - return integer; + return number; } /** * Creates a BitSet representation of a given integer * - * @param integer the number to turn into a bitset + * @param number the number to turn into a bitset * @return a bitset representation of the integer */ - public static BitSet bitSetFrom(int integer) { - BitSet bitSet = new BitSet((int) Math.ceil(Math.sqrt(integer))); + public static BitSet bitSetFrom(long number) { + BitSet bitSet = new BitSet(); int bitIndex = 0; - while (integer > 0) { - if (integer%2 > 0) + while (number > 0) { + if (number%2 > 0) bitSet.set(bitIndex); bitIndex++; - integer /= 2; + number /= 2; } return bitSet; } + /** + * Converts a BitSet into the dna string representation. + * + * Warning: This conversion is limited to long precision, therefore the dna sequence cannot + * be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create + * a bitSetFrom(BigNumber) method. + * + * We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the + * base_10 representation of the sequence. This is important for us to know how to bring the number + * to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented + * as 0's and leading 0's are omitted). + * + * quasi-canonical because A is represented by a 0, therefore, + * instead of : 0, 1, 2, 3, 10, 11, 12, ... + * we have : 0, 1, 2, 3, 00, 01, 02, ... + * + * but we can correctly decode it because we know the final length. + * + * @param bitSet the bitset representation of the dna sequence + * @return the dna sequence represented by the bitset + */ + public static String dnaFrom(final BitSet bitSet) { + long number = intFrom(bitSet); // the base_10 representation of the bit set + long preContext = 0; // the number of combinations skipped to get to the quasi-canonical representation (we keep it to subtract later) + long nextContext = 4; // the next context (we advance it so we know which one was preceding it). + int i = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet. + while (nextContext <= number) { // find the length of the dna string (i) + preContext = nextContext; // keep track of the number of combinations in the preceding context + nextContext += Math.pow(4, ++i);// calculate the next context + } + number -= preContext; // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation + + String dna = ""; + while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical) + byte base = (byte) (number % 4); + switch (base) { + case 0 : dna = "A" + dna; break; + case 1 : dna = "C" + dna; break; + case 2 : dna = "G" + dna; break; + case 3 : dna = "T" + dna; break; + } + number /= 4; + } + for (int j = dna.length(); j < i; j++) + dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above) + + return dna; + } + + /** + * Creates a BitSet representation of a given dna string. + * + * Warning: This conversion is limited to long precision, therefore the dna sequence cannot + * be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create + * a bitSetFrom(BigNumber) method. + * + * The bit representation of a dna string is the simple: + * 0 A 4 AA 8 CA + * 1 C 5 AC ... + * 2 G 6 AG 1343 TTGGT + * 3 T 7 AT 1364 TTTTT + * + * To convert from dna to number, we convert the dna string to base10 and add all combinations that + * preceded the string (with smaller lengths). + * + * @param dna the dna sequence + * @return the bitset representing the dna sequence + */ + public static BitSet bitSetFrom(String dna) { + if (dna.length() > 31) + throw new ReviewedStingException(String.format("DNA Length cannot be bigger than 31. dna: %s (%d)", dna, dna.length())); + + long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set + long preContext = 0; // the sum of all combinations that preceded the length of the dna string + for (int i=0; i0) + preContext += Math.pow(4, i); // each length will have 4^i combinations (e.g 1 = 4, 2 = 16, 3 = 64, ...) + } + + return bitSetFrom(baseTen+preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length. + } + + } diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index 5b50c91a6..75fc44873 100755 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -207,14 +207,34 @@ public class MathUtilsUnitTest extends BaseTest { @Test(enabled = true) public void testIntAndBitSetConversion() { - Assert.assertEquals(428, MathUtils.intFrom(MathUtils.bitSetFrom(428))); - Assert.assertEquals(239847, MathUtils.intFrom(MathUtils.bitSetFrom(239847))); - Assert.assertEquals(12726, MathUtils.intFrom(MathUtils.bitSetFrom(12726))); - Assert.assertEquals(0, MathUtils.intFrom(MathUtils.bitSetFrom(0))); - Assert.assertEquals(1, MathUtils.intFrom(MathUtils.bitSetFrom(1))); - Assert.assertEquals(65536, MathUtils.intFrom(MathUtils.bitSetFrom(65536))); + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(428)), 428); + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(239847)), 239847); + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(12726)), 12726); + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(0)), 0); + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(1)), 1); + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(65536)), 65536); + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(Long.MAX_VALUE)), Long.MAX_VALUE); } + @Test(enabled = true) + public void testDNAAndBitSetConversion() { + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("ACGT")), "ACGT"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AGGTGTTGT")), "AGGTGTTGT"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("A")), "A"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("C")), "C"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("G")), "G"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("T")), "T"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("CC")), "CC"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AA")), "AA"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AAAA")), "AAAA"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("CCCCCCCCCCCCCC")), "CCCCCCCCCCCCCC"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("GGGGGGGGGGGGGG")), "GGGGGGGGGGGGGG"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("TTTTTTTTTTTTTT")), "TTTTTTTTTTTTTT"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("GTAGACCGATCTCAGCTAGT")), "GTAGACCGATCTCAGCTAGT"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AACGTCAATGCAGTCAAGTCAGACGTGGGTT")), "AACGTCAATGCAGTCAAGTCAGACGTGGGTT"); // testing max precision (length == 31) + } + + private boolean hasUniqueElements(Object[] x) { for (int i = 0; i < x.length; i++) for (int j = i + 1; j < x.length; j++)