Merge branch 'master' of ssh://gsa1.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Eric Banks 2012-03-01 10:55:21 -05:00
commit 88e060e1e9
8 changed files with 162 additions and 59 deletions

View File

@ -26,10 +26,12 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Arrays;
import java.util.BitSet;
/**
* Created by IntelliJ IDEA.
@ -43,10 +45,6 @@ public class ContextCovariate implements StandardCovariate {
private int insertionsContextSize;
private int deletionsContextSize;
private String mismatchesNoContext = "";
private String insertionsNoContext = "";
private String deletionsNoContext = "";
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
public void initialize(final RecalibrationArgumentCollection RAC) {
@ -57,29 +55,26 @@ public class ContextCovariate implements StandardCovariate {
if (mismatchesContextSize <= 0 || insertionsContextSize <= 0 || deletionsContextSize <= 0)
throw new UserException(String.format("Context Size must be positive, if you don't want to use the context covariate, just turn it off instead. Mismatches: %d Insertions: %d Deletions:%d", mismatchesContextSize, insertionsContextSize, deletionsContextSize));
// initialize no context strings given the size of the context for each covariate type
mismatchesNoContext = makeAllNStringWithLength(mismatchesContextSize);
insertionsNoContext = makeAllNStringWithLength(insertionsContextSize);
deletionsNoContext = makeAllNStringWithLength( deletionsContextSize);
}
@Override
public CovariateValues getValues(final GATKSAMRecord read) {
int l = read.getReadLength();
String[] mismatches = new String [l];
String[] insertions = new String [l];
String[] deletions = new String [l];
BitSet[] mismatches = new BitSet[l];
BitSet[] insertions = new BitSet[l];
BitSet[] deletions = new BitSet[l];
final boolean negativeStrand = read.getReadNegativeStrandFlag();
byte[] bases = read.getReadBases();
if (negativeStrand) {
bases = BaseUtils.simpleReverseComplement(bases); //this is NOT in-place
}
if (negativeStrand)
bases = BaseUtils.simpleReverseComplement(bases);
for (int i = 0; i < read.getReadLength(); i++) {
mismatches[i] = contextWith(bases, i, mismatchesContextSize, mismatchesNoContext);
insertions[i] = contextWith(bases, i, insertionsContextSize, insertionsNoContext);
deletions[i] = contextWith(bases, i, deletionsContextSize, deletionsNoContext);
mismatches[i] = contextWith(bases, i, mismatchesContextSize);
insertions[i] = contextWith(bases, i, insertionsContextSize);
deletions[i] = contextWith(bases, i, deletionsContextSize);
}
if (negativeStrand) {
reverse(mismatches);
reverse(insertions);
@ -90,7 +85,7 @@ public class ContextCovariate implements StandardCovariate {
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
public final Comparable getValue(final String str) {
public final Object getValue(final String str) {
return str;
}
@ -100,29 +95,28 @@ public class ContextCovariate implements StandardCovariate {
* @param bases the bases in the read to build the context from
* @param offset the position in the read to calculate the context for
* @param contextSize context size to use building the context
* @param noContextString string to return if the position is not far enough in the read to have a full context before.
* @return
*/
private String contextWith(byte [] bases, int offset, int contextSize, String noContextString) {
return (offset < contextSize) ? noContextString : new String(Arrays.copyOfRange(bases, offset - contextSize, offset));
private BitSet contextWith(byte [] bases, int offset, int contextSize) {
if (offset < contextSize)
return null;
String context = new String(Arrays.copyOfRange(bases, offset - contextSize, offset));
if (context.contains("N"))
return null;
return MathUtils.bitSetFrom(context);
}
private String makeAllNStringWithLength(int length) {
String s = "";
for (int i=0; i<length; i++)
s += "N";
return s;
}
/**
* Reverses the given array in place.
*
* @param array any array
*/
private static void reverse(final Comparable[] array) {
private static void reverse(final Object[] array) {
final int arrayLength = array.length;
for (int l = 0, r = arrayLength - 1; l < r; l++, r--) {
final Comparable temp = array[l];
final Object temp = array[l];
array[l] = array[r];
array[r] = temp;
}

View File

@ -53,7 +53,7 @@ public interface Covariate {
*/
public CovariateValues getValues(GATKSAMRecord read);
public Comparable getValue(String str); // Used to get the covariate's value from input csv file during on-the-fly recalibration
public Object getValue(String str); // Used to get the covariate's value from input csv file during on-the-fly recalibration
}
interface RequiredCovariate extends Covariate {}

View File

@ -12,25 +12,25 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
* @since 2/8/12
*/
public class CovariateValues {
private Comparable[] mismatches;
private Comparable[] insertions;
private Comparable[] deletions;
private Object[] mismatches;
private Object[] insertions;
private Object[] deletions;
public CovariateValues(Comparable[] mismatch, Comparable[] insertion, Comparable[] deletion) {
public CovariateValues(Object[] mismatch, Object[] insertion, Object[] deletion) {
this.mismatches = mismatch;
this.insertions = insertion;
this.deletions = deletion;
}
public Comparable[] getMismatches() {
public Object[] getMismatches() {
return mismatches;
}
public Comparable[] getInsertions() {
public Object[] getInsertions() {
return insertions;
}
public Comparable[] getDeletions() {
public Object[] getDeletions() {
return deletions;
}

View File

@ -198,7 +198,7 @@ public class CycleCovariate implements StandardCovariate {
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
public final Comparable getValue(final String str) {
public final Object getValue(final String str) {
return Integer.parseInt(str);
}
}

View File

@ -2,8 +2,6 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Arrays;
/*
* Copyright (c) 2009 The Broad Institute
*
@ -67,7 +65,7 @@ public class QualityScoreCovariate implements RequiredCovariate {
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
public final Comparable getValue(final String str) {
public final Object getValue(final String str) {
return Integer.parseInt(str);
}
}

View File

@ -55,7 +55,7 @@ public class ReadGroupCovariate implements RequiredCovariate {
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
public final Comparable getValue(final String str) {
public final Object getValue(final String str) {
return str;
}
}

View File

@ -29,6 +29,7 @@ import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.math.BigDecimal;
@ -1619,30 +1620,120 @@ public class MathUtils {
* @param bitSet the bitset
* @return an integer with the bitset representation
*/
public static int intFrom(final BitSet bitSet) {
int integer = 0;
public static long intFrom(final BitSet bitSet) {
long number = 0;
for (int bitIndex = bitSet.nextSetBit(0); bitIndex >= 0; bitIndex = bitSet.nextSetBit(bitIndex+1))
integer |= 1 << bitIndex;
number |= 1L << bitIndex;
return integer;
return number;
}
/**
* Creates a BitSet representation of a given integer
*
* @param integer the number to turn into a bitset
* @param number the number to turn into a bitset
* @return a bitset representation of the integer
*/
public static BitSet bitSetFrom(int integer) {
BitSet bitSet = new BitSet((int) Math.ceil(Math.sqrt(integer)));
public static BitSet bitSetFrom(long number) {
BitSet bitSet = new BitSet();
int bitIndex = 0;
while (integer > 0) {
if (integer%2 > 0)
while (number > 0) {
if (number%2 > 0)
bitSet.set(bitIndex);
bitIndex++;
integer /= 2;
number /= 2;
}
return bitSet;
}
/**
* Converts a BitSet into the dna string representation.
*
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
* be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create
* a bitSetFrom(BigNumber) method.
*
* We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the
* base_10 representation of the sequence. This is important for us to know how to bring the number
* to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented
* as 0's and leading 0's are omitted).
*
* quasi-canonical because A is represented by a 0, therefore,
* instead of : 0, 1, 2, 3, 10, 11, 12, ...
* we have : 0, 1, 2, 3, 00, 01, 02, ...
*
* but we can correctly decode it because we know the final length.
*
* @param bitSet the bitset representation of the dna sequence
* @return the dna sequence represented by the bitset
*/
public static String dnaFrom(final BitSet bitSet) {
long number = intFrom(bitSet); // the base_10 representation of the bit set
long preContext = 0; // the number of combinations skipped to get to the quasi-canonical representation (we keep it to subtract later)
long nextContext = 4; // the next context (we advance it so we know which one was preceding it).
int i = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet.
while (nextContext <= number) { // find the length of the dna string (i)
preContext = nextContext; // keep track of the number of combinations in the preceding context
nextContext += Math.pow(4, ++i);// calculate the next context
}
number -= preContext; // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation
String dna = "";
while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical)
byte base = (byte) (number % 4);
switch (base) {
case 0 : dna = "A" + dna; break;
case 1 : dna = "C" + dna; break;
case 2 : dna = "G" + dna; break;
case 3 : dna = "T" + dna; break;
}
number /= 4;
}
for (int j = dna.length(); j < i; j++)
dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above)
return dna;
}
/**
* Creates a BitSet representation of a given dna string.
*
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
* be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create
* a bitSetFrom(BigNumber) method.
*
* The bit representation of a dna string is the simple:
* 0 A 4 AA 8 CA
* 1 C 5 AC ...
* 2 G 6 AG 1343 TTGGT
* 3 T 7 AT 1364 TTTTT
*
* To convert from dna to number, we convert the dna string to base10 and add all combinations that
* preceded the string (with smaller lengths).
*
* @param dna the dna sequence
* @return the bitset representing the dna sequence
*/
public static BitSet bitSetFrom(String dna) {
if (dna.length() > 31)
throw new ReviewedStingException(String.format("DNA Length cannot be bigger than 31. dna: %s (%d)", dna, dna.length()));
long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set
long preContext = 0; // the sum of all combinations that preceded the length of the dna string
for (int i=0; i<dna.length(); i++) {
baseTen *= 4;
switch(dna.charAt(i)) {
case 'A': baseTen += 0; break;
case 'C': baseTen += 1; break;
case 'G': baseTen += 2; break;
case 'T': baseTen += 3; break;
}
if (i>0)
preContext += Math.pow(4, i); // each length will have 4^i combinations (e.g 1 = 4, 2 = 16, 3 = 64, ...)
}
return bitSetFrom(baseTen+preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length.
}
}

View File

@ -207,14 +207,34 @@ public class MathUtilsUnitTest extends BaseTest {
@Test(enabled = true)
public void testIntAndBitSetConversion() {
Assert.assertEquals(428, MathUtils.intFrom(MathUtils.bitSetFrom(428)));
Assert.assertEquals(239847, MathUtils.intFrom(MathUtils.bitSetFrom(239847)));
Assert.assertEquals(12726, MathUtils.intFrom(MathUtils.bitSetFrom(12726)));
Assert.assertEquals(0, MathUtils.intFrom(MathUtils.bitSetFrom(0)));
Assert.assertEquals(1, MathUtils.intFrom(MathUtils.bitSetFrom(1)));
Assert.assertEquals(65536, MathUtils.intFrom(MathUtils.bitSetFrom(65536)));
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(428)), 428);
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(239847)), 239847);
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(12726)), 12726);
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(0)), 0);
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(1)), 1);
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(65536)), 65536);
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(Long.MAX_VALUE)), Long.MAX_VALUE);
}
@Test(enabled = true)
public void testDNAAndBitSetConversion() {
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("ACGT")), "ACGT");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AGGTGTTGT")), "AGGTGTTGT");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("A")), "A");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("C")), "C");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("G")), "G");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("T")), "T");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("CC")), "CC");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AA")), "AA");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AAAA")), "AAAA");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("CCCCCCCCCCCCCC")), "CCCCCCCCCCCCCC");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("GGGGGGGGGGGGGG")), "GGGGGGGGGGGGGG");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("TTTTTTTTTTTTTT")), "TTTTTTTTTTTTTT");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("GTAGACCGATCTCAGCTAGT")), "GTAGACCGATCTCAGCTAGT");
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AACGTCAATGCAGTCAAGTCAGACGTGGGTT")), "AACGTCAATGCAGTCAAGTCAGACGTGGGTT"); // testing max precision (length == 31)
}
private boolean hasUniqueElements(Object[] x) {
for (int i = 0; i < x.length; i++)
for (int j = i + 1; j < x.length; j++)