Merge pull request #255 from broadinstitute/rp_gga_mode_kmer_function

Break out the GGA kmers and the read kmers into separate functions for t...
This commit is contained in:
Mark DePristo 2013-06-03 11:30:34 -07:00
commit a05c543728
5 changed files with 74 additions and 17 deletions

View File

@ -143,8 +143,13 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
// something went wrong, so abort right now with a null graph
return null;
// now go through the graph already seeded with the reference sequence and add the read kmers to it as well as the artificial GGA haplotypes
if ( ! addReadKmersToGraph(builder, reads, activeAlleleHaplotypes) )
// add the artificial GGA haplotypes to the graph
if ( ! addGGAKmersToGraph(builder, activeAlleleHaplotypes) )
// something went wrong, so abort right now with a null graph
return null;
// now go through the graph already seeded with the reference sequence and add the read kmers to it
if ( ! addReadKmersToGraph(builder, reads) )
// some problem was detected adding the reads to the graph, return null to indicate we failed
return null;
@ -153,17 +158,16 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
}
/**
* Add the high-quality kmers from the reads to the graph
* Add the high-quality kmers from the artificial GGA haplotypes to the graph
*
* @param builder a debruijn graph builder to add the read kmers to
* @param reads a non-null list of reads whose kmers we want to add to the graph
* @param activeAlleleHaplotypes a list of haplotypes to add to the graph for GGA mode
* @return true if we successfully added the read kmers to the graph without corrupting it in some way
*/
protected boolean addReadKmersToGraph(final DeBruijnGraphBuilder builder, final List<GATKSAMRecord> reads, final List<Haplotype> activeAlleleHaplotypes) {
protected boolean addGGAKmersToGraph(final DeBruijnGraphBuilder builder, final List<Haplotype> activeAlleleHaplotypes) {
final int kmerLength = builder.getKmerSize();
// First pull kmers out of the artificial GGA haplotypes and throw them on the graph
for( final Haplotype haplotype : activeAlleleHaplotypes ) {
final int end = haplotype.length() - kmerLength;
for( int start = 0; start < end; start++ ) {
@ -171,6 +175,20 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
}
}
// always returns true now, but it's possible that we'd add kmers and decide we don't like the graph in some way
return true;
}
/**
* Add the high-quality kmers from the reads to the graph
*
* @param builder a debruijn graph builder to add the read kmers to
* @param reads a non-null list of reads whose kmers we want to add to the graph
* @return true if we successfully added the read kmers to the graph without corrupting it in some way
*/
protected boolean addReadKmersToGraph(final DeBruijnGraphBuilder builder, final List<GATKSAMRecord> reads) {
final int kmerLength = builder.getKmerSize();
// Next pull kmers out of every read and throw them on the graph
for( final GATKSAMRecord read : reads ) {
final byte[] sequence = read.getReadBases();

View File

@ -352,7 +352,7 @@ public final class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
* Merge until the graph has no vertices that are candidates for merging
*/
public boolean transformUntilComplete() {
boolean didAtLeastOneTranform = false;
boolean didAtLeastOneTransform = false;
boolean foundNodesToMerge = true;
while( foundNodesToMerge ) {
foundNodesToMerge = false;
@ -360,13 +360,13 @@ public final class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
for( final SeqVertex v : vertexSet() ) {
foundNodesToMerge = tryToTransform(v);
if ( foundNodesToMerge ) {
didAtLeastOneTranform = true;
didAtLeastOneTransform = true;
break;
}
}
}
return didAtLeastOneTranform;
return didAtLeastOneTransform;
}
/**

View File

@ -97,6 +97,8 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
// add the reference sequence to the graph
rtgraph.addSequence("ref", refHaplotype.getBases(), null, true);
// add the artificial GGA haplotypes to the graph
int hapCount = 0;
for( final Haplotype h : activeAlleleHaplotypes ) {
final int[] counts = new int[h.length()];

View File

@ -147,7 +147,50 @@ public class DeBruijnAssemblerUnitTest extends BaseTest {
}
}
assembler.addReadKmersToGraph(builder, Arrays.asList(read), Collections.<Haplotype>emptyList());
assembler.addReadKmersToGraph(builder, Arrays.asList(read));
Assert.assertEquals(builder.addedPairs.size(), expectedStarts.size());
for ( final Kmer addedKmer : builder.addedPairs ) {
Assert.assertTrue(expectedBases.contains(new String(addedKmer.bases())), "Couldn't find kmer " + addedKmer + " among all expected kmers " + expectedBases);
}
}
@DataProvider(name = "AddGGAKmersToGraph")
public Object[][] makeAddGGAKmersToGraphData() {
List<Object[]> tests = new ArrayList<Object[]>();
// this functionality can be adapted to provide input data for whatever you might want in your data
final String bases = "ACGTAACCGGTTAAACCCGGGTTT";
final int readLen = bases.length();
final List<Integer> allBadStarts = new ArrayList<Integer>(readLen);
for ( int i = 0; i < readLen; i++ ) allBadStarts.add(i);
for ( final int kmerSize : Arrays.asList(3, 4, 5) ) {
tests.add(new Object[]{bases, kmerSize});
}
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "AddGGAKmersToGraph", enabled = ! DEBUG)
public void testAddGGAKmersToGraph(final String bases, final int kmerSize) {
final int readLen = bases.length();
final DeBruijnAssembler assembler = new DeBruijnAssembler();
final MockBuilder builder = new MockBuilder(kmerSize);
final Set<String> expectedBases = new HashSet<String>();
final Set<Integer> expectedStarts = new LinkedHashSet<Integer>();
for ( int i = 0; i < readLen; i++) {
boolean good = true;
for ( int j = 0; j < kmerSize + 1; j++ ) { // +1 is for pairing
good &= i + j < readLen;
}
if ( good ) {
expectedStarts.add(i);
expectedBases.add(bases.substring(i, i + kmerSize + 1));
}
}
assembler.addGGAKmersToGraph(builder, Arrays.asList(new Haplotype(bases.getBytes())));
Assert.assertEquals(builder.addedPairs.size(), expectedStarts.size());
for ( final Kmer addedKmer : builder.addedPairs ) {
Assert.assertTrue(expectedBases.contains(new String(addedKmer.bases())), "Couldn't find kmer " + addedKmer + " among all expected kmers " + expectedBases);

View File

@ -244,9 +244,6 @@ public class MathUtils {
public static double sumLog10(final double[] log10values) {
return Math.pow(10.0, log10sumLog10(log10values));
// double s = 0.0;
// for ( double v : log10values) s += Math.pow(10.0, v);
// return s;
}
public static double log10sumLog10(final double[] log10values) {
@ -859,11 +856,8 @@ public class MathUtils {
break;
sum += x;
i++;
//System.out.printf(" %d/%d", sum, i);
}
//System.out.printf("Sum = %d, n = %d, maxI = %d, avg = %f%n", sum, i, maxI, (1.0 * sum) / i);
return (1.0 * sum) / i;
}
@ -1359,7 +1353,7 @@ public class MathUtils {
}
/**
* Compute in a numerical correct way the quanity log10(1-x)
* Compute in a numerical correct way the quantity log10(1-x)
*
* Uses the approximation log10(1-x) = log10(1/x - 1) + log10(x) to avoid very quick underflow
* in 1-x when x is very small