Merge pull request #255 from broadinstitute/rp_gga_mode_kmer_function
Break out the GGA kmers and the read kmers into separate functions for t...
This commit is contained in:
commit
a05c543728
|
|
@ -143,8 +143,13 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
// something went wrong, so abort right now with a null graph
|
||||
return null;
|
||||
|
||||
// now go through the graph already seeded with the reference sequence and add the read kmers to it as well as the artificial GGA haplotypes
|
||||
if ( ! addReadKmersToGraph(builder, reads, activeAlleleHaplotypes) )
|
||||
// add the artificial GGA haplotypes to the graph
|
||||
if ( ! addGGAKmersToGraph(builder, activeAlleleHaplotypes) )
|
||||
// something went wrong, so abort right now with a null graph
|
||||
return null;
|
||||
|
||||
// now go through the graph already seeded with the reference sequence and add the read kmers to it
|
||||
if ( ! addReadKmersToGraph(builder, reads) )
|
||||
// some problem was detected adding the reads to the graph, return null to indicate we failed
|
||||
return null;
|
||||
|
||||
|
|
@ -153,17 +158,16 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
}
|
||||
|
||||
/**
|
||||
* Add the high-quality kmers from the reads to the graph
|
||||
* Add the high-quality kmers from the artificial GGA haplotypes to the graph
|
||||
*
|
||||
* @param builder a debruijn graph builder to add the read kmers to
|
||||
* @param reads a non-null list of reads whose kmers we want to add to the graph
|
||||
* @param activeAlleleHaplotypes a list of haplotypes to add to the graph for GGA mode
|
||||
* @return true if we successfully added the read kmers to the graph without corrupting it in some way
|
||||
*/
|
||||
protected boolean addReadKmersToGraph(final DeBruijnGraphBuilder builder, final List<GATKSAMRecord> reads, final List<Haplotype> activeAlleleHaplotypes) {
|
||||
protected boolean addGGAKmersToGraph(final DeBruijnGraphBuilder builder, final List<Haplotype> activeAlleleHaplotypes) {
|
||||
|
||||
final int kmerLength = builder.getKmerSize();
|
||||
|
||||
// First pull kmers out of the artificial GGA haplotypes and throw them on the graph
|
||||
for( final Haplotype haplotype : activeAlleleHaplotypes ) {
|
||||
final int end = haplotype.length() - kmerLength;
|
||||
for( int start = 0; start < end; start++ ) {
|
||||
|
|
@ -171,6 +175,20 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
}
|
||||
}
|
||||
|
||||
// always returns true now, but it's possible that we'd add kmers and decide we don't like the graph in some way
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the high-quality kmers from the reads to the graph
|
||||
*
|
||||
* @param builder a debruijn graph builder to add the read kmers to
|
||||
* @param reads a non-null list of reads whose kmers we want to add to the graph
|
||||
* @return true if we successfully added the read kmers to the graph without corrupting it in some way
|
||||
*/
|
||||
protected boolean addReadKmersToGraph(final DeBruijnGraphBuilder builder, final List<GATKSAMRecord> reads) {
|
||||
final int kmerLength = builder.getKmerSize();
|
||||
|
||||
// Next pull kmers out of every read and throw them on the graph
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
final byte[] sequence = read.getReadBases();
|
||||
|
|
|
|||
|
|
@ -352,7 +352,7 @@ public final class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
|
|||
* Merge until the graph has no vertices that are candidates for merging
|
||||
*/
|
||||
public boolean transformUntilComplete() {
|
||||
boolean didAtLeastOneTranform = false;
|
||||
boolean didAtLeastOneTransform = false;
|
||||
boolean foundNodesToMerge = true;
|
||||
while( foundNodesToMerge ) {
|
||||
foundNodesToMerge = false;
|
||||
|
|
@ -360,13 +360,13 @@ public final class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
|
|||
for( final SeqVertex v : vertexSet() ) {
|
||||
foundNodesToMerge = tryToTransform(v);
|
||||
if ( foundNodesToMerge ) {
|
||||
didAtLeastOneTranform = true;
|
||||
didAtLeastOneTransform = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return didAtLeastOneTranform;
|
||||
return didAtLeastOneTransform;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -97,6 +97,8 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
|||
|
||||
// add the reference sequence to the graph
|
||||
rtgraph.addSequence("ref", refHaplotype.getBases(), null, true);
|
||||
|
||||
// add the artificial GGA haplotypes to the graph
|
||||
int hapCount = 0;
|
||||
for( final Haplotype h : activeAlleleHaplotypes ) {
|
||||
final int[] counts = new int[h.length()];
|
||||
|
|
|
|||
|
|
@ -147,7 +147,50 @@ public class DeBruijnAssemblerUnitTest extends BaseTest {
|
|||
}
|
||||
}
|
||||
|
||||
assembler.addReadKmersToGraph(builder, Arrays.asList(read), Collections.<Haplotype>emptyList());
|
||||
assembler.addReadKmersToGraph(builder, Arrays.asList(read));
|
||||
Assert.assertEquals(builder.addedPairs.size(), expectedStarts.size());
|
||||
for ( final Kmer addedKmer : builder.addedPairs ) {
|
||||
Assert.assertTrue(expectedBases.contains(new String(addedKmer.bases())), "Couldn't find kmer " + addedKmer + " among all expected kmers " + expectedBases);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "AddGGAKmersToGraph")
|
||||
public Object[][] makeAddGGAKmersToGraphData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
// this functionality can be adapted to provide input data for whatever you might want in your data
|
||||
final String bases = "ACGTAACCGGTTAAACCCGGGTTT";
|
||||
final int readLen = bases.length();
|
||||
final List<Integer> allBadStarts = new ArrayList<Integer>(readLen);
|
||||
for ( int i = 0; i < readLen; i++ ) allBadStarts.add(i);
|
||||
|
||||
for ( final int kmerSize : Arrays.asList(3, 4, 5) ) {
|
||||
tests.add(new Object[]{bases, kmerSize});
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "AddGGAKmersToGraph", enabled = ! DEBUG)
|
||||
public void testAddGGAKmersToGraph(final String bases, final int kmerSize) {
|
||||
final int readLen = bases.length();
|
||||
final DeBruijnAssembler assembler = new DeBruijnAssembler();
|
||||
final MockBuilder builder = new MockBuilder(kmerSize);
|
||||
|
||||
final Set<String> expectedBases = new HashSet<String>();
|
||||
final Set<Integer> expectedStarts = new LinkedHashSet<Integer>();
|
||||
for ( int i = 0; i < readLen; i++) {
|
||||
boolean good = true;
|
||||
for ( int j = 0; j < kmerSize + 1; j++ ) { // +1 is for pairing
|
||||
good &= i + j < readLen;
|
||||
}
|
||||
if ( good ) {
|
||||
expectedStarts.add(i);
|
||||
expectedBases.add(bases.substring(i, i + kmerSize + 1));
|
||||
}
|
||||
}
|
||||
|
||||
assembler.addGGAKmersToGraph(builder, Arrays.asList(new Haplotype(bases.getBytes())));
|
||||
Assert.assertEquals(builder.addedPairs.size(), expectedStarts.size());
|
||||
for ( final Kmer addedKmer : builder.addedPairs ) {
|
||||
Assert.assertTrue(expectedBases.contains(new String(addedKmer.bases())), "Couldn't find kmer " + addedKmer + " among all expected kmers " + expectedBases);
|
||||
|
|
|
|||
|
|
@ -244,9 +244,6 @@ public class MathUtils {
|
|||
|
||||
public static double sumLog10(final double[] log10values) {
|
||||
return Math.pow(10.0, log10sumLog10(log10values));
|
||||
// double s = 0.0;
|
||||
// for ( double v : log10values) s += Math.pow(10.0, v);
|
||||
// return s;
|
||||
}
|
||||
|
||||
public static double log10sumLog10(final double[] log10values) {
|
||||
|
|
@ -859,11 +856,8 @@ public class MathUtils {
|
|||
break;
|
||||
sum += x;
|
||||
i++;
|
||||
//System.out.printf(" %d/%d", sum, i);
|
||||
}
|
||||
|
||||
//System.out.printf("Sum = %d, n = %d, maxI = %d, avg = %f%n", sum, i, maxI, (1.0 * sum) / i);
|
||||
|
||||
return (1.0 * sum) / i;
|
||||
}
|
||||
|
||||
|
|
@ -1359,7 +1353,7 @@ public class MathUtils {
|
|||
}
|
||||
|
||||
/**
|
||||
* Compute in a numerical correct way the quanity log10(1-x)
|
||||
* Compute in a numerical correct way the quantity log10(1-x)
|
||||
*
|
||||
* Uses the approximation log10(1-x) = log10(1/x - 1) + log10(x) to avoid very quick underflow
|
||||
* in 1-x when x is very small
|
||||
|
|
|
|||
Loading…
Reference in New Issue