Merge pull request #255 from broadinstitute/rp_gga_mode_kmer_function
Break out the GGA kmers and the read kmers into separate functions for t...
This commit is contained in:
commit
a05c543728
|
|
@ -143,8 +143,13 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
||||||
// something went wrong, so abort right now with a null graph
|
// something went wrong, so abort right now with a null graph
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
// now go through the graph already seeded with the reference sequence and add the read kmers to it as well as the artificial GGA haplotypes
|
// add the artificial GGA haplotypes to the graph
|
||||||
if ( ! addReadKmersToGraph(builder, reads, activeAlleleHaplotypes) )
|
if ( ! addGGAKmersToGraph(builder, activeAlleleHaplotypes) )
|
||||||
|
// something went wrong, so abort right now with a null graph
|
||||||
|
return null;
|
||||||
|
|
||||||
|
// now go through the graph already seeded with the reference sequence and add the read kmers to it
|
||||||
|
if ( ! addReadKmersToGraph(builder, reads) )
|
||||||
// some problem was detected adding the reads to the graph, return null to indicate we failed
|
// some problem was detected adding the reads to the graph, return null to indicate we failed
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
|
|
@ -153,17 +158,16 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Add the high-quality kmers from the reads to the graph
|
* Add the high-quality kmers from the artificial GGA haplotypes to the graph
|
||||||
*
|
*
|
||||||
* @param builder a debruijn graph builder to add the read kmers to
|
* @param builder a debruijn graph builder to add the read kmers to
|
||||||
* @param reads a non-null list of reads whose kmers we want to add to the graph
|
|
||||||
* @param activeAlleleHaplotypes a list of haplotypes to add to the graph for GGA mode
|
* @param activeAlleleHaplotypes a list of haplotypes to add to the graph for GGA mode
|
||||||
* @return true if we successfully added the read kmers to the graph without corrupting it in some way
|
* @return true if we successfully added the read kmers to the graph without corrupting it in some way
|
||||||
*/
|
*/
|
||||||
protected boolean addReadKmersToGraph(final DeBruijnGraphBuilder builder, final List<GATKSAMRecord> reads, final List<Haplotype> activeAlleleHaplotypes) {
|
protected boolean addGGAKmersToGraph(final DeBruijnGraphBuilder builder, final List<Haplotype> activeAlleleHaplotypes) {
|
||||||
|
|
||||||
final int kmerLength = builder.getKmerSize();
|
final int kmerLength = builder.getKmerSize();
|
||||||
|
|
||||||
// First pull kmers out of the artificial GGA haplotypes and throw them on the graph
|
|
||||||
for( final Haplotype haplotype : activeAlleleHaplotypes ) {
|
for( final Haplotype haplotype : activeAlleleHaplotypes ) {
|
||||||
final int end = haplotype.length() - kmerLength;
|
final int end = haplotype.length() - kmerLength;
|
||||||
for( int start = 0; start < end; start++ ) {
|
for( int start = 0; start < end; start++ ) {
|
||||||
|
|
@ -171,6 +175,20 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// always returns true now, but it's possible that we'd add kmers and decide we don't like the graph in some way
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add the high-quality kmers from the reads to the graph
|
||||||
|
*
|
||||||
|
* @param builder a debruijn graph builder to add the read kmers to
|
||||||
|
* @param reads a non-null list of reads whose kmers we want to add to the graph
|
||||||
|
* @return true if we successfully added the read kmers to the graph without corrupting it in some way
|
||||||
|
*/
|
||||||
|
protected boolean addReadKmersToGraph(final DeBruijnGraphBuilder builder, final List<GATKSAMRecord> reads) {
|
||||||
|
final int kmerLength = builder.getKmerSize();
|
||||||
|
|
||||||
// Next pull kmers out of every read and throw them on the graph
|
// Next pull kmers out of every read and throw them on the graph
|
||||||
for( final GATKSAMRecord read : reads ) {
|
for( final GATKSAMRecord read : reads ) {
|
||||||
final byte[] sequence = read.getReadBases();
|
final byte[] sequence = read.getReadBases();
|
||||||
|
|
|
||||||
|
|
@ -352,7 +352,7 @@ public final class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
|
||||||
* Merge until the graph has no vertices that are candidates for merging
|
* Merge until the graph has no vertices that are candidates for merging
|
||||||
*/
|
*/
|
||||||
public boolean transformUntilComplete() {
|
public boolean transformUntilComplete() {
|
||||||
boolean didAtLeastOneTranform = false;
|
boolean didAtLeastOneTransform = false;
|
||||||
boolean foundNodesToMerge = true;
|
boolean foundNodesToMerge = true;
|
||||||
while( foundNodesToMerge ) {
|
while( foundNodesToMerge ) {
|
||||||
foundNodesToMerge = false;
|
foundNodesToMerge = false;
|
||||||
|
|
@ -360,13 +360,13 @@ public final class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
|
||||||
for( final SeqVertex v : vertexSet() ) {
|
for( final SeqVertex v : vertexSet() ) {
|
||||||
foundNodesToMerge = tryToTransform(v);
|
foundNodesToMerge = tryToTransform(v);
|
||||||
if ( foundNodesToMerge ) {
|
if ( foundNodesToMerge ) {
|
||||||
didAtLeastOneTranform = true;
|
didAtLeastOneTransform = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return didAtLeastOneTranform;
|
return didAtLeastOneTransform;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -97,6 +97,8 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
||||||
|
|
||||||
// add the reference sequence to the graph
|
// add the reference sequence to the graph
|
||||||
rtgraph.addSequence("ref", refHaplotype.getBases(), null, true);
|
rtgraph.addSequence("ref", refHaplotype.getBases(), null, true);
|
||||||
|
|
||||||
|
// add the artificial GGA haplotypes to the graph
|
||||||
int hapCount = 0;
|
int hapCount = 0;
|
||||||
for( final Haplotype h : activeAlleleHaplotypes ) {
|
for( final Haplotype h : activeAlleleHaplotypes ) {
|
||||||
final int[] counts = new int[h.length()];
|
final int[] counts = new int[h.length()];
|
||||||
|
|
|
||||||
|
|
@ -147,7 +147,50 @@ public class DeBruijnAssemblerUnitTest extends BaseTest {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
assembler.addReadKmersToGraph(builder, Arrays.asList(read), Collections.<Haplotype>emptyList());
|
assembler.addReadKmersToGraph(builder, Arrays.asList(read));
|
||||||
|
Assert.assertEquals(builder.addedPairs.size(), expectedStarts.size());
|
||||||
|
for ( final Kmer addedKmer : builder.addedPairs ) {
|
||||||
|
Assert.assertTrue(expectedBases.contains(new String(addedKmer.bases())), "Couldn't find kmer " + addedKmer + " among all expected kmers " + expectedBases);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@DataProvider(name = "AddGGAKmersToGraph")
|
||||||
|
public Object[][] makeAddGGAKmersToGraphData() {
|
||||||
|
List<Object[]> tests = new ArrayList<Object[]>();
|
||||||
|
|
||||||
|
// this functionality can be adapted to provide input data for whatever you might want in your data
|
||||||
|
final String bases = "ACGTAACCGGTTAAACCCGGGTTT";
|
||||||
|
final int readLen = bases.length();
|
||||||
|
final List<Integer> allBadStarts = new ArrayList<Integer>(readLen);
|
||||||
|
for ( int i = 0; i < readLen; i++ ) allBadStarts.add(i);
|
||||||
|
|
||||||
|
for ( final int kmerSize : Arrays.asList(3, 4, 5) ) {
|
||||||
|
tests.add(new Object[]{bases, kmerSize});
|
||||||
|
}
|
||||||
|
|
||||||
|
return tests.toArray(new Object[][]{});
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(dataProvider = "AddGGAKmersToGraph", enabled = ! DEBUG)
|
||||||
|
public void testAddGGAKmersToGraph(final String bases, final int kmerSize) {
|
||||||
|
final int readLen = bases.length();
|
||||||
|
final DeBruijnAssembler assembler = new DeBruijnAssembler();
|
||||||
|
final MockBuilder builder = new MockBuilder(kmerSize);
|
||||||
|
|
||||||
|
final Set<String> expectedBases = new HashSet<String>();
|
||||||
|
final Set<Integer> expectedStarts = new LinkedHashSet<Integer>();
|
||||||
|
for ( int i = 0; i < readLen; i++) {
|
||||||
|
boolean good = true;
|
||||||
|
for ( int j = 0; j < kmerSize + 1; j++ ) { // +1 is for pairing
|
||||||
|
good &= i + j < readLen;
|
||||||
|
}
|
||||||
|
if ( good ) {
|
||||||
|
expectedStarts.add(i);
|
||||||
|
expectedBases.add(bases.substring(i, i + kmerSize + 1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
assembler.addGGAKmersToGraph(builder, Arrays.asList(new Haplotype(bases.getBytes())));
|
||||||
Assert.assertEquals(builder.addedPairs.size(), expectedStarts.size());
|
Assert.assertEquals(builder.addedPairs.size(), expectedStarts.size());
|
||||||
for ( final Kmer addedKmer : builder.addedPairs ) {
|
for ( final Kmer addedKmer : builder.addedPairs ) {
|
||||||
Assert.assertTrue(expectedBases.contains(new String(addedKmer.bases())), "Couldn't find kmer " + addedKmer + " among all expected kmers " + expectedBases);
|
Assert.assertTrue(expectedBases.contains(new String(addedKmer.bases())), "Couldn't find kmer " + addedKmer + " among all expected kmers " + expectedBases);
|
||||||
|
|
|
||||||
|
|
@ -244,9 +244,6 @@ public class MathUtils {
|
||||||
|
|
||||||
public static double sumLog10(final double[] log10values) {
|
public static double sumLog10(final double[] log10values) {
|
||||||
return Math.pow(10.0, log10sumLog10(log10values));
|
return Math.pow(10.0, log10sumLog10(log10values));
|
||||||
// double s = 0.0;
|
|
||||||
// for ( double v : log10values) s += Math.pow(10.0, v);
|
|
||||||
// return s;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static double log10sumLog10(final double[] log10values) {
|
public static double log10sumLog10(final double[] log10values) {
|
||||||
|
|
@ -859,11 +856,8 @@ public class MathUtils {
|
||||||
break;
|
break;
|
||||||
sum += x;
|
sum += x;
|
||||||
i++;
|
i++;
|
||||||
//System.out.printf(" %d/%d", sum, i);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//System.out.printf("Sum = %d, n = %d, maxI = %d, avg = %f%n", sum, i, maxI, (1.0 * sum) / i);
|
|
||||||
|
|
||||||
return (1.0 * sum) / i;
|
return (1.0 * sum) / i;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1359,7 +1353,7 @@ public class MathUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compute in a numerical correct way the quanity log10(1-x)
|
* Compute in a numerical correct way the quantity log10(1-x)
|
||||||
*
|
*
|
||||||
* Uses the approximation log10(1-x) = log10(1/x - 1) + log10(x) to avoid very quick underflow
|
* Uses the approximation log10(1-x) = log10(1/x - 1) + log10(x) to avoid very quick underflow
|
||||||
* in 1-x when x is very small
|
* in 1-x when x is very small
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue