From fb86887bf2e51e1784a9c8285c02c31f5373ff64 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 8 Apr 2013 17:19:08 -0400 Subject: [PATCH] Fast algorithm for determining which kmers are good in a read -- old algorithm was O(kmerSize * readLen) for each read. New algorithm is O(readLen) -- Added real unit tests for the addKmersFromReads to the graph. Using a builder is great because we can create a MockBuilder that captures all of the calls, and then verify that all of the added kmers are the ones we'd expect. --- .../haplotypecaller/DeBruijnAssembler.java | 33 ++++---- .../haplotypecaller/DeBruijnGraphBuilder.java | 56 ++++++++++++-- .../walkers/haplotypecaller/KMerCounter.java | 15 +++- .../DeBruijnAssemblerUnitTest.java | 76 +++++++++++++++++++ ...t.java => KMerCounterCaseFixUnitTest.java} | 6 +- 5 files changed, 153 insertions(+), 33 deletions(-) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{KmerCounterUnitTest.java => KMerCounterCaseFixUnitTest.java} (98%) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index b19b1f8d7..8b89bb1bb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -55,11 +55,11 @@ import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SWPairwiseAlignment; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -283,30 +283,27 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { final byte[] sequence = read.getReadBases(); final byte[] qualities = read.getBaseQualities(); final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced - if( sequence.length > kmerLength + KMER_OVERLAP ) { - final int kmersInSequence = sequence.length - kmerLength + 1; - for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { - // TODO -- this is quite expense as it does O(kmerLength^2 work) per read - // if the qualities of all the bases in the kmers are high enough - boolean badKmer = false; - for( int jjj = iii; jjj < iii + kmerLength + 1; jjj++) { - if( qualities[jjj] < minBaseQualityToUseInAssembly ) { - badKmer = true; - break; - } - } - if( !badKmer ) { + if ( sequence.length > kmerLength + KMER_OVERLAP ) { + int lastGood = -1; // the index of the last good base we've seen + for( int end = 0; end < sequence.length; end++ ) { + if ( qualities[end] < minBaseQualityToUseInAssembly ) { + lastGood = -1; // reset the last good base + } else if ( lastGood == -1 ) { + lastGood = end; // we're at a good base, the last good one is us + } else if ( end - kmerLength >= lastGood ) { + // end - kmerLength (the start) is after the lastGood base, so that kmer is good + final int start = end - kmerLength; // how many observations of this kmer have we seen? A normal read counts for 1, but // a reduced read might imply a higher multiplicity for our the edge int countNumber = 1; - if( read.isReducedRead() ) { + if ( read.isReducedRead() ) { // compute mean number of reduced read counts in current kmer span // precise rounding can make a difference with low consensus counts // TODO -- optimization: should extend arrayMax function to take start stop values - countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, iii, iii + kmerLength)); + countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, start, end)); } - builder.addKmerPairFromSeqToGraph(sequence, iii, countNumber); + builder.addKmerPairFromSeqToGraph(sequence, start, countNumber); } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java index 418174e64..0f66082c6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java @@ -46,32 +46,56 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; -import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph; /** + * Fast approach to building a DeBruijnGraph + * + * Follows the model: + * + * for each X that has bases for the final graph: + * addKmer pair (single kmer with kmer size + 1 spanning the pair) + * + * flushKmersToGraph + * * User: depristo * Date: 4/7/13 * Time: 4:14 PM */ public class DeBruijnGraphBuilder { - private final static Logger logger = Logger.getLogger(DeBruijnGraphBuilder.class); - + /** The size of the kmer graph we want to build */ private final int kmerSize; - private final DeBruijnGraph graph; - private final KmerCounter counter; + /** The graph we're going to add kmers to */ + private final DeBruijnGraph graph; + + /** keeps counts of all kmer pairs added since the last flush */ + private final KMerCounter counter; + + /** + * Create a new builder that will write out kmers to graph + * + * @param graph a non-null graph that can contain already added kmers + */ public DeBruijnGraphBuilder(final DeBruijnGraph graph) { if ( graph == null ) throw new IllegalArgumentException("Graph cannot be null"); this.kmerSize = graph.getKmerSize(); this.graph = graph; - this.counter = new KmerCounter(kmerSize + 1); + this.counter = new KMerCounter(kmerSize + 1); } + /** + * The graph we're building + * @return a non-null graph + */ public DeBruijnGraph getGraph() { return graph; } + /** + * The kmer size of our graph + * @return positive integer + */ public int getKmerSize() { return kmerSize; } @@ -93,14 +117,30 @@ public class DeBruijnGraphBuilder { addKmerPair(kmerPair, multiplicity); } - + /** + * Add a single kmer pair to this builder + * @param kmerPair a kmer pair is a single kmer that has kmerSize + 1 bp, where 0 -> kmersize and 1 -> kmersize + 1 + * will have an edge added to this + * @param multiplicity the desired multiplicity of this edge + */ public void addKmerPair(final Kmer kmerPair, final int multiplicity) { if ( kmerPair.length() != kmerSize + 1 ) throw new IllegalArgumentException("kmer pair must be of length kmerSize + 1 = " + kmerSize + 1 + " but got " + kmerPair.length()); counter.addKmer(kmerPair, multiplicity); } + /** + * Flushes the currently added kmers to the graph + * + * After this function is called the builder is reset to an empty state + * + * This flushing is expensive, so many kmers should be added to the builder before flushing. The most + * efficient workflow is to add all of the kmers of a particular class (all ref bases, or all read bases) + * then and do one flush when completed + * + * @param addRefEdges should the kmers present in the builder be added to the graph with isRef = true for the edges? + */ public void flushKmersToGraph(final boolean addRefEdges) { - for ( final KmerCounter.CountedKmer countedKmer : counter.getCountedKmers() ) { + for ( final KMerCounter.CountedKmer countedKmer : counter.getCountedKmers() ) { final byte[] first = countedKmer.getKmer().subKmer(0, kmerSize).bases(); final byte[] second = countedKmer.getKmer().subKmer(1, kmerSize).bases(); graph.addKmersToGraph(first, second, addRefEdges, countedKmer.getCount()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java index 38d829a87..a7194f85f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java @@ -59,8 +59,8 @@ import java.util.Map; * Date: 3/8/13 * Time: 1:16 PM */ -public class KmerCounter { - //private final static Logger logger = Logger.getLogger(KmerCounter.class); +public class KMerCounter { + //private final static Logger logger = Logger.getLogger(KMerCounter.class); /** * A map of for each kmer to its num occurrences in addKmers @@ -73,7 +73,7 @@ public class KmerCounter { * * @param kmerLength the length of kmers we'll be counting to error correct, must be >= 1 */ - public KmerCounter(final int kmerLength) { + public KMerCounter(final int kmerLength) { if ( kmerLength < 1 ) throw new IllegalArgumentException("kmerLength must be > 0 but got " + kmerLength); this.kmerLength = kmerLength; } @@ -89,10 +89,17 @@ public class KmerCounter { return counted == null ? 0 : counted.count; } + /** + * Get an unordered collection of the counted kmers in this counter + * @return a non-null collection + */ public Collection getCountedKmers() { return countsByKMer.values(); } + /** + * Remove all current counts, resetting the counter to an empty state + */ public void clear() { countsByKMer.clear(); } @@ -117,7 +124,7 @@ public class KmerCounter { @Override public String toString() { - final StringBuilder b = new StringBuilder("KmerCounter{"); + final StringBuilder b = new StringBuilder("KMerCounter{"); b.append("counting ").append(countsByKMer.size()).append(" distinct kmers"); b.append("\n}"); return b.toString(); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java index 59d13dee4..e6dea4d11 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java @@ -55,13 +55,16 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; @@ -122,4 +125,77 @@ public class DeBruijnAssemblerUnitTest extends BaseTest { } } + private static class MockBuilder extends DeBruijnGraphBuilder { + public final List addedPairs = new LinkedList(); + + private MockBuilder(final int kmerSize) { + super(new DeBruijnGraph(kmerSize)); + } + + @Override + public void addKmerPair(Kmer kmerPair, int multiplicity) { + logger.info("addKmerPair" + kmerPair); + addedPairs.add(kmerPair); + } + + @Override + public void flushKmersToGraph(boolean addRefEdges) { + // do nothing + } + } + + @DataProvider(name = "AddReadKmersToGraph") + public Object[][] makeAddReadKmersToGraphData() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + final String bases = "ACGTAACCGGTTAAACCCGGGTTT"; + final int readLen = bases.length(); + final List allBadStarts = new ArrayList(readLen); + for ( int i = 0; i < readLen; i++ ) allBadStarts.add(i); + + for ( final int kmerSize : Arrays.asList(3, 4, 5) ) { + for ( final int nBadQuals : Arrays.asList(0, 1, 2) ) { + for ( final List badStarts : Utils.makePermutations(allBadStarts, nBadQuals, false) ) { + tests.add(new Object[]{bases, kmerSize, badStarts}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "AddReadKmersToGraph") + public void testAddReadKmersToGraph(final String bases, final int kmerSize, final List badQualsSites) { + final int readLen = bases.length(); + final DeBruijnAssembler assembler = new DeBruijnAssembler(); + final MockBuilder builder = new MockBuilder(kmerSize); + + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + + final byte[] quals = Utils.dupBytes((byte)20, bases.length()); + for ( final int badSite : badQualsSites ) quals[badSite] = 0; + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, readLen); + read.setReadBases(bases.getBytes()); + read.setBaseQualities(quals); + + final Set expectedBases = new HashSet(); + final Set expectedStarts = new LinkedHashSet(); + for ( int i = 0; i < readLen; i++) { + boolean good = true; + for ( int j = 0; j < kmerSize + 1; j++ ) { // +1 is for pairing + good &= i + j < readLen && quals[i+j] >= assembler.getMinBaseQualityToUseInAssembly(); + } + if ( good ) { + expectedStarts.add(i); + expectedBases.add(bases.substring(i, i + kmerSize + 1)); + } + } + + assembler.addReadKmersToGraph(builder, Arrays.asList(read)); + Assert.assertEquals(builder.addedPairs.size(), expectedStarts.size()); + for ( final Kmer addedKmer : builder.addedPairs ) { + Assert.assertTrue(expectedBases.contains(new String(addedKmer.bases())), "Couldn't find kmer " + addedKmer + " among all expected kmers " + expectedBases); + } + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerCounterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java similarity index 98% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerCounterUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java index 9b3c8d481..c049121a3 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerCounterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java @@ -50,10 +50,10 @@ import org.broadinstitute.sting.BaseTest; import org.testng.Assert; import org.testng.annotations.Test; -public class KmerCounterUnitTest extends BaseTest { +public class KMerCounterCaseFixUnitTest extends BaseTest { @Test public void testMyData() { - final KmerCounter counter = new KmerCounter(3); + final KMerCounter counter = new KMerCounter(3); Assert.assertNotNull(counter.toString()); @@ -78,7 +78,7 @@ public class KmerCounterUnitTest extends BaseTest { Assert.assertNotNull(counter.toString()); } - private void testCounting(final KmerCounter counter, final String in, final int expectedCount) { + private void testCounting(final KMerCounter counter, final String in, final int expectedCount) { Assert.assertEquals(counter.getKmerCount(new Kmer(in)), expectedCount); } }