diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 6343d79ef..9c6c255df 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -154,7 +154,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { continue; if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads"); - DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug); + DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype); if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object // do a series of steps to clean up the raw assembly graph to make it analysis-ready if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor); @@ -189,10 +189,12 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { seqGraph.simplifyGraph(); if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.dot"), pruneFactor); - // if we've assembled just to the reference, just leave now otherwise removePathsNotConnectedToRef - // might blow up because there's no reference source node - if ( seqGraph.vertexSet().size() == 1 ) - return seqGraph; + // The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can + // happen in cases where for example the reference somehow manages to acquire a cycle, or + // where the entire assembly collapses back into the reference sequence. + if ( seqGraph.getReferenceSourceVertex() == null || seqGraph.getReferenceSinkVertex() == null ) + return null; + seqGraph.removePathsNotConnectedToRef(); if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.refcleaned.dot"), pruneFactor); @@ -214,64 +216,102 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } } - @Requires({"reads != null", "KMER_LENGTH > 0", "refHaplotype != null"}) - protected DeBruijnGraph createGraphFromSequences( final List reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) { - final DeBruijnGraph graph = new DeBruijnGraph(KMER_LENGTH); + @Requires({"reads != null", "kmerLength > 0", "refHaplotype != null"}) + protected DeBruijnGraph createGraphFromSequences( final List reads, final int kmerLength, final Haplotype refHaplotype ) { + final DeBruijnGraph graph = new DeBruijnGraph(kmerLength); // First pull kmers from the reference haplotype and add them to the graph - final byte[] refSequence = refHaplotype.getBases(); - if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) { - final int kmersInSequence = refSequence.length - KMER_LENGTH + 1; - for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { - if( !graph.addKmersToGraph(Arrays.copyOfRange(refSequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(refSequence, iii + 1, iii + 1 + KMER_LENGTH), true, 1) ) { - if( DEBUG ) { - System.out.println("Cycle detected in reference graph for kmer = " + KMER_LENGTH + " ...skipping"); - } - return null; - } - } - } else { - // not enough reference sequence to build a kmer graph of this length, return null + if ( ! addReferenceKmersToGraph(graph, refHaplotype.getBases()) ) + // something went wrong, so abort right now with a null graph return null; - } + + // now go through the graph already seeded with the reference sequence and add the read kmers to it + if ( ! addReadKmersToGraph(graph, reads) ) + // some problem was detected adding the reads to the graph, return null to indicate we failed + return null; + + graph.cleanNonRefPaths(); + return graph; + } + + /** + * Add the high-quality kmers from the reads to the graph + * + * @param graph a graph to add the read kmers to + * @param reads a non-null list of reads whose kmers we want to add to the graph + * @return true if we successfully added the read kmers to the graph without corrupting it in some way + */ + protected boolean addReadKmersToGraph(final DeBruijnGraph graph, final List reads) { + final int kmerLength = graph.getKmerSize(); // Next pull kmers out of every read and throw them on the graph for( final GATKSAMRecord read : reads ) { final byte[] sequence = read.getReadBases(); final byte[] qualities = read.getBaseQualities(); final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced - if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) { - final int kmersInSequence = sequence.length - KMER_LENGTH + 1; + if( sequence.length > kmerLength + KMER_OVERLAP ) { + final int kmersInSequence = sequence.length - kmerLength + 1; for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { // if the qualities of all the bases in the kmers are high enough boolean badKmer = false; - for( int jjj = iii; jjj < iii + KMER_LENGTH + 1; jjj++) { + for( int jjj = iii; jjj < iii + kmerLength + 1; jjj++) { if( qualities[jjj] < minBaseQualityToUseInAssembly ) { badKmer = true; break; } } if( !badKmer ) { + // how many observations of this kmer have we seen? A normal read counts for 1, but + // a reduced read might imply a higher multiplicity for our the edge int countNumber = 1; if( read.isReducedRead() ) { // compute mean number of reduced read counts in current kmer span // precise rounding can make a difference with low consensus counts - countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, iii, iii + KMER_LENGTH)); + // TODO -- optimization: should extend arrayMax function to take start stop values + countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, iii, iii + kmerLength)); } - final byte[] kmer1 = Arrays.copyOfRange(sequence, iii, iii + KMER_LENGTH); - final byte[] kmer2 = Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH); - - for( int kkk=0; kkk < countNumber; kkk++ ) { - graph.addKmersToGraph(kmer1, kmer2, false, 1); - } + graph.addKmerPairFromSeqToGraph(sequence, iii, false, countNumber); } } } } - graph.cleanNonRefPaths(); - return graph; + // always returns true now, but it's possible that we'd add reads and decide we don't like the graph in some way + return true; + } + + /** + * Add the kmers from the reference sequence to the DeBruijnGraph + * + * @param graph the graph to add the reference kmers to. Must be empty + * @param refSequence the reference sequence from which we'll get our kmers + * @return true if we succeeded in creating a good graph from the reference sequence, false otherwise + */ + protected boolean addReferenceKmersToGraph(final DeBruijnGraph graph, final byte[] refSequence) { + if ( graph == null ) throw new IllegalArgumentException("graph cannot be null"); + if ( graph.vertexSet().size() != 0 ) throw new IllegalArgumentException("Reference sequences must be added before any other vertices, but got a graph with " + graph.vertexSet().size() + " vertices in it already: " + graph); + if ( refSequence == null ) throw new IllegalArgumentException("refSequence cannot be null"); + + + final int kmerLength = graph.getKmerSize(); + if( refSequence.length < kmerLength + KMER_OVERLAP ) { + // not enough reference sequence to build a kmer graph of this length, return null + return false; + } + + final int kmersInSequence = refSequence.length - kmerLength + 1; + for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { + graph.addKmerPairFromSeqToGraph(refSequence, iii, true, 1); + } + + // we expect that every kmer in the sequence is unique, so that the graph has exactly kmersInSequence vertices + if ( graph.vertexSet().size() != kmersInSequence ) { + if( debug ) logger.info("Cycle detected in reference graph for kmer = " + kmerLength + " ...skipping"); + return false; + } + + return true; } protected void printGraphs(final List graphs) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java index 0e20c311b..fd8581254 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java @@ -126,30 +126,37 @@ public class DeBruijnGraph extends BaseGraph { * @param kmer1 the source kmer for the edge * @param kmer2 the target kmer for the edge * @param isRef true if the added edge is a reference edge - * @return will return false if trying to add a reference edge which creates a cycle in the assembly graph */ - public boolean addKmersToGraph( final byte[] kmer1, final byte[] kmer2, final boolean isRef, final int multiplicity ) { + public void addKmersToGraph( final byte[] kmer1, final byte[] kmer2, final boolean isRef, final int multiplicity ) { if( kmer1 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); } if( kmer2 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); } if( kmer1.length != kmer2.length ) { throw new IllegalArgumentException("Attempting to add a kmers to the graph with different lengths."); } - final int numVertexBefore = vertexSet().size(); final DeBruijnVertex v1 = new DeBruijnVertex( kmer1 ); - addVertex(v1); final DeBruijnVertex v2 = new DeBruijnVertex( kmer2 ); - addVertex(v2); - if( isRef && vertexSet().size() == numVertexBefore ) { return false; } + final BaseEdge toAdd = new BaseEdge(isRef, multiplicity); - final BaseEdge targetEdge = getEdge(v1, v2); - if ( targetEdge == null ) { - addEdge(v1, v2, new BaseEdge( isRef, multiplicity )); - } else { - if( isRef ) { - targetEdge.setIsRef( true ); - } - targetEdge.setMultiplicity(targetEdge.getMultiplicity() + multiplicity); - } - return true; + addVertices(v1, v2); + addOrUpdateEdge(v1, v2, toAdd); + } + + /** + * Higher-level interface to #addKmersToGraph that adds a pair of kmers from a larger sequence of bytes to this + * graph. The kmers start at start (first) and start + 1 (second) have have length getKmerSize(). The + * edge between them is added with isRef and multiplicity + * + * @param sequence a sequence of bases from which we want to extract a pair of kmers + * @param start the start of the first kmer in sequence, must be between 0 and sequence.length - 2 - getKmerSize() + * @param isRef should the edge between the two kmers be a reference edge? + * @param multiplicity what's the multiplicity of the edge between these two kmers + */ + public void addKmerPairFromSeqToGraph( final byte[] sequence, final int start, final boolean isRef, final int multiplicity ) { + if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null"); + if ( start < 0 ) throw new IllegalArgumentException("start must be >= 0 but got " + start); + if ( start + 1 + getKmerSize() > sequence.length ) throw new IllegalArgumentException("start " + start + " is too big given kmerSize " + getKmerSize() + " and sequence length " + sequence.length); + final byte[] kmer1 = Arrays.copyOfRange(sequence, start, start + getKmerSize()); + final byte[] kmer2 = Arrays.copyOfRange(sequence, start + 1, start + 1 + getKmerSize()); + addKmersToGraph(kmer1, kmer2, isRef, multiplicity); } /** diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java index 663d619a8..cce623b76 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java @@ -72,8 +72,8 @@ public class DeBruijnAssemblerUnitTest extends BaseTest { public void testReferenceCycleGraph() { String refCycle = "ATCGAGGAGAGCGCCCCGAGATATATATATATATATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATATATATATATGGGAGAGGGGATATATATATATCCCCCC"; String noCycle = "ATCGAGGAGAGCGCCCCGAGATATTATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATGGGAGAGGGGATATATAATATCCCCCC"; - final DeBruijnGraph g1 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList(), 10, new Haplotype(refCycle.getBytes(), true), false); - final DeBruijnGraph g2 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList(), 10, new Haplotype(noCycle.getBytes(), true), false); + final DeBruijnGraph g1 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList(), 10, new Haplotype(refCycle.getBytes(), true)); + final DeBruijnGraph g2 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList(), 10, new Haplotype(noCycle.getBytes(), true)); Assert.assertTrue(g1 == null, "Reference cycle graph should return null during creation."); Assert.assertTrue(g2 != null, "Reference non-cycle graph should not return null during creation.");