From e4e7d39e2c8e9cb6a21f5152f46e20d334f81df0 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 23 May 2013 12:02:19 -0400 Subject: [PATCH] Fix FN problem stemming from sequence graphs that contain cycles. Problem: The sequence graphs can get very complex and it's not enough just to test that any given read has non-unique kmers. Reads with variants can have kmers that match unique regions of the reference, and this causes cycles in the final sequence graph. Ultimately the problem is that kmers of 10/25 may not be large enough for these complex regions. Solution: We continue to try kmers of 10/25 but detect whether cycles exist; if so, we do not use them. If (and only if) we can't get usable graphs from the 10/25 kmers, then we start iterating over larger kmers until we either can generate a graph without cycles or attempt too many iterations. --- .../haplotypecaller/HaplotypeCaller.java | 6 +- .../readthreading/ReadThreadingAssembler.java | 145 +++++++++++------- .../readthreading/ReadThreadingGraph.java | 18 ++- .../ReadThreadingGraphUnitTest.java | 35 +++++ 4 files changed, 147 insertions(+), 57 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index e55413649..a41b68e2c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -266,6 +266,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="kmerSize", shortName="kmerSize", doc="Kmer size to use in the read threading assembler", required = false) protected List kmerSizes = Arrays.asList(10, 25); + @Advanced + @Argument(fullName="dontIncreaseKmerSizesForCycles", shortName="dontIncreaseKmerSizesForCycles", doc="Should we disable the iterating over kmer sizes when graph cycles are detected?", required = false) + protected boolean dontIncreaseKmerSizesForCycles = false; + /** * Assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype * considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the @@ -520,7 +524,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In final int maxAllowedPathsForReadThreadingAssembler = Math.max(maxPathsPerSample * nSamples, MIN_PATHS_PER_GRAPH); assemblyEngine = useDebruijnAssembler ? new DeBruijnAssembler(minKmerForDebruijnAssembler, onlyUseKmerSizeForDebruijnAssembler) - : new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes); + : new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes, dontIncreaseKmerSizesForCycles); assemblyEngine.setErrorCorrectKmers(errorCorrectKmers); assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java index 123b36640..0887929ab 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java @@ -49,6 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LocalAssemblyEngine; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -63,11 +64,14 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { private final static int DEFAULT_NUM_PATHS_PER_GRAPH = 128; private final static int GGA_MODE_ARTIFICIAL_COUNTS = 1000; + private final static int KMER_SIZE_ITERATION_INCREASE = 10; + private final static int MAX_KMER_ITERATIONS_TO_ATTEMPT = 6; /** The min and max kmer sizes to try when building the graph. */ private final List kmerSizes; private final int maxAllowedPathsForReadThreadingAssembler; + private final boolean dontIncreaseKmerSizesForCycles; private boolean requireReasonableNumberOfPaths = false; protected boolean removePathsNotConnectedToRef = true; private boolean justReturnRawGraph = false; @@ -77,10 +81,15 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { this(DEFAULT_NUM_PATHS_PER_GRAPH, Arrays.asList(25)); } - public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List kmerSizes) { + public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List kmerSizes, final boolean dontIncreaseKmerSizesForCycles) { super(maxAllowedPathsForReadThreadingAssembler); this.kmerSizes = kmerSizes; this.maxAllowedPathsForReadThreadingAssembler = maxAllowedPathsForReadThreadingAssembler; + this.dontIncreaseKmerSizesForCycles = dontIncreaseKmerSizesForCycles; + } + + public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List kmerSizes) { + this(maxAllowedPathsForReadThreadingAssembler, kmerSizes, false); } /** for testing purposes */ @@ -89,67 +98,99 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { } @Override - public List assemble( final List reads, final Haplotype refHaplotype, final List activeAlleleHaplotypes ) { + public List assemble(final List reads, final Haplotype refHaplotype, final List activeAlleleHaplotypes) { final List graphs = new LinkedList<>(); + // first, try using the requested kmer sizes for ( final int kmerSize : kmerSizes ) { - final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly); + final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes); + if ( graph != null ) + graphs.add(graph); + } - // add the reference sequence to the graph - rtgraph.addSequence("ref", refHaplotype.getBases(), null, true); - - // add the artificial GGA haplotypes to the graph - int hapCount = 0; - for( final Haplotype h : activeAlleleHaplotypes ) { - final int[] counts = new int[h.length()]; - Arrays.fill(counts, GGA_MODE_ARTIFICIAL_COUNTS); - rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), counts, false); - } - - // Next pull kmers out of every read and throw them on the graph - for( final GATKSAMRecord read : reads ) { - rtgraph.addRead(read); - } - - // actually build the read threading graph - rtgraph.buildGraphIfNecessary(); - printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot")); - - // go through and prune all of the chains where all edges have <= pruneFactor. This must occur - // before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering - // tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1 - rtgraph.pruneLowWeightChains(pruneFactor); - - // look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if - // we can recover them by merging some N bases from the chain back into the reference uniquely, for - // N < kmerSize - if ( recoverDanglingTails ) rtgraph.recoverDanglingTails(); - - // remove all heading and trailing paths - if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef(); - - printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot")); - - final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph(); - - // if the unit tests don't want us to cleanup the graph, just return the raw sequence graph - if ( justReturnRawGraph ) return Collections.singletonList(initialSeqGraph); - - if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler"); - printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot")); - initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction - - final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph); - if ( seqGraph != null ) { - if ( ! requireReasonableNumberOfPaths || reasonableNumberOfPaths(seqGraph) ) { - graphs.add(seqGraph); - } + // if none of those worked, iterate over larger sizes if allowed to do so + if ( graphs.isEmpty() && !dontIncreaseKmerSizesForCycles ) { + int kmerSize = MathUtils.arrayMaxInt(kmerSizes) + KMER_SIZE_ITERATION_INCREASE; + int numIterations = 1; + while ( graphs.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) { + final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes); + if ( graph != null ) + graphs.add(graph); + kmerSize += KMER_SIZE_ITERATION_INCREASE; + numIterations++; } } return graphs; } + /** + * Creates the sequence graph for the given kmerSize + * + * @param reads reads to use + * @param refHaplotype reference haplotype + * @param kmerSize kmer size + * @param activeAlleleHaplotypes the GGA haplotypes to inject into the graph + * @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths) + */ + protected SeqGraph createGraph(final List reads, final Haplotype refHaplotype, final int kmerSize, final List activeAlleleHaplotypes) { + final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly); + + // add the reference sequence to the graph + rtgraph.addSequence("ref", refHaplotype.getBases(), null, true); + + // add the artificial GGA haplotypes to the graph + int hapCount = 0; + for ( final Haplotype h : activeAlleleHaplotypes ) { + final int[] counts = new int[h.length()]; + Arrays.fill(counts, GGA_MODE_ARTIFICIAL_COUNTS); + rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), counts, false); + } + + // Next pull kmers out of every read and throw them on the graph + for( final GATKSAMRecord read : reads ) { + rtgraph.addRead(read); + } + + // actually build the read threading graph + rtgraph.buildGraphIfNecessary(); + + // sanity check: make sure there are no cycles in the graph + if ( rtgraph.hasCycles() ) { + if ( debug ) logger.info("Not using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler because it contains a cycle"); + return null; + } + + printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot")); + + // go through and prune all of the chains where all edges have <= pruneFactor. This must occur + // before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering + // tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1 + rtgraph.pruneLowWeightChains(pruneFactor); + + // look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if + // we can recover them by merging some N bases from the chain back into the reference uniquely, for + // N < kmerSize + if ( recoverDanglingTails ) rtgraph.recoverDanglingTails(); + + // remove all heading and trailing paths + if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef(); + + printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot")); + + final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph(); + + // if the unit tests don't want us to cleanup the graph, just return the raw sequence graph + if ( justReturnRawGraph ) return initialSeqGraph; + + if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler"); + printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot")); + initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction + + final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph); + return ( seqGraph != null && requireReasonableNumberOfPaths && !reasonableNumberOfPaths(seqGraph) ) ? null : seqGraph; + } + /** * Did we find a reasonable number of paths in this graph? * @param graph diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java index 8e879377f..bbc1618ac 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java @@ -54,6 +54,7 @@ import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.collections.PrimitivePair; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.jgrapht.EdgeFactory; +import org.jgrapht.alg.CycleDetector; import java.io.File; import java.util.*; @@ -297,7 +298,7 @@ public class ReadThreadingGraph extends BaseGraph(this).detectCycles(); + } + public void recoverDanglingTails() { if ( ! alreadyBuilt ) throw new IllegalStateException("recoverDanglingTails requires the graph be already built"); @@ -409,7 +417,8 @@ public class ReadThreadingGraph extends BaseGraph determineNonUniqueKmers(final SequenceForKmers seqForKmers, final int kmerSize) { // count up occurrences of kmers within each read final KMerCounter counter = new KMerCounter(kmerSize); - for ( int i = 0; i <= seqForKmers.stop - kmerSize; i++ ) { + final int stopPosition = seqForKmers.stop - kmerSize; + for ( int i = 0; i <= stopPosition; i++ ) { final Kmer kmer = new Kmer(seqForKmers.sequence, i, kmerSize); counter.addKmer(kmer, 1); } @@ -578,7 +587,7 @@ public class ReadThreadingGraph extends BaseGraph " + uniqueMergeVertex); @@ -590,7 +599,8 @@ public class ReadThreadingGraph extends BaseGraph reads = new ArrayList<>(); + for ( int index = 0; index < alt.length() - 100; index += 20 ) + reads.add(ArtificialSAMUtils.createArtificialRead(Arrays.copyOfRange(alt.getBytes(), index, index + 100), Utils.dupBytes((byte) 30, 100), 100 + "M")); + + // test that there are cycles detected for small kmer + final ReadThreadingGraph rtgraph25 = new ReadThreadingGraph(25); + rtgraph25.addSequence("ref", ref.getBytes(), null, true); + for ( final GATKSAMRecord read : reads ) + rtgraph25.addRead(read); + rtgraph25.buildGraphIfNecessary(); + Assert.assertTrue(rtgraph25.hasCycles()); + + // test that there are no cycles detected for large kmer + final ReadThreadingGraph rtgraph75 = new ReadThreadingGraph(75); + rtgraph75.addSequence("ref", ref.getBytes(), null, true); + for ( final GATKSAMRecord read : reads ) + rtgraph75.addRead(read); + rtgraph75.buildGraphIfNecessary(); + Assert.assertFalse(rtgraph75.hasCycles()); + } + // TODO -- update to use determineKmerSizeAndNonUniques directly // @DataProvider(name = "KmerSizeData") // public Object[][] makeKmerSizeDataProvider() {