Fix FN problem stemming from sequence graphs that contain cycles.

Problem:
The sequence graphs can get very complex and it's not enough just to test that any given read has non-unique kmers.
Reads with variants can have kmers that match unique regions of the reference, and this causes cycles in the final
sequence graph.  Ultimately the problem is that kmers of 10/25 may not be large enough for these complex regions.

Solution:
We continue to try kmers of 10/25 but detect whether cycles exist; if so, we do not use them.  If (and only if) we
can't get usable graphs from the 10/25 kmers, then we start iterating over larger kmers until we either can generate
a graph without cycles or attempt too many iterations.
This commit is contained in:
Eric Banks 2013-05-23 12:02:19 -04:00
parent 210007cd09
commit e4e7d39e2c
4 changed files with 147 additions and 57 deletions

View File

@ -266,6 +266,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
@Argument(fullName="kmerSize", shortName="kmerSize", doc="Kmer size to use in the read threading assembler", required = false)
protected List<Integer> kmerSizes = Arrays.asList(10, 25);
@Advanced
@Argument(fullName="dontIncreaseKmerSizesForCycles", shortName="dontIncreaseKmerSizesForCycles", doc="Should we disable the iterating over kmer sizes when graph cycles are detected?", required = false)
protected boolean dontIncreaseKmerSizesForCycles = false;
/**
* Assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype
* considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the
@ -520,7 +524,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
final int maxAllowedPathsForReadThreadingAssembler = Math.max(maxPathsPerSample * nSamples, MIN_PATHS_PER_GRAPH);
assemblyEngine = useDebruijnAssembler
? new DeBruijnAssembler(minKmerForDebruijnAssembler, onlyUseKmerSizeForDebruijnAssembler)
: new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes);
: new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes, dontIncreaseKmerSizesForCycles);
assemblyEngine.setErrorCorrectKmers(errorCorrectKmers);
assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR);

View File

@ -49,6 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LocalAssemblyEngine;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -63,11 +64,14 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
private final static int DEFAULT_NUM_PATHS_PER_GRAPH = 128;
private final static int GGA_MODE_ARTIFICIAL_COUNTS = 1000;
private final static int KMER_SIZE_ITERATION_INCREASE = 10;
private final static int MAX_KMER_ITERATIONS_TO_ATTEMPT = 6;
/** The min and max kmer sizes to try when building the graph. */
private final List<Integer> kmerSizes;
private final int maxAllowedPathsForReadThreadingAssembler;
private final boolean dontIncreaseKmerSizesForCycles;
private boolean requireReasonableNumberOfPaths = false;
protected boolean removePathsNotConnectedToRef = true;
private boolean justReturnRawGraph = false;
@ -77,10 +81,15 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
this(DEFAULT_NUM_PATHS_PER_GRAPH, Arrays.asList(25));
}
public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List<Integer> kmerSizes) {
public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List<Integer> kmerSizes, final boolean dontIncreaseKmerSizesForCycles) {
super(maxAllowedPathsForReadThreadingAssembler);
this.kmerSizes = kmerSizes;
this.maxAllowedPathsForReadThreadingAssembler = maxAllowedPathsForReadThreadingAssembler;
this.dontIncreaseKmerSizesForCycles = dontIncreaseKmerSizesForCycles;
}
public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List<Integer> kmerSizes) {
this(maxAllowedPathsForReadThreadingAssembler, kmerSizes, false);
}
/** for testing purposes */
@ -89,67 +98,99 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
}
@Override
public List<SeqGraph> assemble( final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes ) {
public List<SeqGraph> assemble(final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes) {
final List<SeqGraph> graphs = new LinkedList<>();
// first, try using the requested kmer sizes
for ( final int kmerSize : kmerSizes ) {
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly);
final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes);
if ( graph != null )
graphs.add(graph);
}
// add the reference sequence to the graph
rtgraph.addSequence("ref", refHaplotype.getBases(), null, true);
// add the artificial GGA haplotypes to the graph
int hapCount = 0;
for( final Haplotype h : activeAlleleHaplotypes ) {
final int[] counts = new int[h.length()];
Arrays.fill(counts, GGA_MODE_ARTIFICIAL_COUNTS);
rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), counts, false);
}
// Next pull kmers out of every read and throw them on the graph
for( final GATKSAMRecord read : reads ) {
rtgraph.addRead(read);
}
// actually build the read threading graph
rtgraph.buildGraphIfNecessary();
printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot"));
// go through and prune all of the chains where all edges have <= pruneFactor. This must occur
// before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering
// tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1
rtgraph.pruneLowWeightChains(pruneFactor);
// look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if
// we can recover them by merging some N bases from the chain back into the reference uniquely, for
// N < kmerSize
if ( recoverDanglingTails ) rtgraph.recoverDanglingTails();
// remove all heading and trailing paths
if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef();
printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot"));
final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph();
// if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
if ( justReturnRawGraph ) return Collections.singletonList(initialSeqGraph);
if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot"));
initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction
final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph);
if ( seqGraph != null ) {
if ( ! requireReasonableNumberOfPaths || reasonableNumberOfPaths(seqGraph) ) {
graphs.add(seqGraph);
}
// if none of those worked, iterate over larger sizes if allowed to do so
if ( graphs.isEmpty() && !dontIncreaseKmerSizesForCycles ) {
int kmerSize = MathUtils.arrayMaxInt(kmerSizes) + KMER_SIZE_ITERATION_INCREASE;
int numIterations = 1;
while ( graphs.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) {
final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes);
if ( graph != null )
graphs.add(graph);
kmerSize += KMER_SIZE_ITERATION_INCREASE;
numIterations++;
}
}
return graphs;
}
/**
* Creates the sequence graph for the given kmerSize
*
* @param reads reads to use
* @param refHaplotype reference haplotype
* @param kmerSize kmer size
* @param activeAlleleHaplotypes the GGA haplotypes to inject into the graph
* @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths)
*/
protected SeqGraph createGraph(final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final int kmerSize, final List<Haplotype> activeAlleleHaplotypes) {
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly);
// add the reference sequence to the graph
rtgraph.addSequence("ref", refHaplotype.getBases(), null, true);
// add the artificial GGA haplotypes to the graph
int hapCount = 0;
for ( final Haplotype h : activeAlleleHaplotypes ) {
final int[] counts = new int[h.length()];
Arrays.fill(counts, GGA_MODE_ARTIFICIAL_COUNTS);
rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), counts, false);
}
// Next pull kmers out of every read and throw them on the graph
for( final GATKSAMRecord read : reads ) {
rtgraph.addRead(read);
}
// actually build the read threading graph
rtgraph.buildGraphIfNecessary();
// sanity check: make sure there are no cycles in the graph
if ( rtgraph.hasCycles() ) {
if ( debug ) logger.info("Not using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler because it contains a cycle");
return null;
}
printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot"));
// go through and prune all of the chains where all edges have <= pruneFactor. This must occur
// before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering
// tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1
rtgraph.pruneLowWeightChains(pruneFactor);
// look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if
// we can recover them by merging some N bases from the chain back into the reference uniquely, for
// N < kmerSize
if ( recoverDanglingTails ) rtgraph.recoverDanglingTails();
// remove all heading and trailing paths
if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef();
printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot"));
final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph();
// if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
if ( justReturnRawGraph ) return initialSeqGraph;
if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot"));
initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction
final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph);
return ( seqGraph != null && requireReasonableNumberOfPaths && !reasonableNumberOfPaths(seqGraph) ) ? null : seqGraph;
}
/**
* Did we find a reasonable number of paths in this graph?
* @param graph

View File

@ -54,6 +54,7 @@ import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.collections.PrimitivePair;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.jgrapht.EdgeFactory;
import org.jgrapht.alg.CycleDetector;
import java.io.File;
import java.util.*;
@ -297,7 +298,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
public void buildGraphIfNecessary() {
if ( alreadyBuilt ) return;
// determine the kmer size we'll uses, and capture the set of nonUniques for that kmer size
// determine the kmer size we'll use, and capture the set of nonUniques for that kmer size
final NonUniqueResult result = determineKmerSizeAndNonUniques(kmerSize, kmerSize);
nonUniqueKmers = result.nonUniques;
@ -321,6 +322,13 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
alreadyBuilt = true;
}
/**
* @return true if the graph has cycles, false otherwise
*/
public boolean hasCycles() {
return new CycleDetector<>(this).detectCycles();
}
public void recoverDanglingTails() {
if ( ! alreadyBuilt ) throw new IllegalStateException("recoverDanglingTails requires the graph be already built");
@ -409,7 +417,8 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
private Collection<Kmer> determineNonUniqueKmers(final SequenceForKmers seqForKmers, final int kmerSize) {
// count up occurrences of kmers within each read
final KMerCounter counter = new KMerCounter(kmerSize);
for ( int i = 0; i <= seqForKmers.stop - kmerSize; i++ ) {
final int stopPosition = seqForKmers.stop - kmerSize;
for ( int i = 0; i <= stopPosition; i++ ) {
final Kmer kmer = new Kmer(seqForKmers.sequence, i, kmerSize);
counter.addKmer(kmer, 1);
}
@ -578,7 +587,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
// none of our outgoing edges had our unique suffix base, so we check for an opportunity to merge back in
final Kmer kmer = new Kmer(sequence, kmerStart, kmerSize);
MultiDeBruijnVertex uniqueMergeVertex = getUniqueKmerVertex(kmer, false);
final MultiDeBruijnVertex uniqueMergeVertex = getUniqueKmerVertex(kmer, false);
if ( isRef && uniqueMergeVertex != null )
throw new IllegalStateException("Found a unique vertex to merge into the reference graph " + prevVertex + " -> " + uniqueMergeVertex);
@ -590,7 +599,8 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
}
/**
* Get the longest stretch of high quality bases in read and pass that sequence to the graph
* Add the given read to the sequence graph. Ultimately the read will get sent through addSequence(), but first
* this method ensures we only use high quality bases and accounts for reduced reads, etc.
*
* @param read a non-null read
*/

View File

@ -49,6 +49,11 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.Test;
@ -145,6 +150,36 @@ public class ReadThreadingGraphUnitTest extends BaseTest {
}
}
@Test(enabled = !DEBUG)
public void testCyclesInGraph() {
// b37 20:12655200-12655850
final String ref = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTATACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG";
// SNP at 20:12655528 creates a cycle for small kmers
final String alt = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTGTACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG";
final List<GATKSAMRecord> reads = new ArrayList<>();
for ( int index = 0; index < alt.length() - 100; index += 20 )
reads.add(ArtificialSAMUtils.createArtificialRead(Arrays.copyOfRange(alt.getBytes(), index, index + 100), Utils.dupBytes((byte) 30, 100), 100 + "M"));
// test that there are cycles detected for small kmer
final ReadThreadingGraph rtgraph25 = new ReadThreadingGraph(25);
rtgraph25.addSequence("ref", ref.getBytes(), null, true);
for ( final GATKSAMRecord read : reads )
rtgraph25.addRead(read);
rtgraph25.buildGraphIfNecessary();
Assert.assertTrue(rtgraph25.hasCycles());
// test that there are no cycles detected for large kmer
final ReadThreadingGraph rtgraph75 = new ReadThreadingGraph(75);
rtgraph75.addSequence("ref", ref.getBytes(), null, true);
for ( final GATKSAMRecord read : reads )
rtgraph75.addRead(read);
rtgraph75.buildGraphIfNecessary();
Assert.assertFalse(rtgraph75.hasCycles());
}
// TODO -- update to use determineKmerSizeAndNonUniques directly
// @DataProvider(name = "KmerSizeData")
// public Object[][] makeKmerSizeDataProvider() {