From e4e7d39e2c8e9cb6a21f5152f46e20d334f81df0 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 23 May 2013 12:02:19 -0400 Subject: [PATCH 1/6] Fix FN problem stemming from sequence graphs that contain cycles. Problem: The sequence graphs can get very complex and it's not enough just to test that any given read has non-unique kmers. Reads with variants can have kmers that match unique regions of the reference, and this causes cycles in the final sequence graph. Ultimately the problem is that kmers of 10/25 may not be large enough for these complex regions. Solution: We continue to try kmers of 10/25 but detect whether cycles exist; if so, we do not use them. If (and only if) we can't get usable graphs from the 10/25 kmers, then we start iterating over larger kmers until we either can generate a graph without cycles or attempt too many iterations. --- .../haplotypecaller/HaplotypeCaller.java | 6 +- .../readthreading/ReadThreadingAssembler.java | 145 +++++++++++------- .../readthreading/ReadThreadingGraph.java | 18 ++- .../ReadThreadingGraphUnitTest.java | 35 +++++ 4 files changed, 147 insertions(+), 57 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index e55413649..a41b68e2c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -266,6 +266,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="kmerSize", shortName="kmerSize", doc="Kmer size to use in the read threading assembler", required = false) protected List kmerSizes = Arrays.asList(10, 25); + @Advanced + @Argument(fullName="dontIncreaseKmerSizesForCycles", shortName="dontIncreaseKmerSizesForCycles", doc="Should we disable the iterating over kmer sizes when graph cycles are detected?", required = false) + protected boolean dontIncreaseKmerSizesForCycles = false; + /** * Assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype * considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the @@ -520,7 +524,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In final int maxAllowedPathsForReadThreadingAssembler = Math.max(maxPathsPerSample * nSamples, MIN_PATHS_PER_GRAPH); assemblyEngine = useDebruijnAssembler ? new DeBruijnAssembler(minKmerForDebruijnAssembler, onlyUseKmerSizeForDebruijnAssembler) - : new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes); + : new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes, dontIncreaseKmerSizesForCycles); assemblyEngine.setErrorCorrectKmers(errorCorrectKmers); assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java index 123b36640..0887929ab 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java @@ -49,6 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LocalAssemblyEngine; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -63,11 +64,14 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { private final static int DEFAULT_NUM_PATHS_PER_GRAPH = 128; private final static int GGA_MODE_ARTIFICIAL_COUNTS = 1000; + private final static int KMER_SIZE_ITERATION_INCREASE = 10; + private final static int MAX_KMER_ITERATIONS_TO_ATTEMPT = 6; /** The min and max kmer sizes to try when building the graph. */ private final List kmerSizes; private final int maxAllowedPathsForReadThreadingAssembler; + private final boolean dontIncreaseKmerSizesForCycles; private boolean requireReasonableNumberOfPaths = false; protected boolean removePathsNotConnectedToRef = true; private boolean justReturnRawGraph = false; @@ -77,10 +81,15 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { this(DEFAULT_NUM_PATHS_PER_GRAPH, Arrays.asList(25)); } - public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List kmerSizes) { + public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List kmerSizes, final boolean dontIncreaseKmerSizesForCycles) { super(maxAllowedPathsForReadThreadingAssembler); this.kmerSizes = kmerSizes; this.maxAllowedPathsForReadThreadingAssembler = maxAllowedPathsForReadThreadingAssembler; + this.dontIncreaseKmerSizesForCycles = dontIncreaseKmerSizesForCycles; + } + + public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List kmerSizes) { + this(maxAllowedPathsForReadThreadingAssembler, kmerSizes, false); } /** for testing purposes */ @@ -89,67 +98,99 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { } @Override - public List assemble( final List reads, final Haplotype refHaplotype, final List activeAlleleHaplotypes ) { + public List assemble(final List reads, final Haplotype refHaplotype, final List activeAlleleHaplotypes) { final List graphs = new LinkedList<>(); + // first, try using the requested kmer sizes for ( final int kmerSize : kmerSizes ) { - final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly); + final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes); + if ( graph != null ) + graphs.add(graph); + } - // add the reference sequence to the graph - rtgraph.addSequence("ref", refHaplotype.getBases(), null, true); - - // add the artificial GGA haplotypes to the graph - int hapCount = 0; - for( final Haplotype h : activeAlleleHaplotypes ) { - final int[] counts = new int[h.length()]; - Arrays.fill(counts, GGA_MODE_ARTIFICIAL_COUNTS); - rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), counts, false); - } - - // Next pull kmers out of every read and throw them on the graph - for( final GATKSAMRecord read : reads ) { - rtgraph.addRead(read); - } - - // actually build the read threading graph - rtgraph.buildGraphIfNecessary(); - printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot")); - - // go through and prune all of the chains where all edges have <= pruneFactor. This must occur - // before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering - // tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1 - rtgraph.pruneLowWeightChains(pruneFactor); - - // look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if - // we can recover them by merging some N bases from the chain back into the reference uniquely, for - // N < kmerSize - if ( recoverDanglingTails ) rtgraph.recoverDanglingTails(); - - // remove all heading and trailing paths - if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef(); - - printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot")); - - final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph(); - - // if the unit tests don't want us to cleanup the graph, just return the raw sequence graph - if ( justReturnRawGraph ) return Collections.singletonList(initialSeqGraph); - - if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler"); - printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot")); - initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction - - final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph); - if ( seqGraph != null ) { - if ( ! requireReasonableNumberOfPaths || reasonableNumberOfPaths(seqGraph) ) { - graphs.add(seqGraph); - } + // if none of those worked, iterate over larger sizes if allowed to do so + if ( graphs.isEmpty() && !dontIncreaseKmerSizesForCycles ) { + int kmerSize = MathUtils.arrayMaxInt(kmerSizes) + KMER_SIZE_ITERATION_INCREASE; + int numIterations = 1; + while ( graphs.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) { + final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes); + if ( graph != null ) + graphs.add(graph); + kmerSize += KMER_SIZE_ITERATION_INCREASE; + numIterations++; } } return graphs; } + /** + * Creates the sequence graph for the given kmerSize + * + * @param reads reads to use + * @param refHaplotype reference haplotype + * @param kmerSize kmer size + * @param activeAlleleHaplotypes the GGA haplotypes to inject into the graph + * @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths) + */ + protected SeqGraph createGraph(final List reads, final Haplotype refHaplotype, final int kmerSize, final List activeAlleleHaplotypes) { + final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly); + + // add the reference sequence to the graph + rtgraph.addSequence("ref", refHaplotype.getBases(), null, true); + + // add the artificial GGA haplotypes to the graph + int hapCount = 0; + for ( final Haplotype h : activeAlleleHaplotypes ) { + final int[] counts = new int[h.length()]; + Arrays.fill(counts, GGA_MODE_ARTIFICIAL_COUNTS); + rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), counts, false); + } + + // Next pull kmers out of every read and throw them on the graph + for( final GATKSAMRecord read : reads ) { + rtgraph.addRead(read); + } + + // actually build the read threading graph + rtgraph.buildGraphIfNecessary(); + + // sanity check: make sure there are no cycles in the graph + if ( rtgraph.hasCycles() ) { + if ( debug ) logger.info("Not using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler because it contains a cycle"); + return null; + } + + printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot")); + + // go through and prune all of the chains where all edges have <= pruneFactor. This must occur + // before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering + // tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1 + rtgraph.pruneLowWeightChains(pruneFactor); + + // look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if + // we can recover them by merging some N bases from the chain back into the reference uniquely, for + // N < kmerSize + if ( recoverDanglingTails ) rtgraph.recoverDanglingTails(); + + // remove all heading and trailing paths + if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef(); + + printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot")); + + final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph(); + + // if the unit tests don't want us to cleanup the graph, just return the raw sequence graph + if ( justReturnRawGraph ) return initialSeqGraph; + + if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler"); + printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot")); + initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction + + final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph); + return ( seqGraph != null && requireReasonableNumberOfPaths && !reasonableNumberOfPaths(seqGraph) ) ? null : seqGraph; + } + /** * Did we find a reasonable number of paths in this graph? * @param graph diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java index 8e879377f..bbc1618ac 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java @@ -54,6 +54,7 @@ import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.collections.PrimitivePair; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.jgrapht.EdgeFactory; +import org.jgrapht.alg.CycleDetector; import java.io.File; import java.util.*; @@ -297,7 +298,7 @@ public class ReadThreadingGraph extends BaseGraph(this).detectCycles(); + } + public void recoverDanglingTails() { if ( ! alreadyBuilt ) throw new IllegalStateException("recoverDanglingTails requires the graph be already built"); @@ -409,7 +417,8 @@ public class ReadThreadingGraph extends BaseGraph determineNonUniqueKmers(final SequenceForKmers seqForKmers, final int kmerSize) { // count up occurrences of kmers within each read final KMerCounter counter = new KMerCounter(kmerSize); - for ( int i = 0; i <= seqForKmers.stop - kmerSize; i++ ) { + final int stopPosition = seqForKmers.stop - kmerSize; + for ( int i = 0; i <= stopPosition; i++ ) { final Kmer kmer = new Kmer(seqForKmers.sequence, i, kmerSize); counter.addKmer(kmer, 1); } @@ -578,7 +587,7 @@ public class ReadThreadingGraph extends BaseGraph " + uniqueMergeVertex); @@ -590,7 +599,8 @@ public class ReadThreadingGraph extends BaseGraph reads = new ArrayList<>(); + for ( int index = 0; index < alt.length() - 100; index += 20 ) + reads.add(ArtificialSAMUtils.createArtificialRead(Arrays.copyOfRange(alt.getBytes(), index, index + 100), Utils.dupBytes((byte) 30, 100), 100 + "M")); + + // test that there are cycles detected for small kmer + final ReadThreadingGraph rtgraph25 = new ReadThreadingGraph(25); + rtgraph25.addSequence("ref", ref.getBytes(), null, true); + for ( final GATKSAMRecord read : reads ) + rtgraph25.addRead(read); + rtgraph25.buildGraphIfNecessary(); + Assert.assertTrue(rtgraph25.hasCycles()); + + // test that there are no cycles detected for large kmer + final ReadThreadingGraph rtgraph75 = new ReadThreadingGraph(75); + rtgraph75.addSequence("ref", ref.getBytes(), null, true); + for ( final GATKSAMRecord read : reads ) + rtgraph75.addRead(read); + rtgraph75.buildGraphIfNecessary(); + Assert.assertFalse(rtgraph75.hasCycles()); + } + // TODO -- update to use determineKmerSizeAndNonUniques directly // @DataProvider(name = "KmerSizeData") // public Object[][] makeKmerSizeDataProvider() { From 77868d034f4e006b8e33d2e9bf39447b88790ba7 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 30 May 2013 14:00:43 -0400 Subject: [PATCH 2/6] Do not allow the use of Ns in reads for graph construction. Ns are treated as wildcards in the PairHMM so creating haplotypes with Ns gives them artificial advantages over other ones. This was the cause of at least one FN where there were Ns at a SNP position. --- .../readthreading/ReadThreadingGraph.java | 15 +++++++++- .../LocalAssemblyEngineUnitTest.java | 2 +- .../ReadThreadingGraphUnitTest.java | 30 ++++++++++++++++--- 3 files changed, 41 insertions(+), 6 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java index bbc1618ac..ab6b17c35 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java @@ -50,6 +50,7 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerCounter; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.collections.PrimitivePair; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -611,7 +612,7 @@ public class ReadThreadingGraph extends BaseGraph= minBaseQualityToUseInAssembly; + } + /** * Get the set of non-unique kmers in this graph. For debugging purposes * @return a non-null set of kmers diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java index 74361de1b..a74ce1c75 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java @@ -251,7 +251,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest { for ( int snpPos = 0; snpPos < windowSize; snpPos++) { if ( snpPos > excludeVariantsWithXbp && (windowSize - snpPos) >= excludeVariantsWithXbp ) { final byte[] altBases = ref.getBytes(); - altBases[snpPos] = 'N'; + altBases[snpPos] = altBases[snpPos] == 'A' ? (byte)'C' : (byte)'A'; final String alt = new String(altBases); tests.add(new Object[]{"SNP at " + snpPos, assembler, refLoc, ref, alt}); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java index 340777513..67ee52734 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java @@ -48,10 +48,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; @@ -180,7 +178,31 @@ public class ReadThreadingGraphUnitTest extends BaseTest { Assert.assertFalse(rtgraph75.hasCycles()); } - // TODO -- update to use determineKmerSizeAndNonUniques directly + @Test(enabled = !DEBUG) + public void testNsInReadsAreNotUsedForGraph() { + + final int length = 100; + final byte[] ref = Utils.dupBytes((byte)'A', length); + + final ReadThreadingGraph rtgraph = new ReadThreadingGraph(25); + rtgraph.addSequence("ref", ref, null, true); + + // add reads with Ns at any position + for ( int i = 0; i < length; i++ ) { + final byte[] bases = ref.clone(); + bases[i] = 'N'; + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, Utils.dupBytes((byte) 30, length), length + "M"); + rtgraph.addRead(read); + } + rtgraph.buildGraphIfNecessary(); + + final SeqGraph graph = rtgraph.convertToSequenceGraph(); + final KBestPaths pathFinder = new KBestPaths<>(false); + Assert.assertEquals(pathFinder.getKBestPaths(graph, length, graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex()).size(), 1); + } + + +// TODO -- update to use determineKmerSizeAndNonUniques directly // @DataProvider(name = "KmerSizeData") // public Object[][] makeKmerSizeDataProvider() { // List tests = new ArrayList(); From c0e3874db095836e389d4ff7c277fb778a54c7b8 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 3 Jun 2013 14:34:29 -0400 Subject: [PATCH 3/6] Change the HC's phredScaledGlobalReadMismappingRate from 60 to 45, because Ryan and Mark told me to. --- .../sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index a41b68e2c..24fd5901f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -336,7 +336,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In */ @Advanced @Argument(fullName="phredScaledGlobalReadMismappingRate", shortName="globalMAPQ", doc="The global assumed mismapping rate for reads", required = false) - protected int phredScaledGlobalReadMismappingRate = 60; + protected int phredScaledGlobalReadMismappingRate = 45; @Advanced @Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false) From c0030f3f2dd36d29d70b1fa06daf2e399e99169a Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 4 Jun 2013 09:26:50 -0400 Subject: [PATCH 4/6] We no longer subset down to the best N haplotypes for the GL calculation. I explain in comments within the code that this was causing problems with the marginalization over events. --- .../haplotypecaller/HaplotypeCaller.java | 26 +++++-------------- .../haplotypecaller/LocalAssemblyEngine.java | 2 +- 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 24fd5901f..3e411ae33 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -694,11 +694,14 @@ public class HaplotypeCaller extends ActiveRegionWalker, In //logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads"); final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( assemblyResult.haplotypes, splitReadsBySample( assemblyResult.regionForGenotyping.getReads() ) ); - // subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes ) - final List bestHaplotypes = selectBestHaplotypesForGenotyping(assemblyResult.haplotypes, stratifiedReadMap); + // Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there + // was a bad interaction between that selection and the marginalization that happens over each event when computing + // GLs. In particular, for samples that are heterozygous non-reference (B/C) the marginalization for B treats the + // haplotype containing C as reference (and vice versa). Now this is fine if all possible haplotypes are included + // in the genotyping, but we lose information if we select down to a few haplotypes. [EB] final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine, - bestHaplotypes, + assemblyResult.haplotypes, stratifiedReadMap, perSampleFilteredReadList, assemblyResult.fullReferenceWithPadding, @@ -711,7 +714,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // TODO -- must disable if we are doing NCT, or set the output type of ! presorted if ( bamWriter != null ) { haplotypeBAMWriter.writeReadsAlignedToHaplotypes(assemblyResult.haplotypes, assemblyResult.paddedReferenceLoc, - bestHaplotypes, + assemblyResult.haplotypes, calledHaplotypes.getCalledHaplotypes(), stratifiedReadMap); } @@ -863,21 +866,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In return new AssemblyResult(trimmedHaplotypes, trimmedActiveRegion, fullReferenceWithPadding, paddedReferenceLoc, true); } - /** - * Select the best N haplotypes according to their likelihoods, if appropriate - * - * @param haplotypes a list of haplotypes to consider - * @param stratifiedReadMap a map from samples -> read likelihoods - * @return the list of haplotypes to genotype - */ - protected List selectBestHaplotypesForGenotyping(final List haplotypes, final Map stratifiedReadMap) { - if ( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - return haplotypes; - } else { - return likelihoodCalculationEngine.selectBestHaplotypesFromEachSample(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation); - } - } - //--------------------------------------------------------------------------------------------------------------- // // reduce diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java index 1a5f34bc3..2a74e9dd0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java @@ -215,7 +215,7 @@ public abstract class LocalAssemblyEngine { returnHaplotypes.add(h); if ( debug ) - logger.info("Adding haplotype " + h.getCigar() + " from debruijn graph with kmer " + graph.getKmerSize()); + logger.info("Adding haplotype " + h.getCigar() + " from graph with kmer " + graph.getKmerSize()); } } } From dadcfe296dff64dae226aad1da57da2c512c3870 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 5 Jun 2013 14:26:23 -0400 Subject: [PATCH 5/6] Reworking of the dangling tails merging code. We now run Smith-Waterman on the dangling tail against the corresponding reference tail. If we can generate a reasonable, low entropy alignment then we trigger the merge to the reference path; otherwise we abort. Also, we put in a check for low-complexity of graphs and don't let those pass through. Added tests for this implementation that checks exact SW results and correct edges added. --- .../haplotypecaller/graphs/BaseGraph.java | 18 ++ .../haplotypecaller/graphs/GraphUtils.java | 10 +- .../walkers/haplotypecaller/graphs/Path.java | 3 +- .../readthreading/ReadThreadingAssembler.java | 28 ++- .../readthreading/ReadThreadingGraph.java | 207 ++++++++++++++---- .../graphs/BaseGraphUnitTest.java | 15 ++ .../ReadThreadingAssemblerUnitTest.java | 3 +- .../ReadThreadingGraphUnitTest.java | 76 +++++++ .../sting/utils/sam/AlignmentUtils.java | 17 ++ .../smithwaterman/SWPairwiseAlignment.java | 15 ++ .../utils/sam/AlignmentUtilsUnitTest.java | 7 + 11 files changed, 339 insertions(+), 60 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java index c963fb6e5..70ef539f3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java @@ -676,6 +676,24 @@ public class BaseGraph extends Default '}'; } + /** + * The base sequence for the given path. + * Note, this assumes that the path does not start with a source node. + * + * @param path the list of vertexes that make up the path + * @return non-null sequence of bases corresponding to the given path + */ + @Ensures({"result != null"}) + public byte[] getBasesForPath(final List path) { + if ( path == null ) throw new IllegalArgumentException("Path cannot be null"); + + final StringBuffer sb = new StringBuffer(); + for ( final DeBruijnVertex v : path ) + sb.append((char)v.getSuffix()); + + return sb.toString().getBytes(); + } + /** * Get the set of vertices within distance edges of source, regardless of edge direction * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java index 4aa6047a9..73a1daa3e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java @@ -171,7 +171,15 @@ final public class GraphUtils { return foundDup ? null : new PrimitivePair.Int(longestPos, length); } - private static int longestSuffixMatch(final byte[] seq, final byte[] kmer, final int seqStart) { + /** + * calculates the longest suffix match between a sequence and a smaller kmer + * + * @param seq the (reference) sequence + * @param kmer the smaller kmer sequence + * @param seqStart the index (inclusive) on seq to start looking backwards from + * @return the longest matching suffix + */ + public static int longestSuffixMatch(final byte[] seq, final byte[] kmer, final int seqStart) { for ( int len = 1; len <= kmer.length; len++ ) { final int seqI = seqStart - len + 1; final int kmerI = kmer.length - len; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java index a07b98bb6..2e84e1d22 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java @@ -47,7 +47,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; @@ -92,7 +91,7 @@ public class Path { /** * Create a new Path containing no edges and starting at initialVertex * @param initialVertex the starting vertex of the path - * @param graph the graph this path with follow through + * @param graph the graph this path will follow through */ public Path(final T initialVertex, final BaseGraph graph) { if ( initialVertex == null ) throw new IllegalArgumentException("initialVertex cannot be null"); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java index 0887929ab..f4290f2bb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java @@ -55,7 +55,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.File; import java.util.Arrays; -import java.util.Collections; import java.util.LinkedList; import java.util.List; @@ -89,7 +88,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { } public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List kmerSizes) { - this(maxAllowedPathsForReadThreadingAssembler, kmerSizes, false); + this(maxAllowedPathsForReadThreadingAssembler, kmerSizes, true); } /** for testing purposes */ @@ -103,7 +102,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { // first, try using the requested kmer sizes for ( final int kmerSize : kmerSizes ) { - final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes); + final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, dontIncreaseKmerSizesForCycles); if ( graph != null ) graphs.add(graph); } @@ -113,7 +112,8 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { int kmerSize = MathUtils.arrayMaxInt(kmerSizes) + KMER_SIZE_ITERATION_INCREASE; int numIterations = 1; while ( graphs.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) { - final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes); + // on the last attempt we will allow low complexity graphs + final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, numIterations == MAX_KMER_ITERATIONS_TO_ATTEMPT); if ( graph != null ) graphs.add(graph); kmerSize += KMER_SIZE_ITERATION_INCREASE; @@ -131,9 +131,14 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { * @param refHaplotype reference haplotype * @param kmerSize kmer size * @param activeAlleleHaplotypes the GGA haplotypes to inject into the graph - * @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths) + * @param allowLowComplexityGraphs if true, do not check for low-complexity graphs + * @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths or is low complexity) */ - protected SeqGraph createGraph(final List reads, final Haplotype refHaplotype, final int kmerSize, final List activeAlleleHaplotypes) { + protected SeqGraph createGraph(final List reads, + final Haplotype refHaplotype, + final int kmerSize, + final List activeAlleleHaplotypes, + final boolean allowLowComplexityGraphs) { final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly); // add the reference sequence to the graph @@ -157,7 +162,13 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { // sanity check: make sure there are no cycles in the graph if ( rtgraph.hasCycles() ) { - if ( debug ) logger.info("Not using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler because it contains a cycle"); + if ( debug ) logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it contains a cycle"); + return null; + } + + // sanity check: make sure the graph had enough complexity with the given kmer + if ( ! allowLowComplexityGraphs && rtgraph.isLowComplexity() ) { + if ( debug ) logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it does not produce a graph with enough complexity"); return null; } @@ -169,8 +180,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { rtgraph.pruneLowWeightChains(pruneFactor); // look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if - // we can recover them by merging some N bases from the chain back into the reference uniquely, for - // N < kmerSize + // we can recover them by merging some N bases from the chain back into the reference if ( recoverDanglingTails ) rtgraph.recoverDanglingTails(); // remove all heading and trailing paths diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java index ab6b17c35..8d8cb83f6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java @@ -46,14 +46,19 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerCounter; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.collections.PrimitivePair; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.smithwaterman.SmithWaterman; import org.jgrapht.EdgeFactory; import org.jgrapht.alg.CycleDetector; @@ -80,9 +85,6 @@ public class ReadThreadingGraph extends BaseGraph their corresponding vertex in the graph */ - private Map uniqueKmers = new LinkedHashMap(); + private Map uniqueKmers = new LinkedHashMap<>(); /** * @@ -113,8 +115,6 @@ public class ReadThreadingGraph extends BaseGraph danglingPath, referencePath; + final byte[] danglingPathString, referencePathString; + final Cigar cigar; + + public DanglingTailMergeResult(final List danglingPath, + final List referencePath, + final byte[] danglingPathString, + final byte[] referencePathString, + final Cigar cigar) { + this.danglingPath = danglingPath; + this.referencePath = referencePath; + this.danglingPathString = danglingPathString; + this.referencePathString = referencePathString; + this.cigar = cigar; + } + } + + /** + * Attempt to attach vertex with out-degree == 0 to the graph + * * @param vertex the vertex to recover + * @return 1 if we successfully recovered the vertex and 0 otherwise */ protected int recoverDanglingChain(final MultiDeBruijnVertex vertex) { if ( outDegreeOf(vertex) != 0 ) throw new IllegalStateException("Attempting to recover a dangling tail for " + vertex + " but it has out-degree > 0"); - final byte[] kmer = vertex.getSequence(); - if ( ! nonUniqueKmers.contains(new Kmer(kmer)) ) { - // don't attempt to fix non-unique kmers! - final MultiDeBruijnVertex uniqueMergePoint = danglingTailMergePoint(kmer); - if ( uniqueMergePoint != null ) { - addEdge(vertex, uniqueMergePoint, new MultiSampleEdge(false, 1)); - return 1; - } - } + // generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths + final DanglingTailMergeResult danglingTailMergeResult = generateCigarAgainstReferencePath(vertex); - return 0; + // if the CIGAR is too complex (or couldn't be computed) then we do not allow the merge into the reference path + if ( danglingTailMergeResult == null || ! cigarIsOkayToMerge(danglingTailMergeResult.cigar) ) + return 0; + + // merge + return mergeDanglingTail(danglingTailMergeResult); } /** - * Find a unique merge point for kmer in the reference sequence - * @param kmer the full kmer of the dangling tail - * @return a vertex appropriate to merge kmer into, or null if none could be found + * Determine whether the provided cigar is okay to merge into the reference path + * + * @param cigar the cigar to analyze + * @return true if it's okay to merge, false otherwise */ - private MultiDeBruijnVertex danglingTailMergePoint(final byte[] kmer) { - final PrimitivePair.Int endAndLength = GraphUtils.findLongestUniqueSuffixMatch(refSeq, kmer); - if ( endAndLength != null && endAndLength.second >= MIN_MATCH_LENGTH_TO_RECOVER_DANGLING_TAIL && endAndLength.first + 1 < refKmers.length) { - final int len = endAndLength.second; - final MultiDeBruijnVertex mergePoint = refKmers[endAndLength.first + 1]; -// logger.info("recoverDanglingChain of kmer " + new String(kmer) + " merged to " + mergePoint + " with match size " + len); - final Set nonUniquesAtLength = determineKmerSizeAndNonUniques(len, len).nonUniques; - final Kmer matchedKmer = new Kmer(kmer, kmer.length - len, len); - if ( nonUniquesAtLength.contains(matchedKmer) ) { -// logger.info("Rejecting merge " + new String(kmer) + " because match kmer " + matchedKmer + " isn't unique across all reads"); - return null; - } else { - return mergePoint; - } + protected boolean cigarIsOkayToMerge(final Cigar cigar) { + + final List elements = cigar.getCigarElements(); + + // don't allow more than a couple of different ops + if ( elements.size() > 3 ) + return false; + + // the last element must be an M + if ( elements.get(elements.size() - 1).getOperator() != CigarOperator.M ) + return false; + + // TODO -- do we want to check whether the Ms mismatch too much also? + + return true; + } + + /** + * Actually merge the dangling tail if possible + * + * @param danglingTailMergeResult the result from generating a Cigar for the dangling tail against the reference + * @return 1 if merge was successful, 0 otherwise + */ + protected int mergeDanglingTail(final DanglingTailMergeResult danglingTailMergeResult) { + + final List elements = danglingTailMergeResult.cigar.getCigarElements(); + final CigarElement lastElement = elements.get(elements.size() - 1); + if ( lastElement.getOperator() != CigarOperator.M ) + throw new IllegalArgumentException("The last Cigar element must be an M"); + + final int lastRefIndex = danglingTailMergeResult.cigar.getReferenceLength() - 1; + final int matchingSuffix = Math.min(GraphUtils.longestSuffixMatch(danglingTailMergeResult.referencePathString, danglingTailMergeResult.danglingPathString, lastRefIndex), lastElement.getLength()); + if ( matchingSuffix == 0 ) + return 0; + + final int altIndexToMerge = Math.max(danglingTailMergeResult.cigar.getReadLength() - matchingSuffix - 1, 0); + final int refIndexToMerge = lastRefIndex - matchingSuffix + 1; + addEdge(danglingTailMergeResult.danglingPath.get(altIndexToMerge), danglingTailMergeResult.referencePath.get(refIndexToMerge), new MultiSampleEdge(false, 1)); + return 1; + } + + /** + * Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the + * provided vertex is the sink) and the reference path. + * + * @param vertex the sink of the dangling tail + * @return a SmithWaterman object which can be null if no proper alignment could be generated + */ + protected DanglingTailMergeResult generateCigarAgainstReferencePath(final MultiDeBruijnVertex vertex) { + + // find the lowest common ancestor path between vertex and the reference sink if available + final List altPath = findPathToLowestCommonAncestorOfReference(vertex); + if ( altPath == null ) + return null; + + // now get the reference path from the LCA + final List refPath = getReferencePath(altPath.get(0)); + + // create the Smith-Waterman strings to use + final byte[] refBases = getBasesForPath(refPath); + final byte[] altBases = getBasesForPath(altPath); + + // run Smith-Waterman to determine the best alignment (and remove trailing deletions since they aren't interesting) + final SmithWaterman alignment = new SWPairwiseAlignment(refBases, altBases, SWPairwiseAlignment.OVERHANG_STRATEGY.INDEL); + return new DanglingTailMergeResult(altPath, refPath, altBases, refBases, AlignmentUtils.removeTrailingDeletions(alignment.getCigar())); + } + + /** + * Finds the path upwards in the graph from this vertex to the reference sequence, including the lowest common ancestor vertex + * + * @param vertex the original vertex + * @return the path if it can be determined or null if this vertex either doesn't merge onto the reference path or + * has an ancestor with multiple incoming edges before hitting the reference path + */ + protected List findPathToLowestCommonAncestorOfReference(final MultiDeBruijnVertex vertex) { + final LinkedList path = new LinkedList<>(); + + MultiDeBruijnVertex v = vertex; + while ( ! isReferenceNode(v) && inDegreeOf(v) == 1 ) { + path.addFirst(v); + v = getEdgeSource(incomingEdgeOf(v)); + } + path.addFirst(v); + + return isReferenceNode(v) ? path : null; + } + + /** + * Finds the path downwards in the graph from this vertex to the reference sink, including this vertex + * + * @param start the reference vertex to start from + * @return the path (non-null, non-empty) + */ + protected List getReferencePath(final MultiDeBruijnVertex start) { + if ( ! isReferenceNode(start) ) throw new IllegalArgumentException("Cannot construct the reference path from a vertex that is not on that path"); + + final List path = new ArrayList<>(); + + MultiDeBruijnVertex v = start; + while ( v != null ) { + path.add(v); + v = getNextReferenceVertex(v); } - return null; + return path; } /** @@ -330,6 +432,16 @@ public class ReadThreadingGraph extends BaseGraph(this).detectCycles(); } + /** + * Does the graph not have enough complexity? We define low complexity as a situation where the number + * of non-unique kmers is more than 20% of the total number of kmers. + * + * @return true if the graph has low complexity, false otherwise + */ + public boolean isLowComplexity() { + return nonUniqueKmers.size() * 4 > uniqueKmers.size(); + } + public void recoverDanglingTails() { if ( ! alreadyBuilt ) throw new IllegalStateException("recoverDanglingTails requires the graph be already built"); @@ -341,7 +453,8 @@ public class ReadThreadingGraph extends BaseGraph vertexes = new ArrayList<>(); + for ( int i = 0; i <= testString.length() - kmerSize; i++ ) { + vertexes.add(new DeBruijnVertex(testString.substring(i, i + kmerSize))); + } + + final String result = new String(new DeBruijnGraph().getBasesForPath(vertexes)); + Assert.assertEquals(result, testString.substring(kmerSize - 1)); + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java index 3f10fc72c..8269b9c20 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java @@ -83,7 +83,8 @@ public class ReadThreadingAssemblerUnitTest extends BaseTest { } public SeqGraph assemble() { - assembler.removePathsNotConnectedToRef = false; // need to pass some of the tests + assembler.removePathsNotConnectedToRef = false; // needed to pass some of the tests + assembler.setRecoverDanglingTails(false); // needed to pass some of the tests assembler.setDebugGraphTransformations(true); final SeqGraph graph = assembler.assemble(reads, refHaplotype, Collections.emptyList()).get(0); if ( DEBUG ) graph.printGraph(new File("test.dot"), 0); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java index 67ee52734..ed91cccb3 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java @@ -53,6 +53,7 @@ import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; @@ -201,6 +202,81 @@ public class ReadThreadingGraphUnitTest extends BaseTest { Assert.assertEquals(pathFinder.getKBestPaths(graph, length, graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex()).size(), 1); } + @DataProvider(name = "DanglingTails") + public Object[][] makeDanglingTailsData() { + List tests = new ArrayList(); + + // add 1M to the expected CIGAR because it includes the previous (common) base too + tests.add(new Object[]{"AAAAAAAAAA", "CAAA", "5M", true, 3}); // incomplete haplotype + tests.add(new Object[]{"AAAAAAAAAA", "CAAAAAAAAAA", "1M1I10M", true, 10}); // insertion + tests.add(new Object[]{"CCAAAAAAAAAA", "AAAAAAAAAA", "1M2D10M", true, 10}); // deletion + tests.add(new Object[]{"AAAAAAAA", "CAAAAAAA", "9M", true, 7}); // 1 snp + tests.add(new Object[]{"AAAAAAAA", "CAAGATAA", "9M", true, 2}); // several snps + tests.add(new Object[]{"AAAAA", "C", "1M4D1M", true, -1}); // funky SW alignment + tests.add(new Object[]{"AAAAA", "CA", "1M3D2M", true, 1}); // very little data + tests.add(new Object[]{"AAAAAAA", "CAAAAAC", "8M", true, -1}); // ends in mismatch + tests.add(new Object[]{"AAAAAA", "CGAAAACGAA", "1M2I4M2I2M", false, 0}); // alignment is too complex + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "DanglingTails", enabled = !DEBUG) + public void testDanglingTails(final String refEnd, + final String altEnd, + final String cigar, + final boolean cigarIsGood, + final int mergePointDistanceFromSink) { + + final int kmerSize = 15; + + // construct the haplotypes + final String commonPrefix = "AAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTT"; + final String ref = commonPrefix + refEnd; + final String alt = commonPrefix + altEnd; + + // create the graph and populate it + final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize); + rtgraph.addSequence("ref", ref.getBytes(), null, true); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(alt.getBytes(), Utils.dupBytes((byte) 30, alt.length()), alt.length() + "M"); + rtgraph.addRead(read); + rtgraph.buildGraphIfNecessary(); + + // confirm that we have just a single dangling tail + MultiDeBruijnVertex altSink = null; + for ( final MultiDeBruijnVertex v : rtgraph.vertexSet() ) { + if ( rtgraph.isSink(v) && !rtgraph.isReferenceNode(v) ) { + Assert.assertTrue(altSink == null, "We found more than one non-reference sink"); + altSink = v; + } + } + + Assert.assertTrue(altSink != null, "We did not find a non-reference sink"); + + // confirm that the SW alignment agrees with our expectations + final ReadThreadingGraph.DanglingTailMergeResult result = rtgraph.generateCigarAgainstReferencePath(altSink); + Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + result.cigar.toString()); + + // confirm that the goodness of the cigar agrees with our expectations + Assert.assertEquals(rtgraph.cigarIsOkayToMerge(result.cigar), cigarIsGood); + + // confirm that the tail merging works as expected + if ( cigarIsGood ) { + final int mergeResult = rtgraph.mergeDanglingTail(result); + Assert.assertTrue(mergeResult == 1 || mergePointDistanceFromSink == -1); + + // confirm that we created the appropriate edge + if ( mergePointDistanceFromSink >= 0 ) { + MultiDeBruijnVertex v = altSink; + for ( int i = 0; i < mergePointDistanceFromSink; i++ ) { + if ( rtgraph.inDegreeOf(v) != 1 ) + Assert.fail("Encountered vertex with multiple sources"); + v = rtgraph.getEdgeSource(rtgraph.incomingEdgeOf(v)); + } + Assert.assertTrue(rtgraph.outDegreeOf(v) > 1); + } + } + } + // TODO -- update to use determineKmerSizeAndNonUniques directly // @DataProvider(name = "KmerSizeData") diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index fa35e3f53..762ce4858 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -800,6 +800,23 @@ public final class AlignmentUtils { return new Cigar(elements); } + /** + * Removing a trailing deletion from the incoming cigar if present + * + * @param c the cigar we want to update + * @return a non-null Cigar + */ + @Requires("c != null") + @Ensures("result != null") + public static Cigar removeTrailingDeletions(final Cigar c) { + + final List elements = c.getCigarElements(); + if ( elements.get(elements.size() - 1).getOperator() != CigarOperator.D ) + return c; + + return new Cigar(elements.subList(0, elements.size() - 1)); + } + /** * Move the indel in a given cigar string one base to the left * diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java index 84c33d4a5..1abf9f836 100644 --- a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java +++ b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java @@ -118,6 +118,21 @@ public class SWPairwiseAlignment implements SmithWaterman { align(seq1,seq2); } + /** + * Create a new SW pairwise aligner + * + * After creating the object the two sequences are aligned with an internal call to align(seq1, seq2) + * + * @param seq1 the first sequence we want to align + * @param seq2 the second sequence we want to align + * @param strategy the overhang strategy to use + */ + public SWPairwiseAlignment(final byte[] seq1, final byte[] seq2, final OVERHANG_STRATEGY strategy) { + this(SWParameterSet.ORIGINAL_DEFAULT.parameters); + overhang_strategy = strategy; + align(seq1, seq2); + } + /** * Create a new SW pairwise aligner, without actually doing any alignment yet * diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java index e7d54c460..fbf0242a3 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java @@ -1033,5 +1033,12 @@ public class AlignmentUtilsUnitTest { Assert.assertEquals(AlignmentUtils.startsOrEndsWithInsertionOrDeletion(TextCigarCodec.getSingleton().decode(cigar)), expected); } + @Test(dataProvider = "StartsOrEndsWithInsertionOrDeletionData", enabled = true) + public void testRemoveTrailingDeletions(final String cigar, final boolean expected) { + final Cigar originalCigar = TextCigarCodec.getSingleton().decode(cigar); + final Cigar newCigar = AlignmentUtils.removeTrailingDeletions(originalCigar); + + Assert.assertEquals(originalCigar.equals(newCigar), !cigar.endsWith("D")); + } } From 2c3c680eb704348cb8e20572f5b9d2fb3a5a986c Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 5 Jun 2013 12:22:14 -0400 Subject: [PATCH 6/6] Misc changes and cleanup from all previous commits in this push. 1. By default, do not include the UG CEU callset for assessment. 2. Updated md5s that are different now with all the HC changes. --- ...lexAndSymbolicVariantsIntegrationTest.java | 6 +++--- .../HaplotypeCallerIntegrationTest.java | 20 +++++++++---------- ...aplotypeCallerParallelIntegrationTest.java | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index fba294c3d..073d54ec5 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "8d7728909b1b8eb3f30f2f1583f054a8"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "d21f15a5809fe5259af41ae6774af6f1"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "db71826dc798ff1cdf0c5d05b0ede976"); + "d4a0797c2fd4c103bf9a137633376156"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "42831d5463552911b7da9de0b4a27289"); + "a9872228d0275a30f5a1f7e070a9c9f4"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 904f15728..dbdd0afcd 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -78,12 +78,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "1b15e4647013ab2c3ce7073c420d8640"); + HCTest(CEUTRIO_BAM, "", "e9167a1bfc0fc276586788d1ce1be408"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "423be27dc2cf7fd10baf465cf93e18e2"); + HCTest(NA12878_BAM, "", "b1d46afb9659ac3b92a3d131b58924ef"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -94,7 +94,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "a28e6f14e28708283d61c1e423bbdcb1"); + "d83856b8136776bd731a8037c16b71fa"); } @Test @@ -110,7 +110,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "8344d86751b707c53b296c297eba4bfa"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "70c4476816f5d35c9978c378dbeac09b"); } private void HCTestNearbySmallIntervals(String bam, String args, String md5) { @@ -147,7 +147,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerNearbySmallIntervals() { - HCTestNearbySmallIntervals(NA12878_BAM, "", "dea98f257d39fa1447a12c36a6bbf4a3"); + HCTestNearbySmallIntervals(NA12878_BAM, "", "947aae309ecab7cd3f17ff9810884924"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -157,14 +157,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("7cd1c5e2642ae8ddf38932aba1f51d69")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ee55ff4c6ec1bbef88e21cc0f45d4c47")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("91717e5e271742c2c9b67223e58f1320")); executeTest("HCTestStructuralIndels: ", spec); } @@ -186,7 +186,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("4886a98bf699f4e7f4491160749ada6a")); + Arrays.asList("0124c4923d96ec0f8222b596dd4ef534")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -194,7 +194,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("86bdd07a3ac4f6ce239c30efea8bf5ba")); + Arrays.asList("0e020dcfdf249225714f5cd86ed3869f")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } @@ -208,7 +208,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestDBSNPAnnotationWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, - Arrays.asList("7b23a288a31cafca3946f14f2381e7cb")); + Arrays.asList("446a786bb539f3ec2084dd75167568aa")); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java index ff5a501cc..62e685eab 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java @@ -61,7 +61,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest { List tests = new ArrayList(); for ( final int nct : Arrays.asList(1, 2, 4) ) { - tests.add(new Object[]{nct, "c277fd65365d59b734260dd8423313bb"}); + tests.add(new Object[]{nct, "ef42a438b82681d1c0f921c57e16ff12"}); } return tests.toArray(new Object[][]{});