diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 680ae06e1..fb7fb652c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -266,6 +266,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="kmerSize", shortName="kmerSize", doc="Kmer size to use in the read threading assembler", required = false) protected List kmerSizes = Arrays.asList(10, 25); + @Advanced + @Argument(fullName="dontIncreaseKmerSizesForCycles", shortName="dontIncreaseKmerSizesForCycles", doc="Should we disable the iterating over kmer sizes when graph cycles are detected?", required = false) + protected boolean dontIncreaseKmerSizesForCycles = false; + /** * Assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype * considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the @@ -332,7 +336,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In */ @Advanced @Argument(fullName="phredScaledGlobalReadMismappingRate", shortName="globalMAPQ", doc="The global assumed mismapping rate for reads", required = false) - protected int phredScaledGlobalReadMismappingRate = 60; + protected int phredScaledGlobalReadMismappingRate = 45; @Advanced @Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false) @@ -535,7 +539,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In final int maxAllowedPathsForReadThreadingAssembler = Math.max(maxPathsPerSample * nSamples, MIN_PATHS_PER_GRAPH); assemblyEngine = useDebruijnAssembler ? new DeBruijnAssembler(minKmerForDebruijnAssembler, onlyUseKmerSizeForDebruijnAssembler) - : new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes); + : new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes, dontIncreaseKmerSizesForCycles); assemblyEngine.setErrorCorrectKmers(errorCorrectKmers); assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR); @@ -705,11 +709,14 @@ public class HaplotypeCaller extends ActiveRegionWalker, In //logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads"); final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( assemblyResult.haplotypes, splitReadsBySample( assemblyResult.regionForGenotyping.getReads() ) ); - // subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes ) - final List bestHaplotypes = selectBestHaplotypesForGenotyping(assemblyResult.haplotypes, stratifiedReadMap); + // Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there + // was a bad interaction between that selection and the marginalization that happens over each event when computing + // GLs. In particular, for samples that are heterozygous non-reference (B/C) the marginalization for B treats the + // haplotype containing C as reference (and vice versa). Now this is fine if all possible haplotypes are included + // in the genotyping, but we lose information if we select down to a few haplotypes. [EB] final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine, - bestHaplotypes, + assemblyResult.haplotypes, stratifiedReadMap, perSampleFilteredReadList, assemblyResult.fullReferenceWithPadding, @@ -722,7 +729,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // TODO -- must disable if we are doing NCT, or set the output type of ! presorted if ( bamWriter != null ) { haplotypeBAMWriter.writeReadsAlignedToHaplotypes(assemblyResult.haplotypes, assemblyResult.paddedReferenceLoc, - bestHaplotypes, + assemblyResult.haplotypes, calledHaplotypes.getCalledHaplotypes(), stratifiedReadMap); } @@ -879,21 +886,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In return new AssemblyResult(trimmedHaplotypes, trimmedActiveRegion, fullReferenceWithPadding, paddedReferenceLoc, true); } - /** - * Select the best N haplotypes according to their likelihoods, if appropriate - * - * @param haplotypes a list of haplotypes to consider - * @param stratifiedReadMap a map from samples -> read likelihoods - * @return the list of haplotypes to genotype - */ - protected List selectBestHaplotypesForGenotyping(final List haplotypes, final Map stratifiedReadMap) { - if ( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - return haplotypes; - } else { - return likelihoodCalculationEngine.selectBestHaplotypesFromEachSample(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation); - } - } - //--------------------------------------------------------------------------------------------------------------- // // reduce diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java index 9f2197a84..c889d7995 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java @@ -233,7 +233,7 @@ public abstract class LocalAssemblyEngine { returnHaplotypes.add(h); if ( debug ) - logger.info("Adding haplotype " + h.getCigar() + " from debruijn graph with kmer " + graph.getKmerSize()); + logger.info("Adding haplotype " + h.getCigar() + " from graph with kmer " + graph.getKmerSize()); } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java index c963fb6e5..70ef539f3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java @@ -676,6 +676,24 @@ public class BaseGraph extends Default '}'; } + /** + * The base sequence for the given path. + * Note, this assumes that the path does not start with a source node. + * + * @param path the list of vertexes that make up the path + * @return non-null sequence of bases corresponding to the given path + */ + @Ensures({"result != null"}) + public byte[] getBasesForPath(final List path) { + if ( path == null ) throw new IllegalArgumentException("Path cannot be null"); + + final StringBuffer sb = new StringBuffer(); + for ( final DeBruijnVertex v : path ) + sb.append((char)v.getSuffix()); + + return sb.toString().getBytes(); + } + /** * Get the set of vertices within distance edges of source, regardless of edge direction * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java index 4aa6047a9..73a1daa3e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java @@ -171,7 +171,15 @@ final public class GraphUtils { return foundDup ? null : new PrimitivePair.Int(longestPos, length); } - private static int longestSuffixMatch(final byte[] seq, final byte[] kmer, final int seqStart) { + /** + * calculates the longest suffix match between a sequence and a smaller kmer + * + * @param seq the (reference) sequence + * @param kmer the smaller kmer sequence + * @param seqStart the index (inclusive) on seq to start looking backwards from + * @return the longest matching suffix + */ + public static int longestSuffixMatch(final byte[] seq, final byte[] kmer, final int seqStart) { for ( int len = 1; len <= kmer.length; len++ ) { final int seqI = seqStart - len + 1; final int kmerI = kmer.length - len; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java index a07b98bb6..2e84e1d22 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java @@ -47,7 +47,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; @@ -92,7 +91,7 @@ public class Path { /** * Create a new Path containing no edges and starting at initialVertex * @param initialVertex the starting vertex of the path - * @param graph the graph this path with follow through + * @param graph the graph this path will follow through */ public Path(final T initialVertex, final BaseGraph graph) { if ( initialVertex == null ) throw new IllegalArgumentException("initialVertex cannot be null"); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java index 123b36640..f4290f2bb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java @@ -49,12 +49,12 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LocalAssemblyEngine; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.File; import java.util.Arrays; -import java.util.Collections; import java.util.LinkedList; import java.util.List; @@ -63,11 +63,14 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { private final static int DEFAULT_NUM_PATHS_PER_GRAPH = 128; private final static int GGA_MODE_ARTIFICIAL_COUNTS = 1000; + private final static int KMER_SIZE_ITERATION_INCREASE = 10; + private final static int MAX_KMER_ITERATIONS_TO_ATTEMPT = 6; /** The min and max kmer sizes to try when building the graph. */ private final List kmerSizes; private final int maxAllowedPathsForReadThreadingAssembler; + private final boolean dontIncreaseKmerSizesForCycles; private boolean requireReasonableNumberOfPaths = false; protected boolean removePathsNotConnectedToRef = true; private boolean justReturnRawGraph = false; @@ -77,10 +80,15 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { this(DEFAULT_NUM_PATHS_PER_GRAPH, Arrays.asList(25)); } - public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List kmerSizes) { + public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List kmerSizes, final boolean dontIncreaseKmerSizesForCycles) { super(maxAllowedPathsForReadThreadingAssembler); this.kmerSizes = kmerSizes; this.maxAllowedPathsForReadThreadingAssembler = maxAllowedPathsForReadThreadingAssembler; + this.dontIncreaseKmerSizesForCycles = dontIncreaseKmerSizesForCycles; + } + + public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List kmerSizes) { + this(maxAllowedPathsForReadThreadingAssembler, kmerSizes, true); } /** for testing purposes */ @@ -89,67 +97,110 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { } @Override - public List assemble( final List reads, final Haplotype refHaplotype, final List activeAlleleHaplotypes ) { + public List assemble(final List reads, final Haplotype refHaplotype, final List activeAlleleHaplotypes) { final List graphs = new LinkedList<>(); + // first, try using the requested kmer sizes for ( final int kmerSize : kmerSizes ) { - final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly); + final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, dontIncreaseKmerSizesForCycles); + if ( graph != null ) + graphs.add(graph); + } - // add the reference sequence to the graph - rtgraph.addSequence("ref", refHaplotype.getBases(), null, true); - - // add the artificial GGA haplotypes to the graph - int hapCount = 0; - for( final Haplotype h : activeAlleleHaplotypes ) { - final int[] counts = new int[h.length()]; - Arrays.fill(counts, GGA_MODE_ARTIFICIAL_COUNTS); - rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), counts, false); - } - - // Next pull kmers out of every read and throw them on the graph - for( final GATKSAMRecord read : reads ) { - rtgraph.addRead(read); - } - - // actually build the read threading graph - rtgraph.buildGraphIfNecessary(); - printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot")); - - // go through and prune all of the chains where all edges have <= pruneFactor. This must occur - // before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering - // tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1 - rtgraph.pruneLowWeightChains(pruneFactor); - - // look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if - // we can recover them by merging some N bases from the chain back into the reference uniquely, for - // N < kmerSize - if ( recoverDanglingTails ) rtgraph.recoverDanglingTails(); - - // remove all heading and trailing paths - if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef(); - - printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot")); - - final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph(); - - // if the unit tests don't want us to cleanup the graph, just return the raw sequence graph - if ( justReturnRawGraph ) return Collections.singletonList(initialSeqGraph); - - if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler"); - printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot")); - initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction - - final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph); - if ( seqGraph != null ) { - if ( ! requireReasonableNumberOfPaths || reasonableNumberOfPaths(seqGraph) ) { - graphs.add(seqGraph); - } + // if none of those worked, iterate over larger sizes if allowed to do so + if ( graphs.isEmpty() && !dontIncreaseKmerSizesForCycles ) { + int kmerSize = MathUtils.arrayMaxInt(kmerSizes) + KMER_SIZE_ITERATION_INCREASE; + int numIterations = 1; + while ( graphs.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) { + // on the last attempt we will allow low complexity graphs + final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, numIterations == MAX_KMER_ITERATIONS_TO_ATTEMPT); + if ( graph != null ) + graphs.add(graph); + kmerSize += KMER_SIZE_ITERATION_INCREASE; + numIterations++; } } return graphs; } + /** + * Creates the sequence graph for the given kmerSize + * + * @param reads reads to use + * @param refHaplotype reference haplotype + * @param kmerSize kmer size + * @param activeAlleleHaplotypes the GGA haplotypes to inject into the graph + * @param allowLowComplexityGraphs if true, do not check for low-complexity graphs + * @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths or is low complexity) + */ + protected SeqGraph createGraph(final List reads, + final Haplotype refHaplotype, + final int kmerSize, + final List activeAlleleHaplotypes, + final boolean allowLowComplexityGraphs) { + final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly); + + // add the reference sequence to the graph + rtgraph.addSequence("ref", refHaplotype.getBases(), null, true); + + // add the artificial GGA haplotypes to the graph + int hapCount = 0; + for ( final Haplotype h : activeAlleleHaplotypes ) { + final int[] counts = new int[h.length()]; + Arrays.fill(counts, GGA_MODE_ARTIFICIAL_COUNTS); + rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), counts, false); + } + + // Next pull kmers out of every read and throw them on the graph + for( final GATKSAMRecord read : reads ) { + rtgraph.addRead(read); + } + + // actually build the read threading graph + rtgraph.buildGraphIfNecessary(); + + // sanity check: make sure there are no cycles in the graph + if ( rtgraph.hasCycles() ) { + if ( debug ) logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it contains a cycle"); + return null; + } + + // sanity check: make sure the graph had enough complexity with the given kmer + if ( ! allowLowComplexityGraphs && rtgraph.isLowComplexity() ) { + if ( debug ) logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it does not produce a graph with enough complexity"); + return null; + } + + printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot")); + + // go through and prune all of the chains where all edges have <= pruneFactor. This must occur + // before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering + // tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1 + rtgraph.pruneLowWeightChains(pruneFactor); + + // look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if + // we can recover them by merging some N bases from the chain back into the reference + if ( recoverDanglingTails ) rtgraph.recoverDanglingTails(); + + // remove all heading and trailing paths + if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef(); + + printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot")); + + final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph(); + + // if the unit tests don't want us to cleanup the graph, just return the raw sequence graph + if ( justReturnRawGraph ) return initialSeqGraph; + + if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler"); + printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot")); + initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction + + final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph); + return ( seqGraph != null && requireReasonableNumberOfPaths && !reasonableNumberOfPaths(seqGraph) ) ? null : seqGraph; + } + /** * Did we find a reasonable number of paths in this graph? * @param graph diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java index 8e879377f..8d8cb83f6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java @@ -46,14 +46,21 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerCounter; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.collections.PrimitivePair; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.smithwaterman.SmithWaterman; import org.jgrapht.EdgeFactory; +import org.jgrapht.alg.CycleDetector; import java.io.File; import java.util.*; @@ -78,9 +85,6 @@ public class ReadThreadingGraph extends BaseGraph their corresponding vertex in the graph */ - private Map uniqueKmers = new LinkedHashMap(); + private Map uniqueKmers = new LinkedHashMap<>(); /** * @@ -111,8 +115,6 @@ public class ReadThreadingGraph extends BaseGraph danglingPath, referencePath; + final byte[] danglingPathString, referencePathString; + final Cigar cigar; + + public DanglingTailMergeResult(final List danglingPath, + final List referencePath, + final byte[] danglingPathString, + final byte[] referencePathString, + final Cigar cigar) { + this.danglingPath = danglingPath; + this.referencePath = referencePath; + this.danglingPathString = danglingPathString; + this.referencePathString = referencePathString; + this.cigar = cigar; + } + } + + /** + * Attempt to attach vertex with out-degree == 0 to the graph + * * @param vertex the vertex to recover + * @return 1 if we successfully recovered the vertex and 0 otherwise */ protected int recoverDanglingChain(final MultiDeBruijnVertex vertex) { if ( outDegreeOf(vertex) != 0 ) throw new IllegalStateException("Attempting to recover a dangling tail for " + vertex + " but it has out-degree > 0"); - final byte[] kmer = vertex.getSequence(); - if ( ! nonUniqueKmers.contains(new Kmer(kmer)) ) { - // don't attempt to fix non-unique kmers! - final MultiDeBruijnVertex uniqueMergePoint = danglingTailMergePoint(kmer); - if ( uniqueMergePoint != null ) { - addEdge(vertex, uniqueMergePoint, new MultiSampleEdge(false, 1)); - return 1; - } - } + // generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths + final DanglingTailMergeResult danglingTailMergeResult = generateCigarAgainstReferencePath(vertex); - return 0; + // if the CIGAR is too complex (or couldn't be computed) then we do not allow the merge into the reference path + if ( danglingTailMergeResult == null || ! cigarIsOkayToMerge(danglingTailMergeResult.cigar) ) + return 0; + + // merge + return mergeDanglingTail(danglingTailMergeResult); } /** - * Find a unique merge point for kmer in the reference sequence - * @param kmer the full kmer of the dangling tail - * @return a vertex appropriate to merge kmer into, or null if none could be found + * Determine whether the provided cigar is okay to merge into the reference path + * + * @param cigar the cigar to analyze + * @return true if it's okay to merge, false otherwise */ - private MultiDeBruijnVertex danglingTailMergePoint(final byte[] kmer) { - final PrimitivePair.Int endAndLength = GraphUtils.findLongestUniqueSuffixMatch(refSeq, kmer); - if ( endAndLength != null && endAndLength.second >= MIN_MATCH_LENGTH_TO_RECOVER_DANGLING_TAIL && endAndLength.first + 1 < refKmers.length) { - final int len = endAndLength.second; - final MultiDeBruijnVertex mergePoint = refKmers[endAndLength.first + 1]; -// logger.info("recoverDanglingChain of kmer " + new String(kmer) + " merged to " + mergePoint + " with match size " + len); - final Set nonUniquesAtLength = determineKmerSizeAndNonUniques(len, len).nonUniques; - final Kmer matchedKmer = new Kmer(kmer, kmer.length - len, len); - if ( nonUniquesAtLength.contains(matchedKmer) ) { -// logger.info("Rejecting merge " + new String(kmer) + " because match kmer " + matchedKmer + " isn't unique across all reads"); - return null; - } else { - return mergePoint; - } + protected boolean cigarIsOkayToMerge(final Cigar cigar) { + + final List elements = cigar.getCigarElements(); + + // don't allow more than a couple of different ops + if ( elements.size() > 3 ) + return false; + + // the last element must be an M + if ( elements.get(elements.size() - 1).getOperator() != CigarOperator.M ) + return false; + + // TODO -- do we want to check whether the Ms mismatch too much also? + + return true; + } + + /** + * Actually merge the dangling tail if possible + * + * @param danglingTailMergeResult the result from generating a Cigar for the dangling tail against the reference + * @return 1 if merge was successful, 0 otherwise + */ + protected int mergeDanglingTail(final DanglingTailMergeResult danglingTailMergeResult) { + + final List elements = danglingTailMergeResult.cigar.getCigarElements(); + final CigarElement lastElement = elements.get(elements.size() - 1); + if ( lastElement.getOperator() != CigarOperator.M ) + throw new IllegalArgumentException("The last Cigar element must be an M"); + + final int lastRefIndex = danglingTailMergeResult.cigar.getReferenceLength() - 1; + final int matchingSuffix = Math.min(GraphUtils.longestSuffixMatch(danglingTailMergeResult.referencePathString, danglingTailMergeResult.danglingPathString, lastRefIndex), lastElement.getLength()); + if ( matchingSuffix == 0 ) + return 0; + + final int altIndexToMerge = Math.max(danglingTailMergeResult.cigar.getReadLength() - matchingSuffix - 1, 0); + final int refIndexToMerge = lastRefIndex - matchingSuffix + 1; + addEdge(danglingTailMergeResult.danglingPath.get(altIndexToMerge), danglingTailMergeResult.referencePath.get(refIndexToMerge), new MultiSampleEdge(false, 1)); + return 1; + } + + /** + * Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the + * provided vertex is the sink) and the reference path. + * + * @param vertex the sink of the dangling tail + * @return a SmithWaterman object which can be null if no proper alignment could be generated + */ + protected DanglingTailMergeResult generateCigarAgainstReferencePath(final MultiDeBruijnVertex vertex) { + + // find the lowest common ancestor path between vertex and the reference sink if available + final List altPath = findPathToLowestCommonAncestorOfReference(vertex); + if ( altPath == null ) + return null; + + // now get the reference path from the LCA + final List refPath = getReferencePath(altPath.get(0)); + + // create the Smith-Waterman strings to use + final byte[] refBases = getBasesForPath(refPath); + final byte[] altBases = getBasesForPath(altPath); + + // run Smith-Waterman to determine the best alignment (and remove trailing deletions since they aren't interesting) + final SmithWaterman alignment = new SWPairwiseAlignment(refBases, altBases, SWPairwiseAlignment.OVERHANG_STRATEGY.INDEL); + return new DanglingTailMergeResult(altPath, refPath, altBases, refBases, AlignmentUtils.removeTrailingDeletions(alignment.getCigar())); + } + + /** + * Finds the path upwards in the graph from this vertex to the reference sequence, including the lowest common ancestor vertex + * + * @param vertex the original vertex + * @return the path if it can be determined or null if this vertex either doesn't merge onto the reference path or + * has an ancestor with multiple incoming edges before hitting the reference path + */ + protected List findPathToLowestCommonAncestorOfReference(final MultiDeBruijnVertex vertex) { + final LinkedList path = new LinkedList<>(); + + MultiDeBruijnVertex v = vertex; + while ( ! isReferenceNode(v) && inDegreeOf(v) == 1 ) { + path.addFirst(v); + v = getEdgeSource(incomingEdgeOf(v)); + } + path.addFirst(v); + + return isReferenceNode(v) ? path : null; + } + + /** + * Finds the path downwards in the graph from this vertex to the reference sink, including this vertex + * + * @param start the reference vertex to start from + * @return the path (non-null, non-empty) + */ + protected List getReferencePath(final MultiDeBruijnVertex start) { + if ( ! isReferenceNode(start) ) throw new IllegalArgumentException("Cannot construct the reference path from a vertex that is not on that path"); + + final List path = new ArrayList<>(); + + MultiDeBruijnVertex v = start; + while ( v != null ) { + path.add(v); + v = getNextReferenceVertex(v); } - return null; + return path; } /** @@ -297,7 +401,7 @@ public class ReadThreadingGraph extends BaseGraph(this).detectCycles(); + } + + /** + * Does the graph not have enough complexity? We define low complexity as a situation where the number + * of non-unique kmers is more than 20% of the total number of kmers. + * + * @return true if the graph has low complexity, false otherwise + */ + public boolean isLowComplexity() { + return nonUniqueKmers.size() * 4 > uniqueKmers.size(); + } + public void recoverDanglingTails() { if ( ! alreadyBuilt ) throw new IllegalStateException("recoverDanglingTails requires the graph be already built"); @@ -332,7 +453,8 @@ public class ReadThreadingGraph extends BaseGraph determineNonUniqueKmers(final SequenceForKmers seqForKmers, final int kmerSize) { // count up occurrences of kmers within each read final KMerCounter counter = new KMerCounter(kmerSize); - for ( int i = 0; i <= seqForKmers.stop - kmerSize; i++ ) { + final int stopPosition = seqForKmers.stop - kmerSize; + for ( int i = 0; i <= stopPosition; i++ ) { final Kmer kmer = new Kmer(seqForKmers.sequence, i, kmerSize); counter.addKmer(kmer, 1); } @@ -578,7 +701,7 @@ public class ReadThreadingGraph extends BaseGraph " + uniqueMergeVertex); @@ -590,7 +713,8 @@ public class ReadThreadingGraph extends BaseGraph= minBaseQualityToUseInAssembly; + } + /** * Get the set of non-unique kmers in this graph. For debugging purposes * @return a non-null set of kmers diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index fba294c3d..073d54ec5 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "8d7728909b1b8eb3f30f2f1583f054a8"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "d21f15a5809fe5259af41ae6774af6f1"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "db71826dc798ff1cdf0c5d05b0ede976"); + "d4a0797c2fd4c103bf9a137633376156"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "42831d5463552911b7da9de0b4a27289"); + "a9872228d0275a30f5a1f7e070a9c9f4"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 904f15728..dbdd0afcd 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -78,12 +78,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "1b15e4647013ab2c3ce7073c420d8640"); + HCTest(CEUTRIO_BAM, "", "e9167a1bfc0fc276586788d1ce1be408"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "423be27dc2cf7fd10baf465cf93e18e2"); + HCTest(NA12878_BAM, "", "b1d46afb9659ac3b92a3d131b58924ef"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -94,7 +94,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "a28e6f14e28708283d61c1e423bbdcb1"); + "d83856b8136776bd731a8037c16b71fa"); } @Test @@ -110,7 +110,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "8344d86751b707c53b296c297eba4bfa"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "70c4476816f5d35c9978c378dbeac09b"); } private void HCTestNearbySmallIntervals(String bam, String args, String md5) { @@ -147,7 +147,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerNearbySmallIntervals() { - HCTestNearbySmallIntervals(NA12878_BAM, "", "dea98f257d39fa1447a12c36a6bbf4a3"); + HCTestNearbySmallIntervals(NA12878_BAM, "", "947aae309ecab7cd3f17ff9810884924"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -157,14 +157,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("7cd1c5e2642ae8ddf38932aba1f51d69")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ee55ff4c6ec1bbef88e21cc0f45d4c47")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("91717e5e271742c2c9b67223e58f1320")); executeTest("HCTestStructuralIndels: ", spec); } @@ -186,7 +186,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("4886a98bf699f4e7f4491160749ada6a")); + Arrays.asList("0124c4923d96ec0f8222b596dd4ef534")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -194,7 +194,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("86bdd07a3ac4f6ce239c30efea8bf5ba")); + Arrays.asList("0e020dcfdf249225714f5cd86ed3869f")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } @@ -208,7 +208,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestDBSNPAnnotationWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, - Arrays.asList("7b23a288a31cafca3946f14f2381e7cb")); + Arrays.asList("446a786bb539f3ec2084dd75167568aa")); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java index ff5a501cc..62e685eab 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java @@ -61,7 +61,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest { List tests = new ArrayList(); for ( final int nct : Arrays.asList(1, 2, 4) ) { - tests.add(new Object[]{nct, "c277fd65365d59b734260dd8423313bb"}); + tests.add(new Object[]{nct, "ef42a438b82681d1c0f921c57e16ff12"}); } return tests.toArray(new Object[][]{}); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java index 9f6013235..2fda56665 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java @@ -251,7 +251,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest { for ( int snpPos = 0; snpPos < windowSize; snpPos++) { if ( snpPos > excludeVariantsWithXbp && (windowSize - snpPos) >= excludeVariantsWithXbp ) { final byte[] altBases = ref.getBytes(); - altBases[snpPos] = 'N'; + altBases[snpPos] = altBases[snpPos] == 'A' ? (byte)'C' : (byte)'A'; final String alt = new String(altBases); tests.add(new Object[]{"SNP at " + snpPos, assembler, refLoc, ref, alt}); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java index e57f5d6e0..f9cbc6c73 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java @@ -312,4 +312,19 @@ public class BaseGraphUnitTest extends BaseTest { Assert.assertTrue(BaseGraph.graphEquals(graph, expectedGraph)); } + + @Test(enabled = true) + public void testGetBases() { + + final int kmerSize = 4; + final String testString = "AATGGGGGCAATACTA"; + + final List vertexes = new ArrayList<>(); + for ( int i = 0; i <= testString.length() - kmerSize; i++ ) { + vertexes.add(new DeBruijnVertex(testString.substring(i, i + kmerSize))); + } + + final String result = new String(new DeBruijnGraph().getBasesForPath(vertexes)); + Assert.assertEquals(result, testString.substring(kmerSize - 1)); + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java index 3f10fc72c..8269b9c20 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java @@ -83,7 +83,8 @@ public class ReadThreadingAssemblerUnitTest extends BaseTest { } public SeqGraph assemble() { - assembler.removePathsNotConnectedToRef = false; // need to pass some of the tests + assembler.removePathsNotConnectedToRef = false; // needed to pass some of the tests + assembler.setRecoverDanglingTails(false); // needed to pass some of the tests assembler.setDebugGraphTransformations(true); final SeqGraph graph = assembler.assemble(reads, refHaplotype, Collections.emptyList()).get(0); if ( DEBUG ) graph.printGraph(new File("test.dot"), 0); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java index 10c1cc00d..ed91cccb3 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java @@ -48,8 +48,12 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; @@ -145,7 +149,136 @@ public class ReadThreadingGraphUnitTest extends BaseTest { } } - // TODO -- update to use determineKmerSizeAndNonUniques directly + @Test(enabled = !DEBUG) + public void testCyclesInGraph() { + + // b37 20:12655200-12655850 + final String ref = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTATACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG"; + + // SNP at 20:12655528 creates a cycle for small kmers + final String alt = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTGTACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG"; + + final List reads = new ArrayList<>(); + for ( int index = 0; index < alt.length() - 100; index += 20 ) + reads.add(ArtificialSAMUtils.createArtificialRead(Arrays.copyOfRange(alt.getBytes(), index, index + 100), Utils.dupBytes((byte) 30, 100), 100 + "M")); + + // test that there are cycles detected for small kmer + final ReadThreadingGraph rtgraph25 = new ReadThreadingGraph(25); + rtgraph25.addSequence("ref", ref.getBytes(), null, true); + for ( final GATKSAMRecord read : reads ) + rtgraph25.addRead(read); + rtgraph25.buildGraphIfNecessary(); + Assert.assertTrue(rtgraph25.hasCycles()); + + // test that there are no cycles detected for large kmer + final ReadThreadingGraph rtgraph75 = new ReadThreadingGraph(75); + rtgraph75.addSequence("ref", ref.getBytes(), null, true); + for ( final GATKSAMRecord read : reads ) + rtgraph75.addRead(read); + rtgraph75.buildGraphIfNecessary(); + Assert.assertFalse(rtgraph75.hasCycles()); + } + + @Test(enabled = !DEBUG) + public void testNsInReadsAreNotUsedForGraph() { + + final int length = 100; + final byte[] ref = Utils.dupBytes((byte)'A', length); + + final ReadThreadingGraph rtgraph = new ReadThreadingGraph(25); + rtgraph.addSequence("ref", ref, null, true); + + // add reads with Ns at any position + for ( int i = 0; i < length; i++ ) { + final byte[] bases = ref.clone(); + bases[i] = 'N'; + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, Utils.dupBytes((byte) 30, length), length + "M"); + rtgraph.addRead(read); + } + rtgraph.buildGraphIfNecessary(); + + final SeqGraph graph = rtgraph.convertToSequenceGraph(); + final KBestPaths pathFinder = new KBestPaths<>(false); + Assert.assertEquals(pathFinder.getKBestPaths(graph, length, graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex()).size(), 1); + } + + @DataProvider(name = "DanglingTails") + public Object[][] makeDanglingTailsData() { + List tests = new ArrayList(); + + // add 1M to the expected CIGAR because it includes the previous (common) base too + tests.add(new Object[]{"AAAAAAAAAA", "CAAA", "5M", true, 3}); // incomplete haplotype + tests.add(new Object[]{"AAAAAAAAAA", "CAAAAAAAAAA", "1M1I10M", true, 10}); // insertion + tests.add(new Object[]{"CCAAAAAAAAAA", "AAAAAAAAAA", "1M2D10M", true, 10}); // deletion + tests.add(new Object[]{"AAAAAAAA", "CAAAAAAA", "9M", true, 7}); // 1 snp + tests.add(new Object[]{"AAAAAAAA", "CAAGATAA", "9M", true, 2}); // several snps + tests.add(new Object[]{"AAAAA", "C", "1M4D1M", true, -1}); // funky SW alignment + tests.add(new Object[]{"AAAAA", "CA", "1M3D2M", true, 1}); // very little data + tests.add(new Object[]{"AAAAAAA", "CAAAAAC", "8M", true, -1}); // ends in mismatch + tests.add(new Object[]{"AAAAAA", "CGAAAACGAA", "1M2I4M2I2M", false, 0}); // alignment is too complex + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "DanglingTails", enabled = !DEBUG) + public void testDanglingTails(final String refEnd, + final String altEnd, + final String cigar, + final boolean cigarIsGood, + final int mergePointDistanceFromSink) { + + final int kmerSize = 15; + + // construct the haplotypes + final String commonPrefix = "AAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTT"; + final String ref = commonPrefix + refEnd; + final String alt = commonPrefix + altEnd; + + // create the graph and populate it + final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize); + rtgraph.addSequence("ref", ref.getBytes(), null, true); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(alt.getBytes(), Utils.dupBytes((byte) 30, alt.length()), alt.length() + "M"); + rtgraph.addRead(read); + rtgraph.buildGraphIfNecessary(); + + // confirm that we have just a single dangling tail + MultiDeBruijnVertex altSink = null; + for ( final MultiDeBruijnVertex v : rtgraph.vertexSet() ) { + if ( rtgraph.isSink(v) && !rtgraph.isReferenceNode(v) ) { + Assert.assertTrue(altSink == null, "We found more than one non-reference sink"); + altSink = v; + } + } + + Assert.assertTrue(altSink != null, "We did not find a non-reference sink"); + + // confirm that the SW alignment agrees with our expectations + final ReadThreadingGraph.DanglingTailMergeResult result = rtgraph.generateCigarAgainstReferencePath(altSink); + Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + result.cigar.toString()); + + // confirm that the goodness of the cigar agrees with our expectations + Assert.assertEquals(rtgraph.cigarIsOkayToMerge(result.cigar), cigarIsGood); + + // confirm that the tail merging works as expected + if ( cigarIsGood ) { + final int mergeResult = rtgraph.mergeDanglingTail(result); + Assert.assertTrue(mergeResult == 1 || mergePointDistanceFromSink == -1); + + // confirm that we created the appropriate edge + if ( mergePointDistanceFromSink >= 0 ) { + MultiDeBruijnVertex v = altSink; + for ( int i = 0; i < mergePointDistanceFromSink; i++ ) { + if ( rtgraph.inDegreeOf(v) != 1 ) + Assert.fail("Encountered vertex with multiple sources"); + v = rtgraph.getEdgeSource(rtgraph.incomingEdgeOf(v)); + } + Assert.assertTrue(rtgraph.outDegreeOf(v) > 1); + } + } + } + + +// TODO -- update to use determineKmerSizeAndNonUniques directly // @DataProvider(name = "KmerSizeData") // public Object[][] makeKmerSizeDataProvider() { // List tests = new ArrayList(); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index fa35e3f53..762ce4858 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -800,6 +800,23 @@ public final class AlignmentUtils { return new Cigar(elements); } + /** + * Removing a trailing deletion from the incoming cigar if present + * + * @param c the cigar we want to update + * @return a non-null Cigar + */ + @Requires("c != null") + @Ensures("result != null") + public static Cigar removeTrailingDeletions(final Cigar c) { + + final List elements = c.getCigarElements(); + if ( elements.get(elements.size() - 1).getOperator() != CigarOperator.D ) + return c; + + return new Cigar(elements.subList(0, elements.size() - 1)); + } + /** * Move the indel in a given cigar string one base to the left * diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java index 84c33d4a5..1abf9f836 100644 --- a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java +++ b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java @@ -118,6 +118,21 @@ public class SWPairwiseAlignment implements SmithWaterman { align(seq1,seq2); } + /** + * Create a new SW pairwise aligner + * + * After creating the object the two sequences are aligned with an internal call to align(seq1, seq2) + * + * @param seq1 the first sequence we want to align + * @param seq2 the second sequence we want to align + * @param strategy the overhang strategy to use + */ + public SWPairwiseAlignment(final byte[] seq1, final byte[] seq2, final OVERHANG_STRATEGY strategy) { + this(SWParameterSet.ORIGINAL_DEFAULT.parameters); + overhang_strategy = strategy; + align(seq1, seq2); + } + /** * Create a new SW pairwise aligner, without actually doing any alignment yet * diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java index e7d54c460..fbf0242a3 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java @@ -1033,5 +1033,12 @@ public class AlignmentUtilsUnitTest { Assert.assertEquals(AlignmentUtils.startsOrEndsWithInsertionOrDeletion(TextCigarCodec.getSingleton().decode(cigar)), expected); } + @Test(dataProvider = "StartsOrEndsWithInsertionOrDeletionData", enabled = true) + public void testRemoveTrailingDeletions(final String cigar, final boolean expected) { + final Cigar originalCigar = TextCigarCodec.getSingleton().decode(cigar); + final Cigar newCigar = AlignmentUtils.removeTrailingDeletions(originalCigar); + + Assert.assertEquals(originalCigar.equals(newCigar), !cigar.endsWith("D")); + } }