Fix FN problem stemming from sequence graphs that contain cycles.
Problem: The sequence graphs can get very complex and it's not enough just to test that any given read has non-unique kmers. Reads with variants can have kmers that match unique regions of the reference, and this causes cycles in the final sequence graph. Ultimately the problem is that kmers of 10/25 may not be large enough for these complex regions. Solution: We continue to try kmers of 10/25 but detect whether cycles exist; if so, we do not use them. If (and only if) we can't get usable graphs from the 10/25 kmers, then we start iterating over larger kmers until we either can generate a graph without cycles or attempt too many iterations.
This commit is contained in:
parent
210007cd09
commit
e4e7d39e2c
|
|
@ -266,6 +266,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
||||||
@Argument(fullName="kmerSize", shortName="kmerSize", doc="Kmer size to use in the read threading assembler", required = false)
|
@Argument(fullName="kmerSize", shortName="kmerSize", doc="Kmer size to use in the read threading assembler", required = false)
|
||||||
protected List<Integer> kmerSizes = Arrays.asList(10, 25);
|
protected List<Integer> kmerSizes = Arrays.asList(10, 25);
|
||||||
|
|
||||||
|
@Advanced
|
||||||
|
@Argument(fullName="dontIncreaseKmerSizesForCycles", shortName="dontIncreaseKmerSizesForCycles", doc="Should we disable the iterating over kmer sizes when graph cycles are detected?", required = false)
|
||||||
|
protected boolean dontIncreaseKmerSizesForCycles = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype
|
* Assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype
|
||||||
* considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the
|
* considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the
|
||||||
|
|
@ -520,7 +524,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
||||||
final int maxAllowedPathsForReadThreadingAssembler = Math.max(maxPathsPerSample * nSamples, MIN_PATHS_PER_GRAPH);
|
final int maxAllowedPathsForReadThreadingAssembler = Math.max(maxPathsPerSample * nSamples, MIN_PATHS_PER_GRAPH);
|
||||||
assemblyEngine = useDebruijnAssembler
|
assemblyEngine = useDebruijnAssembler
|
||||||
? new DeBruijnAssembler(minKmerForDebruijnAssembler, onlyUseKmerSizeForDebruijnAssembler)
|
? new DeBruijnAssembler(minKmerForDebruijnAssembler, onlyUseKmerSizeForDebruijnAssembler)
|
||||||
: new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes);
|
: new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes, dontIncreaseKmerSizesForCycles);
|
||||||
|
|
||||||
assemblyEngine.setErrorCorrectKmers(errorCorrectKmers);
|
assemblyEngine.setErrorCorrectKmers(errorCorrectKmers);
|
||||||
assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR);
|
assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR);
|
||||||
|
|
|
||||||
|
|
@ -49,6 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LocalAssemblyEngine;
|
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LocalAssemblyEngine;
|
||||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
|
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
|
||||||
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
|
||||||
|
|
@ -63,11 +64,14 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
||||||
|
|
||||||
private final static int DEFAULT_NUM_PATHS_PER_GRAPH = 128;
|
private final static int DEFAULT_NUM_PATHS_PER_GRAPH = 128;
|
||||||
private final static int GGA_MODE_ARTIFICIAL_COUNTS = 1000;
|
private final static int GGA_MODE_ARTIFICIAL_COUNTS = 1000;
|
||||||
|
private final static int KMER_SIZE_ITERATION_INCREASE = 10;
|
||||||
|
private final static int MAX_KMER_ITERATIONS_TO_ATTEMPT = 6;
|
||||||
|
|
||||||
/** The min and max kmer sizes to try when building the graph. */
|
/** The min and max kmer sizes to try when building the graph. */
|
||||||
private final List<Integer> kmerSizes;
|
private final List<Integer> kmerSizes;
|
||||||
private final int maxAllowedPathsForReadThreadingAssembler;
|
private final int maxAllowedPathsForReadThreadingAssembler;
|
||||||
|
|
||||||
|
private final boolean dontIncreaseKmerSizesForCycles;
|
||||||
private boolean requireReasonableNumberOfPaths = false;
|
private boolean requireReasonableNumberOfPaths = false;
|
||||||
protected boolean removePathsNotConnectedToRef = true;
|
protected boolean removePathsNotConnectedToRef = true;
|
||||||
private boolean justReturnRawGraph = false;
|
private boolean justReturnRawGraph = false;
|
||||||
|
|
@ -77,10 +81,15 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
||||||
this(DEFAULT_NUM_PATHS_PER_GRAPH, Arrays.asList(25));
|
this(DEFAULT_NUM_PATHS_PER_GRAPH, Arrays.asList(25));
|
||||||
}
|
}
|
||||||
|
|
||||||
public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List<Integer> kmerSizes) {
|
public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List<Integer> kmerSizes, final boolean dontIncreaseKmerSizesForCycles) {
|
||||||
super(maxAllowedPathsForReadThreadingAssembler);
|
super(maxAllowedPathsForReadThreadingAssembler);
|
||||||
this.kmerSizes = kmerSizes;
|
this.kmerSizes = kmerSizes;
|
||||||
this.maxAllowedPathsForReadThreadingAssembler = maxAllowedPathsForReadThreadingAssembler;
|
this.maxAllowedPathsForReadThreadingAssembler = maxAllowedPathsForReadThreadingAssembler;
|
||||||
|
this.dontIncreaseKmerSizesForCycles = dontIncreaseKmerSizesForCycles;
|
||||||
|
}
|
||||||
|
|
||||||
|
public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List<Integer> kmerSizes) {
|
||||||
|
this(maxAllowedPathsForReadThreadingAssembler, kmerSizes, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** for testing purposes */
|
/** for testing purposes */
|
||||||
|
|
@ -89,67 +98,99 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<SeqGraph> assemble( final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes ) {
|
public List<SeqGraph> assemble(final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes) {
|
||||||
final List<SeqGraph> graphs = new LinkedList<>();
|
final List<SeqGraph> graphs = new LinkedList<>();
|
||||||
|
|
||||||
|
// first, try using the requested kmer sizes
|
||||||
for ( final int kmerSize : kmerSizes ) {
|
for ( final int kmerSize : kmerSizes ) {
|
||||||
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly);
|
final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes);
|
||||||
|
if ( graph != null )
|
||||||
|
graphs.add(graph);
|
||||||
|
}
|
||||||
|
|
||||||
// add the reference sequence to the graph
|
// if none of those worked, iterate over larger sizes if allowed to do so
|
||||||
rtgraph.addSequence("ref", refHaplotype.getBases(), null, true);
|
if ( graphs.isEmpty() && !dontIncreaseKmerSizesForCycles ) {
|
||||||
|
int kmerSize = MathUtils.arrayMaxInt(kmerSizes) + KMER_SIZE_ITERATION_INCREASE;
|
||||||
// add the artificial GGA haplotypes to the graph
|
int numIterations = 1;
|
||||||
int hapCount = 0;
|
while ( graphs.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) {
|
||||||
for( final Haplotype h : activeAlleleHaplotypes ) {
|
final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes);
|
||||||
final int[] counts = new int[h.length()];
|
if ( graph != null )
|
||||||
Arrays.fill(counts, GGA_MODE_ARTIFICIAL_COUNTS);
|
graphs.add(graph);
|
||||||
rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), counts, false);
|
kmerSize += KMER_SIZE_ITERATION_INCREASE;
|
||||||
}
|
numIterations++;
|
||||||
|
|
||||||
// Next pull kmers out of every read and throw them on the graph
|
|
||||||
for( final GATKSAMRecord read : reads ) {
|
|
||||||
rtgraph.addRead(read);
|
|
||||||
}
|
|
||||||
|
|
||||||
// actually build the read threading graph
|
|
||||||
rtgraph.buildGraphIfNecessary();
|
|
||||||
printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot"));
|
|
||||||
|
|
||||||
// go through and prune all of the chains where all edges have <= pruneFactor. This must occur
|
|
||||||
// before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering
|
|
||||||
// tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1
|
|
||||||
rtgraph.pruneLowWeightChains(pruneFactor);
|
|
||||||
|
|
||||||
// look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if
|
|
||||||
// we can recover them by merging some N bases from the chain back into the reference uniquely, for
|
|
||||||
// N < kmerSize
|
|
||||||
if ( recoverDanglingTails ) rtgraph.recoverDanglingTails();
|
|
||||||
|
|
||||||
// remove all heading and trailing paths
|
|
||||||
if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef();
|
|
||||||
|
|
||||||
printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot"));
|
|
||||||
|
|
||||||
final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph();
|
|
||||||
|
|
||||||
// if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
|
|
||||||
if ( justReturnRawGraph ) return Collections.singletonList(initialSeqGraph);
|
|
||||||
|
|
||||||
if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
|
|
||||||
printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot"));
|
|
||||||
initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction
|
|
||||||
|
|
||||||
final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph);
|
|
||||||
if ( seqGraph != null ) {
|
|
||||||
if ( ! requireReasonableNumberOfPaths || reasonableNumberOfPaths(seqGraph) ) {
|
|
||||||
graphs.add(seqGraph);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return graphs;
|
return graphs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates the sequence graph for the given kmerSize
|
||||||
|
*
|
||||||
|
* @param reads reads to use
|
||||||
|
* @param refHaplotype reference haplotype
|
||||||
|
* @param kmerSize kmer size
|
||||||
|
* @param activeAlleleHaplotypes the GGA haplotypes to inject into the graph
|
||||||
|
* @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths)
|
||||||
|
*/
|
||||||
|
protected SeqGraph createGraph(final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final int kmerSize, final List<Haplotype> activeAlleleHaplotypes) {
|
||||||
|
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly);
|
||||||
|
|
||||||
|
// add the reference sequence to the graph
|
||||||
|
rtgraph.addSequence("ref", refHaplotype.getBases(), null, true);
|
||||||
|
|
||||||
|
// add the artificial GGA haplotypes to the graph
|
||||||
|
int hapCount = 0;
|
||||||
|
for ( final Haplotype h : activeAlleleHaplotypes ) {
|
||||||
|
final int[] counts = new int[h.length()];
|
||||||
|
Arrays.fill(counts, GGA_MODE_ARTIFICIAL_COUNTS);
|
||||||
|
rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), counts, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Next pull kmers out of every read and throw them on the graph
|
||||||
|
for( final GATKSAMRecord read : reads ) {
|
||||||
|
rtgraph.addRead(read);
|
||||||
|
}
|
||||||
|
|
||||||
|
// actually build the read threading graph
|
||||||
|
rtgraph.buildGraphIfNecessary();
|
||||||
|
|
||||||
|
// sanity check: make sure there are no cycles in the graph
|
||||||
|
if ( rtgraph.hasCycles() ) {
|
||||||
|
if ( debug ) logger.info("Not using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler because it contains a cycle");
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot"));
|
||||||
|
|
||||||
|
// go through and prune all of the chains where all edges have <= pruneFactor. This must occur
|
||||||
|
// before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering
|
||||||
|
// tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1
|
||||||
|
rtgraph.pruneLowWeightChains(pruneFactor);
|
||||||
|
|
||||||
|
// look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if
|
||||||
|
// we can recover them by merging some N bases from the chain back into the reference uniquely, for
|
||||||
|
// N < kmerSize
|
||||||
|
if ( recoverDanglingTails ) rtgraph.recoverDanglingTails();
|
||||||
|
|
||||||
|
// remove all heading and trailing paths
|
||||||
|
if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef();
|
||||||
|
|
||||||
|
printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot"));
|
||||||
|
|
||||||
|
final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph();
|
||||||
|
|
||||||
|
// if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
|
||||||
|
if ( justReturnRawGraph ) return initialSeqGraph;
|
||||||
|
|
||||||
|
if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
|
||||||
|
printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot"));
|
||||||
|
initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction
|
||||||
|
|
||||||
|
final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph);
|
||||||
|
return ( seqGraph != null && requireReasonableNumberOfPaths && !reasonableNumberOfPaths(seqGraph) ) ? null : seqGraph;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Did we find a reasonable number of paths in this graph?
|
* Did we find a reasonable number of paths in this graph?
|
||||||
* @param graph
|
* @param graph
|
||||||
|
|
|
||||||
|
|
@ -54,6 +54,7 @@ import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
import org.broadinstitute.sting.utils.collections.PrimitivePair;
|
import org.broadinstitute.sting.utils.collections.PrimitivePair;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
import org.jgrapht.EdgeFactory;
|
import org.jgrapht.EdgeFactory;
|
||||||
|
import org.jgrapht.alg.CycleDetector;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
@ -297,7 +298,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
||||||
public void buildGraphIfNecessary() {
|
public void buildGraphIfNecessary() {
|
||||||
if ( alreadyBuilt ) return;
|
if ( alreadyBuilt ) return;
|
||||||
|
|
||||||
// determine the kmer size we'll uses, and capture the set of nonUniques for that kmer size
|
// determine the kmer size we'll use, and capture the set of nonUniques for that kmer size
|
||||||
final NonUniqueResult result = determineKmerSizeAndNonUniques(kmerSize, kmerSize);
|
final NonUniqueResult result = determineKmerSizeAndNonUniques(kmerSize, kmerSize);
|
||||||
nonUniqueKmers = result.nonUniques;
|
nonUniqueKmers = result.nonUniques;
|
||||||
|
|
||||||
|
|
@ -321,6 +322,13 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
||||||
alreadyBuilt = true;
|
alreadyBuilt = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return true if the graph has cycles, false otherwise
|
||||||
|
*/
|
||||||
|
public boolean hasCycles() {
|
||||||
|
return new CycleDetector<>(this).detectCycles();
|
||||||
|
}
|
||||||
|
|
||||||
public void recoverDanglingTails() {
|
public void recoverDanglingTails() {
|
||||||
if ( ! alreadyBuilt ) throw new IllegalStateException("recoverDanglingTails requires the graph be already built");
|
if ( ! alreadyBuilt ) throw new IllegalStateException("recoverDanglingTails requires the graph be already built");
|
||||||
|
|
||||||
|
|
@ -409,7 +417,8 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
||||||
private Collection<Kmer> determineNonUniqueKmers(final SequenceForKmers seqForKmers, final int kmerSize) {
|
private Collection<Kmer> determineNonUniqueKmers(final SequenceForKmers seqForKmers, final int kmerSize) {
|
||||||
// count up occurrences of kmers within each read
|
// count up occurrences of kmers within each read
|
||||||
final KMerCounter counter = new KMerCounter(kmerSize);
|
final KMerCounter counter = new KMerCounter(kmerSize);
|
||||||
for ( int i = 0; i <= seqForKmers.stop - kmerSize; i++ ) {
|
final int stopPosition = seqForKmers.stop - kmerSize;
|
||||||
|
for ( int i = 0; i <= stopPosition; i++ ) {
|
||||||
final Kmer kmer = new Kmer(seqForKmers.sequence, i, kmerSize);
|
final Kmer kmer = new Kmer(seqForKmers.sequence, i, kmerSize);
|
||||||
counter.addKmer(kmer, 1);
|
counter.addKmer(kmer, 1);
|
||||||
}
|
}
|
||||||
|
|
@ -578,7 +587,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
||||||
|
|
||||||
// none of our outgoing edges had our unique suffix base, so we check for an opportunity to merge back in
|
// none of our outgoing edges had our unique suffix base, so we check for an opportunity to merge back in
|
||||||
final Kmer kmer = new Kmer(sequence, kmerStart, kmerSize);
|
final Kmer kmer = new Kmer(sequence, kmerStart, kmerSize);
|
||||||
MultiDeBruijnVertex uniqueMergeVertex = getUniqueKmerVertex(kmer, false);
|
final MultiDeBruijnVertex uniqueMergeVertex = getUniqueKmerVertex(kmer, false);
|
||||||
|
|
||||||
if ( isRef && uniqueMergeVertex != null )
|
if ( isRef && uniqueMergeVertex != null )
|
||||||
throw new IllegalStateException("Found a unique vertex to merge into the reference graph " + prevVertex + " -> " + uniqueMergeVertex);
|
throw new IllegalStateException("Found a unique vertex to merge into the reference graph " + prevVertex + " -> " + uniqueMergeVertex);
|
||||||
|
|
@ -590,7 +599,8 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the longest stretch of high quality bases in read and pass that sequence to the graph
|
* Add the given read to the sequence graph. Ultimately the read will get sent through addSequence(), but first
|
||||||
|
* this method ensures we only use high quality bases and accounts for reduced reads, etc.
|
||||||
*
|
*
|
||||||
* @param read a non-null read
|
* @param read a non-null read
|
||||||
*/
|
*/
|
||||||
|
|
|
||||||
|
|
@ -49,6 +49,11 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
|
||||||
import org.broadinstitute.sting.BaseTest;
|
import org.broadinstitute.sting.BaseTest;
|
||||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
|
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
|
||||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
|
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||||
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
import org.testng.Assert;
|
import org.testng.Assert;
|
||||||
import org.testng.annotations.Test;
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
|
@ -145,6 +150,36 @@ public class ReadThreadingGraphUnitTest extends BaseTest {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test(enabled = !DEBUG)
|
||||||
|
public void testCyclesInGraph() {
|
||||||
|
|
||||||
|
// b37 20:12655200-12655850
|
||||||
|
final String ref = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTATACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG";
|
||||||
|
|
||||||
|
// SNP at 20:12655528 creates a cycle for small kmers
|
||||||
|
final String alt = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTGTACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG";
|
||||||
|
|
||||||
|
final List<GATKSAMRecord> reads = new ArrayList<>();
|
||||||
|
for ( int index = 0; index < alt.length() - 100; index += 20 )
|
||||||
|
reads.add(ArtificialSAMUtils.createArtificialRead(Arrays.copyOfRange(alt.getBytes(), index, index + 100), Utils.dupBytes((byte) 30, 100), 100 + "M"));
|
||||||
|
|
||||||
|
// test that there are cycles detected for small kmer
|
||||||
|
final ReadThreadingGraph rtgraph25 = new ReadThreadingGraph(25);
|
||||||
|
rtgraph25.addSequence("ref", ref.getBytes(), null, true);
|
||||||
|
for ( final GATKSAMRecord read : reads )
|
||||||
|
rtgraph25.addRead(read);
|
||||||
|
rtgraph25.buildGraphIfNecessary();
|
||||||
|
Assert.assertTrue(rtgraph25.hasCycles());
|
||||||
|
|
||||||
|
// test that there are no cycles detected for large kmer
|
||||||
|
final ReadThreadingGraph rtgraph75 = new ReadThreadingGraph(75);
|
||||||
|
rtgraph75.addSequence("ref", ref.getBytes(), null, true);
|
||||||
|
for ( final GATKSAMRecord read : reads )
|
||||||
|
rtgraph75.addRead(read);
|
||||||
|
rtgraph75.buildGraphIfNecessary();
|
||||||
|
Assert.assertFalse(rtgraph75.hasCycles());
|
||||||
|
}
|
||||||
|
|
||||||
// TODO -- update to use determineKmerSizeAndNonUniques directly
|
// TODO -- update to use determineKmerSizeAndNonUniques directly
|
||||||
// @DataProvider(name = "KmerSizeData")
|
// @DataProvider(name = "KmerSizeData")
|
||||||
// public Object[][] makeKmerSizeDataProvider() {
|
// public Object[][] makeKmerSizeDataProvider() {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue