Merge pull request #268 from broadinstitute/eb_calling_accuracy_improvements_to_HC
Eb calling accuracy improvements to hc
This commit is contained in:
commit
e1fd3dff9a
|
|
@ -266,6 +266,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
||||||
@Argument(fullName="kmerSize", shortName="kmerSize", doc="Kmer size to use in the read threading assembler", required = false)
|
@Argument(fullName="kmerSize", shortName="kmerSize", doc="Kmer size to use in the read threading assembler", required = false)
|
||||||
protected List<Integer> kmerSizes = Arrays.asList(10, 25);
|
protected List<Integer> kmerSizes = Arrays.asList(10, 25);
|
||||||
|
|
||||||
|
@Advanced
|
||||||
|
@Argument(fullName="dontIncreaseKmerSizesForCycles", shortName="dontIncreaseKmerSizesForCycles", doc="Should we disable the iterating over kmer sizes when graph cycles are detected?", required = false)
|
||||||
|
protected boolean dontIncreaseKmerSizesForCycles = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype
|
* Assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype
|
||||||
* considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the
|
* considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the
|
||||||
|
|
@ -332,7 +336,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
||||||
*/
|
*/
|
||||||
@Advanced
|
@Advanced
|
||||||
@Argument(fullName="phredScaledGlobalReadMismappingRate", shortName="globalMAPQ", doc="The global assumed mismapping rate for reads", required = false)
|
@Argument(fullName="phredScaledGlobalReadMismappingRate", shortName="globalMAPQ", doc="The global assumed mismapping rate for reads", required = false)
|
||||||
protected int phredScaledGlobalReadMismappingRate = 60;
|
protected int phredScaledGlobalReadMismappingRate = 45;
|
||||||
|
|
||||||
@Advanced
|
@Advanced
|
||||||
@Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false)
|
@Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false)
|
||||||
|
|
@ -535,7 +539,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
||||||
final int maxAllowedPathsForReadThreadingAssembler = Math.max(maxPathsPerSample * nSamples, MIN_PATHS_PER_GRAPH);
|
final int maxAllowedPathsForReadThreadingAssembler = Math.max(maxPathsPerSample * nSamples, MIN_PATHS_PER_GRAPH);
|
||||||
assemblyEngine = useDebruijnAssembler
|
assemblyEngine = useDebruijnAssembler
|
||||||
? new DeBruijnAssembler(minKmerForDebruijnAssembler, onlyUseKmerSizeForDebruijnAssembler)
|
? new DeBruijnAssembler(minKmerForDebruijnAssembler, onlyUseKmerSizeForDebruijnAssembler)
|
||||||
: new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes);
|
: new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes, dontIncreaseKmerSizesForCycles);
|
||||||
|
|
||||||
assemblyEngine.setErrorCorrectKmers(errorCorrectKmers);
|
assemblyEngine.setErrorCorrectKmers(errorCorrectKmers);
|
||||||
assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR);
|
assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR);
|
||||||
|
|
@ -705,11 +709,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
||||||
//logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads");
|
//logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads");
|
||||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( assemblyResult.haplotypes, splitReadsBySample( assemblyResult.regionForGenotyping.getReads() ) );
|
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( assemblyResult.haplotypes, splitReadsBySample( assemblyResult.regionForGenotyping.getReads() ) );
|
||||||
|
|
||||||
// subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes )
|
// Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there
|
||||||
final List<Haplotype> bestHaplotypes = selectBestHaplotypesForGenotyping(assemblyResult.haplotypes, stratifiedReadMap);
|
// was a bad interaction between that selection and the marginalization that happens over each event when computing
|
||||||
|
// GLs. In particular, for samples that are heterozygous non-reference (B/C) the marginalization for B treats the
|
||||||
|
// haplotype containing C as reference (and vice versa). Now this is fine if all possible haplotypes are included
|
||||||
|
// in the genotyping, but we lose information if we select down to a few haplotypes. [EB]
|
||||||
|
|
||||||
final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine,
|
final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine,
|
||||||
bestHaplotypes,
|
assemblyResult.haplotypes,
|
||||||
stratifiedReadMap,
|
stratifiedReadMap,
|
||||||
perSampleFilteredReadList,
|
perSampleFilteredReadList,
|
||||||
assemblyResult.fullReferenceWithPadding,
|
assemblyResult.fullReferenceWithPadding,
|
||||||
|
|
@ -722,7 +729,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
||||||
// TODO -- must disable if we are doing NCT, or set the output type of ! presorted
|
// TODO -- must disable if we are doing NCT, or set the output type of ! presorted
|
||||||
if ( bamWriter != null ) {
|
if ( bamWriter != null ) {
|
||||||
haplotypeBAMWriter.writeReadsAlignedToHaplotypes(assemblyResult.haplotypes, assemblyResult.paddedReferenceLoc,
|
haplotypeBAMWriter.writeReadsAlignedToHaplotypes(assemblyResult.haplotypes, assemblyResult.paddedReferenceLoc,
|
||||||
bestHaplotypes,
|
assemblyResult.haplotypes,
|
||||||
calledHaplotypes.getCalledHaplotypes(),
|
calledHaplotypes.getCalledHaplotypes(),
|
||||||
stratifiedReadMap);
|
stratifiedReadMap);
|
||||||
}
|
}
|
||||||
|
|
@ -879,21 +886,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
||||||
return new AssemblyResult(trimmedHaplotypes, trimmedActiveRegion, fullReferenceWithPadding, paddedReferenceLoc, true);
|
return new AssemblyResult(trimmedHaplotypes, trimmedActiveRegion, fullReferenceWithPadding, paddedReferenceLoc, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Select the best N haplotypes according to their likelihoods, if appropriate
|
|
||||||
*
|
|
||||||
* @param haplotypes a list of haplotypes to consider
|
|
||||||
* @param stratifiedReadMap a map from samples -> read likelihoods
|
|
||||||
* @return the list of haplotypes to genotype
|
|
||||||
*/
|
|
||||||
protected List<Haplotype> selectBestHaplotypesForGenotyping(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) {
|
|
||||||
if ( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
|
||||||
return haplotypes;
|
|
||||||
} else {
|
|
||||||
return likelihoodCalculationEngine.selectBestHaplotypesFromEachSample(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//---------------------------------------------------------------------------------------------------------------
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// reduce
|
// reduce
|
||||||
|
|
|
||||||
|
|
@ -233,7 +233,7 @@ public abstract class LocalAssemblyEngine {
|
||||||
returnHaplotypes.add(h);
|
returnHaplotypes.add(h);
|
||||||
|
|
||||||
if ( debug )
|
if ( debug )
|
||||||
logger.info("Adding haplotype " + h.getCigar() + " from debruijn graph with kmer " + graph.getKmerSize());
|
logger.info("Adding haplotype " + h.getCigar() + " from graph with kmer " + graph.getKmerSize());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -676,6 +676,24 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
|
||||||
'}';
|
'}';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The base sequence for the given path.
|
||||||
|
* Note, this assumes that the path does not start with a source node.
|
||||||
|
*
|
||||||
|
* @param path the list of vertexes that make up the path
|
||||||
|
* @return non-null sequence of bases corresponding to the given path
|
||||||
|
*/
|
||||||
|
@Ensures({"result != null"})
|
||||||
|
public byte[] getBasesForPath(final List<? extends DeBruijnVertex> path) {
|
||||||
|
if ( path == null ) throw new IllegalArgumentException("Path cannot be null");
|
||||||
|
|
||||||
|
final StringBuffer sb = new StringBuffer();
|
||||||
|
for ( final DeBruijnVertex v : path )
|
||||||
|
sb.append((char)v.getSuffix());
|
||||||
|
|
||||||
|
return sb.toString().getBytes();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the set of vertices within distance edges of source, regardless of edge direction
|
* Get the set of vertices within distance edges of source, regardless of edge direction
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -171,7 +171,15 @@ final public class GraphUtils {
|
||||||
return foundDup ? null : new PrimitivePair.Int(longestPos, length);
|
return foundDup ? null : new PrimitivePair.Int(longestPos, length);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static int longestSuffixMatch(final byte[] seq, final byte[] kmer, final int seqStart) {
|
/**
|
||||||
|
* calculates the longest suffix match between a sequence and a smaller kmer
|
||||||
|
*
|
||||||
|
* @param seq the (reference) sequence
|
||||||
|
* @param kmer the smaller kmer sequence
|
||||||
|
* @param seqStart the index (inclusive) on seq to start looking backwards from
|
||||||
|
* @return the longest matching suffix
|
||||||
|
*/
|
||||||
|
public static int longestSuffixMatch(final byte[] seq, final byte[] kmer, final int seqStart) {
|
||||||
for ( int len = 1; len <= kmer.length; len++ ) {
|
for ( int len = 1; len <= kmer.length; len++ ) {
|
||||||
final int seqI = seqStart - len + 1;
|
final int seqI = seqStart - len + 1;
|
||||||
final int kmerI = kmer.length - len;
|
final int kmerI = kmer.length - len;
|
||||||
|
|
|
||||||
|
|
@ -47,7 +47,6 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||||
|
|
||||||
import com.google.java.contract.Ensures;
|
import com.google.java.contract.Ensures;
|
||||||
import com.google.java.contract.Requires;
|
|
||||||
import net.sf.samtools.Cigar;
|
import net.sf.samtools.Cigar;
|
||||||
import net.sf.samtools.CigarElement;
|
import net.sf.samtools.CigarElement;
|
||||||
import net.sf.samtools.CigarOperator;
|
import net.sf.samtools.CigarOperator;
|
||||||
|
|
@ -92,7 +91,7 @@ public class Path<T extends BaseVertex, E extends BaseEdge> {
|
||||||
/**
|
/**
|
||||||
* Create a new Path containing no edges and starting at initialVertex
|
* Create a new Path containing no edges and starting at initialVertex
|
||||||
* @param initialVertex the starting vertex of the path
|
* @param initialVertex the starting vertex of the path
|
||||||
* @param graph the graph this path with follow through
|
* @param graph the graph this path will follow through
|
||||||
*/
|
*/
|
||||||
public Path(final T initialVertex, final BaseGraph<T, E> graph) {
|
public Path(final T initialVertex, final BaseGraph<T, E> graph) {
|
||||||
if ( initialVertex == null ) throw new IllegalArgumentException("initialVertex cannot be null");
|
if ( initialVertex == null ) throw new IllegalArgumentException("initialVertex cannot be null");
|
||||||
|
|
|
||||||
|
|
@ -49,12 +49,12 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LocalAssemblyEngine;
|
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LocalAssemblyEngine;
|
||||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
|
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
|
||||||
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
|
@ -63,11 +63,14 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
||||||
|
|
||||||
private final static int DEFAULT_NUM_PATHS_PER_GRAPH = 128;
|
private final static int DEFAULT_NUM_PATHS_PER_GRAPH = 128;
|
||||||
private final static int GGA_MODE_ARTIFICIAL_COUNTS = 1000;
|
private final static int GGA_MODE_ARTIFICIAL_COUNTS = 1000;
|
||||||
|
private final static int KMER_SIZE_ITERATION_INCREASE = 10;
|
||||||
|
private final static int MAX_KMER_ITERATIONS_TO_ATTEMPT = 6;
|
||||||
|
|
||||||
/** The min and max kmer sizes to try when building the graph. */
|
/** The min and max kmer sizes to try when building the graph. */
|
||||||
private final List<Integer> kmerSizes;
|
private final List<Integer> kmerSizes;
|
||||||
private final int maxAllowedPathsForReadThreadingAssembler;
|
private final int maxAllowedPathsForReadThreadingAssembler;
|
||||||
|
|
||||||
|
private final boolean dontIncreaseKmerSizesForCycles;
|
||||||
private boolean requireReasonableNumberOfPaths = false;
|
private boolean requireReasonableNumberOfPaths = false;
|
||||||
protected boolean removePathsNotConnectedToRef = true;
|
protected boolean removePathsNotConnectedToRef = true;
|
||||||
private boolean justReturnRawGraph = false;
|
private boolean justReturnRawGraph = false;
|
||||||
|
|
@ -77,10 +80,15 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
||||||
this(DEFAULT_NUM_PATHS_PER_GRAPH, Arrays.asList(25));
|
this(DEFAULT_NUM_PATHS_PER_GRAPH, Arrays.asList(25));
|
||||||
}
|
}
|
||||||
|
|
||||||
public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List<Integer> kmerSizes) {
|
public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List<Integer> kmerSizes, final boolean dontIncreaseKmerSizesForCycles) {
|
||||||
super(maxAllowedPathsForReadThreadingAssembler);
|
super(maxAllowedPathsForReadThreadingAssembler);
|
||||||
this.kmerSizes = kmerSizes;
|
this.kmerSizes = kmerSizes;
|
||||||
this.maxAllowedPathsForReadThreadingAssembler = maxAllowedPathsForReadThreadingAssembler;
|
this.maxAllowedPathsForReadThreadingAssembler = maxAllowedPathsForReadThreadingAssembler;
|
||||||
|
this.dontIncreaseKmerSizesForCycles = dontIncreaseKmerSizesForCycles;
|
||||||
|
}
|
||||||
|
|
||||||
|
public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List<Integer> kmerSizes) {
|
||||||
|
this(maxAllowedPathsForReadThreadingAssembler, kmerSizes, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** for testing purposes */
|
/** for testing purposes */
|
||||||
|
|
@ -89,67 +97,110 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<SeqGraph> assemble( final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes ) {
|
public List<SeqGraph> assemble(final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes) {
|
||||||
final List<SeqGraph> graphs = new LinkedList<>();
|
final List<SeqGraph> graphs = new LinkedList<>();
|
||||||
|
|
||||||
|
// first, try using the requested kmer sizes
|
||||||
for ( final int kmerSize : kmerSizes ) {
|
for ( final int kmerSize : kmerSizes ) {
|
||||||
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly);
|
final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, dontIncreaseKmerSizesForCycles);
|
||||||
|
if ( graph != null )
|
||||||
|
graphs.add(graph);
|
||||||
|
}
|
||||||
|
|
||||||
// add the reference sequence to the graph
|
// if none of those worked, iterate over larger sizes if allowed to do so
|
||||||
rtgraph.addSequence("ref", refHaplotype.getBases(), null, true);
|
if ( graphs.isEmpty() && !dontIncreaseKmerSizesForCycles ) {
|
||||||
|
int kmerSize = MathUtils.arrayMaxInt(kmerSizes) + KMER_SIZE_ITERATION_INCREASE;
|
||||||
// add the artificial GGA haplotypes to the graph
|
int numIterations = 1;
|
||||||
int hapCount = 0;
|
while ( graphs.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) {
|
||||||
for( final Haplotype h : activeAlleleHaplotypes ) {
|
// on the last attempt we will allow low complexity graphs
|
||||||
final int[] counts = new int[h.length()];
|
final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, numIterations == MAX_KMER_ITERATIONS_TO_ATTEMPT);
|
||||||
Arrays.fill(counts, GGA_MODE_ARTIFICIAL_COUNTS);
|
if ( graph != null )
|
||||||
rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), counts, false);
|
graphs.add(graph);
|
||||||
}
|
kmerSize += KMER_SIZE_ITERATION_INCREASE;
|
||||||
|
numIterations++;
|
||||||
// Next pull kmers out of every read and throw them on the graph
|
|
||||||
for( final GATKSAMRecord read : reads ) {
|
|
||||||
rtgraph.addRead(read);
|
|
||||||
}
|
|
||||||
|
|
||||||
// actually build the read threading graph
|
|
||||||
rtgraph.buildGraphIfNecessary();
|
|
||||||
printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot"));
|
|
||||||
|
|
||||||
// go through and prune all of the chains where all edges have <= pruneFactor. This must occur
|
|
||||||
// before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering
|
|
||||||
// tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1
|
|
||||||
rtgraph.pruneLowWeightChains(pruneFactor);
|
|
||||||
|
|
||||||
// look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if
|
|
||||||
// we can recover them by merging some N bases from the chain back into the reference uniquely, for
|
|
||||||
// N < kmerSize
|
|
||||||
if ( recoverDanglingTails ) rtgraph.recoverDanglingTails();
|
|
||||||
|
|
||||||
// remove all heading and trailing paths
|
|
||||||
if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef();
|
|
||||||
|
|
||||||
printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot"));
|
|
||||||
|
|
||||||
final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph();
|
|
||||||
|
|
||||||
// if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
|
|
||||||
if ( justReturnRawGraph ) return Collections.singletonList(initialSeqGraph);
|
|
||||||
|
|
||||||
if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
|
|
||||||
printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot"));
|
|
||||||
initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction
|
|
||||||
|
|
||||||
final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph);
|
|
||||||
if ( seqGraph != null ) {
|
|
||||||
if ( ! requireReasonableNumberOfPaths || reasonableNumberOfPaths(seqGraph) ) {
|
|
||||||
graphs.add(seqGraph);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return graphs;
|
return graphs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates the sequence graph for the given kmerSize
|
||||||
|
*
|
||||||
|
* @param reads reads to use
|
||||||
|
* @param refHaplotype reference haplotype
|
||||||
|
* @param kmerSize kmer size
|
||||||
|
* @param activeAlleleHaplotypes the GGA haplotypes to inject into the graph
|
||||||
|
* @param allowLowComplexityGraphs if true, do not check for low-complexity graphs
|
||||||
|
* @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths or is low complexity)
|
||||||
|
*/
|
||||||
|
protected SeqGraph createGraph(final List<GATKSAMRecord> reads,
|
||||||
|
final Haplotype refHaplotype,
|
||||||
|
final int kmerSize,
|
||||||
|
final List<Haplotype> activeAlleleHaplotypes,
|
||||||
|
final boolean allowLowComplexityGraphs) {
|
||||||
|
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly);
|
||||||
|
|
||||||
|
// add the reference sequence to the graph
|
||||||
|
rtgraph.addSequence("ref", refHaplotype.getBases(), null, true);
|
||||||
|
|
||||||
|
// add the artificial GGA haplotypes to the graph
|
||||||
|
int hapCount = 0;
|
||||||
|
for ( final Haplotype h : activeAlleleHaplotypes ) {
|
||||||
|
final int[] counts = new int[h.length()];
|
||||||
|
Arrays.fill(counts, GGA_MODE_ARTIFICIAL_COUNTS);
|
||||||
|
rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), counts, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Next pull kmers out of every read and throw them on the graph
|
||||||
|
for( final GATKSAMRecord read : reads ) {
|
||||||
|
rtgraph.addRead(read);
|
||||||
|
}
|
||||||
|
|
||||||
|
// actually build the read threading graph
|
||||||
|
rtgraph.buildGraphIfNecessary();
|
||||||
|
|
||||||
|
// sanity check: make sure there are no cycles in the graph
|
||||||
|
if ( rtgraph.hasCycles() ) {
|
||||||
|
if ( debug ) logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it contains a cycle");
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// sanity check: make sure the graph had enough complexity with the given kmer
|
||||||
|
if ( ! allowLowComplexityGraphs && rtgraph.isLowComplexity() ) {
|
||||||
|
if ( debug ) logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it does not produce a graph with enough complexity");
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot"));
|
||||||
|
|
||||||
|
// go through and prune all of the chains where all edges have <= pruneFactor. This must occur
|
||||||
|
// before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering
|
||||||
|
// tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1
|
||||||
|
rtgraph.pruneLowWeightChains(pruneFactor);
|
||||||
|
|
||||||
|
// look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if
|
||||||
|
// we can recover them by merging some N bases from the chain back into the reference
|
||||||
|
if ( recoverDanglingTails ) rtgraph.recoverDanglingTails();
|
||||||
|
|
||||||
|
// remove all heading and trailing paths
|
||||||
|
if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef();
|
||||||
|
|
||||||
|
printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot"));
|
||||||
|
|
||||||
|
final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph();
|
||||||
|
|
||||||
|
// if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
|
||||||
|
if ( justReturnRawGraph ) return initialSeqGraph;
|
||||||
|
|
||||||
|
if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
|
||||||
|
printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot"));
|
||||||
|
initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction
|
||||||
|
|
||||||
|
final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph);
|
||||||
|
return ( seqGraph != null && requireReasonableNumberOfPaths && !reasonableNumberOfPaths(seqGraph) ) ? null : seqGraph;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Did we find a reasonable number of paths in this graph?
|
* Did we find a reasonable number of paths in this graph?
|
||||||
* @param graph
|
* @param graph
|
||||||
|
|
|
||||||
|
|
@ -46,14 +46,21 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
|
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
|
||||||
|
|
||||||
|
import net.sf.samtools.Cigar;
|
||||||
|
import net.sf.samtools.CigarElement;
|
||||||
|
import net.sf.samtools.CigarOperator;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerCounter;
|
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerCounter;
|
||||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
|
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
|
||||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
|
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
import org.broadinstitute.sting.utils.collections.Pair;
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
import org.broadinstitute.sting.utils.collections.PrimitivePair;
|
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
|
||||||
|
import org.broadinstitute.sting.utils.smithwaterman.SmithWaterman;
|
||||||
import org.jgrapht.EdgeFactory;
|
import org.jgrapht.EdgeFactory;
|
||||||
|
import org.jgrapht.alg.CycleDetector;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
@ -78,9 +85,6 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
||||||
/** for debugging info printing */
|
/** for debugging info printing */
|
||||||
private static int counter = 0;
|
private static int counter = 0;
|
||||||
|
|
||||||
/** we require at least this many bases to be uniquely matching to merge a dangling tail */
|
|
||||||
private final static int MIN_MATCH_LENGTH_TO_RECOVER_DANGLING_TAIL = 5;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sequences added for read threading before we've actually built the graph
|
* Sequences added for read threading before we've actually built the graph
|
||||||
*/
|
*/
|
||||||
|
|
@ -94,7 +98,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
||||||
/**
|
/**
|
||||||
* A map from kmers -> their corresponding vertex in the graph
|
* A map from kmers -> their corresponding vertex in the graph
|
||||||
*/
|
*/
|
||||||
private Map<Kmer, MultiDeBruijnVertex> uniqueKmers = new LinkedHashMap<Kmer, MultiDeBruijnVertex>();
|
private Map<Kmer, MultiDeBruijnVertex> uniqueKmers = new LinkedHashMap<>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
|
|
@ -111,8 +115,6 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
||||||
// --------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------
|
||||||
private Kmer refSource;
|
private Kmer refSource;
|
||||||
private boolean alreadyBuilt;
|
private boolean alreadyBuilt;
|
||||||
byte[] refSeq;
|
|
||||||
MultiDeBruijnVertex[] refKmers;
|
|
||||||
|
|
||||||
public ReadThreadingGraph() {
|
public ReadThreadingGraph() {
|
||||||
this(25, false, (byte)6);
|
this(25, false, (byte)6);
|
||||||
|
|
@ -146,8 +148,6 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
||||||
uniqueKmers.clear();
|
uniqueKmers.clear();
|
||||||
refSource = null;
|
refSource = null;
|
||||||
alreadyBuilt = false;
|
alreadyBuilt = false;
|
||||||
refSeq = null;
|
|
||||||
refKmers = null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -224,13 +224,10 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
||||||
|
|
||||||
if ( debugGraphTransformations ) startingVertex.addRead(seqForKmers.name);
|
if ( debugGraphTransformations ) startingVertex.addRead(seqForKmers.name);
|
||||||
|
|
||||||
// keep track of information about the reference kmers for merging dangling tails
|
// keep track of information about the reference source
|
||||||
if ( seqForKmers.isRef ) {
|
if ( seqForKmers.isRef ) {
|
||||||
if ( refSource != null ) throw new IllegalStateException("Found two refSources! prev " + refSource + " new is " + startingVertex);
|
if ( refSource != null ) throw new IllegalStateException("Found two refSources! prev: " + refSource + ", new: " + startingVertex);
|
||||||
refSource = new Kmer(seqForKmers.sequence, seqForKmers.start, kmerSize);
|
refSource = new Kmer(seqForKmers.sequence, seqForKmers.start, kmerSize);
|
||||||
refSeq = seqForKmers.sequence;
|
|
||||||
refKmers = new MultiDeBruijnVertex[refSeq.length];
|
|
||||||
for ( int i = 0; i < kmerSize; i++ ) refKmers[i] = null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// loop over all of the bases in sequence, extending the graph by one base at each point, as appropriate
|
// loop over all of the bases in sequence, extending the graph by one base at each point, as appropriate
|
||||||
|
|
@ -240,54 +237,161 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
||||||
|
|
||||||
vertex = extendChainByOne(vertex, seqForKmers.sequence, i, count, seqForKmers.isRef);
|
vertex = extendChainByOne(vertex, seqForKmers.sequence, i, count, seqForKmers.isRef);
|
||||||
if ( debugGraphTransformations ) vertex.addRead(seqForKmers.name);
|
if ( debugGraphTransformations ) vertex.addRead(seqForKmers.name);
|
||||||
|
|
||||||
// keep track of the reference kmers for merging dangling tails
|
|
||||||
if ( seqForKmers.isRef ) refKmers[i + kmerSize - 1] = vertex;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Attempt to attach vertex with out-degree == 0 to the graph by finding a unique matching kmer to the reference
|
* Class to keep track of the important dangling tail merging data
|
||||||
|
*/
|
||||||
|
protected final class DanglingTailMergeResult {
|
||||||
|
final List<MultiDeBruijnVertex> danglingPath, referencePath;
|
||||||
|
final byte[] danglingPathString, referencePathString;
|
||||||
|
final Cigar cigar;
|
||||||
|
|
||||||
|
public DanglingTailMergeResult(final List<MultiDeBruijnVertex> danglingPath,
|
||||||
|
final List<MultiDeBruijnVertex> referencePath,
|
||||||
|
final byte[] danglingPathString,
|
||||||
|
final byte[] referencePathString,
|
||||||
|
final Cigar cigar) {
|
||||||
|
this.danglingPath = danglingPath;
|
||||||
|
this.referencePath = referencePath;
|
||||||
|
this.danglingPathString = danglingPathString;
|
||||||
|
this.referencePathString = referencePathString;
|
||||||
|
this.cigar = cigar;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Attempt to attach vertex with out-degree == 0 to the graph
|
||||||
|
*
|
||||||
* @param vertex the vertex to recover
|
* @param vertex the vertex to recover
|
||||||
|
* @return 1 if we successfully recovered the vertex and 0 otherwise
|
||||||
*/
|
*/
|
||||||
protected int recoverDanglingChain(final MultiDeBruijnVertex vertex) {
|
protected int recoverDanglingChain(final MultiDeBruijnVertex vertex) {
|
||||||
if ( outDegreeOf(vertex) != 0 ) throw new IllegalStateException("Attempting to recover a dangling tail for " + vertex + " but it has out-degree > 0");
|
if ( outDegreeOf(vertex) != 0 ) throw new IllegalStateException("Attempting to recover a dangling tail for " + vertex + " but it has out-degree > 0");
|
||||||
|
|
||||||
final byte[] kmer = vertex.getSequence();
|
// generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths
|
||||||
if ( ! nonUniqueKmers.contains(new Kmer(kmer)) ) {
|
final DanglingTailMergeResult danglingTailMergeResult = generateCigarAgainstReferencePath(vertex);
|
||||||
// don't attempt to fix non-unique kmers!
|
|
||||||
final MultiDeBruijnVertex uniqueMergePoint = danglingTailMergePoint(kmer);
|
|
||||||
if ( uniqueMergePoint != null ) {
|
|
||||||
addEdge(vertex, uniqueMergePoint, new MultiSampleEdge(false, 1));
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
// if the CIGAR is too complex (or couldn't be computed) then we do not allow the merge into the reference path
|
||||||
|
if ( danglingTailMergeResult == null || ! cigarIsOkayToMerge(danglingTailMergeResult.cigar) )
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
// merge
|
||||||
|
return mergeDanglingTail(danglingTailMergeResult);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Find a unique merge point for kmer in the reference sequence
|
* Determine whether the provided cigar is okay to merge into the reference path
|
||||||
* @param kmer the full kmer of the dangling tail
|
*
|
||||||
* @return a vertex appropriate to merge kmer into, or null if none could be found
|
* @param cigar the cigar to analyze
|
||||||
|
* @return true if it's okay to merge, false otherwise
|
||||||
*/
|
*/
|
||||||
private MultiDeBruijnVertex danglingTailMergePoint(final byte[] kmer) {
|
protected boolean cigarIsOkayToMerge(final Cigar cigar) {
|
||||||
final PrimitivePair.Int endAndLength = GraphUtils.findLongestUniqueSuffixMatch(refSeq, kmer);
|
|
||||||
if ( endAndLength != null && endAndLength.second >= MIN_MATCH_LENGTH_TO_RECOVER_DANGLING_TAIL && endAndLength.first + 1 < refKmers.length) {
|
final List<CigarElement> elements = cigar.getCigarElements();
|
||||||
final int len = endAndLength.second;
|
|
||||||
final MultiDeBruijnVertex mergePoint = refKmers[endAndLength.first + 1];
|
// don't allow more than a couple of different ops
|
||||||
// logger.info("recoverDanglingChain of kmer " + new String(kmer) + " merged to " + mergePoint + " with match size " + len);
|
if ( elements.size() > 3 )
|
||||||
final Set<Kmer> nonUniquesAtLength = determineKmerSizeAndNonUniques(len, len).nonUniques;
|
return false;
|
||||||
final Kmer matchedKmer = new Kmer(kmer, kmer.length - len, len);
|
|
||||||
if ( nonUniquesAtLength.contains(matchedKmer) ) {
|
// the last element must be an M
|
||||||
// logger.info("Rejecting merge " + new String(kmer) + " because match kmer " + matchedKmer + " isn't unique across all reads");
|
if ( elements.get(elements.size() - 1).getOperator() != CigarOperator.M )
|
||||||
return null;
|
return false;
|
||||||
} else {
|
|
||||||
return mergePoint;
|
// TODO -- do we want to check whether the Ms mismatch too much also?
|
||||||
}
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Actually merge the dangling tail if possible
|
||||||
|
*
|
||||||
|
* @param danglingTailMergeResult the result from generating a Cigar for the dangling tail against the reference
|
||||||
|
* @return 1 if merge was successful, 0 otherwise
|
||||||
|
*/
|
||||||
|
protected int mergeDanglingTail(final DanglingTailMergeResult danglingTailMergeResult) {
|
||||||
|
|
||||||
|
final List<CigarElement> elements = danglingTailMergeResult.cigar.getCigarElements();
|
||||||
|
final CigarElement lastElement = elements.get(elements.size() - 1);
|
||||||
|
if ( lastElement.getOperator() != CigarOperator.M )
|
||||||
|
throw new IllegalArgumentException("The last Cigar element must be an M");
|
||||||
|
|
||||||
|
final int lastRefIndex = danglingTailMergeResult.cigar.getReferenceLength() - 1;
|
||||||
|
final int matchingSuffix = Math.min(GraphUtils.longestSuffixMatch(danglingTailMergeResult.referencePathString, danglingTailMergeResult.danglingPathString, lastRefIndex), lastElement.getLength());
|
||||||
|
if ( matchingSuffix == 0 )
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
final int altIndexToMerge = Math.max(danglingTailMergeResult.cigar.getReadLength() - matchingSuffix - 1, 0);
|
||||||
|
final int refIndexToMerge = lastRefIndex - matchingSuffix + 1;
|
||||||
|
addEdge(danglingTailMergeResult.danglingPath.get(altIndexToMerge), danglingTailMergeResult.referencePath.get(refIndexToMerge), new MultiSampleEdge(false, 1));
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the
|
||||||
|
* provided vertex is the sink) and the reference path.
|
||||||
|
*
|
||||||
|
* @param vertex the sink of the dangling tail
|
||||||
|
* @return a SmithWaterman object which can be null if no proper alignment could be generated
|
||||||
|
*/
|
||||||
|
protected DanglingTailMergeResult generateCigarAgainstReferencePath(final MultiDeBruijnVertex vertex) {
|
||||||
|
|
||||||
|
// find the lowest common ancestor path between vertex and the reference sink if available
|
||||||
|
final List<MultiDeBruijnVertex> altPath = findPathToLowestCommonAncestorOfReference(vertex);
|
||||||
|
if ( altPath == null )
|
||||||
|
return null;
|
||||||
|
|
||||||
|
// now get the reference path from the LCA
|
||||||
|
final List<MultiDeBruijnVertex> refPath = getReferencePath(altPath.get(0));
|
||||||
|
|
||||||
|
// create the Smith-Waterman strings to use
|
||||||
|
final byte[] refBases = getBasesForPath(refPath);
|
||||||
|
final byte[] altBases = getBasesForPath(altPath);
|
||||||
|
|
||||||
|
// run Smith-Waterman to determine the best alignment (and remove trailing deletions since they aren't interesting)
|
||||||
|
final SmithWaterman alignment = new SWPairwiseAlignment(refBases, altBases, SWPairwiseAlignment.OVERHANG_STRATEGY.INDEL);
|
||||||
|
return new DanglingTailMergeResult(altPath, refPath, altBases, refBases, AlignmentUtils.removeTrailingDeletions(alignment.getCigar()));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Finds the path upwards in the graph from this vertex to the reference sequence, including the lowest common ancestor vertex
|
||||||
|
*
|
||||||
|
* @param vertex the original vertex
|
||||||
|
* @return the path if it can be determined or null if this vertex either doesn't merge onto the reference path or
|
||||||
|
* has an ancestor with multiple incoming edges before hitting the reference path
|
||||||
|
*/
|
||||||
|
protected List<MultiDeBruijnVertex> findPathToLowestCommonAncestorOfReference(final MultiDeBruijnVertex vertex) {
|
||||||
|
final LinkedList<MultiDeBruijnVertex> path = new LinkedList<>();
|
||||||
|
|
||||||
|
MultiDeBruijnVertex v = vertex;
|
||||||
|
while ( ! isReferenceNode(v) && inDegreeOf(v) == 1 ) {
|
||||||
|
path.addFirst(v);
|
||||||
|
v = getEdgeSource(incomingEdgeOf(v));
|
||||||
|
}
|
||||||
|
path.addFirst(v);
|
||||||
|
|
||||||
|
return isReferenceNode(v) ? path : null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Finds the path downwards in the graph from this vertex to the reference sink, including this vertex
|
||||||
|
*
|
||||||
|
* @param start the reference vertex to start from
|
||||||
|
* @return the path (non-null, non-empty)
|
||||||
|
*/
|
||||||
|
protected List<MultiDeBruijnVertex> getReferencePath(final MultiDeBruijnVertex start) {
|
||||||
|
if ( ! isReferenceNode(start) ) throw new IllegalArgumentException("Cannot construct the reference path from a vertex that is not on that path");
|
||||||
|
|
||||||
|
final List<MultiDeBruijnVertex> path = new ArrayList<>();
|
||||||
|
|
||||||
|
MultiDeBruijnVertex v = start;
|
||||||
|
while ( v != null ) {
|
||||||
|
path.add(v);
|
||||||
|
v = getNextReferenceVertex(v);
|
||||||
}
|
}
|
||||||
|
|
||||||
return null;
|
return path;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -297,7 +401,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
||||||
public void buildGraphIfNecessary() {
|
public void buildGraphIfNecessary() {
|
||||||
if ( alreadyBuilt ) return;
|
if ( alreadyBuilt ) return;
|
||||||
|
|
||||||
// determine the kmer size we'll uses, and capture the set of nonUniques for that kmer size
|
// determine the kmer size we'll use, and capture the set of nonUniques for that kmer size
|
||||||
final NonUniqueResult result = determineKmerSizeAndNonUniques(kmerSize, kmerSize);
|
final NonUniqueResult result = determineKmerSizeAndNonUniques(kmerSize, kmerSize);
|
||||||
nonUniqueKmers = result.nonUniques;
|
nonUniqueKmers = result.nonUniques;
|
||||||
|
|
||||||
|
|
@ -321,6 +425,23 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
||||||
alreadyBuilt = true;
|
alreadyBuilt = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return true if the graph has cycles, false otherwise
|
||||||
|
*/
|
||||||
|
public boolean hasCycles() {
|
||||||
|
return new CycleDetector<>(this).detectCycles();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Does the graph not have enough complexity? We define low complexity as a situation where the number
|
||||||
|
* of non-unique kmers is more than 20% of the total number of kmers.
|
||||||
|
*
|
||||||
|
* @return true if the graph has low complexity, false otherwise
|
||||||
|
*/
|
||||||
|
public boolean isLowComplexity() {
|
||||||
|
return nonUniqueKmers.size() * 4 > uniqueKmers.size();
|
||||||
|
}
|
||||||
|
|
||||||
public void recoverDanglingTails() {
|
public void recoverDanglingTails() {
|
||||||
if ( ! alreadyBuilt ) throw new IllegalStateException("recoverDanglingTails requires the graph be already built");
|
if ( ! alreadyBuilt ) throw new IllegalStateException("recoverDanglingTails requires the graph be already built");
|
||||||
|
|
||||||
|
|
@ -332,7 +453,8 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
||||||
nRecovered += recoverDanglingChain(v);
|
nRecovered += recoverDanglingChain(v);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//logger.info("Recovered " + nRecovered + " of " + attempted + " dangling tails");
|
|
||||||
|
if ( debugGraphTransformations ) logger.info("Recovered " + nRecovered + " of " + attempted + " dangling tails");
|
||||||
}
|
}
|
||||||
|
|
||||||
/** structure that keeps track of the non-unique kmers for a given kmer size */
|
/** structure that keeps track of the non-unique kmers for a given kmer size */
|
||||||
|
|
@ -409,7 +531,8 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
||||||
private Collection<Kmer> determineNonUniqueKmers(final SequenceForKmers seqForKmers, final int kmerSize) {
|
private Collection<Kmer> determineNonUniqueKmers(final SequenceForKmers seqForKmers, final int kmerSize) {
|
||||||
// count up occurrences of kmers within each read
|
// count up occurrences of kmers within each read
|
||||||
final KMerCounter counter = new KMerCounter(kmerSize);
|
final KMerCounter counter = new KMerCounter(kmerSize);
|
||||||
for ( int i = 0; i <= seqForKmers.stop - kmerSize; i++ ) {
|
final int stopPosition = seqForKmers.stop - kmerSize;
|
||||||
|
for ( int i = 0; i <= stopPosition; i++ ) {
|
||||||
final Kmer kmer = new Kmer(seqForKmers.sequence, i, kmerSize);
|
final Kmer kmer = new Kmer(seqForKmers.sequence, i, kmerSize);
|
||||||
counter.addKmer(kmer, 1);
|
counter.addKmer(kmer, 1);
|
||||||
}
|
}
|
||||||
|
|
@ -578,7 +701,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
||||||
|
|
||||||
// none of our outgoing edges had our unique suffix base, so we check for an opportunity to merge back in
|
// none of our outgoing edges had our unique suffix base, so we check for an opportunity to merge back in
|
||||||
final Kmer kmer = new Kmer(sequence, kmerStart, kmerSize);
|
final Kmer kmer = new Kmer(sequence, kmerStart, kmerSize);
|
||||||
MultiDeBruijnVertex uniqueMergeVertex = getUniqueKmerVertex(kmer, false);
|
final MultiDeBruijnVertex uniqueMergeVertex = getUniqueKmerVertex(kmer, false);
|
||||||
|
|
||||||
if ( isRef && uniqueMergeVertex != null )
|
if ( isRef && uniqueMergeVertex != null )
|
||||||
throw new IllegalStateException("Found a unique vertex to merge into the reference graph " + prevVertex + " -> " + uniqueMergeVertex);
|
throw new IllegalStateException("Found a unique vertex to merge into the reference graph " + prevVertex + " -> " + uniqueMergeVertex);
|
||||||
|
|
@ -590,7 +713,8 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the longest stretch of high quality bases in read and pass that sequence to the graph
|
* Add the given read to the sequence graph. Ultimately the read will get sent through addSequence(), but first
|
||||||
|
* this method ensures we only use high quality bases and accounts for reduced reads, etc.
|
||||||
*
|
*
|
||||||
* @param read a non-null read
|
* @param read a non-null read
|
||||||
*/
|
*/
|
||||||
|
|
@ -601,7 +725,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
||||||
|
|
||||||
int lastGood = -1; // the index of the last good base we've seen
|
int lastGood = -1; // the index of the last good base we've seen
|
||||||
for( int end = 0; end <= sequence.length; end++ ) {
|
for( int end = 0; end <= sequence.length; end++ ) {
|
||||||
if ( end == sequence.length || qualities[end] < minBaseQualityToUseInAssembly ) {
|
if ( end == sequence.length || ! baseIsUsableForAssembly(sequence[end], qualities[end]) ) {
|
||||||
// the first good base is at lastGood, can be -1 if last base was bad
|
// the first good base is at lastGood, can be -1 if last base was bad
|
||||||
final int start = lastGood;
|
final int start = lastGood;
|
||||||
// the stop base is end - 1 (if we're not at the end of the sequence)
|
// the stop base is end - 1 (if we're not at the end of the sequence)
|
||||||
|
|
@ -621,6 +745,18 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines whether a base can safely be used for assembly.
|
||||||
|
* Currently disallows Ns and/or those with low quality
|
||||||
|
*
|
||||||
|
* @param base the base under consideration
|
||||||
|
* @param qual the quality of that base
|
||||||
|
* @return true if the base can be used for assembly, false otherwise
|
||||||
|
*/
|
||||||
|
protected boolean baseIsUsableForAssembly(final byte base, final byte qual) {
|
||||||
|
return base != BaseUtils.Base.N.base && qual >= minBaseQualityToUseInAssembly;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the set of non-unique kmers in this graph. For debugging purposes
|
* Get the set of non-unique kmers in this graph. For debugging purposes
|
||||||
* @return a non-null set of kmers
|
* @return a non-null set of kmers
|
||||||
|
|
|
||||||
|
|
@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testHaplotypeCallerMultiSampleComplex1() {
|
public void testHaplotypeCallerMultiSampleComplex1() {
|
||||||
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "8d7728909b1b8eb3f30f2f1583f054a8");
|
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "d21f15a5809fe5259af41ae6774af6f1");
|
||||||
}
|
}
|
||||||
|
|
||||||
private void HCTestSymbolicVariants(String bam, String args, String md5) {
|
private void HCTestSymbolicVariants(String bam, String args, String md5) {
|
||||||
|
|
@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
|
||||||
@Test
|
@Test
|
||||||
public void testHaplotypeCallerMultiSampleGGAComplex() {
|
public void testHaplotypeCallerMultiSampleGGAComplex() {
|
||||||
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
|
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
|
||||||
"db71826dc798ff1cdf0c5d05b0ede976");
|
"d4a0797c2fd4c103bf9a137633376156");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
|
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
|
||||||
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
|
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
|
||||||
"42831d5463552911b7da9de0b4a27289");
|
"a9872228d0275a30f5a1f7e070a9c9f4");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -78,12 +78,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testHaplotypeCallerMultiSample() {
|
public void testHaplotypeCallerMultiSample() {
|
||||||
HCTest(CEUTRIO_BAM, "", "1b15e4647013ab2c3ce7073c420d8640");
|
HCTest(CEUTRIO_BAM, "", "e9167a1bfc0fc276586788d1ce1be408");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testHaplotypeCallerSingleSample() {
|
public void testHaplotypeCallerSingleSample() {
|
||||||
HCTest(NA12878_BAM, "", "423be27dc2cf7fd10baf465cf93e18e2");
|
HCTest(NA12878_BAM, "", "b1d46afb9659ac3b92a3d131b58924ef");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(enabled = false) // can't annotate the rsID's yet
|
@Test(enabled = false) // can't annotate the rsID's yet
|
||||||
|
|
@ -94,7 +94,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testHaplotypeCallerMultiSampleGGA() {
|
public void testHaplotypeCallerMultiSampleGGA() {
|
||||||
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
|
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
|
||||||
"a28e6f14e28708283d61c1e423bbdcb1");
|
"d83856b8136776bd731a8037c16b71fa");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
@ -110,7 +110,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
|
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
|
||||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "8344d86751b707c53b296c297eba4bfa");
|
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "70c4476816f5d35c9978c378dbeac09b");
|
||||||
}
|
}
|
||||||
|
|
||||||
private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
|
private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
|
||||||
|
|
@ -147,7 +147,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testHaplotypeCallerNearbySmallIntervals() {
|
public void testHaplotypeCallerNearbySmallIntervals() {
|
||||||
HCTestNearbySmallIntervals(NA12878_BAM, "", "dea98f257d39fa1447a12c36a6bbf4a3");
|
HCTestNearbySmallIntervals(NA12878_BAM, "", "947aae309ecab7cd3f17ff9810884924");
|
||||||
}
|
}
|
||||||
|
|
||||||
// This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
|
// This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
|
||||||
|
|
@ -157,14 +157,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void HCTestProblematicReadsModifiedInActiveRegions() {
|
public void HCTestProblematicReadsModifiedInActiveRegions() {
|
||||||
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
|
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
|
||||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("7cd1c5e2642ae8ddf38932aba1f51d69"));
|
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a"));
|
||||||
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
|
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void HCTestStructuralIndels() {
|
public void HCTestStructuralIndels() {
|
||||||
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
|
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
|
||||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ee55ff4c6ec1bbef88e21cc0f45d4c47"));
|
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("91717e5e271742c2c9b67223e58f1320"));
|
||||||
executeTest("HCTestStructuralIndels: ", spec);
|
executeTest("HCTestStructuralIndels: ", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -186,7 +186,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
||||||
public void HCTestReducedBam() {
|
public void HCTestReducedBam() {
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||||
Arrays.asList("4886a98bf699f4e7f4491160749ada6a"));
|
Arrays.asList("0124c4923d96ec0f8222b596dd4ef534"));
|
||||||
executeTest("HC calling on a ReducedRead BAM", spec);
|
executeTest("HC calling on a ReducedRead BAM", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -194,7 +194,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
||||||
public void testReducedBamWithReadsNotFullySpanningDeletion() {
|
public void testReducedBamWithReadsNotFullySpanningDeletion() {
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
|
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
|
||||||
Arrays.asList("86bdd07a3ac4f6ce239c30efea8bf5ba"));
|
Arrays.asList("0e020dcfdf249225714f5cd86ed3869f"));
|
||||||
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
|
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -208,7 +208,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
||||||
public void HCTestDBSNPAnnotationWGS() {
|
public void HCTestDBSNPAnnotationWGS() {
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
|
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
|
||||||
Arrays.asList("7b23a288a31cafca3946f14f2381e7cb"));
|
Arrays.asList("446a786bb539f3ec2084dd75167568aa"));
|
||||||
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
|
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -61,7 +61,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
|
||||||
List<Object[]> tests = new ArrayList<Object[]>();
|
List<Object[]> tests = new ArrayList<Object[]>();
|
||||||
|
|
||||||
for ( final int nct : Arrays.asList(1, 2, 4) ) {
|
for ( final int nct : Arrays.asList(1, 2, 4) ) {
|
||||||
tests.add(new Object[]{nct, "c277fd65365d59b734260dd8423313bb"});
|
tests.add(new Object[]{nct, "ef42a438b82681d1c0f921c57e16ff12"});
|
||||||
}
|
}
|
||||||
|
|
||||||
return tests.toArray(new Object[][]{});
|
return tests.toArray(new Object[][]{});
|
||||||
|
|
|
||||||
|
|
@ -251,7 +251,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
|
||||||
for ( int snpPos = 0; snpPos < windowSize; snpPos++) {
|
for ( int snpPos = 0; snpPos < windowSize; snpPos++) {
|
||||||
if ( snpPos > excludeVariantsWithXbp && (windowSize - snpPos) >= excludeVariantsWithXbp ) {
|
if ( snpPos > excludeVariantsWithXbp && (windowSize - snpPos) >= excludeVariantsWithXbp ) {
|
||||||
final byte[] altBases = ref.getBytes();
|
final byte[] altBases = ref.getBytes();
|
||||||
altBases[snpPos] = 'N';
|
altBases[snpPos] = altBases[snpPos] == 'A' ? (byte)'C' : (byte)'A';
|
||||||
final String alt = new String(altBases);
|
final String alt = new String(altBases);
|
||||||
tests.add(new Object[]{"SNP at " + snpPos, assembler, refLoc, ref, alt});
|
tests.add(new Object[]{"SNP at " + snpPos, assembler, refLoc, ref, alt});
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -312,4 +312,19 @@ public class BaseGraphUnitTest extends BaseTest {
|
||||||
|
|
||||||
Assert.assertTrue(BaseGraph.graphEquals(graph, expectedGraph));
|
Assert.assertTrue(BaseGraph.graphEquals(graph, expectedGraph));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test(enabled = true)
|
||||||
|
public void testGetBases() {
|
||||||
|
|
||||||
|
final int kmerSize = 4;
|
||||||
|
final String testString = "AATGGGGGCAATACTA";
|
||||||
|
|
||||||
|
final List<DeBruijnVertex> vertexes = new ArrayList<>();
|
||||||
|
for ( int i = 0; i <= testString.length() - kmerSize; i++ ) {
|
||||||
|
vertexes.add(new DeBruijnVertex(testString.substring(i, i + kmerSize)));
|
||||||
|
}
|
||||||
|
|
||||||
|
final String result = new String(new DeBruijnGraph().getBasesForPath(vertexes));
|
||||||
|
Assert.assertEquals(result, testString.substring(kmerSize - 1));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -83,7 +83,8 @@ public class ReadThreadingAssemblerUnitTest extends BaseTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
public SeqGraph assemble() {
|
public SeqGraph assemble() {
|
||||||
assembler.removePathsNotConnectedToRef = false; // need to pass some of the tests
|
assembler.removePathsNotConnectedToRef = false; // needed to pass some of the tests
|
||||||
|
assembler.setRecoverDanglingTails(false); // needed to pass some of the tests
|
||||||
assembler.setDebugGraphTransformations(true);
|
assembler.setDebugGraphTransformations(true);
|
||||||
final SeqGraph graph = assembler.assemble(reads, refHaplotype, Collections.<Haplotype>emptyList()).get(0);
|
final SeqGraph graph = assembler.assemble(reads, refHaplotype, Collections.<Haplotype>emptyList()).get(0);
|
||||||
if ( DEBUG ) graph.printGraph(new File("test.dot"), 0);
|
if ( DEBUG ) graph.printGraph(new File("test.dot"), 0);
|
||||||
|
|
|
||||||
|
|
@ -48,8 +48,12 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
|
||||||
|
|
||||||
import org.broadinstitute.sting.BaseTest;
|
import org.broadinstitute.sting.BaseTest;
|
||||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
|
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
|
||||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
|
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||||
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
import org.testng.Assert;
|
import org.testng.Assert;
|
||||||
|
import org.testng.annotations.DataProvider;
|
||||||
import org.testng.annotations.Test;
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
|
@ -145,7 +149,136 @@ public class ReadThreadingGraphUnitTest extends BaseTest {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO -- update to use determineKmerSizeAndNonUniques directly
|
@Test(enabled = !DEBUG)
|
||||||
|
public void testCyclesInGraph() {
|
||||||
|
|
||||||
|
// b37 20:12655200-12655850
|
||||||
|
final String ref = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTATACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG";
|
||||||
|
|
||||||
|
// SNP at 20:12655528 creates a cycle for small kmers
|
||||||
|
final String alt = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTGTACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG";
|
||||||
|
|
||||||
|
final List<GATKSAMRecord> reads = new ArrayList<>();
|
||||||
|
for ( int index = 0; index < alt.length() - 100; index += 20 )
|
||||||
|
reads.add(ArtificialSAMUtils.createArtificialRead(Arrays.copyOfRange(alt.getBytes(), index, index + 100), Utils.dupBytes((byte) 30, 100), 100 + "M"));
|
||||||
|
|
||||||
|
// test that there are cycles detected for small kmer
|
||||||
|
final ReadThreadingGraph rtgraph25 = new ReadThreadingGraph(25);
|
||||||
|
rtgraph25.addSequence("ref", ref.getBytes(), null, true);
|
||||||
|
for ( final GATKSAMRecord read : reads )
|
||||||
|
rtgraph25.addRead(read);
|
||||||
|
rtgraph25.buildGraphIfNecessary();
|
||||||
|
Assert.assertTrue(rtgraph25.hasCycles());
|
||||||
|
|
||||||
|
// test that there are no cycles detected for large kmer
|
||||||
|
final ReadThreadingGraph rtgraph75 = new ReadThreadingGraph(75);
|
||||||
|
rtgraph75.addSequence("ref", ref.getBytes(), null, true);
|
||||||
|
for ( final GATKSAMRecord read : reads )
|
||||||
|
rtgraph75.addRead(read);
|
||||||
|
rtgraph75.buildGraphIfNecessary();
|
||||||
|
Assert.assertFalse(rtgraph75.hasCycles());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(enabled = !DEBUG)
|
||||||
|
public void testNsInReadsAreNotUsedForGraph() {
|
||||||
|
|
||||||
|
final int length = 100;
|
||||||
|
final byte[] ref = Utils.dupBytes((byte)'A', length);
|
||||||
|
|
||||||
|
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(25);
|
||||||
|
rtgraph.addSequence("ref", ref, null, true);
|
||||||
|
|
||||||
|
// add reads with Ns at any position
|
||||||
|
for ( int i = 0; i < length; i++ ) {
|
||||||
|
final byte[] bases = ref.clone();
|
||||||
|
bases[i] = 'N';
|
||||||
|
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, Utils.dupBytes((byte) 30, length), length + "M");
|
||||||
|
rtgraph.addRead(read);
|
||||||
|
}
|
||||||
|
rtgraph.buildGraphIfNecessary();
|
||||||
|
|
||||||
|
final SeqGraph graph = rtgraph.convertToSequenceGraph();
|
||||||
|
final KBestPaths<SeqVertex,BaseEdge> pathFinder = new KBestPaths<>(false);
|
||||||
|
Assert.assertEquals(pathFinder.getKBestPaths(graph, length, graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex()).size(), 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
@DataProvider(name = "DanglingTails")
|
||||||
|
public Object[][] makeDanglingTailsData() {
|
||||||
|
List<Object[]> tests = new ArrayList<Object[]>();
|
||||||
|
|
||||||
|
// add 1M to the expected CIGAR because it includes the previous (common) base too
|
||||||
|
tests.add(new Object[]{"AAAAAAAAAA", "CAAA", "5M", true, 3}); // incomplete haplotype
|
||||||
|
tests.add(new Object[]{"AAAAAAAAAA", "CAAAAAAAAAA", "1M1I10M", true, 10}); // insertion
|
||||||
|
tests.add(new Object[]{"CCAAAAAAAAAA", "AAAAAAAAAA", "1M2D10M", true, 10}); // deletion
|
||||||
|
tests.add(new Object[]{"AAAAAAAA", "CAAAAAAA", "9M", true, 7}); // 1 snp
|
||||||
|
tests.add(new Object[]{"AAAAAAAA", "CAAGATAA", "9M", true, 2}); // several snps
|
||||||
|
tests.add(new Object[]{"AAAAA", "C", "1M4D1M", true, -1}); // funky SW alignment
|
||||||
|
tests.add(new Object[]{"AAAAA", "CA", "1M3D2M", true, 1}); // very little data
|
||||||
|
tests.add(new Object[]{"AAAAAAA", "CAAAAAC", "8M", true, -1}); // ends in mismatch
|
||||||
|
tests.add(new Object[]{"AAAAAA", "CGAAAACGAA", "1M2I4M2I2M", false, 0}); // alignment is too complex
|
||||||
|
|
||||||
|
return tests.toArray(new Object[][]{});
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(dataProvider = "DanglingTails", enabled = !DEBUG)
|
||||||
|
public void testDanglingTails(final String refEnd,
|
||||||
|
final String altEnd,
|
||||||
|
final String cigar,
|
||||||
|
final boolean cigarIsGood,
|
||||||
|
final int mergePointDistanceFromSink) {
|
||||||
|
|
||||||
|
final int kmerSize = 15;
|
||||||
|
|
||||||
|
// construct the haplotypes
|
||||||
|
final String commonPrefix = "AAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTT";
|
||||||
|
final String ref = commonPrefix + refEnd;
|
||||||
|
final String alt = commonPrefix + altEnd;
|
||||||
|
|
||||||
|
// create the graph and populate it
|
||||||
|
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize);
|
||||||
|
rtgraph.addSequence("ref", ref.getBytes(), null, true);
|
||||||
|
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(alt.getBytes(), Utils.dupBytes((byte) 30, alt.length()), alt.length() + "M");
|
||||||
|
rtgraph.addRead(read);
|
||||||
|
rtgraph.buildGraphIfNecessary();
|
||||||
|
|
||||||
|
// confirm that we have just a single dangling tail
|
||||||
|
MultiDeBruijnVertex altSink = null;
|
||||||
|
for ( final MultiDeBruijnVertex v : rtgraph.vertexSet() ) {
|
||||||
|
if ( rtgraph.isSink(v) && !rtgraph.isReferenceNode(v) ) {
|
||||||
|
Assert.assertTrue(altSink == null, "We found more than one non-reference sink");
|
||||||
|
altSink = v;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Assert.assertTrue(altSink != null, "We did not find a non-reference sink");
|
||||||
|
|
||||||
|
// confirm that the SW alignment agrees with our expectations
|
||||||
|
final ReadThreadingGraph.DanglingTailMergeResult result = rtgraph.generateCigarAgainstReferencePath(altSink);
|
||||||
|
Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + result.cigar.toString());
|
||||||
|
|
||||||
|
// confirm that the goodness of the cigar agrees with our expectations
|
||||||
|
Assert.assertEquals(rtgraph.cigarIsOkayToMerge(result.cigar), cigarIsGood);
|
||||||
|
|
||||||
|
// confirm that the tail merging works as expected
|
||||||
|
if ( cigarIsGood ) {
|
||||||
|
final int mergeResult = rtgraph.mergeDanglingTail(result);
|
||||||
|
Assert.assertTrue(mergeResult == 1 || mergePointDistanceFromSink == -1);
|
||||||
|
|
||||||
|
// confirm that we created the appropriate edge
|
||||||
|
if ( mergePointDistanceFromSink >= 0 ) {
|
||||||
|
MultiDeBruijnVertex v = altSink;
|
||||||
|
for ( int i = 0; i < mergePointDistanceFromSink; i++ ) {
|
||||||
|
if ( rtgraph.inDegreeOf(v) != 1 )
|
||||||
|
Assert.fail("Encountered vertex with multiple sources");
|
||||||
|
v = rtgraph.getEdgeSource(rtgraph.incomingEdgeOf(v));
|
||||||
|
}
|
||||||
|
Assert.assertTrue(rtgraph.outDegreeOf(v) > 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// TODO -- update to use determineKmerSizeAndNonUniques directly
|
||||||
// @DataProvider(name = "KmerSizeData")
|
// @DataProvider(name = "KmerSizeData")
|
||||||
// public Object[][] makeKmerSizeDataProvider() {
|
// public Object[][] makeKmerSizeDataProvider() {
|
||||||
// List<Object[]> tests = new ArrayList<Object[]>();
|
// List<Object[]> tests = new ArrayList<Object[]>();
|
||||||
|
|
|
||||||
|
|
@ -800,6 +800,23 @@ public final class AlignmentUtils {
|
||||||
return new Cigar(elements);
|
return new Cigar(elements);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Removing a trailing deletion from the incoming cigar if present
|
||||||
|
*
|
||||||
|
* @param c the cigar we want to update
|
||||||
|
* @return a non-null Cigar
|
||||||
|
*/
|
||||||
|
@Requires("c != null")
|
||||||
|
@Ensures("result != null")
|
||||||
|
public static Cigar removeTrailingDeletions(final Cigar c) {
|
||||||
|
|
||||||
|
final List<CigarElement> elements = c.getCigarElements();
|
||||||
|
if ( elements.get(elements.size() - 1).getOperator() != CigarOperator.D )
|
||||||
|
return c;
|
||||||
|
|
||||||
|
return new Cigar(elements.subList(0, elements.size() - 1));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Move the indel in a given cigar string one base to the left
|
* Move the indel in a given cigar string one base to the left
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -118,6 +118,21 @@ public class SWPairwiseAlignment implements SmithWaterman {
|
||||||
align(seq1,seq2);
|
align(seq1,seq2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new SW pairwise aligner
|
||||||
|
*
|
||||||
|
* After creating the object the two sequences are aligned with an internal call to align(seq1, seq2)
|
||||||
|
*
|
||||||
|
* @param seq1 the first sequence we want to align
|
||||||
|
* @param seq2 the second sequence we want to align
|
||||||
|
* @param strategy the overhang strategy to use
|
||||||
|
*/
|
||||||
|
public SWPairwiseAlignment(final byte[] seq1, final byte[] seq2, final OVERHANG_STRATEGY strategy) {
|
||||||
|
this(SWParameterSet.ORIGINAL_DEFAULT.parameters);
|
||||||
|
overhang_strategy = strategy;
|
||||||
|
align(seq1, seq2);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new SW pairwise aligner, without actually doing any alignment yet
|
* Create a new SW pairwise aligner, without actually doing any alignment yet
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -1033,5 +1033,12 @@ public class AlignmentUtilsUnitTest {
|
||||||
Assert.assertEquals(AlignmentUtils.startsOrEndsWithInsertionOrDeletion(TextCigarCodec.getSingleton().decode(cigar)), expected);
|
Assert.assertEquals(AlignmentUtils.startsOrEndsWithInsertionOrDeletion(TextCigarCodec.getSingleton().decode(cigar)), expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test(dataProvider = "StartsOrEndsWithInsertionOrDeletionData", enabled = true)
|
||||||
|
public void testRemoveTrailingDeletions(final String cigar, final boolean expected) {
|
||||||
|
|
||||||
|
final Cigar originalCigar = TextCigarCodec.getSingleton().decode(cigar);
|
||||||
|
final Cigar newCigar = AlignmentUtils.removeTrailingDeletions(originalCigar);
|
||||||
|
|
||||||
|
Assert.assertEquals(originalCigar.equals(newCigar), !cigar.endsWith("D"));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue