Disable error correcting kmers by default in the HC
-- The error correction algorithm can break the reference graph in some cases by error correcting us into a bad state for the reference sequence. Because we know that the error correction algorithm isn't ideal, and worse, doesn't actually seem to improve the calling itself on chr20, I've simply disabled error correction by default and allowed it to be turned on with a hidden argument. -- In the process I've changed a bit the assembly interface, moving some common arguments us into the LocalAssemblyEngine, which are turned on/off via setter methods. -- Went through the updated arguments in the HC to be @Hidden and @Advanced as appropriate -- Don't write out an errorcorrected graph when debugging and error correction isn't enabled
This commit is contained in:
parent
965043472a
commit
464e65ea96
|
|
@ -65,7 +65,6 @@ import org.broadinstitute.variant.variantcontext.Allele;
|
|||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -83,7 +82,6 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
// TODO -- be increased to a large number of eliminated altogether when moving to the bubble caller where
|
||||
// TODO -- we are no longer considering a combinatorial number of haplotypes as the number of bubbles increases
|
||||
private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 25;
|
||||
public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 16;
|
||||
private static final int GRAPH_KMER_STEP = 6;
|
||||
|
||||
// Smith-Waterman parameters originally copied from IndelRealigner, only used during GGA mode
|
||||
|
|
@ -94,30 +92,23 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
|
||||
private final boolean debug;
|
||||
private final boolean debugGraphTransformations;
|
||||
private final PrintStream graphWriter;
|
||||
private final int minKmer;
|
||||
private final byte minBaseQualityToUseInAssembly;
|
||||
|
||||
private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms;
|
||||
|
||||
private int PRUNE_FACTOR = 2;
|
||||
|
||||
protected DeBruijnAssembler() {
|
||||
this(false, -1, null, 11, DEFAULT_MIN_BASE_QUALITY_TO_USE);
|
||||
this(false, -1, 11);
|
||||
}
|
||||
|
||||
public DeBruijnAssembler(final boolean debug,
|
||||
final int debugGraphTransformations,
|
||||
final PrintStream graphWriter,
|
||||
final int minKmer,
|
||||
final byte minBaseQualityToUseInAssembly) {
|
||||
final int minKmer) {
|
||||
super();
|
||||
this.debug = debug;
|
||||
this.debugGraphTransformations = debugGraphTransformations > 0;
|
||||
this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = debugGraphTransformations;
|
||||
this.graphWriter = graphWriter;
|
||||
this.minKmer = minKmer;
|
||||
this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -126,19 +117,15 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
* @param refHaplotype reference haplotype object
|
||||
* @param fullReferenceWithPadding byte array holding the reference sequence with padding
|
||||
* @param refLoc GenomeLoc object corresponding to the reference sequence with padding
|
||||
* @param PRUNE_FACTOR prune kmers from the graph if their weight is <= this value
|
||||
* @param activeAllelesToGenotype the alleles to inject into the haplotypes during GGA mode
|
||||
* @return a non-empty list of all the haplotypes that are produced during assembly
|
||||
*/
|
||||
@Ensures({"result.contains(refHaplotype)"})
|
||||
public List<Haplotype> runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final int PRUNE_FACTOR, final List<VariantContext> activeAllelesToGenotype ) {
|
||||
public List<Haplotype> runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final List<VariantContext> activeAllelesToGenotype ) {
|
||||
if( activeRegion == null ) { throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion."); }
|
||||
if( refHaplotype == null ) { throw new IllegalArgumentException("Reference haplotype cannot be null."); }
|
||||
if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); }
|
||||
if( PRUNE_FACTOR < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); }
|
||||
|
||||
// set the pruning factor for this run of the assembly engine
|
||||
this.PRUNE_FACTOR = PRUNE_FACTOR;
|
||||
if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); }
|
||||
|
||||
// create the graphs
|
||||
final List<SeqGraph> graphs = createDeBruijnGraphs( activeRegion.getReads(), refHaplotype );
|
||||
|
|
@ -170,13 +157,16 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug);
|
||||
if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object
|
||||
// do a series of steps to clean up the raw assembly graph to make it analysis-ready
|
||||
if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), PRUNE_FACTOR);
|
||||
graph = graph.errorCorrect();
|
||||
if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), PRUNE_FACTOR);
|
||||
if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor);
|
||||
|
||||
if ( shouldErrorCorrectKmers() ) {
|
||||
graph = graph.errorCorrect();
|
||||
if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), pruneFactor);
|
||||
}
|
||||
|
||||
final SeqGraph seqGraph = toSeqGraph(graph);
|
||||
|
||||
if( seqGraph.getReferenceSourceVertex() != null ) { // if the graph contains interesting variation from the reference
|
||||
if ( seqGraph != null ) { // if the graph contains interesting variation from the reference
|
||||
sanityCheckReferenceGraph(seqGraph, refHaplotype);
|
||||
graphs.add(seqGraph);
|
||||
|
||||
|
|
@ -192,19 +182,19 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
|
||||
private SeqGraph toSeqGraph(final DeBruijnGraph deBruijnGraph) {
|
||||
final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), PRUNE_FACTOR);
|
||||
seqGraph.pruneGraph(PRUNE_FACTOR);
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor);
|
||||
seqGraph.pruneGraph(pruneFactor);
|
||||
seqGraph.removeVerticesNotConnectedToRef();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), PRUNE_FACTOR);
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), pruneFactor);
|
||||
seqGraph.simplifyGraph();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.dot"), PRUNE_FACTOR);
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.dot"), pruneFactor);
|
||||
|
||||
// if we've assembled just to the reference, just leave now otherwise removePathsNotConnectedToRef
|
||||
// might blow up because there's no reference source node
|
||||
if ( seqGraph.vertexSet().size() == 1 )
|
||||
return seqGraph;
|
||||
seqGraph.removePathsNotConnectedToRef();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.refcleaned.dot"), PRUNE_FACTOR);
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.refcleaned.dot"), pruneFactor);
|
||||
|
||||
return seqGraph;
|
||||
}
|
||||
|
|
@ -295,7 +285,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
continue;
|
||||
}
|
||||
|
||||
graph.printGraph(graphWriter, false, PRUNE_FACTOR);
|
||||
graph.printGraph(graphWriter, false, pruneFactor);
|
||||
|
||||
if ( debugGraphTransformations )
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -171,6 +171,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
* in the following screenshot: https://www.dropbox.com/s/xvy7sbxpf13x5bp/haplotypecaller%20bamout%20for%20docs.png
|
||||
*
|
||||
*/
|
||||
@Advanced
|
||||
@Output(fullName="bamOutput", shortName="bamout", doc="File to which assembled haplotypes should be written", required = false, defaultToStdout = false)
|
||||
protected StingSAMFileWriter bamWriter = null;
|
||||
private HaplotypeBAMWriter haplotypeBAMWriter;
|
||||
|
|
@ -178,12 +179,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
/**
|
||||
* The type of BAM output we want to see.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="bamWriterType", shortName="bamWriterType", doc="How should haplotypes be written to the BAM?", required = false)
|
||||
public HaplotypeBAMWriter.Type bamWriterType = HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES;
|
||||
|
||||
/**
|
||||
* The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false)
|
||||
public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING;
|
||||
|
||||
|
|
@ -191,6 +194,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
@Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
|
||||
protected String keepRG = null;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false)
|
||||
protected int MIN_PRUNE_FACTOR = 1;
|
||||
|
||||
|
|
@ -217,6 +221,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
@Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false)
|
||||
protected boolean includeUnmappedReads = false;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false)
|
||||
protected boolean USE_ALLELES_TRIGGER = false;
|
||||
|
||||
|
|
@ -232,6 +237,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
@Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false)
|
||||
protected boolean dontGenotype = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="errorCorrectKmers", shortName="errorCorrectKmers", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false)
|
||||
protected boolean errorCorrectKmers = false;
|
||||
|
||||
/**
|
||||
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
|
||||
* dbSNP is not used in any way for the calculations themselves.
|
||||
|
|
@ -246,6 +255,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
* Records that are filtered in the comp track will be ignored.
|
||||
* Note that 'dbSNP' has been special-cased (see the --dbsnp argument).
|
||||
*/
|
||||
@Advanced
|
||||
@Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false)
|
||||
public List<RodBinding<VariantContext>> comps = Collections.emptyList();
|
||||
public List<RodBinding<VariantContext>> getCompRodBindings() { return comps; }
|
||||
|
|
@ -258,6 +268,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
/**
|
||||
* Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
|
||||
protected List<String> annotationsToUse = new ArrayList<String>(Arrays.asList(new String[]{"ClippingRankSumTest"}));
|
||||
|
||||
|
|
@ -265,6 +276,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
* Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments,
|
||||
* so annotations will be excluded even if they are explicitly included with the other options.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false)
|
||||
protected List<String> annotationsToExclude = new ArrayList<String>(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"}));
|
||||
|
||||
|
|
@ -277,12 +289,15 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
@ArgumentCollection
|
||||
private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection();
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false)
|
||||
protected boolean DEBUG;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false)
|
||||
protected int debugGraphTransformations = -1;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false)
|
||||
protected boolean useLowQualityBasesForAssembly = false;
|
||||
|
||||
|
|
@ -388,8 +403,12 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e);
|
||||
}
|
||||
|
||||
final byte minBaseQualityToUseInAssembly = useLowQualityBasesForAssembly ? (byte)1 : DeBruijnAssembler.DEFAULT_MIN_BASE_QUALITY_TO_USE;
|
||||
assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, graphWriter, minKmer, minBaseQualityToUseInAssembly );
|
||||
// setup the assembler
|
||||
assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, minKmer);
|
||||
assemblyEngine.setErrorCorrectKmers(errorCorrectKmers);
|
||||
if ( graphWriter != null ) assemblyEngine.setGraphWriter(graphWriter);
|
||||
if ( useLowQualityBasesForAssembly ) assemblyEngine.setMinBaseQualityToUseInAssembly((byte)1);
|
||||
|
||||
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
|
||||
genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS );
|
||||
|
||||
|
|
@ -520,7 +539,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING);
|
||||
final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion);
|
||||
|
||||
final List<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, MIN_PRUNE_FACTOR, activeAllelesToGenotype );
|
||||
final List<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype );
|
||||
if( haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do!
|
||||
|
||||
final List<GATKSAMRecord> filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ import org.broadinstitute.sting.utils.Haplotype;
|
|||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
|
|
@ -59,13 +60,46 @@ import java.util.List;
|
|||
* Date: Mar 14, 2011
|
||||
*/
|
||||
public abstract class LocalAssemblyEngine {
|
||||
public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 16;
|
||||
|
||||
public enum ASSEMBLER {
|
||||
SIMPLE_DE_BRUIJN
|
||||
protected PrintStream graphWriter = null;
|
||||
protected byte minBaseQualityToUseInAssembly = DEFAULT_MIN_BASE_QUALITY_TO_USE;
|
||||
protected int pruneFactor = 2;
|
||||
protected boolean errorCorrectKmers = false;
|
||||
|
||||
protected LocalAssemblyEngine() { }
|
||||
|
||||
public int getPruneFactor() {
|
||||
return pruneFactor;
|
||||
}
|
||||
|
||||
protected LocalAssemblyEngine() {
|
||||
public void setPruneFactor(int pruneFactor) {
|
||||
this.pruneFactor = pruneFactor;
|
||||
}
|
||||
|
||||
public abstract List<Haplotype> runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, int PRUNE_FACTOR, List<VariantContext> activeAllelesToGenotype);
|
||||
public boolean shouldErrorCorrectKmers() {
|
||||
return errorCorrectKmers;
|
||||
}
|
||||
|
||||
public void setErrorCorrectKmers(boolean errorCorrectKmers) {
|
||||
this.errorCorrectKmers = errorCorrectKmers;
|
||||
}
|
||||
|
||||
public PrintStream getGraphWriter() {
|
||||
return graphWriter;
|
||||
}
|
||||
|
||||
public void setGraphWriter(PrintStream graphWriter) {
|
||||
this.graphWriter = graphWriter;
|
||||
}
|
||||
|
||||
public byte getMinBaseQualityToUseInAssembly() {
|
||||
return minBaseQualityToUseInAssembly;
|
||||
}
|
||||
|
||||
public void setMinBaseQualityToUseInAssembly(byte minBaseQualityToUseInAssembly) {
|
||||
this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly;
|
||||
}
|
||||
|
||||
public abstract List<Haplotype> runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, List<VariantContext> activeAllelesToGenotype);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue