diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 6aec9c7a5..c219fab00 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -65,7 +65,6 @@ import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.File; -import java.io.PrintStream; import java.util.*; /** @@ -83,7 +82,6 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { // TODO -- be increased to a large number of eliminated altogether when moving to the bubble caller where // TODO -- we are no longer considering a combinatorial number of haplotypes as the number of bubbles increases private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 25; - public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 16; private static final int GRAPH_KMER_STEP = 6; // Smith-Waterman parameters originally copied from IndelRealigner, only used during GGA mode @@ -94,30 +92,23 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private final boolean debug; private final boolean debugGraphTransformations; - private final PrintStream graphWriter; private final int minKmer; - private final byte minBaseQualityToUseInAssembly; private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms; - private int PRUNE_FACTOR = 2; protected DeBruijnAssembler() { - this(false, -1, null, 11, DEFAULT_MIN_BASE_QUALITY_TO_USE); + this(false, -1, 11); } public DeBruijnAssembler(final boolean debug, final int debugGraphTransformations, - final PrintStream graphWriter, - final int minKmer, - final byte minBaseQualityToUseInAssembly) { + final int minKmer) { super(); this.debug = debug; this.debugGraphTransformations = debugGraphTransformations > 0; this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = debugGraphTransformations; - this.graphWriter = graphWriter; this.minKmer = minKmer; - this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly; } /** @@ -126,19 +117,15 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { * @param refHaplotype reference haplotype object * @param fullReferenceWithPadding byte array holding the reference sequence with padding * @param refLoc GenomeLoc object corresponding to the reference sequence with padding - * @param PRUNE_FACTOR prune kmers from the graph if their weight is <= this value * @param activeAllelesToGenotype the alleles to inject into the haplotypes during GGA mode * @return a non-empty list of all the haplotypes that are produced during assembly */ @Ensures({"result.contains(refHaplotype)"}) - public List runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final int PRUNE_FACTOR, final List activeAllelesToGenotype ) { + public List runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype ) { if( activeRegion == null ) { throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion."); } if( refHaplotype == null ) { throw new IllegalArgumentException("Reference haplotype cannot be null."); } if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); } - if( PRUNE_FACTOR < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); } - - // set the pruning factor for this run of the assembly engine - this.PRUNE_FACTOR = PRUNE_FACTOR; + if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); } // create the graphs final List graphs = createDeBruijnGraphs( activeRegion.getReads(), refHaplotype ); @@ -170,13 +157,16 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug); if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object // do a series of steps to clean up the raw assembly graph to make it analysis-ready - if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), PRUNE_FACTOR); - graph = graph.errorCorrect(); - if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), PRUNE_FACTOR); + if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor); + + if ( shouldErrorCorrectKmers() ) { + graph = graph.errorCorrect(); + if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), pruneFactor); + } final SeqGraph seqGraph = toSeqGraph(graph); - if( seqGraph.getReferenceSourceVertex() != null ) { // if the graph contains interesting variation from the reference + if ( seqGraph != null ) { // if the graph contains interesting variation from the reference sanityCheckReferenceGraph(seqGraph, refHaplotype); graphs.add(seqGraph); @@ -192,19 +182,19 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private SeqGraph toSeqGraph(final DeBruijnGraph deBruijnGraph) { final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), PRUNE_FACTOR); - seqGraph.pruneGraph(PRUNE_FACTOR); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor); + seqGraph.pruneGraph(pruneFactor); seqGraph.removeVerticesNotConnectedToRef(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), PRUNE_FACTOR); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), pruneFactor); seqGraph.simplifyGraph(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.dot"), PRUNE_FACTOR); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.dot"), pruneFactor); // if we've assembled just to the reference, just leave now otherwise removePathsNotConnectedToRef // might blow up because there's no reference source node if ( seqGraph.vertexSet().size() == 1 ) return seqGraph; seqGraph.removePathsNotConnectedToRef(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.refcleaned.dot"), PRUNE_FACTOR); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.refcleaned.dot"), pruneFactor); return seqGraph; } @@ -295,7 +285,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { continue; } - graph.printGraph(graphWriter, false, PRUNE_FACTOR); + graph.printGraph(graphWriter, false, pruneFactor); if ( debugGraphTransformations ) break; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 81ff3dfbd..5849b5a0e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -171,6 +171,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * in the following screenshot: https://www.dropbox.com/s/xvy7sbxpf13x5bp/haplotypecaller%20bamout%20for%20docs.png * */ + @Advanced @Output(fullName="bamOutput", shortName="bamout", doc="File to which assembled haplotypes should be written", required = false, defaultToStdout = false) protected StingSAMFileWriter bamWriter = null; private HaplotypeBAMWriter haplotypeBAMWriter; @@ -178,12 +179,14 @@ public class HaplotypeCaller extends ActiveRegionWalker implem /** * The type of BAM output we want to see. */ + @Advanced @Argument(fullName="bamWriterType", shortName="bamWriterType", doc="How should haplotypes be written to the BAM?", required = false) public HaplotypeBAMWriter.Type bamWriterType = HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES; /** * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. */ + @Advanced @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false) public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING; @@ -191,6 +194,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) protected String keepRG = null; + @Advanced @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false) protected int MIN_PRUNE_FACTOR = 1; @@ -217,6 +221,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false) protected boolean includeUnmappedReads = false; + @Advanced @Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false) protected boolean USE_ALLELES_TRIGGER = false; @@ -232,6 +237,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false) protected boolean dontGenotype = false; + @Hidden + @Argument(fullName="errorCorrectKmers", shortName="errorCorrectKmers", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false) + protected boolean errorCorrectKmers = false; + /** * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. * dbSNP is not used in any way for the calculations themselves. @@ -246,6 +255,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * Records that are filtered in the comp track will be ignored. * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). */ + @Advanced @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) public List> comps = Collections.emptyList(); public List> getCompRodBindings() { return comps; } @@ -258,6 +268,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem /** * Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations. */ + @Advanced @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) protected List annotationsToUse = new ArrayList(Arrays.asList(new String[]{"ClippingRankSumTest"})); @@ -265,6 +276,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments, * so annotations will be excluded even if they are explicitly included with the other options. */ + @Advanced @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) protected List annotationsToExclude = new ArrayList(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"})); @@ -277,12 +289,15 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @ArgumentCollection private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection(); + @Advanced @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false) protected boolean DEBUG; + @Advanced @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false) protected int debugGraphTransformations = -1; + @Hidden @Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false) protected boolean useLowQualityBasesForAssembly = false; @@ -388,8 +403,12 @@ public class HaplotypeCaller extends ActiveRegionWalker implem throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); } - final byte minBaseQualityToUseInAssembly = useLowQualityBasesForAssembly ? (byte)1 : DeBruijnAssembler.DEFAULT_MIN_BASE_QUALITY_TO_USE; - assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, graphWriter, minKmer, minBaseQualityToUseInAssembly ); + // setup the assembler + assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, minKmer); + assemblyEngine.setErrorCorrectKmers(errorCorrectKmers); + if ( graphWriter != null ) assemblyEngine.setGraphWriter(graphWriter); + if ( useLowQualityBasesForAssembly ) assemblyEngine.setMinBaseQualityToUseInAssembly((byte)1); + likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); @@ -520,7 +539,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING); final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion); - final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, MIN_PRUNE_FACTOR, activeAllelesToGenotype ); + final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype ); if( haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do! final List filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java index 3efa342b1..c31405872 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java @@ -51,6 +51,7 @@ import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.variant.variantcontext.VariantContext; +import java.io.PrintStream; import java.util.List; /** @@ -59,13 +60,46 @@ import java.util.List; * Date: Mar 14, 2011 */ public abstract class LocalAssemblyEngine { + public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 16; - public enum ASSEMBLER { - SIMPLE_DE_BRUIJN + protected PrintStream graphWriter = null; + protected byte minBaseQualityToUseInAssembly = DEFAULT_MIN_BASE_QUALITY_TO_USE; + protected int pruneFactor = 2; + protected boolean errorCorrectKmers = false; + + protected LocalAssemblyEngine() { } + + public int getPruneFactor() { + return pruneFactor; } - protected LocalAssemblyEngine() { + public void setPruneFactor(int pruneFactor) { + this.pruneFactor = pruneFactor; } - public abstract List runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, int PRUNE_FACTOR, List activeAllelesToGenotype); + public boolean shouldErrorCorrectKmers() { + return errorCorrectKmers; + } + + public void setErrorCorrectKmers(boolean errorCorrectKmers) { + this.errorCorrectKmers = errorCorrectKmers; + } + + public PrintStream getGraphWriter() { + return graphWriter; + } + + public void setGraphWriter(PrintStream graphWriter) { + this.graphWriter = graphWriter; + } + + public byte getMinBaseQualityToUseInAssembly() { + return minBaseQualityToUseInAssembly; + } + + public void setMinBaseQualityToUseInAssembly(byte minBaseQualityToUseInAssembly) { + this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly; + } + + public abstract List runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, List activeAllelesToGenotype); }