From 4f3411f3d40871f69335373ec2f0ec1de20334c4 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 31 Jul 2013 10:48:55 -0400 Subject: [PATCH] Max number of haplotypes to evaluate no longer grows unbounded with the number of samples. This is necessary for multi-sample calling projects with over 100 samples. --- .../haplotypecaller/HaplotypeCaller.java | 34 ++++++------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 2d492741b..d46c786c9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -231,7 +231,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In */ @Advanced @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) - protected List annotationsToExclude = new ArrayList(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"})); + protected List annotationsToExclude = new ArrayList<>(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"})); /** * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. @@ -258,23 +258,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="numPruningSamples", shortName="numPruningSamples", doc="The number of samples that must pass the minPuning factor in order for the path to be kept", required = false) protected int numPruningSamples = 1; - /** - * Assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype - * considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the - * run of the haplotype caller we only take maxPathsPerSample * nSample paths from the graph, in order of their - * weights, no matter how many paths are possible to generate from the graph. Putting this number too low - * will result in dropping true variation because paths that include the real variant are not even considered. - */ - @Advanced - @Argument(fullName="maxPathsPerSample", shortName="maxPathsPerSample", doc="Max number of paths to consider for the read threading assembler per sample.", required = false) - protected int maxPathsPerSample = 10; - - /** - * The minimum number of paths to advance forward for genotyping, regardless of the - * number of samples - */ - private final static int MIN_PATHS_PER_GRAPH = 128; - @Hidden @Argument(fullName="dontRecoverDanglingTails", shortName="dontRecoverDanglingTails", doc="Should we disable dangling tail recovery in the read threading assembler?", required = false) protected boolean dontRecoverDanglingTails = false; @@ -381,9 +364,16 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="phredScaledGlobalReadMismappingRate", shortName="globalMAPQ", doc="The global assumed mismapping rate for reads", required = false) protected int phredScaledGlobalReadMismappingRate = 45; + /** + * Assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype + * considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the + * run of the haplotype caller we only take maxNumHaplotypesInPopulation paths from the graph, in order of their + * weights, no matter how many paths are possible to generate from the graph. Putting this number too low + * will result in dropping true variation because paths that include the real variant are not even considered. + */ @Advanced @Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false) - protected int maxNumHaplotypesInPopulation = 25; + protected int maxNumHaplotypesInPopulation = 128; @Advanced @Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false) @@ -541,7 +531,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // get all of the unique sample names Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); samplesList.addAll( samples ); - final int nSamples = samples.size(); // initialize the UnifiedGenotyper Engine which is used to call into the exact model final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user // HC GGA mode depends critically on EMIT_ALL_SITES being set for the UG engine // TODO -- why is this? @@ -567,7 +556,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // initialize the output VCF header annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); - Set headerInfo = new HashSet(); + Set headerInfo = new HashSet<>(); // all annotation fields from VariantAnnotatorEngine headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions()); @@ -610,8 +599,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In } // create and setup the assembler - final int maxAllowedPathsForReadThreadingAssembler = Math.max(maxPathsPerSample * nSamples, MIN_PATHS_PER_GRAPH); - assemblyEngine = new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes, dontIncreaseKmerSizesForCycles, numPruningSamples); + assemblyEngine = new ReadThreadingAssembler(maxNumHaplotypesInPopulation, kmerSizes, dontIncreaseKmerSizesForCycles, numPruningSamples); assemblyEngine.setErrorCorrectKmers(errorCorrectKmers); assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR);