Merge pull request #353 from broadinstitute/rp_HC_updates_for_1000G_and_WGS_calling

Max number of haplotypes to evaluate no longer grows unbounded with the ...
2013-07-31 08:29:06 -07:00 · 2013-07-31 08:29:06 -07:00 · ec3c885a25
parent 00cedd0bd3 4f3411f3d4
commit ec3c885a25
1 changed files with 11 additions and 23 deletions
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@ -231,7 +231,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
     */
    @Advanced
    @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false)
-    protected List<String> annotationsToExclude = new ArrayList<String>(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"}));
+    protected List<String> annotationsToExclude = new ArrayList<>(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"}));

    /**
     * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups.
@ -258,23 +258,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
    @Argument(fullName="numPruningSamples", shortName="numPruningSamples", doc="The number of samples that must pass the minPuning factor in order for the path to be kept", required = false)
    protected int numPruningSamples = 1;

-    /**
-     * Assembly graph can be quite complex, and could imply a very large number of possible haplotypes.  Each haplotype
-     * considered requires N PairHMM evaluations if there are N reads across all samples.  In order to control the
-     * run of the haplotype caller we only take maxPathsPerSample * nSample paths from the graph, in order of their
-     * weights, no matter how many paths are possible to generate from the graph.  Putting this number too low
-     * will result in dropping true variation because paths that include the real variant are not even considered.
-     */
-    @Advanced
-    @Argument(fullName="maxPathsPerSample", shortName="maxPathsPerSample", doc="Max number of paths to consider for the read threading assembler per sample.", required = false)
-    protected int maxPathsPerSample = 10;
-
-    /**
-     * The minimum number of paths to advance forward for genotyping, regardless of the
-     * number of samples
-     */
-    private final static int MIN_PATHS_PER_GRAPH = 128;
-
    @Hidden
    @Argument(fullName="dontRecoverDanglingTails", shortName="dontRecoverDanglingTails", doc="Should we disable dangling tail recovery in the read threading assembler?", required = false)
    protected boolean dontRecoverDanglingTails = false;
@ -381,9 +364,16 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
    @Argument(fullName="phredScaledGlobalReadMismappingRate", shortName="globalMAPQ", doc="The global assumed mismapping rate for reads", required = false)
    protected int phredScaledGlobalReadMismappingRate = 45;

+    /**
+     * Assembly graph can be quite complex, and could imply a very large number of possible haplotypes.  Each haplotype
+     * considered requires N PairHMM evaluations if there are N reads across all samples.  In order to control the
+     * run of the haplotype caller we only take maxNumHaplotypesInPopulation paths from the graph, in order of their
+     * weights, no matter how many paths are possible to generate from the graph.  Putting this number too low
+     * will result in dropping true variation because paths that include the real variant are not even considered.
+     */
    @Advanced
    @Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false)
-    protected int maxNumHaplotypesInPopulation = 25;
+    protected int maxNumHaplotypesInPopulation = 128;

    @Advanced
    @Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false)
@ -541,7 +531,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
        // get all of the unique sample names
        Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
        samplesList.addAll( samples );
-        final int nSamples = samples.size();
        // initialize the UnifiedGenotyper Engine which is used to call into the exact model
        final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user
        // HC GGA mode depends critically on EMIT_ALL_SITES being set for the UG engine // TODO -- why is this?
@ -567,7 +556,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
        // initialize the output VCF header
        annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());

-        Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
+        Set<VCFHeaderLine> headerInfo = new HashSet<>();

        // all annotation fields from VariantAnnotatorEngine
        headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions());
@ -610,8 +599,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
        }

        // create and setup the assembler
-        final int maxAllowedPathsForReadThreadingAssembler = Math.max(maxPathsPerSample * nSamples, MIN_PATHS_PER_GRAPH);
-        assemblyEngine = new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes, dontIncreaseKmerSizesForCycles, numPruningSamples);
+        assemblyEngine = new ReadThreadingAssembler(maxNumHaplotypesInPopulation, kmerSizes, dontIncreaseKmerSizesForCycles, numPruningSamples);

        assemblyEngine.setErrorCorrectKmers(errorCorrectKmers);
        assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR);