diff --git a/pom.xml b/pom.xml index 7f6394a8a..c238762c0 100644 --- a/pom.xml +++ b/pom.xml @@ -13,7 +13,7 @@ org.broadinstitute.gatk gatk-root - 3.6 + 3.7-SNAPSHOT public/gatk-root diff --git a/protected/gatk-package-distribution/pom.xml b/protected/gatk-package-distribution/pom.xml index 8486ee153..840d2356d 100644 --- a/protected/gatk-package-distribution/pom.xml +++ b/protected/gatk-package-distribution/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.6 + 3.7-SNAPSHOT ../.. diff --git a/protected/gatk-queue-extensions-distribution/pom.xml b/protected/gatk-queue-extensions-distribution/pom.xml index eed2e0db4..2acb0c09a 100644 --- a/protected/gatk-queue-extensions-distribution/pom.xml +++ b/protected/gatk-queue-extensions-distribution/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.6 + 3.7-SNAPSHOT ../.. diff --git a/protected/gatk-queue-package-distribution/pom.xml b/protected/gatk-queue-package-distribution/pom.xml index 0ae5b23c7..f4cc8663b 100644 --- a/protected/gatk-queue-package-distribution/pom.xml +++ b/protected/gatk-queue-package-distribution/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.6 + 3.7-SNAPSHOT ../.. diff --git a/protected/gatk-tools-protected/pom.xml b/protected/gatk-tools-protected/pom.xml index ba6937868..350bab208 100644 --- a/protected/gatk-tools-protected/pom.xml +++ b/protected/gatk-tools-protected/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.6 + 3.7-SNAPSHOT ../.. 
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java index d8c10145f..ed639c951 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java @@ -54,6 +54,7 @@ package org.broadinstitute.gatk.engine.arguments; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator; import org.broadinstitute.gatk.utils.commandline.Advanced; import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.utils.commandline.Hidden; import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants; import java.util.Collections; @@ -65,33 +66,23 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ public static final String MAX_ALTERNATE_ALLELES_SHORT_NAME = "maxAltAlleles"; /** - * Depending on the value of the --max_alternate_alleles argument, we may genotype only a fraction of the alleles being sent on for genotyping. - * Using this argument instructs the genotyper to annotate (in the INFO field) the number of alternate alleles that were originally discovered at the site. + * Depending on the value of the --max_alternate_alleles argument, we may genotype only a fraction of the alleles + * being sent on for genotyping. Using this argument instructs the genotyper to annotate (in the INFO field) the + * number of alternate alleles that were originally discovered (but not necessarily genotyped) at the site. 
*/ - @Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false) + @Argument(fullName = "annotateNDA", shortName = "nda", doc = "Annotate number of alleles observed", required = false) public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false; /** - * The expected heterozygosity value used to compute prior probability that a locus is non-reference. - * - * From the heterozygosity we calculate the probability of N samples being hom-ref at a site as 1 - sum_i_2N (hets / i) - * where hets is this case is analogous to the parameter theta from population genetics. See https://en.wikipedia.org/wiki/Coalescent_theory for more details. - * - * Note that heterozygosity as used here is the population genetics concept. (See http://en.wikipedia.org/wiki/Zygosity#Heterozygosity_in_population_genetics. - * We also suggest the book "Population Genetics: A Concise Guide" by John H. Gillespie for further details on the theory.) That is, a hets value of 0.001 - * implies that two randomly chosen chromosomes from the population of organisms would differ from each other at a rate of 1 in 1000 bp. - * - * The default priors provided for humans (hets = 1e-3) - * - * Also note that this quantity has nothing to do with the likelihood of any given sample having a heterozygous genotype, - * which in the GATK is purely determined by the probability of the observed data P(D | AB) under the model that there - * may be a AB het genotype. The posterior probability of this AB genotype would use the het prior, but the GATK - * only uses this posterior probability in determining the prob. that a site is polymorphic. 
So changing the - * het parameters only increases the chance that a site will be called non-reference across all samples, but - * doesn't actually change the output genotype likelihoods at all, as these aren't posterior probabilities at all. - * - * The quantity that changes whether the GATK considers the possibility of a het genotype at all is the ploidy, - * which determines how many chromosomes each individual in the species carries. + * This activates a model for calculating QUAL that was introduced in version 3.7 (November 2016). We expect this + * model will become the default in future versions. + */ + @Argument(fullName = "useNewAFCalculator", shortName = "newQual", doc = "Use new AF model instead of the so-called exact model", required = false) + public boolean USE_NEW_AF_CALCULATOR = false; + + /** + * The expected heterozygosity value used to compute prior probability that a locus is non-reference. See + * https://software.broadinstitute.org/gatk/documentation/article?id=8603 for more details. */ @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) public Double snpHeterozygosity = HomoSapiensConstants.SNP_HETEROZYGOSITY; @@ -102,32 +93,67 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false) public double indelHeterozygosity = HomoSapiensConstants.INDEL_HETEROZYGOSITY; + /** + * The standard deviation of the distribution of alt allele fractions. The above heterozygosity parameters give + * the *mean* of this distribution; this parameter gives its spread. 
+ */ + @Argument(fullName = "heterozygosity_stdev", shortName = "heterozygosityStandardDeviation", doc = "Standard deviation of heterozygosity for SNP and indel calling.", required = false) + public double heterozygosityStandardDeviation = 0.01; + /** * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with * confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this * is the default). */ @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be called", required = false) - public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0; + public double STANDARD_CONFIDENCE_FOR_CALLING = 10.0; /** * This argument allows you to emit low quality calls as filtered records. */ - @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be emitted (and filtered with LowQual if less than the calling threshold)", required = false) + @Hidden + @Deprecated + @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", + doc = "This argument is no longer used in GATK versions 3.7 and newer. Please see the online documentation for the latest usage recommendations.", required = false) public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; /** - * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN_ALLELES), - * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it - * scales exponentially based on the number of alternate alleles. 
Unless there is a good reason to change the default value, we highly recommend - * that you not play around with this parameter. + * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or + * GENOTYPE_GIVEN_ALLELES), then only this many alleles will be used. Note that genotyping sites with many + * alternate alleles is both CPU and memory intensive and it scales exponentially based on the number of alternate + * alleles. Unless there is a good reason to change the default value, we highly recommend that you not play around + * with this parameter. * - * As of GATK 2.2 the genotyper can handle a very large number of events, so the default maximum has been increased to 6. + * See also {@link #MAX_GENOTYPE_COUNT}. */ @Advanced @Argument(fullName = "max_alternate_alleles", shortName = MAX_ALTERNATE_ALLELES_SHORT_NAME, doc = "Maximum number of alternate alleles to genotype", required = false) public int MAX_ALTERNATE_ALLELES = 6; + /** + * If there are more than this number of genotypes at a locus presented to the genotyper, then only this many + * genotypes will be used. This is intended to deal with sites where the combination of high ploidy and high alt + * allele count can lead to an explosion in the number of possible genotypes, with extreme adverse effects on + * runtime performance. + * + * How does it work? The possible genotypes are simply different ways of partitioning alleles given a specific + * ploidy assumption. Therefore, we remove genotypes from consideration by removing alternate alleles that are the + * least well supported. The estimate of allele support is based on the ranking of the candidate haplotypes coming + * out of the graph building step. Note however that the reference allele is always kept. + * + * The maximum number of alternative alleles used in the genotyping step will be the lesser of the two: + * 1. 
the largest number of alt alleles, given ploidy, that yields a genotype count no higher than {@link #MAX_GENOTYPE_COUNT} + * 2. the value of {@link #MAX_ALTERNATE_ALLELES} + * + * As noted above, genotyping sites with large genotype counts is both CPU and memory intensive. Unless you have + * a good reason to change the default value, we highly recommend that you not play around with this parameter. + * + * See also {@link #MAX_ALTERNATE_ALLELES}. + */ + @Advanced + @Argument(fullName = "max_genotype_count", shortName = "maxGT", doc = "Maximum number of genotypes to consider at any site", required = false) + public int MAX_GENOTYPE_COUNT = 1024; + /** * Determines the maximum number of PL values that will be logged in the output. If the number of genotypes * (which is determined by the ploidy and the number of alleles) exceeds the value provided by this argument, @@ -138,23 +164,19 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ public int MAX_NUM_PL_VALUES = AFCalculator.MAX_NUM_PL_VALUES_DEFAULT; /** - * By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus, using an infinite sites model, - * see e.g. Waterson (1975) or Tajima (1996). - * This model asserts that the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for 1=1:N + * By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a + * particular locus, using an infinite sites model (see e.g. Watterson, 1975 or Tajima, 1996). This model asserts that + * the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for k=1:N. + * However, there are instances where using this prior might not be desirable, e.g. for population studies where prior + * might not be appropriate, as for example when the ancestral status of the reference allele is not known. 
* - * There are instances where using this prior might not be desirable, e.g. for population studies where prior might not be appropriate, - * as for example when the ancestral status of the reference allele is not known. - * By using this argument, the user can manually specify a list of probabilities for each AC>1 to be used as priors for genotyping, - * with the following restrictions: - * a) User must specify 2N values, where N is the number of samples. - * b) Only diploid calls supported. - * c) Probability values are specified in Double format, in linear space (not log10 space or Phred-scale). - * d) No negative values allowed. - * e) Values will be added and Pr(AC=0) will be 1-sum, so that they sum up to one. - * f) If user-defined values add to more than one, an error will be produced. + * This argument allows you to manually specify a list of probabilities for each AC>1 to be used as + * priors for genotyping, with the following restrictions: only diploid calls are supported; you must specify 2 * + * N values where N is the number of samples; probability values must be positive and specified in Double format, + * in linear space (not log10 space nor Phred-scale); and all values must sum to 1. * - * If user wants completely flat priors, then user should specify the same value (=1/(2*N+1)) 2*N times,e.g. - * -inputPrior 0.33 -inputPrior 0.33 + * For completely flat priors, specify the same value (=1/(2*N+1)) 2*N times, e.g. + * -inputPrior 0.33 -inputPrior 0.33 * for the single-sample diploid case. */ @Advanced @@ -162,9 +184,10 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ public List inputPrior = Collections.emptyList(); /** - * Sample ploidy - equivalent to number of chromosomes per pool. In pooled experiments this should be = # of samples in pool * individual sample ploidy + * Sample ploidy - equivalent to number of chromosome copies per pool. 
For pooled experiments this should be set to + * the number of samples in pool multiplied by individual sample ploidy. */ - @Argument(shortName="ploidy", fullName="sample_ploidy", doc="Ploidy (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false) + @Argument(shortName="ploidy", fullName="sample_ploidy", doc="Ploidy per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false) public int samplePloidy = HomoSapiensConstants.DEFAULT_PLOIDY; /** diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/GatherBqsrReports.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/GatherBqsrReports.java index 787181428..f343d0bab 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/GatherBqsrReports.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/GatherBqsrReports.java @@ -53,6 +53,8 @@ package org.broadinstitute.gatk.tools; import htsjdk.samtools.util.IOUtil; import org.broadinstitute.gatk.engine.recalibration.BQSRGatherer; +import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; +import org.broadinstitute.gatk.utils.help.HelpConstants; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; @@ -97,10 +99,7 @@ import java.util.List; * */ -@CommandLineProgramProperties( - usage = "Gathers scattered BQSR recalibration reports into a single file", - usageShort = "Gathers scattered BQSR recalibration reports into a single file" -) +@DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_QC) public class GatherBqsrReports extends CommandLineProgram { @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc="List of scattered BQSR files") public List INPUT; diff --git 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_BaseQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_BaseQualityRankSumTest.java index 25f4b3274..046ae97fd 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_BaseQualityRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_BaseQualityRankSumTest.java @@ -71,7 +71,7 @@ import java.util.List; *

The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.

* *

Statistical notes

- *

The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

+ *

The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

* *

Caveats

*