diff --git a/pom.xml b/pom.xml
index 7f6394a8a..c238762c0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -13,7 +13,7 @@
org.broadinstitute.gatk
gatk-root
- 3.6
+ 3.7-SNAPSHOT
public/gatk-root
diff --git a/protected/gatk-package-distribution/pom.xml b/protected/gatk-package-distribution/pom.xml
index 8486ee153..840d2356d 100644
--- a/protected/gatk-package-distribution/pom.xml
+++ b/protected/gatk-package-distribution/pom.xml
@@ -5,7 +5,7 @@
org.broadinstitute.gatk
gatk-aggregator
- 3.6
+ 3.7-SNAPSHOT
../..
diff --git a/protected/gatk-queue-extensions-distribution/pom.xml b/protected/gatk-queue-extensions-distribution/pom.xml
index eed2e0db4..2acb0c09a 100644
--- a/protected/gatk-queue-extensions-distribution/pom.xml
+++ b/protected/gatk-queue-extensions-distribution/pom.xml
@@ -5,7 +5,7 @@
org.broadinstitute.gatk
gatk-aggregator
- 3.6
+ 3.7-SNAPSHOT
../..
diff --git a/protected/gatk-queue-package-distribution/pom.xml b/protected/gatk-queue-package-distribution/pom.xml
index 0ae5b23c7..f4cc8663b 100644
--- a/protected/gatk-queue-package-distribution/pom.xml
+++ b/protected/gatk-queue-package-distribution/pom.xml
@@ -5,7 +5,7 @@
org.broadinstitute.gatk
gatk-aggregator
- 3.6
+ 3.7-SNAPSHOT
../..
diff --git a/protected/gatk-tools-protected/pom.xml b/protected/gatk-tools-protected/pom.xml
index ba6937868..350bab208 100644
--- a/protected/gatk-tools-protected/pom.xml
+++ b/protected/gatk-tools-protected/pom.xml
@@ -5,7 +5,7 @@
org.broadinstitute.gatk
gatk-aggregator
- 3.6
+ 3.7-SNAPSHOT
../..
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java
index d8c10145f..ed639c951 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java
@@ -54,6 +54,7 @@ package org.broadinstitute.gatk.engine.arguments;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator;
import org.broadinstitute.gatk.utils.commandline.Advanced;
import org.broadinstitute.gatk.utils.commandline.Argument;
+import org.broadinstitute.gatk.utils.commandline.Hidden;
import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants;
import java.util.Collections;
@@ -65,33 +66,23 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
public static final String MAX_ALTERNATE_ALLELES_SHORT_NAME = "maxAltAlleles";
/**
- * Depending on the value of the --max_alternate_alleles argument, we may genotype only a fraction of the alleles being sent on for genotyping.
- * Using this argument instructs the genotyper to annotate (in the INFO field) the number of alternate alleles that were originally discovered at the site.
+ * Depending on the value of the --max_alternate_alleles argument, we may genotype only a fraction of the alleles
+ * being sent on for genotyping. Using this argument instructs the genotyper to annotate (in the INFO field) the
+ * number of alternate alleles that were originally discovered (but not necessarily genotyped) at the site.
*/
- @Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false)
+ @Argument(fullName = "annotateNDA", shortName = "nda", doc = "Annotate number of alleles observed", required = false)
public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false;
/**
- * The expected heterozygosity value used to compute prior probability that a locus is non-reference.
- *
- * From the heterozygosity we calculate the probability of N samples being hom-ref at a site as 1 - sum_i_2N (hets / i)
- * where hets is this case is analogous to the parameter theta from population genetics. See https://en.wikipedia.org/wiki/Coalescent_theory for more details.
- *
- * Note that heterozygosity as used here is the population genetics concept. (See http://en.wikipedia.org/wiki/Zygosity#Heterozygosity_in_population_genetics.
- * We also suggest the book "Population Genetics: A Concise Guide" by John H. Gillespie for further details on the theory.) That is, a hets value of 0.001
- * implies that two randomly chosen chromosomes from the population of organisms would differ from each other at a rate of 1 in 1000 bp.
- *
- * The default priors provided for humans (hets = 1e-3)
- *
- * Also note that this quantity has nothing to do with the likelihood of any given sample having a heterozygous genotype,
- * which in the GATK is purely determined by the probability of the observed data P(D | AB) under the model that there
- * may be a AB het genotype. The posterior probability of this AB genotype would use the het prior, but the GATK
- * only uses this posterior probability in determining the prob. that a site is polymorphic. So changing the
- * het parameters only increases the chance that a site will be called non-reference across all samples, but
- * doesn't actually change the output genotype likelihoods at all, as these aren't posterior probabilities at all.
- *
- * The quantity that changes whether the GATK considers the possibility of a het genotype at all is the ploidy,
- * which determines how many chromosomes each individual in the species carries.
+ * This activates a model for calculating QUAL that was introduced in version 3.7 (November 2016). We expect this
+ * model will become the default in future versions.
+ */
+ @Argument(fullName = "useNewAFCalculator", shortName = "newQual", doc = "Use new AF model instead of the so-called exact model", required = false)
+ public boolean USE_NEW_AF_CALCULATOR = false;
+
+ /**
+ * The expected heterozygosity value used to compute prior probability that a locus is non-reference. See
+ * https://software.broadinstitute.org/gatk/documentation/article?id=8603 for more details.
*/
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
public Double snpHeterozygosity = HomoSapiensConstants.SNP_HETEROZYGOSITY;
@@ -102,32 +93,67 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
@Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false)
public double indelHeterozygosity = HomoSapiensConstants.INDEL_HETEROZYGOSITY;
+ /**
+ * The standard deviation of the distribution of alt allele fractions. The above heterozygosity parameters give
+ * the *mean* of this distribution; this parameter gives its spread.
+ */
+ @Argument(fullName = "heterozygosity_stdev", shortName = "heterozygosityStandardDeviation", doc = "Standard deviation of eterozygosity for SNP and indel calling.", required = false)
+ public double heterozygosityStandardDeviation = 0.01;
+
/**
* The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with
* confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this
* is the default).
*/
@Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be called", required = false)
- public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0;
+ public double STANDARD_CONFIDENCE_FOR_CALLING = 10.0;
/**
* This argument allows you to emit low quality calls as filtered records.
*/
- @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be emitted (and filtered with LowQual if less than the calling threshold)", required = false)
+ @Hidden
+ @Deprecated
+ @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf",
+ doc = "This argument is no longer used in GATK versions 3.7 and newer. Please see the online documentation for the latest usage recommendations.", required = false)
public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
/**
- * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN_ALLELES),
- * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
- * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
- * that you not play around with this parameter.
+ * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or
+ * GENOTYPE_GIVEN_ALLELES), then only this many alleles will be used. Note that genotyping sites with many
+ * alternate alleles is both CPU and memory intensive and it scales exponentially based on the number of alternate
+ * alleles. Unless there is a good reason to change the default value, we highly recommend that you not play around
+ * with this parameter.
*
- * As of GATK 2.2 the genotyper can handle a very large number of events, so the default maximum has been increased to 6.
+ * See also {@link #MAX_GENOTYPE_COUNT}.
*/
@Advanced
@Argument(fullName = "max_alternate_alleles", shortName = MAX_ALTERNATE_ALLELES_SHORT_NAME, doc = "Maximum number of alternate alleles to genotype", required = false)
public int MAX_ALTERNATE_ALLELES = 6;
+ /**
+ * If there are more than this number of genotypes at a locus presented to the genotyper, then only this many
+ * genotypes will be used. This is intended to deal with sites where the combination of high ploidy and high alt
+ * allele count can lead to an explosion in the number of possible genotypes, with extreme adverse effects on
+ * runtime performance.
+ *
+ * How does it work? The possible genotypes are simply different ways of partitioning alleles given a specific
+ * ploidy assumption. Therefore, we remove genotypes from consideration by removing alternate alleles that are the
+ * least well supported. The estimate of allele support is based on the ranking of the candidate haplotypes coming
+ * out of the graph building step. Note however that the reference allele is always kept.
+ *
+ * The maximum number of alternative alleles used in the genotyping step will be the lesser of the two:
+ * 1. the largest number of alt alleles, given ploidy, that yields a genotype count no higher than {@link #MAX_GENOTYPE_COUNT}
+ * 2. the value of {@link #MAX_ALTERNATE_ALLELES}
+ *
+ * As noted above, genotyping sites with large genotype counts is both CPU and memory intensive. Unless you have
+ * a good reason to change the default value, we highly recommend that you not play around with this parameter.
+ *
+ * See also {@link #MAX_ALTERNATE_ALLELES}.
+ */
+ @Advanced
+ @Argument(fullName = "max_genotype_count", shortName = "maxGT", doc = "Maximum number of genotypes to consider at any site", required = false)
+ public int MAX_GENOTYPE_COUNT = 1024;
+
/**
* Determines the maximum number of PL values that will be logged in the output. If the number of genotypes
* (which is determined by the ploidy and the number of alleles) exceeds the value provided by this argument,
@@ -138,23 +164,19 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
public int MAX_NUM_PL_VALUES = AFCalculator.MAX_NUM_PL_VALUES_DEFAULT;
/**
- * By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus, using an infinite sites model,
- * see e.g. Waterson (1975) or Tajima (1996).
- * This model asserts that the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for 1=1:N
+ * By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a
+ * particular locus, using an infinite sites model (see e.g. Waterson, 1975 or Tajima, 1996). This model asserts that
+ * the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for 1=1:N.
+ * However, there are instances where using this prior might not be desirable, e.g. for population studies where prior
+ * might not be appropriate, as for example when the ancestral status of the reference allele is not known.
*
- * There are instances where using this prior might not be desirable, e.g. for population studies where prior might not be appropriate,
- * as for example when the ancestral status of the reference allele is not known.
- * By using this argument, the user can manually specify a list of probabilities for each AC>1 to be used as priors for genotyping,
- * with the following restrictions:
- * a) User must specify 2N values, where N is the number of samples.
- * b) Only diploid calls supported.
- * c) Probability values are specified in Double format, in linear space (not log10 space or Phred-scale).
- * d) No negative values allowed.
- * e) Values will be added and Pr(AC=0) will be 1-sum, so that they sum up to one.
- * f) If user-defined values add to more than one, an error will be produced.
+ * This argument allows you to manually specify a list of probabilities for each AC>1 to be used as
+ * priors for genotyping, with the following restrictions: only diploid calls are supported; you must specify 2 *
+ * N values where N is the number of samples; probability values must be positive and specified in Double format,
+ * in linear space (not log10 space nor Phred-scale); and all values must sume to 1.
*
- * If user wants completely flat priors, then user should specify the same value (=1/(2*N+1)) 2*N times,e.g.
- * -inputPrior 0.33 -inputPrior 0.33
+ * For completely flat priors, specify the same value (=1/(2*N+1)) 2*N times, e.g.
+ * -inputPrior 0.33 -inputPrior 0.33
* for the single-sample diploid case.
*/
@Advanced
@@ -162,9 +184,10 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
public List inputPrior = Collections.emptyList();
/**
- * Sample ploidy - equivalent to number of chromosomes per pool. In pooled experiments this should be = # of samples in pool * individual sample ploidy
+ * Sample ploidy - equivalent to number of chromosome copies per pool. For pooled experiments this should be set to
+ * the number of samples in pool multiplied by individual sample ploidy.
*/
- @Argument(shortName="ploidy", fullName="sample_ploidy", doc="Ploidy (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false)
+ @Argument(shortName="ploidy", fullName="sample_ploidy", doc="Ploidy per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false)
public int samplePloidy = HomoSapiensConstants.DEFAULT_PLOIDY;
/**
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/GatherBqsrReports.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/GatherBqsrReports.java
index 787181428..f343d0bab 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/GatherBqsrReports.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/GatherBqsrReports.java
@@ -53,6 +53,8 @@ package org.broadinstitute.gatk.tools;
import htsjdk.samtools.util.IOUtil;
import org.broadinstitute.gatk.engine.recalibration.BQSRGatherer;
+import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
+import org.broadinstitute.gatk.utils.help.HelpConstants;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
@@ -97,10 +99,7 @@ import java.util.List;
*
*/
-@CommandLineProgramProperties(
- usage = "Gathers scattered BQSR recalibration reports into a single file",
- usageShort = "Gathers scattered BQSR recalibration reports into a single file"
-)
+@DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_QC)
public class GatherBqsrReports extends CommandLineProgram {
@Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc="List of scattered BQSR files")
public List INPUT;
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_BaseQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_BaseQualityRankSumTest.java
index 25f4b3274..046ae97fd 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_BaseQualityRankSumTest.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_BaseQualityRankSumTest.java
@@ -71,7 +71,7 @@ import java.util.List;
* The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.
*
* Statistical notes
- * The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
+ * The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
*
* Caveats
*
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_InsertSizeRankSum.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_InsertSizeRankSum.java
index 4d7db9ba9..cb9afcfe5 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_InsertSizeRankSum.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_InsertSizeRankSum.java
@@ -63,13 +63,19 @@ import java.util.List;
/**
* Allele specific Rank Sum Test for insert sizes of REF versus ALT reads
*
- *
- * This annotation tests whether the insert sizes of reads supporting the REF allele and ALT allele are roughly equal.
- * In case of multiple alternate alleles, each alternate allele is considered separately.
+ *
This variant-level annotation compares the insert sizes of reads supporting the reference allele with those supporting each alternate allele. To be clear, it does so separately for each alternate allele.
+ *
+ * The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele are associated with smaller insert sizes than those supporting the reference allele. Conversely, a positive value indicates that reads supporting the alternate allele are associated with larger insert sizes than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.
*
- *
* Statistical notes
- * See the method document for a more detailed explanation of the rank sum test.
+ * The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for insert sizes (insert sizes of reads supporting REF vs. insert sizes of reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
+ *
+ * Caveats
+ *
+ * - Uninformative reads are not used in these calculations.
+ * - The insert size rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
+ * - This is an experimental annotation and as such it is unsupported. Use at your own risk.
+ *
*
* */
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MQMateRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MQMateRankSumTest.java
index 67a9cc843..acbb6f6f7 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MQMateRankSumTest.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MQMateRankSumTest.java
@@ -75,7 +75,7 @@ import java.util.Map;
* This annotation can be used to evaluate confidence in a variant call and could be used as a covariate for variant recalibration (VQSR). Finding a statistically significant difference in quality either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants.
*
* Statistical notes
- * The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities of the read's mate See the method document on statistical tests for a more detailed explanation of the ranksum test.
+ * The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities of the read's mate See the method document on statistical tests for a more detailed explanation of the ranksum test.
*
* * Caveats
* - The mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
@@ -85,6 +85,8 @@ import java.util.Map;
* Related annotations
*
* - AS_MappingQualityRankSumTest outputs the same rank sum test on the mapping quality of the reads themselves rather than their mates.
+ * - MappingQualityRankSumTest outputs a version of the above mapping quality ranksum test annotation that includes all alternate alleles in a single calculation.
+ * - RMSMappingQuality gives an estimation of the overall read mapping quality supporting a variant call.
*
*/
public class AS_MQMateRankSumTest extends AS_RankSumTest implements BetaTestingAnnotation {
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MappingQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MappingQualityRankSumTest.java
index 9b6d33003..66aa5f1ee 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MappingQualityRankSumTest.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MappingQualityRankSumTest.java
@@ -76,7 +76,7 @@ import java.util.Map;
* Finding a statistically significant difference in quality either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants.
*
*
Statistical notes
- * The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities (MAPQ of reads supporting REF vs. MAPQ of reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
+ * The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities (MAPQ of reads supporting REF vs. MAPQ of reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
*
* Caveats
* - The mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
@@ -86,6 +86,7 @@ import java.util.Map;
* Related annotations
*
* - MappingQualityRankSumTest outputs a version of this annotation that includes all alternate alleles in a single calculation.
+ * - AS_MQMateRankSumTest outputs the same allele-specific rank sum test on the mapping quality of the reads' mates rather than the reads themselves.
* - RMSMappingQuality gives an estimation of the overal read mapping quality supporting a variant call.
*
*
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java
index 1f15d17b5..70a13c52d 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java
@@ -72,7 +72,9 @@ import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.*;
/**
- * Allele-specific implementation of rank sum test annotations
+ * Allele-specific implementation of rank sum test annotations.
+ * The RankSumTest concept is documented at https://software.broadinstitute.org/gatk/documentation/article?id=8031
+ *
*/
public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnnotation {
private final static Logger logger = Logger.getLogger(AS_RMSAnnotation.class);
@@ -277,7 +279,8 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
final GATKSAMRecord read = el.getKey();
if ( isUsableRead(read, refLoc) ) {
final Double value = getElementForRead(read, refLoc, a);
- if ( value == null )
+ // Bypass read if the clipping goal is not reached or the refloc is inside a spanning deletion
+ if ( value == null || value == INVALID_ELEMENT_FROM_READ )
continue;
if(perAlleleValues.containsKey(a.getMostLikelyAllele()))
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java
index 2bd2eb9fd..7d377f9df 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java
@@ -74,7 +74,7 @@ import java.util.List;
* This annotation can be used to evaluate confidence in a variant call and is a recommended covariate for variant recalibration (VQSR). Finding a statistically significant difference in relative position either way suggests that the sequencing process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants.
*
* Statistical notes
- * The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for site position within reads (position within reads supporting REF vs. position within reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
+ * The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for site position within reads (position within reads supporting REF vs. position within reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
*
* Caveat
*
@@ -102,6 +102,11 @@ public class AS_ReadPosRankSumTest extends AS_RankSumTest implements AS_Standard
if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED )
return null;
+ // If the offset inside a deletion, it does not lie on a read.
+ if ( AlignmentUtils.isInsideDeletion(read.getCigar(), offset) ) {
+ return INVALID_ELEMENT_FROM_READ;
+ }
+
int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), offset, false, 0, 0);
final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read );
if (readPos > numAlignedBases / 2)
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCountsBySample.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCountsBySample.java
index 13e85e62d..c37b3b650 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCountsBySample.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCountsBySample.java
@@ -60,15 +60,14 @@ import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompa
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
-import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
-import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
+import org.broadinstitute.gatk.utils.sam.AlignmentUtils;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
-import org.broadinstitute.gatk.utils.BaseUtils;
import java.util.*;
+import java.util.stream.Collectors;
/**
* Count of A, C, G, T bases for each sample
@@ -110,8 +109,9 @@ public class BaseCountsBySample extends GenotypeAnnotation {
final GenotypeBuilder gb,
final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
- if ( alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty() )
- gb.attribute(GATKVCFConstants.BASE_COUNTS_BY_SAMPLE_KEY, getBaseCounts(alleleLikelihoodMap, vc));
+ if ( alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty() ) {
+ gb.attribute(GATKVCFConstants.BASE_COUNTS_BY_SAMPLE_KEY, Arrays.stream(getBaseCounts(alleleLikelihoodMap, vc)).boxed().collect(Collectors.toList()));
+ }
}
@Override
@@ -123,31 +123,15 @@ public class BaseCountsBySample extends GenotypeAnnotation {
}
/**
- * Base counts given for the most likely allele
+ * Counts of observed bases at a genomic position e.g. {13,0,0,1} at chr1:100,000,000
*
* @param perReadAlleleLikelihoodMap for each read, the underlying alleles represented by an aligned read, and corresponding relative likelihood.
* @param vc variant context
* @return count of A, C, G, T bases
- * @throws IllegalStateException if alleles in vc are not in perReadAlleleLikelihoodMap
*/
private int[] getBaseCounts(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final VariantContext vc) {
final Set alleles = new HashSet<>(vc.getAlleles());
- // make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext
- if ( !perReadAlleleLikelihoodMap.getAllelesSet().containsAll(alleles) )
- throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + perReadAlleleLikelihoodMap.getAllelesSet());
-
- final int[] counts = new int[4];
- for ( final Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
- final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles);
- if (! a.isInformative() ) continue; // read is non-informative
- for (final byte base : el.getKey().getReadBases() ){
- int index = BaseUtils.simpleBaseToBaseIndex(base);
- if ( index != -1 )
- counts[index]++;
- }
- }
-
- return counts;
+ return AlignmentUtils.countBasesAtPileupPosition(perReadAlleleLikelihoodMap, alleles, vc.getStart());
}
}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java
index dfea75fa6..66fdf7d9f 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java
@@ -70,7 +70,7 @@ import java.util.*;
* The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.
*
* Statistical notes
- * The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
+ * The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
*
* Caveats
*
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java
index c45ec4766..daa240b2f 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java
@@ -66,7 +66,7 @@ import java.util.*;
* This variant-level annotation tests whether the data supporting the reference allele shows more or less base clipping (hard clips) than those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have more hard-clipped bases than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have fewer hard-clipped bases than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact.
*
* Statistical notes
- * The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test applied to base clips (number of hard-clipped bases on reads supporting REF vs. number of hard-clipped bases on reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
+ * The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test applied to base clips (number of hard-clipped bases on reads supporting REF vs. number of hard-clipped bases on reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
*
* Caveat
* The clipping rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java
index 063e63560..b4dd52447 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java
@@ -139,8 +139,9 @@ public class ExcessHet extends InfoFieldAnnotation implements StandardAnnotation
double pval = exactTest(genotypeCounts);
//If the actual phredPval would be infinity we will probably still filter out just a very large number
+ //Since the method does not guarantee precision for any p-value smaller than 1e-16, we can return the phred scaled version
if (pval == 0) {
- return Integer.MAX_VALUE;
+ return -10.0 * Math.log10(minNeededValue);
}
double phredPval = -10.0 * Math.log10(pval);
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java
index d737f3d8b..5cfddd1da 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java
@@ -75,7 +75,7 @@ import java.util.*;
* The output is a Phred-scaled p-value. The higher the output value, the more likely there is to be bias. More bias is indicative of false positive calls.
*
* Statistical notes
- * See the method document on statistical tests for a more detailed explanation of this application of Fisher's Exact Test.
+ * See the method document on statistical tests for a more detailed explanation of this application of Fisher's Exact Test.
*
* Caveats
*
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java
index f8be1c044..862846415 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java
@@ -76,7 +76,7 @@ import java.util.*;
* This annotation estimates whether there is evidence of inbreeding in a population. The higher the score, the higher the chance that there is inbreeding.
*
* Statistical notes
- * The calculation is a continuous generalization of the Hardy-Weinberg test for disequilibrium that works well with limited coverage per sample. The output is the F statistic from running the HW test for disequilibrium with PL values. See the method document on statistical tests for a more detailed explanation of this statistical test.
+ * The calculation is a continuous generalization of the Hardy-Weinberg test for disequilibrium that works well with limited coverage per sample. The output is the F statistic from running the HW test for disequilibrium with PL values. See the method document on statistical tests for a more detailed explanation of this statistical test.
*
* Caveats
*
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LikelihoodRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LikelihoodRankSumTest.java
index e4ac83eb8..dd6ea1407 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LikelihoodRankSumTest.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LikelihoodRankSumTest.java
@@ -66,7 +66,7 @@ import java.util.List;
* This variant-level annotation compares the likelihoods of reads to their best haplotype match, between reads that support the reference allele and those that support the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have lower likelihoods to their best haplotype match than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have higher likelihoods to their best haplotype match than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact.
*
* Statistical notes
- * The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for per-read likelihoods to the best haplotype match (likelihoods of reads supporting REF vs. likelihoods of reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
+ * The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for per-read likelihoods to the best haplotype match (likelihoods of reads supporting REF vs. likelihoods of reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
*
* Caveat
* The read position rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java
index 7a8554eea..6fd308e5c 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java
@@ -68,7 +68,7 @@ import java.util.*;
* This annotation can be used to evaluate confidence in a variant call and is a recommended covariate for variant recalibration (VQSR). Finding a statistically significant difference in quality either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants.
*
*
Statistical notes
- * The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities (MAPQ of reads supporting REF vs. MAPQ of reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
+ * The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities (MAPQ of reads supporting REF vs. MAPQ of reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
*
* Caveats
* - The mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
@@ -78,6 +78,7 @@ import java.util.*;
* Related annotations
*
* - AS_MappingQualityRankSumTest outputs an allele-specific version of this annotation.
+ * - AS_MQMateRankSumTest outputs the allele-specific rank sum test on the mapping quality of the reads' mates rather than the reads themselves.
* - RMSMappingQuality gives an estimation of the overal read mapping quality supporting a variant call.
*
*
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java
index cca58bd66..c4e1cd376 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java
@@ -71,11 +71,13 @@ import java.util.*;
/**
- * Abstract root for all RankSum-based annotations
+ * Abstract root for all RankSum-based annotations.
+ * The RankSumTest concept is documented at https://software.broadinstitute.org/gatk/documentation/article?id=8031
*/
//TODO: will eventually implement ReducibleAnnotation in order to preserve accuracy for CombineGVCFs and GenotypeGVCFs -- see RMSAnnotation.java for an example of an abstract ReducibleAnnotation
public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
static final boolean DEBUG = false;
+ protected static double INVALID_ELEMENT_FROM_READ = Double.NEGATIVE_INFINITY;
public Map annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
@@ -86,11 +88,11 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
// either stratifiedContexts or stratifiedPerReadAlleleLikelihoodMap has to be non-null
final GenotypesContext genotypes = vc.getGenotypes();
- if (genotypes == null || genotypes.size() == 0)
+ if (genotypes == null || genotypes.isEmpty())
return null;
- final ArrayList refQuals = new ArrayList<>();
- final ArrayList altQuals = new ArrayList<>();
+ final List refQuals = new ArrayList<>();
+ final List altQuals = new ArrayList<>();
for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) {
@@ -183,7 +185,8 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
final GATKSAMRecord read = el.getKey();
if ( isUsableRead(read, refLoc) ) {
final Double value = getElementForRead(read, refLoc, a);
- if ( value == null )
+ // Bypass read if the clipping goal is not reached or the refloc is inside a spanning deletion
+ if ( value == null || value == INVALID_ELEMENT_FROM_READ )
continue;
if ( a.getMostLikelyAllele().isReference() )
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java
index 51890fc33..d136a6d70 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java
@@ -74,7 +74,7 @@ import java.util.*;
* This annotation can be used to evaluate confidence in a variant call and is a recommended covariate for variant recalibration (VQSR). Finding a statistically significant difference in relative position either way suggests that the sequencing process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants.
*
* Statistical notes
- * The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for site position within reads (position within reads supporting REF vs. position within reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
+ * The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for site position within reads (position within reads supporting REF vs. position within reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
*
* Caveat
*
@@ -104,6 +104,11 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED )
return null;
+ // If the offset inside a deletion, it does not lie on a read.
+ if ( AlignmentUtils.isInsideDeletion(read.getCigar(), offset) ) {
+ return INVALID_ELEMENT_FROM_READ;
+ }
+
int readPos = AlignmentUtils.calcAlignmentByteArrayOffset( read.getCigar(), offset, false, 0, 0 );
final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read );
if (readPos > numAlignedBases / 2)
@@ -124,7 +129,7 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
@Override
protected boolean isUsableRead(final GATKSAMRecord read, final int refLoc) {
- return super.isUsableRead(read, refLoc) && read.getSoftStart() + read.getCigar().getReadLength() > refLoc;
+ return super.isUsableRead(read, refLoc) && read.getSoftEnd() >= refLoc;
}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTableUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTableUtils.java
index 9505bcacc..7768d2e6d 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTableUtils.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTableUtils.java
@@ -51,9 +51,9 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
-import cern.jet.math.Arithmetic;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.QualityUtils;
+import org.apache.commons.math3.distribution.HypergeometricDistribution;
import java.util.ArrayList;
import java.util.List;
@@ -83,22 +83,51 @@ public class StrandBiasTableUtils {
int[][] table = copyContingencyTable(normalizedTable);
- double pCutoff = computePValue(table);
+ int[] rowSums = { sumRow(table, 0), sumRow(table, 1) };
+ int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) };
+ int N = rowSums[0] + rowSums[1];
+ int sampleSize = colSums[0];
+ int numberOfNonSuccesses = rowSums[1];
+ int numberOfSuccesses = rowSums[0];
+ /*
+ * The lowest possible number of successes you can sample is what's left of your sample size after
+ * drawing every non success in the urn. If the number of non successes in the urn is greater than the sample
+ * size then the minimum number of drawn successes is 0.
+ */
+ int lo = Math.max(0, sampleSize - numberOfNonSuccesses);
+ /*
+ * The highest possible number of successes you can draw is either the total sample size or the number of
+ * successes in the urn. (Whichever is smaller)
+ */
+ int hi = Math.min(sampleSize, numberOfSuccesses);
- double pValue = pCutoff;
- while (rotateTable(table)) {
- double pValuePiece = computePValue(table);
-
- if (pValuePiece <= pCutoff) {
- pValue += pValuePiece;
- }
+ /**
+ * If the support of the distribution is only one value, creating the HypergeometricDistribution
+ * doesn't make sense. There would be only one possible observation so the p-value has to be 1.
+ */
+ if (lo == hi) {
+ return 1.0;
}
- table = copyContingencyTable(normalizedTable);
- while (unrotateTable(table)) {
- double pValuePiece = computePValue(table);
+ /**
+ * For the hypergeometric distribution from which to calculate the probabilities:
+ * The population (N = a+b+c+d) is the sum of all four numbers in the contingency table. Then the number of
+ * "successes" (K = a+b) is the sum of the top row, and the sample size (n = a+c) is the sum of the first column.
+ */
+ final HypergeometricDistribution dist = new HypergeometricDistribution(N, numberOfSuccesses, sampleSize);
- if (pValuePiece <= pCutoff) {
+ //Then we determine a given probability with the sampled successes (k = a) from the first entry in the table.
+ double pCutoff = dist.probability(table[0][0]);
+
+ double pValue = 0.0;
+ /**
+ * Now cycle through all possible k's and add those whose probabilities are smaller than our observed k
+ * to the p-value, since this is a two-sided test
+ */
+ for(int i = lo; i <= hi; i++){
+ double pValuePiece = dist.probability(i);
+
+ if(pValuePiece <= pCutoff) {
pValue += pValuePiece;
}
}
@@ -190,45 +219,6 @@ public class StrandBiasTableUtils {
return c;
}
- protected static boolean rotateTable(int[][] table) {
- table[0][0]--;
- table[1][0]++;
-
- table[0][1]++;
- table[1][1]--;
-
- return (table[0][0] >= 0 && table[1][1] >= 0);
- }
-
- protected static boolean unrotateTable(int[][] table) {
- table[0][0]++;
- table[1][0]--;
-
- table[0][1]--;
- table[1][1]++;
-
- return (table[0][1] >= 0 && table[1][0] >= 0);
- }
-
- protected static double computePValue(int[][] table) {
-
- int[] rowSums = { sumRow(table, 0), sumRow(table, 1) };
- int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) };
- int N = rowSums[0] + rowSums[1];
-
- // calculate in log space for better precision
- double pCutoff = Arithmetic.logFactorial(rowSums[0])
- + Arithmetic.logFactorial(rowSums[1])
- + Arithmetic.logFactorial(colSums[0])
- + Arithmetic.logFactorial(colSums[1])
- - Arithmetic.logFactorial(table[0][0])
- - Arithmetic.logFactorial(table[0][1])
- - Arithmetic.logFactorial(table[1][0])
- - Arithmetic.logFactorial(table[1][1])
- - Arithmetic.logFactorial(N);
- return Math.exp(pCutoff);
- }
-
private static int sumRow(int[][] table, int column) {
int sum = 0;
for (int r = 0; r < table.length; r++) {
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTest.java
index d4d2ab02b..4597d359e 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTest.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTest.java
@@ -241,7 +241,7 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation implements Acti
/**
Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this:
- * fw rc
+ * fw rv
* allele1 # #
* allele2 # #
* @return a 2x2 contingency table
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java
index ba2011215..3a7336549 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java
@@ -77,24 +77,22 @@ import java.util.*;
*
* and its inverse:
*
- *
+ *
* | | + strand | - strand |
* | REF; | X[0][0] | X[0][1] |
* | ALT; | X[1][0] | X[1][1] |
*
- *
+ *
* The sum R + 1/R is used to detect a difference in strand bias for REF and for ALT (the sum makes it symmetric). A high value is indicative of large difference where one entry is very small compared to the others. A scale factor of refRatio/altRatio where
*
- * $$ refRatio = \frac{max(X[0][0], X[0][1])}{min(X[0][0], X[0][1} $$
+ * $$ refRatio = \frac{min(X[0][0], X[0][1])}{max(X[0][0], X[0][1} $$
*
* and
*
- * $$ altRatio = \frac{max(X[1][0], X[1][1])}{min(X[1][0], X[1][1]} $$
+ * $$ altRatio = \frac{min(X[1][0], X[1][1])}{max(X[1][0], X[1][1]} $$
*
* ensures that the annotation value is large only.
*
- * See the method document on statistical tests for a more detailed explanation of this statistical test.
- *
* Caveat
*
* The name SOR is not entirely appropriate because the implementation was changed somewhere between the start of development and release of this annotation. Now SOR isn't really an odds ratio anymore. The goal was to separate certain cases of data without penalizing variants that occur at the ends of exons because they tend to only be covered by reads in one direction (depending on which end of the exon they're on), so if a variant has 10 ref reads in the + direction, 1 ref read in the - direction, 9 alt reads in the + direction and 2 alt reads in the - direction, it's actually not strand biased, but the FS score is pretty bad. The implementation that resulted derived in part from empirically testing some read count tables of various sizes with various ratios and deciding from there.
@@ -153,6 +151,7 @@ public class StrandOddsRatio extends StrandBiasTest implements StandardAnnotatio
double ratio = 0;
ratio += (augmentedTable[0][0] / augmentedTable[0][1]) * (augmentedTable[1][1] / augmentedTable[1][0]);
+ // TODO: repeated computation: how about ratio += 1/ratio, or ratio = ratio + 1/ratio to be expicit
ratio += (augmentedTable[0][1] / augmentedTable[0][0]) * (augmentedTable[1][0] / augmentedTable[1][1]);
final double refRatio = (Math.min(augmentedTable[0][0], augmentedTable[0][1])/Math.max(augmentedTable[0][0], augmentedTable[0][1]));
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java
index 4205ed2d2..4b621b41d 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java
@@ -51,9 +51,9 @@
package org.broadinstitute.gatk.tools.walkers.bqsr;
-import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.tribble.Feature;
import org.broadinstitute.gatk.engine.recalibration.*;
import org.broadinstitute.gatk.engine.walkers.*;
@@ -194,7 +194,7 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche
private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to mask out known variant sites. Please provide a VCF file containing known sites of genetic variation.";
private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector
- private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation
+ private ReferenceSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation
private final static byte NO_BAQ_UNCERTAINTY = (byte)'@';
/**
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/ClusteredReadPosition.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/ClusteredReadPosition.java
new file mode 100644
index 000000000..01fb11c69
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/ClusteredReadPosition.java
@@ -0,0 +1,350 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 ("BROAD") and the LICENSEE and is effective at the date the downloading is completed ("EFFECTIVE DATE").
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system ("PHONE-HOME") which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE'S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2016 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.tools.walkers.cancer;
+
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.vcf.VCFInfoHeaderLine;
+import org.apache.commons.math3.stat.descriptive.rank.Median;
+import org.apache.log4j.Logger;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
+import org.broadinstitute.gatk.tools.walkers.cancer.m2.MuTect2;
+import org.broadinstitute.gatk.utils.QualityUtils;
+import org.broadinstitute.gatk.utils.collections.Pair;
+import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
+import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
+import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
+import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
+import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
+import org.broadinstitute.gatk.utils.sam.ReadUtils;
+import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
+import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Detect clustering of variants near the ends of reads
+ *
+ * This annotation detects clustering of evidence for a somatic variant near the ends of reads. To turn on the annotation and the accompanying filter (clustered_read_position), add --enable_clustered_read_position_filter flag in the commandline.
+ *
+ *
+ *
Statistical notes
+ * ClusteredReadPosition produces four INFO field annotations. At a given somatic variant site, MEDIAN_LEFT_OFFSET is the median of the number of bases from the left end of the tumor read to the variant. MEDIAN_RIGHT_OFFSET is similar, but counts from the right end of the read. MAD_LEFT_OFFSET and MAD_RIGHT_OFFSET measure the median absolute deviations. The median gives us the offset of a representative read, while the median absolute deviation captures the spread. We filter a variant if MEDIAN_LEFT_OFFSET <= 10 and MAD_LEFT_OFFSET <= 3, or if MEDIAN_RIGHT_OFFSET <= 10 and MAD_RIGHT_OFFSET <= 3.
+ *
+ *
+ *
Caveat
+ * ClusteredReadPosition is available with MuTect2 only
+ *
+ * RelatedAnnotation
+ * - ReadPosRankSum is a similar annotation designed for germline variants.
+ *
+ */
+public class ClusteredReadPosition extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
+ private final static Logger logger = Logger.getLogger(ClusteredReadPosition.class);
+ private String tumorSampleName = null;
+
+ @Override
+ public List getKeyNames() { return Arrays.asList(
+ GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY,
+ GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY,
+ GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY,
+ GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY);
+ }
+
+ @Override
+ public List getDescriptions() {
+ List descriptions = new ArrayList<>();
+ for (final String infoFieldKey : getKeyNames()){
+ descriptions.add(GATKVCFHeaderLines.getInfoLine(infoFieldKey));
+ }
+ return descriptions;
+
+ // the following causes a cryptic class not found error, similar to the one in computeReadPositionStats
+ // return getKeyNames().stream().map(GATKVCFHeaderLines::getInfoLine).collect(Collectors.toList());
+ }
+
+ @Override
+ public Map annotate(final RefMetaDataTracker tracker,
+ final AnnotatorCompatible walker,
+ final ReferenceContext ref,
+ final Map stratifiedContexts,
+ final VariantContext vc,
+ final Map stratifiedPerReadAlleleLikelihoodMap) {
+ // TODO: might make sense to move this code to SomaticGenoypingEngine.
+ // FIXME: checking walker is mutect2 is not ideal...moving this annotation to SomaticGenoypingEngine will solve it
+
+ // populate tumorSampleName the first time we call this method. skip afterwards.
+ if (tumorSampleName == null){
+ if (walker instanceof MuTect2) {
+ tumorSampleName = ((MuTect2) walker).getTumorSampleName();
+ } else {
+ throw new IllegalStateException("ClusteredReadPosition: walker is not MuTect2");
+ }
+ }
+
+ // we skip multi-allelic sites
+ if (vc.getAlternateAlleles().size() > 1){
+ return null;
+ }
+
+ final Map result = new HashMap<>();
+
+ if ( stratifiedPerReadAlleleLikelihoodMap != null ) {
+ final PerReadAlleleLikelihoodMap likelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(tumorSampleName);
+ if ( likelihoodMap != null && !likelihoodMap.isEmpty() ) {
+ final Optional readPositionStatsOption = computeReadPositionStats(vc, likelihoodMap);
+ if (readPositionStatsOption.isPresent()){
+ MedianStatistics readPositionStats = readPositionStatsOption.get();
+ result.put(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY, readPositionStats.getLeftMedian());
+ result.put(GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY, readPositionStats.getRightMedian());
+ result.put(GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY, readPositionStats.getLeftMAD());
+ result.put(GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY, readPositionStats.getRightMAD());
+ } else {
+ return null;
+ }
+ }
+ }
+
+ return result;
+ }
+
+ /**
+ *
+ * @param vc
+ * @param pralm
+ * @return median of left and right offsets and their median absolute deviations. does not return null.
+ */
+ private Optional computeReadPositionStats(final VariantContext vc,
+ final PerReadAlleleLikelihoodMap pralm) {
+ final int variantStartPosition = vc.getStart();
+ final List tumorLeftOffsets = new ArrayList<>();
+ final List tumorRightOffsets = new ArrayList<>();
+ for ( final Map.Entry> readAlleleLikelihood : pralm.getLikelihoodReadMap().entrySet() ) {
+ final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(readAlleleLikelihood.getValue());
+ final GATKSAMRecord read = readAlleleLikelihood.getKey();
+ if ( mostLikelyAllele.getMostLikelyAllele().isReference() || ! mostLikelyAllele.isInformative() || ! isUsableRead(read)) {
+ continue;
+ }
+
+ final Pair offsetPair = getVariantPositionInRead(read, variantStartPosition);
+ final OptionalInt variantPositionInReadFromLeft = offsetPair.getFirst();
+ final OptionalInt variantPositionInReadFromRight = offsetPair.getSecond();
+
+ // suffices to check only the left offset because the right offset depends on it
+ if ( variantPositionInReadFromLeft.isPresent() ) {
+ tumorLeftOffsets.add(variantPositionInReadFromLeft.getAsInt());
+ tumorRightOffsets.add(variantPositionInReadFromRight.getAsInt());
+ }
+ }
+
+ if (tumorLeftOffsets.isEmpty() || tumorRightOffsets.isEmpty()) {
+ // This condition seems to arise when the reads as aligned in the bam (as represented by PRALM) do not contain the alt read found by HaplotypeCaller
+ logger.warn("At Position " + vc.getContig() + ": " + vc.getStart() + " , the left or right offset list is empty");
+ return Optional.empty();
+ }
+
+ // The following (mapToDouble() in particular) causes ClusteredReadPosition to be not added to ClassMap
+ // leftMedian = median.evaluate(tumorLeftOffsets.stream().mapToDouble( x -> x ).toArray());
+ // rightMedian = median.evaluate(tumorRightOffsets.stream().mapToDouble( x -> x).toArray());
+
+ // until we understand why mapToDouble() causes the above error, have to compute medians in two steps
+ // first use a for loop to manually cast integer to doubles, then call median :: evaluate
+ double[] tumorLeftOffsetsDouble = new double[tumorLeftOffsets.size()];
+ double[] tumorRightOffsetsDouble = new double[tumorRightOffsets.size()];
+ for (int i = 0; i < tumorLeftOffsets.size(); i++){
+ tumorLeftOffsetsDouble[i] = (double) tumorLeftOffsets.get(i);
+ tumorRightOffsetsDouble[i] = (double) tumorRightOffsets.get(i);
+ }
+
+ Median median = new Median();
+ double leftMedian = median.evaluate(tumorLeftOffsetsDouble);
+ double rightMedian = median.evaluate(tumorRightOffsetsDouble);
+ double leftMAD = calculateMAD(tumorLeftOffsets, leftMedian);
+ double rightMAD = calculateMAD(tumorRightOffsets, rightMedian);
+
+ return( Optional.of(new MedianStatistics(leftMedian, rightMedian, leftMAD, rightMAD) ) );
+ }
+
+ private static class MedianStatistics {
+ private double leftMedian;
+ private double rightMedian;
+ private double leftMAD;
+ private double rightMAD;
+
+ public MedianStatistics(double leftMedian, double rightMedian, double leftMAD, double rightMAD) {
+ this.leftMedian = leftMedian;
+ this.rightMedian = rightMedian;
+ this.leftMAD = leftMAD;
+ this.rightMAD = rightMAD;
+ }
+
+ public double getLeftMedian() {
+ return leftMedian;
+ }
+
+ public double getRightMedian() {
+ return rightMedian;
+ }
+
+ public double getLeftMAD() {
+ return leftMAD;
+ }
+
+ public double getRightMAD() {
+ return rightMAD;
+ }
+ }
+
+
+ /**
+ Examples below show how we compute the position of the variant with respect to the left and right end of the reads.
+ Note that a variant may be SNP, deletion, or insertion, and we are counting the number of bases from the left/right end of the read to that variant.
+ We first compute the left offset. Then, right offset = read length - left offset.
+ This means that if there is an insertion between the either end of a read and the variant, we count the inserted bases. Conversely, we do not count the deleted bases between the end of a read and a variant.
+ We count soft-clipped bases.
+
+ example 1 : SNP
+
+ right offset: 9 8 7 6 5 4 3 2 1 0
+ ref: _ _ _ _ _ _ _ _ _ _
+ read: _ _ _ _ x _ _ _ _ _
+ left offset: 0 1 2 3 4 5 6 7 8 9
+
+ left-offset = 4. right offset = 5.
+ read.getReadLength() = 10. numReadBasesToVariant = 5.
+
+ example 2: deletion
+
+ We count from the left end of the read to the last non-deleted base i.e. the first deleted base is not counted.
+ From the right end, we count bases to the *end* of the deletion.
+
+ right offset: 9 8 7 6 5 4 3 2 1 0
+ ref: _ _ _ _ _ _ _ _ _ _
+ read: _ _ _ _|- - - -|_ _
+ left offset: 0 1 2 3 4 5 6 7 8 9
+
+ left-offset = 3. right-offset = 2.
+ read.getReadLength() = 6. numReadBasesToVariant = 4
+
+ example 3: insertion
+
+ For insertions, we count from the left to the first inserted base. From the right, we count all the way to the first inserted base.
+ In the future, we may modify this; it might be desirable to count from the right to the *last* inserted base.
+
+ right offset: 9 8 7 6 5 4 3 2 1 0
+ ref: _ _ _ _ _ _ _ _
+ read: _ _ _ I I I _ _ _ _
+ left offset: 0 1 2 3 4 5 6 7 8 9
+
+ left-offset = 3. right offset = 6
+ read.getReadLength() = 10. numReadBasesToVariant = 4.
+
+ */
+
+ /**
+ * The function assumes that read contains the variant allele.
+ *
+ * @param read
+ * @param variantStartPosition the location of the variant in the reference
+ * @return
+ */
+
+ protected Pair getVariantPositionInRead(final GATKSAMRecord read, final int variantStartPosition) {
+ final Pair refPositionAndDeletionFlag = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), variantStartPosition, true);
+ // the +1 is needed there because getReadCoordinateForReferenceCoordinate() returns the number of read bases from the left end to the variant - 1
+ int numReadBasesFromLeftEndToVariant = refPositionAndDeletionFlag.getFirst() + 1;
+
+ // we don't take advantage of fallsInsideOrJustBeforeDeletionOrSkippedRegion flag now, but we might want to, so I will leave it here in comments.
+ // boolean fallsInsideOrJustBeforeDeletionOrSkippedRegion = refPositionAndDeletionFlag.getSecond();
+
+ if ( numReadBasesFromLeftEndToVariant == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
+ return new Pair(OptionalInt.empty(), OptionalInt.empty());
+ } else {
+ int leftOffset = numReadBasesFromLeftEndToVariant - 1;
+ int rightOffset = read.getReadLength() - numReadBasesFromLeftEndToVariant;
+ return new Pair(OptionalInt.of(leftOffset), OptionalInt.of(rightOffset));
+ }
+ }
+
+ /**
+ * Can the read be used in comparative tests between ref / alt bases?
+ *
+ * @param read the read to consider
+ * @return false if MQ is either 0 or unavailable. true otherwise.
+ */
+ private boolean isUsableRead(final GATKSAMRecord read) {
+ return( read.getMappingQuality() != 0 || read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE);
+ }
+
+ /**
+ *
+ * @param offsets a list of integers
+ * @param median median of the list offsets.
+ * @return median absolute deviation (median of the list of deviations from the median)
+ */
+ private double calculateMAD(final List offsets, final double median) {
+ // This code is concise but somehow leads to ClusteredReadPosition class being removed from ClassMap.
+ // mapToDouble() seems to be the trigger
+ // return new Median().evaluate(offsets.stream().mapToDouble(x -> Math.abs(x - median)).toArray());
+
+ double[] medianAbsoluteDeviations = new double[offsets.size()];
+ for (int i = 0; i < offsets.size(); i++){
+ medianAbsoluteDeviations[i] = Math.abs(offsets.get(i) - median);
+ }
+
+ return new Median().evaluate(medianAbsoluteDeviations);
+ }
+}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/create_M2_pon.scala b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/MedianStatistics.java
similarity index 76%
rename from protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/create_M2_pon.scala
rename to protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/MedianStatistics.java
index f3ec63163..0a88c28de 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/create_M2_pon.scala
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/MedianStatistics.java
@@ -49,92 +49,37 @@
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
-package org.broadinstitute.gatk.tools.walkers.cancer.m2
+package org.broadinstitute.gatk.tools.walkers.cancer;
-import java.io.File
+/**
+ * Created by tsato on 6/27/16.
+ */
+public class MedianStatistics {
+ private double leftMedian;
+ private double rightMedian;
+ private double leftMAD;
+ private double rightMAD;
-import org.broadinstitute.gatk.queue.QScript
-import org.broadinstitute.gatk.queue.extensions.gatk._
-import org.broadinstitute.gatk.queue.function.CommandLineFunction
-import org.broadinstitute.gatk.queue.util.QScriptUtils
-import org.broadinstitute.gatk.utils.commandline.{Input, Output}
-import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils.FilteredRecordMergeType
-
-import scala.collection.mutable.ListBuffer
-
-class create_M2_pon extends QScript {
-
- @Argument(shortName = "bams", required = true, doc = "file of all BAM files")
- var allBams: String = ""
-
- @Argument(shortName = "o", required = true, doc = "Output prefix")
- var outputPrefix: String = ""
-
- @Argument(shortName = "minN", required = false, doc = "minimum number of sample observations to include in PON")
- var minN: Int = 2
-
- @Argument(doc="Reference fasta file to process with", fullName="reference", shortName="R", required=false)
- var reference = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
-
- @Argument(doc="Intervals file to process with", fullName="intervals", shortName="L", required=true)
- var intervals : File = ""
-
- @Argument(shortName = "sc", required = false, doc = "base scatter count")
- var scatter: Int = 10
-
-
- def script() {
- val bams = QScriptUtils.createSeqFromFile(allBams)
- val genotypesVcf = outputPrefix + ".genotypes.vcf"
- val finalVcf = outputPrefix + ".vcf"
-
- val perSampleVcfs = new ListBuffer[File]()
- for (bam <- bams) {
- val outputVcf = "sample-vcfs/" + bam.getName + ".vcf"
- add( createM2Config(bam, outputVcf))
- perSampleVcfs += outputVcf
+ public MedianStatistics(double leftMedian, double rightMedian, double leftMAD, double rightMAD) {
+ this.leftMedian = leftMedian;
+ this.rightMedian = rightMedian;
+ this.leftMAD = leftMAD;
+ this.rightMAD = rightMAD;
}
- val cv = new CombineVariants()
- cv.reference_sequence = reference
- cv.memoryLimit = 2
- cv.setKey = "null"
- cv.minimumN = minN
- cv.memoryLimit = 16
- cv.filteredrecordsmergetype = FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED
- cv.filteredAreUncalled = true
- cv.variant = perSampleVcfs
- cv.out = genotypesVcf
+ public double getLeftMedian() {
+ return leftMedian;
+ }
- // using this instead of "sites_only" because we want to keep the AC info
- val vc = new VcfCutter()
- vc.inVcf = genotypesVcf
- vc.outVcf = finalVcf
+ public double getRightMedian() {
+ return rightMedian;
+ }
- add (cv, vc)
+ public double getLeftMAD() {
+ return leftMAD;
+ }
- }
-
-
- def createM2Config(bam : File, outputVcf : File): org.broadinstitute.gatk.queue.extensions.gatk.MuTect2 = {
- val mutect2 = new org.broadinstitute.gatk.queue.extensions.gatk.MuTect2
-
- mutect2.reference_sequence = reference
- mutect2.artifact_detection_mode = true
- mutect2.intervalsString :+= intervals
- mutect2.memoryLimit = 2
- mutect2.input_file = List(new TaggedFile(bam, "tumor"))
-
- mutect2.scatterCount = scatter
- mutect2.out = outputVcf
-
- mutect2
- }
+ public double getRightMAD() {
+ return rightMAD;
+ }
}
-
-class VcfCutter extends CommandLineFunction {
- @Input(doc = "vcf to cut") var inVcf: File = _
- @Output(doc = "output vcf") var outVcf: File = _
-
- def commandLine = "cat %s | cut -f1-8 > %s".format(inVcf, outVcf)
-}
\ No newline at end of file
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/contamination/ContEst.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/contamination/ContEst.java
index a0639181f..4e33b4fa3 100755
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/contamination/ContEst.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/contamination/ContEst.java
@@ -197,29 +197,29 @@ public class ContEst extends RodWalker