Merge remote-tracking branch 'unstable/master'

This commit is contained in:
Geraldine Van der Auwera 2016-12-12 08:48:29 -05:00
commit 49c1a6c7bd
250 changed files with 7967 additions and 3552 deletions

View File

@ -13,7 +13,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-root</artifactId>
<version>3.6</version>
<version>3.7-SNAPSHOT</version>
<relativePath>public/gatk-root</relativePath>
</parent>

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
<version>3.6</version>
<version>3.7-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
<version>3.6</version>
<version>3.7-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
<version>3.6</version>
<version>3.7-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
<version>3.6</version>
<version>3.7-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>

View File

@ -54,6 +54,7 @@ package org.broadinstitute.gatk.engine.arguments;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator;
import org.broadinstitute.gatk.utils.commandline.Advanced;
import org.broadinstitute.gatk.utils.commandline.Argument;
import org.broadinstitute.gatk.utils.commandline.Hidden;
import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants;
import java.util.Collections;
@ -65,33 +66,23 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
public static final String MAX_ALTERNATE_ALLELES_SHORT_NAME = "maxAltAlleles";
/**
* Depending on the value of the --max_alternate_alleles argument, we may genotype only a fraction of the alleles being sent on for genotyping.
* Using this argument instructs the genotyper to annotate (in the INFO field) the number of alternate alleles that were originally discovered at the site.
* Depending on the value of the --max_alternate_alleles argument, we may genotype only a fraction of the alleles
* being sent on for genotyping. Using this argument instructs the genotyper to annotate (in the INFO field) the
* number of alternate alleles that were originally discovered (but not necessarily genotyped) at the site.
*/
@Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false)
@Argument(fullName = "annotateNDA", shortName = "nda", doc = "Annotate number of alleles observed", required = false)
public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false;
/**
* The expected heterozygosity value used to compute prior probability that a locus is non-reference.
*
* From the heterozygosity we calculate the probability of N samples being hom-ref at a site as 1 - sum_i_2N (hets / i)
* where hets is this case is analogous to the parameter theta from population genetics. See https://en.wikipedia.org/wiki/Coalescent_theory for more details.
*
* Note that heterozygosity as used here is the population genetics concept. (See http://en.wikipedia.org/wiki/Zygosity#Heterozygosity_in_population_genetics.
* We also suggest the book "Population Genetics: A Concise Guide" by John H. Gillespie for further details on the theory.) That is, a hets value of 0.001
* implies that two randomly chosen chromosomes from the population of organisms would differ from each other at a rate of 1 in 1000 bp.
*
* The default priors provided for humans (hets = 1e-3)
*
* Also note that this quantity has nothing to do with the likelihood of any given sample having a heterozygous genotype,
* which in the GATK is purely determined by the probability of the observed data P(D | AB) under the model that there
* may be a AB het genotype. The posterior probability of this AB genotype would use the het prior, but the GATK
* only uses this posterior probability in determining the prob. that a site is polymorphic. So changing the
* het parameters only increases the chance that a site will be called non-reference across all samples, but
* doesn't actually change the output genotype likelihoods at all, as these aren't posterior probabilities at all.
*
* The quantity that changes whether the GATK considers the possibility of a het genotype at all is the ploidy,
* which determines how many chromosomes each individual in the species carries.
* This activates a model for calculating QUAL that was introduced in version 3.7 (November 2016). We expect this
* model will become the default in future versions.
*/
@Argument(fullName = "useNewAFCalculator", shortName = "newQual", doc = "Use new AF model instead of the so-called exact model", required = false)
public boolean USE_NEW_AF_CALCULATOR = false;
/**
* The expected heterozygosity value used to compute prior probability that a locus is non-reference. See
* https://software.broadinstitute.org/gatk/documentation/article?id=8603 for more details.
*/
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
public Double snpHeterozygosity = HomoSapiensConstants.SNP_HETEROZYGOSITY;
@ -102,32 +93,67 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
@Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false)
public double indelHeterozygosity = HomoSapiensConstants.INDEL_HETEROZYGOSITY;
/**
* The standard deviation of the distribution of alt allele fractions. The above heterozygosity parameters give
* the *mean* of this distribution; this parameter gives its spread.
*/
@Argument(fullName = "heterozygosity_stdev", shortName = "heterozygosityStandardDeviation", doc = "Standard deviation of heterozygosity for SNP and indel calling.", required = false)
public double heterozygosityStandardDeviation = 0.01;
/**
* The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with
confidence >= this threshold are emitted as called sites. A reasonable threshold is 10 for high-pass calling (this
is the default).
*/
@Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be called", required = false)
public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0;
public double STANDARD_CONFIDENCE_FOR_CALLING = 10.0;
/**
* This argument allows you to emit low quality calls as filtered records.
*/
@Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be emitted (and filtered with LowQual if less than the calling threshold)", required = false)
@Hidden
@Deprecated
@Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf",
doc = "This argument is no longer used in GATK versions 3.7 and newer. Please see the online documentation for the latest usage recommendations.", required = false)
public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
/**
* If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN_ALLELES),
* then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
* scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
* that you not play around with this parameter.
* If there are more than this number of alternate alleles presented to the genotyper (either through discovery or
* GENOTYPE_GIVEN_ALLELES), then only this many alleles will be used. Note that genotyping sites with many
* alternate alleles is both CPU and memory intensive and it scales exponentially based on the number of alternate
* alleles. Unless there is a good reason to change the default value, we highly recommend that you not play around
* with this parameter.
*
* As of GATK 2.2 the genotyper can handle a very large number of events, so the default maximum has been increased to 6.
* See also {@link #MAX_GENOTYPE_COUNT}.
*/
@Advanced
@Argument(fullName = "max_alternate_alleles", shortName = MAX_ALTERNATE_ALLELES_SHORT_NAME, doc = "Maximum number of alternate alleles to genotype", required = false)
public int MAX_ALTERNATE_ALLELES = 6;
/**
* If there are more than this number of genotypes at a locus presented to the genotyper, then only this many
* genotypes will be used. This is intended to deal with sites where the combination of high ploidy and high alt
* allele count can lead to an explosion in the number of possible genotypes, with extreme adverse effects on
* runtime performance.
*
* How does it work? The possible genotypes are simply different ways of partitioning alleles given a specific
* ploidy assumption. Therefore, we remove genotypes from consideration by removing alternate alleles that are the
* least well supported. The estimate of allele support is based on the ranking of the candidate haplotypes coming
* out of the graph building step. Note however that the reference allele is always kept.
*
* The maximum number of alternative alleles used in the genotyping step will be the lesser of the two:
* 1. the largest number of alt alleles, given ploidy, that yields a genotype count no higher than {@link #MAX_GENOTYPE_COUNT}
* 2. the value of {@link #MAX_ALTERNATE_ALLELES}
*
* As noted above, genotyping sites with large genotype counts is both CPU and memory intensive. Unless you have
* a good reason to change the default value, we highly recommend that you not play around with this parameter.
*
* See also {@link #MAX_ALTERNATE_ALLELES}.
*/
@Advanced
@Argument(fullName = "max_genotype_count", shortName = "maxGT", doc = "Maximum number of genotypes to consider at any site", required = false)
public int MAX_GENOTYPE_COUNT = 1024;
/**
* Determines the maximum number of PL values that will be logged in the output. If the number of genotypes
* (which is determined by the ploidy and the number of alleles) exceeds the value provided by this argument,
@ -138,23 +164,19 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
public int MAX_NUM_PL_VALUES = AFCalculator.MAX_NUM_PL_VALUES_DEFAULT;
/**
* By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus, using an infinite sites model,
* see e.g. Waterson (1975) or Tajima (1996).
* This model asserts that the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for 1=1:N
* By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a
* particular locus, using an infinite sites model (see e.g. Waterson, 1975 or Tajima, 1996). This model asserts that
the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for k=1:N.
* However, there are instances where using this prior might not be desirable, e.g. for population studies where prior
* might not be appropriate, as for example when the ancestral status of the reference allele is not known.
*
* There are instances where using this prior might not be desirable, e.g. for population studies where prior might not be appropriate,
* as for example when the ancestral status of the reference allele is not known.
* By using this argument, the user can manually specify a list of probabilities for each AC>1 to be used as priors for genotyping,
* with the following restrictions:
* a) User must specify 2N values, where N is the number of samples.
* b) Only diploid calls supported.
* c) Probability values are specified in Double format, in linear space (not log10 space or Phred-scale).
* d) No negative values allowed.
* e) Values will be added and Pr(AC=0) will be 1-sum, so that they sum up to one.
* f) If user-defined values add to more than one, an error will be produced.
* This argument allows you to manually specify a list of probabilities for each AC>1 to be used as
* priors for genotyping, with the following restrictions: only diploid calls are supported; you must specify 2 *
* N values where N is the number of samples; probability values must be positive and specified in Double format,
in linear space (not log10 space nor Phred-scale); and all values must sum to at most 1 (Pr(AC=0) is set to 1 minus their sum).
*
* If user wants completely flat priors, then user should specify the same value (=1/(2*N+1)) 2*N times,e.g.
* -inputPrior 0.33 -inputPrior 0.33
* For completely flat priors, specify the same value (=1/(2*N+1)) 2*N times, e.g.
* -inputPrior 0.33 -inputPrior 0.33
* for the single-sample diploid case.
*/
@Advanced
@ -162,9 +184,10 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
public List<Double> inputPrior = Collections.emptyList();
/**
* Sample ploidy - equivalent to number of chromosomes per pool. In pooled experiments this should be = # of samples in pool * individual sample ploidy
* Sample ploidy - equivalent to number of chromosome copies per pool. For pooled experiments this should be set to
* the number of samples in pool multiplied by individual sample ploidy.
*/
@Argument(shortName="ploidy", fullName="sample_ploidy", doc="Ploidy (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false)
@Argument(shortName="ploidy", fullName="sample_ploidy", doc="Ploidy per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false)
public int samplePloidy = HomoSapiensConstants.DEFAULT_PLOIDY;
/**

View File

@ -53,6 +53,8 @@ package org.broadinstitute.gatk.tools;
import htsjdk.samtools.util.IOUtil;
import org.broadinstitute.gatk.engine.recalibration.BQSRGatherer;
import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
import org.broadinstitute.gatk.utils.help.HelpConstants;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
@ -97,10 +99,7 @@ import java.util.List;
*
*/
@CommandLineProgramProperties(
usage = "Gathers scattered BQSR recalibration reports into a single file",
usageShort = "Gathers scattered BQSR recalibration reports into a single file"
)
@DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_QC)
public class GatherBqsrReports extends CommandLineProgram {
@Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc="List of scattered BQSR files")
public List<File> INPUT;

View File

@ -71,7 +71,7 @@ import java.util.List;
* <p>The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.</p>
*
* <h3>Statistical notes</h3>
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=8031">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
*
* <h3>Caveats</h3>
* <ul>

View File

@ -63,13 +63,19 @@ import java.util.List;
/**
* Allele specific Rank Sum Test for insert sizes of REF versus ALT reads
*
* <p>
* This annotation tests whether the insert sizes of reads supporting the REF allele and ALT allele are roughly equal.
* In case of multiple alternate alleles, each alternate allele is considered separately.
* <p>This variant-level annotation compares the insert sizes of reads supporting the reference allele with those supporting each alternate allele. To be clear, it does so separately for each alternate allele.</p>
*
* <p>The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele are associated with smaller insert sizes than those supporting the reference allele. Conversely, a positive value indicates that reads supporting the alternate allele are associated with larger insert sizes than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.</p>
*
* </p>
* <h3>Statistical notes</h3>
* <p> See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document</a> for a more detailed explanation of the rank sum test. </p>
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for insert sizes (insert sizes of reads supporting REF vs. insert sizes of reads supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=8031">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
*
* <h3>Caveats</h3>
* <ul>
* <li>Uninformative reads are not used in these calculations.</li>
* <li>The insert size rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles.</li>
* <li>This is an experimental annotation and as such it is unsupported. Use at your own risk.</li>
* </ul>
*
* */

View File

@ -75,7 +75,7 @@ import java.util.Map;
* <p>This annotation can be used to evaluate confidence in a variant call and could be used as a covariate for variant recalibration (VQSR). Finding a statistically significant difference in quality either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants.</p>
*
* <h3>Statistical notes</h3>
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities of the read's mate See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities of the read's mate. See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=8031">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
*
* <h3>Caveats</h3>
* <ul><li>The mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.</li>
@ -85,6 +85,8 @@ import java.util.Map;
* <h3>Related annotations</h3>
* <ul>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_AS_MappingQualityRankSumTest.php">AS_MappingQualityRankSumTest</a></b> outputs the same rank sum test on the mapping quality of the reads themselves rather than their mates.</li>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_MappingQualityRankSumTest.php">MappingQualityRankSumTest</a></b> outputs a version of the above mapping quality ranksum test annotation that includes all alternate alleles in a single calculation.</li>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_RMSMappingQuality.php">RMSMappingQuality</a></b> gives an estimation of the overall read mapping quality supporting a variant call.</li>
* </ul>
*/
public class AS_MQMateRankSumTest extends AS_RankSumTest implements BetaTestingAnnotation {

View File

@ -76,7 +76,7 @@ import java.util.Map;
* <p>Finding a statistically significant difference in quality either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants.
*
* <h3>Statistical notes</h3>
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities (MAPQ of reads supporting REF vs. MAPQ of reads supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities (MAPQ of reads supporting REF vs. MAPQ of reads supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=8031">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
*
* <h3>Caveats</h3>
* <ul><li>The mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.</li>
@ -86,6 +86,7 @@ import java.util.Map;
* <h3>Related annotations</h3>
* <ul>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_MappingQualityRankSumTest.php">MappingQualityRankSumTest</a></b> outputs a version of this annotation that includes all alternate alleles in a single calculation.</li>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_AS_MQMateRankSumTest.php">AS_MQMateRankSumTest</a></b> outputs the same allele-specific rank sum test on the mapping quality of the reads' mates rather than the reads themselves.</li>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_RMSMappingQuality.php">RMSMappingQuality</a></b> gives an estimation of the overall read mapping quality supporting a variant call.</li>
* </ul>
*

View File

@ -72,7 +72,9 @@ import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.*;
/**
* Allele-specific implementation of rank sum test annotations
* Allele-specific implementation of rank sum test annotations.
* The RankSumTest concept is documented at https://software.broadinstitute.org/gatk/documentation/article?id=8031
*
*/
public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnnotation {
private final static Logger logger = Logger.getLogger(AS_RMSAnnotation.class);
@ -277,7 +279,8 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
final GATKSAMRecord read = el.getKey();
if ( isUsableRead(read, refLoc) ) {
final Double value = getElementForRead(read, refLoc, a);
if ( value == null )
// Bypass read if the clipping goal is not reached or the refloc is inside a spanning deletion
if ( value == null || value == INVALID_ELEMENT_FROM_READ )
continue;
if(perAlleleValues.containsKey(a.getMostLikelyAllele()))

View File

@ -74,7 +74,7 @@ import java.util.List;
* <p>This annotation can be used to evaluate confidence in a variant call and is a recommended covariate for variant recalibration (VQSR). Finding a statistically significant difference in relative position either way suggests that the sequencing process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants.</p>
*
* <h3>Statistical notes</h3>
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for site position within reads (position within reads supporting REF vs. position within reads supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for site position within reads (position within reads supporting REF vs. position within reads supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=8031">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
*
* <h3>Caveat</h3>
* <ul>
@ -102,6 +102,11 @@ public class AS_ReadPosRankSumTest extends AS_RankSumTest implements AS_Standard
if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED )
return null;
// If the offset inside a deletion, it does not lie on a read.
if ( AlignmentUtils.isInsideDeletion(read.getCigar(), offset) ) {
return INVALID_ELEMENT_FROM_READ;
}
int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), offset, false, 0, 0);
final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read );
if (readPos > numAlignedBases / 2)

View File

@ -60,15 +60,14 @@ import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompa
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.sam.AlignmentUtils;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import org.broadinstitute.gatk.utils.BaseUtils;
import java.util.*;
import java.util.stream.Collectors;
/**
* Count of A, C, G, T bases for each sample
@ -110,8 +109,9 @@ public class BaseCountsBySample extends GenotypeAnnotation {
final GenotypeBuilder gb,
final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
if ( alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty() )
gb.attribute(GATKVCFConstants.BASE_COUNTS_BY_SAMPLE_KEY, getBaseCounts(alleleLikelihoodMap, vc));
if ( alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty() ) {
gb.attribute(GATKVCFConstants.BASE_COUNTS_BY_SAMPLE_KEY, Arrays.stream(getBaseCounts(alleleLikelihoodMap, vc)).boxed().collect(Collectors.toList()));
}
}
@Override
@ -123,31 +123,15 @@ public class BaseCountsBySample extends GenotypeAnnotation {
}
/**
* Base counts given for the most likely allele
* Counts of observed bases at a genomic position e.g. {13,0,0,1} at chr1:100,000,000
*
* @param perReadAlleleLikelihoodMap for each read, the underlying alleles represented by an aligned read, and corresponding relative likelihood.
* @param vc variant context
* @return count of A, C, G, T bases
* @throws IllegalStateException if alleles in vc are not in perReadAlleleLikelihoodMap
*/
private int[] getBaseCounts(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final VariantContext vc) {
final Set<Allele> alleles = new HashSet<>(vc.getAlleles());
// make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext
if ( !perReadAlleleLikelihoodMap.getAllelesSet().containsAll(alleles) )
throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + perReadAlleleLikelihoodMap.getAllelesSet());
final int[] counts = new int[4];
for ( final Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles);
if (! a.isInformative() ) continue; // read is non-informative
for (final byte base : el.getKey().getReadBases() ){
int index = BaseUtils.simpleBaseToBaseIndex(base);
if ( index != -1 )
counts[index]++;
}
}
return counts;
return AlignmentUtils.countBasesAtPileupPosition(perReadAlleleLikelihoodMap, alleles, vc.getStart());
}
}

View File

@ -70,7 +70,7 @@ import java.util.*;
* <p>The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.</p>
*
* <h3>Statistical notes</h3>
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=8031">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
*
* <h3>Caveats</h3>
* <ul>

View File

@ -66,7 +66,7 @@ import java.util.*;
* <p>This variant-level annotation tests whether the data supporting the reference allele shows more or less base clipping (hard clips) than those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have more hard-clipped bases than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have fewer hard-clipped bases than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact.</p>
*
* <h3>Statistical notes</h3>
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test applied to base clips (number of hard-clipped bases on reads supporting REF vs. number of hard-clipped bases on reads supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test applied to base clips (number of hard-clipped bases on reads supporting REF vs. number of hard-clipped bases on reads supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=8031">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
*
* <h3>Caveat</h3>
* <p>The clipping rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles.</p>

View File

@ -139,8 +139,9 @@ public class ExcessHet extends InfoFieldAnnotation implements StandardAnnotation
double pval = exactTest(genotypeCounts);
//If the actual phredPval would be infinity we will probably still filter out just a very large number
//Since the method does not guarantee precision for any p-value smaller than 1e-16, we can return the phred scaled version
if (pval == 0) {
return Integer.MAX_VALUE;
return -10.0 * Math.log10(minNeededValue);
}
double phredPval = -10.0 * Math.log10(pval);

View File

@ -75,7 +75,7 @@ import java.util.*;
* <p>The output is a Phred-scaled p-value. The higher the output value, the more likely there is to be bias. More bias is indicative of false positive calls.</p>
*
* <h3>Statistical notes</h3>
* <p>See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of this application of Fisher's Exact Test.</p>
* <p>See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=8056">method document on statistical tests</a> for a more detailed explanation of this application of Fisher's Exact Test.</p>
*
* <h3>Caveats</h3>
* <ul>

View File

@ -76,7 +76,7 @@ import java.util.*;
* <p>This annotation estimates whether there is evidence of inbreeding in a population. The higher the score, the higher the chance that there is inbreeding.</p>
*
* <h3>Statistical notes</h3>
* <p>The calculation is a continuous generalization of the Hardy-Weinberg test for disequilibrium that works well with limited coverage per sample. The output is the F statistic from running the HW test for disequilibrium with PL values. See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of this statistical test.</p>
* <p>The calculation is a continuous generalization of the Hardy-Weinberg test for disequilibrium that works well with limited coverage per sample. The output is the F statistic from running the HW test for disequilibrium with PL values. See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=8032">method document on statistical tests</a> for a more detailed explanation of this statistical test.</p>
*
* <h3>Caveats</h3>
* <ul>

View File

@ -66,7 +66,7 @@ import java.util.List;
* <p>This variant-level annotation compares the likelihoods of reads to their best haplotype match, between reads that support the reference allele and those that support the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have lower likelihoods to their best haplotype match than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have higher likelihoods to their best haplotype match than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact.</p>
*
* <h3>Statistical notes</h3>
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for per-read likelihoods to the best haplotype match (likelihoods of reads supporting REF vs. likelihoods of reads supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for per-read likelihoods to the best haplotype match (likelihoods of reads supporting REF vs. likelihoods of reads supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=8031">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
*
* <h3>Caveat</h3>
* <p>The read position rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.</p>

View File

@ -68,7 +68,7 @@ import java.util.*;
* <p>This annotation can be used to evaluate confidence in a variant call and is a recommended covariate for variant recalibration (VQSR). Finding a statistically significant difference in quality either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants.
*
* <h3>Statistical notes</h3>
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities (MAPQ of reads supporting REF vs. MAPQ of reads supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities (MAPQ of reads supporting REF vs. MAPQ of reads supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=8031">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
*
* <h3>Caveats</h3>
* <ul><li>The mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.</li>
@ -78,6 +78,7 @@ import java.util.*;
* <h3>Related annotations</h3>
* <ul>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_AS_MappingQualityRankSumTest.php">AS_MappingQualityRankSumTest</a></b> outputs an allele-specific version of this annotation.</li>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_AS_MQMateRankSumTest.php">AS_MQMateRankSumTest</a></b> outputs the allele-specific rank sum test on the mapping quality of the reads' mates rather than the reads themselves.</li>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_RMSMappingQuality.php">RMSMappingQuality</a></b> gives an estimation of the overall read mapping quality supporting a variant call.</li>
* </ul>
*

View File

@ -71,11 +71,13 @@ import java.util.*;
/**
* Abstract root for all RankSum-based annotations
* Abstract root for all RankSum-based annotations.
* The RankSumTest concept is documented at https://software.broadinstitute.org/gatk/documentation/article?id=8031
*/
//TODO: will eventually implement ReducibleAnnotation in order to preserve accuracy for CombineGVCFs and GenotypeGVCFs -- see RMSAnnotation.java for an example of an abstract ReducibleAnnotation
public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
static final boolean DEBUG = false;
protected static double INVALID_ELEMENT_FROM_READ = Double.NEGATIVE_INFINITY;
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
@ -86,11 +88,11 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
// either stratifiedContexts or stratifiedPerReadAlleleLikelihoodMap has to be non-null
final GenotypesContext genotypes = vc.getGenotypes();
if (genotypes == null || genotypes.size() == 0)
if (genotypes == null || genotypes.isEmpty())
return null;
final ArrayList<Double> refQuals = new ArrayList<>();
final ArrayList<Double> altQuals = new ArrayList<>();
final List<Double> refQuals = new ArrayList<>();
final List<Double> altQuals = new ArrayList<>();
for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) {
@ -183,7 +185,8 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
final GATKSAMRecord read = el.getKey();
if ( isUsableRead(read, refLoc) ) {
final Double value = getElementForRead(read, refLoc, a);
if ( value == null )
// Bypass read if the clipping goal is not reached or the refloc is inside a spanning deletion
if ( value == null || value == INVALID_ELEMENT_FROM_READ )
continue;
if ( a.getMostLikelyAllele().isReference() )

View File

@ -74,7 +74,7 @@ import java.util.*;
* <p>This annotation can be used to evaluate confidence in a variant call and is a recommended covariate for variant recalibration (VQSR). Finding a statistically significant difference in relative position either way suggests that the sequencing process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants.</p>
*
* <h3>Statistical notes</h3>
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for site position within reads (position within reads supporting REF vs. position within reads supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for site position within reads (position within reads supporting REF vs. position within reads supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=8031">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
*
* <h3>Caveat</h3>
* <ul>
@ -104,6 +104,11 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED )
return null;
// If the offset inside a deletion, it does not lie on a read.
if ( AlignmentUtils.isInsideDeletion(read.getCigar(), offset) ) {
return INVALID_ELEMENT_FROM_READ;
}
int readPos = AlignmentUtils.calcAlignmentByteArrayOffset( read.getCigar(), offset, false, 0, 0 );
final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read );
if (readPos > numAlignedBases / 2)
@ -124,7 +129,7 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
@Override
protected boolean isUsableRead(final GATKSAMRecord read, final int refLoc) {
return super.isUsableRead(read, refLoc) && read.getSoftStart() + read.getCigar().getReadLength() > refLoc;
return super.isUsableRead(read, refLoc) && read.getSoftEnd() >= refLoc;
}

View File

@ -51,9 +51,9 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import cern.jet.math.Arithmetic;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.QualityUtils;
import org.apache.commons.math3.distribution.HypergeometricDistribution;
import java.util.ArrayList;
import java.util.List;
@ -83,22 +83,51 @@ public class StrandBiasTableUtils {
int[][] table = copyContingencyTable(normalizedTable);
double pCutoff = computePValue(table);
int[] rowSums = { sumRow(table, 0), sumRow(table, 1) };
int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) };
int N = rowSums[0] + rowSums[1];
int sampleSize = colSums[0];
int numberOfNonSuccesses = rowSums[1];
int numberOfSuccesses = rowSums[0];
/*
* The lowest possible number of successes you can sample is what's left of your sample size after
* drawing every non success in the urn. If the number of non successes in the urn is greater than the sample
* size then the minimum number of drawn successes is 0.
*/
int lo = Math.max(0, sampleSize - numberOfNonSuccesses);
/*
* The highest possible number of successes you can draw is either the total sample size or the number of
* successes in the urn. (Whichever is smaller)
*/
int hi = Math.min(sampleSize, numberOfSuccesses);
double pValue = pCutoff;
while (rotateTable(table)) {
double pValuePiece = computePValue(table);
if (pValuePiece <= pCutoff) {
pValue += pValuePiece;
}
/**
* If the support of the distribution is only one value, creating the HypergeometricDistribution
* doesn't make sense. There would be only one possible observation so the p-value has to be 1.
*/
if (lo == hi) {
return 1.0;
}
table = copyContingencyTable(normalizedTable);
while (unrotateTable(table)) {
double pValuePiece = computePValue(table);
/**
* For the hypergeometric distribution from which to calculate the probabilities:
* The population (N = a+b+c+d) is the sum of all four numbers in the contingency table. Then the number of
* "successes" (K = a+b) is the sum of the top row, and the sample size (n = a+c) is the sum of the first column.
*/
final HypergeometricDistribution dist = new HypergeometricDistribution(N, numberOfSuccesses, sampleSize);
if (pValuePiece <= pCutoff) {
//Then we determine a given probability with the sampled successes (k = a) from the first entry in the table.
double pCutoff = dist.probability(table[0][0]);
double pValue = 0.0;
/**
* Now cycle through all possible k's and add those whose probabilities are smaller than our observed k
* to the p-value, since this is a two-sided test
*/
for(int i = lo; i <= hi; i++){
double pValuePiece = dist.probability(i);
if(pValuePiece <= pCutoff) {
pValue += pValuePiece;
}
}
@ -190,45 +219,6 @@ public class StrandBiasTableUtils {
return c;
}
// Shift one count along the table's anti-diagonal: one unit moves out of
// cell [0][0] into [1][0], and one unit out of [1][1] into [0][1]. Both
// row sums and column sums are preserved, so the table keeps the same
// hypergeometric margins. Returns true while the shifted table is still
// valid (no negative counts), letting callers iterate until exhaustion.
protected static boolean rotateTable(int[][] t) {
    t[0][0] -= 1;
    t[1][1] -= 1;
    t[0][1] += 1;
    t[1][0] += 1;
    return t[0][0] >= 0 && t[1][1] >= 0;
}
// Inverse of rotateTable: shift one count the other way along the
// anti-diagonal — one unit moves out of [1][0] into [0][0], and one out
// of [0][1] into [1][1]. Margins (row/column totals) are unchanged.
// Returns true while the shifted table remains valid (no negative counts).
protected static boolean unrotateTable(int[][] t) {
    t[0][0] += 1;
    t[1][1] += 1;
    t[0][1] -= 1;
    t[1][0] -= 1;
    return t[0][1] >= 0 && t[1][0] >= 0;
}
/**
 * Probability of observing this exact 2x2 contingency table given its fixed
 * margins — the per-table hypergeometric term used by Fisher's exact test:
 * (r0! * r1! * c0! * c1!) / (a! * b! * c! * d! * N!).
 * Accumulated as log-factorials to avoid overflow, exponentiated at the end.
 */
protected static double computePValue(int[][] table) {
    final int[] rowSums = { sumRow(table, 0), sumRow(table, 1) };
    final int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) };
    final int N = rowSums[0] + rowSums[1];
    // calculate in log space for better precision; terms are added/subtracted
    // in the same left-to-right order as the original expression
    double logP = Arithmetic.logFactorial(rowSums[0]);
    logP += Arithmetic.logFactorial(rowSums[1]);
    logP += Arithmetic.logFactorial(colSums[0]);
    logP += Arithmetic.logFactorial(colSums[1]);
    logP -= Arithmetic.logFactorial(table[0][0]);
    logP -= Arithmetic.logFactorial(table[0][1]);
    logP -= Arithmetic.logFactorial(table[1][0]);
    logP -= Arithmetic.logFactorial(table[1][1]);
    logP -= Arithmetic.logFactorial(N);
    return Math.exp(logP);
}
private static int sumRow(int[][] table, int column) {
int sum = 0;
for (int r = 0; r < table.length; r++) {

View File

@ -241,7 +241,7 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation implements Acti
/**
Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this:
* fw rc
* fw rv
* allele1 # #
* allele2 # #
* @return a 2x2 contingency table

View File

@ -77,24 +77,22 @@ import java.util.*;
*
* <p>and its inverse:</p>
*
* <table>
* <table class='table table-striped'>
* <tr><td>&nbsp;</td><td>+ strand </td><td>- strand</td></tr>
* <tr><td>REF;</td><td>X[0][0]</td><td>X[0][1]</td></tr>
* <tr><td>ALT;</td><td>X[1][0]</td><td>X[1][1]</td></tr>
* </table>
*
* <br />
* <p>The sum R + 1/R is used to detect a difference in strand bias for REF and for ALT (the sum makes it symmetric). A high value is indicative of large difference where one entry is very small compared to the others. A scale factor of refRatio/altRatio where</p>
*
* $$ refRatio = \frac{max(X[0][0], X[0][1])}{min(X[0][0], X[0][1} $$
* $$ refRatio = \frac{min(X[0][0], X[0][1])}{max(X[0][0], X[0][1])} $$
*
* <p>and </p>
*
* $$ altRatio = \frac{max(X[1][0], X[1][1])}{min(X[1][0], X[1][1]} $$
* $$ altRatio = \frac{min(X[1][0], X[1][1])}{max(X[1][0], X[1][1])} $$
*
* <p>ensures that the annotation value is large only. </p>
*
* <p>See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of this statistical test.</p>
*
* <h3>Caveat</h3>
* <p>
* The name SOR is not entirely appropriate because the implementation was changed somewhere between the start of development and release of this annotation. Now SOR isn't really an odds ratio anymore. The goal was to separate certain cases of data without penalizing variants that occur at the ends of exons because they tend to only be covered by reads in one direction (depending on which end of the exon they're on), so if a variant has 10 ref reads in the + direction, 1 ref read in the - direction, 9 alt reads in the + direction and 2 alt reads in the - direction, it's actually not strand biased, but the FS score is pretty bad. The implementation that resulted derived in part from empirically testing some read count tables of various sizes with various ratios and deciding from there.</p>
@ -153,6 +151,7 @@ public class StrandOddsRatio extends StrandBiasTest implements StandardAnnotatio
double ratio = 0;
ratio += (augmentedTable[0][0] / augmentedTable[0][1]) * (augmentedTable[1][1] / augmentedTable[1][0]);
// TODO: repeated computation: how about ratio += 1/ratio, or ratio = ratio + 1/ratio to be explicit
ratio += (augmentedTable[0][1] / augmentedTable[0][0]) * (augmentedTable[1][0] / augmentedTable[1][1]);
final double refRatio = (Math.min(augmentedTable[0][0], augmentedTable[0][1])/Math.max(augmentedTable[0][0], augmentedTable[0][1]));

View File

@ -51,9 +51,9 @@
package org.broadinstitute.gatk.tools.walkers.bqsr;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.tribble.Feature;
import org.broadinstitute.gatk.engine.recalibration.*;
import org.broadinstitute.gatk.engine.walkers.*;
@ -194,7 +194,7 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to mask out known variant sites. Please provide a VCF file containing known sites of genetic variation.";
private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector
private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation
private ReferenceSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation
private final static byte NO_BAQ_UNCERTAINTY = (byte)'@';
/**

View File

@ -0,0 +1,350 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 ("BROAD") and the LICENSEE and is effective at the date the downloading is completed ("EFFECTIVE DATE").
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system ("PHONE-HOME") which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE'S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2016 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.tools.walkers.cancer;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.apache.commons.math3.stat.descriptive.rank.Median;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.gatk.tools.walkers.cancer.m2.MuTect2;
import org.broadinstitute.gatk.utils.QualityUtils;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.sam.ReadUtils;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Detect clustering of variants near the ends of reads
 *
 * <p> This annotation detects clustering of evidence for a somatic variant near the ends of reads.
 * To turn on the annotation and the accompanying filter (clustered_read_position), add the
 * --enable_clustered_read_position_filter flag on the command line. </p>
 *
 * <h3>Statistical notes</h3>
 * <p> ClusteredReadPosition produces four INFO field annotations. At a given somatic variant site,
 * MEDIAN_LEFT_OFFSET is the median of the number of bases from the left end of the tumor read to the
 * variant. MEDIAN_RIGHT_OFFSET is similar, but counts from the right end of the read. MAD_LEFT_OFFSET
 * and MAD_RIGHT_OFFSET measure the median absolute deviations. The median gives us the offset of a
 * representative read, while the median absolute deviation captures the spread. We filter a variant if
 * MEDIAN_LEFT_OFFSET &lt;= 10 and MAD_LEFT_OFFSET &lt;= 3, or if MEDIAN_RIGHT_OFFSET &lt;= 10 and
 * MAD_RIGHT_OFFSET &lt;= 3. </p>
 *
 * <h3>Caveat</h3>
 * <p> ClusteredReadPosition is available with MuTect2 only </p>
 *
 * <h3>Related annotations</h3>
 * <ul>
 *     <li><b><a href="https://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_gatk_tools_walkers_annotator_ReadPosRankSumTest.php">ReadPosRankSum</a></b> is a similar annotation designed for germline variants.</li>
 * </ul>
 */
public class ClusteredReadPosition extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
    private final static Logger logger = Logger.getLogger(ClusteredReadPosition.class);

    // Name of the tumor sample; populated lazily on the first call to annotate() (requires a MuTect2 walker).
    private String tumorSampleName = null;

    /** @return the INFO field keys emitted by this annotation: left/right median offsets and their MADs. */
    @Override
    public List<String> getKeyNames() {
        return Arrays.asList(
                GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY,
                GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY,
                GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY,
                GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY);
    }

    /** @return one VCF INFO header line for each key returned by {@link #getKeyNames()}. */
    @Override
    public List<VCFInfoHeaderLine> getDescriptions() {
        final List<VCFInfoHeaderLine> descriptions = new ArrayList<>();
        for (final String infoFieldKey : getKeyNames()) {
            descriptions.add(GATKVCFHeaderLines.getInfoLine(infoFieldKey));
        }
        return descriptions;
        // The following causes a cryptic class-not-found error, similar to the one in
        // computeReadPositionStats, so we keep the explicit loop above.
        // return getKeyNames().stream().map(GATKVCFHeaderLines::getInfoLine).collect(Collectors.toList());
    }

    /**
     * Annotate a biallelic somatic site with the median left/right read offsets of the variant and
     * their median absolute deviations, computed over tumor reads supporting the alternate allele.
     *
     * @param tracker                              reference metadata tracker (unused here)
     * @param walker                               must be a MuTect2 walker; used to obtain the tumor sample name
     * @param ref                                  reference context (unused here)
     * @param stratifiedContexts                   per-sample alignment contexts (unused here)
     * @param vc                                   the variant context being annotated
     * @param stratifiedPerReadAlleleLikelihoodMap per-sample read-to-allele likelihoods
     * @return a map from INFO key to value; an empty map if no tumor likelihoods were available;
     *         null for multi-allelic sites or when no usable offsets could be computed
     * @throws IllegalStateException if the walker is not MuTect2
     */
    @Override
    public Map<String, Object> annotate(final RefMetaDataTracker tracker,
                                        final AnnotatorCompatible walker,
                                        final ReferenceContext ref,
                                        final Map<String, AlignmentContext> stratifiedContexts,
                                        final VariantContext vc,
                                        final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
        // TODO: might make sense to move this code to SomaticGenoypingEngine.
        // FIXME: checking walker is mutect2 is not ideal...moving this annotation to SomaticGenoypingEngine will solve it
        // populate tumorSampleName the first time we call this method. skip afterwards.
        if (tumorSampleName == null) {
            if (walker instanceof MuTect2) {
                tumorSampleName = ((MuTect2) walker).getTumorSampleName();
            } else {
                throw new IllegalStateException("ClusteredReadPosition: walker is not MuTect2");
            }
        }

        // we skip multi-allelic sites
        if (vc.getAlternateAlleles().size() > 1) {
            return null;
        }

        final Map<String, Object> result = new HashMap<>();
        if (stratifiedPerReadAlleleLikelihoodMap != null) {
            final PerReadAlleleLikelihoodMap likelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(tumorSampleName);
            if (likelihoodMap != null && !likelihoodMap.isEmpty()) {
                final Optional<MedianStatistics> readPositionStatsOption = computeReadPositionStats(vc, likelihoodMap);
                if (!readPositionStatsOption.isPresent()) {
                    return null;
                }
                final MedianStatistics readPositionStats = readPositionStatsOption.get();
                result.put(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY, readPositionStats.getLeftMedian());
                result.put(GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY, readPositionStats.getRightMedian());
                result.put(GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY, readPositionStats.getLeftMAD());
                result.put(GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY, readPositionStats.getRightMAD());
            }
        }
        return result;
    }

    /**
     * Collect the left/right offsets of the variant within each usable alt-supporting tumor read and
     * summarize them by median and median absolute deviation.
     *
     * @param vc    the variant context; only its start position and contig are used
     * @param pralm read-to-allele likelihood map for the tumor sample
     * @return median of left and right offsets and their median absolute deviations; empty if no read
     *         yielded a usable offset. Does not return null.
     */
    private Optional<MedianStatistics> computeReadPositionStats(final VariantContext vc,
                                                                final PerReadAlleleLikelihoodMap pralm) {
        final int variantStartPosition = vc.getStart();
        final List<Integer> tumorLeftOffsets = new ArrayList<>();
        final List<Integer> tumorRightOffsets = new ArrayList<>();

        for (final Map.Entry<GATKSAMRecord, Map<Allele, Double>> readAlleleLikelihood : pralm.getLikelihoodReadMap().entrySet()) {
            final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(readAlleleLikelihood.getValue());
            final GATKSAMRecord read = readAlleleLikelihood.getKey();
            // only consider informative, well-mapped reads whose best allele is the alternate
            if (mostLikelyAllele.getMostLikelyAllele().isReference() || !mostLikelyAllele.isInformative() || !isUsableRead(read)) {
                continue;
            }

            final Pair<OptionalInt, OptionalInt> offsetPair = getVariantPositionInRead(read, variantStartPosition);
            final OptionalInt variantPositionInReadFromLeft = offsetPair.getFirst();
            final OptionalInt variantPositionInReadFromRight = offsetPair.getSecond();
            // suffices to check only the left offset because the right offset depends on it
            if (variantPositionInReadFromLeft.isPresent()) {
                tumorLeftOffsets.add(variantPositionInReadFromLeft.getAsInt());
                tumorRightOffsets.add(variantPositionInReadFromRight.getAsInt());
            }
        }

        if (tumorLeftOffsets.isEmpty() || tumorRightOffsets.isEmpty()) {
            // This condition seems to arise when the reads as aligned in the bam (as represented by PRALM) do not contain the alt read found by HaplotypeCaller
            logger.warn("At Position " + vc.getContig() + ": " + vc.getStart() + " , the left or right offset list is empty");
            return Optional.empty();
        }

        // The following (mapToDouble() in particular) causes ClusteredReadPosition to be not added to ClassMap
        // leftMedian = median.evaluate(tumorLeftOffsets.stream().mapToDouble( x -> x ).toArray());
        // rightMedian = median.evaluate(tumorRightOffsets.stream().mapToDouble( x -> x).toArray());
        // until we understand why mapToDouble() causes the above error, have to compute medians in two steps:
        // first use a for loop to manually cast integers to doubles, then call Median::evaluate
        final double[] tumorLeftOffsetsDouble = new double[tumorLeftOffsets.size()];
        final double[] tumorRightOffsetsDouble = new double[tumorRightOffsets.size()];
        for (int i = 0; i < tumorLeftOffsets.size(); i++) {
            tumorLeftOffsetsDouble[i] = (double) tumorLeftOffsets.get(i);
            tumorRightOffsetsDouble[i] = (double) tumorRightOffsets.get(i);
        }

        final Median median = new Median();
        final double leftMedian = median.evaluate(tumorLeftOffsetsDouble);
        final double rightMedian = median.evaluate(tumorRightOffsetsDouble);
        final double leftMAD = calculateMAD(tumorLeftOffsets, leftMedian);
        final double rightMAD = calculateMAD(tumorRightOffsets, rightMedian);
        return Optional.of(new MedianStatistics(leftMedian, rightMedian, leftMAD, rightMAD));
    }

    /** Immutable value holder for the left/right offset medians and their median absolute deviations. */
    private static class MedianStatistics {
        private final double leftMedian;
        private final double rightMedian;
        private final double leftMAD;
        private final double rightMAD;

        public MedianStatistics(double leftMedian, double rightMedian, double leftMAD, double rightMAD) {
            this.leftMedian = leftMedian;
            this.rightMedian = rightMedian;
            this.leftMAD = leftMAD;
            this.rightMAD = rightMAD;
        }

        public double getLeftMedian() {
            return leftMedian;
        }

        public double getRightMedian() {
            return rightMedian;
        }

        public double getLeftMAD() {
            return leftMAD;
        }

        public double getRightMAD() {
            return rightMAD;
        }
    }

    /**
     Examples below show how we compute the position of the variant with respect to the left and right end of the reads.
     Note that a variant may be SNP, deletion, or insertion, and we are counting the number of bases from the left/right end of the read to that variant.
     We first compute the left offset. Then, right offset = read length - left offset.
     This means that if there is an insertion between the either end of a read and the variant, we count the inserted bases. Conversely, we do not count the deleted bases between the end of a read and a variant.
     We count soft-clipped bases.

     example 1 : SNP
     right offset: 9 8 7 6 5 4 3 2 1 0
     ref:          _ _ _ _ _ _ _ _ _ _
     read:         _ _ _ _ x _ _ _ _ _
     left offset:  0 1 2 3 4 5 6 7 8 9
     left-offset = 4. right offset = 5.
     read.getReadLength() = 10. numReadBasesToVariant = 5.

     example 2: deletion
     We count from the left end of the read to the last non-deleted base i.e. the first deleted base is not counted.
     From the right end, we count bases to the *end* of the deletion.
     right offset: 9 8 7 6 5 4 3 2 1 0
     ref:          _ _ _ _ _ _ _ _ _ _
     read:         _ _ _ _|- - - -|_ _
     left offset:  0 1 2 3 4 5 6 7 8 9
     left-offset = 3. right-offset = 2.
     read.getReadLength() = 6. numReadBasesToVariant = 4

     example 3: insertion
     For insertions, we count from the left to the first inserted base. From the right, we count all the way to the first inserted base.
     In the future, we may modify this; it might be desirable to count from the right to the *last* inserted base.
     right offset: 9 8 7 6 5 4 3 2 1 0
     ref:          _ _ _ _ _ _ _ _
     read:         _ _ _ I I I _ _ _ _
     left offset:  0 1 2 3 4 5 6 7 8 9
     left-offset = 3. right offset = 6
     read.getReadLength() = 10. numReadBasesToVariant = 4.
     */
    /**
     * Compute the offset of the variant from the left and right ends of the read.
     * The function assumes that the read contains the variant allele.
     *
     * @param read                 the read supporting the variant allele
     * @param variantStartPosition the location of the variant in the reference
     * @return a pair of (left offset, right offset); both empty when the variant position could not be
     *         mapped onto the read (CLIPPING_GOAL_NOT_REACHED)
     */
    protected Pair<OptionalInt, OptionalInt> getVariantPositionInRead(final GATKSAMRecord read, final int variantStartPosition) {
        final Pair<Integer, Boolean> refPositionAndDeletionFlag = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), variantStartPosition, true);
        // the +1 is needed there because getReadCoordinateForReferenceCoordinate() returns the number of read bases from the left end to the variant - 1
        final int numReadBasesFromLeftEndToVariant = refPositionAndDeletionFlag.getFirst() + 1;
        // we don't take advantage of fallsInsideOrJustBeforeDeletionOrSkippedRegion flag now, but we might want to, so I will leave it here in comments.
        // boolean fallsInsideOrJustBeforeDeletionOrSkippedRegion = refPositionAndDeletionFlag.getSecond();

        if (numReadBasesFromLeftEndToVariant == ReadUtils.CLIPPING_GOAL_NOT_REACHED) {
            return new Pair<>(OptionalInt.empty(), OptionalInt.empty());
        } else {
            final int leftOffset = numReadBasesFromLeftEndToVariant - 1;
            final int rightOffset = read.getReadLength() - numReadBasesFromLeftEndToVariant;
            return new Pair<>(OptionalInt.of(leftOffset), OptionalInt.of(rightOffset));
        }
    }

    /**
     * Can the read be used in comparative tests between ref / alt bases?
     *
     * Note: the previous implementation joined the two conditions with ||, which is a tautology
     * (every mapping quality differs from at least one of the two constants), so the filter never
     * rejected a read. The conjunction below matches the documented contract.
     *
     * @param read the read to consider
     * @return false if MQ is either 0 or unavailable. true otherwise.
     */
    private boolean isUsableRead(final GATKSAMRecord read) {
        return read.getMappingQuality() != 0 && read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE;
    }

    /**
     * Compute the median absolute deviation of a list of offsets about a given median.
     *
     * @param offsets a list of integers
     * @param median  median of the list offsets.
     * @return median absolute deviation (median of the list of deviations from the median)
     */
    private double calculateMAD(final List<Integer> offsets, final double median) {
        // This code is concise but somehow leads to ClusteredReadPosition class being removed from ClassMap.
        // mapToDouble() seems to be the trigger
        // return new Median().evaluate(offsets.stream().mapToDouble(x -> Math.abs(x - median)).toArray());
        final double[] medianAbsoluteDeviations = new double[offsets.size()];
        for (int i = 0; i < offsets.size(); i++) {
            medianAbsoluteDeviations[i] = Math.abs(offsets.get(i) - median);
        }
        return new Median().evaluate(medianAbsoluteDeviations);
    }
}

View File

@ -49,92 +49,37 @@
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.tools.walkers.cancer.m2
package org.broadinstitute.gatk.tools.walkers.cancer;
import java.io.File
/**
* Created by tsato on 6/27/16.
*/
public class MedianStatistics {
private double leftMedian;
private double rightMedian;
private double leftMAD;
private double rightMAD;
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.queue.extensions.gatk._
import org.broadinstitute.gatk.queue.function.CommandLineFunction
import org.broadinstitute.gatk.queue.util.QScriptUtils
import org.broadinstitute.gatk.utils.commandline.{Input, Output}
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils.FilteredRecordMergeType
import scala.collection.mutable.ListBuffer
class create_M2_pon extends QScript {
@Argument(shortName = "bams", required = true, doc = "file of all BAM files")
var allBams: String = ""
@Argument(shortName = "o", required = true, doc = "Output prefix")
var outputPrefix: String = ""
@Argument(shortName = "minN", required = false, doc = "minimum number of sample observations to include in PON")
var minN: Int = 2
@Argument(doc="Reference fasta file to process with", fullName="reference", shortName="R", required=false)
var reference = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
@Argument(doc="Intervals file to process with", fullName="intervals", shortName="L", required=true)
var intervals : File = ""
@Argument(shortName = "sc", required = false, doc = "base scatter count")
var scatter: Int = 10
def script() {
val bams = QScriptUtils.createSeqFromFile(allBams)
val genotypesVcf = outputPrefix + ".genotypes.vcf"
val finalVcf = outputPrefix + ".vcf"
val perSampleVcfs = new ListBuffer[File]()
for (bam <- bams) {
val outputVcf = "sample-vcfs/" + bam.getName + ".vcf"
add( createM2Config(bam, outputVcf))
perSampleVcfs += outputVcf
public MedianStatistics(double leftMedian, double rightMedian, double leftMAD, double rightMAD) {
this.leftMedian = leftMedian;
this.rightMedian = rightMedian;
this.leftMAD = leftMAD;
this.rightMAD = rightMAD;
}
val cv = new CombineVariants()
cv.reference_sequence = reference
cv.memoryLimit = 2
cv.setKey = "null"
cv.minimumN = minN
cv.memoryLimit = 16
cv.filteredrecordsmergetype = FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED
cv.filteredAreUncalled = true
cv.variant = perSampleVcfs
cv.out = genotypesVcf
public double getLeftMedian() {
return leftMedian;
}
// using this instead of "sites_only" because we want to keep the AC info
val vc = new VcfCutter()
vc.inVcf = genotypesVcf
vc.outVcf = finalVcf
public double getRightMedian() {
return rightMedian;
}
add (cv, vc)
public double getLeftMAD() {
return leftMAD;
}
}
def createM2Config(bam : File, outputVcf : File): org.broadinstitute.gatk.queue.extensions.gatk.MuTect2 = {
val mutect2 = new org.broadinstitute.gatk.queue.extensions.gatk.MuTect2
mutect2.reference_sequence = reference
mutect2.artifact_detection_mode = true
mutect2.intervalsString :+= intervals
mutect2.memoryLimit = 2
mutect2.input_file = List(new TaggedFile(bam, "tumor"))
mutect2.scatterCount = scatter
mutect2.out = outputVcf
mutect2
}
public double getRightMAD() {
return rightMAD;
}
}
class VcfCutter extends CommandLineFunction {
@Input(doc = "vcf to cut") var inVcf: File = _
@Output(doc = "output vcf") var outVcf: File = _
def commandLine = "cat %s | cut -f1-8 > %s".format(inVcf, outVcf)
}

View File

@ -197,29 +197,29 @@ public class ContEst extends RodWalker<Map<String, Map<String, ContaminationStat
public SeqGenotypeMode genotypeMode = SeqGenotypeMode.HARD_THRESHOLD;
// ------------------------------------------------------------------------------------------------------------------------------------------------------
// hidden arguments
// advanced arguments
// ------------------------------------------------------------------------------------------------------------------------------------------------------
@Hidden
@Advanced
@Argument(fullName = "trim_interval", doc = "progressively trim from 0 to TRIM_FRACTION by this interval", required = false)
public double TRIM_INTERVAL = 0;
@Hidden
@Advanced
@Argument(fullName = "min_site_depth", required = false, doc = "minimum depth at a site to consider in calculation")
public int MIN_SITE_DEPTH = 0;
@Hidden
@Advanced
@Argument(fullName = "fixed_epsilon_qscore", required = false, doc = "use a constant epsilon (phred scale) for calculation")
public Byte FIXED_EPSILON = null;
@Hidden
@Advanced
@Argument(fullName = "min_genotype_depth", required = false, doc = "what minimum depth is required to call a site in seq genotype mode")
public int MIN_GENOTYPE_DEPTH_FOR_SEQ = 50;
@Hidden
@Advanced
@Argument(fullName = "min_genotype_ratio", required = false, doc = "the ratio of alt to other bases to call a site a hom non-ref variant")
public double MIN_GENOTYPE_RATIO = 0.80;
@Hidden
@Advanced
@Argument(fullName = "min_genotype_llh", required = false, doc = "the min log likelihood for UG to call a genotype")
public double MIN_UG_LOG_LIKELIHOOD = 5;
// ------------------------------------------------------------------------------------------------------------------------------------------------------

View File

@ -54,7 +54,7 @@ and then run the following Queue command
java \
-Djava.io.tmpdir=$TEMPDIR \
-jar $QUEUE_JAR \
-S $GSA_UNSTABLE_HOME/private/gatk-tools-private/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/run_M2_dream.scala \
-S $GSA_UNSTABLE_HOME/private/gatk-queue-extensions-internal/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/m2/run_M2_dream.scala \
--job_queue gsa -qsub -jobResReq virtual_free=5G -startFromScratch \
-sc 200 \
-normal $NORMAL_BAM \

View File

@ -51,11 +51,41 @@
package org.broadinstitute.gatk.tools.walkers.cancer.m2;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.AssemblyBasedCallerArgumentCollection;
import org.broadinstitute.gatk.utils.commandline.Advanced;
import org.broadinstitute.gatk.utils.commandline.Argument;
import org.broadinstitute.gatk.utils.commandline.*;
import java.util.Collections;
import java.util.List;
public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection {
/***************************************/
// Reference Metadata inputs
/***************************************/
/**
* MuTect2 has the ability to use COSMIC data in conjunction with dbSNP to adjust the threshold for evidence of a variant
* in the normal. If a variant is present in dbSNP, but not in COSMIC, then more evidence is required from the normal
* sample to prove the variant is not present in germline.
*/
@Input(fullName="cosmic", shortName = "cosmic", doc="VCF file of COSMIC sites", required=false)
public List<RodBinding<VariantContext>> cosmicRod = Collections.emptyList();
/**
* A panel of normals can be a useful (optional) input to help filter out commonly seen sequencing noise that may appear as low allele-fraction somatic variants.
*/
@Input(fullName="normal_panel", shortName = "PON", doc="VCF file of sites observed in normal", required=false)
public List<RodBinding<VariantContext>> normalPanelRod = Collections.emptyList();
/**
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
* dbSNP overlap is only used to require more evidence of absence in the normal if the variant in question has been seen before in germline.
*/
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
@Advanced
@Argument(fullName="m2debug", shortName="m2debug", doc="Print out very verbose M2 debug information", required = false)
public boolean M2_DEBUG = false;
@ -119,4 +149,36 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection
*/
@Argument(fullName = "max_alt_allele_in_normal_fraction", required = false, doc="Threshold for maximum alternate allele fraction in normal")
public double MAX_ALT_ALLELE_IN_NORMAL_FRACTION = 0.03;
/**
* This argument is used for the M1-style strand bias filter
*/
@Argument(fullName="power_constant_qscore", doc="Phred scale quality score constant to use in power calculations", required=false)
public int POWER_CONSTANT_QSCORE = 30;
@Hidden
@Argument(fullName = "strand_artifact_lod", required = false, doc = "LOD threshold for calling strand bias")
public float STRAND_ARTIFACT_LOD_THRESHOLD = 2.0f;
@Hidden
@Argument(fullName = "strand_artifact_power_threshold", required = false, doc = "power threshold for calling strand bias")
public float STRAND_ARTIFACT_POWER_THRESHOLD = 0.9f;
@Argument(fullName = "enable_strand_artifact_filter", required = false, doc = "turn on strand artifact filter")
public boolean ENABLE_STRAND_ARTIFACT_FILTER = false;
@Argument(fullName = "enable_clustered_read_position_filter", required = false, doc = "turn on clustered read position filter")
public boolean ENABLE_CLUSTERED_READ_POSITION_FILTER = false;
/**
* This argument is used for the M1-style read position filter
*/
@Argument(fullName = "pir_median_threshold", required = false, doc="threshold for clustered read position artifact median")
public double PIR_MEDIAN_THRESHOLD = 10;
/**
* This argument is used for the M1-style read position filter
*/
@Argument(fullName = "pir_mad_threshold", required = false, doc="threshold for clustered read position artifact MAD")
public double PIR_MAD_THRESHOLD = 3;
}

View File

@ -0,0 +1,68 @@
# CRSP HapMap Sensitivity Evaluation
### Current M2 Performance
(gsa-unstable 9/1/15, commit:a08903d)
| Mixture | type | sensitivity |
|---------|------|-------------|
| 5-plex |SNP|0.9691274|
| 5-plex |INDEL|0.87466127|
| 10-plex |SNP|0.97179496|
| 10-plex |INDEL|0.8888889|
| 20-plex |SNP|0.9537307|
| 20-plex |INDEL|0.83281654|
###Run Procedure
Run the script separately for each HapMap mixture bam:
inputDir=/dsde/working/mutect/laura/hapmapSensitivity/inputs/
Queue_Jar=<Queue jar of interest>
```
java -jar $Queue_Jar -S Qscript_HapMapPlex.scala \
-intervals $inputDir/agilent_5plex_intervalFiles.list \
-tumors $inputDir/agilent_5plex_bams.list \
-truthVCF $inputDir/agilent_5plex_truth_intervals.vcf \
-snpCounts $inputDir/agilent_5plex_truth_intervals.snpCounts.list \
-indelCounts $inputDir/agilent_5plex_truth_intervals.indelCounts.list \
-o <output.5plex.sensitivity.report> \
-qsub -jobQueue gsa -jobResReq virtual_free=5G -sc 50
```
The HapMap bams get run as tumors without normals because we're not interested in specificity here, so we don't need the normals to filter out noise.
###Inputs
Bam lists:
5- and 10-plex have 3 replicates, 20-plex has 9
Interval files:
If we're only interested in sensitivity, then we only need to run the caller around known true positive sites, which we take from the truth VCFs
This workaround repeats the truth filename for the number of bams -- in theory each could have a separate truth VCF, but they are the same titration mixture
SNP/INDEL counts:
This is the number of events in the truth VCFs so we can find the sensitivity across all samples
TODO: this could be generalized
###Outputs
Each run outputs its own SNP and INDEL sensitivity combined across all samples:
```
Sensitivity across all samples:
SNPs: 0.95156
INDELs: 0.7328859
```
Note that these are not filtered for depth as described in the CRSP documentation
###Resources
Truth file preparation for 5-plex:
Start with /cga/tcga-gsc/benchmark/data/crsp-truth/1kg_5plex_wgs_hc_calls.codingIndelSnp.db135.recode.vcf
Select out allele fraction greater than 20% using "vc.isBiallelic() ? AF >= 0.2 : vc.hasGenotypes() && vc.getCalledChrCount(vc.getAltAlleleWithHighestAlleleCount())/(1.0*vc.getCalledChrCount()) >= 0.2"
Similar for 10-plex source:
/cga/tcga-gsc/benchmark/data/crsp-truth/1kg_10plex_wgs_hc_calls.codingIndelSnp.db135.recode.vcf
And 20-plex source:
/cga/tcga-gsc/benchmark/data/crsp-truth/1kg_20plex_wgs_hc_calls.codingIndelSnp.db135.recode.vcf
both also using AF filter of 0.2

View File

@ -22,16 +22,18 @@ TODO: write a simple tool to do this more easily
To calculate per pair-counts, run:
```
# for SNPs
for vcf in *.bam.vcf
do
cat $vcf | grep PASS | awk '{ if ( length($4) + length($5) == 2) print $0 }' | wc -l
done
for vcf in *.vcf
do
cat $vcf | grep PASS | awk '{ if ( length($4) + length($5) == 2) print $0 }' | wc -l
done > snp-fps.txt
cat snp-fps.txt | awk '{ sum += $1 } END { print sum }'
# for INDELs
for vcf in *.bam.vcf
do
cat $vcf | grep PASS | awk '{ if ( length($4) + length($5) != 2) print $0 }' | wc -l
done
for vcf in *.vcf
do
cat $vcf | grep PASS | awk '{ if ( length($4) + length($5) != 2) print $0 }' | wc -l
done > indel-fps.txt
cat indel-fps.txt | awk '{ sum += $1 } END { print sum }'
```
### Current M1 and Indelocator Performance
@ -72,7 +74,7 @@ and then run the following Queue command
java \
-Djava.io.tmpdir=$TEMPDIR \
-jar $QUEUE_JAR \
-S $GSA_UNSTABLE_HOME/private/gatk-tools-private/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/run_M2_ICE_NN.scala \
-S $GSA_UNSTABLE_HOME/private/gatk-queue-extensions-internal/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/m2/run_M2_ICE_NN.scala \
-sc 50 \
--job_queue gsa -qsub -jobResReq virtual_free=5G -startFromScratch \
--allbams /humgen/gsa-hpprojects/NA12878Collection/bams/crsp_ice_validation//NA12878.intra.flowcell.replicate.bam_list \

View File

@ -0,0 +1,195 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 ("BROAD") and the LICENSEE and is effective at the date the downloading is completed ("EFFECTIVE DATE").
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system ("PHONE-HOME") which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE'S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2016 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.tools.walkers.cancer.m2;
import htsjdk.variant.variantcontext.Allele;
import java.util.*;
/**
* A container for allele to value mapping.
*
* Each PerAlleleCollection may hold a value for each ALT allele and, optionally, a value for the REF allele.
* For example,
*
* PerAlleleCollection<Double> alleleFractions = PerAlleleCollection.createPerAltAlleleCollection()
*
* may be a container for allele fractions for ALT alleles in a variant context. While
*
* PerAlleleCollection<Double> alleleCount = PerAlleleCollection.createPerRefAndAltAlleleCollection()
*
* may hold the allele counts for the REF allele and all ALT alleles in a variant context.
*
*
**/
/**
 * A container for allele-to-value mapping.
 *
 * Each PerAlleleCollection may hold a value for each ALT allele and, optionally, a value for the REF allele.
 * See the factory methods {@link #createPerAltAlleleCollection()} and
 * {@link #createPerRefAndAltAlleleCollection()} for the two flavors.
 *
 * Not thread-safe.
 */
public class PerAlleleCollection<X> {
    // REF allele and its value are optional: a collection created ALT-only never stores them,
    // and even a REF-and-ALT collection starts out empty until setRef() is called.
    private Optional<Allele> refAllele;
    private Optional<X> refValue;
    private final Map<Allele, X> altAlleleValueMap;
    // true iff this collection refuses to store a value for the REF allele
    private final boolean altOnly;

    private PerAlleleCollection(final boolean altOnly) {
        this.altOnly = altOnly;
        this.altAlleleValueMap = new HashMap<>();
        this.refAllele = Optional.empty();
        this.refValue = Optional.empty();
    }

    /**
     * Creates a collection that stores values for ALT alleles only.
     * Any attempt to set or get a REF value will throw.
     */
    public static <X> PerAlleleCollection<X> createPerAltAlleleCollection() {
        return new PerAlleleCollection<>(true);
    }

    /**
     * Creates a collection that stores values for the REF allele and all ALT alleles.
     */
    public static <X> PerAlleleCollection<X> createPerRefAndAltAlleleCollection() {
        return new PerAlleleCollection<>(false);
    }

    /**
     * Take an allele, REF or ALT, and update its value appropriately.
     *
     * @param allele   REF or ALT allele, not null
     * @param newValue the value to associate with {@code allele}, not null
     * @throws IllegalArgumentException if either argument is null, or a REF allele is
     *         passed to an ALT-only collection
     */
    public void set(final Allele allele, final X newValue) {
        if (allele == null || newValue == null) {
            throw new IllegalArgumentException("allele or newValue is null");
        }
        if (allele.isReference() && altOnly) {
            throw new IllegalArgumentException("Collection stores values for alternate alleles only");
        }
        if (allele.isReference()) {
            setRef(allele, newValue);
        } else {
            setAlt(allele, newValue);
        }
    }

    /**
     * Record the REF allele and its value. May only be called once per collection.
     *
     * @throws IllegalArgumentException if an argument is null, the allele is not a
     *         reference allele, or the REF value was already set
     */
    public void setRef(final Allele refAllele, final X newValue) {
        if (refAllele == null || newValue == null) {
            throw new IllegalArgumentException("refAllele or newValue is null");
        }
        if (refAllele.isNonReference()) {
            throw new IllegalArgumentException("Setting Non-reference allele as reference");
        }
        if (this.refAllele.isPresent()) {
            throw new IllegalArgumentException("Resetting the reference allele not permitted");
        }
        this.refAllele = Optional.of(refAllele);
        this.refValue = Optional.of(newValue);
    }

    /**
     * Record (or overwrite) the value for an ALT allele.
     *
     * @throws IllegalArgumentException if an argument is null or the allele is a reference allele
     */
    public void setAlt(final Allele altAllele, final X newValue) {
        if (altAllele == null || newValue == null) {
            throw new IllegalArgumentException("altAllele or newValue is null");
        }
        if (altAllele.isReference()) {
            throw new IllegalArgumentException("Setting reference allele as alt");
        }
        altAlleleValueMap.put(altAllele, newValue);
    }

    /**
     * Get the value for an allele, REF or ALT.
     *
     * @param allele the allele to look up, not null
     * @throws IllegalArgumentException if the allele is null, an unknown ALT allele,
     *         or a REF allele that does not match the stored REF allele
     * @throws IllegalStateException if a REF value is requested but none is held/set
     */
    public X get(final Allele allele) {
        if (allele == null) {
            throw new IllegalArgumentException("allele is null");
        }
        if (allele.isNonReference()) {
            return getAlt(allele);
        }
        if (refAllele.isPresent() && !allele.equals(refAllele.get())) {
            throw new IllegalArgumentException("Requested ref allele does not match the stored ref allele");
        }
        // getRef() enforces both the alt-only restriction and that the REF value has been set
        return getRef();
    }

    /**
     * Get the value for the REF allele.
     *
     * @throws IllegalStateException if this is an ALT-only collection or the REF value was never set
     */
    public X getRef() {
        if (altOnly) {
            throw new IllegalStateException("Collection does not hold the REF allele");
        }
        return refValue.orElseThrow(() -> new IllegalStateException("Collection's ref allele has not been set yet"));
    }

    /**
     * Get the value for an ALT allele.
     *
     * @throws IllegalArgumentException if the allele is null, is a reference allele,
     *         or is not present in the collection
     */
    public X getAlt(final Allele allele) {
        if (allele == null) {
            throw new IllegalArgumentException("allele is null");
        }
        if (allele.isReference()) {
            throw new IllegalArgumentException("allele is not an alt allele");
        }
        if (altAlleleValueMap.containsKey(allele)) {
            return altAlleleValueMap.get(allele);
        } else {
            throw new IllegalArgumentException("Requested alt allele is not in the collection");
        }
    }

    /**
     * @return the set of ALT alleles that have values (a live view of the internal map's key set)
     */
    public Set<Allele> getAltAlleles() {
        return altAlleleValueMap.keySet();
    }
}

View File

@ -51,16 +51,15 @@
package org.broadinstitute.gatk.tools.walkers.cancer.m2;
import com.google.java.contract.Ensures;
import htsjdk.samtools.util.StringUtil;
import htsjdk.variant.variantcontext.*;
import org.apache.commons.lang.mutable.MutableDouble;
import org.apache.commons.lang.mutable.MutableInt;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCallerGenotypingEngine;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.commandline.RodBinding;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
@ -77,13 +76,41 @@ import java.util.*;
public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
protected M2ArgumentCollection MTAC;
private final M2ArgumentCollection MTAC;
private final TumorPowerCalculator strandArtifactPowerCalculator;
private final String tumorSampleName;
private final String matchedNormalSampleName;
private final String DEBUG_READ_NAME;
//Mutect2 does not run in GGA mode
private static final List<VariantContext> NO_GIVEN_ALLELES = Collections.EMPTY_LIST;
// {@link GenotypingEngine} requires a non-null {@link AFCalculatorProvider} but this class doesn't need it. Thus we make a dummy
// {@link GenotypingEngine} requires a non-null {@link AFCalculatorProvider} but this class doesn't need it.
// Thus we make a dummy that always yields null; the field is a constant, hence static final.
private static final AFCalculatorProvider DUMMY_AF_CALCULATOR_PROVIDER = new AFCalculatorProvider() {
    @Override
    public AFCalculator getInstance(final int ploidy, final int maximumAltAlleles) { return null; }
};
private final static Logger logger = Logger.getLogger(SomaticGenotypingEngine.class);
public SomaticGenotypingEngine(final M2ArgumentCollection configuration, final SampleList samples, final GenomeLocParser genomeLocParser, final AFCalculatorProvider afCalculatorProvider, final boolean doPhysicalPhasing, final M2ArgumentCollection MTAC) {
super(configuration, samples, genomeLocParser, afCalculatorProvider, doPhysicalPhasing);
/**
 * Create a somatic (tumor/normal) genotyping engine.
 *
 * @param configuration           engine configuration passed to the superclass
 * @param samples                 samples whose reads will be genotyped
 * @param genomeLocParser         parser for creating genome locations
 * @param doPhysicalPhasing       whether to apply physical phasing to the calls
 * @param MTAC                    MuTect2 argument collection (thresholds, contamination, rods)
 * @param tumorSampleName         name of the tumor sample; must be present in the read likelihoods
 * @param matchedNormalSampleName name of the matched normal sample, or null if calling tumor-only
 * @param DEBUG_READ_NAME         read name used for targeted debug logging, or null
 */
public SomaticGenotypingEngine(final M2ArgumentCollection configuration,
final SampleList samples,
final GenomeLocParser genomeLocParser,
final boolean doPhysicalPhasing,
final M2ArgumentCollection MTAC,
final String tumorSampleName,
final String matchedNormalSampleName,
final String DEBUG_READ_NAME) {
// the AF calculator is unused by this engine, so a dummy provider satisfies the superclass
super(configuration, samples, genomeLocParser, DUMMY_AF_CALCULATOR_PROVIDER, doPhysicalPhasing);
this.MTAC = MTAC;
this.tumorSampleName = tumorSampleName;
this.matchedNormalSampleName = matchedNormalSampleName;
this.DEBUG_READ_NAME = DEBUG_READ_NAME;
// coverage related initialization
//TODO: in GATK4, use a QualityUtils method
// convert the Phred-scaled power-constant Q-score to an error probability: 10^(-Q/10)
final double errorProbability = Math.pow(10, -MTAC.POWER_CONSTANT_QSCORE/10);
strandArtifactPowerCalculator = new TumorPowerCalculator(errorProbability, MTAC.STRAND_ARTIFACT_LOD_THRESHOLD, 0.0f);
}
/**
@ -91,373 +118,392 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
* genotype likelihoods and assemble into a list of variant contexts and genomic events ready for calling
*
* The list of samples we're working with is obtained from the readLikelihoods
*
* @param haplotypes Haplotypes to assign likelihoods to
* @param readLikelihoods Map from reads->(haplotypes,likelihoods)
* @param perSampleFilteredReadList Map from sample to reads that were filtered after assembly and before calculating per-read likelihoods.
* @param ref Reference bytes at active region
* @param refLoc Corresponding active region genome location
* @param activeRegionWindow Active window
* @param genomeLocParser GenomeLocParser
* @param activeAllelesToGenotype Alleles to genotype
* @param emitReferenceConfidence whether we should add a &lt;NON_REF&gt; alternative allele to the result variation contexts.
*
* @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes
*
*/
// @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
@Ensures("result != null")
// TODO - can this be refactored? this is hard to follow!
public HaplotypeCallerGenotypingEngine.CalledHaplotypes callMutations (
final List<Haplotype> haplotypes,
//final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
final ReadLikelihoods<Haplotype> readLikelihoods,
final Map<String, Integer> originalNormalReadQualities,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final byte[] ref,
final GenomeLoc refLoc,
final GenomeLoc activeRegionWindow,
final GenomeLocParser genomeLocParser,
final RefMetaDataTracker tracker,
final List<VariantContext> activeAllelesToGenotype,
final boolean emitReferenceConfidence,
final String tumorSampleName,
final String matchedNormalSampleName,
final RodBinding<VariantContext> dbsnpRod,
final List<RodBinding<VariantContext>> cosmicRod,
final String DEBUG_READ_NAME
) {
// sanity check input arguments
if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes);
public CalledHaplotypes callMutations (
final ReadLikelihoods<Haplotype> readLikelihoods,
final Map<String, Integer> originalNormalReadQualities,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final byte[] ref,
final GenomeLoc refLoc,
final GenomeLoc activeRegionWindow,
final RefMetaDataTracker tracker) {
//TODO: in GATK4 use Utils.nonNull
if (readLikelihoods == null || readLikelihoods.sampleCount() == 0) throw new IllegalArgumentException("readLikelihoods input should be non-empty and non-null, got "+readLikelihoods);
if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref);
if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc);
if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow);
if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype);
if (genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser);
final List<Haplotype> haplotypes = readLikelihoods.alleles();
// Somatic Tumor/Normal Sample Handling
verifySamplePresence(tumorSampleName, readLikelihoods.samples());
final boolean hasNormal = (matchedNormalSampleName != null);
if (!readLikelihoods.samples().contains(tumorSampleName)) {
throw new IllegalArgumentException("readLikelihoods does not contain the tumor sample " + tumorSampleName);
}
final boolean hasNormal = matchedNormalSampleName != null;
// update the haplotypes so we're ready to call, getting the ordered list of positions on the reference
// that carry events among the haplotypes
final TreeSet<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, readLikelihoods, ref, refLoc, activeAllelesToGenotype);
final TreeSet<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, readLikelihoods, ref, refLoc, NO_GIVEN_ALLELES);
// Walk along each position in the key set and create each event to be outputted
final Set<Haplotype> calledHaplotypes = new HashSet<>();
final List<VariantContext> returnCalls = new ArrayList<>();
for( final int loc : startPosKeySet ) {
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype);
if( eventsAtThisLoc.isEmpty() ) { continue; }
// Create the event mapping object which maps the original haplotype events to the events present at just this locus
final Map<Event, List<Haplotype>> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes);
// Sanity check the priority list for mistakes
final List<String> priorityList = makePriorityList(eventsAtThisLoc);
// Merge the event to find a common reference representation
VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList,
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
if( mergedVC == null ) { continue; }
final int numAlts = mergedVC.getNAlleles()-1;
// final VariantContextBuilder vcb = new VariantContextBuilder(mergedVC);
final GenotypeLikelihoodsCalculationModel.Model calculationModel = mergedVC.isSNP()
? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL;
if (emitReferenceConfidence)
mergedVC = addNonRefSymbolicAllele(mergedVC);
final Map<VariantContext, Allele> mergeMap = new LinkedHashMap<>();
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
for(int iii = 0; iii < eventsAtThisLoc.size(); iii++) {
mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function
}
final Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
if( configuration.DEBUG && logger != null ) {
if (logger != null) logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
}
ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION));
//LDG: do we want to do this before or after pulling out overlapping reads?
if (MTAC.isSampleContaminationPresent())
readAlleleLikelihoods.contaminationDownsampling(MTAC.getSampleContamination());
//if (!mergedVC.isBiallelic()) {
// logger.info("[UNSUPPORTED] Detected non-Biallelic VC" + mergedVC.toString());
// continue;
//}
// TODO: once tests are passing, refactor to use the new data structure (not the deprecated one)
// handle overlapping fragments
// TODO: CONFIRM WITH GSA IF IT IS OK TO REMOVE READS FROM THE PRALM (should be... they do it in filterPoorlyModeledReads!)
PerReadAlleleLikelihoodMap tumorPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(tumorSampleName));
filterPRALMForOverlappingReads(tumorPRALM, mergedVC.getReference(), loc, false);
MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present after filtering for overlapping reads");
// extend to multiple samples
//handle existence of secondary alts
double[] afs = estimateAlleleFraction(mergedVC, tumorPRALM);
if( configuration.DEBUG && logger != null ) {
String output = "Calculated allelic fraction at " + loc + " = ";
for (int i = 0; i<afs.length; i++)
output = output + afs[i];
if (logger != null) logger.info(output);
}
double[] tumorGLs = getVariableGenotypeLikelihoods(mergedVC, tumorPRALM, originalNormalReadQualities, afs);
PerReadAlleleLikelihoodMap normalPRALM = null;
double[] normalGLs = null;
if (hasNormal) {
normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName));
filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true);
MuTect2.logReadInfo(DEBUG_READ_NAME, normalPRALM.getLikelihoodReadMap().keySet(), "Present after filtering for overlapping reads");
double[] diploidAFarray = new double[numAlts];
Arrays.fill(diploidAFarray, 0.5d);
normalGLs = getVariableGenotypeLikelihoods(mergedVC, normalPRALM, originalNormalReadQualities, diploidAFarray);
}
double INIT_NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE;
double NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE;
final int REF_INDEX = 0;
double[] tumorLods = new double[numAlts];
for (int altInd = 0; altInd < numAlts; altInd++) {
tumorLods[altInd] = tumorGLs[altInd+1] - tumorGLs[REF_INDEX];
}
if (configuration.DEBUG && logger != null) {
String output = "Tumor LOD at " + loc + " = ";
for (int i = 0; i<tumorLods.length; i++)
output = output + tumorLods[i];
if (logger != null) logger.info(output);
}
double[] normalLods = new double[numAlts];
if (hasNormal) {
GenomeLoc eventGenomeLoc = genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc);
Collection<VariantContext> cosmicVC = tracker.getValues(cosmicRod, eventGenomeLoc);
Collection<VariantContext> dbsnpVC = tracker.getValues(dbsnpRod, eventGenomeLoc);
// remove the effect of cosmic from dbSNP
boolean germlineAtRisk = (!dbsnpVC.isEmpty() && cosmicVC.isEmpty());
INIT_NORMAL_LOD_THRESHOLD = MTAC.INITIAL_NORMAL_LOD_THRESHOLD; //only set this if this job has a normal
NORMAL_LOD_THRESHOLD = (germlineAtRisk)?MTAC.NORMAL_DBSNP_LOD_THRESHOLD:MTAC.NORMAL_LOD_THRESHOLD;
for (int altInd = 0; altInd < numAlts; altInd++)
normalLods[altInd] = normalGLs[REF_INDEX] - normalGLs[altInd+1];
}
//reconcile multiple alts, if applicable
int numPassingAlts = 0;
int lodInd = 0;
for (int altInd = 0; altInd < numAlts; altInd++) {
if (tumorLods[altInd] >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD && normalLods[altInd] >= INIT_NORMAL_LOD_THRESHOLD) {
numPassingAlts++;
lodInd = altInd;
}
}
final double tumorLod = tumorLods[lodInd];
final double normalLod = normalLods[lodInd];
VariantContext call = null;
if (tumorLod >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD && normalLod >= INIT_NORMAL_LOD_THRESHOLD) {
VariantContextBuilder callVcb = new VariantContextBuilder(mergedVC);
if (normalLod < NORMAL_LOD_THRESHOLD) {
callVcb.filter(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME);
}
int haplotypeCount = alleleMapper.get(mergedVC.getAlternateAllele(lodInd)).size();
callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount);
callVcb.attribute(GATKVCFConstants.TUMOR_LOD_KEY, tumorLod);
callVcb.attribute(GATKVCFConstants.NORMAL_LOD_KEY, normalLod);
if (normalLod < NORMAL_LOD_THRESHOLD) {
callVcb.filter(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME);
}
if (numPassingAlts > 1) {
callVcb.filter(GATKVCFConstants.TRIALLELIC_SITE_FILTER_NAME);
}
List<Allele> tumorAlleles = new ArrayList<>();
tumorAlleles.add(mergedVC.getReference());
tumorAlleles.add(mergedVC.getAlternateAllele(lodInd));
GenotypeBuilder tumorGenotype =
new GenotypeBuilder(tumorSampleName, tumorAlleles);
tumorGenotype.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, afs[lodInd]);
// how should we set the genotype properly here?
List<Allele> refAlleles = new ArrayList<>();
refAlleles.add(mergedVC.getReference());
refAlleles.add(mergedVC.getReference());
List<Genotype> genotypes = new ArrayList<>();
genotypes.add(tumorGenotype.make());
// if we are calling with a normal, add that sample in
if (hasNormal) {
int[] normalCounts = getRefAltCount(mergedVC, normalPRALM);
int[] normalAD = new int[2];
normalAD[REF_INDEX] = normalCounts[REF_INDEX];
normalAD[1] = normalCounts[lodInd+1];
double normalF = (double) normalAD[1] / ((double) normalAD[REF_INDEX] + (double) normalAD[1]);
GenotypeBuilder normalGenotype =
new GenotypeBuilder(matchedNormalSampleName, refAlleles).AD(normalAD);
normalGenotype.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, normalF);
genotypes.add(normalGenotype.make());
}
//only use alleles found in the tumor (
call = new VariantContextBuilder(callVcb).alleles(tumorAlleles).genotypes(genotypes).make();
}
// how should we be making use of _perSampleFilteredReadList_?
if( call != null ) {
readAlleleLikelihoods = prepareReadAlleleLikelihoodsForAnnotation(readLikelihoods, perSampleFilteredReadList,
genomeLocParser, emitReferenceConfidence, alleleMapper, readAlleleLikelihoods, call);
ReferenceContext referenceContext = new ReferenceContext(genomeLocParser, genomeLocParser.createGenomeLoc(mergedVC.getChr(), mergedVC.getStart(), mergedVC.getEnd()), refLoc, ref);
VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(referenceContext, tracker, readAlleleLikelihoods, call, false);
if( call.getAlleles().size() != mergedVC.getAlleles().size() )
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
// maintain the set of all called haplotypes
for ( final Allele calledAllele : call.getAlleles() ) {
final List<Haplotype> haplotypeList = alleleMapper.get(calledAllele);
if (haplotypeList == null) continue;
calledHaplotypes.addAll(haplotypeList);
}
returnCalls.add( annotatedCall );
}
if( loc < activeRegionWindow.getStart() || loc > activeRegionWindow.getStop() ) {
continue;
}
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, NO_GIVEN_ALLELES);
if( eventsAtThisLoc.isEmpty() ) { continue; }
// Create the event mapping object which maps the original haplotype events to the events present at just this locus
final Map<Event, List<Haplotype>> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes);
// TODO: priorityList is not sorted by priority, might as well just use eventsAtThisLoc.map(VariantContext::getSource)
final List<String> priorityList = makePriorityList(eventsAtThisLoc);
// merge variant contexts from multiple haplotypes into one variant context
// TODO: we should use haplotypes if possible, but that may have to wait for GATK4
VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList,
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
if( mergedVC == null ) { continue; }
// TODO: this variable needs a descriptive name
final Map<VariantContext, Allele> mergeMap = new LinkedHashMap<>();
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
for(int i = 0; i < eventsAtThisLoc.size(); i++) {
// TODO: as noted below, this operation seems dangerous. Understand how things can go wrong.
mergeMap.put(eventsAtThisLoc.get(i), mergedVC.getAlternateAllele(i)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function
}
/** TODO: the code in the for loop up to here needs refactor. The goal, as far as I can tell, is to create two things: alleleMapper and mergedVC
* alleleMapper maps alleles to haplotypes, and we need this to create readAlleleLikelihoods.
* To make alleleMapper we make mergeMap (of type VC -> Allele) and eventMapper (of type Event -> List(Haplotypes), where Event is essentially Variant Context)
* If we just want a map of Alleles to Haplotypes, we should be able to do so directly; no need for intermediate maps, which just complicates the code.
**/
final Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
// converting ReadLikelihoods<Haplotype> to ReadLikeliHoods<Allele>
ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION));
//LDG: do we want to do this before or after pulling out overlapping reads?
if (MTAC.isSampleContaminationPresent()) {
readAlleleLikelihoods.contaminationDownsampling(MTAC.getSampleContamination());
}
// TODO: this is a good break point for a new method
// TODO: replace PRALM with ReadLikelihoods
final PerReadAlleleLikelihoodMap tumorPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(tumorSampleName));
filterPRALMForOverlappingReads(tumorPRALM, mergedVC.getReference(), loc, false);
MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in Tumor PRALM after filtering for overlapping reads");
// extend to multiple samples
// compute tumor LOD for each alternate allele
// TODO: somewhere we have to ensure that the all the alleles in the variant context is in alleleFractions passed to getHetGenotypeLogLikelihoods. getHetGenotypeLogLikelihoods will not check that for you
final PerAlleleCollection<Double> altAlleleFractions = estimateAlleleFraction(mergedVC, tumorPRALM, false);
final PerAlleleCollection<Double> tumorHetGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, tumorPRALM, originalNormalReadQualities, altAlleleFractions);
final PerAlleleCollection<Double> tumorLods = PerAlleleCollection.createPerAltAlleleCollection();
for (final Allele altAllele : mergedVC.getAlternateAlleles()){
tumorLods.set(altAllele, tumorHetGenotypeLLs.get(altAllele) - tumorHetGenotypeLLs.getRef());
}
// TODO: another good breakpoint e.g. compute normal LOD/set thresholds
// TODO: anything related to normal should be encapsulated in Optional
// A variant candidate whose normal LOD is below this threshold will be filtered as 'germline_risk'
// This is a more stringent threshold than normalLodThresholdForVCF
double normalLodFilterThreshold = -Double.MAX_VALUE;
PerReadAlleleLikelihoodMap normalPRALM = null;
final PerAlleleCollection<Double> normalLods = PerAlleleCollection.createPerAltAlleleCollection();
// if normal bam is available, compute normal LOD
// TODO: this if statement should be a standalone method for computing normal LOD
// TODO: then we can do something like normalLodThreshold = hasNormal ? thisMethod() : Optional.empty()
if (hasNormal) {
normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName));
filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true);
MuTect2.logReadInfo(DEBUG_READ_NAME, normalPRALM.getLikelihoodReadMap().keySet(), "Present after in Nomral PRALM filtering for overlapping reads");
final GenomeLoc eventGenomeLoc = genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc);
final Collection<VariantContext> cosmicVC = tracker.getValues(MTAC.cosmicRod, eventGenomeLoc);
final Collection<VariantContext> dbsnpVC = tracker.getValues(MTAC.dbsnp.dbsnp, eventGenomeLoc);
final boolean germlineAtRisk = !dbsnpVC.isEmpty() && cosmicVC.isEmpty();
normalLodFilterThreshold = germlineAtRisk ? MTAC.NORMAL_DBSNP_LOD_THRESHOLD : MTAC.NORMAL_LOD_THRESHOLD;
// compute normal LOD = LL(X|REF)/LL(X|ALT) where REF is the diploid HET with AF = 0.5
// note normal LOD is REF over ALT, the reciprocal of the tumor LOD
final PerAlleleCollection<Double> diploidHetAlleleFractions = PerAlleleCollection.createPerRefAndAltAlleleCollection();
for (final Allele allele : mergedVC.getAlternateAlleles()){
diploidHetAlleleFractions.setAlt(allele, 0.5);
}
final PerAlleleCollection<Double> normalGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, normalPRALM, originalNormalReadQualities, diploidHetAlleleFractions);
for (final Allele altAllele : mergedVC.getAlternateAlleles()){
normalLods.setAlt(altAllele, normalGenotypeLLs.getRef() - normalGenotypeLLs.getAlt(altAllele));
}
}
int numPassingAlts = 0;
final Set<Allele> allelesThatPassThreshold = new HashSet<>();
Allele alleleWithHighestTumorLOD = null;
for (final Allele altAllele : mergedVC.getAlternateAlleles()) {
final boolean passesTumorLodThreshold = tumorLods.getAlt(altAllele) >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD;
final boolean passesNormalLodThreshold = hasNormal ? normalLods.getAlt(altAllele) >= MTAC.INITIAL_NORMAL_LOD_THRESHOLD : true;
if (passesTumorLodThreshold && passesNormalLodThreshold) {
numPassingAlts++;
allelesThatPassThreshold.add(altAllele);
if (alleleWithHighestTumorLOD == null || tumorLods.getAlt(altAllele) > tumorLods.getAlt(alleleWithHighestTumorLOD)){
alleleWithHighestTumorLOD = altAllele;
}
}
}
if (numPassingAlts == 0) {
continue;
}
final VariantContextBuilder callVcb = new VariantContextBuilder(mergedVC);
final int haplotypeCount = alleleMapper.get(alleleWithHighestTumorLOD).size();
callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount);
callVcb.attribute(GATKVCFConstants.TUMOR_LOD_KEY, tumorLods.getAlt(alleleWithHighestTumorLOD));
if (hasNormal) {
callVcb.attribute(GATKVCFConstants.NORMAL_LOD_KEY, normalLods.getAlt(alleleWithHighestTumorLOD));
if (normalLods.getAlt(alleleWithHighestTumorLOD) < normalLodFilterThreshold) {
callVcb.filter(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME);
}
}
// TODO: this should be a separate method
// TODO: move code to MuTect2::calculateFilters()
if (MTAC.ENABLE_STRAND_ARTIFACT_FILTER && numPassingAlts == 1) {
final PerReadAlleleLikelihoodMap forwardPRALM = new PerReadAlleleLikelihoodMap();
final PerReadAlleleLikelihoodMap reversePRALM = new PerReadAlleleLikelihoodMap();
splitPRALMintoForwardAndReverseReads(tumorPRALM, forwardPRALM, reversePRALM);
MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in tumor PRALM after PRALM is split");
MuTect2.logReadInfo(DEBUG_READ_NAME, forwardPRALM.getLikelihoodReadMap().keySet(), "Present in forward PRALM after PRALM is split");
MuTect2.logReadInfo(DEBUG_READ_NAME, reversePRALM.getLikelihoodReadMap().keySet(), "Present in reverse PRALM after PRALM is split");
// TODO: build a new type for probability, likelihood, and log_likelihood. e.g. f_fwd :: probability[], tumorGLs_fwd :: likelihood[]
// TODO: don't want to call getHetGenotypeLogLikelihoods on more than one alternate alelle. May need to overload it to take a scalar f_fwd.
final PerAlleleCollection<Double> alleleFractionsForward = estimateAlleleFraction(mergedVC, forwardPRALM, true);
final PerAlleleCollection<Double> tumorGenotypeLLForward = getHetGenotypeLogLikelihoods(mergedVC, forwardPRALM, originalNormalReadQualities, alleleFractionsForward);
final PerAlleleCollection<Double> alleleFractionsReverse = estimateAlleleFraction(mergedVC, reversePRALM, true);
final PerAlleleCollection<Double> tumorGenotypeLLReverse = getHetGenotypeLogLikelihoods(mergedVC, reversePRALM, originalNormalReadQualities, alleleFractionsReverse);
final double tumorLod_fwd = tumorGenotypeLLForward.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLForward.getRef();
final double tumorLod_rev = tumorGenotypeLLReverse.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLReverse.getRef();
// Note that we use the observed combined (+ and -) allele fraction for power calculation in either direction
final double tumorSBpower_fwd = strandArtifactPowerCalculator.cachedPowerCalculation(forwardPRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD));
final double tumorSBpower_rev = strandArtifactPowerCalculator.cachedPowerCalculation(reversePRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD));
callVcb.attribute(GATKVCFConstants.TLOD_FWD_KEY, tumorLod_fwd);
callVcb.attribute(GATKVCFConstants.TLOD_REV_KEY, tumorLod_rev);
callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_FWD_KEY, tumorSBpower_fwd);
callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_REV_KEY, tumorSBpower_rev);
if ((tumorSBpower_fwd > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_fwd < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) ||
(tumorSBpower_rev > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_rev < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD))
callVcb.filter(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME);
}
// TODO: this probably belongs in M2::calculateFilters()
if (numPassingAlts > 1) {
callVcb.filter(GATKVCFConstants.TRIALLELIC_SITE_FILTER_NAME);
}
// build genotypes TODO: this part needs review and refactor
final List<Allele> tumorAlleles = Arrays.asList(mergedVC.getReference(), alleleWithHighestTumorLOD);
// TODO: estimateAlleleFraction should not repeat counting allele depths
final PerAlleleCollection<Integer> tumorAlleleDepths = getRefAltCount(mergedVC, tumorPRALM, false);
final int tumorRefAlleleDepth = tumorAlleleDepths.getRef();
final int tumorAltAlleleDepth = tumorAlleleDepths.getAlt(alleleWithHighestTumorLOD);
final Genotype tumorGenotype = new GenotypeBuilder(tumorSampleName, tumorAlleles)
.AD(new int[] { tumorRefAlleleDepth, tumorAltAlleleDepth })
.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, altAlleleFractions.getAlt(alleleWithHighestTumorLOD))
.make();
final List<Genotype> genotypes = new ArrayList<>();
genotypes.add(tumorGenotype);
// We assume that the genotype in the normal is 0/0
// TODO: is normal always homozygous reference?
final List<Allele> homRefAllelesforNormalGenotype = Collections.nCopies(2, mergedVC.getReference());
// if we are calling with a normal, build the genotype for the sample to appear in vcf
if (hasNormal) {
final PerAlleleCollection<Integer> normalAlleleDepths = getRefAltCount(mergedVC, normalPRALM, false);
final int normalRefAlleleDepth = normalAlleleDepths.getRef();
final int normalAltAlleleDepth = normalAlleleDepths.getAlt(alleleWithHighestTumorLOD);
final double normalAlleleFraction = (double) normalAltAlleleDepth / ( normalRefAlleleDepth + normalAltAlleleDepth);
final Genotype normalGenotype = new GenotypeBuilder(matchedNormalSampleName, homRefAllelesforNormalGenotype)
.AD(new int[] { normalRefAlleleDepth, normalAltAlleleDepth })
.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, normalAlleleFraction)
.make();
genotypes.add(normalGenotype);
}
final VariantContext call = new VariantContextBuilder(callVcb).alleles(tumorAlleles).genotypes(genotypes).make();
// how should we be making use of _perSampleFilteredReadList_?
readAlleleLikelihoods = prepareReadAlleleLikelihoodsForAnnotation(readLikelihoods, perSampleFilteredReadList,
genomeLocParser, false, alleleMapper, readAlleleLikelihoods, call);
final ReferenceContext referenceContext = new ReferenceContext(genomeLocParser, genomeLocParser.createGenomeLoc(mergedVC.getChr(), mergedVC.getStart(), mergedVC.getEnd()), refLoc, ref);
VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(referenceContext, tracker, readAlleleLikelihoods, call, false);
if( call.getAlleles().size() != mergedVC.getAlleles().size() )
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
// maintain the set of all called haplotypes
call.getAlleles().stream().map(alleleMapper::get).filter(Objects::nonNull).forEach(calledHaplotypes::addAll);
returnCalls.add( annotatedCall );
}
// TODO: understand effect of enabling this for somatic calling...
final List<VariantContext> phasedCalls = doPhysicalPhasing ? phaseCalls(returnCalls, calledHaplotypes) : returnCalls;
return new CalledHaplotypes(phasedCalls, calledHaplotypes);
//return new CalledHaplotypes(returnCalls, calledHaplotypes);
final List<VariantContext> outputCalls = doPhysicalPhasing ? phaseCalls(returnCalls, calledHaplotypes) : returnCalls;
return new CalledHaplotypes(outputCalls, calledHaplotypes);
}
/**
 * Checks that the given sample name appears in the list of known samples.
 *
 * @param sampleName sample to look for (e.g. the tumor or matched-normal sample name)
 * @param samples    all sample names available in the input
 * @throws IllegalArgumentException if the sample is not present
 */
private void verifySamplePresence(String sampleName, List<String> samples) {
    if (!samples.contains(sampleName)) {
        // BUGFIX: the original message was missing the space before "in", producing
        // e.g. "Unable to find sample name NA12878in sample list of ..."
        throw new IllegalArgumentException("Unable to find sample name " + sampleName + " in sample list of " + StringUtil.join(",", samples));
    }
}
/** Calculate the genotype likelihoods for variable allele fraction
/** Calculate the likelihoods of hom ref and each het genotype of the form ref/alt
*
* @param mergedVC input VC
* @param tumorPRALM read likelihoods
* @param originalNormalMQs original MQs, before boosting normals to avoid qual capping
* @param afs allele fraction(s) for alternate allele(s)
* @param alleleFractions allele fraction(s) for alternate allele(s)
*
* @return genotype likelihoods for homRef (index 0) and het for each alternate allele
*/
private double[] getVariableGenotypeLikelihoods(final VariantContext mergedVC, final PerReadAlleleLikelihoodMap tumorPRALM,
final Map<String, Integer> originalNormalMQs, double[] afs) {
double[] genotypeLikelihoods = new double[mergedVC.getNAlleles()];
for(Map.Entry<GATKSAMRecord,Map<Allele, Double>> e : tumorPRALM.getLikelihoodReadMap().entrySet()) {
Map<Allele, Double> m = e.getValue();
Double refLL = m.get(mergedVC.getReference());
if (originalNormalMQs.get(e.getKey().getReadName()) != 0) {
genotypeLikelihoods[0] += Math.log10(Math.pow(10, refLL));
for (int altInd = 0; altInd < mergedVC.getNAlleles()-1; altInd++) {
Double altLL = m.get(mergedVC.getAlternateAllele(altInd));
genotypeLikelihoods[altInd+1] += Math.log10(Math.pow(10, refLL) * (1 - afs[altInd]) + Math.pow(10, altLL) * afs[altInd]);
}
}
* @return genotype likelihoods for homRef and het for each alternate allele
*/
private PerAlleleCollection<Double> getHetGenotypeLogLikelihoods(final VariantContext mergedVC,
final PerReadAlleleLikelihoodMap tumorPRALM,
final Map<String, Integer> originalNormalMQs,
final PerAlleleCollection<Double> alleleFractions) {
// make sure that alleles in alleleFraction are a subset of alleles in the variant context
if (! mergedVC.getAlternateAlleles().containsAll(alleleFractions.getAltAlleles()) ){
throw new IllegalArgumentException("alleleFractions has alleles that are not in the variant context");
}
return genotypeLikelihoods;
final PerAlleleCollection<MutableDouble> genotypeLogLikelihoods = PerAlleleCollection.createPerRefAndAltAlleleCollection();
mergedVC.getAlleles().forEach(a -> genotypeLogLikelihoods.set(a, new MutableDouble(0)));
final Allele refAllele = mergedVC.getReference();
for(Map.Entry<GATKSAMRecord,Map<Allele, Double>> readAlleleLikelihoodMap : tumorPRALM.getLikelihoodReadMap().entrySet()) {
final Map<Allele, Double> alleleLikelihoodMap = readAlleleLikelihoodMap.getValue();
if (originalNormalMQs.get(readAlleleLikelihoodMap.getKey().getReadName()) == 0) {
continue;
}
final double readRefLogLikelihood = alleleLikelihoodMap.get(refAllele);
genotypeLogLikelihoods.getRef().add(readRefLogLikelihood);
for (final Allele altAllele : alleleFractions.getAltAlleles()) {
final double readAltLogLikelihood = alleleLikelihoodMap.get(altAllele);
final double adjustedReadAltLL = Math.log10(
Math.pow(10, readRefLogLikelihood) * (1 - alleleFractions.getAlt(altAllele)) +
Math.pow(10, readAltLogLikelihood) * alleleFractions.getAlt(altAllele)
);
genotypeLogLikelihoods.get(altAllele).add(adjustedReadAltLL);
}
}
final PerAlleleCollection<Double> result = PerAlleleCollection.createPerRefAndAltAlleleCollection();
mergedVC.getAlleles().stream().forEach(a -> result.set(a,genotypeLogLikelihoods.get(a).toDouble()));
return result;
}
/**
* Find the allele fractions for each alternate allele
*
* @param vc input VC, for alleles
* @param map read likelihoods
* @param pralm read likelihoods
* @return estimated AF for each alt
*/
// FIXME: calculate using the uncertainty rather than this cheap approach
private double[] estimateAlleleFraction(VariantContext vc, PerReadAlleleLikelihoodMap map) {
int[] counts = getRefAltCount(vc, map);
int numAlts = vc.getNAlleles()-1;
double[] afs = new double[numAlts];
int refCount = counts[0];
int altCount;
private PerAlleleCollection<Double> estimateAlleleFraction(final VariantContext vc,
final PerReadAlleleLikelihoodMap pralm,
final boolean oneStrandOnly) {
final PerAlleleCollection<Integer> alleleCounts = getRefAltCount(vc, pralm, oneStrandOnly);
final PerAlleleCollection<Double> alleleFractions = PerAlleleCollection.createPerAltAlleleCollection();
for(int altInd = 0; altInd < numAlts; altInd++) {
altCount = counts[altInd+1];
afs[altInd] = (double) altCount / ((double) refCount + (double) altCount);
//logger.info("Counted " + refCount + " ref and " + altCount + " alt " );
final int refCount = alleleCounts.getRef();
for ( final Allele altAllele : vc.getAlternateAlleles() ) {
final int altCount = alleleCounts.getAlt(altAllele);
double alleleFraction = (double) altCount / (refCount + altCount);
// weird case, but I've seen it happen in one strand cases
if (refCount == 0 && altCount == refCount ) {
alleleFraction = 0;
}
alleleFractions.setAlt(altAllele, alleleFraction);
// logger.info("Counted " + refCount + " ref and " + altCount + " alt " );
}
return afs;
return alleleFractions;
}
/**
* Evaluate the most likely allele for each read, if it is in fact informative
* Go through the PRALM and tally the most likely allele in each read. Only count informative reads.
*
* @param mergedVC input VC, for alleles
* @param afMap read likelihoods
* @param vc input VC, for alleles
* @param pralm read likelihoods
* @return an array giving the read counts for the ref and each alt allele
*/
// TODO: ensure there are only two alleles in the VC
private int[] getRefAltCount(VariantContext mergedVC, PerReadAlleleLikelihoodMap afMap) {
int counts[] = new int[mergedVC.getNAlleles()];
int REF = 0;
private PerAlleleCollection<Integer> getRefAltCount(final VariantContext vc,
final PerReadAlleleLikelihoodMap pralm,
final boolean oneStrandOnly) {
// Check that the alleles in Variant Context are in PRALM
// Skip the check for strand-conscious PRALM; + reads may not have alleles in - reads, for example.
final Set<Allele> vcAlleles = new HashSet<>(vc.getAlleles());
if ( ! oneStrandOnly && ! pralm.getAllelesSet().containsAll( vcAlleles ) ) {
StringBuilder message = new StringBuilder();
message.append("At Locus chr" + vc.getContig() + ":" + vc.getStart() + ", we detected that variant context had alleles that not in PRALM. ");
message.append("VC alleles = " + vcAlleles + ", PRALM alleles = " + pralm.getAllelesSet());
logger.warn(message);
}
for(Map.Entry<GATKSAMRecord,Map<Allele, Double>> e : afMap.getLikelihoodReadMap().entrySet()) {
Map<Allele, Double> m = e.getValue();
Double rl = m.get(mergedVC.getReference());
for(int altInd=0; altInd<mergedVC.getNAlleles()-1;altInd++) {
Double al = m.get(mergedVC.getAlternateAllele(altInd));
logger.debug("At " + mergedVC.getStart() + ", for read " + e.getKey().getReadName() + ", al = " + al + ", rl = " + rl + ", diff = " + (al - rl));
if (arePairHMMLikelihoodsInformative(rl, al)) {
if (rl > al) {
counts[REF]++;
} else {
counts[altInd+1]++;
logM2Debug("Using " + e.getKey().toString() + " towards alternate allele count");
}
}
final PerAlleleCollection<MutableInt> alleleCounts = PerAlleleCollection.createPerRefAndAltAlleleCollection();
vcAlleles.stream().forEach(a -> alleleCounts.set(a, new MutableInt(0)));
for (final Map.Entry<GATKSAMRecord, Map<Allele, Double>> readAlleleLikelihoodMap : pralm.getLikelihoodReadMap().entrySet()) {
final GATKSAMRecord read = readAlleleLikelihoodMap.getKey();
final Map<Allele, Double> alleleLikelihoodMap = readAlleleLikelihoodMap.getValue();
final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(alleleLikelihoodMap, vcAlleles);
if (read.getMappingQuality() > 0 && mostLikelyAllele.isInformative()) {
alleleCounts.get(mostLikelyAllele.getMostLikelyAllele()).increment();
}
// if (al >= rl) logger.info("Alt found in " + e.getKey().getReadName());
}
return counts;
}
final PerAlleleCollection<Integer> result = PerAlleleCollection.createPerRefAndAltAlleleCollection();
vc.getAlleles().stream().forEach(a -> result.set(a, alleleCounts.get(a).toInteger()));
return(result);
}
private void logM2Debug(String s) {
if (MTAC.M2_DEBUG) {
@ -465,26 +511,16 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
}
}
// would have used org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap.getMostLikelyAllele but we have this case where
// there is a read that doesn't overlap the variant site, and thus supports both alleles equally.
private boolean arePairHMMLikelihoodsInformative(double l1, double l2) {
    // A read whose two allele likelihoods are (nearly) identical carries no information
    // about which allele it supports, so require at least this much separation.
    // TODO: should this be parameterized, or simply encoded
    final double minInformativeGap = 0.1;
    final double gap = Math.abs(l1 - l2);
    return gap >= minInformativeGap;
}
private void filterPRALMForOverlappingReads(PerReadAlleleLikelihoodMap pralm, Allele ref, int location, boolean retainMismatches) {
Map<GATKSAMRecord, Map<Allele, Double>> m = pralm.getLikelihoodReadMap();
private void filterPRALMForOverlappingReads(final PerReadAlleleLikelihoodMap pralm, final Allele ref, final int location, final boolean retainMismatches) {
final Map<GATKSAMRecord, Map<Allele, Double>> m = pralm.getLikelihoodReadMap();
// iterate through the reads, if the name has been seen before we have overlapping (potentially) fragments, so handle them
Map<String, GATKSAMRecord> nameToRead = new HashMap<>();
Set<GATKSAMRecord> readsToKeep = new HashSet<>();
final Map<String, GATKSAMRecord> nameToRead = new HashMap<>();
final Set<GATKSAMRecord> readsToKeep = new HashSet<>();
for(GATKSAMRecord rec : m.keySet()) {
for(final GATKSAMRecord rec : m.keySet()) {
// if we haven't seen it... just record the name and add it to the list of reads to keep
GATKSAMRecord existing = nameToRead.get(rec.getReadName());
final GATKSAMRecord existing = nameToRead.get(rec.getReadName());
if (existing == null) {
nameToRead.put(rec.getReadName(), rec);
readsToKeep.add(rec);
@ -496,11 +532,11 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
// TODO: CHECK IF THE READS BOTH OVERLAP THE POSITION!!!!
if ( ReadUtils.isInsideRead(existing, location) && ReadUtils.isInsideRead(rec, location) ) {
MostLikelyAllele existingMLA = pralm.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(existing));
Allele existingAllele = existingMLA.getMostLikelyAllele();
final MostLikelyAllele existingMLA = PerReadAlleleLikelihoodMap.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(existing));
final Allele existingAllele = existingMLA.getMostLikelyAllele();
MostLikelyAllele recMLA = pralm.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(rec));
Allele recAllele = recMLA.getMostLikelyAllele();
final MostLikelyAllele recMLA = PerReadAlleleLikelihoodMap.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(rec));
final Allele recAllele = recMLA.getMostLikelyAllele();
// if the reads disagree at this position...
if (!existingAllele.equals(recAllele)) {
@ -545,15 +581,20 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
}
}
// Move to utility class so we can use one shared with HaplotypeCallerGenotypingEngine
/**
 * Returns a copy of the given variant context with the symbolic &lt;NON_REF&gt; allele
 * appended to its allele list. The input context is not modified.
 *
 * @param mergedVC variant context to extend
 * @return a new VariantContext whose alleles are the originals plus NON_REF
 */
private VariantContext addNonRefSymbolicAllele(final VariantContext mergedVC) {
    // Fetch the allele list once; the original fetched it, then redundantly re-queried
    // mergedVC.getAlleles() when populating the new list.
    final List<Allele> originalAlleles = mergedVC.getAlleles();
    final List<Allele> alleleList = new ArrayList<>(originalAlleles.size() + 1);
    alleleList.addAll(originalAlleles);
    alleleList.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
    return new VariantContextBuilder(mergedVC).alleles(alleleList).make();
}
/**
 * Partitions a per-read allele likelihood map into forward-strand and reverse-strand maps.
 * Strandless reads are skipped entirely; every other read's (allele, likelihood) entries are
 * copied into exactly one of the two output maps according to its strand flag.
 *
 * @param originalPRALM source map; not modified
 * @param forwardPRALM  receives entries for forward-strand reads (mutated in place)
 * @param reversePRALM  receives entries for reverse-strand reads (mutated in place)
 */
private void splitPRALMintoForwardAndReverseReads(final PerReadAlleleLikelihoodMap originalPRALM, final PerReadAlleleLikelihoodMap forwardPRALM, final PerReadAlleleLikelihoodMap reversePRALM) {
    final Map<GATKSAMRecord, Map<Allele, Double>> origReadAlleleLikelihoodMap = originalPRALM.getLikelihoodReadMap();
    // Iterate over entries rather than keySet()+get() to avoid a redundant map lookup per read.
    for (final Map.Entry<GATKSAMRecord, Map<Allele, Double>> readEntry : origReadAlleleLikelihoodMap.entrySet()) {
        final GATKSAMRecord read = readEntry.getKey();
        if (read.isStrandless()) {
            continue;
        }
        // Choose the destination map once per read instead of once per allele.
        final PerReadAlleleLikelihoodMap destination = read.getReadNegativeStrandFlag() ? reversePRALM : forwardPRALM;
        for (final Map.Entry<Allele, Double> alleleLikelihood : readEntry.getValue().entrySet()) {
            destination.add(read, alleleLikelihood.getKey(), alleleLikelihood.getValue());
        }
    }
}
}

View File

@ -0,0 +1,201 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 ("BROAD") and the LICENSEE and is effective at the date the downloading is completed ("EFFECTIVE DATE").
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system ("PHONE-HOME") which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE'S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2016 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.tools.walkers.cancer.m2;
import org.apache.commons.math.MathException;
import org.apache.commons.math.distribution.BinomialDistribution;
import org.apache.commons.math.distribution.BinomialDistributionImpl;
import org.apache.commons.math3.util.Pair;
import org.broadinstitute.gatk.utils.exceptions.GATKException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.OptionalInt;
import java.util.stream.IntStream;
/**
 * Computes the statistical power to detect a somatic variant given a read count and an
 * estimated allele fraction, memoizing results so repeated queries with the same
 * (read count, allele fraction) pair are not recomputed.
 */
public class TumorPowerCalculator {
    private final double errorProbability;
    private final double tumorLODThreshold;
    private final double contamination;
    private final boolean enableSmoothing;

    /** Memo of previously computed power values, keyed by (numReads, alleleFraction). */
    private final HashMap<PowerCacheKey, Double> cache = new HashMap<>();

    public TumorPowerCalculator(double errorProbability, double constantLodThreshold, double contamination) {
        this(errorProbability, constantLodThreshold, contamination, true);
    }

    public TumorPowerCalculator(double errorProbability, double tumorLODThreshold, double contamination, boolean enableSmoothing) {
        this.errorProbability = errorProbability;
        this.tumorLODThreshold = tumorLODThreshold;
        this.contamination = contamination;
        this.enableSmoothing = enableSmoothing;
    }

    /**
     * Key for the memo of pre-computed power: the pair (number of reads, allele fraction).
     *
     * BUGFIX: the original equals() compared the boxed Integer fields with != (a reference
     * comparison) and in fact required numReads to DIFFER for two keys to be "equal", so no
     * cache lookup could ever hit. It also used an epsilon comparison on alleleFraction while
     * hashing the exact bits, violating the equals/hashCode contract. This version uses exact
     * value equality consistent with hashCode.
     *
     * TODO: Not ideal to use double as a key. Refactor such that we use as keys numAlts and numReads, which are integers. Then calculate numAlts/numReads when we need allele fraction.
     */
    private static final class PowerCacheKey {
        private final int numReads;
        private final double alleleFraction;

        PowerCacheKey(final int numReads, final double alleleFraction) {
            this.numReads = numReads;
            this.alleleFraction = alleleFraction;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;
            final PowerCacheKey that = (PowerCacheKey) o;
            return numReads == that.numReads
                    && Double.doubleToLongBits(alleleFraction) == Double.doubleToLongBits(that.alleleFraction);
        }

        @Override
        public int hashCode() {
            final long bits = Double.doubleToLongBits(alleleFraction);
            return 31 * numReads + (int) (bits ^ (bits >>> 32));
        }
    }

    /**
     * @param numReads total number of reads, REF and ALT combined, in + or - strand
     * @param alleleFraction the true allele fraction estimated as the combined allele fraction from + and - reads
     * @return probability of correctly calling the variant (i.e. power) given the above estimated allele fraction and number of reads.
     *         We compute power separately for each strand (+ and -).
     */
    public double cachedPowerCalculation(final int numReads, final double alleleFraction) {
        final PowerCacheKey key = new PowerCacheKey(numReads, alleleFraction);
        // First check whether power for this (numReads, alleleFraction) pair has already been
        // computed; if not, compute it and store it in the cache.
        Double power = cache.get(key);
        if (power == null) {
            try {
                power = calculatePower(numReads, alleleFraction);
            } catch (final Exception ex) {
                throw new GATKException("Power calculation failed", ex);
            }
            cache.put(key, power);
        }
        return power;
    }

    /** Helper for calculateTumorLod: log10 likelihood of the data under the given allele fraction. */
    private double calculateLogLikelihood(final int numReads, final int numAlts, final double alleleFraction) {
        return (numReads - numAlts) * Math.log10(alleleFraction * errorProbability + (1 - alleleFraction) * (1 - errorProbability)) +
                numAlts * Math.log10(alleleFraction * (1 - errorProbability) + (1 - alleleFraction) * errorProbability);
    }

    /** Tumor LOD: log10 likelihood ratio of the observed alt fraction versus the contamination-only model. */
    private double calculateTumorLod(final int numReads, final int numAlts) {
        final double alleleFraction = (double) numAlts / (double) numReads;
        final double altLikelihood = calculateLogLikelihood(numReads, numAlts, alleleFraction);
        final double refLikelihood = calculateLogLikelihood(numReads, numAlts, contamination);
        return altLikelihood - refLikelihood;
    }

    /**
     * Power to call the variant: the probability, under a binomial model of alt-read counts,
     * that the observed alt count yields a tumor LOD above the threshold.
     */
    private double calculatePower(final int numReads, final double alleleFraction) throws MathException {
        if (numReads == 0) return 0;

        // Probability that a single read shows the ALT allele. The factor 1/3 accounts for a
        // sequencing error producing one particular alternate base out of three possibilities.
        // BUGFIX: the original wrote the integer division (1/3) == 0, which silently dropped
        // the entire error term from this expression.
        final double probAltRead = alleleFraction * (1 - errorProbability) + (1.0 / 3) * (1 - alleleFraction) * errorProbability;
        final BinomialDistribution binom = new BinomialDistributionImpl(numReads, probAltRead);
        final double[] binomialProbabilities = IntStream.range(0, numReads + 1).mapToDouble(binom::probability).toArray();

        // find the smallest number of ALT reads k such that tumorLOD(k) > tumorLODThreshold
        final OptionalInt smallestKAboveLogThreshold = IntStream.range(0, numReads + 1)
                .filter(k -> calculateTumorLod(numReads, k) > tumorLODThreshold)
                .findFirst();

        if (!smallestKAboveLogThreshold.isPresent()) {
            return 0;
        }

        if (smallestKAboveLogThreshold.getAsInt() <= 0) {
            throw new IllegalStateException("smallest k that meets the tumor LOD threshold is less than or equal to 0");
        }

        double power = Arrays.stream(binomialProbabilities, smallestKAboveLogThreshold.getAsInt(), binomialProbabilities.length).sum();

        // here we correct for the fact that the exact lod threshold is likely somewhere between
        // the k and k-1 bin, so we prorate the power from that bin
        if (enableSmoothing) {
            final double tumorLODAtK = calculateTumorLod(numReads, smallestKAboveLogThreshold.getAsInt());
            final double tumorLODAtKMinusOne = calculateTumorLod(numReads, smallestKAboveLogThreshold.getAsInt() - 1);
            final double weight = 1 - (tumorLODThreshold - tumorLODAtKMinusOne) / (tumorLODAtK - tumorLODAtKMinusOne);
            power += weight * binomialProbabilities[smallestKAboveLogThreshold.getAsInt() - 1];
        }

        return power;
    }
}

View File

@ -52,6 +52,7 @@
package org.broadinstitute.gatk.tools.walkers.genotyper;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.utils.IndexRange;
import org.broadinstitute.gatk.utils.MathUtils;
import java.util.Arrays;
@ -108,7 +109,18 @@ import java.util.List;
*/
public class GenotypeAlleleCounts implements Comparable<GenotypeAlleleCounts>, Cloneable {
private double log10CombinationCount;
private static final double UNCOMPUTED_LOG_10_COMBINATION_COUNT = -1;
/**
* The log10 number of phased genotypes corresponding to this unphased genotype. For example,
* [0, 1, 1, 1] = AB: log10(2)
* [0, 2] = AA: log10(1)
* [0, 1, 1, 1, 2, 1] = ABC: log10(6)
* [0, 2, 1, 2] = AABB: log10(4!/(2!2!))
* This is evaluated lazily i.e. it is initialized to {@link GenotypeAlleleCounts::UNCOMPUTED_LOG_10_COMBINATION_COUNT}
* and only calculated if its getter is invoked.
*/
private double log10CombinationCount = UNCOMPUTED_LOG_10_COMBINATION_COUNT;
/**
* The ploidy of the genotype.
@ -156,38 +168,30 @@ public class GenotypeAlleleCounts implements Comparable<GenotypeAlleleCounts>, C
* @param index the genotype index.
*/
private GenotypeAlleleCounts(final int ploidy, final int index, final int... sortedAlleleCounts) {
this(ploidy, index, sortedAlleleCounts, sortedAlleleCounts.length >> 1);
}
private GenotypeAlleleCounts(final int ploidy, final int index, final int[] sortedAlleleCounts, final int distinctAlleleCount){
this.ploidy = ploidy;
this.sortedAlleleCounts = sortedAlleleCounts;
distinctAlleleCount = sortedAlleleCounts.length >> 1;
log10CombinationCount = -1;
this.index = index;
this.sortedAlleleCounts = sortedAlleleCounts;
this.distinctAlleleCount = distinctAlleleCount;
}
/**
* Returns the log10 of the number of possible allele combinations that would give raise to this allele count.
* @return 0 or less.
* Gets the log10 combination count, computing it if uninitialized. Note that the invoked MathUtils method uses fast cached
* log10 values of integers for any reasonable ploidy.
*
* This method should be invoked on instances of {@link GenotypeAlleleCounts} cached in {@link GenotypeLikelihoodCalculators::genotypeTableByPloidy}.
* Such usage allows the result of this computation to be cached once for an entire run of HaplotypeCaller.
* @return
*/
public double log10CombinationCount() {
if (log10CombinationCount == -1)
return log10CombinationCount = calculateLog10CombinationCount();
else
return log10CombinationCount;
}
/**
* Calculates log10 combination count.
*
* @return 0 or less.
*/
private double calculateLog10CombinationCount() {
if (ploidy <= 1)
return 0;
else {
final int[] counts = new int[distinctAlleleCount];
for (int i = 0, j = 1; i < distinctAlleleCount; i++, j+=2)
counts[i] = sortedAlleleCounts[j];
return MathUtils.log10MultinomialCoefficient(ploidy, counts);
if (log10CombinationCount == UNCOMPUTED_LOG_10_COMBINATION_COUNT) {
log10CombinationCount = MathUtils.log10Factorial(ploidy)
- new IndexRange(0, distinctAlleleCount).sum(n -> MathUtils.log10Factorial(sortedAlleleCounts[2*n+1]));
}
return log10CombinationCount;
}
/**
@ -785,4 +789,22 @@ public class GenotypeAlleleCounts implements Comparable<GenotypeAlleleCounts>, C
}
return result;
}
/**
* Consumer of an (allele index, allele count) pair describing one distinct allele
* present in a genotype and how many copies of it the genotype carries.
*/
@FunctionalInterface
public interface IntBiConsumer {
void accept(final int alleleIndex, final int alleleCount);
}
/**
* Function of an (allele index, allele count) pair returning a double, used to map each
* distinct allele of a genotype to a value (e.g. for summation).
*/
@FunctionalInterface
public interface IntToDoubleBiFunction {
double apply(final int alleleIndex, final int alleleCount);
}
/**
* Applies {@code action} to every distinct allele in this genotype, passing its allele
* index and its copy count.
*
* Alleles are stored interleaved in {@code sortedAlleleCounts} as (index, count) pairs,
* hence the {@code 2*n} / {@code 2*n+1} accesses. Assumes {@code IndexRange.forEach}
* visits 0 .. distinctAlleleCount-1 in order — TODO confirm against IndexRange.
*
* @param action the consumer invoked once per distinct allele.
*/
public void forEachAlleleIndexAndCount(final IntBiConsumer action) {
new IndexRange(0, distinctAlleleCount).forEach(n -> action.accept(sortedAlleleCounts[2*n], sortedAlleleCounts[2*n+1]));
}
/**
* Sums {@code func} evaluated at every distinct allele of this genotype, passing the
* allele index and its copy count for each.
*
* Alleles are stored interleaved in {@code sortedAlleleCounts} as (index, count) pairs,
* hence the {@code 2*n} / {@code 2*n+1} accesses. Assumes {@code IndexRange.sum}
* accumulates over 0 .. distinctAlleleCount-1 — TODO confirm against IndexRange.
*
* @param func the per-allele term of the sum.
* @return the sum of {@code func} over all distinct alleles.
*/
public double sumOverAlleleIndicesAndCounts(final IntToDoubleBiFunction func) {
return new IndexRange(0, distinctAlleleCount).sum(n -> func.apply(sortedAlleleCounts[2*n], sortedAlleleCounts[2*n+1]));
}
}

View File

@ -163,7 +163,7 @@ public class GenotypeLikelihoodCalculator {
* <p>This is in fact a shallow copy if {@link GenotypeLikelihoodCalculators#ploidyLog10}</p> and is not meant to be modified by
* this class. </p>
*/
private final double[] log10;
private final double[] ploidyLog10;
/**
* Buffer field use as a temporal container for sorted allele counts when calculating the likelihood of a
@ -202,24 +202,22 @@ public class GenotypeLikelihoodCalculator {
* Creates a new calculator providing its ploidy and number of genotyping alleles.
*/
protected GenotypeLikelihoodCalculator(final int ploidy, final int alleleCount,
final int[][] alleleFirstGenotypeOffsetByPloidy,
final GenotypeAlleleCounts[][] genotypeTableByPloidy,
final double[] ploidyLog10) {
final int[][] alleleFirstGenotypeOffsetByPloidy,
final GenotypeAlleleCounts[][] genotypeTableByPloidy,
final double[] ploidyLog10) {
this.alleleFirstGenotypeOffsetByPloidy = alleleFirstGenotypeOffsetByPloidy;
genotypeAlleleCounts = genotypeTableByPloidy[ploidy];
this.alleleCount = alleleCount;
this.ploidy = ploidy;
genotypeCount = this.alleleFirstGenotypeOffsetByPloidy[ploidy][alleleCount];
if (genotypeCount == GenotypeLikelihoodCalculators.GENOTYPE_COUNT_OVERFLOW)
throw new IllegalArgumentException(
String.format("the combination of ploidy (%s) and number of alleles (%s) results in a very large number of genotypes (> %s). You need to limit ploidy or the number of alternative alleles to analyze this locus",
ploidy,alleleCount,Integer.MAX_VALUE));
alleleHeap = new IntMaxHeap(ploidy);
readLikelihoodsByGenotypeIndex = new double[genotypeCount][];
log10 = ploidyLog10;
this.ploidyLog10 = ploidyLog10;
// The number of possible components is limited by distinct allele count and ploidy.
maximumDistinctAllelesInGenotype = Math.min(ploidy, alleleCount);
genotypeAllelesAndCounts = new int[maximumDistinctAllelesInGenotype << 1];
genotypeAllelesAndCounts = new int[maximumDistinctAllelesInGenotype*2];
}
/**
@ -349,7 +347,7 @@ public class GenotypeLikelihoodCalculator {
*/
private double[] genotypeLikelihoods(final double[][] readLikelihoodsByGenotypeIndex, final int readCount) {
final double[] result = new double[genotypeCount];
final double denominator = readCount * log10[ploidy]; // instead of dividing each read likelihood by ploidy
final double denominator = readCount * ploidyLog10[ploidy]; // instead of dividing each read likelihood by ploidy
// ( so subtract log10(ploidy) ) we multiply them all and the divide by ploidy^readCount (so substract readCount * log10(ploidy) )
for (int g = 0; g < genotypeCount; g++) {
final double[] likelihoodsByRead = readLikelihoodsByGenotypeIndex[g];
@ -464,7 +462,9 @@ public class GenotypeLikelihoodCalculator {
* exactly one allele present in the genotype.
*/
private void singleComponentGenotypeLikelihoodByRead(final GenotypeAlleleCounts genotypeAlleleCounts,
final double[] likelihoodByRead, final double[] readLikelihoodComponentsByAlleleCount, final int readCount) {
final double[] likelihoodByRead,
final double[] readLikelihoodComponentsByAlleleCount,
final int readCount) {
final int allele = genotypeAlleleCounts.alleleIndexAt(0);
// the count of the only component must be = ploidy.
int offset = (allele * (ploidy + 1) + ploidy) * readCount;
@ -493,7 +493,7 @@ public class GenotypeLikelihoodCalculator {
// p = 2 because the frequency == 1 we already have it.
for (int frequency = 2, destinationOffset = frequency1Offset + readCount; frequency <= ploidy; frequency++) {
final double log10frequency = log10[frequency];
final double log10frequency = ploidyLog10[frequency];
for (int r = 0, sourceOffset = frequency1Offset; r < readCount; r++)
readAlleleLikelihoodByAlleleCount[destinationOffset++] =
readAlleleLikelihoodByAlleleCount[sourceOffset++] + log10frequency;
@ -620,7 +620,11 @@ public class GenotypeLikelihoodCalculator {
* @param destination where to store the new genotype index mapping to old.
* @param sortedAlleleCountsBuffer a buffer to re-use to get the genotype-allele-count's sorted allele counts.
*/
private void genotypeIndexMapPerGenotypeIndex(final int newGenotypeIndex, final GenotypeAlleleCounts alleleCounts, final int[] oldToNewAlleleIndexMap, final int[] destination, final int[] sortedAlleleCountsBuffer) {
private void genotypeIndexMapPerGenotypeIndex(final int newGenotypeIndex,
final GenotypeAlleleCounts alleleCounts,
final int[] oldToNewAlleleIndexMap,
final int[] destination,
final int[] sortedAlleleCountsBuffer) {
final int distinctAlleleCount = alleleCounts.distinctAlleleCount();
alleleCounts.copyAlleleCounts(sortedAlleleCountsBuffer,0);
for (int j = 0, jj = 0; j < distinctAlleleCount; j++) {

View File

@ -51,7 +51,11 @@
package org.broadinstitute.gatk.tools.walkers.genotyper;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.exceptions.GATKException;
import java.util.Arrays;
import java.util.stream.IntStream;
/**
* Genotype likelihood calculator utility.
@ -116,7 +120,10 @@ public class GenotypeLikelihoodCalculators {
private volatile static GenotypeAlleleCounts[][] genotypeTableByPloidy =
buildGenotypeAlleleCountsTable(maximumPloidy,maximumAllele,alleleFirstGenotypeOffsetByPloidy);
/**
* Cached log10 values for the first integer up to the maximum ploidy requested thus far.
*/
private volatile static double[] ploidyLog10 = IntStream.range(0, maximumPloidy + 1).mapToDouble(Math::log10).toArray();
/**
* Build the table with the genotype offsets based on ploidy and the maximum allele index with representation
@ -291,40 +298,29 @@ public class GenotypeLikelihoodCalculators {
return result;
}
/**
* Cached log10 values for the first integer up to the maximum ploidy requested thus far.
*/
private volatile static double[] ploidyLog10;
// Initialize {@link #ploidyLog10}.
static {
ploidyLog10 = new double[maximumPloidy + 1];
for (int i = 0; i <= maximumPloidy; i++)
ploidyLog10[i] = Math.log10(i);
}
/**
* Returns an instance given its ploidy and the number of alleles.
*
* @param alleleCount the required allele-count.
* @param ploidy the required ploidy-count.
*
* @throws IllegalArgumentException if either {@code ploidy} or {@code alleleCount} is {@code null}, or
* the resulting number of genotypes is too large.
* @throws IllegalArgumentException if either {@code ploidy} or {@code alleleCount} is negative, or the resulting number of genotypes is too large.
*
* @return never {@code null}.
*/
public static GenotypeLikelihoodCalculator getInstance(final int ploidy,
final int alleleCount) {
public static GenotypeLikelihoodCalculator getInstance(final int ploidy, final int alleleCount) {
checkPloidyAndMaximumAllele(ploidy, alleleCount);
// Non-thread safe (fast) check on tables capacities,
// if not enough capacity we expand the tables in a thread-safe manner:
if (alleleCount > maximumAllele || ploidy > maximumPloidy)
ensureCapacity(alleleCount, ploidy);
// if not enough capacity we expand the tables in a thread-safe manner
// also checks if the requested ploidy and allele count result in a genotype count too large to deal with
if(calculateGenotypeCountUsingTables(ploidy, alleleCount) == GENOTYPE_COUNT_OVERFLOW){
final double largeGenotypeCount = MathUtils.binomialCoefficient(ploidy + alleleCount - 1, alleleCount - 1);
throw new IllegalArgumentException(String.format("the number of genotypes is too large for ploidy %d and allele %d: approx. %.0f", ploidy, alleleCount, largeGenotypeCount));
}
// At this point the tables must have at least the requested capacity, likely to be much more.
return new GenotypeLikelihoodCalculator(ploidy,alleleCount,alleleFirstGenotypeOffsetByPloidy,genotypeTableByPloidy,ploidyLog10);
return new GenotypeLikelihoodCalculator(ploidy, alleleCount, alleleFirstGenotypeOffsetByPloidy, genotypeTableByPloidy, ploidyLog10);
}
/**
@ -413,14 +409,59 @@ public class GenotypeLikelihoodCalculators {
* @param ploidy the requested ploidy.
* @param alleleCount the requested number of alleles.
*
* @throws IllegalArgumentException if {@code ploidy} or {@code alleleCount} is negative.
* @throws IllegalArgumentException if {@code ploidy} or {@code alleleCount} is negative or
* the number of genotypes is too large (more than {@link Integer#MAX_VALUE}).
*
* @return 0 or greater.
* @return the number of genotypes given ploidy and allele count (0 or greater).
*/
public final static int genotypeCount(final int ploidy, final int alleleCount) {
    // Resolve the count from the cached tables, growing them first if necessary.
    final int count = calculateGenotypeCountUsingTables(ploidy, alleleCount);
    if (count != GENOTYPE_COUNT_OVERFLOW) {
        return count;
    }
    // The genotype count overflows an int; report its approximate magnitude in the error.
    final double largeGenotypeCount = MathUtils.binomialCoefficient(ploidy + alleleCount - 1, alleleCount - 1);
    throw new IllegalArgumentException(String.format("the number of genotypes is too large for ploidy %d and allele %d: approx. %.0f", ploidy, alleleCount, largeGenotypeCount));
}
/**
* Compute the maximally acceptable allele count (ref allele included) given the maximally acceptable genotype count.
*
* Searches a narrow candidate interval derived from the closed-form bounds below, checking
* candidates one by one from the largest downward and returning the first that keeps the
* genotype count within {@code maxGenotypeCount}.
*
* @param ploidy sample ploidy
* @param maxGenotypeCount maximum number of genotype count used to calculate upper bound on number of alleles given ploidy
* @throws IllegalArgumentException if {@code ploidy} is negative (via the ploidy sanity check below).
* @return the maximally acceptable allele count given ploidy and maximum number of genotypes acceptable
*/
public static int computeMaxAcceptableAlleleCount(final int ploidy, final int maxGenotypeCount){
checkPloidyAndMaximumAllele(ploidy, ploidy); // a hack to check ploidy makes sense (could duplicate code but choice must be made)
final double log10MaxGenotypeCount = Math.log10(maxGenotypeCount);
// Math explanation: genotype count is determined by ${P+A-1 \choose A-1}$, this leads to constraint
// $\log(\frac{(P+A-1)!}{(A-1)!}) \le \log(P!G)$,
// where $P$ is ploidy, $A$ is allele count, and $G$ is maxGenotypeCount
// The upper and lower bounds of the left hand side of the constraint are $P \log(A-1+P)$ and $P \log(A)$
// which require $A$ to be searched in interval $[10^{\log(P!G)/P} - (P-1), 10^{\log(P!G)/P}]$
// Denote $[10^{\log(P!G)/P}$ as $x$ in the code.
final double x = Math.pow(10, (MathUtils.log10Factorial(ploidy) + log10MaxGenotypeCount)/ploidy );
// NOTE(review): the comment above gives lower bound x - (P-1), but the code uses
// floor(x) - ploidy - 1, i.e. a slightly wider (safer) interval — confirm intended.
final int lower = (int)Math.floor(x) - ploidy - 1;
final int upper = (int)Math.ceil(x);
for(int a=upper; a>=lower; --a){// check one by one, descending, so the first hit is the maximum
final double log10GTCnt = MathUtils.log10BinomialCoefficient(ploidy+a-1, a-1);
if(log10MaxGenotypeCount >= log10GTCnt) {
return a;
}
}
// Unreachable if the interval bounds above are correct.
throw new GATKException("Code should never reach here.");
}
/**
* Returns the genotype count for the given ploidy and allele count by looking it up in the
* cached offset tables, growing the tables first if the request exceeds their current capacity.
*
* Note: the merged code performed the capacity check twice (unbraced and braced variants,
* a merge artifact); a single check is sufficient since ensureCapacity expands the tables.
*
* @param ploidy the requested ploidy; must be non-negative.
* @param alleleCount the requested number of alleles; must be non-negative.
* @return the cached genotype count, possibly GENOTYPE_COUNT_OVERFLOW on overflow.
*/
private static int calculateGenotypeCountUsingTables(int ploidy, int alleleCount) {
    checkPloidyAndMaximumAllele(ploidy, alleleCount);
    if (ploidy > maximumPloidy || alleleCount > maximumAllele) {
        ensureCapacity(alleleCount, ploidy);
    }
    return alleleFirstGenotypeOffsetByPloidy[ploidy][alleleCount];
}
}

View File

@ -60,6 +60,7 @@ import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculationResult;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AlleleFrequencyCalculator;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.MathUtils;
@ -68,6 +69,7 @@ import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.genotyper.SampleList;
import org.broadinstitute.gatk.utils.gga.GenotypingGivenAllelesUtils;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
@ -104,8 +106,9 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
protected final GenomeLocParser genomeLocParser;
protected static int maxNumPLValuesObserved = 0;
protected static int numTimesMaxNumPLValuesExceeded = 0;
private final List<GenomeLoc> upstreamDeletionsLoc = new LinkedList<>();
protected final AFCalculator newAFCalculator;
/**
* Construct a new genotyper engine, on a specific subset of samples.
@ -139,6 +142,11 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
log10AlleleFrequencyPriorsIndels = composeAlleleFrequencyPriorProvider(numberOfGenomes,
configuration.genotypeArgs.indelHeterozygosity, configuration.genotypeArgs.inputPrior);
this.genomeLocParser = genomeLocParser;
final double refPseudocount = configuration.genotypeArgs.snpHeterozygosity / Math.pow(configuration.genotypeArgs.heterozygosityStandardDeviation,2);
final double snpPseudocount = configuration.genotypeArgs.snpHeterozygosity * refPseudocount;
final double indelPseudocount = configuration.genotypeArgs.indelHeterozygosity * refPseudocount;
newAFCalculator = new AlleleFrequencyCalculator(refPseudocount, snpPseudocount, indelPseudocount, configuration.genotypeArgs.samplePloidy);
}
protected GenotypingEngine(final Config configuration, final SampleList samples,
@ -219,7 +227,7 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
final AlignmentContext rawContext, Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model,
final boolean inheritAttributesFromInputVC,
final Map<String, org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap,
final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap,
final boolean doAlleleSpecificCalcs) {
final boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null;
@ -230,10 +238,17 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
final int defaultPloidy = configuration.genotypeArgs.samplePloidy;
final int maxAltAlleles = configuration.genotypeArgs.MAX_ALTERNATE_ALLELES;
final int maxNumPLValues = configuration.genotypeArgs.MAX_NUM_PL_VALUES;
final AFCalculator afCalculator = afCalculatorProvider.getInstance(vc,defaultPloidy,maxAltAlleles).setMaxNumPLValues(maxNumPLValues);
final AFCalculationResult AFresult = afCalculator.getLog10PNonRef(vc, defaultPloidy,maxAltAlleles, getAlleleFrequencyPriors(vc,defaultPloidy,model));
final OutputAlleleSubset outputAlternativeAlleles = calculateOutputAlleleSubset(AFresult);
// NOTE: in GATK4, allele subsetting has been extracted out of the AFCalculator into a utils class
// The new AFCalculator (AlleleFrequencyCalculator) of GATK4 therefore does not implement this subsetting,
// which *includes attaching a genotype call to a VariantContext*. In order to backport the new AFCalculator
// it is necessary to use it only for the qual score calculation and not for any other duties.
final AFCalculator afCalculatorForAlleleSubsetting = afCalculatorProvider.getInstance(vc,defaultPloidy,maxAltAlleles).setMaxNumPLValues(maxNumPLValues);
final AFCalculator afCalculatorForQualScore = configuration.genotypeArgs.USE_NEW_AF_CALCULATOR ? newAFCalculator : afCalculatorForAlleleSubsetting;
final AFCalculationResult AFresult = afCalculatorForQualScore.getLog10PNonRef(vc, defaultPloidy,maxAltAlleles, getAlleleFrequencyPriors(vc,defaultPloidy,model));
final OutputAlleleSubset outputAlternativeAlleles = calculateOutputAlleleSubset(AFresult, vc);
final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0());
@ -274,7 +289,7 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
// create the genotypes
final GenotypesContext genotypes = afCalculator.subsetAlleles(vc, defaultPloidy, outputAlleles, true);
final GenotypesContext genotypes = afCalculatorForAlleleSubsetting.subsetAlleles(vc, defaultPloidy, outputAlleles, true);
builder.genotypes(genotypes);
// *** note that calculating strand bias involves overwriting data structures, so we do that last
@ -341,13 +356,13 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
}
}
/**
* Provided the exact mode computations it returns the appropriate subset of alleles that progress to genotyping.
* @param afcr the exact model calculation result.
* @return never {@code null}.
* @param vc the input variant context
* @return information about the alternative allele subsetting; never {@code null}.
*/
private OutputAlleleSubset calculateOutputAlleleSubset(final AFCalculationResult afcr) {
private OutputAlleleSubset calculateOutputAlleleSubset(final AFCalculationResult afcr, final VariantContext vc) {
final List<Allele> alleles = afcr.getAllelesUsedInGenotyping();
final int alternativeAlleleCount = alleles.size() - 1;
@ -355,23 +370,74 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
final int[] mleCounts = new int[alternativeAlleleCount];
int outputAlleleCount = 0;
boolean siteIsMonomorphic = true;
for (final Allele alternativeAllele : alleles) {
if (alternativeAllele.isReference()) continue;
// we want to keep the NON_REF symbolic allele but only in the absence of a non-symbolic allele, e.g.
// if we combined a ref / NON_REF gVCF with a ref / alt gVCF
final boolean isNonRefWhichIsLoneAltAllele = alternativeAlleleCount == 1 && alternativeAllele == GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE;
final boolean isPlausible = afcr.isPolymorphicPhredScaledQual(alternativeAllele, configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING);
final boolean toOutput = isPlausible || forceKeepAllele(alternativeAllele) || isNonRefWhichIsLoneAltAllele;
int referenceAlleleSize = 0;
for (final Allele allele : alleles) {
if (allele.isReference() ) {
referenceAlleleSize = allele.length();
} else {
// we want to keep the NON_REF symbolic allele but only in the absence of a non-symbolic allele, e.g.
// if we combined a ref / NON_REF gVCF with a ref / alt gVCF
final boolean isNonRefWhichIsLoneAltAllele = alternativeAlleleCount == 1 && allele.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final boolean isPlausible = afcr.isPolymorphicPhredScaledQual(allele, configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING);
siteIsMonomorphic &= ! isPlausible;
if (!toOutput) continue;
outputAlleles[outputAlleleCount] = alternativeAllele;
mleCounts[outputAlleleCount++] = afcr.getAlleleCountAtMLE(alternativeAllele);
siteIsMonomorphic &= !isPlausible;
boolean toOutput = (isPlausible || forceKeepAllele(allele) || isNonRefWhichIsLoneAltAllele);
if ( allele.equals(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_DEPRECATED) ||
allele.equals(Allele.SPAN_DEL) ) {
toOutput &= coveredByDeletion(vc);
}
if (toOutput) {
outputAlleles[outputAlleleCount] = allele;
mleCounts[outputAlleleCount++] = afcr.getAlleleCountAtMLE(allele);
recordDeletion(referenceAlleleSize, allele, vc);
}
}
}
return new OutputAlleleSubset(outputAlleleCount,outputAlleles,mleCounts,siteIsMonomorphic);
}
/**
* Records a deletion allele so that later sites can be checked for overlap with it.
*
* If {@code allele} is shorter than the reference allele it is a deletion; the deleted span
* (variant start to start + deleted length) is appended to {@code upstreamDeletionsLoc}.
*
* @param referenceAlleleSize reference allele length
* @param allele allele of interest
* @param vc variant context
*/
private void recordDeletion(final int referenceAlleleSize, final Allele allele, final VariantContext vc) {
final int deletionSize = referenceAlleleSize - allele.length();
// Allele is a deletion when it is shorter than the reference allele.
if (deletionSize > 0) {
final GenomeLoc genomeLoc = genomeLocParser.createGenomeLocOnContig(vc.getContig(), vc.getStart(), vc.getStart() + deletionSize);
upstreamDeletionsLoc.add(genomeLoc);
}
}
/**
* Determines whether {@code vc} falls inside a previously recorded upstream deletion.
*
* Side effect: entries of the upstream-deletion list that can no longer overlap this or any
* later site (different contig, or ending before this position) are pruned while scanning.
* A deletion that starts at exactly the same position is the one the symbolic allele itself
* refers to and therefore does not count as covering this site.
*
* @param vc variant context
* @return true if the location is covered by an upstream deletion, false otherwise
*/
private boolean coveredByDeletion(final VariantContext vc) {
    final Iterator<GenomeLoc> deletions = upstreamDeletionsLoc.iterator();
    while (deletions.hasNext()) {
        final GenomeLoc deletion = deletions.next();
        // Prune deletions we have moved past: wrong contig, or ending before this site.
        if (!deletion.getContig().equals(vc.getContig()) || deletion.getStop() < vc.getStart()) {
            deletions.remove();
            continue;
        }
        // Same-start deletions are represented by the symbolic allele itself; skip them.
        if (deletion.getStart() != vc.getStart()) {
            return true;
        }
    }
    return false;
}
/**
* Checks whether even if the allele is not well supported by the data, we should keep it for genotyping.
*
@ -565,7 +631,7 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
if (!inputPriors.isEmpty()) {
// user-specified priors
if (inputPriors.size() != N)
throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 ");
throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples * ploidy");
for (final Double prior : inputPriors) {
if (prior <= 0 || prior >= 1) throw new UserException.BadArgumentValue("inputPrior","inputPrior vector values must be greater than 0 and less than 1");
}
@ -621,8 +687,7 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
protected final boolean passesEmitThreshold(double conf, boolean bestGuessIsRef) {
return (configuration.outputMode == OutputMode.EMIT_ALL_CONFIDENT_SITES || !bestGuessIsRef) &&
conf >= Math.min(configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING,
configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING);
conf >= configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING;
}
protected final boolean passesCallThreshold(double conf) {
@ -632,7 +697,7 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
protected Map<String,Object> composeCallAttributes(final boolean inheritAttributesFromInputVC, final VariantContext vc,
final AlignmentContext rawContext, final Map<String, AlignmentContext> stratifiedContexts, final RefMetaDataTracker tracker, final ReferenceContext refContext, final List<Integer> alleleCountsofMLE, final boolean bestGuessIsRef,
final AFCalculationResult AFresult, final List<Allele> allAllelesToUse, final GenotypesContext genotypes,
final GenotypeLikelihoodsCalculationModel.Model model, final Map<String, org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap,
final GenotypeLikelihoodsCalculationModel.Model model, final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap,
final boolean doAlleleSpecificCalcs) {
final HashMap<String, Object> attributes = new HashMap<>();
@ -728,7 +793,7 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
final double normalizedLog10ACeq0Posterior = log10ACeq0Posterior - log10PosteriorNormalizationConstant;
// This is another condition to return a 0.0 also present in AFCalculator code as well.
if (normalizedLog10ACeq0Posterior >= QualityUtils.qualToErrorProbLog10(configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING))
if (normalizedLog10ACeq0Posterior >= QualityUtils.qualToErrorProbLog10(configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING))
return 0.0;
return 1.0 - Math.pow(10.0, normalizedLog10ACeq0Posterior);

View File

@ -81,26 +81,30 @@ public class StandardCallerArgumentCollection implements Cloneable {
public GenotypingOutputMode genotypingOutputMode = GenotypingOutputMode.DISCOVERY;
/**
* When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding
* When --genotyping_mode is set to GENOTYPE_GIVEN_ALLELES mode, the caller will genotype the samples using only the alleles provide in this callset. Note that this is not well tested in HaplotypeCaller, and is definitely not suitable for use with HaplotypeCaller in -ERC GVCF mode. In addition, it does not apply to MuTect2 at all.
*/
@Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false)
@Input(fullName="alleles", shortName = "alleles", doc="Set of alleles to use in genotyping", required=false)
public RodBinding<VariantContext> alleles;
/**
* If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
* Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we
* will try to remove (N * contamination fraction) bases for each alternate allele.
* If this fraction is greater is than zero, the caller will aggressively attempt to remove
* contamination through biased down-sampling of reads (for all samples). Basically, it will ignore the
* contamination fraction of reads for each alternate allele. So if the pileup contains N
* total bases, then we will try to remove (N * contamination fraction) bases for each alternate
* allele.
*/
@Argument(fullName = "contamination_fraction_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false)
@Argument(fullName = "contamination_fraction_to_filter", shortName = "contamination", doc = "Fraction of contamination to aggressively remove", required = false)
public double CONTAMINATION_FRACTION = DEFAULT_CONTAMINATION_FRACTION;
public static final double DEFAULT_CONTAMINATION_FRACTION = 0.0;
/**
* This argument specifies a file with two columns "sample" and "contamination" specifying the contamination level for those samples.
* Samples that do not appear in this file will be processed with CONTAMINATION_FRACTION.
* This argument specifies a file with two columns "sample" and "contamination" (separated by a tab)
* specifying the contamination level for those samples (where contamination is given as a
* decimal number, not an integer) per line. There should be no header. Samples that do not appear
* in this file will be processed with CONTAMINATION_FRACTION.
**/
@Advanced
@Argument(fullName = "contamination_fraction_per_sample_file", shortName = "contaminationFile", doc = "Tab-separated File containing fraction of contamination in sequencing data (per sample) to aggressively remove. Format should be \"<SampleID><TAB><Contamination>\" (Contamination is double) per line; No header.", required = false)
@Argument(fullName = "contamination_fraction_per_sample_file", shortName = "contaminationFile", doc = "Contamination per sample", required = false)
public File CONTAMINATION_FRACTION_FILE = null;
/**
@ -148,23 +152,33 @@ public class StandardCallerArgumentCollection implements Cloneable {
* Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus.
*/
@Hidden
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model", required = false)
public AFCalculatorImplementation requestedAlleleFrequencyCalculationModel;
@Hidden
@Argument(shortName = "logExactCalls", doc="x", required=false)
public File exactCallsLog = null;
@Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false)
/**
* Experimental argument FOR USE WITH UnifiedGenotyper ONLY. When using HaplotypeCaller, use -ERC
* instead. When using GenotypeGVCFs, see -allSites.
*/
@Advanced
@Argument(fullName = "output_mode", shortName = "out_mode", doc = "Which type of calls we should output", required = false)
public OutputMode outputMode = OutputMode.EMIT_VARIANTS_ONLY;
/**
* Advanced, experimental argument: if SNP likelihood model is specified, and if EMIT_ALL_SITES output mode is set, when we set this argument then we will also emit PLs at all sites.
* This will give a measure of reference confidence and a measure of which alt alleles are more plausible (if any).
* Experimental argument FOR USE WITH UnifiedGenotyper ONLY: if SNP likelihood model
* is specified, and if EMIT_ALL_SITES output mode is set, when we set this argument then we
* will also emit PLs at all sites. This will give a measure of reference confidence and a
* measure of which alt alleles are more plausible (if any).
* WARNINGS:
* - This feature will inflate VCF file size considerably.
* - All SNP ALT alleles will be emitted with corresponding 10 PL values.
* - An error will be emitted if EMIT_ALL_SITES is not set, or if anything other than diploid SNP model is used
* - An error will be emitted if EMIT_ALL_SITES is not set, or if anything other than diploid
* SNP model is used
* - THIS WILL NOT WORK WITH HaplotypeCaller, GenotypeGVCFs or MuTect2! Use HaplotypeCaller with
* -ERC GVCF then GenotypeGVCFs instead. See the Best Practices documentation for more information.
*/
@Advanced
@Argument(fullName = "allSitePLs", shortName = "allSitePLs", doc = "Annotate all sites with PLs", required = false)

View File

@ -116,7 +116,6 @@ import java.util.*;
* --dbsnp dbSNP.vcf \
* -o snps.raw.vcf \
* -stand_call_conf [50.0] \
* -stand_emit_conf 10.0 \
* [-L targets.interval_list]
* </pre>
*

View File

@ -203,7 +203,7 @@ public class UnifiedGenotypingEngine extends GenotypingEngine<UnifiedArgumentCol
final List<GenotypeLikelihoodsCalculationModel.Model> models = getGLModelsToUse(tracker, rawContext);
final Map<String, org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap = new HashMap<>();
final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap = new HashMap<>();
final VariantCallContext defaultResult = configuration.outputMode == OutputMode.EMIT_ALL_SITES
&& configuration.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES
@ -267,7 +267,7 @@ public class UnifiedGenotypingEngine extends GenotypingEngine<UnifiedArgumentCol
public VariantContext calculateLikelihoods(final RefMetaDataTracker tracker,
final ReferenceContext refContext,
final AlignmentContext rawContext,
final Map<String, org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
final List<GenotypeLikelihoodsCalculationModel.Model> models = getGLModelsToUse(tracker, rawContext);
if ( models.isEmpty() ) {
return null;
@ -345,7 +345,7 @@ public class UnifiedGenotypingEngine extends GenotypingEngine<UnifiedArgumentCol
final List<Allele> alternateAllelesToUse,
final boolean useBAQedPileup,
final GenotypeLikelihoodsCalculationModel.Model model,
final Map<String, org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
return glcm.get().get(model.name()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine,
genomeLocParser != null || refContext == null ? genomeLocParser : refContext.getGenomeLocParser(), perReadAlleleLikelihoodMap);
@ -360,7 +360,7 @@ public class UnifiedGenotypingEngine extends GenotypingEngine<UnifiedArgumentCol
final AlignmentContext rawContext, Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model,
final boolean inheritAttributesFromInputVC,
final Map<String, org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, inheritAttributesFromInputVC, perReadAlleleLikelihoodMap, false);
}
@ -370,7 +370,7 @@ public class UnifiedGenotypingEngine extends GenotypingEngine<UnifiedArgumentCol
final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc,
final GenotypeLikelihoodsCalculationModel.Model model,
final Map<String, org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap,
final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap,
final boolean useAlleleSpecificCalcs) {
return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false, perReadAlleleLikelihoodMap, useAlleleSpecificCalcs);
}
@ -386,7 +386,7 @@ public class UnifiedGenotypingEngine extends GenotypingEngine<UnifiedArgumentCol
final AlignmentContext rawContext, Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model,
final boolean inheritAttributesFromInputVC,
final Map<String, org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap,
final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap,
final boolean useAlleleSpecificCalcs) {
boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null;
final VariantCallContext result = super.calculateGenotypes(tracker,refContext,rawContext,stratifiedContexts,vc,model,inheritAttributesFromInputVC,perReadAlleleLikelihoodMap, useAlleleSpecificCalcs);
@ -410,7 +410,7 @@ public class UnifiedGenotypingEngine extends GenotypingEngine<UnifiedArgumentCol
protected Map<String,Object> composeCallAttributes(final boolean inheritAttributesFromInputVC, final VariantContext vc,
final AlignmentContext rawContext, final Map<String, AlignmentContext> stratifiedContexts, final RefMetaDataTracker tracker, final ReferenceContext refContext, final List<Integer> alleleCountsofMLE, final boolean bestGuessIsRef,
final AFCalculationResult AFresult, final List<Allele> allAllelesToUse, final GenotypesContext genotypes,
final GenotypeLikelihoodsCalculationModel.Model model, final Map<String, org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap,
final GenotypeLikelihoodsCalculationModel.Model model, final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap,
final boolean useAlleleSpecificCalcs) {
final Map<String,Object> result = super.composeCallAttributes(inheritAttributesFromInputVC, vc,rawContext,stratifiedContexts,tracker,refContext,alleleCountsofMLE,bestGuessIsRef,
AFresult,allAllelesToUse,genotypes,model,perReadAlleleLikelihoodMap, useAlleleSpecificCalcs);

View File

@ -110,7 +110,7 @@ public class AFCalculationResult {
if ( log10pRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pRefByAllele has the wrong number of elements: log10pRefByAllele " + log10pRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping);
if ( ! allelesUsedInGenotyping.containsAll(log10pRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pRefByAllele doesn't contain all of the alleles used in genotyping: log10pRefByAllele " + log10pRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping);
if ( ! MathUtils.goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC));
if ( ! MathUtils.goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC));
if ( ! MathUtils.goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC));
this.alleleCountsOfMLE = alleleCountsOfMLE;
this.nEvaluations = nEvaluations;

View File

@ -238,7 +238,7 @@ public abstract class AFCalculator implements Cloneable {
* @param maximumAlternativeAlleleCount the maximum alternative allele count it must be able to handle. Has no effect if
* the current tracker is able to handle that number.
*
* @return never {@code null}
* @return {@code null} iff this calculator implementation does not use a state tracker.
*/
protected StateTracker getStateTracker(final boolean reset, final int maximumAlternativeAlleleCount) {
if (stateTracker == null)

View File

@ -0,0 +1,249 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 ("BROAD") and the LICENSEE and is effective at the date the downloading is completed ("EFFECTIVE DATE").
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system ("PHONE-HOME") which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE'S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2016 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.math3.util.MathArrays;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeAlleleCounts;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculator;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculators;
import org.broadinstitute.gatk.utils.Dirichlet;
import org.broadinstitute.gatk.utils.IndexRange;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.Utils;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
* @author David Benjamin &lt;davidben@broadinstitute.org&gt;
*/
/**
 * Allele-frequency calculator that fits a Dirichlet posterior on allele frequencies by
 * iterating expectation-maximization over the per-sample genotype likelihoods, rather than
 * using the older exact AF models.  All per-allele arithmetic is carried out in log10 space
 * for numerical stability.
 *
 * @author David Benjamin &lt;davidben@broadinstitute.org&gt;
 */
public final class AlleleFrequencyCalculator extends AFCalculator {

    private static final GenotypeLikelihoodCalculators GL_CALCS = new GenotypeLikelihoodCalculators();

    // EM iteration stops once no allele's effective count moves by more than this between iterations
    private static final double THRESHOLD_FOR_ALLELE_COUNT_CONVERGENCE = 0.1;

    // the hom-ref genotype is always index 0 in the canonical genotype-likelihoods ordering
    private static final int HOM_REF_GENOTYPE_INDEX = 0;

    // Dirichlet prior pseudocounts for the reference allele, alt SNP alleles, and alt indel alleles
    private final double refPseudocount;
    private final double snpPseudocount;
    private final double indelPseudocount;

    // ploidy assumed for any genotype that does not declare its own (i.e. reports ploidy 0)
    private final int defaultPloidy;

    // this doesn't use the exact model, so number of evaluations is irrelevant
    private static final int DUMMY_N_EVALUATIONS = 1;

    /**
     * @param refPseudocount   Dirichlet prior pseudocount for the reference allele
     * @param snpPseudocount   Dirichlet prior pseudocount for each alternate SNP allele
     * @param indelPseudocount Dirichlet prior pseudocount for each alternate indel allele
     * @param defaultPloidy    ploidy used for genotypes whose own ploidy is 0
     */
    public AlleleFrequencyCalculator(final double refPseudocount, final double snpPseudocount, final double indelPseudocount, final int defaultPloidy) {
        this.refPseudocount = refPseudocount;
        this.snpPseudocount = snpPseudocount;
        this.indelPseudocount = indelPseudocount;
        this.defaultPloidy = defaultPloidy;
    }

    /**
     * Convenience overload that uses this calculator's default ploidy.
     *
     * @param vc the VariantContext holding the alleles and sample information; must have at
     *           least one alternative allele
     * @return the segregation probabilities for the alleles in {@code vc}
     */
    public AFCalculationResult getLog10PNonRef(final VariantContext vc) {
        // maxAltAlleles is not used by getLog10PNonRef, so don't worry about the 0
        return getLog10PNonRef(vc, defaultPloidy, 0, null);
    }

    //TODO: this should be a class of static methods once the old AFCalculator is gone.
    /**
     * Compute the probability of the alleles segregating given the genotype likelihoods of the samples in vc
     *
     * @param vc the VariantContext holding the alleles and sample information.  The VariantContext
     *           must have at least 1 alternative allele
     * @param refSnpIndelPseudocounts a total hack.  A length-3 vector containing Dirichlet prior pseudocounts to
     *           be given to ref, alt SNP, and alt indel alleles.  Hack won't be necessary when we destroy the old AF calculators
     * @return result (for programming convenience)
     */
    @Override
    public AFCalculationResult getLog10PNonRef(final VariantContext vc, final int defaultPloidy, final int maximumAlternativeAlleles, final double[] refSnpIndelPseudocounts) {
        Utils.nonNull(vc, "vc is null");
        final int numAlleles = vc.getNAlleles();
        final List<Allele> alleles = vc.getAlleles();
        // BUGFIX: error message previously read "requires at least one at all"
        Utils.validateArg( numAlleles > 1, "VariantContext has only a single reference allele, but getLog10PNonRef requires at least one alt allele " + vc);

        // BUGFIX: an alt allele is a SNP (or MNP) iff its length equals the reference allele's length.
        // The previous test (a.length() > 1) gave length-1 alt alleles -- i.e. SNPs and deletions --
        // the indel pseudocount and gave insertions the SNP pseudocount, i.e. the priors were swapped.
        final int refLength = vc.getReference().length();
        final double[] priorPseudocounts = alleles.stream()
                .mapToDouble(a -> a.isReference() ? refPseudocount : (a.length() == refLength ? snpPseudocount : indelPseudocount)).toArray();

        double[] alleleCounts = new double[numAlleles];
        final double flatLog10AlleleFrequency = -MathUtils.Log10Cache.get(numAlleles); // log10(1/numAlleles)
        double[] log10AlleleFrequencies = new IndexRange(0, numAlleles).mapToDouble(n -> flatLog10AlleleFrequency);
        double alleleCountsMaximumDifference = Double.POSITIVE_INFINITY;

        // EM loop: alternate between computing expected allele counts under the current frequency
        // estimates and updating the frequencies from the Dirichlet posterior, until convergence
        while (alleleCountsMaximumDifference > THRESHOLD_FOR_ALLELE_COUNT_CONVERGENCE) {
            final double[] newAlleleCounts = effectiveAlleleCounts(vc, log10AlleleFrequencies);
            alleleCountsMaximumDifference = Arrays.stream(MathArrays.ebeSubtract(alleleCounts, newAlleleCounts)).map(Math::abs).max().getAsDouble();
            alleleCounts = newAlleleCounts;
            final double[] posteriorPseudocounts = MathArrays.ebeAdd(priorPseudocounts, alleleCounts);

            // first iteration uses flat prior in order to avoid local minimum where the prior + no pseudocounts gives such a low
            // effective allele frequency that it overwhelms the genotype likelihood of a real variant
            // basically, we want a chance to get non-zero pseudocounts before using a prior that's biased against a variant
            log10AlleleFrequencies = new Dirichlet(posteriorPseudocounts).log10MeanWeights();
        }

        double[] log10POfZeroCountsByAllele = new double[numAlleles];
        double log10PNoVariant = 0;

        for (final Genotype g : vc.getGenotypes()) {
            if (!g.hasLikelihoods()) {
                continue;
            }
            final int ploidy = g.getPloidy() == 0 ? defaultPloidy : g.getPloidy();
            final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, numAlleles);

            final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);

            //the total probability
            log10PNoVariant += log10GenotypePosteriors[HOM_REF_GENOTYPE_INDEX];

            // per allele non-log space probabilities of zero counts for this sample
            // for each allele calculate the total probability of genotypes containing at least one copy of the allele
            final double[] log10ProbabilityOfNonZeroAltAlleles = new double[numAlleles];
            Arrays.fill(log10ProbabilityOfNonZeroAltAlleles, Double.NEGATIVE_INFINITY);

            for (int genotype = 0; genotype < glCalc.genotypeCount(); genotype++) {
                final double log10GenotypePosterior = log10GenotypePosteriors[genotype];
                glCalc.genotypeAlleleCountsAt(genotype).forEachAlleleIndexAndCount((alleleIndex, count) ->
                        log10ProbabilityOfNonZeroAltAlleles[alleleIndex] =
                                MathUtils.log10SumLog10(log10ProbabilityOfNonZeroAltAlleles[alleleIndex], log10GenotypePosterior));
            }

            for (int allele = 0; allele < numAlleles; allele++) {
                // if prob of non hom ref == 1 up to numerical precision, short-circuit to avoid NaN
                if (log10ProbabilityOfNonZeroAltAlleles[allele] >= 0) {
                    log10POfZeroCountsByAllele[allele] = Double.NEGATIVE_INFINITY;
                } else {
                    log10POfZeroCountsByAllele[allele] += MathUtils.log10OneMinusPow10(log10ProbabilityOfNonZeroAltAlleles[allele]);
                }
            }
        }

        // unfortunately AFCalculationResult expects integers for the MLE.  We really should emit the EM no-integer values
        // which are valuable (eg in CombineGVCFs) as the sufficient statistics of the Dirichlet posterior on allele frequencies
        final int[] integerAlleleCounts = Arrays.stream(alleleCounts).mapToInt(x -> (int) Math.round(x)).toArray();
        final int[] integerAltAlleleCounts = Arrays.copyOfRange(integerAlleleCounts, 1, numAlleles);

        //skip the ref allele (index 0)
        final Map<Allele, Double> log10PRefByAllele = IntStream.range(1, numAlleles).boxed()
                .collect(Collectors.toMap(alleles::get, a -> log10POfZeroCountsByAllele[a]));

        // we compute posteriors here and don't have the same prior that AFCalculationResult expects.  Therefore, we
        // give it our posterior as its "likelihood" along with a flat dummy prior
        final double[] dummyFlatPrior = {-1e-10, -1e-10};   //TODO: HACK must be negative for AFCalcResult
        final double[] log10PosteriorOfNoVariantYesVariant = {log10PNoVariant, MathUtils.log10OneMinusPow10(log10PNoVariant)};

        return new AFCalculationResult(integerAltAlleleCounts, DUMMY_N_EVALUATIONS, alleles, log10PosteriorOfNoVariantYesVariant, dummyFlatPrior, log10PRefByAllele);
    }

    // effectiveAlleleCounts[allele a] = SUM_{genotypes g} (posterior_probability(g) * num_copies of a in g), which we denote as SUM [n_g p_g]
    // for numerical stability we will do this in log space:
    // count = SUM 10^(log (n_g p_g)) = SUM 10^(log n_g + log p_g)
    // thanks to the log-sum-exp trick this lets us work with log posteriors alone
    private double[] effectiveAlleleCounts(final VariantContext vc, final double[] log10AlleleFrequencies) {
        final int numAlleles = vc.getNAlleles();
        Utils.validateArg(numAlleles == log10AlleleFrequencies.length, "number of alleles inconsistent");
        final double[] log10Result = new double[numAlleles];
        Arrays.fill(log10Result, Double.NEGATIVE_INFINITY);
        for (final Genotype g : vc.getGenotypes()) {
            if (!g.hasLikelihoods()) {
                continue;
            }
            final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(g.getPloidy(), numAlleles);

            final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);

            new IndexRange(0, glCalc.genotypeCount()).forEach(genotypeIndex ->
                glCalc.genotypeAlleleCountsAt(genotypeIndex).forEachAlleleIndexAndCount((alleleIndex, count) ->
                        log10Result[alleleIndex] = MathUtils.log10SumLog10(log10Result[alleleIndex], log10GenotypePosteriors[genotypeIndex] + MathUtils.Log10Cache.get(count))));
        }
        return MathUtils.applyToArrayInPlace(log10Result, x -> Math.pow(10.0, x));
    }

    // log10-normalized genotype posteriors for one sample: likelihood * multinomial combination count
    // * product of allele frequencies over the alleles in the genotype, all in log10 space
    private static double[] log10NormalizedGenotypePosteriors(final Genotype g, final GenotypeLikelihoodCalculator glCalc, final double[] log10AlleleFrequencies) {
        final double[] log10Likelihoods = g.getLikelihoods().getAsVector();
        final double[] log10Posteriors = new IndexRange(0, glCalc.genotypeCount()).mapToDouble(genotypeIndex -> {
            final GenotypeAlleleCounts gac = glCalc.genotypeAlleleCountsAt(genotypeIndex);
            return gac.log10CombinationCount() + log10Likelihoods[genotypeIndex]
                    + gac.sumOverAlleleIndicesAndCounts((index, count) -> count * log10AlleleFrequencies[index]);
        });
        return MathUtils.normalizeFromLog10(log10Posteriors, true);
    }

    @Override   //Note: unused
    protected AFCalculationResult getResultFromFinalState(final VariantContext vc, final double[] priors, final StateTracker st) { return null; }

    @Override//Note: unused
    protected AFCalculationResult computeLog10PNonRef(final VariantContext vc, final int defaultPloidy,
                                                      final double[] priors, final StateTracker st) { return null; }

    @Override   //Note: unused
    protected StateTracker getStateTracker(final boolean reset, final int maximumAlternativeAlleleCount) { return null; }

    @Override   //trivial implementation -- new AFCalculator can handle multiallelics so we're not afraid
    protected VariantContext reduceScope(final VariantContext vc, final int defaultPloidy, final int maximumAlternativeAlleles) {
        return vc;
    }

    @Override   //also trivial
    public GenotypesContext subsetAlleles(final VariantContext vc,
                                          final int defaultPloidy,
                                          final List<Allele> allelesToUse,
                                          final boolean assignGenotypes) {
        return vc.getGenotypes();
    }
}

View File

@ -110,7 +110,7 @@ public abstract class DiploidExactAFCalculator extends ExactAFCalculator {
@Override
protected GenotypesContext reduceScopeGenotypes(final VariantContext vc, final int defaultPloidy, final List<Allele> allelesToUse) {
return GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL);
return GATKVariantContextUtils.subsetAlleles(vc, allelesToUse, GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL);
}
@Override
@ -346,8 +346,8 @@ public abstract class DiploidExactAFCalculator extends ExactAFCalculator {
if (defaultPloidy != 2)
throw new IllegalArgumentException("cannot support ploidy different than 2 and the default ploidy is " + defaultPloidy);
return allelesToUse.size() == 1
? GATKVariantContextUtils.subsetToRefOnly(vc, 2)
: GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse,
? GATKVariantContextUtils.subsetToRefOnly(vc, defaultPloidy)
: GATKVariantContextUtils.subsetAlleles(vc, allelesToUse,
assignGenotypes ? GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN : GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL);
}
}

View File

@ -194,7 +194,9 @@ import java.util.*;
}
gb.alleles(newAlleles);
}
if (combineAltAlleleLikelihoods(oldGenotype, genotypeCount, newLikelihoods, hetLikelihoods, homAltLikelihoods))
if (oldGenotype.isNonInformative())
gb.PL(BIALLELIC_NON_INFORMATIVE_PLS);
else if (combineAltAlleleLikelihoods(oldGenotype, genotypeCount, newLikelihoods, hetLikelihoods, homAltLikelihoods))
gb.PL(newLikelihoods);
newGenotypes.add(gb.make());
}

View File

@ -53,6 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFConstants;
@ -210,7 +211,6 @@ import java.util.*;
* -I sample1.bam [-I sample2.bam ...] \
* [--dbsnp dbSNP.vcf] \
* [-stand_call_conf 30] \
* [-stand_emit_conf 10] \
* [-L targets.interval_list] \
* -o output.raw.snps.indels.vcf
* </pre>
@ -223,7 +223,6 @@ import java.util.*;
* -I sample1.bam \
* [--dbsnp dbSNP.vcf] \
* -stand_call_conf 20 \
* -stand_emit_conf 20 \
* -o output.raw.snps.indels.vcf
* </pre>
*
@ -365,12 +364,17 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
* sites are compressed into bands of similar genotype quality (GQ) that are emitted as a single VCF record. See
* the FAQ documentation for more details about the GVCF format.
*
* This argument allows you to set the GQ boundaries. HC expects a list of multiple GQ threshold values. To pass
* multiple values, you provide them one by one with the argument, as in `-GQB 10 -GQB 20 -GQB 30` and so on. Note
* that GQ values are capped at 99 in the GATK.
* This argument allows you to set the GQ bands. HC expects a list of strictly increasing GQ values
* that will act as exclusive upper bounds for the GQ bands. To pass multiple values,
* you provide them one by one with the argument, as in `-GQB 10 -GQB 20 -GQB 30` and so on
* (this would set the GQ bands to be `[0, 10), [10, 20), [20, 30)` and so on, for example).
* Note that GQ values are capped at 99 in the GATK, so values must be integers in [1, 100].
* If the last value is strictly less than 100, the last GQ band will start at that value (inclusive)
* and end at 100 (exclusive).
*/
@Advanced
@Argument(fullName="GVCFGQBands", shortName="GQB", doc="GQ thresholds for reference confidence bands", required = false)
@Argument(fullName="GVCFGQBands", shortName="GQB", doc="Exclusive upper bounds for reference confidence GQ bands " +
"(must be in [1, 100] and specified in increasing order)", required = false)
protected List<Integer> GVCFGQBands = new ArrayList<Integer>(70) {{
for (int i=1; i<=60; ++i) add(i);
add(70); add(80); add(90); add(99);
@ -487,7 +491,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
private HaplotypeCallerGenotypingEngine genotypingEngine = null;
// fasta reference reader to supplement the edges of the reference sequence
protected CachingIndexedFastaSequenceFile referenceReader;
protected ReferenceSequenceFile referenceReader;
// reference base padding size
private static final int REFERENCE_PADDING = 500;
@ -584,7 +588,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
if (HCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES)
throw new UserException.BadArgumentValue("ERC/gt_mode","you cannot request reference confidence output and GENOTYPE_GIVEN_ALLELES at the same time");
HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING = -0.0;
HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING = -0.0;
// also, we don't need to output several of the annotations
@ -626,7 +629,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
simpleUAC.outputMode = OutputMode.EMIT_VARIANTS_ONLY;
simpleUAC.genotypingOutputMode = GenotypingOutputMode.DISCOVERY;
simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING = Math.min(MAXMIN_CONFIDENCE_FOR_CONSIDERING_A_SITE_AS_POSSIBLE_VARIANT_IN_ACTIVE_REGION_DISCOVERY, HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling
simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min(MAXMIN_CONFIDENCE_FOR_CONSIDERING_A_SITE_AS_POSSIBLE_VARIANT_IN_ACTIVE_REGION_DISCOVERY, HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling
simpleUAC.CONTAMINATION_FRACTION = 0.0;
simpleUAC.CONTAMINATION_FRACTION_FILE = null;
simpleUAC.exactCallsLog = null;
@ -683,12 +685,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
vcfWriter.writeHeader(new VCFHeader(headerInfo, sampleSet));
try {
// fasta reference reader to supplement the edges of the reference sequence
referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
} catch( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e);
}
// fasta reference reader to supplement the edges of the reference sequence
referenceReader = CachingIndexedFastaSequenceFile.checkAndCreate(getToolkit().getArguments().referenceFile);
// create and setup the assembler
assemblyEngine = new ReadThreadingAssembler(RTAC.maxNumHaplotypesInPopulation, RTAC.kmerSizes, RTAC.dontIncreaseKmerSizesForCycles, RTAC.allowNonUniqueKmersInRef, RTAC.numPruningSamples);
@ -754,8 +752,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
try {
vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands, HCAC.genotypeArgs.samplePloidy);
} catch ( IllegalArgumentException e ) {
throw new UserException.BadArgumentValue("GQBands", "are malformed: " + e.getMessage());
} catch ( final IllegalArgumentException e ) {
throw new UserException.BadArgumentValue("GVCFGQBands", e.getMessage());
}
}
}

View File

@ -54,7 +54,6 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import com.google.common.annotations.VisibleForTesting;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import htsjdk.samtools.util.StringUtil;
import htsjdk.variant.variantcontext.*;
import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection;
import org.broadinstitute.gatk.utils.*;
@ -73,6 +72,7 @@ import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import java.util.*;
import java.util.stream.Collectors;
/**
* {@link HaplotypeCaller}'s genotyping strategy implementation.
@ -82,7 +82,10 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
protected static final int ALLELE_EXTENSION = 2;
private static final String phase01 = "0|1";
private static final String phase10 = "1|0";
private static final int MAX_DROPPED_ALTERNATIVE_ALLELES_TO_LOG = 20;
private static final int MAX_DROPPED_ALTERNATIVE_ALLELES_LOG_STRING_LENGTH = 500;
private final int maxGenotypeCountToEnumerate;
private final Map<Integer, Integer> practicalAlleleCountForPloidy = new HashMap<>();
private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger;
@ -99,13 +102,18 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
* @param genomeLocParser {@inheritDoc}
* @param doPhysicalPhasing whether to try physical phasing.
*/
public HaplotypeCallerGenotypingEngine(final AssemblyBasedCallerArgumentCollection configuration, final SampleList samples, final GenomeLocParser genomeLocParser, final AFCalculatorProvider afCalculatorProvider, final boolean doPhysicalPhasing) {
public HaplotypeCallerGenotypingEngine(final AssemblyBasedCallerArgumentCollection configuration,
final SampleList samples,
final GenomeLocParser genomeLocParser,
final AFCalculatorProvider afCalculatorProvider,
final boolean doPhysicalPhasing) {
super(configuration,samples,genomeLocParser,afCalculatorProvider);
if (genomeLocParser == null)
throw new IllegalArgumentException("the genome location parser provided cannot be null");
this.doPhysicalPhasing= doPhysicalPhasing;
ploidyModel = new HomogeneousPloidyModel(samples,configuration.genotypeArgs.samplePloidy);
genotypingModel = new InfiniteRandomMatingPopulationModel();
maxGenotypeCountToEnumerate = configuration.genotypeArgs.MAX_GENOTYPE_COUNT;
}
/**
@ -135,7 +143,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
}
/**
* Carries the result of a call to #assignGenotypeLikelihoods
* Carries the result of a call to {@link #assignGenotypeLikelihoods(List, ReadLikelihoods, Map, byte[], GenomeLoc, GenomeLoc, GenomeLocParser, RefMetaDataTracker, List, boolean)}
*/
public static class CalledHaplotypes {
private final List<VariantContext> calls;
@ -189,16 +197,16 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
@Ensures("result != null")
// TODO - can this be refactored? this is hard to follow!
CalledHaplotypes assignGenotypeLikelihoods( final List<Haplotype> haplotypes,
final ReadLikelihoods<Haplotype> readLikelihoods,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final byte[] ref,
final GenomeLoc refLoc,
final GenomeLoc activeRegionWindow,
final GenomeLocParser genomeLocParser,
final RefMetaDataTracker tracker,
final List<VariantContext> activeAllelesToGenotype,
final boolean emitReferenceConfidence) {
CalledHaplotypes assignGenotypeLikelihoods(final List<Haplotype> haplotypes,
final ReadLikelihoods<Haplotype> readLikelihoods,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final byte[] ref,
final GenomeLoc refLoc,
final GenomeLoc activeRegionWindow,
final GenomeLocParser genomeLocParser,
final RefMetaDataTracker tracker,
final List<VariantContext> activeAllelesToGenotype,
final boolean emitReferenceConfidence) {
// sanity check input arguments
if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes);
if (readLikelihoods == null || readLikelihoods.sampleCount() == 0) throw new IllegalArgumentException("readLikelihoods input should be non-empty and non-null, got "+readLikelihoods);
@ -232,15 +240,17 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
// Merge the event to find a common reference representation
VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList,
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc,
priorityList,
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE,
false, false, null, false, false);
if( mergedVC == null )
continue;
final GenotypeLikelihoodsCalculationModel.Model calculationModel = mergedVC.isSNP()
? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL;
final GenotypeLikelihoodsCalculationModel.Model calculationModel = mergedVC.isSNP() ? GenotypeLikelihoodsCalculationModel.Model.SNP
: GenotypeLikelihoodsCalculationModel.Model.INDEL;
final Map<VariantContext, Allele> mergeMap = new LinkedHashMap<>();
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
@ -248,13 +258,17 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function
}
final Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
if( configuration.DEBUG && logger != null ) {
if (logger != null) logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
}
final ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION));
final Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
mergedVC = removeAltAllelesIfTooManyGenotypes(ploidy, alleleMapper, mergedVC);
final ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper,
genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC),
ALLELE_EXTENSION));
if (configuration.isSampleContaminationPresent())
readAlleleLikelihoods.contaminationDownsampling(configuration.getSampleContamination());
@ -269,10 +283,23 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
readAlleleLikelihoods.addNonReferenceAllele(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
}
final GenotypesContext genotypes = calculateGLsForThisEvent(readAlleleLikelihoods, noCallAlleles );
final GenotypesContext genotypes = calculateGLsForThisEvent(readAlleleLikelihoods, noCallAlleles);
final VariantContext call = calculateGenotypes(new VariantContextBuilder(mergedVC).alleles(readAlleleLikelihoods.alleles()).genotypes(genotypes).make(), calculationModel);
if ( call != null ) {
final VariantContext annotatedCall = annotateCall(readLikelihoods, perSampleFilteredReadList, ref, refLoc, genomeLocParser, tracker, emitReferenceConfidence, calledHaplotypes, mergedVC, alleleMapper, readAlleleLikelihoods, someAllelesWereDropped, call);
final VariantContext annotatedCall = annotateCall(readLikelihoods,
perSampleFilteredReadList,
ref,
refLoc,
genomeLocParser,
tracker,
emitReferenceConfidence,
calledHaplotypes,
mergedVC,
alleleMapper,
readAlleleLikelihoods,
someAllelesWereDropped,
call);
returnCalls.add( annotatedCall );
}
}
@ -300,7 +327,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
final boolean someAlternativeAllelesWereDropped = call.getAlleles().size() != initialAlleleNumber;
VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(referenceContext, tracker,readAlleleLikelihoodsForAnnotation, call, emitReferenceConfidence);
if (someAlternativeAllelesWereDropped || someAlternativeAllelesWereAlreadyDropped)
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
// maintain the set of all called haplotypes
for ( final Allele calledAllele : call.getAlleles() ) {
@ -312,6 +339,138 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
return !emitReferenceConfidence ? clearUnconfidentGenotypeCalls(annotatedCall) : annotatedCall;
}
/**
 * If the number of alleles is so high that enumerating all possible genotypes is impractical, as determined by
 * {@link #maxGenotypeCountToEnumerate}, remove alt alleles from the input {@code alleleMapper} that are
 * not well supported by good-scored haplotypes.
 * Otherwise do nothing.
 *
 * Alleles kept are guaranteed to have higher precedence than those removed, where precedence is determined by
 * {@link AlleleScoredByHaplotypeScores}.
 *
 * After the remove operation, entries in the map are guaranteed to have the same relative order as they were in the input map,
 * that is, entries will only be removed but not shifted relative to each other.
 * @param ploidy ploidy of the sample
 * @param alleleMapper original allele to haplotype map; NOTE: trimmed in place when alleles are dropped
 * @param mergedVC the variant context whose alt alleles may be reduced
 * @return {@code mergedVC} unchanged when the allele count is practical, otherwise a copy with the excess alt alleles removed
 */
private VariantContext removeAltAllelesIfTooManyGenotypes(final int ploidy, final Map<Allele, List<Haplotype>> alleleMapper, final VariantContext mergedVC) {
final int originalAlleleCount = alleleMapper.size();
// cache the per-ploidy practical allele count so the computation runs at most once per ploidy
practicalAlleleCountForPloidy.putIfAbsent(ploidy, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(ploidy, maxGenotypeCountToEnumerate));
final int practicalAlleleCount = practicalAlleleCountForPloidy.get(ploidy);
if (originalAlleleCount > practicalAlleleCount) {
final List<Allele> allelesToKeep = whichAllelesToKeepBasedonHapScores(alleleMapper, practicalAlleleCount);
// trim the haplotype map in place so downstream marginalization only sees the kept alleles
alleleMapper.keySet().retainAll(allelesToKeep);
logger.warn(String.format("Removed alt alleles where ploidy is %d and original allele count is %d, whereas after trimming the allele count becomes %d. Alleles kept are:%s",
ploidy, originalAlleleCount, practicalAlleleCount, allelesToKeep));
return removeExcessAltAllelesFromVC(mergedVC, allelesToKeep);
} else {
return mergedVC;
}
}
/**
 * Picks the subset of alleles from {@code alleleMapper}'s key set that should survive trimming.
 * The returned list has size min({@code desiredNumOfAlleles}, alleleMapper.size()).
 *
 * Alleles kept are guaranteed to have higher precedence than those removed, where precedence is determined by
 * {@link AlleleScoredByHaplotypeScores} (reference first, then by best/second-best supporting-haplotype score).
 *
 * Entries in the returned list keep the same relative order they had in the input map.
 *
 * @param alleleMapper original allele to haplotype map
 * @param desiredNumOfAlleles desired allele count, including ref allele
 */
@VisibleForTesting
static List<Allele> whichAllelesToKeepBasedonHapScores(final Map<Allele, List<Haplotype>> alleleMapper,
                                                       final int desiredNumOfAlleles) {
    // nothing to trim — hand back every allele in map order
    if (alleleMapper.size() <= desiredNumOfAlleles) {
        return new ArrayList<>(alleleMapper.keySet());
    }
    // rank each allele by its best (and, for tie-breaking, second-best) supporting haplotype score
    final PriorityQueue<AlleleScoredByHaplotypeScores> rankedAlleles = new PriorityQueue<>();
    for (final Map.Entry<Allele, List<Haplotype>> entry : alleleMapper.entrySet()) {
        final List<Double> sortedScores = entry.getValue().stream()
                .map(Haplotype::getScore)
                .sorted()
                .collect(Collectors.toList());
        final Double best = sortedScores.get(sortedScores.size() - 1);
        final Double secondBest = sortedScores.size() > 1
                ? sortedScores.get(sortedScores.size() - 2)
                : Double.NEGATIVE_INFINITY;
        rankedAlleles.add(new AlleleScoredByHaplotypeScores(entry.getKey(), best, secondBest));
    }
    // take the top-ranked alleles, then emit them in the input map's original key order
    final Set<Allele> keep = new LinkedHashSet<>();
    while (keep.size() < desiredNumOfAlleles) {
        keep.add(rankedAlleles.poll().getAllele());
    }
    return alleleMapper.keySet().stream().filter(keep::contains).collect(Collectors.toList());
}
/**
 * A utility class that provides ordering information for an allele, given the best and second best scores of the
 * haplotypes supporting it. Reference alleles always order before alternative alleles. Among alleles of the same
 * kind, the one with the higher best haplotype score comes first; ties on the best score are broken by the second
 * best haplotype score. An allele supported by only one haplotype has its second best score set to
 * {@link Double#NEGATIVE_INFINITY}. If both the best and second best scores are equal, the order falls back to
 * {@link Allele#compareTo(Allele)}.
 *
 * Note: a smaller {@code compareTo} result means higher precedence, so a min-{@link PriorityQueue} polls the
 * highest-precedence allele first.
 */
private static final class AlleleScoredByHaplotypeScores implements Comparable<AlleleScoredByHaplotypeScores>{
// the allele being ranked
private final Allele allele;
// best score among the haplotypes supporting this allele
private final Double bestHaplotypeScore;
// second best supporting-haplotype score, or Double.NEGATIVE_INFINITY when only one haplotype supports the allele
private final Double secondBestHaplotypeScore;
public AlleleScoredByHaplotypeScores(final Allele allele, final Double bestHaplotypeScore, final Double secondBestHaplotypeScore){
this.allele = allele;
this.bestHaplotypeScore = bestHaplotypeScore;
this.secondBestHaplotypeScore = secondBestHaplotypeScore;
}
@Override
public int compareTo(final AlleleScoredByHaplotypeScores other) {
// reference alleles always take precedence over alt alleles, regardless of score
if(allele.isReference() && other.allele.isNonReference()){
return -1;
} else if(allele.isNonReference() && other.allele.isReference()){
return 1;
// higher best haplotype score => higher precedence (the `>`/`<` auto-unbox the Doubles)
} else if(bestHaplotypeScore > other.bestHaplotypeScore) {
return -1;
} else if (bestHaplotypeScore < other.bestHaplotypeScore) {
return 1;
// best scores tie: break the tie with the second best score
} else if (!secondBestHaplotypeScore.equals(other.secondBestHaplotypeScore)) {
return secondBestHaplotypeScore > other.secondBestHaplotypeScore ? -1 : 1;
} else {
// full score tie: fall back to the allele's natural ordering for a deterministic result
return allele.compareTo(other.allele);
}
}
public Allele getAllele(){
return allele;
}
}
/**
 * Returns a VC that is similar to {@code inputVC} in every aspect except that alleles not in {@code allelesToKeep}
 * are removed in the returned VC. The kept alleles retain the relative order they had in {@code inputVC}.
 * @param inputVC the variant context to trim
 * @param allelesToKeep the alleles (including the reference allele) that must survive in the returned VC
 * @return {@code inputVC} itself when nothing needs removing, otherwise a trimmed copy
 * @throws IllegalArgumentException if 1) {@code allelesToKeep} is null or contains null elements; or
 *                                     2) {@code allelesToKeep} doesn't contain a reference allele; or
 *                                     3) {@code allelesToKeep} is not a subset of {@code inputVC.getAlleles()}
 */
@VisibleForTesting
static VariantContext removeExcessAltAllelesFromVC(final VariantContext inputVC, final Collection<Allele> allelesToKeep){
    Utils.validateArg(allelesToKeep!=null, "alleles to keep is null");
    Utils.validateArg(!allelesToKeep.contains(null), "alleles to keep contains null elements");
    Utils.validateArg(allelesToKeep.stream().anyMatch(Allele::isReference), "alleles to keep doesn't contain reference allele!");
    Utils.validateArg(inputVC.getAlleles().containsAll(allelesToKeep), "alleles to keep is not a subset of input VC alleles");
    if(inputVC.getAlleles().size() == allelesToKeep.size()) return inputVC;

    final VariantContextBuilder vcb = new VariantContextBuilder(inputVC);
    // BUG FIX: VariantContext.getAlleles() returns an unmodifiable list, so calling retainAll() on it
    // (as the previous implementation did) throws UnsupportedOperationException the moment trimming is
    // actually needed. Filter into a fresh list instead; iterating the input VC's allele list preserves
    // the original allele order.
    final List<Allele> reducedAlleles = inputVC.getAlleles().stream()
            .filter(allelesToKeep::contains)
            .collect(Collectors.toList());
    vcb.alleles(reducedAlleles);
    return vcb.make();
}
/**
* Reduce the number alternative alleles in a read-likelihoods collection to the maximum-alt-allele user parameter value.
* <p>
@ -324,20 +483,29 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
private void reduceNumberOfAlternativeAllelesBasedOnLikelihoods(final ReadLikelihoods<Allele> readAlleleLikelihoods, final GenomeLoc location) {
final GenotypingLikelihoods<Allele> genotypeLikelihoods = genotypingModel.calculateLikelihoods(readAlleleLikelihoods, new GenotypingData<>(ploidyModel,readAlleleLikelihoods));
final Set<Allele> allelesToDrop = excessAlternativeAlleles(genotypeLikelihoods, configuration.genotypeArgs.MAX_ALTERNATE_ALLELES);
final String allelesToDropString;
if (allelesToDrop.size() < MAX_DROPPED_ALTERNATIVE_ALLELES_TO_LOG) {
allelesToDropString = StringUtil.join(", ", allelesToDrop);
} else {
final Iterator<Allele> it = allelesToDrop.iterator();
final StringBuilder builder = new StringBuilder();
for (int i = 0; i < MAX_DROPPED_ALTERNATIVE_ALLELES_TO_LOG; i++) {
builder.append(it.next().toString()).append(", ");
if (allelesToDrop.isEmpty()) return;
int allelesInMessage = 0;
final StringBuilder droppedAlleleStringBuilder = new StringBuilder(MAX_DROPPED_ALTERNATIVE_ALLELES_LOG_STRING_LENGTH << 1);
for (final Allele allele : allelesToDrop) {
allelesInMessage++;
droppedAlleleStringBuilder.append(allele.toString()).append(", ");
if (droppedAlleleStringBuilder.length() > MAX_DROPPED_ALTERNATIVE_ALLELES_LOG_STRING_LENGTH - 2) {
break;
}
allelesToDropString = builder.append(it.next().toString()).append(" and ").append(allelesToDrop.size() - 20).append(" more").toString();
}
droppedAlleleStringBuilder.setLength(droppedAlleleStringBuilder.length() - 2); // remove the last ", "
if (droppedAlleleStringBuilder.length() > MAX_DROPPED_ALTERNATIVE_ALLELES_LOG_STRING_LENGTH) {
droppedAlleleStringBuilder.setLength(MAX_DROPPED_ALTERNATIVE_ALLELES_LOG_STRING_LENGTH);
droppedAlleleStringBuilder.append("...");
}
if (allelesInMessage < allelesToDrop.size()) {
droppedAlleleStringBuilder.append(" and ").append(allelesToDrop.size() - allelesInMessage).append(" more");
}
logger.warn(String.format("location %s: too many alternative alleles found (%d) larger than the maximum requested with -%s (%d), the following will be dropped: %s.", location,
readAlleleLikelihoods.alleleCount() - 1, GenotypeCalculationArgumentCollection.MAX_ALTERNATE_ALLELES_SHORT_NAME, configuration.genotypeArgs.MAX_ALTERNATE_ALLELES,
allelesToDropString));
droppedAlleleStringBuilder.toString()));
readAlleleLikelihoods.dropAlleles(allelesToDrop);
}
@ -361,7 +529,8 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
* @param maxAlternativeAlleles maximum number of alternative alleles allowed.
* @return never {@code null}.
*/
private Set<Allele> excessAlternativeAlleles(final GenotypingLikelihoods<Allele> genotypeLikelihoods, final int maxAlternativeAlleles) {
@VisibleForTesting
static Set<Allele> excessAlternativeAlleles(final GenotypingLikelihoods<Allele> genotypeLikelihoods, final int maxAlternativeAlleles) {
final int alleleCount = genotypeLikelihoods.alleleCount();
final int excessAlternativeAlleleCount = Math.max(0, alleleCount - 1 - maxAlternativeAlleles);
if (excessAlternativeAlleleCount <= 0) {
@ -413,8 +582,11 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
}
});
for (int i = 1; i < alleleCount; i++) {
lessFrequentFirst.add(genotypeLikelihoods.alleleAt(i));
for (int i = 0; i < alleleCount; i++) {
final Allele a = genotypeLikelihoods.alleleAt(i);
if(a.isNonReference()){
lessFrequentFirst.add(genotypeLikelihoods.alleleAt(i));
}
}
final Set<Allele> result = new HashSet<>(excessAlternativeAlleleCount);
@ -453,7 +625,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
*/
@VisibleForTesting
static Map<VariantContext, Set<Haplotype>> constructHaplotypeMapping(final List<VariantContext> originalCalls,
final Set<Haplotype> calledHaplotypes) {
final Set<Haplotype> calledHaplotypes) {
final Map<VariantContext, Set<Haplotype>> haplotypeMap = new HashMap<>(originalCalls.size());
for ( final VariantContext call : originalCalls ) {
// don't try to phase if there is not exactly 1 alternate allele
@ -665,14 +837,13 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
// Builds the read-likelihoods collection to use for annotation considering user arguments and the collection
// used for genotyping.
protected ReadLikelihoods<Allele> prepareReadAlleleLikelihoodsForAnnotation(
final ReadLikelihoods<Haplotype> readHaplotypeLikelihoods,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final GenomeLocParser genomeLocParser,
final boolean emitReferenceConfidence,
final Map<Allele, List<Haplotype>> alleleMapper,
final ReadLikelihoods<Allele> readAlleleLikelihoodsForGenotyping,
final VariantContext call) {
protected ReadLikelihoods<Allele> prepareReadAlleleLikelihoodsForAnnotation(final ReadLikelihoods<Haplotype> readHaplotypeLikelihoods,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final GenomeLocParser genomeLocParser,
final boolean emitReferenceConfidence,
final Map<Allele, List<Haplotype>> alleleMapper,
final ReadLikelihoods<Allele> readAlleleLikelihoodsForGenotyping,
final VariantContext call) {
final ReadLikelihoods<Allele> readAlleleLikelihoodsForAnnotations;
final GenomeLoc loc = genomeLocParser.createGenomeLoc(call);
@ -735,10 +906,10 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
* @return never {@code null} but perhaps an empty list if there is no variants to report.
*/
protected TreeSet<Integer> decomposeHaplotypesIntoVariantContexts(final List<Haplotype> haplotypes,
final ReadLikelihoods readLikelihoods,
final byte[] ref,
final GenomeLoc refLoc,
final List<VariantContext> activeAllelesToGenotype) {
final ReadLikelihoods readLikelihoods,
final byte[] ref,
final GenomeLoc refLoc,
final List<VariantContext> activeAllelesToGenotype) {
final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty();
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
@ -773,8 +944,8 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
}
protected List<VariantContext> getVCsAtThisLocation(final List<Haplotype> haplotypes,
final int loc,
final List<VariantContext> activeAllelesToGenotype) {
final int loc,
final List<VariantContext> activeAllelesToGenotype) {
// the overlapping events to merge into a common reference view
final List<VariantContext> eventsAtThisLoc = new ArrayList<>();

View File

@ -77,8 +77,7 @@ import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.VariantContextUtils;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory;
import htsjdk.variant.variantcontext.writer.SortingVariantContextWriter;
import java.util.*;
/**
@ -186,7 +185,7 @@ public class HaplotypeResolver extends RodWalker<Integer, Integer> {
headerLines.add(new VCFInfoHeaderLine(STATUS_KEY, 1, VCFHeaderLineType.String, "Extent to which records match"));
final VCFHeader vcfHeader = new VCFHeader(headerLines, Collections.<String>emptySet());
baseWriter.writeHeader(vcfHeader);
writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, ACTIVE_WINDOW);
writer = new SortingVariantContextWriter(baseWriter, ACTIVE_WINDOW);
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {

View File

@ -63,7 +63,9 @@ import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.QualityUtils;
import org.broadinstitute.gatk.utils.activeregion.ActiveRegion;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.genotyper.*;
import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.genotyper.SampleList;
import org.broadinstitute.gatk.utils.genotyper.SampleListUtils;
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
@ -250,8 +252,9 @@ public class ReferenceConfidenceModel {
// as our GLs for the site.
final GenotypeLikelihoods leastConfidenceGLs = getGLwithWorstGQ(indelGLs, snpGLs);
gb.GQ((int) (-10 * leastConfidenceGLs.getLog10GQ(GenotypeType.HOM_REF)));
gb.PL(leastConfidenceGLs.getAsPLs());
final int[] leastConfidenceGLsAsPLs = leastConfidenceGLs.getAsPLs();
gb.GQ(GATKVariantContextUtils.calculateGQFromPLs(leastConfidenceGLsAsPLs));
gb.PL(leastConfidenceGLsAsPLs);
//gb.attribute(INDEL_INFORMATIVE_DEPTH, nIndelInformativeReads);
vcb.genotypes(gb.make());

View File

@ -140,7 +140,7 @@ import java.util.*;
* </pre>
*
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} )
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMap<Byte,Integer>> {
@ArgumentCollection

View File

@ -81,7 +81,7 @@ import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory;
import htsjdk.variant.variantcontext.writer.SortingVariantContextWriter;
import java.io.*;
import java.util.*;
@ -124,7 +124,7 @@ import static org.broadinstitute.gatk.engine.GATKVCFUtils.getVCFHeadersFromRods;
* reverse order between these two genotypes.</p>
* <p>In an old notation that RBP used to output in much earlier versions, the genotypes would have been: 0/1 and 1|0,
* respectively. This was changed because depending on the case it caused ambiguity, incompleteness, and possible
* inconsistency with trio-based phasing. In contrast, the HP tag is much more explicitl for linking alleles, especially
* inconsistency with trio-based phasing. In contrast, the HP tag is much more explicit for linking alleles, especially
* if the genotypes are non-consecutive.</p>
*
* <h3>Usage example</h3>
@ -257,7 +257,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
But, NOTE that map() is careful to pass out a list of records to be written that FIRST includes any records discarded due to having reached mostDownstreamLocusReached,
and only THEN records located at mostDownstreamLocusReached. The opposite order in map() would violate the startDistance limits imposed when contracting SortingVCFWriter with (2 * cacheWindow).
*/
writer = VariantContextWriterFactory.sortOnTheFly(writer, 2 * cacheWindow, writer != origWriter);
writer = new SortingVariantContextWriter(writer, 2 * cacheWindow, writer != origWriter);
// setup the header fields:
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();

View File

@ -108,7 +108,8 @@ import java.util.ArrayList;
* -I input.bam \
* -o output.bam \
* -U ALLOW_N_CIGARS
*
* </pre>
*
* <h3>Note</h3>
* <p>When this tool is used as part of the RNAseq best practices, the command should include mapping quality
* reassignment. See the Best Practices documentation for details.</p>

View File

@ -165,7 +165,7 @@ public class FamilyLikelihoodsUtils {
//final double[] log10Posteriors = MathUtils.toLog10(normalizedPosteriors);
//update genotype types based on posteriors
GATKVariantContextUtils.updateGenotypeAfterSubsetting(vc.getAlleles(), builder,
GATKVariantContextUtils.updateGenotypeAfterSubsetting(vc.getAlleles(), genotype.getPloidy(), builder,
GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, log10Posteriors, vc.getAlleles());
builder.attribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY,

View File

@ -64,9 +64,12 @@ import org.broadinstitute.gatk.engine.walkers.Reference;
import org.broadinstitute.gatk.engine.walkers.RodWalker;
import org.broadinstitute.gatk.engine.walkers.TreeReducible;
import org.broadinstitute.gatk.engine.walkers.Window;
import org.broadinstitute.gatk.tools.walkers.annotator.RankSumTest;
import org.broadinstitute.gatk.tools.walkers.annotator.RMSAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AS_StandardAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.gatk.tools.walkers.genotyper.OutputMode;
import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedArgumentCollection;
@ -133,6 +136,9 @@ import java.util.*;
@Reference(window=@Window(start=-10,stop=10))
@SuppressWarnings("unused")
public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWriter> implements AnnotatorCompatible, TreeReducible<VariantContextWriter> {
private static String GVCF_BLOCK = "GVCFBlock";
/**
* The gVCF files to merge together
*/
@ -154,7 +160,7 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
@Argument(fullName="uniquifySamples", shortName="uniquifySamples", doc="Assume duplicate samples are present and uniquify all names with '.variant' and file number index")
public boolean uniquifySamples = false;
@ArgumentCollection
@ArgumentCollection
public GenotypeCalculationArgumentCollection genotypeArgs = new GenotypeCalculationArgumentCollection();
/**
@ -185,12 +191,16 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
private UnifiedGenotypingEngine genotypingEngine;
// the annotation engine
private VariantAnnotatorEngine annotationEngine;
// the INFO field annotation key names to remove
private final List<String> infoFieldAnnotationKeyNamesToRemove = new ArrayList<>();
public List<RodBinding<VariantContext>> getCompRodBindings() { return Collections.emptyList(); }
public RodBinding<VariantContext> getSnpEffRodBinding() { return null; }
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
public boolean alwaysAppendDbsnpId() { return false; }
// INFO Header names that require alt alleles
final Set<String> infoHeaderAltAllelesLineNames = new LinkedHashSet<>();
public void initialize() {
boolean inputsAreTagged = false;
@ -218,6 +228,16 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
annotationEngine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, Collections.<String>emptyList(), this, toolkit);
// Request INFO field annotations inheriting from RankSumTest and RMSAnnotation added to remove list
for ( final InfoFieldAnnotation annotation : annotationEngine.getRequestedInfoAnnotations() ) {
if ( annotation instanceof RankSumTest || annotation instanceof RMSAnnotation ) {
final List<String> keyNames = annotation.getKeyNames();
if ( !keyNames.isEmpty() ) {
infoFieldAnnotationKeyNamesToRemove.add(keyNames.get(0));
}
}
}
// create the genotyping engine
// when checking for presence of AS_StandardAnnotation we must deal with annoying feature that
// the class name with or without the trailing "Annotation" are both valid command lines
@ -229,6 +249,14 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
// take care of the VCF headers
final Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true);
// Remove GCVFBlocks
for ( final Iterator<VCFHeaderLine> iter = headerLines.iterator(); iter.hasNext(); ) {
if ( iter.next().getKey().contains(GVCF_BLOCK) ) {
iter.remove();
}
}
headerLines.addAll(annotationEngine.getVCFAnnotationDescriptions());
headerLines.addAll(genotypingEngine.getAppropriateVCFInfoHeaders());
@ -237,6 +265,18 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY));
headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY));
headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); // needed for gVCFs without DP tags
if ( INCLUDE_NON_VARIANTS ) {
// Save INFO header names that require alt alleles
for ( final VCFHeaderLine headerLine : headerLines ) {
if (headerLine instanceof VCFInfoHeaderLine ) {
if (((VCFInfoHeaderLine) headerLine).getCountType() == VCFHeaderLineCount.A) {
infoHeaderAltAllelesLineNames.add(((VCFInfoHeaderLine) headerLine).getID());
}
}
}
}
if ( dbsnp != null && dbsnp.dbsnp.isBound() )
VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.DBSNP_KEY);
@ -296,7 +336,6 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
//do trimming after allele-specific annotation reduction or the mapping is difficult
result = GATKVariantContextUtils.reverseTrimAlleles(result);
// Re-annotate and fix/remove some of the original annotations.
// Note that the order of these actions matters and is different for polymorphic and monomorphic sites.
// For polymorphic sites we need to make sure e.g. the SB tag is sent to the annotation engine and then removed later.
@ -308,12 +347,66 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
} else if (INCLUDE_NON_VARIANTS) {
result = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, true)).make();
result = annotationEngine.annotateContext(tracker, ref, null, result);
result = removeNonRefAlleles(result);
} else {
return null;
}
result = removeInfoAnnotationsIfNoAltAllele(result);
return result;
}
/**
 * Strips the queued RankSumTest/RMSAnnotation INFO keys from a site that has no alternate alleles.
 *
 * @param vc the variant context to examine
 * @return the input context unchanged when alt alleles are present, otherwise a copy with the
 *         INFO annotation keys collected during initialize() removed
 */
private VariantContext removeInfoAnnotationsIfNoAltAllele(final VariantContext vc) {
    // Sites that still have at least one alternate allele keep all of their annotations.
    if ( !vc.getAlternateAlleles().isEmpty() ) {
        return vc;
    }
    // Monomorphic site: drop every annotation key queued up during initialize().
    final VariantContextBuilder strippedBuilder = new VariantContextBuilder(vc);
    for ( final String keyName : infoFieldAnnotationKeyNamesToRemove ) {
        strippedBuilder.rmAttribute(keyName);
    }
    return strippedBuilder.make();
}
/**
 * Drops the symbolic NON-REF allele from a variant context's allele list.
 *
 * <p>When the reference is the only allele left afterwards (i.e. NON-REF was the sole alt),
 * the INFO fields whose header count type requires alt alleles are also removed; otherwise
 * the context is returned unchanged.</p>
 *
 * @param vc the variant context to process
 * @return a rebuilt context for reference-only sites, or {@code vc} unchanged
 */
private VariantContext removeNonRefAlleles(final VariantContext vc) {
    // Keep every allele except the symbolic NON-REF placeholder.
    final List<Allele> keptAlleles = new ArrayList<>();
    for ( final Allele candidate : vc.getAlleles() ) {
        if ( !GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE.equals(candidate) ) {
            keptAlleles.add(candidate);
        }
    }
    if ( keptAlleles.size() != 1 ) {
        // A real alt allele remains (or NON-REF was absent); leave the context as-is.
        return vc;
    }
    // Reference-only site: rebuild without NON-REF and purge alt-allele-dependent INFO fields.
    final VariantContextBuilder refOnlyBuilder = new VariantContextBuilder(vc).alleles(keptAlleles);
    for ( final String infoKey : infoHeaderAltAllelesLineNames ) {
        refOnlyBuilder.rmAttributes(Collections.singletonList(infoKey));
    }
    return refOnlyBuilder.make();
}
/**
* Determines whether the provided VariantContext has real alternate alleles.
*
@ -365,13 +458,14 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
* 4. change the PGT value from "0|1" to "1|1" for homozygous variant genotypes
* 5. move GQ to RGQ if the site is monomorphic
*
* @param VC the VariantContext with the Genotypes to fix
* @param vc the VariantContext with the Genotypes to fix
* @param createRefGTs if true we will also create proper hom ref genotypes since we assume the site is monomorphic
* @return a new set of Genotypes
*/
private List<Genotype> cleanupGenotypeAnnotations(final VariantContext VC, final boolean createRefGTs) {
final GenotypesContext oldGTs = VC.getGenotypes();
private List<Genotype> cleanupGenotypeAnnotations(final VariantContext vc, final boolean createRefGTs) {
final GenotypesContext oldGTs = vc.getGenotypes();
final List<Genotype> recoveredGs = new ArrayList<>(oldGTs.size());
for ( final Genotype oldGT : oldGTs ) {
final Map<String, Object> attrs = new HashMap<>(oldGT.getExtendedAttributes());
@ -400,15 +494,15 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
}
// create AD if it's not there
if ( !oldGT.hasAD() && VC.isVariant() ) {
final int[] AD = new int[VC.getNAlleles()];
if ( !oldGT.hasAD() && vc.isVariant() ) {
final int[] AD = new int[vc.getNAlleles()];
AD[0] = depth;
builder.AD(AD);
}
if ( createRefGTs ) {
final int ploidy = oldGT.getPloidy();
final List<Allele> refAlleles = Collections.nCopies(ploidy,VC.getReference());
final List<Allele> refAlleles = Collections.nCopies(ploidy,vc.getReference());
//keep 0 depth samples and 0 GQ samples as no-call
if (depth > 0 && oldGT.hasGQ() && oldGT.getGQ() > 0) {

View File

@ -153,7 +153,7 @@ public class PosteriorLikelihoodsUtils {
final GenotypeBuilder builder = new GenotypeBuilder(vc1.getGenotype(genoIdx));
builder.phased(vc1.getGenotype(genoIdx).isPhased());
if ( posteriors.get(genoIdx) != null ) {
GATKVariantContextUtils.updateGenotypeAfterSubsetting(vc1.getAlleles(), builder,
GATKVariantContextUtils.updateGenotypeAfterSubsetting(vc1.getAlleles(), vc1.getMaxPloidy(2), builder,
GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, posteriors.get(genoIdx), vc1.getAlleles());
builder.attribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY,
Utils.listFromPrimitives(GenotypeLikelihoods.fromLog10Likelihoods(posteriors.get(genoIdx)).getAsPLs()));

View File

@ -63,7 +63,6 @@ import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.exceptions.GATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
@ -135,7 +134,7 @@ public class ReferenceConfidenceVariantContextMerger {
// record whether it's also a spanning deletion/event (we know this because the VariantContext type is no
// longer "symbolic" but "mixed" because there are real alleles mixed in with the symbolic non-ref allele)
sawSpanningDeletion |= ( isSpanningEvent && vc.isMixed() ) || vc.getAlternateAlleles().contains(Allele.SPAN_DEL) ||
vc.getAlternateAlleles().contains(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_DEPRECATED );
vc.getAlternateAlleles().contains(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_DEPRECATED);
sawNonSpanningEvent |= ( !isSpanningEvent && vc.isMixed() );
vcAndNewAllelePairs.add(new Pair<>(vc, isSpanningEvent ? replaceWithNoCallsAndDels(vc) : remapAlleles(vc, refAllele, finalAlleleSet)));
@ -147,11 +146,22 @@ public class ReferenceConfidenceVariantContextMerger {
final List<Allele> allelesList = new ArrayList<>(finalAlleleSet);
//TODO quick fix patch to address memory issue described in https://github.com/broadinstitute/gsa-unstable/issues/1419
//TODO The reason to impose this limit here is that in practice the tool that is affected by the mem issue, GenotypeGVCFs will
//TODO skip the site when the number of alleles is bigger than that limit so this change does not change the outputs.
//TODO However we need to change this with a more permanent solution.
//TODO For example we could impose maxAltAlleles or maxGenotypes in the output at every step including CombineGVCFs and GenotypeGVCFs
//TODO in order to avoid to add yet another limit .
final boolean shouldComputePLs = allelesList.size() <= GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED;
if (!shouldComputePLs) {
logger.debug(String.format("location %s:%d has too many alleles (%d) to compute PLs (maximum allowed %d). PL genotype annotations won't be produced at this site", loc.getContig(), loc.getStart(), allelesList.size(), GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED));
}
for ( final Pair<VariantContext,List<Allele>> pair : vcAndNewAllelePairs ) {
final VariantContext vc = pair.getFirst();
final List<Allele> remappedAlleles = pair.getSecond();
mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList, samplesAreUniquified);
mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList, samplesAreUniquified, shouldComputePLs);
// special case DP (add it up) for all events
if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) ) {
@ -186,7 +196,7 @@ public class ReferenceConfidenceVariantContextMerger {
//annotatorEngine.combineAnnotations removed the successfully combined annotations, so now parse those that are left
//here we're assuming that things that are left are scalars per sample
Map<String, List<Comparable>> parsedAnnotationMap = parseRemainingAnnotations(annotationMap);
final Map<String, List<Comparable>> parsedAnnotationMap = parseRemainingAnnotations(annotationMap);
// when combining remaining annotations use the median value from all input VCs which had annotations provided
for ( final Map.Entry<String, List<Comparable>> p : parsedAnnotationMap.entrySet() ) {
@ -413,18 +423,20 @@ public class ReferenceConfidenceVariantContextMerger {
* @param remappedAlleles the list of remapped alleles for the sample
* @param targetAlleles the list of target alleles
* @param samplesAreUniquified true if sample names have been uniquified
* @param shouldComputePLs true if the PL can be computed in this merge.
*/
private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes,
final VariantContext vc,
final List<Allele> remappedAlleles,
final List<Allele> targetAlleles,
final boolean samplesAreUniquified) {
final boolean samplesAreUniquified,
final boolean shouldComputePLs) {
final int maximumPloidy = vc.getMaxPloidy(GATKVariantContextUtils.DEFAULT_PLOIDY);
// the map is different depending on the ploidy, so in order to keep this method flexible (mixed ploidies)
// we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible.
final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][];
final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size());
int[] perSampleIndexesOfRelevantAlleles;
for (final Genotype g : vc.getGenotypes()) {
final String name;
@ -433,23 +445,28 @@ public class ReferenceConfidenceVariantContextMerger {
else
name = g.getSampleName();
final int ploidy = g.getPloidy();
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy()));
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy()))
.noPL();
genotypeBuilder.name(name);
final boolean hasPL = g.hasPL();
final boolean doPLs = shouldComputePLs && g.hasPL();
final boolean hasAD = g.hasAD();
final boolean hasSAC = g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY);
if (hasPL || hasSAC) {
perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, vc.getStart(), g);
if (g.hasPL()) {
if (doPLs || hasSAC || hasAD) {
final int[] perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, vc.getStart(), g);
if (doPLs) {
// lazy initialization of the genotype index map by ploidy.
final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null
? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(perSampleIndexesOfRelevantAlleles)
: genotypeIndexMapsByPloidy[ploidy];
final int[] PLs = generatePL(g, genotypeIndexMapByPloidy);
final int[] AD = g.hasAD() ? generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles) : null;
genotypeBuilder.PL(PLs).AD(AD);
genotypeBuilder.PL(PLs);
}
if (g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY)) {
if (hasAD) {
genotypeBuilder.AD(generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles));
}
if (hasSAC) {
final List<Integer> sacIndexesToUse = adaptToSACIndexes(perSampleIndexesOfRelevantAlleles);
final int[] SACs = GATKVariantContextUtils.makeNewSACs(g, sacIndexesToUse);
genotypeBuilder.attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, SACs);
@ -469,11 +486,11 @@ public class ReferenceConfidenceVariantContextMerger {
if (perSampleIndexesOfRelevantAlleles == null)
throw new IllegalArgumentException("The per sample index of relevant alleles must not be null");
final List<Integer> sacIndexesToUse = new ArrayList(2 * perSampleIndexesOfRelevantAlleles.length);
final List<Integer> sacIndexesToUse = new ArrayList<>(2 * perSampleIndexesOfRelevantAlleles.length);
for (int item : perSampleIndexesOfRelevantAlleles) {
sacIndexesToUse.add(new Integer(2 * item));
sacIndexesToUse.add(new Integer(2 * item + 1));
sacIndexesToUse.add(2 * item);
sacIndexesToUse.add(2 * item + 1);
}
return sacIndexesToUse;

View File

@ -0,0 +1,121 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 ("BROAD") and the LICENSEE and is effective at the date the downloading is completed ("EFFECTIVE DATE").
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system ("PHONE-HOME") which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE'S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2016 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as stated above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.utils;
import org.apache.commons.math3.special.Gamma;
import org.apache.commons.math3.util.MathArrays;
import java.util.Arrays;
import java.util.Collections;
import java.util.stream.IntStream;
/**
 * The Dirichlet distribution is a distribution on multinomial distributions: if pi is a vector of positive multinomial weights
 * such that sum_i pi[i] = 1, the Dirichlet pdf is P(pi) = [prod_i Gamma(alpha[i]) / Gamma(sum_i alpha[i])] * prod_i pi[i]^(alpha[i] - 1)
 *
 * The vector alpha comprises the sufficient statistics for the Dirichlet distribution.
 *
 * Since the Dirichlet is the conjugate prior to the multinomial, if one has a Dirichlet prior with concentration alpha
 * and observes each category i n_i times (assuming categories are drawn from a multinomial distribution pi)
 * the posterior is alpha_i -> alpha_i + n_i
 *
 *
 * @author David Benjamin &lt;davidben@broadinstitute.org&gt;
 */
public class Dirichlet {
    // Concentration parameters; defensively copied in the constructor and never mutated afterwards.
    final double[] alpha;

    /**
     * Creates a Dirichlet distribution with the given concentration parameters.
     *
     * @param alpha one or more non-negative, finite concentration parameters
     * @throws IllegalArgumentException if {@code alpha} is empty or contains a negative or non-finite value
     */
    public Dirichlet(final double... alpha) {
        Utils.nonNull(alpha);
        Utils.validateArg(alpha.length >= 1, "Dirichlet parameters must have at least one element");
        Utils.validateArg(MathUtils.allMatch(alpha, x -> x >= 0), "Dirichlet parameters may not be negative");
        Utils.validateArg(MathUtils.allMatch(alpha, Double::isFinite), "Dirichlet parameters must be finite");
        // clone so later mutation of the caller's array cannot change this distribution
        this.alpha = alpha.clone();
    }

    /**
     * Create a symmetric distribution Dir(a/K, a/K, a/K . . .) where K is the number of states and
     * a is the concentration.
     *
     * @param numStates K, the number of categories; must be positive
     * @param concentration a, the total concentration spread evenly over the states; must be positive
     */
    public static Dirichlet symmetricDirichlet(final int numStates, final double concentration) {
        Utils.validateArg(numStates > 0, "Must have at least one state");
        Utils.validateArg(concentration > 0, "concentration must be positive");
        return new Dirichlet(Collections.nCopies(numStates, concentration/numStates).stream().mapToDouble(x->x).toArray());
    }

    // in variational Bayes one often needs the effective point estimate of a multinomial distribution with a
    // Dirichlet prior. This value is not the mode or mean of the Dirichlet but rather the exp of the expected log weights.
    // note that these effective weights do not add up to 1. This is fine because in any probabilistic model scaling all weights
    // amounts to an arbitrary normalization constant, but it's important to keep in mind because some classes may expect
    // normalized weights. In that case the calling code must normalize the weights.
    public double[] effectiveMultinomialWeights() {
        final double digammaOfSum = Gamma.digamma(MathUtils.sum(alpha));
        return MathUtils.applyToArray(alpha, a -> Math.exp(Gamma.digamma(a) - digammaOfSum));
    }

    /** Log10 of {@link #effectiveMultinomialWeights()}: (digamma(alpha_i) - digamma(sum alpha)) / ln(10). */
    public double[] effectiveLog10MultinomialWeights() {
        final double digammaOfSum = Gamma.digamma(MathUtils.sum(alpha));
        return MathUtils.applyToArray(alpha, a -> (Gamma.digamma(a) - digammaOfSum) * MathUtils.LOG10_OF_E);
    }

    /** Mean of the distribution: alpha_i / sum_j alpha_j. These weights do sum to 1. */
    public double[] meanWeights() {
        final double sum = MathUtils.sum(alpha);
        return MathUtils.applyToArray(alpha, x -> x / sum);
    }

    /** Log10 of {@link #meanWeights()}. */
    public double[] log10MeanWeights() {
        final double sum = MathUtils.sum(alpha);
        return MathUtils.applyToArray(alpha, x -> Math.log10(x / sum));
    }

    /** @return the number of categories (the length of the concentration vector). */
    public int size() { return alpha.length; }
}

View File

@ -0,0 +1,248 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 ("BROAD") and the LICENSEE and is effective at the date the downloading is completed ("EFFECTIVE DATE").
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system ("PHONE-HOME") which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE'S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2016 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as stated above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.utils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.function.*;
/**
* Represents 0-based integer index range.
*
* <p>
* It represents an integer index range as the pair values:
* <dl>
* <dt>{@link #from}</dt>
* <dd>- index of the first element in range (i.e. inclusive).</dd>
* <dt>{@link #to}</dt>
* <dd>- index of the element following the last element in range (i.e. exclusive).</dd>
* </dl>
* </p>
*
* <p>
* This class is intended to specify a valid index range in arrays or ordered collections.
* </p>
*
* <p>
* All instances are constraint so that neither <code>from</code> nor <code>to</code> can
* be negative nor <code>from</code> can be larger than <code>to</code>.
* </p>
*
* <p>
* You can use {@link #isValidLength(int) isValidFor(length)} to verify that a range instance represents a valid
* range for an 0-based indexed object with {@code length} elements.
* </p>
*/
public final class IndexRange {

    /**
     * First index in the range (inclusive).
     *
     * <p>
     * Guaranteed to be 0 or greater and never larger than {@link #to}.
     * </p>
     */
    public final int from;

    /**
     * Index immediately following the last index in the range (exclusive).
     *
     * <p>
     * Guaranteed to be 0 or greater and never smaller than {@link #from}.
     * </p>
     */
    public final int to;

    /**
     * Constructs a range from its inclusive start and exclusive end indices.
     *
     * @param fromIndex the inclusive first index of the range.
     * @param toIndex   the exclusive end index of the range.
     * @throws IllegalArgumentException if {@code fromIndex} is negative or greater
     *                                  than {@code toIndex}.
     */
    public IndexRange(final int fromIndex, final int toIndex) {
        // Validation order matters for which message a caller sees; kept as-is.
        Utils.validateArg(fromIndex <= toIndex, "the range size cannot be negative");
        Utils.validateArg(fromIndex >= 0, "the range cannot contain negative indices");
        this.from = fromIndex;
        this.to = toIndex;
    }

    /**
     * Checks whether this range fits within a 0-based indexed object of the given length.
     *
     * <p>
     * It assumes that 0 is the first valid index of the target object, which holds
     * for Java arrays and mainstream collections.
     * </p>
     *
     * <p>
     * A negative (thus invalid) {@code length} always yields {@code false};
     * no exception is thrown.
     * </p>
     *
     * @param length the targeted collection or array length.
     * @return {@code true} if every index in this range is valid for that
     *         {@code length}, {@code false} otherwise.
     */
    public boolean isValidLength(final int length) {
        return length >= to;
    }

    /**
     * Returns the number of indexes spanned by this range.
     *
     * @return 0 or greater.
     */
    public int size() {
        return to - from;
    }

    /**
     * Applies the given {@link IntConsumer} to every index in the range, in
     * ascending order.
     *
     * <p>
     * Any exception thrown by {@code lambda} propagates to the caller immediately,
     * so later indexes are not processed.
     * </p>
     *
     * @param lambda the index consumer lambda.
     * @throws IllegalArgumentException if {@code lambda} is {@code null}.
     * @throws RuntimeException if thrown by {@code lambda} for some index.
     * @throws Error if thrown by {@code lambda} for some index.
     */
    public void forEach(final IntConsumer lambda) {
        Utils.nonNull(lambda, "the lambda function cannot be null");
        for (int index = from; index < to; index++) {
            lambda.accept(index);
        }
    }

    /**
     * Applies an int -&gt; double function to every index in the range, in ascending
     * order, collecting the results into a new array.
     *
     * @param lambda the int -&gt; double function.
     * @return a new array of length {@link #size()}; never {@code null}.
     * @throws IllegalArgumentException if {@code lambda} is {@code null}.
     */
    public double[] mapToDouble(final IntToDoubleFunction lambda) {
        Utils.nonNull(lambda, "the lambda function cannot be null");
        final double[] values = new double[size()];
        for (int offset = 0; offset < values.length; offset++) {
            values[offset] = lambda.applyAsDouble(from + offset);
        }
        return values;
    }

    /**
     * Sums the values of an int -&gt; double function applied to every index in the
     * range, accumulating in ascending index order.
     *
     * @param lambda the int -&gt; double function.
     * @return the sum of the function values; 0 for an empty range.
     * @throws IllegalArgumentException if {@code lambda} is {@code null}.
     */
    public double sum(final IntToDoubleFunction lambda) {
        Utils.nonNull(lambda, "the lambda function cannot be null");
        double total = 0;
        for (int index = from; index < to; index++) {
            total += lambda.applyAsDouble(index);
        }
        return total;
    }

    /**
     * Applies an int -&gt; int function to every index in the range, in ascending
     * order, collecting the results into a new array.
     *
     * @param lambda the int -&gt; int function.
     * @return a new array of length {@link #size()}; never {@code null}.
     * @throws IllegalArgumentException if {@code lambda} is {@code null}.
     */
    public int[] mapToInteger(final IntUnaryOperator lambda) {
        Utils.nonNull(lambda, "the lambda function cannot be null");
        final int[] values = new int[size()];
        for (int offset = 0; offset < values.length; offset++) {
            values[offset] = lambda.applyAsInt(from + offset);
        }
        return values;
    }

    /**
     * Returns the indexes in this range for which the given predicate is true,
     * in ascending order.
     *
     * @param predicate the int -&gt; boolean predicate.
     * @return a new mutable list of the matching indexes; never {@code null}.
     * @throws IllegalArgumentException if {@code predicate} is {@code null}.
     */
    public List<Integer> filter(final IntPredicate predicate) {
        Utils.nonNull(predicate, "predicate may not be null");
        final List<Integer> matches = new ArrayList<>();
        for (int index = from; index < to; index++) {
            if (predicate.test(index)) {
                matches.add(index);
            }
        }
        return matches;
    }

    @Override
    public boolean equals(final Object other) {
        // instanceof is false for null, so no separate null check is needed.
        if (!(other instanceof IndexRange)) {
            return false;
        }
        final IndexRange that = (IndexRange) other;
        return that.from == from && that.to == to;
    }

    @Override
    public int hashCode() {
        // Same scheme as {@link Arrays#hashCode(Object[])}: fold both endpoints
        // with multiplier 31.
        final int partial = 31 + Integer.hashCode(from);
        return partial * 31 + Integer.hashCode(to);
    }

    @Override
    public String toString() {
        return String.format("%d-%d", from, to);
    }
}

View File

@ -74,13 +74,15 @@ import java.util.List;
*/
public class GVCFWriter implements VariantContextWriter {
private static final int MAX_GENOTYPE_QUAL = VCFConstants.MAX_GENOTYPE_QUAL;
//
// Final fields initialized in constructor
//
/** Where we'll ultimately write our VCF records */
final private VariantContextWriter underlyingWriter;
private final VariantContextWriter underlyingWriter;
final private List<HomRefBlock> GQPartitions;
private final List<HomRefBlock> GQPartitions;
/** fields updated on the fly during GVCFWriter operation */
int nextAvailableStart = -1;
@ -90,26 +92,42 @@ public class GVCFWriter implements VariantContextWriter {
private final int defaultPloidy;
/**
* Is the proposed GQ partitions well-formed?
* Are the proposed GQ partitions well-formed?
*
* @param GQPartitions proposed GQ partitions
* @return a non-null string if something is wrong (string explains issue)
*/
protected static List<HomRefBlock> parsePartitions(final List<Integer> GQPartitions, final int defaultPloidy) {
if ( GQPartitions == null ) throw new IllegalArgumentException("GQpartitions cannot be null");
if ( GQPartitions.isEmpty() ) throw new IllegalArgumentException("GQpartitions cannot be empty");
if ( GQPartitions == null ) {
throw new IllegalArgumentException("The list of GQ partitions cannot be null.");
}
if ( GQPartitions.isEmpty() ) {
throw new IllegalArgumentException("The list of GQ partitions cannot be empty.");
}
final List<HomRefBlock> result = new LinkedList<>();
int lastThreshold = 0;
for ( final Integer value : GQPartitions ) {
if ( value == null ) throw new IllegalArgumentException("GQPartitions contains a null integer");
if ( value < lastThreshold ) throw new IllegalArgumentException("GQPartitions is out of order. Last is " + lastThreshold + " but next is " + value);
if ( value == lastThreshold ) throw new IllegalArgumentException("GQPartitions is equal elements: Last is " + lastThreshold + " but next is " + value);
result.add(new HomRefBlock(lastThreshold, value,defaultPloidy));
if ( value == null || value <= 0 ) {
throw new IllegalArgumentException("The list of GQ partitions contains a null or non-positive integer.");
}
if ( value < lastThreshold ) {
throw new IllegalArgumentException(String.format("The list of GQ partitions is out of order. " +
"Previous value is %d but the next is %d.", lastThreshold, value));
}
if ( value == lastThreshold ) {
throw new IllegalArgumentException(String.format("The value %d appears more than once in the list of GQ partitions.", value));
}
if ( value > MAX_GENOTYPE_QUAL + 1 ) {
throw new IllegalArgumentException(String.format("The value %d in the list of GQ partitions is " +
"greater than VCFConstants.MAX_GENOTYPE_QUAL + 1 = %d.", value, VCFConstants.MAX_GENOTYPE_QUAL + 1));
}
result.add(new HomRefBlock(lastThreshold, value, defaultPloidy));
lastThreshold = value;
}
result.add(new HomRefBlock(lastThreshold, Integer.MAX_VALUE,defaultPloidy));
if ( lastThreshold <= MAX_GENOTYPE_QUAL ) {
result.add(new HomRefBlock(lastThreshold, MAX_GENOTYPE_QUAL + 1, defaultPloidy));
}
return result;
}
@ -209,10 +227,14 @@ public class GVCFWriter implements VariantContextWriter {
}
private boolean genotypeCanBeMergedInCurrentBlock(final Genotype g) {
return currentBlock != null && currentBlock.withinBounds(g.getGQ()) && currentBlock.getPloidy() == g.getPloidy()
return currentBlock != null && currentBlock.withinBounds(capToMaxGQ(g.getGQ())) && currentBlock.getPloidy() == g.getPloidy()
&& (currentBlock.getMinPLs() == null || !g.hasPL() || (currentBlock.getMinPLs().length == g.getPL().length));
}
private int capToMaxGQ(final int gq) {
return Math.min(gq, MAX_GENOTYPE_QUAL);
}
/**
* Flush the current hom-ref block, if necessary, to the underlying writer, and reset the currentBlock to null
*/
@ -246,7 +268,7 @@ public class GVCFWriter implements VariantContextWriter {
final int[] minPLs = block.getMinPLs();
gb.PL(minPLs);
final int gq = genotypeQualityFromPLs(minPLs);
final int gq = GATKVariantContextUtils.calculateGQFromPLs(minPLs);
gb.GQ(gq);
gb.DP(block.getMedianDP());
gb.attribute(GATKVCFConstants.MIN_DP_FORMAT_KEY, block.getMinDP());
@ -257,26 +279,6 @@ public class GVCFWriter implements VariantContextWriter {
return vcb.genotypes(gb.make()).make();
}
private int genotypeQualityFromPLs(final int[] minPLs) {
int first = minPLs[0];
int second = minPLs[1];
if (first > second) {
second = first;
first = minPLs[1];
}
for (int i = 3; i < minPLs.length; i++) {
final int candidate = minPLs[i];
if (candidate >= second) continue;
if (candidate <= first) {
second = first;
first = candidate;
} else
second = candidate;
}
return second - first;
}
/**
* Helper function to create a new HomRefBlock from a variant context and current genotype
*
@ -288,7 +290,7 @@ public class GVCFWriter implements VariantContextWriter {
// figure out the GQ limits to use based on the GQ of g
HomRefBlock partition = null;
for ( final HomRefBlock maybePartition : GQPartitions ) {
if ( maybePartition.withinBounds(g.getGQ()) ) {
if ( maybePartition.withinBounds(capToMaxGQ(g.getGQ())) ) {
partition = maybePartition;
break;
}

View File

@ -223,6 +223,9 @@ public class ExcessHetUnitTest {
final double EHHets = new ExcessHet().calculateEH(allHet, allHet.getGenotypes());
Assert.assertTrue(Math.abs(EHsingleton) < Math.abs(EHHets), String.format("singleton=%f allHets=%f", EHsingleton, EHHets));
//Since all hets is such an extreme case and the sample size is large here, we know that the p-value should be 0
Assert.assertTrue(EHHets == 160.0, String.format("P-value of 0 should be phred scaled to 160.0"));
}
@DataProvider(name = "smallSets")

View File

@ -72,23 +72,33 @@ public class StrandBiasTableUtilsTest {
//> fisher(c(2068, 6796, 1133, 0))
final List<Object[]> tests = new ArrayList<>();
tests.add(new Object[]{0, 0, 0, 0, 1.0});
tests.add(new Object[]{100000, 100000, 100000, 100000, 1.0});
tests.add(new Object[]{1, 2, 3, 4, 1.0});
tests.add(new Object[]{0, 0, 100000, 100000, 1.0});
tests.add(new Object[]{100000, 100000, 100000, 0, 0.0}); //below R's or Java's precision
tests.add(new Object[]{9,11,12,10, 0.7578618});
tests.add(new Object[]{12,10,9,11, 0.7578618});
tests.add(new Object[]{9,10,12,10, 0.7578618});
tests.add(new Object[]{9,9,12,10, 1.0});
tests.add(new Object[]{9,13,12,10, 0.5466948});
tests.add(new Object[]{12,10,9,13, 0.5466948});
tests.add(new Object[]{9,12,11,9, 0.5377362});
tests.add(new Object[]{200000, 100000, 1, 2, 1.0}); //differs from GATK4 implementation
tests.add(new Object[]{100, 100, 100, 0, 3.730187e-23});
tests.add(new Object[]{13736, 9047, 41, 1433, 1.232E-4}); //differs from GATK4 implementation
tests.add(new Object[]{66, 14, 64, 4, 0.0688244});
tests.add(new Object[]{351169, 306836, 153739, 2379, 0.0}); //below R's or Java's precision
tests.add(new Object[]{116449, 131216, 289, 16957, 0.0026801}); //differs from GATK4 implementation
tests.add(new Object[]{137, 159, 9, 23, 0.10752410}); //differs from GATK4 implementation
tests.add(new Object[]{129, 90, 21, 20, 0.6450772}); //differs from GATK4 implementation
tests.add(new Object[]{14054, 9160, 16, 7827, 0.0}); //below R's or Java's precision
tests.add(new Object[]{32803, 9184, 32117, 3283, 0.0289540}); //differs from GATK4 implementation
tests.add(new Object[]{2068, 6796, 1133, 0, 0.0}); //below R's or Java's precision
tests.add(new Object[]{0, 0, 0, 0, 1.0});
tests.add(new Object[]{100000, 100000, 100000, 100000, 1.0} );
tests.add(new Object[]{0, 0, 100000, 100000, 1.0});
tests.add(new Object[]{100000,100000,100000,0, 1.312515e-15});
tests.add(new Object[]{0, 0, 0, 3, 1.0});
tests.add(new Object[]{9, 0, 0, 0, 1.0});
tests.add(new Object[]{200000, 100000, 1, 2, 1.0});
tests.add(new Object[]{100,100,100,0, 3.730187e-23});
tests.add(new Object[]{13736,9047,41,1433, 6.162592e-05});
tests.add(new Object[]{66, 14, 64, 4, 4.243330e-02});
tests.add(new Object[]{351169, 306836, 153739, 2379, 2.193607e-09});
tests.add(new Object[]{116449, 131216, 289, 16957, 1.340052e-03});
tests.add(new Object[]{137, 159, 9, 23, 6.088506e-02});
tests.add(new Object[]{129, 90, 21, 20, 3.919603e-01});
tests.add(new Object[]{14054, 9160, 16, 7827, 7.466277e-17});
tests.add(new Object[]{32803, 9184, 32117, 3283, 1.795855e-02});
tests.add(new Object[]{2068, 6796, 1133, 0, 5.919091e-13});
return tests.toArray(new Object[][]{});
}

View File

@ -423,7 +423,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
@Test
public void testStrandAlleleCountsBySample() {
final String MD5 = "564aeeefad92353d66dbb2a2222d5108";
final String MD5 = "994d1d3c53e3878e17d20e1327506d77";
final WalkerTestSpec spec = new WalkerTestSpec(
"-T HaplotypeCaller --disableDithering " +
String.format("-R %s -I %s ", REF, CEUTRIO_BAM) +

View File

@ -51,7 +51,7 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.commandline.RodBinding;
import org.broadinstitute.gatk.utils.GenomeLocParser;
@ -71,7 +71,7 @@ import java.util.*;
public class VariantOverlapAnnotatorUnitTest extends BaseTest {
private GenomeLocParser genomeLocParser;
private IndexedFastaSequenceFile seq;
private ReferenceSequenceFile seq;
@BeforeClass
public void setup() throws FileNotFoundException {

View File

@ -59,8 +59,12 @@ import org.broadinstitute.gatk.engine.recalibration.RecalUtils;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
@ -70,38 +74,36 @@ import java.util.List;
*/
public class BQSRGathererUnitTest extends BaseTest {
private static File recal1 = new File(privateTestDir + "HiSeq.1mb.1RG.sg1.table");
private static File recal2 = new File(privateTestDir + "HiSeq.1mb.1RG.sg2.table");
private static File recal3 = new File(privateTestDir + "HiSeq.1mb.1RG.sg3.table");
private static File recal4 = new File(privateTestDir + "HiSeq.1mb.1RG.sg4.table");
private static File recal5 = new File(privateTestDir + "HiSeq.1mb.1RG.sg5.table");
private static File recalEmpty = new File(privateTestDir + "HiSeq.1mb.1RG.empty.table");
private static File recal1 = new File(privateTestDir, "HiSeq.1mb.1RG.sg1.table");
private static File recal2 = new File(privateTestDir, "HiSeq.1mb.1RG.sg2.table");
private static File recal3 = new File(privateTestDir, "HiSeq.1mb.1RG.sg3.table");
private static File recal4 = new File(privateTestDir, "HiSeq.1mb.1RG.sg4.table");
private static File recal5 = new File(privateTestDir, "HiSeq.1mb.1RG.sg5.table");
private static File recalEmpty = new File(privateTestDir, "HiSeq.1mb.1RG.empty.table");
private static File recal_original = new File(privateTestDir + "HiSeq.1mb.1RG.noSG.table");
private static File recal_original = new File(privateTestDir, "HiSeq.1mb.1RG.noSG.table");
private static File recal_many = new File(privateTestDir, "bqsr.manyObservations.full.table");
@Test(enabled = true)
@Test
public void testManyObservations() {
File recal = new File(privateTestDir + "bqsr.manyObservations.piece.table");
final File recal = new File(privateTestDir, "bqsr.manyObservations.piece.table");
final File output = BaseTest.createTempFile("BQSRgathererTest", ".table");
List<File> recalFiles = new LinkedList<File> ();
final List<File> recalFiles = new LinkedList<>();
for ( int i=0; i < 5; i++ )
recalFiles.add(recal);
BQSRGatherer gatherer = new BQSRGatherer();
final BQSRGatherer gatherer = new BQSRGatherer();
gatherer.gather(recalFiles, output);
GATKReport originalReport = new GATKReport(new File(privateTestDir + "bqsr.manyObservations.full.table"));
GATKReport calculatedReport = new GATKReport(output);
testReports(originalReport, calculatedReport);
testReports(recal_many, output);
}
@Test(enabled = true)
@Test
public void testGatherBQSR() {
BQSRGatherer gatherer = new BQSRGatherer();
List<File> recalFiles = new LinkedList<File> ();
final BQSRGatherer gatherer = new BQSRGatherer();
final List<File> recalFiles = new LinkedList<>();
final File output = BaseTest.createTempFile("BQSRgathererTest", ".table");
recalFiles.add(recal1);
@ -111,16 +113,13 @@ public class BQSRGathererUnitTest extends BaseTest {
recalFiles.add(recal5);
gatherer.gather(recalFiles, output);
GATKReport originalReport = new GATKReport(recal_original);
GATKReport calculatedReport = new GATKReport(output);
testReports(originalReport, calculatedReport);
testReports(recal_original, output);
}
@Test(enabled = true)
@Test
public void testGatherBQSRWithEmptyFile() {
BQSRGatherer gatherer = new BQSRGatherer();
List<File> recalFiles = new LinkedList<File> ();
final BQSRGatherer gatherer = new BQSRGatherer();
final List<File> recalFiles = new LinkedList<>();
final File output = BaseTest.createTempFile("BQSRgathererTest", ".table");
recalFiles.add(recal1);
@ -131,13 +130,12 @@ public class BQSRGathererUnitTest extends BaseTest {
recalFiles.add(recalEmpty);
gatherer.gather(recalFiles, output);
GATKReport originalReport = new GATKReport(recal_original);
GATKReport calculatedReport = new GATKReport(output);
testReports(originalReport, calculatedReport);
testReports(recal_original, output);
}
private void testReports(final GATKReport originalReport, final GATKReport calculatedReport) {
private void testReports(final File originalFile, final File calculatedFile) {
final GATKReport originalReport = new GATKReport(originalFile);
final GATKReport calculatedReport = new GATKReport(calculatedFile);
// test the Arguments table
List<String> columnsToTest = Arrays.asList(RecalUtils.ARGUMENT_COLUMN_NAME, RecalUtils.ARGUMENT_VALUE_COLUMN_NAME);
@ -177,11 +175,11 @@ public class BQSRGathererUnitTest extends BaseTest {
* @param calculated the calculated table
* @param columnsToTest list of columns to test. All columns will be tested with the same criteria (equality given factor)
*/
private void testTablesWithColumns(GATKReportTable original, GATKReportTable calculated, List<String> columnsToTest) {
for (int row = 0; row < original.getNumRows(); row++ ) {
for (String column : columnsToTest) {
Object actual = calculated.get(new Integer(row), column);
Object expected = original.get(row, column);
private void testTablesWithColumns(final GATKReportTable original, final GATKReportTable calculated, final List<String> columnsToTest) {
for (int row = 0; row < original.getNumRows(); row++) {
for (final String column : columnsToTest) {
final Object actual = calculated.get(Integer.valueOf(row), column);
final Object expected = original.get(row, column);
//if ( !actual.equals(expected) )
// System.out.println("Row=" + row + " Table=" + original.getTableName() + " Column=" + column + " Expected=" + expected + " Actual=" + actual);
Assert.assertEquals(actual, expected, "Row: " + row + " Original Table: " + original.getTableName() + " Column=" + column);
@ -196,12 +194,83 @@ public class BQSRGathererUnitTest extends BaseTest {
// TODO: - Doesn't end up in protected / private github
// TODO: - IS otherwise available for local debugging unlike /humgen NFS mounts
// Hand modified subset of problematic gather inputs submitted by George Grant
File input1 = new File(privateTestDir + "NA12878.rg_subset.chr1.recal_data.table");
File input2 = new File(privateTestDir + "NA12878.rg_subset.chrY_Plus.recal_data.table");
final File input1 = new File(privateTestDir, "NA12878.rg_subset.chr1.recal_data.table");
final File input2 = new File(privateTestDir, "NA12878.rg_subset.chrY_Plus.recal_data.table");
GATKReport report12 = BQSRGatherer.gatherReport(Arrays.asList(input1, input2));
GATKReport report21 = BQSRGatherer.gatherReport(Arrays.asList(input2, input1));
final GATKReport report12 = BQSRGatherer.gatherReport(Arrays.asList(input1, input2));
final GATKReport report21 = BQSRGatherer.gatherReport(Arrays.asList(input2, input1));
Assert.assertTrue(report12.equals(report21), "GATK reports are different when gathered in a different order.");
}
}
@Test
public void testParseInputsAsList() {
final File inputListFile = BaseTest.createTempFile("BQSRGatherer.parse.input", ".list");
try (final BufferedWriter bw = new BufferedWriter(new FileWriter(inputListFile))) {
bw.write(recal1.getAbsolutePath() + "\n");
bw.write(recal2.getAbsolutePath() + "\n");
bw.write(recal3.getAbsolutePath() + "\n");
bw.write(recal4.getAbsolutePath() + "\n");
bw.write(recal5.getAbsolutePath() + "\n");
} catch (final IOException ioe) {
Assert.fail("Could not create temporary list of input files for BQSRGatherer unit test.");
}
final File output = BaseTest.createTempFile("BQSRgathererTest", ".table");
final BQSRGatherer gatherer = new BQSRGatherer();
gatherer.gather(Collections.singletonList(inputListFile), output);
testReports(recal_original, output);
}
@Test
public void testParseInputsAsMultipleFiles() {
final File output = BaseTest.createTempFile("BQSRgathererTest", ".table");
final BQSRGatherer gatherer = new BQSRGatherer();
gatherer.gather(Arrays.asList(recal1, recal2, recal3, recal4, recal5), output);
testReports(recal_original, output);
}
@Test
public void testParseInputsMixedSingleList() {
final File inputListFile = BaseTest.createTempFile("BQSRGatherer.parse.input", ".list");
try (final BufferedWriter bw = new BufferedWriter(new FileWriter(inputListFile))) {
bw.write(recal2.getAbsolutePath() + "\n");
bw.write(recal3.getAbsolutePath() + "\n");
bw.write(recal4.getAbsolutePath() + "\n");
} catch (final IOException ioe) {
Assert.fail("Could not create temporary list of input files for BQSRGatherer unit test.");
}
final File output = BaseTest.createTempFile("BQSRgathererTest", ".table");
final BQSRGatherer gatherer = new BQSRGatherer();
gatherer.gather(Arrays.asList(recal1, inputListFile, recal5), output);
testReports(recal_original, output);
}
@Test
public void testParseInputsMixedMultipleLists() {
final File inputListFile1 = BaseTest.createTempFile("BQSRGatherer.parse.input.1", ".list");
final File inputListFile2 = BaseTest.createTempFile("BQSRGatherer.parse.input.2", ".list");
try (final BufferedWriter bw1 = new BufferedWriter(new FileWriter(inputListFile1));
final BufferedWriter bw2 = new BufferedWriter(new FileWriter(inputListFile2))) {
bw1.write(recal2.getAbsolutePath() + "\n");
bw1.write(recal3.getAbsolutePath() + "\n");
bw2.write(recal5.getAbsolutePath() + "\n");
} catch (final IOException ioe) {
Assert.fail("Could not create temporary lists of input files for BQSRGatherer unit test.");
}
final File output = BaseTest.createTempFile("BQSRgathererTest", ".table");
final BQSRGatherer gatherer = new BQSRGatherer();
gatherer.gather(Arrays.asList(recal1, inputListFile1, recal4, inputListFile2), output);
testReports(recal_original, output);
}
}

View File

@ -109,13 +109,13 @@ public class BQSRIntegrationTest extends WalkerTest {
@DataProvider(name = "BQSRTest")
public Object[][] createBQSRTestData() {
return new Object[][]{
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "fc9df1faf67bab70d32f89bcf4fa39db")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "73ec38eb23b1739ecef8194cbb1132a3")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "2d5721193ed4410d1a7d8db467a1fa05")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "16df7f1745f17f190c9fc33c475b91d8")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "01811003ae811ee74c4b8d3eb5e992fe")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "5e0eea6b0b300fbd2edabc3506ad3a60")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "8500b9747c16cb8eb17082163bdb8069")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "dde4269a873c6f7a751e775cbc79fdb9")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "fae427cb969638060e2294540e120dfc")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "651be7dcd798c71ceaefb773ed792193")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "f5fd024e900d0d77c681483da1e5dfd5")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "751189ec8cd406628cf4e698c69e8d11")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "b921c36eb7f5be8f8b91b651247a83d7")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "6c3c6176eb6214dc7ef121fa11916e5d")},
{new BQSRTest(b36KGReference, SimpleCigarMatchMismatchBam, SimpleCigarMatchMismatchInterval, "", "56dfb2918a4cdae3ef9d705a43e85194")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "0b5a8e259e997e4c7b5836d4c28e6f4d")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "281682124584ab384f23359934df0c3b")},

View File

@ -76,5 +76,4 @@ public class BaseQualitySumPerAlleleBySampleUnitTest {
Assert.assertFalse(a.isUsableRead(read));
}
}

View File

@ -57,8 +57,6 @@ import org.testng.annotations.Test;
import java.util.*;
public class MuTect2IntegrationTest extends WalkerTest {
final static String REF = hg19Reference;
final static String CCLE_MICRO_TUMOR_BAM = privateTestDir + "HCC1143.cghub.ccle.micro.bam";
final static String CCLE_MICRO_NORMAL_BAM = privateTestDir + "HCC1143_BL.cghub.ccle.micro.bam";
final static String CCLE_MICRO_INTERVALS_FILE = privateTestDir + "HCC1143.cghub.ccle.micro.intervals";
@ -72,14 +70,13 @@ public class MuTect2IntegrationTest extends WalkerTest {
final static String DREAM3_TP_INTERVALS_FILE = privateTestDir + "m2_dream3.tp.intervals";
final static String DREAM3_FP_INTERVALS_FILE = privateTestDir + "m2_dream3.fp.intervals";
final String commandLine =
"-T MuTect2 --no_cmdline_in_header -dt NONE --disableDithering -alwaysloadVectorHMM -pairHMM LOGLESS_CACHING -ip 50 -R %s --dbsnp %s --cosmic %s --normal_panel %s -I:tumor %s -I:normal %s -L %s";
private void M2Test(String tumorBam, String normalBam, String intervals, String args, String md5) {
final String base = String.format(
commandLine,
REF, DBSNP, COSMIC, PON, tumorBam, normalBam, intervals) +
hg19Reference, DBSNP, COSMIC, PON, tumorBam, normalBam, intervals) +
" -o %s ";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
@ -94,7 +91,7 @@ public class MuTect2IntegrationTest extends WalkerTest {
private void m2TumorOnlyTest(String tumorBam, String intervals, String args, String md5) {
final String base = String.format(
"-T MuTect2 --no_cmdline_in_header -dt NONE --disableDithering -alwaysloadVectorHMM -pairHMM LOGLESS_CACHING -ip 50 -R %s --dbsnp %s --cosmic %s --normal_panel %s -I:tumor %s -L %s",
REF, DBSNP, COSMIC, PON, tumorBam, intervals) +
hg19Reference, DBSNP, COSMIC, PON, tumorBam, intervals) +
" -o %s ";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
@ -106,7 +103,7 @@ public class MuTect2IntegrationTest extends WalkerTest {
private void M2TestWithDroppedReads(String tumorBam, String normalBam, String intervals, String args, String md5Variants, String md5Bamout) {
final String base = String.format(
commandLine,
REF, DBSNP, COSMIC, PON, tumorBam, normalBam, intervals) +
hg19Reference, DBSNP, COSMIC, PON, tumorBam, normalBam, intervals) +
" -o %s " +
"-bamout %s --emitDroppedReads";
@ -121,7 +118,7 @@ public class MuTect2IntegrationTest extends WalkerTest {
@Test
public void testMicroRegression() {
M2Test(CCLE_MICRO_TUMOR_BAM, CCLE_MICRO_NORMAL_BAM, CCLE_MICRO_INTERVALS_FILE, "", "a7658ccfb75bf1ce8d3d3cfbf3b552f0");
M2Test(CCLE_MICRO_TUMOR_BAM, CCLE_MICRO_NORMAL_BAM, CCLE_MICRO_INTERVALS_FILE, "", "dd3bb9526c85c0aed39545c4639ff138");
}
/**
@ -131,7 +128,7 @@ public class MuTect2IntegrationTest extends WalkerTest {
*/
@Test
public void testTruePositivesDream3() {
M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_TP_INTERVALS_FILE, "", "91dee82a13275e5568f5d2e680e3162b");
M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_TP_INTERVALS_FILE, "", "5bd540d238916a2b91e827aed3592e59");
}
/**
@ -140,7 +137,7 @@ public class MuTect2IntegrationTest extends WalkerTest {
@Test
public void testTruePositivesDream3TrackedDropped() {
M2TestWithDroppedReads(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, "21:10935369", "",
"4f1337df1de5dd4468e2d389403ca785",
"48a446d47bb10434cb7f0ee726d15721",
"b536e76870326b4be01b8d6b83c1cf1c");
}
@ -150,7 +147,7 @@ public class MuTect2IntegrationTest extends WalkerTest {
*/
@Test
public void testFalsePositivesDream3() {
M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "", "6be3fc318e2c22a28098f58b76c9a5a1");
M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "", "c9eec57bbc93ea630c202b7620f8dca8"); // e2413f4166b6ed20be6cdee6616ba43d
}
/**
@ -158,15 +155,21 @@ public class MuTect2IntegrationTest extends WalkerTest {
*/
@Test
public void testContaminationCorrection() {
M2Test(CCLE_MICRO_TUMOR_BAM, CCLE_MICRO_NORMAL_BAM, CCLE_MICRO_INTERVALS_FILE, "-contamination 0.1", "b1010a6614b0332c41fd6da9d5f6b14e");
M2Test(CCLE_MICRO_TUMOR_BAM, CCLE_MICRO_NORMAL_BAM, CCLE_MICRO_INTERVALS_FILE, "-contamination 0.1", "c25e48edd704bbb436cd6456d9f47d8b");
}
/**
* Test that tumor-only mode does not create an empty vcf
*/
@Test
public void testTumorOnly(){
m2TumorOnlyTest(CCLE_MICRO_TUMOR_BAM, "2:166000000-167000000", "", "bb0cddfdc29500fbea68a0913d6706a3");
m2TumorOnlyTest(CCLE_MICRO_TUMOR_BAM, "2:166000000-167000000", "", "2af2253b1f09ea8fd354e1bf2c4612f0");
}
@Test
public void testStrandArtifactFilter(){
M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "--enable_strand_artifact_filter", "1686c1a0e63768497f21b9d7bb6548c5");
}
@Test
public void testClusteredReadPositionFilter() {
M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "--enable_clustered_read_position_filter", "b44c23af7de84f96d2371db25d29aba2");
}
}

View File

@ -49,54 +49,59 @@
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.queue.qscripts.dev
package org.broadinstitute.gatk.tools.walkers.cancer.m2;
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.queue.extensions.gatk._
import org.broadinstitute.gatk.queue.util.QScriptUtils
import htsjdk.variant.variantcontext.Allele;
import org.testng.annotations.Test;
class run_M2_ICE_NN extends QScript {
import static org.testng.Assert.*;
@Argument(shortName = "bams", required = true, doc = "file of all BAM files")
var allBams: String = ""
@Argument(shortName = "o", required = false, doc = "Output prefix")
var outputPrefix: String = ""
@Argument(shortName = "pon", required = false, doc = "Normal PON")
var panelOfNormals: String = "/dsde/working/mutect/panel_of_normals/panel_of_normals_m2_ice_wgs_territory/m2_406_ice_normals_wgs_calling_regions.vcf";
@Argument(shortName = "sc", required = false, doc = "base scatter count")
var scatter: Int = 10
/**
* Created by tsato on 6/21/16.
*/
public class PerAlleleCollectionTest {
def script() {
val bams = QScriptUtils.createSeqFromFile(allBams)
for (tumor <- bams) {
for (normal <- bams) {
if (tumor != normal) add( createM2Config(tumor, normal, new File(panelOfNormals), outputPrefix))
}
@Test
public void testSet() throws Exception {
PerAlleleCollection<Integer> alleleCounts = PerAlleleCollection.createPerRefAndAltAlleleCollection();
Allele refA = Allele.create("A", true);
Allele altT = Allele.create("T", false);
alleleCounts.set(refA, 40);
alleleCounts.set(altT, 10);
assertEquals((int)alleleCounts.getRef(), 40);
assertEquals((int)alleleCounts.getAlt(altT), 10);
}
@Test
public void testGet() throws Exception {
PerAlleleCollection<Integer> alleleCounts = PerAlleleCollection.createPerRefAndAltAlleleCollection();
Allele refA = Allele.create("A", true);
Allele altT = Allele.create("T", false);
alleleCounts.set(refA, 40);
alleleCounts.set(altT, 10);
assertEquals((int)alleleCounts.get(refA), 40);
assertEquals((int)alleleCounts.get(altT), 10);
}
}
def createM2Config(tumorBAM : File, normalBAM : File, panelOfNormals : File, outputPrefix : String): M2 = {
val mutect2 = new MuTect2
@Test
public void testGetAltAlleles() throws Exception {
PerAlleleCollection<Integer> alleleCounts = PerAlleleCollection.createPerAltAlleleCollection();
Allele altA = Allele.create("A", false);
Allele altC = Allele.create("C", false);
Allele altG = Allele.create("G", false);
Allele altT = Allele.create("T", false);
Allele[] altAlleles = {altA, altC, altG, altT};
for (Allele altAllele : altAlleles ) {
alleleCounts.set(altAllele, 3);
}
mutect2.reference_sequence = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
mutect2.cosmic :+= new File("/xchip/cga/reference/hg19/hg19_cosmic_v54_120711.vcf")
mutect2.dbsnp = new File("/humgen/gsa-hpprojects/GATK/bundle/current/b37/dbsnp_138.b37.vcf")
mutect2.normal_panel :+= panelOfNormals
for (Allele altAllele : altAlleles ) {
assertTrue(alleleCounts.getAltAlleles().contains(altAllele));
}
mutect2.intervalsString :+= new File("/dsde/working/mutect/crsp_nn/whole_exome_illumina_coding_v1.Homo_sapiens_assembly19.targets.no_empty.interval_list")
mutect2.memoryLimit = 2
mutect2.input_file = List(new TaggedFile(normalBAM, "normal"), new TaggedFile(tumorBAM, "tumor"))
mutect2.scatterCount = scatter
mutect2.out = outputPrefix + tumorBAM.getName + "-vs-" + normalBAM.getName + ".vcf"
println("Adding " + tumorBAM + " vs " + normalBAM + " as " + mutect2.out)
mutect2
}
}
assertFalse(alleleCounts.getAltAlleles().contains(Allele.create("A", true)));
}
}

View File

@ -49,41 +49,32 @@
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.queue.qscripts.dev
package org.broadinstitute.gatk.tools.walkers.cancer.m2;
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.queue.extensions.gatk._
import org.testng.annotations.Test;
class run_M2_dream extends QScript {
import static org.testng.Assert.*;
@Argument(shortName = "L", required=false, doc = "Intervals file")
var intervalsFile: List[File] = Nil
@Argument(shortName = "normal", required=true, doc = "Normal sample BAM")
var normalBAM: String = ""
@Argument(shortName = "tumor", required=true, doc = "Tumor sample BAM")
var tumorBAM: String = ""
@Argument(shortName = "o", required=true, doc = "Output file")
var outputFile: String = ""
@Argument(shortName = "sc", required=false, doc = "base scatter count")
var scatter: Int = 10
/**
* Created by tsato on 6/19/16.
*/
public class TumorPowerCalculatorTest {
private boolean closeEnough(double x, double y, double epsilon){
return(Math.abs(x - y) < epsilon);
}
@Test
public void testCachedPowerCalculation() throws Exception {
TumorPowerCalculator tpc = new TumorPowerCalculator(0.001, 2.0, 0.0);
final double epsilon = 0.0001;
assertTrue(closeEnough(tpc.cachedPowerCalculation(100,0.2), 1.0, epsilon));
assertTrue(closeEnough(tpc.cachedPowerCalculation(30,0.1), 0.8864, epsilon));
assertTrue(closeEnough(tpc.cachedPowerCalculation(0,0.02), 0.0, epsilon));
assertTrue(closeEnough(tpc.cachedPowerCalculation(5, 0.01), 0.0520, epsilon));
def script() {
}
val mutect2 = new MuTect2
mutect2.reference_sequence = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
mutect2.cosmic :+= new File("/xchip/cga/reference/hg19/hg19_cosmic_v54_120711.vcf")
mutect2.dbsnp = new File("/humgen/gsa-hpprojects/GATK/bundle/current/b37/dbsnp_138.b37.vcf")
mutect2.normal_panel :+= new File("/xchip/cga/reference/hg19/wgs_hg19_125_cancer_blood_normal_panel.vcf")
mutect2.intervalsString = intervalsFile
mutect2.memoryLimit = 2
mutect2.input_file = List(new TaggedFile(normalBAM, "normal"), new TaggedFile(tumorBAM, "tumor"))
mutect2.scatterCount = scatter
mutect2.out = outputFile
add(mutect2)
}
}
}

View File

@ -62,7 +62,7 @@ public class ErrorRatePerCycleIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T ErrorRatePerCycle -R " + b37KGReference + " -I " + b37GoodBAM + " -L 20:10,000,000-10,100,000 -o %s",
1,
Arrays.asList("6191340f0b56ee81fb248c8f5c913a8e"));
Arrays.asList("a83453820b7afb5ee79856093d62901f"));
executeTest("ErrorRatePerCycle:", spec);
}
}

View File

@ -71,11 +71,11 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest {
@Test(enabled = true)
public void testSingleSample() {
DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "13bfe41ef083d2716e07d35223916a4e");
DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "19c56b853b20ac674b6de1332043586d");
}
@Test(enabled = true)
public void testMultiSample() {
DTTest("testMultiSample ", "-I " + multiSample, "64b4fa6cf4c4d16e822289990ee88240");
DTTest("testMultiSample ", "-I " + multiSample, "90770023666f3c1d6a3f35e5ecada4a8");
}
}

View File

@ -53,6 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.fasta;
import org.broadinstitute.gatk.engine.walkers.WalkerTest;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.Arrays;
@ -176,4 +177,33 @@ public class FastaAlternateReferenceIntegrationTest extends WalkerTest {
Arrays.asList("8fd887bca9f3949f2c23c3565f7dcc1b"));
executeTest("test iupac", spec);
}
@Test
void testSpanDel() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T FastaAlternateReferenceMaker -R " + b37KGReference + " -V " + privateTestDir + "spanningDel.delOnly.starFirst.vcf -L 1:1273247 -o %s",
1,
Arrays.asList("69852222a8c9c9e1604808b62df96f8a"));
executeTest("test spanning deletion", spec);
}
@DataProvider(name = "iupacSsample")
public Object[][] getIupacSampleData() {
return new Object[][]{
{"NA1", "b5d95e28263c88b20325d7a545576ad4"},
{"NA2", "a8b4b79dea8ad1fde2c0d8ff42ca132d"},
{"NA3", "69852222a8c9c9e1604808b62df96f8a"}
};
}
@Test(dataProvider = "iupacSsample")
void testSpanDelIUPAC(final String sample, final String md5) {
WalkerTestSpec spec = new WalkerTestSpec(
"-T FastaAlternateReferenceMaker -R " + b37KGReference + " --use_IUPAC_sample " + sample + " -V " + privateTestDir + "spanningDel.delOnly.starFirst.vcf -L 1:1273247 -o %s",
1,
Arrays.asList(md5));
executeTest("test spanning deletion using IUPAC codes", spec);
}
}

View File

@ -164,7 +164,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testGenotypeFilters1() {
WalkerTestSpec spec1 = new WalkerTestSpec(
baseTestString() + " -G_filter 'GQ == 0.60' -G_filterName foo --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("ced70cfb4e6681a3aa0633cd0510ada0"));
Arrays.asList("b6e8d70223826000ea1a6d6bc9c4fc65"));
executeTest("test genotype filter #1", spec1);
}
@ -172,7 +172,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testGenotypeFilters2() {
WalkerTestSpec spec2 = new WalkerTestSpec(
baseTestString() + " -G_filter 'isHomVar == 1' -G_filterName foo --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("837b6a3ce3fad3bd77ec3e870c4d2f10"));
Arrays.asList("9cd315a433ab7d9da637156011328509"));
executeTest("test genotype filter #2", spec2);
}
@ -207,16 +207,39 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference
+ " --genotypeFilterExpression 'DP < 8' --genotypeFilterName lowDP -V " + privateTestDir + "filteringDepthInFormat.vcf", 1,
Arrays.asList("260dd9d7e35737fe695b241b7a5a52a2"));
Arrays.asList("b0016040127766a4163fcbd91afff3ea"));
executeTest("testFilteringDPfromFORMAT", spec);
}
// The current htsjdk implementation of JEXL matching on genotype fields is buggy. When the filter uses an
// annotation that is present in both FORMAT and INFO, and the FORMAT value is missing, the current code (Dec 2016)
// will look up the INFO value. Here we use a made-up annotation Z instead of DP to avoid having to rig the test
// so that the INFO value will give the same matching results as the FORMAT value.
@Test
public void testFilteringZfromFORMATWithMissing() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference
+ " --genotypeFilterExpression 'Z < 10' --genotypeFilterName lowZ -V " + privateTestDir + "filteringDepthInFormatWithMissing.vcf", 1,
Arrays.asList("47607708dee31b6033f14a3613c8acb8"));
executeTest("testFilteringDPfromFORMATWithMissing", spec);
}
// Same comment as above.
@Test
public void testFilteringZfromFORMATAndFailMissing() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference
+ " --missingValuesInExpressionsShouldEvaluateAsFailing --genotypeFilterExpression 'Z < 10' --genotypeFilterName lowZ -V " + privateTestDir + "filteringDepthInFormatWithMissing.vcf", 1,
Arrays.asList("4f519e725203931841940707c50ab6a3"));
executeTest("testFilteringDPfromFORMATAndFailMissing", spec);
}
@Test
public void testInvertGenotypeFilterExpression() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference
+ " --genotypeFilterExpression 'DP < 8' --genotypeFilterName highDP -V " + privateTestDir + "filteringDepthInFormat.vcf --invertGenotypeFilterExpression", 1,
Arrays.asList("907527b89d3f819cc3f6f88f51fcaaf6"));
Arrays.asList("c6bc275c97a9e737748d16132ee76f48"));
executeTest("testInvertGenotypeFilterExpression", spec);
}
@ -225,7 +248,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference
+ " --genotypeFilterExpression 'DP >= 8' --genotypeFilterName highDP -V " + privateTestDir + "filteringDepthInFormat.vcf", 1,
Arrays.asList("d79b2e5a7502a6d6e902bc40d74cc826")); // Differs from testInvertFilter because FILTER description uses the -genotypeFilterExpression argument
Arrays.asList("9321b5993d51a4da02f69e5467164587")); // Differs from testInvertFilter because FILTER description uses the -genotypeFilterExpression argument
executeTest("testInvertJexlGenotypeFilterExpression", spec);
}
@ -234,7 +257,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference
+ " --genotypeFilterExpression 'DP < 8' --genotypeFilterName lowDP -V " + privateTestDir + "filteringDepthInFormat.vcf --setFilteredGtToNocall", 1,
Arrays.asList("2ff3753215d418712309e50da323f6e8"));
Arrays.asList("00990d54017b7384ce9f979d796b9d16"));
executeTest("testSetFilteredGtoNocall", spec);
}
@ -245,7 +268,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
"-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference
+ " -G_filter 'GQ < 20' -G_filterName lowDP -G_filter 'DP<10' -G_filterName lowGQ -V " + privateTestDir + "variantFiltrationInfoField.vcf --setFilteredGtToNocall",
1,
Arrays.asList("3b074975bb6f70c84b2dd81695bb89ff"));
Arrays.asList("0f8ed3a62a53feca0c4b86671e4b53e4"));
executeTest("testSetFilteredGtoNocallUpdateInfo", spec);
}
@ -256,7 +279,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants --setFilteredGtToNocall -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("410c6b7bb62fc43bb41eee627670f757")
Arrays.asList("cb5ef9233503bebc81593e436a6de943")
);
spec.disableShadowBCF();

View File

@ -52,6 +52,7 @@
package org.broadinstitute.gatk.tools.walkers.genotyper;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.utils.MathUtils;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
@ -76,6 +77,7 @@ public class GenotypeAlleleCountsUnitTest {
Assert.assertEquals(subject.distinctAlleleCount(),1);
Assert.assertEquals(subject.alleleCountAt(0),ploidy);
Assert.assertEquals(subject.alleleCountFor(0),ploidy);
Assert.assertEquals(subject.log10CombinationCount(), 0.0);
Assert.assertEquals(subject.alleleRankFor(0),0);
Assert.assertEquals(subject.alleleRankFor(1),-2);
Assert.assertTrue(subject.containsAllele(0));
@ -175,6 +177,31 @@ public class GenotypeAlleleCountsUnitTest {
while (!current.containsAllele(MAXIMUM_ALLELE_INDEX + 1)) {
final GenotypeAlleleCounts next = current.next();
// test log10CombinationCount
if (ploidy == 2) {
Assert.assertEquals(next.log10CombinationCount(), next.distinctAlleleCount() == 2 ? Math.log10(2) : 0.0);
} else if (ploidy == 3) {
Assert.assertEquals(next.log10CombinationCount(),
next.distinctAlleleCount() == 3 ? Math.log10(6) : (next.distinctAlleleCount() == 2 ? Math.log10(6) - Math.log10(2) : 0.0));
} else {
if (next.distinctAlleleCount() == 1) {
Assert.assertEquals(next.log10CombinationCount(), 0.0);
} else if (next.distinctAlleleCount() == ploidy) {
Assert.assertEquals(next.log10CombinationCount(), MathUtils.log10Factorial(ploidy));
}
}
//test forEach
final List<Integer> alleleCountsAsList = new ArrayList<>(next.distinctAlleleCount()*2);
next.forEachAlleleIndexAndCount((alleleIndex, alleleCount) -> {
alleleCountsAsList.add(alleleIndex);
alleleCountsAsList.add(alleleCount);});
final int[] actualAlleleCounts = new int[next.distinctAlleleCount()*2];
next.copyAlleleCounts(actualAlleleCounts, 0);
Assert.assertEquals(alleleCountsAsList.stream().mapToInt(n->n).toArray(), actualAlleleCounts);
if (current.distinctAlleleCount() == 1) {
Assert.assertEquals(next.maximumAlleleIndex(),current.maximumAlleleIndex() + 1);
Assert.assertEquals(next.distinctAlleleCount(), 2 );

View File

@ -170,6 +170,15 @@ public class GenotypeLikelihoodCalculatorUnitTest {
}
@Test
public void testComputeMaxAcceptableAlleleCount(){
Assert.assertEquals(1024, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(1, 1024));
Assert.assertEquals(44, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(2, 1024));
Assert.assertEquals(17, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(3, 1024));
Assert.assertEquals(5, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(10, 1024));
Assert.assertEquals(3, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(20, 1024));
Assert.assertEquals(2, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(100, 1024));
}
// Simple inefficient calculation of the genotype count given the ploidy.
private int calculateGenotypeCount(final int ploidy, final int alleleCount) {
if (ploidy == 0)

View File

@ -70,7 +70,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest {
for ( final int nt : Arrays.asList(1, 2) )
for ( final int nct : Arrays.asList(1, 2) ) {
tests.add(new Object[]{ "BOTH", "52f590f6b37a1b3b12042ae917738965", nt, nct });
tests.add(new Object[]{ "BOTH", "e2fdd36a4eda18f748df944b428fa392", nt, nct });
}
return tests.toArray(new Object[][]{});

View File

@ -53,11 +53,12 @@ package org.broadinstitute.gatk.tools.walkers.genotyper;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.utils.genotyper.*;
import org.apache.commons.math3.stat.descriptive.rank.Median;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.genotyper.*;
import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
@ -472,14 +473,24 @@ public class ReadLikelihoodsUnitTest
secondBestLk = lk;
}
}
final double expectedNonRefLk = Double.isInfinite(secondBestLk) ? bestLk : secondBestLk;
final Median median = new Median();
final List<Double> qualifylingLikelihoods = new ArrayList<>();
for (int a = 0; a < ordinaryAlleleCount; a++) {
if (originalLikelihoods[s][a][r] >= bestLk) continue;
qualifylingLikelihoods.add(originalLikelihoods[s][a][r]);
}
final double medianLikelihood = median.evaluate(qualifylingLikelihoods.stream().mapToDouble(d -> d).toArray());
// NaN is returned in cases whether there is no elements in qualifyingLikelihoods.
// In such case we set the NON-REF likelihood to -Inf.
final double expectedNonRefLk = !Double.isNaN(medianLikelihood) ? medianLikelihood
: ordinaryAlleleCount <= 1 ? Double.NaN : bestLk;
newLikelihoods[s][ordinaryAlleleCount][r] = expectedNonRefLk;
}
}
testLikelihoodMatrixQueries(samples,result,newLikelihoods);
}
private void testLikelihoodMatrixQueries(String[] samples, ReadLikelihoods<Allele> result, final double[][][] likelihoods) {
private void testLikelihoodMatrixQueries(final String[] samples, final ReadLikelihoods<Allele> result, final double[][][] likelihoods) {
for (final String sample : samples) {
final int sampleIndex = result.sampleIndex(sample);
final int sampleReadCount = result.sampleReadCount(sampleIndex);
@ -487,9 +498,14 @@ public class ReadLikelihoodsUnitTest
Assert.assertEquals(result.alleleCount(), alleleCount);
for (int a = 0; a < alleleCount; a++) {
Assert.assertEquals(result.sampleReadCount(sampleIndex),sampleReadCount);
for (int r = 0; r < sampleReadCount; r++)
Assert.assertEquals(result.sampleMatrix(sampleIndex).get(a,r),
likelihoods == null ? 0.0 : likelihoods[sampleIndex][a][r], EPSILON);
for (int r = 0; r < sampleReadCount; r++) {
if (Double.isNaN(result.sampleMatrix(sampleIndex).get(a, r))) {
Assert.assertTrue(likelihoods != null && Double.isNaN(likelihoods[sampleIndex][a][r]));
} else {
Assert.assertEquals(result.sampleMatrix(sampleIndex).get(a, r),
likelihoods == null ? 0.0 : likelihoods[sampleIndex][a][r], EPSILON);
}
}
}
}
}

View File

@ -69,17 +69,17 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe
@Test(enabled = true)
public void testSNP_ACS_Pools() {
executor.PC_LSV_Test_short("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES", "LSV_SNP_ACS", "SNP", "ebdf749d404aaef298780a53059a4f93");
executor.PC_LSV_Test_short("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES", "LSV_SNP_ACS", "SNP", "90ed6f1c268b9c57ecb52b35a88b9368");
}
@Test(enabled = true)
public void testBOTH_GGA_Pools() {
executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "c3826794a250e32b0497353ceb1deb26");
executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "5ad4dd6b0c3c170ba44fdad6d4fa58cf");
}
@Test(enabled = true)
public void testINDEL_GGA_Pools() {
executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "4eb0d8018da6612cd434491f338ed5a4");
executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "d26b0ba07e056b73fe4cfe873636d0d6");
}
@Test(enabled = true)
@ -88,6 +88,6 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe
//TODO the old MD5 is kept for the record.
//TODO this should be revisit once we get into addressing inaccuracies by the independent allele approach.
// executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "b5ff7530827f4b9039a58bdc8a3560d2");
executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "c2fb9b05027c2b0ac9e338d9ddda69b1");
executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "f6b9e1ac0c51c9702525ee52bb2db18a");
}
}

View File

@ -63,7 +63,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe
@Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","e22846de4567f576e08e00edda2931d0");
executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","cdaa55c53005deb132f600fa5539c254");
}
@Test(enabled = true)

View File

@ -78,7 +78,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("32bece91e170d623092817738faddb4e"));
Arrays.asList("96afa04944156c1ca5028e5506ba8b94"));
executeTest(String.format("test indel caller in SLX"), spec);
}
@ -105,7 +105,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("dd66e5f8a6e43be0e473251185a4f38a"));
Arrays.asList("7e211573190003342af274e64a0612fb"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@ -115,7 +115,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("aa56ed44e77162efce45c936c485769e"));
Arrays.asList("50622e495cad2a24fbc4a80f1281d4dc"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
}
@ -125,7 +125,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("a4b6434c59c4b119e480ddafc86de234"));
Arrays.asList("50622e495cad2a24fbc4a80f1281d4dc"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
}
@ -140,7 +140,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
Arrays.asList("f9d848fe5e6e6762e0dd5b5d925f74f4"));
Arrays.asList("751334df2c7b14cc0e57df231825da57"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}
@ -181,7 +181,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction0() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.0", 1,
Arrays.asList("2a82d1586b2148e8d902da5cf8538210"));
Arrays.asList("0d1a5c865c382f1f0ca6f0f104478366"));
executeTest("test minIndelFraction 0.0", spec);
}
@ -189,7 +189,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction25() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.25", 1,
Arrays.asList("3184a3f58b3aeafcd97280af708a04bb"));
Arrays.asList("aab86cec61adaeb3a5c6887e70211663"));
executeTest("test minIndelFraction 0.25", spec);
}

View File

@ -86,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMinBaseQualityScore() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1,
Arrays.asList("52a3064863b97e43d8df878edc29275c"));
Arrays.asList("d0499af17dc66e77849e547bc5a182ff"));
executeTest("test min_base_quality_score 26", spec);
}
@ -102,7 +102,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testNDA() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("2f2d7dd623446fc3cae62a44a016c16d"));
Arrays.asList("5f69de274c0705cf1cb9387651df98bf"));
executeTest("test NDA", spec);
}
@ -124,17 +124,17 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testOutputParameterSitesOnly() {
testOutputParameters("-sites_only", "3c0e109190cfbe41d24e7726cc8fe6e3");
testOutputParameters("-sites_only", "4355f5b6fd8cd769a479677f1255bee5");
}
@Test
public void testOutputParameterAllConfident() {
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "f6937cc8ec068f2d38b5d277a92be34b");
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "182af9490667cb6ce1415305de4f3fdd");
}
@Test
public void testOutputParameterAllSites() {
testOutputParameters("--output_mode EMIT_ALL_SITES", "1cddd7b1e730765c2b7b55d8a1d69b4c");
testOutputParameters("--output_mode EMIT_ALL_SITES", "524e85c225ce330fd094de93f078fa56");
}
private void testOutputParameters(final String args, final String md5) {
@ -148,7 +148,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testConfidence() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1,
Arrays.asList("5c7d237e666439edb0ef8c697e37933c"));
Arrays.asList("c794c7681856c1ec3c3429dbd9e5dc75"));
executeTest("test confidence 1", spec1);
}
@ -156,7 +156,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testNoPrior() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.33333 -inputPrior 0.33333", 1,
Arrays.asList("24b550bbc3c9f0577e069b3fd3122d52"));
Arrays.asList("39d15f041a0c86058f46f23960bb129b"));
executeTest("test no prior 1", spec1);
}
@ -165,7 +165,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testUserPrior() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.001 -inputPrior 0.495", 1,
Arrays.asList("f60b6705daec1059ce3e533bf8e44c89"));
Arrays.asList("00bff7a5dc584b5b6931a826eae6b013"));
executeTest("test user prior 1", spec1);
}
@ -174,7 +174,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void emitPLsAtAllSites() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --output_mode EMIT_ALL_SITES -allSitePLs", 1,
Arrays.asList("ae778a64323abe0da5194f0b936f48aa"));
Arrays.asList("067d0a9d90c978c5563ea56a86fc682f"));
// GDA: TODO: BCF encoder/decoder doesn't seem to support non-standard values in genotype fields. IE even if there is a field defined in FORMAT and in the header the BCF2 encoder will still fail
spec1.disableShadowBCF();
@ -190,12 +190,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testHeterozyosity1() {
testHeterozosity( 0.01, "6b8bdde9d303139806c5177fae53b1fd" );
testHeterozosity( 0.01, "0175a3d4dedea857f87149522d133d78" );
}
@Test
public void testHeterozyosity2() {
testHeterozosity( 1.0 / 1850, "b1604d1ba68dfe2fcfb861ef6420a8ba" );
testHeterozosity( 1.0 / 1850, "080a14940e4d3f5c14c533d019b99341" );
}
private void testHeterozosity(final double arg, final String md5) {
@ -238,7 +238,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// Note that we need to turn off any randomization for this to work, so no downsampling and no annotations
String md5 = "398d3ad38834fea8961ab6f46a21dc4b";
String md5 = "75b4b097747f91b8b7ceea153d2b7e1c";
final String myCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
@ -274,7 +274,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("7ed55f70feeacf8ecc6b36f0d741dfc7"));
Arrays.asList("482f6b310e59d05508811932ec21c801"));
executeTest(String.format("test multiple technologies"), spec);
}
@ -293,7 +293,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" +
" -baq CALCULATE_AS_NECESSARY",
1,
Arrays.asList("90224ac1c9e2ce9b77fee8dd6e044efe"));
Arrays.asList("c99b7dbe881aa3274cb9876e495cf8f3"));
executeTest(String.format("test calling with BAQ"), spec);
}
@ -310,7 +310,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000 " +
"-A SnpEff",
1,
Arrays.asList("e99f100fe71bb7f328b485204c16f14a"));
Arrays.asList("81ac0ffd22a0d0907848019944034359"));
executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec);
}

View File

@ -70,7 +70,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("c759b04ed0d948bda95008e29f3f5c2d"));
Arrays.asList("605f447127bf9c92f60bbaa9c6a6732e"));
executeTest("test MultiSample Pilot1", spec);
}
@ -94,7 +94,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
Arrays.asList("281db46f39e3367f207838c620a82bd2"));
Arrays.asList("ac9905e26a2c51129d22b5c617679c6a"));
executeTest("test SingleSample Pilot2", spec);
}
@ -102,7 +102,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
Arrays.asList("94ca1e00d4fad9c5279271c2779ff797"));
Arrays.asList("3304a20af6745beeec07ef2c47d617d3"));
executeTest("test Multiple SNP alleles", spec);
}
@ -126,7 +126,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMismatchedPLs() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
Arrays.asList("c5aff2572ce09c413e7f5c9e1b3f92d6"));
Arrays.asList("5dc0ccd66105e0f12c72987d56c85235"));
executeTest("test mismatched PLs", spec);
}
}

View File

@ -0,0 +1,250 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE
* SOFTWARE LICENSE AGREEMENT
* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 ("BROAD") and the LICENSEE and is effective at the date the downloading is completed ("EFFECTIVE DATE").
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. PHONE-HOME FEATURE
* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system ("PHONE-HOME") which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE'S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
*
* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012-2016 Broad Institute, Inc.
* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 5. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 6. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 7. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 8. MISCELLANEOUS
* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc;
import htsjdk.variant.variantcontext.*;
import org.apache.commons.math3.util.Pair;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculator;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculators;
import org.broadinstitute.gatk.utils.BaseTest;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
* Created by davidben on 7/28/16.
*/
/**
 * Unit tests for {@link AlleleFrequencyCalculator}.
 *
 * <p>All tests build synthetic {@link VariantContext}s whose genotypes carry "obvious" PLs
 * (one genotype has PL 0, all others share a single large PL), then check properties of the
 * resulting {@link AFCalculationResult}: symmetry under allele relabeling, MLE allele counts,
 * and qualitative behavior of the ref/non-ref posterior as samples accumulate.</p>
 */
public class AlleleFrequencyCalculatorUnitTest extends BaseTest {
    private static final double EPS = 1.0e-8;
    private static final GenotypeLikelihoodCalculators GL_CALCS = new GenotypeLikelihoodCalculators();
    private static final Allele A = Allele.create("A", true);
    private static final Allele B = Allele.create("C");
    private static final Allele C = Allele.create("G");
    private static final Allele indel1 = Allele.create("AA");

    // ploidies and allele counts used to parameterize the synthetic genotypes below
    private static final int HAPLOID = 1;
    private static final int DIPLOID = 2;
    private static final int TRIPLOID = 3;

    private static final int BIALLELIC = 2;
    private static final int TRIALLELIC = 3;

    // PL magnitudes: 1000 ~ essentially certain, 20 ~ 100:1 odds, 10 ~ 10:1 odds
    private static final int EXTREMELY_CONFIDENT_PL = 1000;
    private static final int FAIRLY_CONFIDENT_PL = 20;
    private static final int LOW_CONFIDENCE_PL = 10;

    private static final int DEFAULT_PLOIDY = 2;

    // used to give every synthetic genotype a unique sample name
    private static int sampleNameCounter = 0;

    /**
     * The calculator's output must be invariant under swapping the labels of two alt alleles:
     * relabeling B as C (and vice versa) in the input genotypes must swap the per-allele
     * posteriors and leave the overall AF=0 posterior unchanged.
     */
    @Test
    public void testSymmetries() {
        final AlleleFrequencyCalculator afCalc = new AlleleFrequencyCalculator(1, 0.1, 0.1, DEFAULT_PLOIDY);
        final List<Allele> alleles = Arrays.asList(A,B,C);
        final Genotype AA = genotypeWithObviousCall(DIPLOID, TRIALLELIC, new int[] {0,2}, FAIRLY_CONFIDENT_PL);
        final Genotype BB = genotypeWithObviousCall(DIPLOID, TRIALLELIC, new int[] {1,2}, FAIRLY_CONFIDENT_PL);
        final Genotype CC = genotypeWithObviousCall(DIPLOID, TRIALLELIC, new int[] {2,2}, FAIRLY_CONFIDENT_PL);
        final Genotype AB = genotypeWithObviousCall(DIPLOID, TRIALLELIC, new int[] {0,1,1,1}, FAIRLY_CONFIDENT_PL);
        final Genotype AC = genotypeWithObviousCall(DIPLOID, TRIALLELIC, new int[] {0,1,2,1}, FAIRLY_CONFIDENT_PL);
        final Genotype BBB = genotypeWithObviousCall(TRIPLOID, TRIALLELIC, new int[] {1,3}, FAIRLY_CONFIDENT_PL);
        final Genotype CCC = genotypeWithObviousCall(TRIPLOID, TRIALLELIC, new int[] {2,3}, FAIRLY_CONFIDENT_PL);

        // make pairs of VCs that differ only by B <--> C
        final List<Pair<VariantContext, VariantContext>> switchBWithCPairs = Arrays.asList(
                new Pair<>(makeVC(alleles, AA, BB), makeVC(alleles, AA, CC)),
                new Pair<>(makeVC(alleles, AA, AB), makeVC(alleles, AA, AC)),
                new Pair<>(makeVC(alleles, AB, AB), makeVC(alleles, AC, AC)),
                new Pair<>(makeVC(alleles, AA, AA, BB), makeVC(alleles, AA, AA, CC)),
                new Pair<>(makeVC(alleles, AA, AB, AB), makeVC(alleles, AA, AC, AC)),
                new Pair<>(makeVC(alleles, AA, BBB), makeVC(alleles, AA, CCC))
        );
        for (final Pair<VariantContext, VariantContext> pair : switchBWithCPairs) {
            final VariantContext vc1 = pair.getFirst();
            final VariantContext vc2 = pair.getSecond();
            final AFCalculationResult result1 = afCalc.getLog10PNonRef(vc1);
            final AFCalculationResult result2 = afCalc.getLog10PNonRef(vc2);
            Assert.assertEquals(result1.getLog10PosteriorOfAFEq0(), result2.getLog10PosteriorOfAFEq0(), EPS);
            // per-allele posteriors must swap along with the labels
            Assert.assertEquals(result1.getLog10PosteriorOfAFEq0ForAllele(B), result2.getLog10PosteriorOfAFEq0ForAllele(C), EPS);
            Assert.assertEquals(result1.getLog10PosteriorOfAFEq0ForAllele(C), result2.getLog10PosteriorOfAFEq0ForAllele(B), EPS);
        }
    }

    /**
     * Check the MLE alt-allele counts (one entry per alt allele, in allele order) for a range
     * of sample configurations with confident genotypes.
     */
    @Test
    public void testMLECounts() {
        final AlleleFrequencyCalculator afCalc = new AlleleFrequencyCalculator(1, 1, 1, DEFAULT_PLOIDY);
        final List<Allele> alleles = Arrays.asList(A,B,C);
        final Genotype AA = genotypeWithObviousCall(DIPLOID, TRIALLELIC, new int[] {0,2}, FAIRLY_CONFIDENT_PL);
        final Genotype BB = genotypeWithObviousCall(DIPLOID, TRIALLELIC, new int[] {1,2}, FAIRLY_CONFIDENT_PL);
        final Genotype AB = genotypeWithObviousCall(DIPLOID, TRIALLELIC, new int[] {0,1,1,1}, FAIRLY_CONFIDENT_PL);
        final Genotype AC = genotypeWithObviousCall(DIPLOID, TRIALLELIC, new int[] {0,1,2,1}, FAIRLY_CONFIDENT_PL);
        final Genotype BBB = genotypeWithObviousCall(TRIPLOID, TRIALLELIC, new int[] {1,3}, FAIRLY_CONFIDENT_PL);
        final Genotype CCC = genotypeWithObviousCall(TRIPLOID, TRIALLELIC, new int[] {2,3}, FAIRLY_CONFIDENT_PL);
        final List<Pair<VariantContext, int[]>> vcWithExpectedCounts = Arrays.asList(
                new Pair<>(makeVC(alleles, AA, BB), new int[] {2,0}),
                new Pair<>(makeVC(alleles, AA, AB), new int[] {1,0}),
                new Pair<>(makeVC(alleles, AB, AB), new int[] {2,0}),
                new Pair<>(makeVC(alleles, AA, AA, BB), new int[] {2,0}),
                new Pair<>(makeVC(alleles, AA, AB, AB), new int[] {2,0}),
                new Pair<>(makeVC(alleles, AA, BBB), new int[] {3,0}),
                new Pair<>(makeVC(alleles, AA, BBB, CCC), new int[] {3,3}),
                new Pair<>(makeVC(alleles, AA, AB, AC), new int[] {1,1}),
                new Pair<>(makeVC(alleles, AA, AB, AC, BBB, CCC), new int[] {4,4})
        );
        for (final Pair<VariantContext, int[]> pair : vcWithExpectedCounts) {
            final VariantContext vc = pair.getFirst();
            final int[] expected = pair.getSecond();
            final int[] actual = afCalc.getLog10PNonRef(vc).getAlleleCountsOfMLE();
            // NOTE: previously compared Arrays.asList(expected) to Arrays.asList(actual); for an
            // int[] that wraps the whole array as a single list element and equality degenerates
            // to array reference comparison, so contents were never actually checked.
            Assert.assertTrue(Arrays.equals(expected, actual),
                    "expected MLE counts " + Arrays.toString(expected) + " but got " + Arrays.toString(actual));
        }
    }

    // many samples with low confidence should yield a non-zero MLE, in contrast to the old exact model
    @Test
    public void testManySamplesWithLowConfidence() {
        // prior corresponding to 1000 observations of ref, 1 of a SNP
        // for this test, we want many pseudocounts in the prior because the new AF calculator learns the allele frequency
        // and we don't want the complication of the posterior being different from the prior
        final AlleleFrequencyCalculator afCalc = new AlleleFrequencyCalculator(1000, 1, 1, DEFAULT_PLOIDY); //prior corresponding to 1000 observations of ref, 1 of a SNP
        final List<Allele> alleles = Arrays.asList(A,B);

        // for FAIRLY_CONFIDENT_PL = 20, this genotype has about 100 times greater likelihood to be het than hom ref
        // with our prior giving 1000 times as much weight to ref, this implies a 1 in 5 chance of each sample having a copy of the alt allele
        // (that is, 100/1000 times the combinatorial factor of 2).  Thus the MLE for up to 2 samples should be zero
        // for five samples we should have one
        // for ten samples we will have more than twice as many as for five since the counts from the samples start to influence
        // the estimated allele frequency
        final Genotype AB = genotypeWithObviousCall(DIPLOID, BIALLELIC, new int[] {0,1,1,1}, FAIRLY_CONFIDENT_PL);
        final List<VariantContext> vcsWithDifferentNumbersOfSamples = IntStream.range(1, 11)
                .mapToObj(n -> makeVC(alleles, Collections.nCopies(n, AB))).collect(Collectors.toList());
        final int[] counts = vcsWithDifferentNumbersOfSamples.stream().mapToInt(vc -> afCalc.getLog10PNonRef(vc).getAlleleCountAtMLE(B)).toArray();
        Assert.assertEquals(counts[0],0); // one sample
        Assert.assertEquals(counts[1],0); // two samples
        Assert.assertEquals(counts[4],2); // five samples
        Assert.assertTrue(counts[8] >= 3); // ten samples
    }

    /**
     * With a flat prior kept flat by adding balanced AA/BB pairs, each added pair should
     * multiply the AF=0 likelihood by a constant factor, i.e. an arithmetic series in log space.
     */
    @Test
    public void testApproximateMultiplicativeConfidence() {
        final AlleleFrequencyCalculator afCalc = new AlleleFrequencyCalculator(1, 1, 1, DEFAULT_PLOIDY);   //flat prior -- we will choose genotypes such that the posterior remains flat
        final List<Allele> alleles = Arrays.asList(A,B);
        final Genotype AA = genotypeWithObviousCall(DIPLOID, BIALLELIC, new int[] {0,2}, FAIRLY_CONFIDENT_PL);
        final Genotype BB = genotypeWithObviousCall(DIPLOID, BIALLELIC, new int[] {1,2}, FAIRLY_CONFIDENT_PL);

        final List<VariantContext> vcsWithDifferentNumbersOfSamples = new ArrayList<>();
        final List<Genotype> genotypeList = new ArrayList<>();

        for (int n = 0; n < 10; n++) {
            genotypeList.add(AA);
            genotypeList.add(BB);   //adding both keeps the flat prior.  Thus the posterior will equal the likelihood
            vcsWithDifferentNumbersOfSamples.add(makeVC(alleles, genotypeList));
        }

        // since we maintain a flat allele frequency distribution, the probability of being ref as each successive sample is added
        // is multiplied by the probability of any one.  Thus we get an arithmetic series in log space
        final double[] log10PRefs = vcsWithDifferentNumbersOfSamples.stream()
                .mapToDouble(vc -> afCalc.getLog10PNonRef(vc).getLog10LikelihoodOfAFEq0()).toArray();

        for (int n = 0; n < 9; n++) {
            Assert.assertEquals(log10PRefs[n+1] - log10PRefs[n], log10PRefs[0], 0.01);
        }
    }

    /**
     * A single extremely confident het must survive an arbitrarily large number of hom-ref
     * samples: the AF=0 likelihood may grow only logarithmically with the number of ref samples,
     * not catastrophically.
     */
    @Test
    public void testManyRefSamplesDontKillGoodVariant() {
        final AlleleFrequencyCalculator afCalc = new AlleleFrequencyCalculator(1, 0.1, 0.1, DEFAULT_PLOIDY);
        final List<Allele> alleles = Arrays.asList(A,B);
        final Genotype AA = genotypeWithObviousCall(DIPLOID, BIALLELIC, new int[] {0,2}, EXTREMELY_CONFIDENT_PL);
        final Genotype AB = genotypeWithObviousCall(DIPLOID, BIALLELIC, new int[] {0,1,1,1}, EXTREMELY_CONFIDENT_PL);
        for (final int numRef : new int[]{1, 10, 100, 1000, 10000, 100000}) {
            final List<Genotype> genotypeList = new ArrayList<>(Collections.nCopies(numRef, AA));
            genotypeList.add(AB);
            final VariantContext vc = makeVC(alleles, genotypeList);
            final double log10PRef = afCalc.getLog10PNonRef(vc).getLog10LikelihoodOfAFEq0();
            Assert.assertTrue(log10PRef < (-EXTREMELY_CONFIDENT_PL/10) + Math.log10(numRef) + 1);
        }
    }

    // make PLs that correspond to an obvious call i.e. one PL is relatively big and the rest are zero
    // alleleCounts is the GenotypeAlleleCounts format for the obvious genotype, with repeats but in no particular order
    private static int[] PLsForObviousCall(final int ploidy, final int numAlleles, final int[] alleleCounts, final int PL)   {
        final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, numAlleles);
        final int[] result = Collections.nCopies(glCalc.genotypeCount(), PL).stream().mapToInt(n->n).toArray();
        result[glCalc.alleleCountsToIndex(alleleCounts)] = 0;
        return result;
    }

    // make a genotype whose PLs unambiguously favor the genotype described by {@code alleles} (GenotypeAlleleCounts format)
    private static Genotype genotypeWithObviousCall(final int ploidy, final int numAlleles, final int[] alleles, final int PL) {
        return makeGenotype(ploidy, PLsForObviousCall(ploidy, numAlleles, alleles, PL));
    }

    //note the call is irrelevant to the AFCalculator, which only looks at PLs
    private static Genotype makeGenotype(final int ploidy, int ... pls) {
        return new GenotypeBuilder("sample" + sampleNameCounter++).alleles(Collections.nCopies(ploidy, Allele.NO_CALL)).PL(pls).make();
    }

    private static VariantContext makeVC(final List<Allele> alleles, final Genotype... genotypes) {
        return new VariantContextBuilder().chr("chr1").alleles(alleles).genotypes(genotypes).make();
    }

    private static VariantContext makeVC(final List<Allele> alleles, final Collection<Genotype> genotypes) {
        return new VariantContextBuilder().chr("chr1").alleles(alleles).genotypes(genotypes).make();
    }
}

View File

@ -72,7 +72,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleComplex1() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "88255eda0e29e4a6e128ddb7177a03ab");
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "4f30d9c9f1eb4529071b7060e497235d");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
@ -96,7 +96,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleGGAComplex() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
"8f8680bd8e1549ad88691c9c8af9977c");
"558820f3b67f4434a41e0cb96b6469c7");
}
@Test
@ -114,7 +114,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleConsensusModeComplex() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538 -L 20:133041-133161 -L 20:300207-300337",
"353f1895047b15b1fec22b559c9da0c1");
"47894766b0ce7d4aecd89e4938ac1c85");
}
}

View File

@ -51,6 +51,7 @@
package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.io.FileUtils;
import org.apache.log4j.Level;
import org.broadinstitute.gatk.engine.GATKVCFUtils;
@ -62,10 +63,14 @@ import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
@ -84,12 +89,12 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
//TODO this might need to be addressed at some point.
//TODO the following test is commented out for the record
//tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "7f09c261950bf86e435edfa69ed2ec71"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8d30370465d74fd549d76dd31adc4c0c"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "b7a5f4e40d5ebaf5f6c46a3d4355c817"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "0f5e6f2584649a1b7386d94e3dc60f91"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "c64e8f169b40dfcdac5bea71156753b5"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "2cc9f789100e138ffc0c383b12a1322a"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "44cc8f78e28d905efc30c218d821cc7c"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "2e81881e92061ad4eb29025ffdc129c7"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "2c67bdc08c8784f2114c2039270b9766"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "63fa5841a21e2c13f1e1a8e2d4ea3380"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "39b2ad53ffdfcbaa4af3454c321daaa7"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "861fa31b135d200f765914126b422cf4"});
return tests.toArray(new Object[][]{});
}
@ -103,13 +108,13 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals;
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "3ae2c7e570855f6d6ca58ddd1089a970"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "22e03f01e91177011ac028d2347751ba"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "cb3f16bc10e1cc75f2093bec92145d18"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "f2807ff921854059746da2954dc44a7b"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "d146c8dc4fc0605b3776ab5fec837d53"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "c317193f0d1c9a8168f2625c8bf1dd2b"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "63ff771eed3e62340c8938b4963d0add"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "1122a0b3849f42d1c4a654f93b660e1b"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "c6c19ff9dc229f6af6080a175267344c"});
final String NA12878bandedResolutionMD5 = "8d4a51af32cd13ba4b3e33dd00c58398";
final String NA12878bandedResolutionMD5 = "7240907ec3dc2ed49b55c9956546ba13";
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, NA12878bandedResolutionMD5});
tests.add(new Object[]{NA12878_WEx + " -I " + privateTestDir + "NA20313.highCoverageRegion.bam -sn NA12878",
ReferenceConfidenceMode.GVCF, WExIntervals, NA12878bandedResolutionMD5});
@ -126,12 +131,12 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals;
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8bf132d73cf6b0851ae73c6799f19ba9"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "90b25f3050435c9e67aa0ee325c24167"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "5f329540dc5c4556ab029d0e2cfcabcb"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "126527c225d24a2a0bb329ad9b3f682a"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "6c727b804084a2324ecd1c98b72734b9"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "190cef14684c95ba290d7a5fa13fdc07"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "6ad7855dbf6dda2060aa93a3ee010b3e"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "a0be095ed902a8acdb80fb56ca4e8fb4"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "8123d8b68b6fa77ef084f292e191622a"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "21c87a3edafee3cb080169963e1e2623"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "e48bbcf453e63a6ea5eeda05f6865f94"});
return tests.toArray(new Object[][]{});
}
@ -144,17 +149,16 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals;
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "6662cfc41393257dfd6c39f1af1e3843"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "70ee4e60d9f86b63aaab09075a71ddd3"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "700d79df3b0b481444e81471204e242e"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8e17f26d07fbba596d3cfd2e344c4cd2"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "48521b89cecceb9846e4dfc0dd415874"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "eaacbeaff99a37ffa07e1f11e7f1deb2"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "af0fe243e3b96e59097187cd16ba1597"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "228e1d2ec2e729a5f79c37f3f2557708"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "2fc7020457dde4439b4133c098d9ab9b"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "868a097a8a108f5159dbbabbfdb2e38b"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "685025831ac783784d7838e568e35f46"});
return tests.toArray(new Object[][]{});
}
/**
* Test HaplotypeCaller, using MyDataProvider
*/
@ -234,15 +238,19 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
}
/**
* Test HaplotypeCaller to ensure it does not throw an exception when a .g.vcf.gz output file is specified and the indexing arguments are omitted
* Test HaplotypeCaller to ensure it does not throw an exception when a .g.vcf.gz output file is specified and the indexing arguments are omitted.
* Verify that the output file is using the GZIP file format.
*/
@Test()
public void testGVCFGzIndexNoThrow() {
public void testGVCFGzIndexNoThrow() throws IOException {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000000-17000100");
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(GATKVCFUtils.GVCF_GZ_EXT), Arrays.asList(""));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine, Arrays.asList(""));
final File outputFile = createTempFile("testGVCFGzIndexNoThrow", "." + GATKVCFUtils.GVCF_GZ_EXT);
spec.setOutputFileLocation(outputFile);
spec.disableShadowBCF();
executeTest("testGVCFIndexNoThrow", spec);
executeTest("testGVCFGzIndexNoThrow", spec);
final GZIPInputStream gzipOutputFileStream = new GZIPInputStream(new FileInputStream(outputFile));
}
@Test()
@ -275,7 +283,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
/**
 * Regression test: HaplotypeCaller must emit GVCF non-variant records in the correct order
 * (md5-compared against a recorded baseline).
 */
public void testWrongGVCFNonVariantRecordOrderBugFix() {
    final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
            HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, WRONG_GVCF_RECORD_ORDER_BUGFIX_BAM, WRONG_GVCF_RECORD_ORDER_BUGFIX_INTERVALS, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
    final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("f70b7052dfeb065ee8c7d796f1a1f84a"));
    spec.disableShadowBCF();
    // Label the test with its own name (was a copy-paste of "testMissingGVCFIndexingStrategyException").
    executeTest("testWrongGVCFNonVariantRecordOrderBugFix", spec);
}
@ -292,7 +300,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testNoCallGVCFMissingPLsBugFix() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, NOCALL_GVCF_BUGFIX_BAM, NOCALL_GVCF_BUGFIX_INTERVALS, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("d55ccf214fd5095e6d586c1547cb1a7a"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("883fdc6c10fd7cbc1de375ed26ce5734"));
spec.disableShadowBCF();
executeTest("testNoCallGVCFMissingPLsBugFix", spec);
}
@ -325,7 +333,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testAlleleSpecificAnnotations() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("6f6b2fa85cd1bae7f8f72e144fe56c96"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("5877ccbc99bbaffbcd5fe3aaa3d7e7f7"));
spec.disableShadowBCF();
executeTest(" testAlleleSpecificAnnotations", spec);
}
@ -334,7 +342,16 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testASMQMateRankSumAnnotation() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A AS_MQMateRankSumTest --disableDithering",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("9613ec1ec93547cfb0651673e914bee4"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("0381fec3b0d21508b28fa62c2a61ccfc"));
spec.disableShadowBCF();
executeTest(" testASMQMateRankSumAnnotation", spec);
}
/**
 * Exercises the BetaTesting annotation group ({@code -G BetaTesting}) end-to-end in GVCF mode,
 * md5-comparing the output against a recorded baseline.
 */
@Test
public void testBetaTestingAnnotationGroup() {
    final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G BetaTesting --disableDithering",
            HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
    final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("f248a6c4a7645dc5cc9f5ec9f81d9ad5"));
    spec.disableShadowBCF();
    // Fixed copy-paste: this test previously reported itself as " testASMQMateRankSumAnnotation".
    executeTest("testBetaTestingAnnotationGroup", spec);
}
@ -343,7 +360,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testASInsertSizeRankSum() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering -A AS_InsertSizeRankSum",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("a8765c11b9130c815aae4e06c1f90e45"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("4599a591427c188c117f09ac40cc866f"));
spec.disableShadowBCF();
executeTest(" testASInsertSizeRankSum", spec);
}
@ -352,7 +369,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testHaplotypeCallerMultiAllelicNonRef() {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A StrandAlleleCountsBySample",
b37KGReference, privateTestDir + "multiallelic-nonref.bam", "2:47641259-47641859", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("1d9e75bd09a6fc5a1d9156fe8a7d43ce"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("7c707c66f77482e3f6b2b014b152bbf4"));
spec.disableShadowBCF();
executeTest(" testHaplotypeCallerMultiAllelicNonRef", spec);
}
@ -361,7 +378,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testHaplotypeCallerMaxNumPLValues() {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 70",
b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("a4b5c40b1993573c5efd992f3f0db8a9"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("dd0bfade4f0a9f23a500fd23c3a24a29"));
spec.disableShadowBCF();
executeTest("testHaplotypeCallerMaxNumPLValues", spec);
}
@ -378,7 +395,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 30 -log %s",
b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals",
GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER, logFileName);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("19f5398e4013c06b52c0085fe0b3469e"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("5ea45f32c09e9f7a4132f98d642f260b"));
spec.disableShadowBCF();
executeTest("testHaplotypeCallerMaxNumPLValuesExceededWithWarnLogLevel", spec);
// Make sure the "Maximum allowed number of PLs exceeded" messages are in the log
@ -403,7 +420,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 30 -log %s",
b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals",
GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER, logFileName);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("19f5398e4013c06b52c0085fe0b3469e"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("5ea45f32c09e9f7a4132f98d642f260b"));
spec.disableShadowBCF();
executeTest("testHaplotypeCallerMaxNumPLValuesExceededWithDebugLogLevel", spec);
// Make sure the "Maximum allowed number of PLs exceeded" messages are in the log
@ -413,4 +430,42 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
// Set the log level back
logger.setLevel(level);
}
// Regression test for https://github.com/broadinstitute/gsa-unstable/issues/1345:
// GVCF block boundaries must be stable (md5-compared against a recorded baseline).
@Test
public void testHaplotypeCallerGVCFBlocks() {
    final String inputBam = privateTestDir + "gvcf_blocks_test.bam";
    final String expectedMd5 = "0cdf4d6d0a45def15fb11ea30c78e470";
    final String commandLine = String.format(
            "-T HaplotypeCaller -R %s -I %s -L 1:1-1000000 -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
            b37KGReference, inputBam, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
    final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(expectedMd5));
    spec.disableShadowBCF();
    executeTest("testHaplotypeCallerGVCFBlocks", spec);
}
/**
 * Invalid {@code -GQB} band-boundary lists for {@link #testBadGQBValues}: a negative value,
 * a non-ascending sequence, a duplicate boundary, and a value above the maximum genotype quality.
 */
@DataProvider(name = "dataBadGQBValues")
public Object[][] dataBadGQBValues() {
    final List<List<Integer>> badBoundaryLists = Arrays.asList(
            Arrays.asList(-1, 10, 20),
            Arrays.asList(10, 20, 1),
            Arrays.asList(10, 10, 20),
            Arrays.asList(10, 20, VCFConstants.MAX_GENOTYPE_QUAL + 2));
    final Object[][] rows = new Object[badBoundaryLists.size()][];
    for (int i = 0; i < badBoundaryLists.size(); i++) {
        rows[i] = new Object[]{ badBoundaryLists.get(i) };
    }
    return rows;
}
/**
 * Each invalid -GQB boundary list from {@code dataBadGQBValues} must be rejected up front
 * with a {@link UserException.BadArgumentValue}.
 */
@Test(dataProvider = "dataBadGQBValues")
public void testBadGQBValues(final List<Integer> inputGQBValues) {
    // Render the boundaries as space-separated "-GQB <value>" arguments.
    final StringBuilder gqbArgs = new StringBuilder();
    for (final Integer gqb : inputGQBValues) {
        if (gqbArgs.length() > 0) {
            gqbArgs.append(' ');
        }
        gqbArgs.append("-GQB ").append(gqb);
    }
    final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L 1:1-1000000 -ERC GVCF %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
            b37KGReference, privateTestDir + "gvcf_blocks_test.bam", gqbArgs.toString(), GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
    final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.BadArgumentValue.class);
    spec.disableShadowBCF();
    executeTest("testBadGQBValues", spec);
}
/**
 * Runs HaplotypeCaller in GVCF mode on a single site (1:26357667) with both the allele-specific
 * and classic ReadPosRankSum annotations enabled, and md5-compares the output against a recorded
 * baseline. Based on the input name, the site presumably overlaps a spanning deletion — TODO confirm.
 */
@Test
public void testHaplotypeCallerGVCSpanDel() {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L 1:26357667 -ERC GVCF --no_cmdline_in_header -A AS_ReadPosRankSumTest -A ReadPosRankSumTest -variant_index_type %s -variant_index_parameter %d",
b37KGReference, privateTestDir + "NexPond-377866-1:26357600-26357700.bam", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
// md5 of the expected GVCF output; update only for a verified, intended behavior change
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("bb12cf2dfa6f1fa0692395e295792584"));
spec.disableShadowBCF();
executeTest("testHaplotypeCallerGVCSpanDel", spec);
}
}

View File

@ -59,10 +59,15 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.variant.variantcontext.*;
import org.broadinstitute.gatk.tools.walkers.genotyper.*;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.*;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.gatk.utils.genotyper.AlleleList;
import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList;
import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList;
import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.haplotype.EventMap;
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
@ -439,12 +444,12 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest {
@Test(dataProvider="ConstructPhaseSetMappingProvider")
public void testConstructPhaseSetMapping(final List<VariantContext> calls,
final Map<VariantContext, Set<Haplotype>> haplotypeMap,
final int totalHaplotypes,
final int expectedMapSize,
final int expectedNumGroups,
final int expectedNum01,
final int expectedNum10) {
final Map<VariantContext, Set<Haplotype>> haplotypeMap,
final int totalHaplotypes,
final int expectedMapSize,
final int expectedNumGroups,
final int expectedNum01,
final int expectedNum10) {
final Map<VariantContext, Pair<Integer, String>> actualPhaseSetMapping = new HashMap<>();
final int actualNumGroups = HaplotypeCallerGenotypingEngine.constructPhaseSetMapping(calls, haplotypeMap, totalHaplotypes, actualPhaseSetMapping);
Assert.assertEquals(actualNumGroups, expectedNumGroups);
@ -531,4 +536,121 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest {
Assert.assertEquals(uniqueGroups.size(), expectedNumGroups);
Assert.assertEquals(counter, expectedGroupSize);
}
/**
 * Verifies that HaplotypeCallerGenotypingEngine.excessAlternativeAlleles never flags the
 * reference allele as "excess", even when the ref is not the first allele in the allele list.
 * With 3 alts and a cap of 2, exactly one alt (and never the ref) must be returned.
 */
@Test
public void testExcessAlternativeAllelesKeepRef(){
// prep data
final Allele ref = Allele.create("A", true);
final Allele altC = Allele.create("C", false);
final Allele altG = Allele.create("G", false);
final Allele altT = Allele.create("T", false);
final AlleleList<Allele> indexedAlleleList = new IndexedAlleleList<>(altC, altG, altT, ref);// specifically make the ref allele not at index 0
final IndexedSampleList indexedSampleList = new IndexedSampleList("Dummy");
// 10 random reads for the single sample; read content is irrelevant to this test
final List<GATKSAMRecord> reads = new ArrayList<>();
for (int i=0; i<10; ++i) {
reads.add(GATKSAMRecord.createRandomRead(101));
}
final Map<String, List<GATKSAMRecord>> sampleToReads = Collections.singletonMap(indexedSampleList.sampleAt(0), reads);
final ReadLikelihoods<Allele> readLikelihoods = new ReadLikelihoods<>(indexedSampleList, indexedAlleleList, sampleToReads);
// diploid, single-sample genotyping model to produce genotype likelihoods for the 4 alleles
final PloidyModel ploidyModel = new HomogeneousPloidyModel(indexedSampleList, 2);
final GenotypingModel genotypingModel = new InfiniteRandomMatingPopulationModel();
final GenotypingLikelihoods<Allele> genotypeLikelihoods = genotypingModel.calculateLikelihoods(readLikelihoods, new GenotypingData<>(ploidyModel, readLikelihoods));
// test: cap of 2 alt alleles -> one of the 3 alts is excess, and it must not be the ref
final Set<Allele> excessAltAlleles = HaplotypeCallerGenotypingEngine.excessAlternativeAlleles(genotypeLikelihoods, 2);
Assert.assertFalse(excessAltAlleles.contains(ref));
Assert.assertEquals(excessAltAlleles.size(), 1);
}
/**
 * Verifies that HaplotypeCallerGenotypingEngine.whichAllelesToKeepBasedonHapScores trims the
 * allele set down to a requested maximum, always retaining the reference allele and preferring
 * alt alleles whose supporting haplotypes have the best (then second-best) scores.
 */
@Test
public void testReduceNumberOfAlternativeAllelesBasedOnHaplotypesScores(){
    // first have a list of alleles, one ref, several alt
    final Allele ref = Allele.create("A", true);
    final Allele altC = Allele.create("C", false);
    final Allele altT = Allele.create("T", false);
    final Allele altT2 = Allele.create("TT", false);
    final Allele altG = Allele.create("G", false);

    // then create several haplotypes with ad-hoc scores; the ref haplotype gets MAX_VALUE so it always ranks first
    final Haplotype hapRef = new Haplotype("AAAAA".getBytes());
    hapRef.setScore(Double.MAX_VALUE);

    // test case when both the best score and the second best score are the same (altT vs altT2)
    final Haplotype hapT = new Haplotype("TAAAA".getBytes());
    hapT.setScore(-2.0);
    final Haplotype hapTAnother = new Haplotype("TAAAT".getBytes());
    hapTAnother.setScore(-3.0);
    final Haplotype hapT2 = new Haplotype("TTAAA".getBytes());
    hapT2.setScore(-2.0);
    final Haplotype hapT2Another = new Haplotype("TTAAT".getBytes());
    hapT2Another.setScore(-3.0);

    final Haplotype hapC = new Haplotype("CAAAA".getBytes());
    hapC.setScore(-3.0);
    // for case when there's a tie in the highest haplotype score (altC vs altG)
    final Haplotype hapG = new Haplotype("GAAAA".getBytes());
    hapG.setScore(-3.0);
    final Haplotype hapGAnother = new Haplotype("GAAAG".getBytes());
    hapGAnother.setScore(-5.0);

    final Map<Allele, List<Haplotype>> alleleMapper = new LinkedHashMap<>();
    alleleMapper.put(ref, Arrays.asList(hapRef));
    alleleMapper.put(altC, Arrays.asList(hapC));
    alleleMapper.put(altT, Arrays.asList(hapT, hapTAnother));
    alleleMapper.put(altT2, Arrays.asList(hapT2, hapT2Another));
    alleleMapper.put(altG, Arrays.asList(hapG, hapGAnother));

    assertKeptAlleles(alleleMapper, 5, ref, altC, altT, altT2, altG); // nothing trimmed
    assertKeptAlleles(alleleMapper, 4, ref, altT, altT2, altG);       // altC dropped first
    assertKeptAlleles(alleleMapper, 3, ref, altT, altT2);
    assertKeptAlleles(alleleMapper, 2, ref, altT);
    assertKeptAlleles(alleleMapper, 1, ref);                          // ref is always kept
}

/**
 * Asserts that trimming {@code alleleMapper} to at most {@code maxAlleles} alleles keeps exactly
 * {@code expectedAlleles}, in that order.
 */
private static void assertKeptAlleles(final Map<Allele, List<Haplotype>> alleleMapper,
                                      final int maxAlleles,
                                      final Allele... expectedAlleles) {
    final List<Allele> allelesToKeep = HaplotypeCallerGenotypingEngine.whichAllelesToKeepBasedonHapScores(alleleMapper, maxAlleles);
    Assert.assertEquals(allelesToKeep, Arrays.asList(expectedAlleles));
}
/**
 * Verifies that removeExcessAltAllelesFromVC keeps exactly the requested subset of alleles:
 * starting from A(ref)/T/C/G, keeping {A, T, C} must yield a 3-allele context containing them.
 */
@Test
public void testRemoveExcessiveAltAlleleFromVC(){
    final Allele refA = Allele.create("A", true);
    final Allele altT = Allele.create("T", false);
    final Allele altC = Allele.create("C", false);
    final Allele altG = Allele.create("G", false);
    final VariantContext originalVC =
            new VariantContextBuilder("source", "1", 1000000, 1000000, Arrays.asList(refA, altT, altC, altG)).make();
    final List<Allele> allelesToKeep = Arrays.asList(refA, altT, altC);
    final VariantContext reducedVC = HaplotypeCallerGenotypingEngine.removeExcessAltAllelesFromVC(originalVC, allelesToKeep);
    Assert.assertEquals(reducedVC.getNAlleles(), 3);
    Assert.assertTrue(reducedVC.getAlleles().containsAll(allelesToKeep));
}
}

View File

@ -52,6 +52,9 @@
package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.tribble.AbstractFeatureReader;
import htsjdk.tribble.FeatureReader;
import htsjdk.tribble.readers.LineIterator;
import htsjdk.tribble.readers.PositionalBufferedStream;
import htsjdk.variant.variantcontext.VariantContext;
@ -106,7 +109,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeBAMOutFlags() throws IOException {
HCTestWithBAMOut(NA12878_BAM, " -L 20:10000000-10100000 ", "08943fb76d1cd5b5b8815e3991754911", "6a81bbefa6c4ed7a6b8d2c3e0e5a4756");
HCTestWithBAMOut(NA12878_BAM, " -L 20:10000000-10100000 ", "6588123afd06ff6acc9f10ea25250f54", "9d6bd79cdae3e3222fa93f542fbca153");
}
@Test
@ -117,7 +120,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSample() throws IOException {
HCTest(NA12878_BAM, "", "c04293cb8466a1a217bce4ef419bdabe");
HCTest(NA12878_BAM, "", "9f17ce83e639a1bd9b3f2d9fa33b15b2");
}
@Test
@ -132,12 +135,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleTetraploid() throws IOException {
HCTest(NA12878_BAM, "-ploidy 4", "5098645e8b570bc4521570654fa91806");
HCTest(NA12878_BAM, "-ploidy 4", "f993db900080aeb48c43982745e1084d");
}
@Test
public void testHaplotypeCallerMinBaseQuality() throws IOException {
HCTest(NA12878_BAM, "-mbq 15", "c04293cb8466a1a217bce4ef419bdabe");
HCTest(NA12878_BAM, "-mbq 15", "9f17ce83e639a1bd9b3f2d9fa33b15b2");
}
@Test
@ -147,27 +150,27 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMinBaseQualityTetraploid() throws IOException {
HCTest(NA12878_BAM, "-mbq 15 -ploidy 4", "5098645e8b570bc4521570654fa91806");
HCTest(NA12878_BAM, "-mbq 15 -ploidy 4", "f993db900080aeb48c43982745e1084d");
}
@Test
public void testHaplotypeCallerGraphBasedSingleSample() throws IOException {
HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "ba0dc5f416d69558cb5dd3e0a0a5a084");
HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "420954190aef671edd02bd3c73e22642");
}
@Test
public void testHaplotypeCallerGraphBasedMultiSampleHaploid() throws IOException {
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased -ploidy 1", "129bca18bb9eec23004b2d28aa541de2");
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased -ploidy 1", "01220e85ff6bc49e35a325a1df2519e5");
}
@Test
public void testHaplotypeCallerGraphBasedMultiSample() throws IOException {
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "2b89c9e102a049e223bc0d91156a08a3");
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "80c5b0f72a7962e1ba846ec20465001f");
}
@Test
public void testHaplotypeCallerSingleSampleWithDbsnp() throws IOException {
HCTest(NA12878_BAM, "-D " + b37dbSNP132, "ff8e142f491b06e17e64e3a5d59737a7");
HCTest(NA12878_BAM, "-D " + b37dbSNP132, "9e8513ed4065138bee8dd9363a9fd355");
}
@Test
@ -202,12 +205,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "3625167f0e788d409c7eab1898d5eafe");
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "49b8fb444c6f88def2069b8b0efe47c7");
}
private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
try {
final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference));
final ReferenceSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference));
final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary());
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s";
@ -263,7 +266,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestDoesNotFailOnBadRefBase() {
// don't care about the output - just want to make sure it doesn't fail
final String base = String.format("-T HaplotypeCaller --disableDithering -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2 -stand_emit_conf 2";
final String base = String.format("-T HaplotypeCaller --disableDithering -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2";
final WalkerTestSpec spec = new WalkerTestSpec(base, Collections.<String>emptyList());
executeTest("HCTestDoesNotFailOnBadRefBase: ", spec);
}
@ -311,7 +314,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,090,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("b56895e6d28ea0b9dadeecd0ff61687e"));
Arrays.asList("04ff9b301bd6f50df848800fbe09de5c"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@ -320,7 +323,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,100,000-11,000,000 -D " + b37dbSNP132
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1,
Arrays.asList("7b52164df8bf76d789836f990bd6066a"));
Arrays.asList("bf8bb5d13b01facdf90ec24bfbf82faa"));
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
}
@ -328,7 +331,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGSGraphBased() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,090,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("096826325215f79fe70661d984ae45a4"));
Arrays.asList("dbae51c7903e088b2e62cbada6ea2d50"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@ -337,7 +340,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1,
Arrays.asList("ff3b24412090ce7693d66d750ae84ac9"));
Arrays.asList("2ffaf2e9ef293a6d5ce7c00be40edba7"));
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
}
@ -360,7 +363,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestAggressivePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,270,000-10,300,000", 1,
Arrays.asList("c2dab66ad3740320004874c83051bbfc"));
Arrays.asList("8c3ae4dc3d8af2aa8c71deaadb26cc14"));
executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec);
}
@ -368,7 +371,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestConservativePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,270,000-10,300,000", 1,
Arrays.asList("a8ea15ac136042891434ccb0b3c3b686"));
Arrays.asList("61aef3fe9d18eec1df526e99a8456115"));
executeTest("HC calling with conservative indel error modeling on WGS intervals", spec);
}
@ -397,7 +400,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void testLackSensitivityDueToBadHaplotypeSelectionFix() {
final String commandLine = String.format("-T HaplotypeCaller -pairHMMSub %s %s -R %s -I %s -L %s --no_cmdline_in_header --maxNumHaplotypesInPopulation 16",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReferenceWithDecoy, privateTestDir + "hc-lack-sensitivity.bam", privateTestDir + "hc-lack-sensitivity.interval_list");
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("5514cfbcf12954bb12c725b77eaac248"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("5087a8855b3ee9ea1091367674783462"));
spec.disableShadowBCF();
executeTest("testLackSensitivityDueToBadHaplotypeSelectionFix", spec);
}
@ -420,7 +423,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
final String DBSNP = b37dbSNP138;
final String commandLineWithoutInterval = String.format("-T HaplotypeCaller -pairHMMSub %s %s -I %s -R %s -D %s "
+ "-variant_index_type LINEAR -variant_index_parameter 128000 --no_cmdline_in_header "
+ "-stand_call_conf 10.0 -stand_emit_conf 10.0", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, TEST_BAM, REFERENCE, DBSNP);
+ "-stand_call_conf 10.0", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, TEST_BAM, REFERENCE, DBSNP);
final String commandLineShortInterval = commandLineWithoutInterval + " -L " + SHORT_INTERVAL;
final String commandLineLongInterval = commandLineWithoutInterval + " -L " + LONG_INTERVAL;
@ -429,7 +432,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
// but please make sure that both outputs get the same variant,
// alleles all with DBSNP ids
// We test here that change in active region size does not have an effect in placement of indels.
final String md5 = "87b687b5476eb38b11db6a156b4066c8";
final String md5 = "66caceac0a54cdfd847bfdf4226bb36a";
final WalkerTestSpec shortSpec = new WalkerTestSpec(commandLineShortInterval + " -o %s",Arrays.asList(md5));
executeTest("testDifferentIndelLocationsDueToSWExactDoubleComparisonsFix::shortInterval",shortSpec);
final WalkerTestSpec longSpec = new WalkerTestSpec(commandLineLongInterval + " -o %s",Arrays.asList(md5));
@ -483,12 +486,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerTandemRepeatAnnotator() throws IOException{
HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A TandemRepeatAnnotator -XA MappingQualityZero -XA SpanningDeletions", "34328c475325b7dfaa57ab5920478e0c");
HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A TandemRepeatAnnotator -XA MappingQualityZero -XA SpanningDeletions", "408c6940a090d31c11c171ed5e0e033c");
}
@Test
public void testHBaseCountsBySample() throws IOException{
HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A BaseCountsBySample", "f5ad4e03c0faaa806ee6ae536af8a479");
public void testBaseCounts() throws IOException{
HCTest(CEUTRIO_BAM, "-A BaseCountsBySample -A BaseCounts", "40def0e9c06031d6b624a22a093574c0");
}
@Test
@ -500,5 +503,47 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList(md5));
executeTest("testSetZeroGQsToNoCall", spec);
}
/**
 * Runs HaplotypeCaller with -bamout over a single padded site (1:3753063 +/- 100bp) and
 * md5-compares BOTH outputs: the variant calls and the realigned BAM. Based on the input
 * name, this presumably pins ReadPosRankSum behavior at that site — TODO confirm.
 */
@Test
public void testHaplotypeCallerReadPosRankSum() throws IOException {
final File testBAM = new File(privateTestDir + "testReadPos.snippet.bam");
// expected md5s: first for the VCF output (-o), second for the -bamout BAM
final String md5Variants = "03b3c464f22a3572f7d66890c18bdda4";
final String md5BAMOut = "2e0843f6e8e90c407825e9c47ce4a32d";
final String base = String.format("-T HaplotypeCaller -R %s -I %s -L 1:3753063 -ip 100 ", REF, testBAM) +
" --no_cmdline_in_header -o %s -bamout %s";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList(md5Variants, md5BAMOut));
executeTest("testHaplotypeCallerReadPosRankSum", spec);
}
/**
 * End-to-end check of alt-allele trimming by haplotype score: a tetraploid (-ploidy 4) call on a
 * tetra-allelic site with -maxGT 15 forces the engine to drop alleles, and the output is
 * md5-compared against a recorded baseline.
 * NOTE(review): the method name misspells "Haplotype" as "Haptype"; renaming would change the
 * recorded test label, so it is left as-is.
 */
@Test
public void testHaplotypeCallerRemoveAltAlleleBasedOnHaptypeScores() {
final File testBAM = new File(privateTestDir + "pretendTobeTetraPloidTetraAllelicSite.bam");
// md5 of the expected VCF output; update only for a verified, intended behavior change
final String md5 = "289304f56833ea76b60cd08763b0f68b";
final String base = String.format("-T HaplotypeCaller -R %s -I %s -L 20:11363580-11363600 -ploidy 4 -maxGT 15 ", REF, testBAM) +
" --no_cmdline_in_header -o %s";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList(md5));
executeTest("testHaplotypeCallerRemoveAltAlleleBasedOnHaptypeScores", spec);
}
@Test
public void testAlleleBalance() throws IOException{
HCTest(CEUTRIO_BAM, " -L 20:10001000-10010000 -A AlleleBalance -A AlleleBalanceBySample", "a210161843f4cb80143ff56e4e5c250f");
}
@Test()
public void testBCFInFileNameGivesVCF() {
final String md5 = "d41d8cd98f00b204e9800998ecf8427e";
final String commandLine = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --maxReadsInRegionPerSample 1000" +
" --minReadsPerAlignmentStart 5 --maxProbPropagationDistance 50 --activeProbabilityThreshold 0.002 --pcr_indel_model NONE" +
" -pairHMMSub %s %s -R %s -I %s -L %s -minPruning 3 --no_cmdline_in_header",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, NA12878_BAM, "20:10000000-10100000");
final WalkerTestSpec spec = new WalkerTestSpec(commandLine, Arrays.asList(md5));
final File outputFile = createTempFile("testBCFInFileNameGivesVCF", ".bcftoolsFile.vcf");
spec.setOutputFileLocation(outputFile);
spec.disableShadowBCF();
executeTest("testBCFInFileNameGivesVCF", spec);
// Will throw an exception if the file in not a VCF
FeatureReader<VariantContext> reader = AbstractFeatureReader.getFeatureReader(outputFile.getAbsolutePath(), new VCFCodec(), false);
}
}

View File

@ -69,7 +69,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
List<Object[]> tests = new ArrayList<>();
for ( final int nct : Arrays.asList(1, 2, 4) ) {
tests.add(new Object[]{nct, "07f969acede5e0ad7e1e94f4383af2a9"});
tests.add(new Object[]{nct, "da195c6c4c8e765acb35f08e37132108"});
}
return tests.toArray(new Object[][]{});

View File

@ -51,11 +51,11 @@
package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
import org.broadinstitute.gatk.utils.GenomeLoc;
@ -80,7 +80,7 @@ import java.util.*;
public class LocalAssemblyEngineUnitTest extends BaseTest {
private GenomeLocParser genomeLocParser;
private IndexedFastaSequenceFile seq;
private ReferenceSequenceFile seq;
private SAMFileHeader header;
@BeforeClass

View File

@ -51,9 +51,8 @@
package org.broadinstitute.gatk.tools.walkers.indels;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.UnvalidatingGenomeLoc;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
@ -76,7 +75,7 @@ public class PairHMMIndelErrorModelUnitTest extends BaseTest {
@BeforeClass
public void setup() throws FileNotFoundException {
final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary());
}

View File

@ -64,6 +64,9 @@ import java.util.Arrays;
import java.util.List;
public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
private static String TRAINING_VCF = comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf";
private static String TRUTH_VCF = comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_genotypes_2141_samples.b37.vcf";
private static class VRTest {
String inVCF;
String aggregateVCF;
@ -93,15 +96,15 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
}
VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf",
"41e2d951a17de433fe378bb3d9ec75d4", // tranches
"3fe87e69c6a613addb7eff5449e86aa1", // recal file
"78b8f1934d77341df2f6a9fdbd30fa74"); // cut VCF
"3ccb3aa81aebee74d32641105a64ea32", // tranches
"1a87e9cdc66c53891eab61ab39ff2434", // recal file
"217ee1523b6ddaf31f0eb0464b89bab6"); // cut VCF
VRTest lowPassPlusExomes = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf",
validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf",
"ce4bfc6619147fe7ce1f8331bbeb86ce", // tranches
"5a298554e9175961f63506c4e42ea78b", // recal file
"f284c0cbb00407cc5273c6f1a871513e"); // cut VCF
"be89401e09dd06817c43f152c789f854", // tranches
"8ce11e7555cccb3f13ea34a9074aec00", // recal file
"c09c2425744e8d914d69a2585dba0e97"); // cut VCF
@DataProvider(name = "VRTest")
public Object[][] createData1() {
@ -119,8 +122,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b37KGReference +
" -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" +
" -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" +
" -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" +
" -resource:truth=true,training=true,prior=15.0 " + TRAINING_VCF +
" -resource:training=true,truth=true,prior=12.0 " + TRUTH_VCF +
" -T VariantRecalibrator" +
" -input " + params.inVCF +
" -L 20:1,000,000-40,000,000" +
@ -159,8 +162,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b37KGReference +
" -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" +
" -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" +
" -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" +
" -resource:truth=true,training=true,prior=15.0 " + TRAINING_VCF +
" -resource:training=true,truth=true,prior=12.0 " + TRUTH_VCF +
" -T VariantRecalibrator" +
" -input " + params.inVCF +
" -aggregate " + params.aggregateVCF +
@ -210,8 +213,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b37KGReference +
" -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" +
" -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" +
" -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" +
" -resource:truth=true,training=true,prior=15.0 " + TRAINING_VCF +
" -resource:training=true,truth=true,prior=12.0 " + TRUTH_VCF +
" -T VariantRecalibrator" +
" -input " + params.inVCF +
" -L 20:10,000,000-20,000,000" +

View File

@ -66,7 +66,6 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
@ -79,175 +78,199 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
return baseTestString(" -V " + privateTestDir + "gvcf.basepairResolution.vcf " + args, b37KGReference);
}
@Test(enabled = true)
@Test
public void testUpdatePGT() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf", b37KGReference),
1,
Arrays.asList("8d9788afd0de26bd9d9e55dd0e9fc3ed"));
Collections.singletonList("cdff1a18cd820c9d9c2b5b05ab7ef8a9"));
executeTest("testUpdatePGT", spec);
}
@Test(enabled = true)
@Test
public void testUpdatePGTStrandAlleleCountsBySample() throws IOException{
final String logFileName = new String("testUpdatePGTStrandAlleleCountsBySample.log");
WalkerTestSpec spec = new WalkerTestSpec(
final String logFileName = "testUpdatePGTStrandAlleleCountsBySample.log";
final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf -A StrandAlleleCountsBySample -log " + logFileName, b37KGReference),
1,
Arrays.asList("5dd4698da963a423446bb1e183eb75aa"));
Collections.singletonList("7a459c5ff606239620e5f7b089186dfb"));
executeTest("testUpdatePGTStrandAlleleCountsBySample", spec);
File file = new File(logFileName);
final File file = new File(logFileName);
Assert.assertTrue(FileUtils.readFileToString(file).contains(AnnotationUtils.ANNOTATION_HC_WARN_MSG));
}
@Test(enabled = true)
@Test
public void combineSingleSamplePipelineGVCF() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" +
" -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" +
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-11,000,000", b37KGReference),
1,
Arrays.asList("c9edd4ca8c2801c4681322087d82e781"));
Collections.singletonList("7b2a135e694f9d1190e041e6fd420123"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@Test(enabled = true)
@Test
public void testTetraploidRun() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V:sample1 " + privateTestDir + "tetraploid-gvcf-1.vcf" +
" -V:sample2 " + privateTestDir + "tetraploid-gvcf-2.vcf" +
" -V:sample3 " + privateTestDir + "tetraploid-gvcf-3.vcf" +
" -L " + privateTestDir + "tetraploid-gvcfs.intervals", b37KGReference),
1,
Arrays.asList("64fa89f20ee25df21ad20ce4ada7e7ad"));
executeTest("combineSingleSamplePipelineGVCF", spec);
Collections.singletonList("d6ef5e411ac5829a12d825a0fefac883"));
executeTest("testTetraploidRun", spec);
}
@Test(enabled= true)
@Test
public void testMixedPloidyRun() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V:sample1 " + privateTestDir + "haploid-gvcf-1.vcf" +
" -V:sample2 " + privateTestDir + "tetraploid-gvcf-2.vcf" +
" -V:sample3 " + privateTestDir + "diploid-gvcf-3.vcf" +
" -L " + privateTestDir + "tetraploid-gvcfs.intervals", b37KGReference),
1,
Arrays.asList("b1d93f4cd93093c208be2c9842f38d12"));
executeTest("combineSingleSamplePipelineGVCF", spec);
Collections.singletonList("b497f16cd9eb99e353d9430fe7f34635"));
executeTest("testMixedPloidyRun", spec);
}
@Test(enabled = true)
public void combineSingleSamplePipelineGVCF_includeNonVariants() {
@Test(enabled= true)
public void testMixedPloidyMaxNumPLValuesRun() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V:sample1 " + privateTestDir + "haploid-gvcf-1.vcf" +
" -V:sample2 " + privateTestDir + "tetraploid-gvcf-2.vcf" +
" -V:sample3 " + privateTestDir + "diploid-gvcf-3.vcf" +
" -L " + privateTestDir + "tetraploid-gvcfs.intervals" +
" -maxNumPLValues 3", b37KGReference),
1,
Collections.singletonList("8c8ebe2069977ba13024a95827c6c50d"));
executeTest("testMixedPloidyMaxNumPLValuesRun", spec);
}
@Test
public void combineSingleSamplePipelineGVCF_includeNonVariants() {
final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" +
" -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" +
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" --includeNonVariantSites -L 20:10,030,000-10,033,000 -L 20:10,386,000-10,386,500", b37KGReference),
1,
Arrays.asList("c2f30f25ba4a84e38c04aa49b95694e8"));
Collections.singletonList("a9ecd152ec4b5b541887a0aed016f40d"));
spec.disableShadowBCF();
executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec);
}
@Test(enabled = true)
@Test
public void combineSingleSamplePipelineGVCFHierarchical() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V " + privateTestDir + "combine.single.sample.pipeline.combined.vcf" +
" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" +
" -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" +
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-20,000,000", b37KGReference),
1,
Arrays.asList("f48114bc6348cdc9dc4f0960f5dcf5f8"));
Collections.singletonList("8a37077d9a52b9d556cdd19403e27635"));
executeTest("combineSingleSamplePipelineGVCFHierarchical", spec);
}
@Test(enabled = true)
@Test
public void combineSingleSamplePipelineGVCF_addDbsnp() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" +
" -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" +
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference),
1,
Arrays.asList("f88841deb5c2ce4f3bbea1e914a13898"));
Collections.singletonList("181fcb5d240b9bd92e3c793ca5aa7954"));
executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec);
}
@Test(enabled = true)
@Test
public void combineBPresGVCFs() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V " + privateTestDir + "NA12891.BPres.g.vcf" +
" -V " + privateTestDir + "NA12892.BPres.g.vcf" +
" -L 20:10433000-10436909", b37KGReference),
1,
Arrays.asList("f342872f485e6978501facc78c354078"));
Collections.singletonList("f342872f485e6978501facc78c354078"));
executeTest("combineBPresGVCFs", spec);
}
@Test(enabled = true)
@Test
public void testJustOneSample() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -L 1:69485-69791 -o %s -R " + b37KGReference +
" -V " + privateTestDir + "gvcfExample1.vcf",
1,
Arrays.asList("9ff344a5ab87a2c3b128e435e2e86db0"));
Collections.singletonList("df88bbf2eea39a06f2bcc47d9379e5fa"));
executeTest("testJustOneSample", spec);
}
@Test(enabled = true)
@Test
public void testSamplesWithDifferentLs() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -L 1:69485-69791 -o %s -R " + b37KGReference +
" -V " + privateTestDir + "gvcfExample1.vcf" +
" -V " + privateTestDir + "gvcfExample2.vcf",
1,
Arrays.asList("0c07ed795562ea96eab427e63a970384"));
Collections.singletonList("933c3ec48870c54f7f74b259272d6645"));
executeTest("testSamplesWithDifferentLs", spec);
}
@Test(enabled = true)
@Test
public void testNoPLsException() {
// Test with input files with (1) 0/0 and (2) ./.
final String md5 = "2f3d71272fdac19ac861cc7159edfb08";
WalkerTestSpec spec1 = new WalkerTestSpec(
final String md5 = "91038469a8133feb05038528f8565840";
final WalkerTestSpec spec1 = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -L 1:1115550-1115551 -o %s -R " + hg19Reference +
" --variant " + privateTestDir + "combined_genotype_gvcf_exception.vcf",
1,
Arrays.asList(md5));
WalkerTestSpec spec2 = new WalkerTestSpec(
Collections.singletonList(md5));
final WalkerTestSpec spec2 = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -L 1:1115550-1115551 -o %s -R " + hg19Reference +
" --variant " + privateTestDir + "combined_genotype_gvcf_exception.nocall.vcf",
1,
Arrays.asList(md5));
Collections.singletonList(md5));
executeTest("testNoPLsException.1", spec1);
executeTest("testNoPLsException.2", spec2);
}
@Test
public void testNDA() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
baseBPResolutionString("-nda"),
1,
Arrays.asList("ce064429e6cbcaa956d52ef22e102f2f"));
Collections.singletonList("b92742a67c5b1718fb5674393b2008fd"));
executeTest("testNDA", spec);
}
@Test
public void testAllSitesNonBiallelic() {
final WalkerTestSpec spec = new WalkerTestSpec(
baseBPResolutionString("-allSites"),
1,
Collections.singletonList("77ae407c276aa7070fa22b110160c684"));
spec.disableShadowBCF();
executeTest("testAllSitesNonBiallelic", spec);
}
@Test
public void testMaxAltAlleles() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
baseBPResolutionString("-maxAltAlleles 1"),
1,
Arrays.asList("1f1c0605fc8a500c9646132e0d7420a0"));
Collections.singletonList("312f01d41dff851f449d9b4ecf0e78d2"));
executeTest("testMaxAltAlleles", spec);
}
@Test
public void testStandardConf() {
WalkerTestSpec spec = new WalkerTestSpec(
baseBPResolutionString("-stand_call_conf 300 -stand_emit_conf 100"),
final WalkerTestSpec spec = new WalkerTestSpec(
baseBPResolutionString("-stand_call_conf 300"),
1,
Arrays.asList("0283e784ed49bc2dce32a26137c43409"));
Collections.singletonList("89582b2a32e3f04d3daa565a0c185003"));
executeTest("testStandardConf", spec);
}
@ -261,7 +284,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
"--no_cmdline_in_header -o %s -L 20:10130000-10134800 " +
"-ERC GVCF --sample_name NA12878 -variant_index_type LINEAR " +
"-variant_index_parameter 128000 -A StrandAlleleCountsBySample",
1, Arrays.asList("")
1, Collections.singletonList("")
);
specHaplotypeCaller.disableShadowBCF(); //TODO: Remove when BaseTest.assertAttributesEquals() works with SAC
final File gVCF = executeTest("testStrandAlleleCountsBySampleHaplotypeCaller", specHaplotypeCaller).getFirst().get(0);
@ -270,7 +293,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V " + gVCF.getAbsolutePath(), b37KGReference),
1,
Arrays.asList("34d76dc8dabc6a97e6d8f5365d7531e5"));
Collections.singletonList("f613d0bc0d45aafe53227bc0d13712f1"));
spec.disableShadowBCF(); //TODO: Remove when BaseTest.assertAttributesEquals() works with SAC
executeTest("testStrandAlleleCountsBySample", spec);
}
@ -278,7 +301,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
@Test
public void testUniquifiedSamples() throws IOException {
//two copies of 5 samples; will also test InbreedingCoeff calculation for uniquified samples
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" +
" -V:sample1B " + privateTestDir + "combine.single.sample.pipeline.1.vcf" +
" -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" +
@ -287,7 +310,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:combined2 " + privateTestDir + "combine.single.sample.pipeline.combined.vcf" +
" --uniquifySamples", b37KGReference),
1,
Arrays.asList("16d7374502fa3cf99863d15d31b5ef86"));
Collections.singletonList("0c99b1b20fb035a5dada036bd4cf39e5"));
executeTest("testUniquifiedSamples", spec);
}
@ -308,7 +331,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
final LineIterator lineIteratorVCF = codec.makeSourceFromStream(new PositionalBufferedStream(s));
codec.readHeader(lineIteratorVCF);
List<String> attributeValues = new ArrayList<String>();
final List<String> attributeValues = new ArrayList<String>();
while (lineIteratorVCF.hasNext()) {
final String line = lineIteratorVCF.next();
Assert.assertFalse(line == null);
@ -339,7 +362,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + gvcf1 + " -V " + gvcf2 + " -V " + gvcf3,
1,
Arrays.asList(""));
Collections.singletonList(""));
genotypeBase.disableShadowBCF();
final File genotypeBaseVCF = executeTest("genotypeBase", genotypeBase).getFirst().get(0);
final List<VariantContext> BASE_VARIANT_CONTEXTS = getVariantContexts(genotypeBaseVCF);
@ -349,14 +372,14 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + gvcf1 + " -V " + gvcf2,
1,
Arrays.asList(""));
Collections.singletonList(""));
combine12.disableShadowBCF();
final File combined_gVCF12 = executeTest("combine12", combine12).getFirst().get(0);
final WalkerTestSpec genotype12_3 = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + combined_gVCF12.getAbsolutePath() + " -V " + gvcf3,
1,
Arrays.asList(""));
Collections.singletonList(""));
genotype12_3.disableShadowBCF();
final File genotype12_3VCF = executeTest("genotype12_3", genotype12_3).getFirst().get(0);
final List<VariantContext> VARIANT_CONTEXTS_12_3 = getVariantContexts(genotype12_3VCF);
@ -367,14 +390,14 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + combined_gVCF12 + " -V " + gvcf3,
1,
Arrays.asList(""));
Collections.singletonList(""));
combine12then3.disableShadowBCF();
final File combined_gVCF12then3 = executeTest("combined_gVCF12then3", combine12then3).getFirst().get(0);
final WalkerTestSpec genotype12then3 = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + combined_gVCF12then3.getAbsolutePath(),
1,
Arrays.asList(""));
Collections.singletonList(""));
genotype12then3.disableShadowBCF();
final File genotype12then3VCF = executeTest("genotype12then3", genotype12then3).getFirst().get(0);
final List<VariantContext> VARIANT_CONTEXTS_12then3 = getVariantContexts(genotype12then3VCF);
@ -385,21 +408,21 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + gvcf1 + " -V " + gvcf3,
1,
Arrays.asList(""));
Collections.singletonList(""));
combine13.disableShadowBCF();
final File combined_gVCF13 = executeTest("combine13", combine13).getFirst().get(0);
final WalkerTestSpec combine13then2 = new WalkerTestSpec(
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + combined_gVCF13 + " -V " + gvcf2,
1,
Arrays.asList(""));
Collections.singletonList(""));
combine13then2.disableShadowBCF();
final File combined_gVCF13then2 = executeTest("combined_gVCF13then2", combine13then2).getFirst().get(0);
final WalkerTestSpec genotype13then2 = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + combined_gVCF13then2.getAbsolutePath(),
1,
Arrays.asList(""));
Collections.singletonList(""));
genotype13then2.disableShadowBCF();
final File genotype13then2VCF = executeTest("genotype13then2", genotype13then2).getFirst().get(0);
final List<VariantContext> VARIANT_CONTEXTS_13then2 = getVariantContexts(genotype13then2VCF);
@ -410,14 +433,14 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + gvcf1 + " -V " + gvcf2 + " -V " + gvcf3,
1,
Arrays.asList(""));
Collections.singletonList(""));
combine123.disableShadowBCF();
final File combined_gVCF123 = executeTest("combine123", combine123).getFirst().get(0);
final WalkerTestSpec genotype123 = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + combined_gVCF123.getAbsolutePath(),
1,
Arrays.asList(""));
Collections.singletonList(""));
genotype123.disableShadowBCF();
final File genotype123VCF = executeTest("genotype123", genotype123).getFirst().get(0);
final List<VariantContext> VARIANT_CONTEXTS_123 = getVariantContexts(genotype123VCF);
@ -459,115 +482,116 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
}
private static final String simpleSpanningDeletionsMD5 = "4629c2f02ff58c111828269091cded82";
private static final String simpleSpanningDeletionsMD5 = "53f2b8991e49a47efc44b8e02ebb8d91";
@Test(enabled = true)
@Test
public void testSpanningDeletionsMD5() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf",
1,
Arrays.asList(simpleSpanningDeletionsMD5));
Collections.singletonList(simpleSpanningDeletionsMD5));
spec.disableShadowBCF();
executeTest("testSpanningDeletionsMD5", spec);
}
@Test(enabled = true)
@Test
public void testSpanningDeletionsFromCombinedGVCF() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.combined.g.vcf",
1,
Arrays.asList(simpleSpanningDeletionsMD5));
Collections.singletonList(simpleSpanningDeletionsMD5));
spec.disableShadowBCF();
executeTest("testSpanningDeletionsFromCombinedGVCFMD5", spec);
}
@Test(enabled = true)
@Test
public void testMultipleSpanningDeletionsMD5() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf -V " + privateTestDir + "spanningDel.3.g.vcf",
1,
Arrays.asList("7fe5364565585d31a0bb6a9dfa4a01d4"));
Collections.singletonList("907dfaa4d31c22705eadd5890ae23929"));
spec.disableShadowBCF();
executeTest("testMultipleSpanningDeletionsMD5", spec);
}
@Test(enabled = true)
@Test
public void testSpanningDeletionDoesNotGetGenotypedWithNoOtherAlleles() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.delOnly.g.vcf",
1,
Arrays.asList("057f9368f380bf3c12b539a749deac61"));
Collections.singletonList("b923e5c6d5dbce62034178bd5234b932"));
spec.disableShadowBCF();
executeTest("testSpanningDeletionDoesNotGetGenotypedWithNoOtherAlleles", spec);
}
@Test(enabled = true)
@Test
public void testDeprecatedSpanningDeletionDoesNotGetGenotypedWithNoOtherAlleles() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.depr.delOnly.g.vcf",
1,
Arrays.asList("e8f5186718050fe0784416e41425563f"));
Collections.singletonList("01ae75dfe5c0c2350fcef0f4cdca36b2"));
spec.disableShadowBCF();
executeTest("testSpanningDeletionDoesNotGetGenotypedWithNoOtherAlleles", spec);
}
@Test(enabled = true)
@Test
public void testGenotypingSpanningDeletionOverSpan() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.delOverSpan.1.g.vcf -V " +
privateTestDir + "spanningDel.delOverSpan.2.g.vcf",
0,
Arrays.asList("")); // we do not care about the md5; we just want to make sure it doesn't blow up with an error
Collections.singletonList("")); // we do not care about the md5; we just want to make sure it doesn't blow up with an error
spec.disableShadowBCF();
executeTest("testGenotypingSpanningDeletionOverSpan", spec);
}
@Test(enabled = true)
@Test
public void testBadADPropagationHaploidBugTest() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "ad-bug-input.vcf",
1,
Arrays.asList("5ed5cb6aac68aa8943dc45b8b90eb508"));
Collections.singletonList("4d6cbd8d666a43fc136d73de2b217719"));
spec.disableShadowBCF();
executeTest("testBadADPropagationHaploidBugTest", spec);
}
@Test(enabled = true)
@Test
public void testSAC() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "261_S01_raw_variants_gvcf.vcf",
1,
Arrays.asList("37eec6aedd26aa3430a15d90d7f8a011"));
Collections.singletonList("ea96440b537dd1b2b25ea565dfaa71fc"));
spec.disableShadowBCF();
executeTest("testSAC", spec);
}
@Test(enabled = true)
@Test
public void testSACMultisampleTetraploid() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "tetraploid-multisample-sac.g.vcf",
1,
Arrays.asList("76532a74d4ba49f23362c149ad31a229"));
Collections.singletonList("c21c847ef794c11e249985a16893b2fa"));
spec.disableShadowBCF();
executeTest("testSACMultisampleTetraploid", spec);
}
@Test(enabled = true)
@Test
public void testSetZeroRGQsToNoCall() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V " + privateTestDir + "set.zero.RGQs.no.call.sample1.g.vcf" +
" -V " + privateTestDir + "set.zero.RGQs.no.call.sample2.g.vcf" +
" -L chr16:1279274-1279874 -allSites", hg19ReferenceWithChrPrefixInChromosomeNames),
Arrays.asList("b7106be316e43ca04204b78038f65c9f"));
Collections.singletonList("903047b6262fcb82070556ff74f26a75"));
spec.disableShadowBCF();
executeTest("testSetZeroRGQsToNoCall", spec);
}
@ -575,7 +599,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testAlleleSpecificAnnotations() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V "
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("89712a9fe5b6db16be2257be2b0b4759"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("29d6db0a93abd72d64fb1e82da65c715"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations", spec);
}
@ -584,7 +608,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testASMateRankSumAnnotation() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -A AS_MQMateRankSumTest --disableDithering -V "
+ privateTestDir + "NA12878.AS.MateRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.MateRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("8e41a139600ab58a67910cdc60053726"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("2a330015a7db9f9aee9bc5b776698f73"));
spec.disableShadowBCF();
executeTest("testASMateRankSumAnnotation", spec);
}
@ -593,7 +617,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testASInsertSizeRankSumAnnotation() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V "
+ privateTestDir + "NA12878.AS.InsertSizeRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.InsertSizeRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("b1334fbfbf21934aac1c1eda0b5062d5"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("75aee1e0c8c3528180e344ec6c0d8ffd"));
spec.disableShadowBCF();
executeTest("testASInsertSizeRankSumAnnotation", spec);
}
@ -606,7 +630,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testAlleleSpecificAnnotations_oneSample() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V "
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("7d86260e91fe74588e01339a2064b59c"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("f4fa3acec2b21037368898e913b7a3fa"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations_oneSample", spec);
}
@ -616,7 +640,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testAlleleSpecificAnnotations_elevenSamples() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V "
+ privateTestDir + "multiSamples.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("a889fe6775575513e84905b4fa98f8b3"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("4e90f6908248fac9b3ce3e545180a8e5"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations_elevenSamples", spec);
}
@ -625,7 +649,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testMonomorphicVCwithAlt() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -G AS_Standard -o %s --no_cmdline_in_header --disableDithering -V "
+ privateTestDir + "monomorphicGVCwithAlt.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("8bf329a40637623515972dcc0e09a49e"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("43953b3e75a4d470b65773b1b5bea066"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations", spec);
}
@ -634,9 +658,46 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testFractionInformativeReads() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -G AS_Standard -o %s --no_cmdline_in_header -A FractionInformativeReads --disableDithering -V "
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("b338bf1807791b23255b8cb1947c01b2"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("0b1bbcc7d24f8b0945c97907b1cdd974"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations", spec);
}
//Regression test for https://github.com/broadinstitute/gsa-unstable/issues/1281
@Test
public void testGenotypingSpanningDeletionWithAllSites() {
final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V " + privateTestDir + "spanningDel.genotyping.g.vcf -allSites", b37KGReference),
Collections.singletonList("04cfe93e92444cbde80e13ca8b8c3913"));
spec.disableShadowBCF();
executeTest("testGenotypingSpanningDeletionWithAllSites", spec);
}
@Test
public void testGenotypingSpanningDeletionAcrossLines() {
final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V " + privateTestDir + "input-1_2256566.vcf", b37KGReference),
Collections.singletonList("152c8e07e35c592868f43626f27365de"));
spec.disableShadowBCF();
executeTest("testGenotypingSpanningDeletionAcrossLines", spec);
}
@Test
public void testNewQualNaNBugFix() {
final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -newQual -V " + privateTestDir + "input-newqual-nan-bug-fix.vcf", b37KGReferenceWithDecoy),
Collections.singletonList("ab5994dcaf9b2d41269b4ff4729b5e81"));
spec.disableShadowBCF();
executeTest("testNewQualNaNBugFix", spec);
}
@Test
public void testHomRefHighMQ() {
final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V " + privateTestDir + "NA18503.22.vcf -V " + privateTestDir + "NA18504.22.vcf -V " +
privateTestDir + "NA18505.22.vcf -allSites", b37KGReference),
Collections.singletonList("6d253024246e1024b9b6e8f885f53799"));
spec.disableShadowBCF();
executeTest("testHomRefHighMQ", spec);
}
}

View File

@ -67,8 +67,8 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
private static final String SAMPLE_EXCLUSION_MD5 = "2e52f21e7dcc67151a51630807a4eef2";
private static final String INVERT_SELECTION_MD5 = "26d192b868746ab14133f145ae812e7c";
private static final String MAX_FILTERED_GT_SELECTION_MD5 = "f83ac0deb7a8b022d6d40a85627a71ec";
private static final String MIN_FILTERED_GT_SELECTION_MD5 = "346620b7a5d66dabf89d3f42d6e27db7";
private static final String MAX_FILTERED_GT_SELECTION_MD5 = "66d92fac72b339195b393c9915643a14";
private static final String MIN_FILTERED_GT_SELECTION_MD5 = "965c0cf7daa03a1731b371bb20b582d4";
private static final String NO_CALL_FILTERING_KEEP_ONE = "6e2401190c5ada6a3bed2640c068f43b";
private static final String NO_CALL_FILTERING_KEEP_TWO = "6bced1ab6a3d58f1fd905b7f601987a3";
@ -88,6 +88,18 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
executeTest("testDiscordanceNoSampleSpecified--" + testFile, spec);
}
@Test
public void testExcludeIntervalsPadding(){
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " -L 1:1715011-1734970 -XL 1:1725305 -ip 200 --variant "
+ b37hapmapGenotypes + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("2e31c0be0d639d7110e639a11c03f4ca")
);
executeTest("testExcludeIntervalsPadding--", spec);
}
@Test
public void testRepeatedLineSelection() {
String testfile = privateTestDir + "test.dup.vcf";
@ -304,7 +316,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -selectType INDEL --variant " + testFile + " -o %s --no_cmdline_in_header --minIndelSize 2",
1,
Arrays.asList("ed9dc00d0551630a2eed9e81a2a357d3")
Arrays.asList("ad0965edb1dbd30060afd21ba9f11bf7")
);
executeTest("testMinIndelLengthSelection--" + testFile, spec);
@ -395,7 +407,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b37KGReference + " --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("c78a65b41edbdd386211042e8f65220b")
Arrays.asList("1fc77d7f47e75a24222a358c69de7f3d")
);
executeTest("testNoGTs--" + testFile, spec);
@ -606,7 +618,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -IDs " + idFile + " --variant " + testFile),
1,
Arrays.asList("c6632b63617162455f02670174a2322a")
Arrays.asList("da1117cba380345c622a6d8e52c2270b")
);
spec.disableShadowBCF();
executeTest("testKeepSelectionID--" + testFile, spec);
@ -641,7 +653,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -xlSelectType SNP --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("ed9dc00d0551630a2eed9e81a2a357d3")
Arrays.asList("ad0965edb1dbd30060afd21ba9f11bf7")
);
executeTest("testExcludeSelectionType--" + testFile, spec);
@ -732,7 +744,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants --setFilteredGtToNocall -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("410c6b7bb62fc43bb41eee627670f757")
Arrays.asList("cb5ef9233503bebc81593e436a6de943")
);
spec.disableShadowBCF();
@ -747,7 +759,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
"-T SelectVariants --setFilteredGtToNocall --removeUnusedAlternates --excludeNonVariants -R " + b37KGReference + " --variant " +
testfile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("349136d92f915f8c7ba8a2f92e51d6b7"));
Arrays.asList("f5b2592361d8ab0d47e5047e63f78e4c"));
executeTest("testSetFilteredGtoNocallUpdateInfo", spec);
}
@ -782,7 +794,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header -sn NA12891 -trimAlternates",
1,
ReviewedGATKException.class);
Arrays.asList("7880f8a1dfae1804998b6a994574e734"));
spec.disableShadowBCF();
executeTest("testSACNonDiploid", spec);
}
@ -838,4 +850,43 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
executeTest("testMaxNoCall0_5", spec);
}
@Test
public void testHaploid() {
final String testfile = privateTestDir + "haploid-multisample.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header -sn HG00610 -select 'DP > 7'",
1,
Arrays.asList("bc6caa81836f4c94a1216babd0c1ac72"));
spec.disableShadowBCF();
executeTest("testHaploid", spec);
}
@Test
public void testTetraploid() {
final String testfile = privateTestDir + "tetraploid-multisample.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header -sn NA18486 -select 'DP > 19'",
1,
Arrays.asList("4fcfa5e0ba5d39ca9f0593aa0c0f7a63"));
spec.disableShadowBCF();
executeTest("testTetraploid", spec);
}
@Test
public void testTetraDiploid() {
final String testfile = privateTestDir + "tetra-diploid.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header -sn NA12878 -select 'DP > 48' -trimAlternates",
1,
Arrays.asList("709782f7a07cd500d41370e6a275fcdf"));
spec.disableShadowBCF();
executeTest("testTetraDiploid", spec);
}
}

View File

@ -95,7 +95,7 @@ public class SelectVariantsParallelIntegrationTest extends WalkerTest {
{ // new tests on b37 using testdir VCF
final String testfile = privateTestDir + "NA12878.hg19.example1.vcf";
final String args = "-select 'DP > 30' -V " + testfile;
new ParallelSelectTestProvider(b37KGReference, args, "64f9258e9e3024b6361abbeeeefafee9", nt);
new ParallelSelectTestProvider(b37KGReference, args, "51645037428729c3a9fa0e25fc2104ad", nt);
}
{ // AD and PL decoding race condition
final String testfile = privateTestDir + "race_condition.vcf";

View File

@ -87,7 +87,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest {
" --no_cmdline_in_header " +
" -o %s",
1,
Arrays.asList("f9f6418698f967ba7ca451ac1fb4cc8d")
Arrays.asList("94057d7a98c1af0a7490540ea1d9b247")
);
executeTest("testSimpleVCFStreaming", spec);

Some files were not shown because too many files have changed in this diff Show More