From 2e78815055d1427ca9c31ee6ff6914797d8c5623 Mon Sep 17 00:00:00 2001 From: Phillip Dexheimer Date: Wed, 28 May 2014 22:11:56 -0400 Subject: [PATCH] Added missing arguments to GenotypeGVCFs - New arguments are nda, hets, indelHeterozygosity, stand_call_conf, stand_emit_conf, ploidy, and maxAltAlleles - Addresses PT 70110918 - To do this, moved those arguments out of the StandardCallerArgumentCollection into a new GenotypeCalculationArgumentCollection, which is now included as a member of SCAC --- ...GenotypeCalculationArgumentCollection.java | 169 ++++++++++++++++++ .../StandardCallerArgumentCollection.java | 106 ++--------- ...dyGenotypeLikelihoodsCalculationModel.java | 2 +- ...NPGenotypeLikelihoodsCalculationModel.java | 2 +- .../walkers/genotyper/GenotypingEngine.java | 40 +++-- .../genotyper/UnifiedArgumentCollection.java | 7 - .../walkers/genotyper/UnifiedGenotyper.java | 8 +- .../genotyper/UnifiedGenotypingEngine.java | 8 +- .../genotyper/afcalc/AFCalcFactory.java | 12 +- .../haplotypecaller/HaplotypeCaller.java | 11 +- .../validation/GenotypeAndValidate.java | 6 +- .../walkers/variantutils/GenotypeGVCFs.java | 38 +++- .../GenotypeGVCFsIntegrationTest.java | 31 ++++ 13 files changed, 290 insertions(+), 150 deletions(-) create mode 100644 protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java new file mode 100644 index 000000000..2d857ec4b --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java @@ -0,0 +1,169 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.arguments; + +import org.broadinstitute.gatk.utils.commandline.Advanced; +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants; + +import java.util.Collections; +import java.util.List; + +public class GenotypeCalculationArgumentCollection implements Cloneable{ + + /** + * Depending on the value of the --max_alternate_alleles argument, we may genotype only a fraction of the alleles being sent on for genotyping. + * Using this argument instructs the genotyper to annotate (in the INFO field) the number of alternate alleles that were originally discovered at the site. + */ + @Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false) + public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false; + + /** + * The expected heterozygosity value used to compute prior probability that a locus is non-reference. + * + * The default priors are for provided for humans: + * + * het = 1e-3 + * + * which means that the probability of N samples being hom-ref at a site is: + * + * 1 - sum_i_2N (het / i) + * + * Note that heterozygosity as used here is the population genetics concept: + * + * http://en.wikipedia.org/wiki/Zygosity#Heterozygosity_in_population_genetics + * + * That is, a hets value of 0.01 implies that two randomly chosen chromosomes from the population of organisms + * would differ from each other (one being A and the other B) at a rate of 1 in 100 bp. + * + * Note that this quantity has nothing to do with the likelihood of any given sample having a heterozygous genotype, + * which in the GATK is purely determined by the probability of the observed data P(D | AB) under the model that there + * may be a AB het genotype. The posterior probability of this AB genotype would use the het prior, but the GATK + * only uses this posterior probability in determining the prob. that a site is polymorphic. So changing the + * het parameters only increases the chance that a site will be called non-reference across all samples, but + * doesn't actually change the output genotype likelihoods at all, as these aren't posterior probabilities at all. + * + * The quantity that changes whether the GATK considers the possibility of a het genotype at all is the ploidy, + * which determines how many chromosomes each individual in the species carries. + */ + @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus. See the GATKDocs for full details on the meaning of this population genetics concept", required = false) + public Double snpHeterozygosity = HomoSapiensConstants.SNP_HETEROZYGOSITY; + + /** + * This argument informs the prior probability of having an indel at a site. + */ + @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling. See the GATKDocs for heterozygosity for full details on the meaning of this population genetics concept", required = false) + public double indelHeterozygosity = HomoSapiensConstants.INDEL_HETEROZYGOSITY; + + /** + * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with + * confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this + * is the default). + */ + @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be called", required = false) + public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0; + + /** + * This argument allows you to emit low quality calls as filtered records. + */ + @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be emitted (and filtered with LowQual if less than the calling threshold)", required = false) + public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; + + /** + * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES), + * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it + * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend + * that you not play around with this parameter. + * + * As of GATK 2.2 the genotyper can handle a very large number of events, so the default maximum has been increased to 6. + */ + @Advanced + @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) + public int MAX_ALTERNATE_ALLELES = 6; + + /** + * By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus, using an infinite sites model, + * see e.g. Waterson (1975) or Tajima (1996). + * This model asserts that the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for 1=1:N + * + * There are instances where using this prior might not be desireable, e.g. for population studies where prior might not be appropriate, + * as for example when the ancestral status of the reference allele is not known. + * By using this argument, user can manually specify priors to be used for calling as a vector for doubles, with the following restriciotns: + * a) User must specify 2N values, where N is the number of samples. + * b) Only diploid calls supported. + * c) Probability values are specified in double format, in linear space. + * d) No negative values allowed. + * e) Values will be added and Pr(AC=0) will be 1-sum, so that they sum up to one. + * f) If user-defined values add to more than one, an error will be produced. + * + * If user wants completely flat priors, then user should specify the same value (=1/(2*N+1)) 2*N times,e.g. + * -inputPrior 0.33 -inputPrior 0.33 + * for the single-sample diploid case. + */ + @Advanced + @Argument(fullName = "input_prior", shortName = "inputPrior", doc = "Input prior for calls", required = false) + public List inputPrior = Collections.emptyList(); + + /** + * Sample ploidy - equivalent to number of chromosomes per pool. In pooled experiments this should be = # of samples in pool * individual sample ploidy + */ + @Argument(shortName="ploidy", fullName="sample_ploidy", doc="Ploidy (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false) + public int samplePloidy = HomoSapiensConstants.DEFAULT_PLOIDY; + + /** + * Creates a copy of this configuration. + * @return never {@code null}. + */ + @Override + public GenotypeCalculationArgumentCollection clone() { + try { + return (GenotypeCalculationArgumentCollection) super.clone(); + } catch (CloneNotSupportedException e) { + throw new IllegalStateException("unreachable code"); + } + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java index 3e31636a4..653b438ec 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java @@ -51,14 +51,13 @@ import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingOutputMode; import org.broadinstitute.gatk.tools.walkers.genotyper.OutputMode; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalcFactory; import org.broadinstitute.gatk.utils.collections.DefaultHashMap; -import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants; import htsjdk.variant.variantcontext.VariantContext; import java.io.File; import java.lang.reflect.Field; +import java.lang.reflect.Method; import java.lang.reflect.Modifier; import java.util.Collections; -import java.util.List; import java.util.Map; /** @@ -70,101 +69,19 @@ import java.util.Map; */ public class StandardCallerArgumentCollection implements Cloneable { - /** - * The expected heterozygosity value used to compute prior probability that a locus is non-reference. - * - * The default priors are for provided for humans: - * - * het = 1e-3 - * - * which means that the probability of N samples being hom-ref at a site is: - * - * 1 - sum_i_2N (het / i) - * - * Note that heterozygosity as used here is the population genetics concept: - * - * http://en.wikipedia.org/wiki/Zygosity#Heterozygosity_in_population_genetics - * - * That is, a hets value of 0.01 implies that two randomly chosen chromosomes from the population of organisms - * would differ from each other (one being A and the other B) at a rate of 1 in 100 bp. - * - * Note that this quantity has nothing to do with the likelihood of any given sample having a heterozygous genotype, - * which in the GATK is purely determined by the probability of the observed data P(D | AB) under the model that there - * may be a AB het genotype. The posterior probability of this AB genotype would use the het prior, but the GATK - * only uses this posterior probability in determining the prob. that a site is polymorphic. So changing the - * het parameters only increases the chance that a site will be called non-reference across all samples, but - * doesn't actually change the output genotype likelihoods at all, as these aren't posterior probabilities at all. - * - * The quantity that changes whether the GATK considers the possibility of a het genotype at all is the ploidy, - * which determines how many chromosomes each individual in the species carries. - */ - @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus. See the GATKDocs for full details on the meaning of this population genetics concept", required = false) - public Double snpHeterozygosity = HomoSapiensConstants.SNP_HETEROZYGOSITY; - /** - * This argument informs the prior probability of having an indel at a site. - */ - @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling. See the GATKDocs for heterozygosity for full details on the meaning of this population genetics concept", required = false) - public double indelHeterozygosity = HomoSapiensConstants.INDEL_HETEROZYGOSITY; + @ArgumentCollection + public GenotypeCalculationArgumentCollection genotypeArgs = new GenotypeCalculationArgumentCollection(); @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false) public GenotypingOutputMode genotypingOutputMode = GenotypingOutputMode.DISCOVERY; - /** - * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with - * confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this - * is the default). - */ - @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be called", required = false) - public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0; - - /** - * This argument allows you to emit low quality calls as filtered records. - */ - @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be emitted (and filtered with LowQual if less than the calling threshold)", required = false) - public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; - /** * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding */ @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false) public RodBinding alleles; - /** - * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES), - * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it - * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend - * that you not play around with this parameter. - * - * As of GATK 2.2 the genotyper can handle a very large number of events, so the default maximum has been increased to 6. - */ - @Advanced - @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) - public int MAX_ALTERNATE_ALLELES = 6; - - /** - * By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus, using an infinite sites model, - * see e.g. Waterson (1975) or Tajima (1996). - * This model asserts that the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for 1=1:N - * - * There are instances where using this prior might not be desireable, e.g. for population studies where prior might not be appropriate, - * as for example when the ancestral status of the reference allele is not known. - * By using this argument, user can manually specify priors to be used for calling as a vector for doubles, with the following restriciotns: - * a) User must specify 2N values, where N is the number of samples. - * b) Only diploid calls supported. - * c) Probability values are specified in double format, in linear space. - * d) No negative values allowed. - * e) Values will be added and Pr(AC=0) will be 1-sum, so that they sum up to one. - * f) If user-defined values add to more than one, an error will be produced. - * - * If user wants completely flat priors, then user should specify the same value (=1/(2*N+1)) 2*N times,e.g. - * -inputPrior 0.33 -inputPrior 0.33 - * for the single-sample diploid case. - */ - @Advanced - @Argument(fullName = "input_prior", shortName = "inputPrior", doc = "Input prior for calls", required = false) - public List inputPrior = Collections.emptyList(); - /** * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads. * Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we @@ -212,12 +129,6 @@ public class StandardCallerArgumentCollection implements Cloneable { @Argument(shortName = "logExactCalls", doc="x", required=false) public File exactCallsLog = null; - /** - * Sample ploidy - equivalent to number of chromosomes per pool. In pooled experiments this should be = # of samples in pool * individual sample ploidy - */ - @Argument(shortName="ploidy", fullName="sample_ploidy", doc="Ploidy (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false) - public int samplePloidy = HomoSapiensConstants.DEFAULT_PLOIDY; - @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false) public OutputMode outputMode = OutputMode.EMIT_VARIANTS_ONLY; @@ -258,7 +169,12 @@ public class StandardCallerArgumentCollection implements Cloneable { continue; final int fieldModifiers = field.getModifiers(); if ((fieldModifiers & UNCOPYABLE_MODIFIER_MASK) != 0) continue; - field.set(result,field.get(this)); + //Use the clone() method if appropriate + if (Cloneable.class.isAssignableFrom(field.getType())) { + Method clone = field.getType().getMethod("clone"); + field.set(result, clone.invoke(field.get(this))); + } else + field.set(result,field.get(this)); } return result; } catch (final Exception ex) { @@ -273,7 +189,9 @@ public class StandardCallerArgumentCollection implements Cloneable { @Override public StandardCallerArgumentCollection clone() { try { - return (StandardCallerArgumentCollection) super.clone(); + StandardCallerArgumentCollection cloned = (StandardCallerArgumentCollection) super.clone(); + cloned.genotypeArgs = genotypeArgs.clone(); + return cloned; } catch (CloneNotSupportedException e) { throw new IllegalStateException("unreachable code"); } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java index b4c855577..9206fbc37 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java @@ -259,7 +259,7 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G } // create the GenotypeLikelihoods object - final GeneralPloidyGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO, perReadAlleleLikelihoodMap.get(sample.getKey())); + final GeneralPloidyGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.genotypeArgs.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO, perReadAlleleLikelihoodMap.get(sample.getKey())); // actually compute likelihoods final int nGoodBases = GL.add(pileup, UAC); if ( nGoodBases > 0 ) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java index cd59c0dec..7271e6499 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java @@ -99,7 +99,7 @@ public class GeneralPloidySNPGenotypeLikelihoodsCalculationModel extends General final ReferenceContext ref, final boolean ignoreLaneInformation, final org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap){ - return new GeneralPloidySNPGenotypeLikelihoods(alleles, null, UAC.samplePloidy, perLaneErrorModels, useBQAedPileup, UAC.IGNORE_LANE_INFO); + return new GeneralPloidySNPGenotypeLikelihoods(alleles, null, UAC.genotypeArgs.samplePloidy, perLaneErrorModels, useBQAedPileup, UAC.IGNORE_LANE_INFO); } protected List getInitialAllelesToUse(final RefMetaDataTracker tracker, diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java index 405afd825..97a98e704 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java @@ -48,6 +48,8 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.apache.log4j.Logger; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection; @@ -79,6 +81,7 @@ import java.util.*; */ public abstract class GenotypingEngine { + public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA"; public static final String LOW_QUAL_FILTER_NAME = "LowQual"; protected Logger logger; @@ -154,12 +157,12 @@ public abstract class GenotypingEngine getAppropriateVCFInfoHeaders() { + Set headerInfo = new HashSet<>(); + if ( configuration.genotypeArgs.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED ) + headerInfo.add(new VCFInfoHeaderLine(UnifiedGenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, 1, VCFHeaderLineType.Integer, "Number of alternate alleles discovered (but not necessarily genotyped) at this site")); + return headerInfo; + } + /** * Changes the annotation engine for this genotyping-engine. * @@ -273,7 +283,7 @@ public abstract class GenotypingEngine= configuration.STANDARD_CONFIDENCE_FOR_CALLING || + return conf >= configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING || (configuration.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES - && QualityUtils.phredScaleErrorRate(PofF) >= configuration.STANDARD_CONFIDENCE_FOR_CALLING); + && QualityUtils.phredScaleErrorRate(PofF) >= configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING); } /** @@ -401,10 +411,10 @@ public abstract class GenotypingEngine= configuration.STANDARD_CONFIDENCE_FOR_CALLING, false); + return new VariantCallContext(vc, QualityUtils.phredScaleLog10CorrectRate(log10POfRef) >= configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING, false); } protected final double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { @@ -643,12 +653,12 @@ public abstract class GenotypingEngine= Math.min(configuration.STANDARD_CONFIDENCE_FOR_CALLING, - configuration.STANDARD_CONFIDENCE_FOR_EMITTING); + conf >= Math.min(configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING, + configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING); } protected final boolean passesCallThreshold(double conf) { - return conf >= configuration.STANDARD_CONFIDENCE_FOR_CALLING; + return conf >= configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING; } protected Map composeCallAttributes(final boolean inheritAttributesFromInputVC, final VariantContext vc, @@ -673,6 +683,10 @@ public abstract class GenotypingEngine, Unif UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, UAC.CONTAMINATION_FRACTION, samples, logger)); // check for a bad max alleles value - if ( UAC.MAX_ALTERNATE_ALLELES > GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) + if ( UAC.genotypeArgs.MAX_ALTERNATE_ALLELES > GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) throw new UserException.BadArgumentValue("max_alternate_alleles", "the maximum possible value is " + GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED); // warn the user for misusing EMIT_ALL_SITES @@ -289,6 +289,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif // initialize the header Set headerInfo = getHeaderInfo(UAC, annotationEngine, dbsnp); + headerInfo.addAll(genotypingEngine.getAppropriateVCFInfoHeaders()); // invoke initialize() method on each of the annotation classes, allowing them to add their own header lines // and perform any necessary initialization/validation steps @@ -319,11 +320,8 @@ public class UnifiedGenotyper extends LocusWalker, Unif if ( UAC.COMPUTE_SLOD ) VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true, VCFConstants.STRAND_BIAS_KEY); - if ( UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED ) - headerInfo.add(new VCFInfoHeaderLine(UnifiedGenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, 1, VCFHeaderLineType.Integer, "Number of alternate alleles discovered (but not necessarily genotyped) at this site")); - // add the pool values for each genotype - if (UAC.samplePloidy != GATKVariantContextUtils.DEFAULT_PLOIDY) { + if (UAC.genotypeArgs.samplePloidy != GATKVariantContextUtils.DEFAULT_PLOIDY) { headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the alternate allele count, in the same order as listed, for each individual sample")); headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the alternate allele fraction, in the same order as listed, for each individual sample")); } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java index 3fd51c21d..36ac2e5bf 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java @@ -75,9 +75,6 @@ import java.util.*; */ public class UnifiedGenotypingEngine extends GenotypingEngine { - public static final String LOW_QUAL_FILTER_NAME = "LowQual"; - - public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA"; public static final String PL_FOR_ALL_SNP_ALLELES_KEY = "APL"; private static final int SNP_MODEL = 0; @@ -362,9 +359,6 @@ public class UnifiedGenotypingEngine extends GenotypingEngine supportingCalculations = new LinkedList(); for ( final Calculation calc : Calculation.values() ) { - if ( calc.usableForParams(UAC.samplePloidy, maxAltAlleles) ) + if ( calc.usableForParams(UAC.genotypeArgs.samplePloidy, maxAltAlleles) ) supportingCalculations.add(calc); } if ( supportingCalculations.isEmpty() ) - throw new UserException("no AFCalculation model found that supports ploidy of " + UAC.samplePloidy + " and max alt alleles " + maxAltAlleles); + throw new UserException("no AFCalculation model found that supports ploidy of " + UAC.genotypeArgs.samplePloidy + " and max alt alleles " + maxAltAlleles); else if ( supportingCalculations.size() > 1 ) logger.debug("Warning, multiple supporting AFCalcs found " + Utils.join(",", supportingCalculations) + " choosing first arbitrarily"); else @@ -155,7 +155,7 @@ public class AFCalcFactory { logger.info("Selecting model " + UAC.AFmodel); } - final AFCalc calc = createAFCalc(UAC.AFmodel, nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.samplePloidy); + final AFCalc calc = createAFCalc(UAC.AFmodel, nSamples, maxAltAlleles, UAC.genotypeArgs.samplePloidy); if ( logger != null ) calc.setLogger(logger); if ( UAC.exactCallsLog != null ) calc.enableProcessLog(UAC.exactCallsLog); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index b2974ba85..6ae18b29b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -536,8 +536,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In if (SCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES) throw new UserException.BadArgumentValue("ERC/gt_mode","you cannot request reference confidence output and Genotyping Giving Alleles at the same time"); - SCAC.STANDARD_CONFIDENCE_FOR_EMITTING = -0.0; - SCAC.STANDARD_CONFIDENCE_FOR_CALLING = -0.0; + SCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING = -0.0; + SCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING = -0.0; // also, we don't need to output several of the annotations @@ -564,8 +564,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In final UnifiedArgumentCollection simpleUAC = SCAC.cloneTo(UnifiedArgumentCollection.class); simpleUAC.outputMode = OutputMode.EMIT_VARIANTS_ONLY; simpleUAC.genotypingOutputMode = GenotypingOutputMode.DISCOVERY; - simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, SCAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, SCAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, SCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, SCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling simpleUAC.CONTAMINATION_FRACTION = 0.0; simpleUAC.CONTAMINATION_FRACTION_FILE = null; simpleUAC.exactCallsLog = null; @@ -578,11 +578,13 @@ public class HaplotypeCaller extends ActiveRegionWalker, In if( SCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES && consensusMode ) throw new UserException("HaplotypeCaller cannot be run in both GENOTYPE_GIVEN_ALLELES mode and in consensus mode. Please choose one or the other."); + genotypingEngine = new HaplotypeCallerGenotypingEngine( getToolkit(),SCAC); // initialize the output VCF header final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); Set headerInfo = new HashSet<>(); + headerInfo.addAll(genotypingEngine.getAppropriateVCFInfoHeaders()); // all annotation fields from VariantAnnotatorEngine headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions()); // all callers need to add these standard annotation header lines @@ -646,7 +648,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? new LDMerger(SCAC.DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes(); - genotypingEngine = new HaplotypeCallerGenotypingEngine( getToolkit(),SCAC); genotypingEngine.setCrossHaplotypeEventMerger(variantMerger); genotypingEngine.setAnnotationEngine(annotationEngine); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java index fd8c62c90..432bfc770 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java @@ -346,8 +346,8 @@ public class GenotypeAndValidate extends RodWalker= 0) uac.STANDARD_CONFIDENCE_FOR_EMITTING = emitConf; - if (callConf >= 0) uac.STANDARD_CONFIDENCE_FOR_CALLING = callConf; + if (emitConf >= 0) uac.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING = emitConf; + if (callConf >= 0) uac.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING = callConf; uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; snpEngine = new UnifiedGenotypingEngine(getToolkit(), uac); @@ -358,7 +358,7 @@ public class GenotypeAndValidate extends RodWalkeremptyList(), this, getToolkit()); - // collect the actual rod bindings into a list for use later for ( final RodBindingCollection variantCollection : variantCollections ) variants.addAll(variantCollection.getRodBindings()); - // take care of the VCF headers final Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), variants); + final Set samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + // create the genotyping engine + genotypingEngine = new UnifiedGenotypingEngine(getToolkit(), createUAC(), samples); + // create the annotation engine + annotationEngine = new VariantAnnotatorEngine(Arrays.asList("none"), annotationsToUse, Collections.emptyList(), this, getToolkit()); + + // take care of the VCF headers final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); headerLines.addAll(annotationEngine.getVCFAnnotationDescriptions()); + headerLines.addAll(genotypingEngine.getAppropriateVCFInfoHeaders()); + // add the pool values for each genotype + if (genotypeArgs.samplePloidy != GATKVariantContextUtils.DEFAULT_PLOIDY) { + headerLines.add(new VCFFormatHeaderLine(VCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the alternate allele count, in the same order as listed, for each individual sample")); + headerLines.add(new VCFFormatHeaderLine(VCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the alternate allele fraction, in the same order as listed, for each individual sample")); + } VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.MLE_ALLELE_COUNT_KEY, VCFConstants.MLE_ALLELE_FREQUENCY_KEY); if ( dbsnp != null && dbsnp.dbsnp.isBound() ) VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.DBSNP_KEY); - final Set samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); final VCFHeader vcfHeader = new VCFHeader(headerLines, samples); vcfWriter.writeHeader(vcfHeader); - - // create the genotyping engine - genotypingEngine = new UnifiedGenotypingEngine(getToolkit(), new UnifiedArgumentCollection(), samples); } public VariantContext map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { @@ -211,6 +221,8 @@ public class GenotypeGVCFs extends RodWalker attrs = new HashMap<>(originalVC.getAttributes()); attrs.put(VCFConstants.MLE_ALLELE_COUNT_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY)); attrs.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY)); + if (regenotypedVC.hasAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY)) + attrs.put(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, regenotypedVC.getAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY)); result = new VariantContextBuilder(regenotypedVC).attributes(attrs).make(); } @@ -282,6 +294,16 @@ public class GenotypeGVCFs extends RodWalker