From 08203b516e4d44e237a5d72cc1057f314dfc076b Mon Sep 17 00:00:00 2001 From: Valentin Ruano-Rubio Date: Fri, 21 Mar 2014 10:16:26 -0400 Subject: [PATCH] Disentangle UG and HC Genotyper engines. Description: Transforms a delegation dependency from HC to UG genotyping engine into a reusage by inhertance where HC and UG engines inherit from a common superclass GenotyperEngine that implements the common parts. A side-effect some of the code is now more clear and redundant code has been removed. Changes have a few consequence for the end user. HC has now a few more user arguments, those that control the functionality that HC was borrowing directly from UGE. Added -ploidy argument although it is contraint to be 2 for now. Added -out_mode EMIT_ALL_SITES|EMIT_VARIANTS_ONLY ... Added -allSitePLs flag. Stories: https://www.pivotaltracker.com/story/show/68017394 Changes: - Moved (HC's) GenotyperEngine to HaplotypeCallerGenotyperEngine (HCGE). Then created a engine superclass class GenotypingEngine (GE) that contains common parts between HCGE and the UG counterpart 'UnifiedGenotypingEngine' (UGE). Simplified the code and applied the template pattern to accomodate for small diferences in behaviour between both caller engines. (There is still room for improvement though). - Moved inner classes and enums to top-level components for various reasons including making them shorter and simpler names to refer to them. - Create a HomoSpiens class for Human specific constants; even if they are good default for most users we need to clearly identify the human assumption across the code if we want to make GATK work with any species in general; i.e. any reference to HomoSapiens, except as a default value for a user argument, should smell. - Fixed a bug deep in the genotyping calculation we were taking on fixed values for snp and indel heterozygisity to be the default for Human ignoring user arguments. - GenotypingLikehooldCalculationCModel.Model to Gen.*Like.*Calc.*Model.Name; not a definitive solution though as names are used often in conditionals that perhaps should be member methods of the GenLikeCalc classes. - Renamed LikelihoodCalculationEngine to ReadLikelihoodCalculationEngine to distinguish them clearly from Genotype likelihood calculation engines. - Changed copy by explicity argument listing to a clone/reflexion solution for casting between genotypers argument collection classes. - Created GenotypeGivenAllelesUtils to collect methods needed nearly exclusively by the GGA mode. Tests : - StandardCallerArgumentCollectionUnitTest (check copy by cloning/reflexion). - All existing integration and unit tests for modified classes. --- .../StandardCallerArgumentCollection.java | 98 +- ...NPGenotypeLikelihoodsCalculationModel.java | 7 +- .../GenotypeLikelihoodsCalculationModel.java | 19 +- .../walkers/genotyper/GenotypingEngine.java | 637 +++++++++++++ .../walkers/genotyper/GenotypingMode.java | 68 ++ ...elGenotypeLikelihoodsCalculationModel.java | 2 +- .../gatk/walkers/genotyper/OutputMode.java | 62 ++ ...NPGenotypeLikelihoodsCalculationModel.java | 9 +- .../genotyper/UnifiedArgumentCollection.java | 77 +- .../walkers/genotyper/UnifiedGenotyper.java | 18 +- .../genotyper/UnifiedGenotyperEngine.java | 844 ------------------ .../genotyper/UnifiedGenotypingEngine.java | 503 +++++++++++ .../genotyper/afcalc/AFCalcFactory.java | 4 +- .../genotyper/afcalc/AFCalcTestBuilder.java | 7 +- ...GraphBasedLikelihoodCalculationEngine.java | 2 +- .../haplotypecaller/HaplotypeCaller.java | 181 ++-- .../HaplotypeCallerArgumentCollection.java | 82 ++ ...a => HaplotypeCallerGenotypingEngine.java} | 76 +- .../haplotypecaller/HaplotypeResolver.java | 4 +- .../haplotypecaller/LocalAssemblyEngine.java | 40 +- .../PairHMMLikelihoodCalculationEngine.java | 2 +- .../RandomLikelihoodCalculationEngine.java | 2 +- ...a => ReadLikelihoodCalculationEngine.java} | 2 +- .../ReferenceConfidenceMode.java | 69 ++ .../readthreading/ReadThreadingAssembler.java | 6 +- .../validation/GenotypeAndValidate.java | 23 +- .../CalculateGenotypePosteriors.java | 12 +- .../walkers/variantutils/GenotypeGVCFs.java | 6 +- .../variantutils/RegenotypeVariants.java | 32 +- .../gga/GenotypingGivenAllelesUtils.java | 135 +++ ...ndardCallerArgumentCollectionUnitTest.java | 206 +++++ .../UnifiedGenotyperEngineUnitTest.java | 14 +- .../UnifiedGenotyperIntegrationTest.java | 3 + .../genotyper/afcalc/AFCalcUnitTest.java | 6 +- .../HaplotypeCallerGVCFIntegrationTest.java | 14 +- ...lotypeCallerGenotypingEngineUnitTest.java} | 18 +- .../HaplotypeCallerIntegrationTest.java | 6 +- .../sting/gatk/GenomeAnalysisEngine.java | 2 +- .../sting/gatk/samples/SampleDB.java | 2 +- .../sting/utils/SampleUtils.java | 2 +- .../variant/GATKVariantContextUtils.java | 31 +- .../sting/utils/variant/HomoSapiens.java | 51 ++ .../GATKVariantContextUtilsUnitTest.java | 73 +- 43 files changed, 2228 insertions(+), 1229 deletions(-) create mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypingEngine.java create mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypingMode.java create mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/OutputMode.java delete mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java create mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotypingEngine.java create mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java rename protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{GenotypingEngine.java => HaplotypeCallerGenotypingEngine.java} (92%) rename protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{LikelihoodCalculationEngine.java => ReadLikelihoodCalculationEngine.java} (99%) create mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceMode.java create mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/gga/GenotypingGivenAllelesUtils.java create mode 100644 protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/StandardCallerArgumentCollectionUnitTest.java rename protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{GenotypingEngineUnitTest.java => HaplotypeCallerGenotypingEngineUnitTest.java} (94%) create mode 100644 public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/HomoSapiens.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index c331451d5..5559db77b 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -47,13 +47,15 @@ package org.broadinstitute.sting.gatk.arguments; import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.OutputMode; import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; import org.broadinstitute.sting.utils.collections.DefaultHashMap; +import org.broadinstitute.sting.utils.variant.HomoSapiens; import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.File; +import java.lang.reflect.Field; +import java.lang.reflect.Modifier; import java.util.Collections; import java.util.List; import java.util.Map; @@ -66,7 +68,7 @@ import java.util.Map; * This is pulled out so that every caller isn't exposed to the arguments from every other caller. */ -public class StandardCallerArgumentCollection { +public class StandardCallerArgumentCollection implements Cloneable { /** * The expected heterozygosity value used to compute prior probability that a locus is non-reference. * @@ -96,16 +98,16 @@ public class StandardCallerArgumentCollection { * which determines how many chromosomes each individual in the species carries. */ @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus. See the GATKDocs for full details on the meaning of this population genetics concept", required = false) - public Double heterozygosity = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY; + public Double snpHeterozygosity = HomoSapiens.SNP_HETEROZYGOSITY; /** * This argument informs the prior probability of having an indel at a site. */ @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling. See the GATKDocs for heterozygosity for full details on the meaning of this population genetics concept", required = false) - public double INDEL_HETEROZYGOSITY = 1.0/8000; + public double indelHeterozygosity = HomoSapiens.INDEL_HETEROZYGOSITY; @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false) - public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; + public org.broadinstitute.sting.gatk.walkers.genotyper.GenotypingMode genotypingMode = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypingMode.DISCOVERY; /** * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with @@ -209,22 +211,76 @@ public class StandardCallerArgumentCollection { @Argument(shortName = "logExactCalls", doc="x", required=false) public File exactCallsLog = null; + /** + * Sample ploidy - equivalent to number of chromosomes per pool. In pooled experiments this should be = # of samples in pool * individual sample ploidy + */ + @Argument(shortName="ploidy", fullName="sample_ploidy", doc="Ploidy (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false) + public int samplePloidy = HomoSapiens.DEFAULT_PLOIDY; + + @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false) + public OutputMode outputMode = OutputMode.EMIT_VARIANTS_ONLY; + + /** + * Advanced, experimental argument: if SNP likelihood model is specified, and if EMIT_ALL_SITES output mode is set, when we set this argument then we will also emit PLs at all sites. + * This will give a measure of reference confidence and a measure of which alt alleles are more plausible (if any). + * WARNINGS: + * - This feature will inflate VCF file size considerably. + * - All SNP ALT alleles will be emitted with corresponding 10 PL values. + * - An error will be emitted if EMIT_ALL_SITES is not set, or if anything other than diploid SNP model is used + */ + @Advanced + @Argument(fullName = "allSitePLs", shortName = "allSitePLs", doc = "Annotate all sites with PLs", required = false) + public boolean annotateAllSitesWithPLs = false; + + + /** + * Creates a Standard caller argument collection with default values. + */ public StandardCallerArgumentCollection() { } - // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! - public StandardCallerArgumentCollection(final StandardCallerArgumentCollection SCAC) { - this.alleles = SCAC.alleles; - this.GenotypingMode = SCAC.GenotypingMode; - this.heterozygosity = SCAC.heterozygosity; - this.INDEL_HETEROZYGOSITY = SCAC.INDEL_HETEROZYGOSITY; - this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES; - this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; - this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; - this.CONTAMINATION_FRACTION = SCAC.CONTAMINATION_FRACTION; - this.CONTAMINATION_FRACTION_FILE=SCAC.CONTAMINATION_FRACTION_FILE; - this.exactCallsLog = SCAC.exactCallsLog; - this.sampleContamination=SCAC.sampleContamination; - this.AFmodel = SCAC.AFmodel; - this.inputPrior = SCAC.inputPrior; + /** + * "Casts" a caller argument collection into another type. + * + *

Common fields values are copied across

+ * @param clazz the class of the result. + * @param result argument collection class. + * @return never {@code null}. + */ + public T cloneTo(final Class clazz) { + // short cut: just use regular clone if it happens to be the same class. + if (clazz == getClass()) + return (T) clone(); + try { + final T result = clazz.newInstance(); + for (final Field field : getClass().getFields()) { + // just copy common fields. + if (!field.getDeclaringClass().isAssignableFrom(clazz)) + continue; + final int fieldModifiers = field.getModifiers(); + if (Modifier.isPrivate((fieldModifiers))) + continue; + if (Modifier.isFinal(fieldModifiers)) + continue; + if (Modifier.isStatic(fieldModifiers)) + continue; + field.set(result,field.get(this)); + } + return result; + } catch (final Exception ex) { + throw new IllegalStateException(ex); + } + } + + /** + * Creates a copy of this configuration. + * @return never {@code null}. + */ + @Override + public StandardCallerArgumentCollection clone() { + try { + return (StandardCallerArgumentCollection) super.clone(); + } catch (CloneNotSupportedException e) { + throw new IllegalStateException("unreachable code"); + } } } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java index 9ea027698..c3412e2bc 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java @@ -78,6 +78,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.gga.GenotypingGivenAllelesUtils; import org.broadinstitute.variant.variantcontext.*; import java.util.*; @@ -133,8 +134,8 @@ public class GeneralPloidySNPGenotypeLikelihoodsCalculationModel extends General final List alleles = new ArrayList(); if ( allAllelesToUse != null ) { alleles.addAll(allAllelesToUse); - } else if ( UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles); + } else if ( UAC.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES ) { + final VariantContext vc = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(tracker, ref.getLocus(), true, logger, UAC.alleles); // ignore places where we don't have a SNP if ( vc == null || !vc.isSNP() ) @@ -150,7 +151,7 @@ public class GeneralPloidySNPGenotypeLikelihoodsCalculationModel extends General if ( alleles.size() == 1 ) { final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(ref.getBase()); // if we only want variants, then we don't need to calculate genotype likelihoods - if ( UAC.OutputMode != UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY ) + if ( UAC.outputMode != OutputMode.EMIT_VARIANTS_ONLY ) // otherwise, choose any alternate allele (it doesn't really matter) alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(indexOfRefBase == 0 ? 1 : 0))); } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index 95d3fb78b..dc5220e6a 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -66,30 +66,17 @@ import java.util.Map; /** * The model representing how we calculate genotype likelihoods */ -public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { +public abstract class GenotypeLikelihoodsCalculationModel { public static final String DUMMY_LANE = "Lane1"; public static final String DUMMY_SAMPLE_NAME = "DummySample1"; - /* public enum Model { - SNP, - INDEL, - BOTH - } - */ - public enum Model { + public enum Name { SNP, INDEL, GENERALPLOIDYSNP, GENERALPLOIDYINDEL, - BOTH - } - - public enum GENOTYPING_MODE { - /** the Unified Genotyper will choose the most likely alternate allele */ - DISCOVERY, - /** only the alleles passed in from a VCF rod bound to the -alleles argument will be used for genotyping */ - GENOTYPE_GIVEN_ALLELES + BOTH; } protected final UnifiedArgumentCollection UAC; diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypingEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypingEngine.java new file mode 100644 index 000000000..c0dcfd54e --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypingEngine.java @@ -0,0 +1,637 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalc; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcResult; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.gga.GenotypingGivenAllelesUtils; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.vcf.VCFConstants; + +import java.util.*; + +/** + * Base class for genotyper engines. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public abstract class GenotypingEngine { + + public static final String LOW_QUAL_FILTER_NAME = "LowQual"; + + protected Logger logger; + + protected final GenomeAnalysisEngine toolkit; + + protected final Config configuration; + + protected final VariantAnnotatorEngine annotationEngine; + + protected final int numberOfGenomes; + + protected final Collection sampleNames; + + + private final double[] log10AlleleFrequencyPriorsSNPs; + + private final double[] log10AlleleFrequencyPriorsIndels; + + private final GenomeLocParser genomeLocParser; + + // the model used for calculating p(non-ref) + protected ThreadLocal afcm = new ThreadLocal() { + + @Override + public AFCalc initialValue() { + return AFCalcFactory.createAFCalc(configuration, numberOfGenomes, logger); + } + }; + + /** + * Construct a new genotyper engine. + * + * @param toolkit reference to the genome-analysis toolkit. + * @param configuration engine configuration object. + * @param annotationEngine reference to the annotation engine. If {@code null}, no annotations will be processed. + * @param sampleNames subset of sample to work on identified by their names. If {@code null}, the full toolkit + * sample set will be used instead. + * + * @throws IllegalArgumentException if either {@code toolkit} or {@code configuration} is {@code null}. + */ + public GenotypingEngine(final GenomeAnalysisEngine toolkit, final Config configuration, + final VariantAnnotatorEngine annotationEngine, final Set sampleNames) { + if (toolkit == null) + throw new IllegalArgumentException("the toolkit cannot be null"); + if (configuration == null) + throw new IllegalArgumentException("the configuration cannot be null"); + this.configuration = configuration; + this.annotationEngine = annotationEngine; + logger = Logger.getLogger(getClass()); + this.toolkit = toolkit; + this.sampleNames = sampleNames != null ? sampleNames : toolkit.getSampleDB().getSampleNames(); + numberOfGenomes = this.sampleNames.size() * configuration.samplePloidy; + log10AlleleFrequencyPriorsSNPs = computeAlleleFrequencyPriors(numberOfGenomes, + configuration.snpHeterozygosity,configuration.inputPrior); + log10AlleleFrequencyPriorsIndels = computeAlleleFrequencyPriors(numberOfGenomes, + configuration.indelHeterozygosity,configuration.inputPrior); + genomeLocParser = toolkit.getGenomeLocParser(); + } + + /** + * Changes the logger for this genotyper engine. + * @param logger new logger. + * + * @throws IllegalArgumentException if {@code logger} is {@code null}. + */ + public void setLogger(final Logger logger) { + if (logger == null) + throw new IllegalArgumentException("the logger cannot be null"); + this.logger = logger; + } + + /** + * Returns a reference to the engine configuration + * @return never {@code null}. + */ + public Config getConfiguration() { + return configuration; + } + + /** + * Completes a variant context with genotype calls and associated annotations give the genotype likelihoods and + * the model that need to be applied. + * @param vc variant-context to complete. + * @param modelName model name. + * + * @return can be {@code null} indicating that genotyping it not possible with the information provided. + */ + public VariantCallContext calculateGenotypes(final VariantContext vc, final GeneralPloidySNPGenotypeLikelihoodsCalculationModel.Name modelName) { + if (vc == null) + throw new IllegalArgumentException("vc cannot be null"); + if (modelName == null) + throw new IllegalArgumentException("the model cannot be null"); + return calculateGenotypes(null,null,null,null,vc,modelName,false,null); + } + + /** + * Main entry function to calculate genotypes of a given VC with corresponding GL's that is shared across genotypers (namely UG and HC). + * + * @param tracker Tracker + * @param refContext Reference context + * @param rawContext Raw context + * @param stratifiedContexts Stratified alignment contexts + * @param vc Input VC + * @param model GL calculation model + * @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc + * @return VC with assigned genotypes + */ + protected VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, final ReferenceContext refContext, + final AlignmentContext rawContext, Map stratifiedContexts, + final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Name model, + final boolean inheritAttributesFromInputVC, + final Map perReadAlleleLikelihoodMap) { + + final boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; + // if input VC can't be genotyped, exit with either null VCC or, in case where we need to emit all sites, an empty call + if (hasTooManyAlternativeAlleles(vc) || vc.getNSamples() == 0) + return emptyCallContext(tracker,refContext,rawContext); + + final AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model)); + + final OutputAlleleSubset outputAlternativeAlleles = calculateOutputAlleleSubset(AFresult); + + final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0()); + + // note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice + double log10Confidence = + ! outputAlternativeAlleles.siteIsMonomorphic || + configuration.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES || configuration.annotateAllSitesWithPLs + ? AFresult.getLog10PosteriorOfAFEq0() + 0.0 + : AFresult.getLog10PosteriorOfAFGT0() + 0.0; + + + // Add 0.0 removes -0.0 occurrences. + final double phredScaledConfidence = (-10.0 * log10Confidence) + 0.0; + + // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero + if ( !passesEmitThreshold(phredScaledConfidence, outputAlternativeAlleles.siteIsMonomorphic) && !forceSiteEmission()) + // technically, at this point our confidence in a reference call isn't accurately estimated + // because it didn't take into account samples with no data, so let's get a better estimate + return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getModelTheta(model), true, PoFGT0); + + // start constructing the resulting VC + final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc); + final List outputAlleles = outputAlternativeAlleles.outputAlleles(vc.getReference()); + final VariantContextBuilder builder = new VariantContextBuilder(callSourceString(), loc.getContig(), loc.getStart(), loc.getStop(), outputAlleles); + + // Seems that when log10PError is 0.0, you must pass -0.0 to get a nice output at the other end otherwise is a "-0". + // Truth is that this should be fixed in the "variant" dependency code but perhaps it can be ammeneded also in the VariantContextWriter. + //TODO report this bug or make sure our VCFWriters do not output '-0' QUAL column values. + builder.log10PError(log10Confidence == 0.0 ? -0.0 : log10Confidence); + if ( ! passesCallThreshold(phredScaledConfidence) ) + builder.filter(LOW_QUAL_FILTER_NAME); + + // create the genotypes + final GenotypesContext genotypes = afcm.get().subsetAlleles(vc, outputAlleles, true,configuration.samplePloidy); + builder.genotypes(genotypes); + + // *** note that calculating strand bias involves overwriting data structures, so we do that last + final Map attributes = composeCallAttributes(inheritAttributesFromInputVC, vc, rawContext, stratifiedContexts, tracker, refContext, + outputAlternativeAlleles.alternativeAlleleMLECounts(), outputAlternativeAlleles.siteIsMonomorphic, AFresult, outputAlternativeAlleles.outputAlleles(vc.getReference()),genotypes,model,perReadAlleleLikelihoodMap); + + builder.attributes(attributes); + + VariantContext vcCall = builder.make(); + + if ( annotationEngine != null && !limitedContext ) { // limitedContext callers need to handle annotations on their own by calling their own annotationEngine + // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations + final ReadBackedPileup pileup = rawContext.getBasePileup(); + stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); + + vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap); + } + + // if we are subsetting alleles (either because there were too many or because some were not polymorphic) + // then we may need to trim the alleles (because the original VariantContext may have had to pad at the end). + if ( outputAlleles.size() != vc.getAlleles().size() && !limitedContext ) // limitedContext callers need to handle allele trimming on their own to keep their perReadAlleleLikelihoodMap alleles in sync + vcCall = GATKVariantContextUtils.reverseTrimAlleles(vcCall); + + return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0)); + } + + /** + * What string to use as source of variant-context generated by this genotyper-engine. + * @return never {@code null} nor empty. + */ + protected abstract String callSourceString(); + + /** + * Holds information about the alternative allele subsetting based on supporting evidence, genotyping and + * output modes. + */ + private static class OutputAlleleSubset { + private final Allele[] alleles; + private final boolean siteIsMonomorphic; + private final int[] mleCounts; + private final int count; + + @Requires({"count >= 0", "alleles != null", "mleCounts != null", "count <= alleles.length", " count <= mleCounts"}) + private OutputAlleleSubset(final int count, final Allele[] alleles, final int[] mleCounts, final boolean siteIsMonomorphic) { + this.siteIsMonomorphic = siteIsMonomorphic; + this.count = count; + this.alleles = alleles; + this.mleCounts = mleCounts; + } + + private List outputAlleles(final Allele referenceAllele) { + final ArrayList result = new ArrayList<>(count + 1); + result.add(referenceAllele); + for (int i = 0; i < count; i++) + result.add(alleles[i]); + return result; + } + + public List alternativeAlleleMLECounts() { + final List result = new ArrayList<>(count); + for (int i = 0; i < count; i++) + result.add(mleCounts[i]); + return result; + } + } + + + /** + * Provided the exact mode computations it returns the appropiate subset of alleles that progress to genotyping. + * @param afcr the exact model calcualtion result. + * @return never {@code null}. + */ + private OutputAlleleSubset calculateOutputAlleleSubset(final AFCalcResult afcr) { + final List alleles = afcr.getAllelesUsedInGenotyping(); + + final int alternativeAlleleCount = alleles.size() - 1; + Allele[] outputAlleles = new Allele[alternativeAlleleCount]; + int[] mleCounts = new int[alternativeAlleleCount]; + int outputAlleleCount = 0; + boolean siteIsMonomorphic = true; + for (final Allele alternativeAllele : alleles) { + if (alternativeAllele.isReference()) continue; + final boolean isPlausible = afcr.isPolymorphicPhredScaledQual(alternativeAllele, configuration.STANDARD_CONFIDENCE_FOR_EMITTING); + final boolean toOutput = isPlausible || forceKeepAllele(alternativeAllele); + + siteIsMonomorphic &= ! isPlausible; + if (!toOutput) continue; + outputAlleles[outputAlleleCount] = alternativeAllele; + mleCounts[outputAlleleCount++] = afcr.getAlleleCountAtMLE(alternativeAllele); + } + + return new OutputAlleleSubset(outputAlleleCount,outputAlleles,mleCounts,siteIsMonomorphic); + } + + /** + * Checks whether even if the allele is not well supported by the data, we should keep it for genotyping. + * + * @param allele target allele. + * + * @return {@code true} iff we need to keep this alleles even if does not seem plausible. + */ + protected abstract boolean forceKeepAllele(final Allele allele); + + /** + * Checks whether a variant site seems confidently called base on user threshold that the score provided + * by the exact model. + * + * @param conf + * @param PofF + * @return {@true} iff the variant is confidently called. + */ + protected final boolean confidentlyCalled(final double conf, final double PofF) { + return conf >= configuration.STANDARD_CONFIDENCE_FOR_CALLING || + (configuration.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES + && QualityUtils.phredScaleErrorRate(PofF) >= configuration.STANDARD_CONFIDENCE_FOR_CALLING); + } + + /** + * Based in the model used, returns the appropriate heterozygosity argument value. + * @param model genotyping model. + * + * @return a valid heterozygosity in (0,1). + */ + private double getModelTheta(final GenotypeLikelihoodsCalculationModel.Name model) { + switch (model) { + case SNP: + case GENERALPLOIDYSNP: + return configuration.snpHeterozygosity; + case INDEL: + case GENERALPLOIDYINDEL: + return configuration.indelHeterozygosity; + default: + throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model); + } + } + + + /** + * Checks whether the variant context has too many alternative alleles for progress to genotyping the site. + *

+ * AF calculation may get intro trouble with too many alternative alleles. + *

+ * + * @param vc the variant context to evaluate. + * + * @throws NullPointerException if {@code vc} is {@code null}. + * + * @return {@code true} iff there is too many alternative alleles based on + * {@link GenotypeLikelihoods#MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED}. + */ + @Requires("vc != null") + protected final boolean hasTooManyAlternativeAlleles(final VariantContext vc) { + // protect against too many alternate alleles that we can't even run AF on: + if (vc.getNAlleles() <= GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) + return false; + logger.warn("Attempting to genotype more than "+GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED + + " alleles. Site will be skipped at location "+vc.getChr()+":"+vc.getStart()); + return true; + } + + /** + * Produces an empty variant-call context to output when there is no enough data provided to call anything. + * + * @param tracker meta-data tracker. + * @param ref the reference context. + * @param rawContext the read alignment at that location. + * @return it might be null if no enough information is provided to do even an empty call. For example when + * we have limmited-context (i.e. any of the tracker, reference or alignment is {@code null}. + */ + protected final VariantCallContext emptyCallContext(final RefMetaDataTracker tracker, final ReferenceContext ref, + final AlignmentContext rawContext) { + if (tracker == null || ref == null || rawContext == null) + return null; + + if (!forceSiteEmission()) + return null; + + VariantContext vc; + + if ( configuration.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES ) { + final VariantContext ggaVc = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(tracker, + rawContext.getLocation(), false, logger, configuration.alleles); + if (ggaVc == null) + return null; + vc = new VariantContextBuilder(callSourceString(), ref.getLocus().getContig(), ggaVc.getStart(), + ggaVc.getEnd(), ggaVc.getAlleles()).make(); + } else { + // deal with bad/non-standard reference bases + if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) ) + return null; + final Set alleles = new HashSet<>(Collections.singleton(Allele.create(ref.getBase(),true))); + vc = new VariantContextBuilder(callSourceString(), ref.getLocus().getContig(), + ref.getLocus().getStart(), ref.getLocus().getStart(), alleles).make(); + } + + if ( vc != null && annotationEngine != null ) { + // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations + final ReadBackedPileup pileup = rawContext.getBasePileup(); + vc = annotationEngine.annotateContext(tracker, ref, AlignmentContextUtils.splitContextBySampleName(pileup), vc); + } + + return new VariantCallContext(vc, false); + } + + /** + * Indicates whether we have to emit any site no matter what. + *

+ * Note: this has been added to allow differences between UG and HC GGA modes where the latter force emmitions of all given alleles + * sites even if there is no enough confidence. + *

+ * + * @return {@code true} iff we force emissions. + */ + protected abstract boolean forceSiteEmission(); + + protected final VariantCallContext estimateReferenceConfidence(VariantContext vc, Map contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) { + if ( contexts == null ) + return null; + + double log10POfRef = Math.log10(initialPofRef); + + // for each sample that we haven't examined yet + for ( String sample : sampleNames ) { + final AlignmentContext context = contexts.get(sample); + if ( ignoreCoveredSamples && context != null ) + continue; + final int depth = context == null ? 0 : context.getBasePileup().depthOfCoverage(); + log10POfRef += estimateLog10ReferenceConfidenceForOneSample(depth, theta); + } + + return new VariantCallContext(vc, QualityUtils.phredScaleLog10CorrectRate(log10POfRef) >= configuration.STANDARD_CONFIDENCE_FOR_CALLING, false); + } + + protected final double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Name model ) { + switch (model) { + case SNP: + case GENERALPLOIDYSNP: + return log10AlleleFrequencyPriorsSNPs; + case INDEL: + case GENERALPLOIDYINDEL: + return log10AlleleFrequencyPriorsIndels; + default: + throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model); + } + } + + /** + * Compute the log10 probability of a sample with sequencing depth and no alt allele is actually truly homozygous reference + * + * Assumes the sample is diploid + * + * @param depth the depth of the sample + * @param theta the heterozygosity of this species (between 0 and 1) + * @return a valid log10 probability of the sample being hom-ref + */ + @Requires({"depth >= 0", "theta >= 0.0 && theta <= 1.0"}) + @Ensures("MathUtils.goodLog10Probability(result)") + protected final double estimateLog10ReferenceConfidenceForOneSample(final int depth, final double theta) { + final double log10PofNonRef = Math.log10(theta / 2.0) + getRefBinomialProbLog10(depth); + return MathUtils.log10OneMinusX(Math.pow(10.0, log10PofNonRef)); + } + + protected final double getRefBinomialProbLog10(final int depth) { + return MathUtils.log10BinomialProbability(depth, 0); + } + + /** + * Function that fills vector with allele frequency priors. By default, infinite-sites, neutral variation prior is used, + * where Pr(AC=i) = theta/i where theta is heterozygosity + * @param N Number of chromosomes + * @param heterozygosity default heterozygosity to use, if inputPriors is empty + * @param inputPriors Input priors to use (in which case heterozygosity is ignored) + * + * @return never {@code null}. + */ + public static double[] computeAlleleFrequencyPriors(final int N, final double heterozygosity, final List inputPriors) { + + final double[] priors = new double[N + 1]; + double sum = 0.0; + + if (!inputPriors.isEmpty()) { + // user-specified priors + if (inputPriors.size() != N) + throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 "); + + int idx = 1; + for (final double prior: inputPriors) { + if (prior < 0.0) + throw new UserException.BadArgumentValue("Bad argument: negative values not allowed","inputPrior"); + priors[idx++] = Math.log10(prior); + sum += prior; + } + } + else + // for each i + for (int i = 1; i <= N; i++) { + final double value = heterozygosity / (double)i; + priors[i] = Math.log10(value); + sum += value; + } + + // protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics assumptions) + if (sum > 1.0) + throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed, or invalid values specified if input priors were provided - try reducing heterozygosity value or correct input priors."); + // null frequency for AF=0 is (1 - sum(all other frequencies)) + priors[0] = Math.log10(1.0 - sum); + return priors; + } + + /** + * Function that fills vector with allele frequency priors. By default, infinite-sites, neutral variation prior is used, + * where Pr(AC=i) = theta/i where theta is heterozygosity + * @param N Number of chromosomes + * @param priors (output) array to be filled with priors + * @param heterozygosity default heterozygosity to use, if inputPriors is empty + * @param inputPriors Input priors to use (in which case heterozygosity is ignored) + */ + public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double heterozygosity, final List inputPriors) { + + + double sum = 0.0; + + if (!inputPriors.isEmpty()) { + // user-specified priors + if (inputPriors.size() != N) + throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 "); + + int idx = 1; + for (final double prior: inputPriors) { + if (prior < 0.0) + throw new UserException.BadArgumentValue("Bad argument: negative values not allowed","inputPrior"); + priors[idx++] = Math.log10(prior); + sum += prior; + } + } + else { + // for each i + for (int i = 1; i <= N; i++) { + final double value = heterozygosity / (double)i; + priors[i] = Math.log10(value); + sum += value; + } + } + + // protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics assumptions) + if (sum > 1.0) { + throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed, or invalid values specified if input priors were provided - try reducing heterozygosity value or correct input priors."); + } + // null frequency for AF=0 is (1 - sum(all other frequencies)) + priors[0] = Math.log10(1.0 - sum); + } + + + protected final boolean passesEmitThreshold(double conf, boolean bestGuessIsRef) { + return (configuration.outputMode == OutputMode.EMIT_ALL_CONFIDENT_SITES || !bestGuessIsRef) && + conf >= Math.min(configuration.STANDARD_CONFIDENCE_FOR_CALLING, + configuration.STANDARD_CONFIDENCE_FOR_EMITTING); + } + + protected final boolean passesCallThreshold(double conf) { + return conf >= configuration.STANDARD_CONFIDENCE_FOR_CALLING; + } + + protected Map composeCallAttributes(final boolean inheritAttributesFromInputVC, final VariantContext vc, + final AlignmentContext rawContext, final Map stratifiedContexts, final RefMetaDataTracker tracker, final ReferenceContext refContext, final List alleleCountsofMLE, final boolean bestGuessIsRef, + final AFCalcResult AFresult, final List allAllelesToUse, final GenotypesContext genotypes, + final GenotypeLikelihoodsCalculationModel.Name model, final Map perReadAlleleLikelihoodMap) { + final HashMap attributes = new HashMap<>(); + + + final boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; + + + // inherit attributed from input vc if requested + if (inheritAttributesFromInputVC) + attributes.putAll(vc.getAttributes()); + // if the site was downsampled, record that fact + if ( !limitedContext && rawContext.hasPileupBeenDownsampled() ) + attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); + + + // add the MLE AC and AF annotations + if ( alleleCountsofMLE.size() > 0 ) { + attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE); + int AN = 0; + for (final Genotype g : genotypes) { + for (final Allele a : g.getAlleles()) + if (!a.isNoCall()) AN++; + } + final ArrayList MLEfrequencies = new ArrayList(alleleCountsofMLE.size()); + // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT is ./. but the exact model may arbitrarily choose an AC>1) + for ( int AC : alleleCountsofMLE ) + MLEfrequencies.add(Math.min(1.0, (double)AC / (double)AN)); + attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies); + } + + return attributes; + } + +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypingMode.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypingMode.java new file mode 100644 index 000000000..f6fc87c5a --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypingMode.java @@ -0,0 +1,68 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +/** + * Enumeration of possible genotyping modes. + * + *

+ * A genotyping mode represents the general approach taken when detecting and calculate variant genotypes. + *

+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public enum GenotypingMode { + + /** + * The genotyper will choose the most likely alternate allele + */ + DISCOVERY, + + /** + * Only the alleles passed by the user should be considered. + */ + GENOTYPE_GIVEN_ALLELES +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index ae2ea427b..4a0bf371e 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -215,7 +215,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood final boolean ignoreSNPAllelesWhenGenotypingIndels) { List alleles = new ArrayList(); - if (UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) { + if (UAC.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES) { VariantContext vc = null; for (final VariantContext vc_input : tracker.getValues(UAC.alleles, ref.getLocus())) { if (vc_input != null && diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/OutputMode.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/OutputMode.java new file mode 100644 index 000000000..72918e914 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/OutputMode.java @@ -0,0 +1,62 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +/** +* TODO document this. +* +* @author Valentin Ruano-Rubio <valentin@broadinstitute.org> +*/ +public enum OutputMode { + /** produces calls only at variant sites */ + EMIT_VARIANTS_ONLY, + /** produces calls at variant sites and confident reference sites */ + EMIT_ALL_CONFIDENT_SITES, + /** produces calls at any callable site regardless of confidence; this argument is intended only for point + * mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by + * no means produce a comprehensive set of indels in DISCOVERY mode */ + EMIT_ALL_SITES +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index f94baf09f..5ec3ac494 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -59,6 +59,7 @@ import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.genotyper.DiploidGenotype; +import org.broadinstitute.sting.utils.gga.GenotypingGivenAllelesUtils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; @@ -76,7 +77,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); - useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; + useAlleleFromVCF = UAC.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES; perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); } @@ -127,7 +128,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC if ( allAllelesToUse != null ) { alleles.addAll(allAllelesToUse.subList(1,allAllelesToUse.size())); // this includes ref allele } else if ( useAlleleFromVCF ) { - final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles); + final VariantContext vc = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(tracker, ref.getLocus(), true, logger, UAC.alleles); // ignore places where we don't have a SNP if ( vc == null || !vc.isSNP() ) @@ -145,7 +146,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC // if there are no non-ref alleles... if ( alleles.size() == 1 ) { // if we only want variants, then we don't need to calculate genotype likelihoods - if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY ) + if ( UAC.outputMode == OutputMode.EMIT_VARIANTS_ONLY ) return builder.make(); else @@ -193,7 +194,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC gb.PL(genotypeLikelihoods); gb.DP(sampleData.depth); if (UAC.annotateAllSitesWithPLs) - gb.attribute(UnifiedGenotyperEngine.PL_FOR_ALL_SNP_ALLELES_KEY,GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(allLikelihoods, false, true))); + gb.attribute(UnifiedGenotypingEngine.PL_FOR_ALL_SNP_ALLELES_KEY,GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(allLikelihoods, false, true))); genotypes.add(gb.make()); } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 04c5587c3..64960200a 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -49,16 +49,12 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; import org.broadinstitute.sting.utils.pairhmm.PairHMM; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.VariantContext; -import java.util.Collections; -import java.util.List; - public class UnifiedArgumentCollection extends StandardCallerArgumentCollection { @Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false) - public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; + public GenotypeLikelihoodsCalculationModel.Name GLmodel = GenotypeLikelihoodsCalculationModel.Name.SNP; /** * The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily @@ -102,18 +98,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable", required = false) public Double MAX_DELETION_FRACTION = 0.05; - /** - * Advanced, experimental argument: if SNP likelihood model is specified, and if EMIT_ALL_SITES output mode is set, when we set this argument then we will also emit PLs at all sites. - * This will give a measure of reference confidence and a measure of which alt alleles are more plausible (if any). - * WARNINGS: - * - This feature will inflate VCF file size considerably. - * - All SNP ALT alleles will be emitted with corresponding 10 PL values. - * - An error will be emitted if EMIT_ALL_SITES is not set, or if anything other than diploid SNP model is used - */ - @Advanced - @Argument(fullName = "allSitePLs", shortName = "allSitePLs", doc = "Annotate all sites with PLs", required = false) - public boolean annotateAllSitesWithPLs = false; - // indel-related arguments /** * A candidate indel is genotyped (and potentially called) if there are this number of reads with a consensus indel at a site. @@ -184,13 +168,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(shortName="refsample", fullName="reference_sample_name", doc="Reference sample name.", required=false) String referenceSampleName; - /* - Sample ploidy - equivalent to number of chromosomes per pool. In pooled experiments this should be = # of samples in pool * individual sample ploidy - */ - @Argument(shortName="ploidy", fullName="sample_ploidy", doc="Plody (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false) - public int samplePloidy = GATKVariantContextUtils.DEFAULT_PLOIDY; - - /** * The following argument are for debug-only tweaks when running generalized ploidy with a reference sample */ @@ -218,9 +195,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false) boolean EXCLUDE_FILTERED_REFERENCE_SITES = false; - @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false) - public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; - /** * Create a new UAC with defaults for all UAC arguments */ @@ -228,51 +202,8 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection super(); } - /** - * Create a new UAC based on the information only our in super-class scac and defaults for all UAC arguments - * @param scac - */ - public UnifiedArgumentCollection(final StandardCallerArgumentCollection scac) { - super(scac); - } - - /** - * Create a new UAC with all parameters having the values in uac - * - * @param uac - */ - public UnifiedArgumentCollection(final UnifiedArgumentCollection uac) { - // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! - super(uac); - - this.GLmodel = uac.GLmodel; - this.AFmodel = uac.AFmodel; - this.PCR_error = uac.PCR_error; - this.COMPUTE_SLOD = uac.COMPUTE_SLOD; - this.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED; - this.MIN_BASE_QUALTY_SCORE = uac.MIN_BASE_QUALTY_SCORE; - this.MAX_DELETION_FRACTION = uac.MAX_DELETION_FRACTION; - this.MIN_INDEL_COUNT_FOR_GENOTYPING = uac.MIN_INDEL_COUNT_FOR_GENOTYPING; - this.MIN_INDEL_FRACTION_PER_SAMPLE = uac.MIN_INDEL_FRACTION_PER_SAMPLE; - this.INDEL_GAP_OPEN_PENALTY = uac.INDEL_GAP_OPEN_PENALTY; - this.INDEL_GAP_CONTINUATION_PENALTY = uac.INDEL_GAP_CONTINUATION_PENALTY; - this.OUTPUT_DEBUG_INDEL_INFO = uac.OUTPUT_DEBUG_INDEL_INFO; - this.INDEL_HAPLOTYPE_SIZE = uac.INDEL_HAPLOTYPE_SIZE; - this.TREAT_ALL_READS_AS_SINGLE_POOL = uac.TREAT_ALL_READS_AS_SINGLE_POOL; - this.referenceSampleRod = uac.referenceSampleRod; - this.referenceSampleName = uac.referenceSampleName; - this.samplePloidy = uac.samplePloidy; - this.maxQualityScore = uac.minQualityScore; - this.phredScaledPrior = uac.phredScaledPrior; - this.minPower = uac.minPower; - this.minReferenceDepth = uac.minReferenceDepth; - this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES; - this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO; - this.pairHMM = uac.pairHMM; - this.OutputMode = uac.OutputMode; - - this.annotateAllSitesWithPLs = uac.annotateAllSitesWithPLs; - // todo- arguments to remove - this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES; + @Override + public UnifiedArgumentCollection clone() { + return (UnifiedArgumentCollection) super.clone(); } } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 17d0217f0..227fd3f20 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -216,7 +216,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif protected String[] annotationClassesToUse = { "Standard" }; // the calculation arguments - private UnifiedGenotyperEngine UG_engine = null; + private UnifiedGenotypingEngine genotypingEngine = null; // the annotation engine private VariantAnnotatorEngine annotationEngine; @@ -273,9 +273,9 @@ public class UnifiedGenotyper extends LocusWalker, Unif throw new UserException.BadArgumentValue("max_alternate_alleles", "the maximum possible value is " + GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED); // warn the user for misusing EMIT_ALL_SITES - if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES && - UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY && - UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP ) + if ( UAC.outputMode == OutputMode.EMIT_ALL_SITES && + UAC.genotypingMode == GenotypingMode.DISCOVERY && + UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Name.SNP ) logger.warn("WARNING: note that the EMIT_ALL_SITES option is intended only for point mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by no means produce a comprehensive set of indels in DISCOVERY mode"); // initialize the verbose writer @@ -283,7 +283,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tMLE\tMAP"); annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples, UAC.samplePloidy); + genotypingEngine = new UnifiedGenotypingEngine(getToolkit(), UAC, annotationEngine,samples, verboseWriter); // initialize the header Set headerInfo = getHeaderInfo(UAC, annotationEngine, dbsnp); @@ -318,7 +318,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true, VCFConstants.STRAND_BIAS_KEY); if ( UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED ) - headerInfo.add(new VCFInfoHeaderLine(UnifiedGenotyperEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, 1, VCFHeaderLineType.Integer, "Number of alternate alleles discovered (but not necessarily genotyped) at this site")); + headerInfo.add(new VCFInfoHeaderLine(UnifiedGenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, 1, VCFHeaderLineType.Integer, "Number of alternate alleles discovered (but not necessarily genotyped) at this site")); // add the pool values for each genotype if (UAC.samplePloidy != GATKVariantContextUtils.DEFAULT_PLOIDY) { @@ -330,7 +330,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif } if (UAC.annotateAllSitesWithPLs) { - headerInfo.add(new VCFFormatHeaderLine(UnifiedGenotyperEngine.PL_FOR_ALL_SNP_ALLELES_KEY, 10, VCFHeaderLineType.Integer, "Phred-scaled genotype likelihoods for all 4 possible bases regardless of whether there is statistical evidence for them. Ordering is always PL for AA AC CC GA GC GG TA TC TG TT.")); + headerInfo.add(new VCFFormatHeaderLine(UnifiedGenotypingEngine.PL_FOR_ALL_SNP_ALLELES_KEY, 10, VCFHeaderLineType.Integer, "Phred-scaled genotype likelihoods for all 4 possible bases regardless of whether there is statistical evidence for them. Ordering is always PL for AA AC CC GA GC GG TA TC TG TT.")); } VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true, VCFConstants.DOWNSAMPLED_KEY, @@ -350,7 +350,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif // FILTER fields are added unconditionally as it's not always 100% certain the circumstances // where the filters are used. For example, in emitting all sites the lowQual field is used - headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality")); + headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotypingEngine.LOW_QUAL_FILTER_NAME, "Low quality")); return headerInfo; } @@ -364,7 +364,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif * @return the VariantCallContext object */ public List map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { - return UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext, samples); + return genotypingEngine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext); } public UGStatistics reduceInit() { return new UGStatistics(); } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java deleted file mode 100644 index c5070a76f..000000000 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ /dev/null @@ -1,844 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalc; -import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; -import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcResult; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.variant.variantcontext.*; - -import java.io.PrintStream; -import java.lang.reflect.Constructor; -import java.util.*; - -public class UnifiedGenotyperEngine { - public static final String LOW_QUAL_FILTER_NAME = "LowQual"; - private static final String GPSTRING = "GENERALPLOIDY"; - - public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA"; - public static final String PL_FOR_ALL_SNP_ALLELES_KEY = "APL"; - - public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3; - public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4; - - private static final int SNP_MODEL = 0; - private static final int INDEL_MODEL = 1; - - public enum OUTPUT_MODE { - /** produces calls only at variant sites */ - EMIT_VARIANTS_ONLY, - /** produces calls at variant sites and confident reference sites */ - EMIT_ALL_CONFIDENT_SITES, - /** produces calls at any callable site regardless of confidence; this argument is intended only for point - * mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by - * no means produce a comprehensive set of indels in DISCOVERY mode */ - EMIT_ALL_SITES - } - - // the unified argument collection - private final UnifiedArgumentCollection UAC; - public UnifiedArgumentCollection getUAC() { return UAC; } - - // the annotation engine - private final VariantAnnotatorEngine annotationEngine; - - // the model used for calculating genotypes - private ThreadLocal> glcm = new ThreadLocal>(); - private final List modelsToUse = new ArrayList(2); - - // the model used for calculating p(non-ref) - private ThreadLocal afcm = new ThreadLocal(); - - // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything - private final double[] log10AlleleFrequencyPriorsSNPs; - private final double[] log10AlleleFrequencyPriorsIndels; - - // samples in input - private final Set samples; - - // the various loggers and writers - private final Logger logger; - private final PrintStream verboseWriter; - - // number of chromosomes (ploidy * samples) in input - private final int ploidy; - private final int N; - - // the standard filter to use for calls below the confidence threshold but above the emit threshold - private static final Set filter = new HashSet(1); - - private final GenomeLocParser genomeLocParser; - private final boolean BAQEnabledOnCMDLine; - - // --------------------------------------------------------------------------------------------------------- - // - // Public interface functions - // - // --------------------------------------------------------------------------------------------------------- - @Requires({"toolkit != null", "UAC != null"}) - public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) { - this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()), GATKVariantContextUtils.DEFAULT_PLOIDY); - } - - protected UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, Set samples, UnifiedArgumentCollection UAC) { - this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); - } - - @Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0","ploidy>0"}) - public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set samples, int ploidy) { - this.BAQEnabledOnCMDLine = toolkit.getArguments().BAQMode != BAQ.CalculationMode.OFF; - genomeLocParser = toolkit.getGenomeLocParser(); - this.samples = new TreeSet(samples); - // note that, because we cap the base quality by the mapping quality, minMQ cannot be less than minBQ - this.UAC = UAC; - - this.logger = logger; - this.verboseWriter = verboseWriter; - this.annotationEngine = engine; - - this.ploidy = ploidy; - this.N = samples.size() * ploidy; - log10AlleleFrequencyPriorsSNPs = new double[N+1]; - log10AlleleFrequencyPriorsIndels = new double[N+1]; - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity,UAC.inputPrior); - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.inputPrior); - - filter.add(LOW_QUAL_FILTER_NAME); - - determineGLModelsToUse(); - - // do argument checking - if (UAC.annotateAllSitesWithPLs) { - if (!modelsToUse.contains(GenotypeLikelihoodsCalculationModel.Model.SNP)) - throw new IllegalArgumentException("Invalid genotype likelihood model specification: Only diploid SNP model can be used in conjunction with option allSitePLs"); - - } - } - - /** - * @see #calculateLikelihoodsAndGenotypes(org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker, org.broadinstitute.sting.gatk.contexts.ReferenceContext, org.broadinstitute.sting.gatk.contexts.AlignmentContext, java.util.Set) - * - * same as the full call but with allSamples == null - * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @return the VariantCallContext object - */ - public List calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final AlignmentContext rawContext) { - return calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext, null); - } - - - /** - * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper. - * - * If allSamples != null, then the output variantCallContext is guarenteed to contain a genotype - * for every sample in allSamples. If it's null there's no such guarentee. Providing this - * argument is critical when the resulting calls will be written to a VCF file. - * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @param allSamples set of all sample names that we might call (i.e., those in the VCF header) - * @return the VariantCallContext object - */ - public List calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final AlignmentContext rawContext, - final Set allSamples) { - final List results = new ArrayList(2); - - final List models = getGLModelsToUse(tracker, refContext, rawContext); - - final Map perReadAlleleLikelihoodMap = new HashMap(); - - if ( models.isEmpty() ) { - results.add(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? generateEmptyContext(tracker, refContext, null, rawContext) : null); - } - else { - for ( final GenotypeLikelihoodsCalculationModel.Model model : models ) { - perReadAlleleLikelihoodMap.clear(); - final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); - if ( stratifiedContexts == null ) { - results.add(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? generateEmptyContext(tracker, refContext, null, rawContext) : null); - } - else { - final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); - if ( vc != null ) - results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap)); -// todo - uncomment if we want to also emit a null ref call (with no QUAL) if there's no evidence for REF and if EMIT_ALL_SITES is set -// else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) -// results.add(generateEmptyContext(tracker, refContext, null, rawContext)); - - } - } - } - - return results; - } - - /** - * Compute GLs at a given locus. Entry point for engine calls from UGCalcLikelihoods. - * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @param perReadAlleleLikelihoodMap Map to store per-sample, per-read, per-allele likelihoods (only used for indels) - * @return the VariantContext object - */ - public VariantContext calculateLikelihoods(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final AlignmentContext rawContext, - final Map perReadAlleleLikelihoodMap) { - final List models = getGLModelsToUse(tracker, refContext, rawContext); - if ( models.isEmpty() ) { - return null; - } - - for ( final GenotypeLikelihoodsCalculationModel.Model model : models ) { - final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); - // return the first valid one we encounter - if ( stratifiedContexts != null ) - return calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); - - } - - return null; - } - - /** - * Compute genotypes at a given locus. Entry point for engine calls from UGCallVariants. - * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @param vc the GL-annotated variant context - * @return the VariantCallContext object - */ - public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final AlignmentContext rawContext, - final VariantContext vc) { - final List models = getGLModelsToUse(tracker, refContext, rawContext); - if ( models.isEmpty() ) { - return null; - } - - // return the first one - final GenotypeLikelihoodsCalculationModel.Model model = models.get(0); - final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); - return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, null); - } - - /** - * Compute genotypes at a given locus. - * - * @param vc the GL-annotated variant context - * @return the VariantCallContext object - */ - public VariantCallContext calculateGenotypes(VariantContext vc) { - return calculateGenotypes(null, null, null, null, vc, GenotypeLikelihoodsCalculationModel.Model.valueOf("SNP"), null); - } - - - // --------------------------------------------------------------------------------------------------------- - // - // Private implementation helpers - // - // --------------------------------------------------------------------------------------------------------- - - // private method called by both UnifiedGenotyper and UGCalcLikelihoods entry points into the engine - private VariantContext calculateLikelihoods(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final Map stratifiedContexts, - final AlignmentContextUtils.ReadOrientation type, - final List alternateAllelesToUse, - final boolean useBAQedPileup, - final GenotypeLikelihoodsCalculationModel.Model model, - final Map perReadAlleleLikelihoodMap) { - - // initialize the data for this thread if that hasn't been done yet - if ( glcm.get() == null ) { - glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); - } - - return glcm.get().get(model.name()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser, perReadAlleleLikelihoodMap); - } - - private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, AlignmentContext rawContext) { - VariantContext vc; - if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles); - if ( vcInput == null ) - return null; - vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles()).make(); - } else { - // deal with bad/non-standard reference bases - if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) ) - return null; - - Set alleles = new HashSet(); - alleles.add(Allele.create(ref.getBase(), true)); - vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), alleles).make(); - } - - if ( annotationEngine != null ) { - // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations - final ReadBackedPileup pileup = rawContext.getBasePileup(); - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); - - vc = annotationEngine.annotateContext(tracker, ref, stratifiedContexts, vc); - } - - return new VariantCallContext(vc, false); - } - - public VariantCallContext calculateGenotypes(final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, final Map perReadAlleleLikelihoodMap) { - return calculateGenotypes(null, null, null, null, vc, model, perReadAlleleLikelihoodMap); - } - - public VariantCallContext calculateGenotypes(final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { - return calculateGenotypes(null, null, null, null, vc, model, null); - } - - public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final AlignmentContext rawContext, - final Map stratifiedContexts, - final VariantContext vc, - final GenotypeLikelihoodsCalculationModel.Model model, - final Map perReadAlleleLikelihoodMap) { - return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false, perReadAlleleLikelihoodMap); - } - - /** - * Main entry function to calculate genotypes of a given VC with corresponding GL's - * @param tracker Tracker - * @param refContext Reference context - * @param rawContext Raw context - * @param stratifiedContexts Stratified alignment contexts - * @param vc Input VC - * @param model GL calculation model - * @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc - * @return VC with assigned genotypes - */ - public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, final ReferenceContext refContext, - final AlignmentContext rawContext, Map stratifiedContexts, - final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, - final boolean inheritAttributesFromInputVC, - final Map perReadAlleleLikelihoodMap) { - - boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; - - // TODO TODO TODO TODO - // REFACTOR THIS FUNCTION, TOO UNWIELDY!! - - // initialize the data for this thread if that hasn't been done yet - if ( afcm.get() == null ) { - afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger)); - } - - // if input VC can't be genotyped, exit with either null VCC or, in case where we need to emit all sites, an empty call - if (!canVCbeGenotyped(vc)) { - if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && !limitedContext) - return generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext); - else - return null; - - } - - // estimate our confidence in a reference call and return - if ( vc.getNSamples() == 0 ) { - if ( limitedContext ) - return null; - return (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ? - estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), false, 1.0) : - generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext)); - } - - final AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model)); - - // is the most likely frequency conformation AC=0 for all alternate alleles? - boolean bestGuessIsRef = true; - - // determine which alternate alleles have AF>0 - final List myAlleles = new ArrayList<>(vc.getAlleles().size()); - final List alleleCountsofMLE = new ArrayList<>(vc.getAlleles().size()); - myAlleles.add(vc.getReference()); - for ( int i = 0; i < AFresult.getAllelesUsedInGenotyping().size(); i++ ) { - final Allele alternateAllele = AFresult.getAllelesUsedInGenotyping().get(i); - if ( alternateAllele.isReference() ) - continue; - - // Compute if the site is considered polymorphic with sufficient confidence relative to our - // phred-scaled emission QUAL - final boolean isNonRef = AFresult.isPolymorphicPhredScaledQual(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); - final boolean toInclude = isNonRef || alternateAllele == GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE || - UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || - UAC.annotateAllSitesWithPLs; - - bestGuessIsRef &= !isNonRef; - - if (toInclude) { - myAlleles.add(alternateAllele); - alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele)); - } - } - - final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0()); - - // note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice - final double phredScaledConfidence = - Math.abs(! bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || UAC.annotateAllSitesWithPLs - ? -10 * AFresult.getLog10PosteriorOfAFEq0() - : -10 * AFresult.getLog10PosteriorOfAFGT0()); - - // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero - if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) { - // technically, at this point our confidence in a reference call isn't accurately estimated - // because it didn't take into account samples with no data, so let's get a better estimate - return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0); - } - - // start constructing the resulting VC - final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc); - final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles); - builder.log10PError(phredScaledConfidence/-10.0); - if ( ! passesCallThreshold(phredScaledConfidence) ) - builder.filters(filter); - - // create the genotypes - final GenotypesContext genotypes = afcm.get().subsetAlleles(vc, myAlleles, true,ploidy); - builder.genotypes(genotypes); - - // print out stats if we have a writer - if ( verboseWriter != null && !limitedContext ) - printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model); - - // *** note that calculating strand bias involves overwriting data structures, so we do that last - final HashMap attributes = new HashMap(); - - // inherit attributed from input vc if requested - if (inheritAttributesFromInputVC) - attributes.putAll(vc.getAttributes()); - // if the site was downsampled, record that fact - if ( !limitedContext && rawContext.hasPileupBeenDownsampled() ) - attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); - - if ( UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED ) - attributes.put(NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size()); - - // add the MLE AC and AF annotations - if ( alleleCountsofMLE.size() > 0 ) { - attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE); - final int AN = builder.make().getCalledChrCount(); - final ArrayList MLEfrequencies = new ArrayList(alleleCountsofMLE.size()); - // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT is ./. but the exact model may arbitrarily choose an AC>1) - for ( int AC : alleleCountsofMLE ) - MLEfrequencies.add(Math.min(1.0, (double)AC / (double)AN)); - attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies); - } - - if ( UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef ) { - //final boolean DEBUG_SLOD = false; - - // the overall lod - //double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; - final double overallLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); - //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); - - final List allAllelesToUse = builder.make().getAlleles(); - - // the forward lod - final VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); - final AFCalcResult forwardAFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model)); - //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); - final double forwardLog10PofNull = forwardAFresult.getLog10LikelihoodOfAFEq0(); - final double forwardLog10PofF = forwardAFresult.getLog10LikelihoodOfAFGT0(); - //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); - - // the reverse lod - final VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); - final AFCalcResult reverseAFresult = afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model)); - //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); - final double reverseLog10PofNull = reverseAFresult.getLog10LikelihoodOfAFEq0(); - final double reverseLog10PofF = reverseAFresult.getLog10LikelihoodOfAFGT0(); - //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); - - final double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; - final double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF; - //if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); - - // strand score is max bias between forward and reverse strands - double strandScore = Math.max(forwardLod, reverseLod); - // rescale by a factor of 10 - strandScore *= 10.0; - //logger.debug(String.format("SLOD=%f", strandScore)); - - if ( !Double.isNaN(strandScore) ) - attributes.put("SB", strandScore); - } - - // finish constructing the resulting VC - builder.attributes(attributes); - VariantContext vcCall = builder.make(); - - if ( annotationEngine != null && !limitedContext ) { // limitedContext callers need to handle annotations on their own by calling their own annotationEngine - // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations - final ReadBackedPileup pileup = rawContext.getBasePileup(); - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); - - vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap); - } - - // if we are subsetting alleles (either because there were too many or because some were not polymorphic) - // then we may need to trim the alleles (because the original VariantContext may have had to pad at the end). - if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // limitedContext callers need to handle allele trimming on their own to keep their perReadAlleleLikelihoodMap alleles in sync - vcCall = GATKVariantContextUtils.reverseTrimAlleles(vcCall); - - return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0)); - } - - /** - * Determine whether input VC to calculateGenotypes() can be genotyped and AF can be computed. - * @param vc Input VC - * @return Status check - */ - @Requires("vc != null") - protected boolean canVCbeGenotyped(final VariantContext vc) { - // protect against too many alternate alleles that we can't even run AF on: - if (vc.getNAlleles()> GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) { - logger.warn("Attempting to genotype more than "+GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED + - " alleles. Site will be skipped at location "+vc.getChr()+":"+vc.getStart()); - return false; - } - else return true; - - } - - private Map getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) { - - if ( !BaseUtils.isRegularBase(refContext.getBase()) ) - return null; - - Map stratifiedContexts = null; - - if ( model.name().contains("INDEL") ) { - - final ReadBackedPileup pileup = rawContext.getBasePileup().getMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE); - // don't call when there is no coverage - if ( pileup.getNumberOfElements() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ) - return null; - - // stratify the AlignmentContext and cut by sample - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); - - } else if ( model.name().contains("SNP") ) { - - // stratify the AlignmentContext and cut by sample - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(rawContext.getBasePileup()); - - if ( !(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) ) { - int numDeletions = 0; - for ( final PileupElement p : rawContext.getBasePileup() ) { - if ( p.isDeletion() ) - numDeletions++; - } - if ( ((double) numDeletions) / ((double) rawContext.getBasePileup().depthOfCoverage()) > UAC.MAX_DELETION_FRACTION ) { - return null; - } - } - } - - return stratifiedContexts; - } - - private final double getRefBinomialProbLog10(final int depth) { - return MathUtils.log10BinomialProbability(depth, 0); - } - - private VariantCallContext estimateReferenceConfidence(VariantContext vc, Map contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) { - if ( contexts == null ) - return null; - - double log10POfRef = Math.log10(initialPofRef); - - // for each sample that we haven't examined yet - for ( String sample : samples ) { - final AlignmentContext context = contexts.get(sample); - if ( ignoreCoveredSamples && context != null ) - continue; - final int depth = context == null ? 0 : context.getBasePileup().depthOfCoverage(); - log10POfRef += estimateLog10ReferenceConfidenceForOneSample(depth, theta); - } - - return new VariantCallContext(vc, QualityUtils.phredScaleLog10CorrectRate(log10POfRef) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING, false); - } - - /** - * Compute the log10 probability of a sample with sequencing depth and no alt allele is actually truly homozygous reference - * - * Assumes the sample is diploid - * - * @param depth the depth of the sample - * @param theta the heterozygosity of this species (between 0 and 1) - * @return a valid log10 probability of the sample being hom-ref - */ - @Requires({"depth >= 0", "theta >= 0.0 && theta <= 1.0"}) - @Ensures("MathUtils.goodLog10Probability(result)") - protected double estimateLog10ReferenceConfidenceForOneSample(final int depth, final double theta) { - final double log10PofNonRef = Math.log10(theta / 2.0) + getRefBinomialProbLog10(depth); - return MathUtils.log10OneMinusX(Math.pow(10.0, log10PofNonRef)); - } - - protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, final GenotypeLikelihoodsCalculationModel.Model model) { - Allele refAllele = null, altAllele = null; - for ( Allele allele : vc.getAlleles() ) { - if ( allele.isReference() ) - refAllele = allele; - else - altAllele = allele; - } - - for (int i = 0; i <= N; i++) { - StringBuilder AFline = new StringBuilder("AFINFO\t"); - AFline.append(pos); - AFline.append("\t"); - AFline.append(refAllele); - AFline.append("\t"); - if ( altAllele != null ) - AFline.append(altAllele); - else - AFline.append("N/A"); - AFline.append("\t"); - AFline.append(i + "/" + N + "\t"); - AFline.append(String.format("%.2f\t", ((float)i)/N)); - AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i])); - verboseWriter.println(AFline.toString()); - } - - verboseWriter.println("P(f>0) = " + PofF); - verboseWriter.println("Qscore = " + phredScaledConfidence); - verboseWriter.println(); - } - - protected boolean passesEmitThreshold(double conf, boolean bestGuessIsRef) { - return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_CONFIDENT_SITES || !bestGuessIsRef) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); - } - - protected boolean passesCallThreshold(double conf) { - return conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING; - } - - protected boolean confidentlyCalled(double conf, double PofF) { - return conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING || - (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && QualityUtils.phredScaleErrorRate(PofF) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING); - } - - private void determineGLModelsToUse() { - String modelPrefix = ""; - if ( !UAC.GLmodel.name().contains(GPSTRING) && UAC.samplePloidy != GATKVariantContextUtils.DEFAULT_PLOIDY ) - modelPrefix = GPSTRING; - - // GGA mode => must initialize both the SNP and indel models - if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || - UAC.GLmodel.name().toUpperCase().contains("BOTH") ) { - modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP")); - modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL")); - } - else { - modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+UAC.GLmodel.name().toUpperCase())); - } - } - - // decide whether we are currently processing SNPs, indels, neither, or both - private List getGLModelsToUse(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final AlignmentContext rawContext) { - if ( UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) - return modelsToUse; - - if ( modelsToUse.size() != 2 ) - throw new IllegalStateException("GGA mode assumes that we have initialized both the SNP and indel models but found " + modelsToUse); - - // if we're genotyping given alleles then we need to choose the model corresponding to the variant type requested - final VariantContext vcInput = getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles); - - if ( vcInput == null ) { - return Collections.emptyList(); // no work to be done - } else if ( vcInput.isSNP() ) { - return Collections.singletonList(modelsToUse.get(SNP_MODEL)); - } else if ( vcInput.isIndel() || vcInput.isMixed() ) { - return Collections.singletonList(modelsToUse.get(INDEL_MODEL)); - } else { - return Collections.emptyList(); // No support for other types yet - } - } - - /** - * Function that fills vector with allele frequency priors. By default, infinite-sites, neutral variation prior is used, - * where Pr(AC=i) = theta/i where theta is heterozygosity - * @param N Number of chromosomes - * @param priors (output) array to be filled with priors - * @param heterozygosity default heterozygosity to use, if inputPriors is empty - * @param inputPriors Input priors to use (in which case heterozygosity is ignored) - */ - public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double heterozygosity, final List inputPriors) { - - - double sum = 0.0; - - if (!inputPriors.isEmpty()) { - // user-specified priors - if (inputPriors.size() != N) - throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 "); - - int idx = 1; - for (final double prior: inputPriors) { - if (prior < 0.0) - throw new UserException.BadArgumentValue("Bad argument: negative values not allowed","inputPrior"); - priors[idx++] = Math.log10(prior); - sum += prior; - } - } - else { - // for each i - for (int i = 1; i <= N; i++) { - final double value = heterozygosity / (double)i; - priors[i] = Math.log10(value); - sum += value; - } - } - - // protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics assumptions) - if (sum > 1.0) { - throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed, or invalid values specified if input priors were provided - try reducing heterozygosity value or correct input priors."); - } - // null frequency for AF=0 is (1 - sum(all other frequencies)) - priors[0] = Math.log10(1.0 - sum); - } - - protected double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { - if (model.name().toUpperCase().contains("SNP")) - return log10AlleleFrequencyPriorsSNPs; - else if (model.name().toUpperCase().contains("INDEL")) - return log10AlleleFrequencyPriorsIndels; - else - throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model); - - } - - protected double getTheta( final GenotypeLikelihoodsCalculationModel.Model model ) { - if( model.name().contains("SNP") ) - return HUMAN_SNP_HETEROZYGOSITY; - if( model.name().contains("INDEL") ) - return HUMAN_INDEL_HETEROZYGOSITY; - else throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model); - } - - private static Map getGenotypeLikelihoodsCalculationObject(Logger logger, UnifiedArgumentCollection UAC) { - - final Map glcm = new HashMap(); - final List> glmClasses = new PluginManager(GenotypeLikelihoodsCalculationModel.class).getPlugins(); - - for (int i = 0; i < glmClasses.size(); i++) { - final Class glmClass = glmClasses.get(i); - final String key = glmClass.getSimpleName().replaceAll("GenotypeLikelihoodsCalculationModel","").toUpperCase(); - try { - final Object args[] = new Object[]{UAC,logger}; - final Constructor c = glmClass.getDeclaredConstructor(UnifiedArgumentCollection.class, Logger.class); - glcm.put(key, (GenotypeLikelihoodsCalculationModel)c.newInstance(args)); - } - catch (Exception e) { - throw new UserException("The likelihoods model provided for the -glm argument (" + UAC.GLmodel + ") is not a valid option: " + e.getMessage()); - } - } - - return glcm; - } - - public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding allelesBinding) { - if ( tracker == null || ref == null || logger == null ) - return null; - VariantContext vc = null; - - // search for usable record - for ( final VariantContext vc_input : tracker.getValues(allelesBinding, loc) ) { - if ( vc_input != null && ! vc_input.isFiltered() && (! requireSNP || vc_input.isSNP() )) { - if ( vc == null ) { - vc = vc_input; - } else { - logger.warn("Multiple valid VCF records detected in the alleles input file at site " + ref.getLocus() + ", only considering the first record"); - } - } - } - - return vc; - } -} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotypingEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotypingEngine.java new file mode 100644 index 000000000..abf3721f3 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotypingEngine.java @@ -0,0 +1,503 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcResult; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.gga.GenotypingGivenAllelesUtils; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.GenotypesContext; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.io.PrintStream; +import java.lang.reflect.Constructor; +import java.util.*; + +/** + * {@link UnifiedGenotyper}'s genotyping strategy implementation. + */ +public class UnifiedGenotypingEngine extends GenotypingEngine { + + public static final String LOW_QUAL_FILTER_NAME = "LowQual"; + private static final String GPSTRING = "GENERALPLOIDY"; + + public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA"; + public static final String PL_FOR_ALL_SNP_ALLELES_KEY = "APL"; + + private static final int SNP_MODEL = 0; + private static final int INDEL_MODEL = 1; + + // the model used for calculating genotypes + private ThreadLocal> glcm; + private final List modelsToUse = new ArrayList<>(2); + + + // the various loggers and writers + private final PrintStream verboseWriter; + + private final GenomeLocParser genomeLocParser; + private final boolean BAQEnabledOnCMDLine; + + // --------------------------------------------------------------------------------------------------------- + // + // Public interface functions + // + // --------------------------------------------------------------------------------------------------------- + + /** + * Constructs a new Unified-Genotyper engine. + *

The new engine won't emmit annotations, will use the full sample set and will not produce additional verbose + * output

+ * + * @param toolkit reference to the enclosing genome analysis engine. + * @param configuration configuration object. + * + * @throws IllegalArgumentException if either {@code toolkit} or {@code UAC} is {@code null}. + */ + public UnifiedGenotypingEngine(final GenomeAnalysisEngine toolkit, final UnifiedArgumentCollection configuration) { + this(toolkit, configuration, null, null, null); + } + + /** + * Constructs a new Unified-Genotyper engine. + * + * @param toolkit reference to the enclosing genome analysis engine. + * @param configuration configuration object. + * @param annotationEngine variant annotation engine. If {@code null}, no annotations will be processed. + * @param sampleNames subset of sample names to work on. If {@code null}, all it will use the {@code toolkit} full sample set. + * @param verboseWriter where to output additional verbose debugging information. + * + * @throws IllegalArgumentException if either {@code toolkit} or {@code UAC} is {@code null}. + */ + public UnifiedGenotypingEngine(final GenomeAnalysisEngine toolkit, final UnifiedArgumentCollection configuration, + final VariantAnnotatorEngine annotationEngine, + final Set sampleNames, final PrintStream verboseWriter) { + super(toolkit,configuration,annotationEngine,sampleNames); + this.BAQEnabledOnCMDLine = toolkit.getArguments().BAQMode != BAQ.CalculationMode.OFF; + genomeLocParser = toolkit.getGenomeLocParser(); + // note that, because we cap the base quality by the mapping quality, minMQ cannot be less than minBQ + + this.verboseWriter = verboseWriter; + + determineGLModelsToUse(); + + // do argument checking + if (configuration.annotateAllSitesWithPLs) { + if (!modelsToUse.contains(GenotypeLikelihoodsCalculationModel.Name.SNP)) + throw new IllegalArgumentException("Invalid genotype likelihood model specification: " + + "only diploid SNP model can be used in conjunction with option allSitePLs"); + } + + glcm = new ThreadLocal>() { + @Override + protected Map initialValue() { + return getGenotypeLikelihoodsCalculationObject(logger,UnifiedGenotypingEngine.this.configuration); + } + }; + } + + /** + * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper. + * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @return the VariantCallContext object + */ + public List calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext) { + final List results = new ArrayList<>(2); + + final List models = getGLModelsToUse(tracker, rawContext); + + final Map perReadAlleleLikelihoodMap = new HashMap<>(); + + if ( models.isEmpty() ) { + results.add(configuration.outputMode == OutputMode.EMIT_ALL_SITES && configuration.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES ? emptyCallContext(tracker, refContext, rawContext) : null); + } + else { + for ( final GenotypeLikelihoodsCalculationModel.Name model : models ) { + perReadAlleleLikelihoodMap.clear(); + final Map stratifiedContexts = getFilteredAndStratifiedContexts(refContext, rawContext, model); + if ( stratifiedContexts == null ) { + results.add(configuration.outputMode == OutputMode.EMIT_ALL_SITES && configuration.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES ? emptyCallContext(tracker, refContext, rawContext) : null); + } + else { + final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); + if ( vc != null ) + results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap)); +// todo - uncomment if we want to also emit a null ref call (with no QUAL) if there's no evidence for REF and if EMIT_ALL_SITES is set +// else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) +// results.add(generateEmptyContext(tracker, refContext, null, rawContext)); + + } + } + } + + return results; + } + + /** + * Compute GLs at a given locus. Entry point for engine calls from UGCalcLikelihoods. + * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @param perReadAlleleLikelihoodMap Map to store per-sample, per-read, per-allele likelihoods (only used for indels) + * @return the VariantContext object + */ + public VariantContext calculateLikelihoods(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final Map perReadAlleleLikelihoodMap) { + final List models = getGLModelsToUse(tracker, rawContext); + if ( models.isEmpty() ) { + return null; + } + + for ( final GenotypeLikelihoodsCalculationModel.Name model : models ) { + final Map stratifiedContexts = getFilteredAndStratifiedContexts(refContext, rawContext, model); + // return the first valid one we encounter + if ( stratifiedContexts != null ) + return calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); + + } + + return null; + } + + /** + * Compute genotypes at a given locus. Entry point for engine calls from UGCallVariants. + * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @param vc the GL-annotated variant context + * @return the VariantCallContext object + */ + public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final VariantContext vc) { + final List models = getGLModelsToUse(tracker, rawContext); + if ( models.isEmpty() ) { + return null; + } + + // return the first one + final GenotypeLikelihoodsCalculationModel.Name model = models.get(0); + final Map stratifiedContexts = getFilteredAndStratifiedContexts(refContext, rawContext, model); + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, null); + } + + /** + * Compute genotypes at a given locus. + * + * @param vc the GL-annotated variant context + * @return the VariantCallContext object + */ + public VariantCallContext calculateGenotypes(VariantContext vc) { + return calculateGenotypes(null, null, null, null, vc, GenotypeLikelihoodsCalculationModel.Name.valueOf("SNP"), null); + } + + + // --------------------------------------------------------------------------------------------------------- + // + // Private implementation helpers + // + // --------------------------------------------------------------------------------------------------------- + + // private method called by both UnifiedGenotyper and UGCalcLikelihoods entry points into the engine + private VariantContext calculateLikelihoods(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final Map stratifiedContexts, + final AlignmentContextUtils.ReadOrientation type, + final List alternateAllelesToUse, + final boolean useBAQedPileup, + final GenotypeLikelihoodsCalculationModel.Name model, + final Map perReadAlleleLikelihoodMap) { + + return glcm.get().get(model.name()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser, perReadAlleleLikelihoodMap); + } + + + public VariantCallContext calculateGenotypes(final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Name model) { + return calculateGenotypes(null, null, null, null, vc, model, null); + } + + public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final Map stratifiedContexts, + final VariantContext vc, + final GenotypeLikelihoodsCalculationModel.Name model, + final Map perReadAlleleLikelihoodMap) { + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false, perReadAlleleLikelihoodMap); + } + + @Override + protected boolean forceKeepAllele(final Allele allele) { + return configuration.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES || configuration.annotateAllSitesWithPLs; + } + + @Override + public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, final ReferenceContext refContext, + final AlignmentContext rawContext, Map stratifiedContexts, + final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Name model, + final boolean inheritAttributesFromInputVC, + final Map perReadAlleleLikelihoodMap) { + boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; + final VariantCallContext result = super.calculateGenotypes(tracker,refContext,rawContext,stratifiedContexts,vc,model,inheritAttributesFromInputVC,perReadAlleleLikelihoodMap); + if ( verboseWriter != null && !limitedContext ) + printVerboseData(refContext.getLocus().toString(), vc, model); + return result; + } + + @Override + protected String callSourceString() { + return "UG_call"; + } + + @Override + protected boolean forceSiteEmission() { + return configuration.outputMode == OutputMode.EMIT_ALL_SITES; + } + + + @Override + protected Map composeCallAttributes(final boolean inheritAttributesFromInputVC, final VariantContext vc, + final AlignmentContext rawContext, final Map stratifiedContexts, final RefMetaDataTracker tracker, final ReferenceContext refContext, final List alleleCountsofMLE, final boolean bestGuessIsRef, + final AFCalcResult AFresult, final List allAllelesToUse, final GenotypesContext genotypes, + final GenotypeLikelihoodsCalculationModel.Name model, final Map perReadAlleleLikelihoodMap) { + final Map result = super.composeCallAttributes(inheritAttributesFromInputVC, vc,rawContext,stratifiedContexts,tracker,refContext,alleleCountsofMLE,bestGuessIsRef, + AFresult,allAllelesToUse,genotypes,model,perReadAlleleLikelihoodMap); + + final boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; + + if ( configuration.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED ) + result.put(NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size()); + + if ( configuration.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef ) { + + // the overall lod + //double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; + final double overallLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); + //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); + + // the forward lod + final VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); + final AFCalcResult forwardAFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model)); + //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); + final double forwardLog10PofNull = forwardAFresult.getLog10LikelihoodOfAFEq0(); + final double forwardLog10PofF = forwardAFresult.getLog10LikelihoodOfAFGT0(); + //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); + + // the reverse lod + final VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); + final AFCalcResult reverseAFresult = afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model)); + //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); + final double reverseLog10PofNull = reverseAFresult.getLog10LikelihoodOfAFEq0(); + final double reverseLog10PofF = reverseAFresult.getLog10LikelihoodOfAFGT0(); + //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); + + final double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; + final double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF; + //if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); + + // strand score is max bias between forward and reverse strands + double strandScore = Math.max(forwardLod, reverseLod); + // rescale by a factor of 10 + strandScore *= 10.0; + //logger.debug(String.format("SLOD=%f", strandScore)); + + if ( !Double.isNaN(strandScore) ) + result.put("SB", strandScore); + } + return result; + } + + private Map getFilteredAndStratifiedContexts(ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Name model) { + + if ( !BaseUtils.isRegularBase(refContext.getBase()) ) + return null; + + + switch (model) { + case INDEL: + case GENERALPLOIDYINDEL: + + final ReadBackedPileup pileup = rawContext.getBasePileup().getMappingFilteredPileup(configuration.MIN_BASE_QUALTY_SCORE); + // don't call when there is no coverage + if ( pileup.getNumberOfElements() == 0 && configuration.outputMode != OutputMode.EMIT_ALL_SITES ) + return null; + + // stratify the AlignmentContext and cut by sample + return AlignmentContextUtils.splitContextBySampleName(pileup); + case SNP: + case GENERALPLOIDYSNP: + + if ( !(configuration.outputMode == OutputMode.EMIT_ALL_SITES && configuration.genotypingMode != GenotypingMode.GENOTYPE_GIVEN_ALLELES) ) { + int numDeletions = 0; + for ( final PileupElement p : rawContext.getBasePileup() ) { + if ( p.isDeletion() ) + numDeletions++; + } + if ( ((double) numDeletions) / ((double) rawContext.getBasePileup().depthOfCoverage()) > configuration.MAX_DELETION_FRACTION ) { + return null; + } + } + // stratify the AlignmentContext and cut by sample + return AlignmentContextUtils.splitContextBySampleName(rawContext.getBasePileup()); + default : + throw new IllegalStateException("unexpected model: " + model); + } + } + + protected void printVerboseData(final String pos, final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Name model) { + Allele refAllele = null, altAllele = null; + for ( Allele allele : vc.getAlleles() ) { + if ( allele.isReference() ) + refAllele = allele; + else + altAllele = allele; + } + + for (int i = 0; i <= numberOfGenomes; i++) { + StringBuilder AFline = new StringBuilder("AFINFO\t"); + AFline.append(pos); + AFline.append('\t'); + AFline.append(refAllele); + AFline.append('\t'); + if ( altAllele != null ) + AFline.append(altAllele); + else + AFline.append("N/A"); + AFline.append('\t'); + AFline.append(i).append('/').append(numberOfGenomes).append('\t'); + AFline.append(String.format("%.2f\t", ((float) i) / numberOfGenomes)); + AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i])); + verboseWriter.println(AFline.toString()); + } + + verboseWriter.println("Qscore = " + vc.getLog10PError()); + verboseWriter.println(); + } + + private void determineGLModelsToUse() { + String modelPrefix = ""; + if ( !configuration.GLmodel.name().contains(GPSTRING) && configuration.samplePloidy != GATKVariantContextUtils.DEFAULT_PLOIDY ) + modelPrefix = GPSTRING; + + // GGA mode => must initialize both the SNP and indel models + if ( configuration.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES || + configuration.GLmodel.name().toUpperCase().contains("BOTH") ) { + modelsToUse.add(GenotypeLikelihoodsCalculationModel.Name.valueOf(modelPrefix + "SNP")); + modelsToUse.add(GenotypeLikelihoodsCalculationModel.Name.valueOf(modelPrefix + "INDEL")); + } + else { + modelsToUse.add(GenotypeLikelihoodsCalculationModel.Name.valueOf(modelPrefix + configuration.GLmodel.name().toUpperCase())); + } + } + + // decide whether we are currently processing SNPs, indels, neither, or both + private List getGLModelsToUse(final RefMetaDataTracker tracker, + final AlignmentContext rawContext) { + if ( configuration.genotypingMode != GenotypingMode.GENOTYPE_GIVEN_ALLELES ) + return modelsToUse; + + if ( modelsToUse.size() != 2 ) + throw new IllegalStateException("GGA mode assumes that we have initialized both the SNP and indel models but found " + modelsToUse); + + // if we're genotyping given alleles then we need to choose the model corresponding to the variant type requested + final VariantContext vcInput = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(tracker, rawContext.getLocation(), false, logger, configuration.alleles); + + if ( vcInput == null ) { + return Collections.emptyList(); // no work to be done + } else if ( vcInput.isSNP() ) { + return Collections.singletonList(modelsToUse.get(SNP_MODEL)); + } else if ( vcInput.isIndel() || vcInput.isMixed() ) { + return Collections.singletonList(modelsToUse.get(INDEL_MODEL)); + } else { + return Collections.emptyList(); // No support for other types yet + } + } + + private static Map getGenotypeLikelihoodsCalculationObject(Logger logger, UnifiedArgumentCollection UAC) { + + final Map glcm = new HashMap<>(); + final List> glmClasses = new PluginManager(GenotypeLikelihoodsCalculationModel.class).getPlugins(); + + for (final Class glmClass : glmClasses) { + final String key = glmClass.getSimpleName().replaceAll("GenotypeLikelihoodsCalculationModel","").toUpperCase(); + try { + final Object args[] = new Object[]{UAC,logger}; + final Constructor c = glmClass.getDeclaredConstructor(UnifiedArgumentCollection.class, Logger.class); + glcm.put(key, (GenotypeLikelihoodsCalculationModel)c.newInstance(args)); + } + catch (Exception e) { + throw new UserException("The likelihoods model provided for the -glm argument (" + UAC.GLmodel + ") is not a valid option: " + e.getMessage()); + } + } + + return glcm; + } + +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java index 6dffa8a6d..0af4c520e 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java @@ -47,7 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; +import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -134,7 +134,7 @@ public class AFCalcFactory { * @param logger an optional (can be null) logger to override the default in the model * @return an initialized AFCalc */ - public static AFCalc createAFCalc(final UnifiedArgumentCollection UAC, + public static AFCalc createAFCalc(final StandardCallerArgumentCollection UAC, final int nSamples, final Logger logger) { final int maxAltAlleles = UAC.MAX_ALTERNATE_ALLELES; diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java index 042e04767..f0c7231fc 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java @@ -47,8 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.apache.commons.lang.ArrayUtils; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypingEngine; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.variant.variantcontext.*; @@ -110,9 +109,11 @@ public class AFCalcTestBuilder { switch ( priorType ) { case flat: return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors + + //TODO break dependency with human... avoid special reference to this species. case human: final double[] humanPriors = new double[nPriorValues]; - UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList()); + GenotypingEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList()); return humanPriors; default: throw new RuntimeException("Unexpected type " + priorType); diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java index 8b37e265d..18341e185 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java @@ -64,7 +64,7 @@ import java.util.Map; /** * Read likelihood calculation engine base on applying heuristic on the assembly graph. */ -public class GraphBasedLikelihoodCalculationEngine implements LikelihoodCalculationEngine { +public class GraphBasedLikelihoodCalculationEngine implements ReadLikelihoodCalculationEngine { private static Logger logger = Logger.getLogger(GraphBasedLikelihoodCalculationEngine.class); diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 871ef78b7..45a2ffda9 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -51,7 +51,6 @@ import net.sf.samtools.SAMFileWriter; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; -import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -65,10 +64,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; +import org.broadinstitute.sting.gatk.walkers.genotyper.*; import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; import org.broadinstitute.sting.utils.GenomeLoc; @@ -84,8 +80,11 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.gga.GenotypingGivenAllelesUtils; import org.broadinstitute.sting.utils.gvcf.GVCFWriter; -import org.broadinstitute.sting.utils.haplotype.*; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.haplotype.LDMerger; +import org.broadinstitute.sting.utils.haplotype.MergeVariantsAcrossHaplotypes; import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.HelpConstants; @@ -93,7 +92,6 @@ import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; import org.broadinstitute.variant.vcf.*; @@ -159,7 +157,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Advanced @Argument(fullName="likelihoodCalculationEngine",shortName="likelihoodEngine", doc="what likelihood calculation engine to use to calculate the relative likelihood of reads vs haplotypes",required=false) - protected LikelihoodCalculationEngine.Implementation likelihoodEngineImplementation = LikelihoodCalculationEngine.Implementation.PairHMM; + protected ReadLikelihoodCalculationEngine.Implementation likelihoodEngineImplementation = ReadLikelihoodCalculationEngine.Implementation.PairHMM; @Hidden @Advanced @@ -260,7 +258,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In protected String[] annotationClassesToUse = { "Standard" }; @ArgumentCollection - private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection(); + private HaplotypeCallerArgumentCollection SCAC = new HaplotypeCallerArgumentCollection(); // ----------------------------------------------------------------------------------------------- // arguments to control internal behavior of the read threading assembler @@ -297,21 +295,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // general advanced arguments to control haplotype caller behavior // ----------------------------------------------------------------------------------------------- - /** - * The reference confidence mode makes it possible to emit a per-bp or summarized confidence estimate for a site being strictly homozygous-reference. - * See http://www.broadinstitute.org/gatk/guide/article?id=2940 for more details of how this works. - * Note that if you set -ERC GVCF, you also need to set -variant_index_type LINEAR and -variant_index_parameter 128000 (with those exact values!). - * This requirement is a temporary workaround for an issue with index compression. - */ - @Advanced - @Argument(fullName="emitRefConfidence", shortName="ERC", doc="Mode for emitting experimental reference confidence scores", required = false) - protected ReferenceConfidenceMode emitReferenceConfidence = ReferenceConfidenceMode.NONE; - - public enum ReferenceConfidenceMode { - NONE, - BP_RESOLUTION, - GVCF - } /** * The GQ partition intervals @@ -394,10 +377,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false) protected boolean USE_ALLELES_TRIGGER = false; - @Advanced - @Argument(fullName="useFilteredReadsForAnnotations", shortName="useFilteredReadsForAnnotations", doc = "If specified, use the contamination-filtered read maps for the purposes of annotating variants", required=false) - protected boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = false; - /** * The phredScaledGlobalReadMismappingRate reflects the average global mismapping rate of all reads, regardless of their * mapping quality. This term effects the probability that a read originated from the reference haplotype, regardless of @@ -455,10 +434,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="errorCorrectKmers", shortName="errorCorrectKmers", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false) protected boolean errorCorrectKmers = false; - @Advanced - @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false) - protected boolean DEBUG; - @Hidden @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false) protected boolean debugGraphTransformations = false; @@ -506,17 +481,16 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // ----------------------------------------------------------------------------------------------- // the UG engines - private UnifiedGenotyperEngine UG_engine = null; - private UnifiedGenotyperEngine UG_engine_simple_genotyper = null; + private UnifiedGenotypingEngine activeRegionEvaluationGenotyperEngine = null; // the assembly engine private LocalAssemblyEngine assemblyEngine = null; // the likelihoods engine - private LikelihoodCalculationEngine likelihoodCalculationEngine = null; + private ReadLikelihoodCalculationEngine likelihoodCalculationEngine = null; // the genotyping engine - private GenotypingEngine genotypingEngine = null; + private HaplotypeCallerGenotypingEngine genotypingEngine = null; // fasta reference reader to supplement the edges of the reference sequence protected CachingIndexedFastaSequenceFile referenceReader; @@ -533,7 +507,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // the minimum length of a read we'd consider using for genotyping private final static int MIN_READ_LENGTH = 10; - private List samplesList = new ArrayList<>(); + private List samplesList; private final static Allele FAKE_REF_ALLELE = Allele.create("N", true); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file private final static Allele FAKE_ALT_ALLELE = Allele.create("", false); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file @@ -553,13 +527,18 @@ public class HaplotypeCaller extends ActiveRegionWalker, In public void initialize() { super.initialize(); - if (dontGenotype && emitReferenceConfidence == ReferenceConfidenceMode.GVCF) + if (dontGenotype && emitReferenceConfidence()) throw new UserException("You cannot request gVCF output and do not genotype at the same time"); if ( emitReferenceConfidence() ) { + + if (SCAC.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES) + throw new UserException.BadArgumentValue("ERC/gt_mode","you cannot request reference confidence output and Genotyping Giving Alleles at the same time"); + SCAC.STANDARD_CONFIDENCE_FOR_EMITTING = -0.0; SCAC.STANDARD_CONFIDENCE_FOR_CALLING = -0.0; + // also, we don't need to output several of the annotations annotationsToExclude.add("ChromosomeCounts"); annotationsToExclude.add("FisherStrand"); @@ -568,43 +547,35 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // but we definitely want certain other ones annotationsToUse.add("StrandBiasBySample"); logger.info("Standard Emitting and Calling confidence set to 0.0 for reference-model confidence output"); + if (!SCAC.annotateAllSitesWithPLs) + logger.info("All sites annotated with PLs forced to true for reference-model confidence output"); + SCAC.annotateAllSitesWithPLs = true; } if ( SCAC.AFmodel == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY ) throw new UserException.BadArgumentValue("pnrm", "HaplotypeCaller doesn't currently support " + SCAC.AFmodel); - // get all of the unique sample names - Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); - samplesList.addAll( samples ); - // initialize the UnifiedGenotyper Engine which is used to call into the exact model - final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user - // HC GGA mode depends critically on EMIT_ALL_SITES being set for the UG engine - UAC.OutputMode = SCAC.GenotypingMode.equals(GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) - ? UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES : UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); - if (emitReferenceConfidence() && !UG_engine.getUAC().annotateAllSitesWithPLs) { - UG_engine.getUAC().annotateAllSitesWithPLs = true; - logger.info("All sites annotated with PLs force to true for reference-model confidence output"); - } + samplesList = new ArrayList<>(SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader())); + Set samplesSet = new LinkedHashSet<>(samplesList); + // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested - UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); - simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; - simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; - simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling + final UnifiedArgumentCollection simpleUAC = SCAC.cloneTo(UnifiedArgumentCollection.class); + simpleUAC.outputMode = OutputMode.EMIT_VARIANTS_ONLY; + simpleUAC.genotypingMode = GenotypingMode.DISCOVERY; + simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, SCAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, SCAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling simpleUAC.CONTAMINATION_FRACTION = 0.0; simpleUAC.CONTAMINATION_FRACTION_FILE = null; simpleUAC.exactCallsLog = null; - UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); + activeRegionEvaluationGenotyperEngine = new UnifiedGenotypingEngine(getToolkit(), simpleUAC, null, samplesSet, null); + activeRegionEvaluationGenotyperEngine.setLogger(logger); - if( UAC.CONTAMINATION_FRACTION_FILE != null ) { - UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, UAC.CONTAMINATION_FRACTION, samples, logger)); - } + if( SCAC.CONTAMINATION_FRACTION_FILE != null ) + SCAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(SCAC.CONTAMINATION_FRACTION_FILE, SCAC.CONTAMINATION_FRACTION, samplesSet, logger)); - if( SCAC.GenotypingMode.equals(GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) && consensusMode ) { + if( SCAC.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES && consensusMode ) throw new UserException("HaplotypeCaller cannot be run in both GENOTYPE_GIVEN_ALLELES mode and in consensus mode. Please choose one or the other."); - } // initialize the output VCF header final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); @@ -627,11 +598,11 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // FILTER fields are added unconditionally as it's not always 100% certain the circumstances // where the filters are used. For example, in emitting all sites the lowQual field is used - headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality")); + headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotypingEngine.LOW_QUAL_FILTER_NAME, "Low quality")); - initializeReferenceConfidenceModel(samples, headerInfo); + initializeReferenceConfidenceModel(samplesSet, headerInfo); - vcfWriter.writeHeader(new VCFHeader(headerInfo, samples)); + vcfWriter.writeHeader(new VCFHeader(headerInfo, samplesSet)); try { // fasta reference reader to supplement the edges of the reference sequence @@ -645,7 +616,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In assemblyEngine.setErrorCorrectKmers(errorCorrectKmers); assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR); - assemblyEngine.setDebug(DEBUG); + assemblyEngine.setDebug(SCAC.DEBUG); assemblyEngine.setDebugGraphTransformations(debugGraphTransformations); assemblyEngine.setAllowCyclesInKmerGraphToGeneratePaths(allowCyclesInKmerGraphToGeneratePaths); assemblyEngine.setRecoverDanglingTails(!dontRecoverDanglingTails); @@ -670,9 +641,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // create our likelihood calculation engine likelihoodCalculationEngine = createLikelihoodCalculationEngine(); - final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? new LDMerger(DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes(); + final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? new LDMerger(SCAC.DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes(); - genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, variantMerger ); + genotypingEngine = new HaplotypeCallerGenotypingEngine( getToolkit(),SCAC, annotationEngine, null, variantMerger ); if ( bamWriter != null ) { // we currently do not support multi-threaded BAM writing, so exception out @@ -681,8 +652,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In haplotypeBAMWriter = HaplotypeBAMWriter.create(bamWriterType, bamWriter, getToolkit().getSAMFileHeader()); } - trimmer.initialize(getToolkit().getGenomeLocParser(), DEBUG, - UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES,emitReferenceConfidence()); + trimmer.initialize(getToolkit().getGenomeLocParser(), SCAC.DEBUG, + SCAC.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES,emitReferenceConfidence()); } private void initializeReferenceConfidenceModel(final Set samples, final Set headerInfo) { @@ -690,7 +661,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In if ( emitReferenceConfidence() ) { if ( samples.size() != 1 ) throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently"); headerInfo.addAll(referenceConfidenceModel.getVCFHeaderLines()); - if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) { + if ( SCAC.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) { // a kluge to enforce the use of this indexing strategy if (getToolkit().getArguments().variant_index_type != OPTIMAL_GVCF_INDEX_TYPE || getToolkit().getArguments().variant_index_parameter != OPTIMAL_GVCF_INDEX_PARAMETER) { @@ -711,12 +682,12 @@ public class HaplotypeCaller extends ActiveRegionWalker, In * * @return never {@code null}. */ - private LikelihoodCalculationEngine createLikelihoodCalculationEngine() { + private ReadLikelihoodCalculationEngine createLikelihoodCalculationEngine() { switch (likelihoodEngineImplementation) { case PairHMM: - return new PairHMMLikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga, pcrErrorModel ); + return new PairHMMLikelihoodCalculationEngine( (byte)gcpHMM, SCAC.DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga, pcrErrorModel ); case GraphBased: - return new GraphBasedLikelihoodCalculationEngine( (byte)gcpHMM,log10GlobalReadMismappingRate,heterogeneousKmerSizeResultion,DEBUG,debugGraphTransformations); + return new GraphBasedLikelihoodCalculationEngine( (byte)gcpHMM,log10GlobalReadMismappingRate,heterogeneousKmerSizeResultion,SCAC.DEBUG,debugGraphTransformations); case Random: return new RandomLikelihoodCalculationEngine(); default: @@ -739,35 +710,28 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // enable non primary and extended reads in the active region @Override public EnumSet desiredReadStates() { - if ( includeUnmappedReads ) { + if ( includeUnmappedReads ) throw new UserException.BadArgumentValue("includeUnmappedReads", "is not yet functional"); -// return EnumSet.of( -// ActiveRegionReadState.PRIMARY, -// ActiveRegionReadState.NONPRIMARY, -// ActiveRegionReadState.EXTENDED, -// ActiveRegionReadState.UNMAPPED -// ); - } else + else return EnumSet.of( ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, - ActiveRegionReadState.EXTENDED - ); + ActiveRegionReadState.EXTENDED); } @Override @Ensures({"result.isActiveProb >= 0.0", "result.isActiveProb <= 1.0"}) public ActivityProfileState isActive( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) { - if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - final VariantContext vcFromAllelesRod = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), false, logger, UG_engine.getUAC().alleles); + if( SCAC.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES ) { + final VariantContext vcFromAllelesRod = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(tracker, ref.getLocus(), false, logger, SCAC.alleles); if( vcFromAllelesRod != null ) { return new ActivityProfileState(ref.getLocus(), 1.0); } } if( USE_ALLELES_TRIGGER ) { - return new ActivityProfileState( ref.getLocus(), tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 ); + return new ActivityProfileState( ref.getLocus(), tracker.getValues(SCAC.alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 ); } if( context == null || context.getBasePileup().isEmpty() ) @@ -784,7 +748,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In } final List alleles = Arrays.asList(FAKE_REF_ALLELE , FAKE_ALT_ALLELE); - final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.SNP); + final VariantCallContext vcOut = activeRegionEvaluationGenotyperEngine.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Name.SNP); final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() ); return new ActivityProfileState( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileState.Type.NONE, averageHQSoftClips.mean() ); @@ -807,27 +771,27 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // Not active so nothing to do! return referenceModelForNoVariation(originalActiveRegion, true); - final List activeAllelesToGenotype = new ArrayList<>(); - if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - for ( final VariantContext vc : metaDataTracker.getValues(UG_engine.getUAC().alleles) ) { + final List givenAlleles = new ArrayList<>(); + if( SCAC.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES ) { + for ( final VariantContext vc : metaDataTracker.getValues(SCAC.alleles) ) { if ( vc.isNotFiltered() ) { - activeAllelesToGenotype.add(vc); // do something with these VCs during GGA mode + givenAlleles.add(vc); // do something with these VCs during GGA mode } } // No alleles found in this region so nothing to do! - if ( activeAllelesToGenotype.isEmpty() ) { return referenceModelForNoVariation(originalActiveRegion, true); } + if ( givenAlleles.isEmpty() ) { return referenceModelForNoVariation(originalActiveRegion, true); } } else { // No reads here so nothing to do! if( originalActiveRegion.size() == 0 ) { return referenceModelForNoVariation(originalActiveRegion, true); } } // run the local assembler, getting back a collection of information on how we should proceed - final AssemblyResultSet untrimmedAssemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype); + final AssemblyResultSet untrimmedAssemblyResult = assembleReads(originalActiveRegion, givenAlleles); final TreeSet allVariationEvents = untrimmedAssemblyResult.getVariationEvents(); // TODO - line bellow might be unecessary : it might be that assemblyResult will always have those alleles anyway // TODO - so check and remove if that is the case: - allVariationEvents.addAll(activeAllelesToGenotype); + allVariationEvents.addAll(givenAlleles); final ActiveRegionTrimmer.Result trimmingResult = trimmer.trim(originalActiveRegion,allVariationEvents); @@ -877,7 +841,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // haplotype containing C as reference (and vice versa). Now this is fine if all possible haplotypes are included // in the genotyping, but we lose information if we select down to a few haplotypes. [EB] - final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine, + final HaplotypeCallerGenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( haplotypes, stratifiedReadMap, perSampleFilteredReadList, @@ -886,7 +850,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In regionForGenotyping.getLocation(), getToolkit().getGenomeLocParser(), metaDataTracker, - ( consensusMode ? Collections.emptyList() : activeAllelesToGenotype ), + ( consensusMode ? Collections.emptyList() : givenAlleles ), emitReferenceConfidence() ); // TODO -- must disable if we are doing NCT, or set the output type of ! presorted @@ -899,7 +863,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In stratifiedReadMap); } - if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } + if( SCAC.DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } if ( emitReferenceConfidence() ) { @@ -925,7 +889,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In } } - private boolean containsCalls(final GenotypingEngine.CalledHaplotypes calledHaplotypes) { + private boolean containsCalls(final HaplotypeCallerGenotypingEngine.CalledHaplotypes calledHaplotypes) { final List calls = calledHaplotypes.getCalls(); if (calls.isEmpty()) return false; for (final VariantContext call : calls) @@ -941,10 +905,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In * for further HC steps * * @param activeRegion the region we should assemble - * @param activeAllelesToGenotype additional alleles we might need to genotype (can be empty) + * @param giveAlleles additional alleles we might need to genotype (can be empty) * @return the AssemblyResult describing how to proceed with genotyping */ - protected AssemblyResultSet assembleReads(final ActiveRegion activeRegion, final List activeAllelesToGenotype) { + protected AssemblyResultSet assembleReads(final ActiveRegion activeRegion, final List giveAlleles) { // Create the reference haplotype which is the bases from the reference that make up the active region finalizeActiveRegion(activeRegion); // handle overlapping fragments, clip adapter and low qual tails @@ -955,10 +919,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // Create ReadErrorCorrector object if requested - will be used within assembly engine. ReadErrorCorrector readErrorCorrector = null; if (errorCorrectReads) - readErrorCorrector = new ReadErrorCorrector(kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, minObservationsForKmerToBeSolid, DEBUG, fullReferenceWithPadding); + readErrorCorrector = new ReadErrorCorrector(kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, minObservationsForKmerToBeSolid, SCAC.DEBUG, fullReferenceWithPadding); try { - final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype,readErrorCorrector ); + final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, giveAlleles,readErrorCorrector ); assemblyResultSet.debugDump(logger); return assemblyResultSet; @@ -1056,7 +1020,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Override public void onTraversalDone(Integer result) { - if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it + if ( SCAC.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it referenceConfidenceModel.close(); //TODO remove the need to call close here for debugging, the likelihood output stream should be managed //TODO (open & close) at the walker, not the engine. @@ -1073,7 +1037,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In private void finalizeActiveRegion( final ActiveRegion activeRegion ) { if (activeRegion.isFinalized()) return; - if( DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } + if( SCAC.DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } // Loop through the reads hard clipping the adaptor and low quality tails final List readsToUse = new ArrayList<>(activeRegion.getReads().size()); @@ -1156,10 +1120,11 @@ public class HaplotypeCaller extends ActiveRegionWalker, In /** * Are we emitting a reference confidence in some form, or not? - * @return true if we are + * + * @return true if HC must emit reference confidence. */ - private boolean emitReferenceConfidence(){ - return emitReferenceConfidence != ReferenceConfidenceMode.NONE; + private boolean emitReferenceConfidence() { + return SCAC.emitReferenceConfidence != ReferenceConfidenceMode.NONE; } /** diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java new file mode 100644 index 000000000..1224ed6f7 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java @@ -0,0 +1,82 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.commandline.Advanced; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; + +/** + * Set of arguments for the {@link HaplotypeCaller} + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class HaplotypeCallerArgumentCollection extends StandardCallerArgumentCollection { + + @Advanced + @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false) + protected boolean DEBUG; + + @Advanced + @Argument(fullName="useFilteredReadsForAnnotations", shortName="useFilteredReadsForAnnotations", doc = "If specified, use the contamination-filtered read maps for the purposes of annotating variants", required=false) + protected boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = false; + + /** + * The reference confidence mode makes it possible to emit a per-bp or summarized confidence estimate for a site being strictly homozygous-reference. + * See http://www.broadinstitute.org/gatk/guide/article?id=2940 for more details of how this works. + * Note that if you set -ERC GVCF, you also need to set -variant_index_type LINEAR and -variant_index_parameter 128000 (with those exact values!). + * This requirement is a temporary workaround for an issue with index compression. + */ + @Advanced + @Argument(fullName="emitRefConfidence", shortName="ERC", doc="Mode for emitting experimental reference confidence scores", required = false) + protected ReferenceConfidenceMode emitReferenceConfidence = ReferenceConfidenceMode.NONE; + + @Override + public HaplotypeCallerArgumentCollection clone() { + return (HaplotypeCallerArgumentCollection) super.clone(); + } + +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java similarity index 92% rename from protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java index d65251e58..8daf9c3df 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java @@ -49,10 +49,13 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypingEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypingMode; +import org.broadinstitute.sting.gatk.walkers.genotyper.OutputMode; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.Utils; @@ -67,25 +70,41 @@ import org.broadinstitute.variant.variantcontext.*; import java.util.*; -public class GenotypingEngine { - private final static Logger logger = Logger.getLogger(GenotypingEngine.class); +/** + * {@link HaplotypeCaller}'s genotyping strategy implementation. + */ +public class HaplotypeCallerGenotypingEngine extends GenotypingEngine { + + private final static Logger logger = Logger.getLogger(HaplotypeCallerGenotypingEngine.class); + + private final static List NO_CALL = Collections.singletonList(Allele.NO_CALL); - private final boolean DEBUG; - private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; - private final static List noCall = new ArrayList<>(); // used to noCall all genotypes until the exact model is applied - private final VariantAnnotatorEngine annotationEngine; private final MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger; - public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine, - final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, - final MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger) { - this.DEBUG = DEBUG; - this.annotationEngine = annotationEngine; - this.USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; - noCall.add(Allele.NO_CALL); + public HaplotypeCallerGenotypingEngine(final GenomeAnalysisEngine toolkit, final HaplotypeCallerArgumentCollection configuration, + final VariantAnnotatorEngine annotationEngine, final Set sampleNames, + final MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger) { + super(toolkit,configuration,annotationEngine,sampleNames); this.crossHaplotypeEventMerger = crossHaplotypeEventMerger; } + @Override + protected String callSourceString() { + return "HC_call"; + } + + @Override + protected boolean forceKeepAllele(final Allele allele) { + return allele == GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE || + configuration.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES || + configuration.emitReferenceConfidence != ReferenceConfidenceMode.NONE; + } + + @Override + protected boolean forceSiteEmission() { + return configuration.outputMode == OutputMode.EMIT_ALL_SITES || configuration.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES; + } + /** * Carries the result of a call to #assignGenotypeLikelihoods */ @@ -125,10 +144,9 @@ public class GenotypingEngine { * * The list of samples we're working with is obtained from the haplotypeReadMap * - * @param UG_engine UG Engine with basic input parameters * @param haplotypes Haplotypes to assign likelihoods to * @param haplotypeReadMap Map from reads->(haplotypes,likelihoods) - * @param perSampleFilteredReadList + * @param perSampleFilteredReadList Map from sample to reads that were filtered after assembly and before calculating per-read likelihoods. * @param ref Reference bytes at active region * @param refLoc Corresponding active region genome location * @param activeRegionWindow Active window @@ -142,8 +160,7 @@ public class GenotypingEngine { @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) @Ensures("result != null") // TODO - can this be refactored? this is hard to follow! - public CalledHaplotypes assignGenotypeLikelihoods( final UnifiedGenotyperEngine UG_engine, - final List haplotypes, + public CalledHaplotypes assignGenotypeLikelihoods( final List haplotypes, final Map haplotypeReadMap, final Map> perSampleFilteredReadList, final byte[] ref, @@ -154,10 +171,9 @@ public class GenotypingEngine { final List activeAllelesToGenotype, final boolean emitReferenceConfidence) { // sanity check input arguments - if (UG_engine == null) throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine); if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); if (haplotypeReadMap == null || haplotypeReadMap.isEmpty()) throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap); - if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref); + if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got " + Arrays.toString(ref)); if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype); @@ -192,8 +208,8 @@ public class GenotypingEngine { if( mergedVC == null ) { continue; } - final GenotypeLikelihoodsCalculationModel.Model calculationModel = mergedVC.isSNP() - ? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL; + final GenotypeLikelihoodsCalculationModel.Name calculationModel = mergedVC.isSNP() + ? GenotypeLikelihoodsCalculationModel.Name.SNP : GenotypeLikelihoodsCalculationModel.Name.INDEL; if (emitReferenceConfidence) { final List alleleList = new ArrayList<>(); @@ -211,18 +227,18 @@ public class GenotypingEngine { final Map> alleleMapper = createAlleleMapper(mergeMap, eventMapper); - if( DEBUG ) { + if( configuration.DEBUG ) { logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); } - final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().getSampleContamination() ); + final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap(haplotypeReadMap, alleleMapper, configuration.getSampleContamination()); if (emitReferenceConfidence) addMiscellaneousAllele(alleleReadMap); final GenotypesContext genotypes = calculateGLsForThisEvent( alleleReadMap, mergedVC ); - VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), calculationModel); + final VariantContext call = calculateGenotypes(null,null,null,null,new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), calculationModel, false,null); if( call != null ) { - final Map alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap : + final Map alleleReadMap_annotations = ( configuration.USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap : convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, emptyDownSamplingMap ) ); if (emitReferenceConfidence) addMiscellaneousAllele(alleleReadMap_annotations); final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call ); @@ -281,7 +297,7 @@ public class GenotypingEngine { * @param ref the reference bases (over the same interval as the haplotypes) * @param refLoc the span of the reference bases * @param activeAllelesToGenotype alleles we want to ensure are scheduled for genotyping (GGA mode) - * @return + * @return never {@code null} but perhaps an empty list if there is no variants to report. */ private TreeSet decomposeHaplotypesIntoVariantContexts(final List haplotypes, final Map haplotypeReadMap, @@ -291,7 +307,7 @@ public class GenotypingEngine { final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file - final TreeSet startPosKeySet = EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, DEBUG); + final TreeSet startPosKeySet = EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, configuration.DEBUG); if ( in_GGA_mode ) startPosKeySet.clear(); @@ -389,7 +405,7 @@ public class GenotypingEngine { genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC } } - genotypes.add(new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make()); + genotypes.add(new GenotypeBuilder(sample).alleles(NO_CALL).PL(genotypeLikelihoods).make()); } return genotypes; } @@ -517,7 +533,7 @@ public class GenotypingEngine { @Ensures({"result.size() == haplotypeAllelesForSample.size()"}) protected static List findEventAllelesInSample( final List eventAlleles, final List haplotypeAlleles, final List haplotypeAllelesForSample, final List> alleleMapper, final List haplotypes ) { - if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; } + if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return NO_CALL; } final List eventAllelesForSample = new ArrayList<>(); for( final Allele a : haplotypeAllelesForSample ) { final Haplotype haplotype = haplotypes.get(haplotypeAlleles.indexOf(a)); diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java index cfd07da67..ce074a0a8 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java @@ -360,8 +360,8 @@ public class HaplotypeResolver extends RodWalker { } // order results by start position - final TreeMap source1Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype, false, 0, swConsensus1.getCigar()), refContext.getBases(), refContext.getWindow(), source1)); - final TreeMap source2Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype, false, 0, swConsensus2.getCigar()), refContext.getBases(), refContext.getWindow(), source2)); + final TreeMap source1Map = new TreeMap(HaplotypeCallerGenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype, false, 0, swConsensus1.getCigar()), refContext.getBases(), refContext.getWindow(), source1)); + final TreeMap source2Map = new TreeMap(HaplotypeCallerGenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype, false, 0, swConsensus2.getCigar()), refContext.getBases(), refContext.getWindow(), source2)); if ( source1Map.size() == 0 || source2Map.size() == 0 ) { // TODO -- handle errors appropriately logger.debug("No source alleles; aborting at " + refContext.getLocus()); diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java index b53445933..6291f9536 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java @@ -55,10 +55,10 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.gga.GenotypingGivenAllelesUtils; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.sam.CigarUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.File; @@ -112,11 +112,7 @@ public abstract class LocalAssemblyEngine { * @param refHaplotype the reference haplotype * @return a non-null list of reads */ - protected abstract List assemble(List reads, Haplotype refHaplotype, List activeAlleleHaplotypes); - - protected List assemble(List reads, Haplotype refHaplotype) { - return assemble(reads, refHaplotype, Collections.emptyList()); - } + protected abstract List assemble(List reads, Haplotype refHaplotype, List givenHaplotypes); /** * Main entry point into the assembly engine. Build a set of deBruijn graphs out of the provided reference sequence and list of reads @@ -124,7 +120,7 @@ public abstract class LocalAssemblyEngine { * @param refHaplotype reference haplotype object * @param fullReferenceWithPadding byte array holding the reference sequence with padding * @param refLoc GenomeLoc object corresponding to the reference sequence with padding - * @param activeAllelesToGenotype the alleles to inject into the haplotypes during GGA mode + * @param givenAlleles the alleles to inject into the haplotypes during GGA mode * @param readErrorCorrector a ReadErrorCorrector object, if read are to be corrected before assembly. Can be null if no error corrector is to be used. * @return the resulting assembly-result-set */ @@ -132,7 +128,7 @@ public abstract class LocalAssemblyEngine { final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, - final List activeAllelesToGenotype, + final List givenAlleles, final ReadErrorCorrector readErrorCorrector) { if( activeRegion == null ) { throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion."); } if( activeRegion.getExtendedLoc() == null ) { throw new IllegalArgumentException("Active region must have an extended location."); } @@ -141,7 +137,7 @@ public abstract class LocalAssemblyEngine { if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); } // create the list of artificial haplotypes that should be added to the graph for GGA mode - final List activeAlleleHaplotypes = createActiveAlleleHaplotypes(refHaplotype, activeAllelesToGenotype, activeRegion.getExtendedLoc()); + final List givenHaplotypes = GenotypingGivenAllelesUtils.composeGivenHaplotypes(refHaplotype, givenAlleles, activeRegion.getExtendedLoc()); // error-correct reads before clipping low-quality tails: some low quality bases might be good and we want to recover them final List correctedReads; @@ -165,7 +161,7 @@ public abstract class LocalAssemblyEngine { resultSet.add(refHaplotype); final Map assemblyResultByGraph = new HashMap<>(); // create the graphs by calling our subclass assemble method - for ( final AssemblyResult result : assemble(correctedReads, refHaplotype, activeAlleleHaplotypes) ) { + for ( final AssemblyResult result : assemble(correctedReads, refHaplotype, givenHaplotypes) ) { if ( result.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION ) { // do some QC on the graph sanityCheckGraph(result.getGraph(), refHaplotype); @@ -184,30 +180,6 @@ public abstract class LocalAssemblyEngine { return resultSet; } - /** - * Create the list of artificial GGA-mode haplotypes by injecting each of the provided alternate alleles into the reference haplotype - * @param refHaplotype the reference haplotype - * @param activeAllelesToGenotype the list of alternate alleles in VariantContexts - * @param activeRegionWindow the window containing the reference haplotype - * @return a non-null list of haplotypes - */ - private List createActiveAlleleHaplotypes(final Haplotype refHaplotype, final List activeAllelesToGenotype, final GenomeLoc activeRegionWindow) { - final Set returnHaplotypes = new LinkedHashSet<>(); - final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef(); - - for( final VariantContext compVC : activeAllelesToGenotype ) { - for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()); - if( insertedRefHaplotype != null ) { // can be null if the requested allele can't be inserted into the haplotype - returnHaplotypes.add(insertedRefHaplotype); - } - } - } - - return new ArrayList<>(returnHaplotypes); - } - - @Ensures({"result.contains(refHaplotype)"}) protected List findBestPaths(final List graphs, final Haplotype refHaplotype, final GenomeLoc refLoc, final GenomeLoc activeRegionWindow, final Map assemblyResultByGraph, final AssemblyResultSet assemblyResultSet) { diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java index 7165e61a5..5891cbdd7 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java @@ -67,7 +67,7 @@ import java.io.FileOutputStream; import java.io.PrintStream; import java.util.*; -public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculationEngine { +public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalculationEngine { private final static Logger logger = Logger.getLogger(PairHMMLikelihoodCalculationEngine.class); public static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java index d5d424ca9..f2f8a2eb6 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java @@ -59,7 +59,7 @@ import java.util.Random; /** * Random likelihoods generator, used for testing/benchmarking purposes. */ -public class RandomLikelihoodCalculationEngine implements LikelihoodCalculationEngine { +public class RandomLikelihoodCalculationEngine implements ReadLikelihoodCalculationEngine { @Override public Map computeReadLikelihoods(final AssemblyResultSet assemblyResultSet, final Map> reads) { diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java similarity index 99% rename from protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java index 04e64186f..98d76b356 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java @@ -55,7 +55,7 @@ import java.util.Map; /** * Common interface for assembly-haplotype vs reads likelihood engines. */ -public interface LikelihoodCalculationEngine { +public interface ReadLikelihoodCalculationEngine { enum Implementation { /** diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceMode.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceMode.java new file mode 100644 index 000000000..ce40fbc05 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceMode.java @@ -0,0 +1,69 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +/** +* Reference confidence emission modes. +* +* @author Valentin Ruano-Rubio <valentin@broadinstitute.org> +*/ +public enum ReferenceConfidenceMode { + + /** + * Regular calling without emitting reference confidence calls. + */ + NONE, + + /** + * Reference model emitted site by site. + */ + BP_RESOLUTION, + + /** + * Reference model emitted with condensed non-variant blocks, i.e. the GVCF format. + */ + GVCF; +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java index 7c5971e03..fc5d22fd7 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java @@ -104,12 +104,12 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { } @Override - public List assemble(final List reads, final Haplotype refHaplotype, final List activeAlleleHaplotypes) { + public List assemble(final List reads, final Haplotype refHaplotype, final List givenHaplotypes) { final List results = new LinkedList<>(); // first, try using the requested kmer sizes for ( final int kmerSize : kmerSizes ) { - addResult(results, createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, dontIncreaseKmerSizesForCycles)); + addResult(results, createGraph(reads, refHaplotype, kmerSize, givenHaplotypes, dontIncreaseKmerSizesForCycles)); } // if none of those worked, iterate over larger sizes if allowed to do so @@ -118,7 +118,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { int numIterations = 1; while ( results.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) { // on the last attempt we will allow low complexity graphs - addResult(results, createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, numIterations == MAX_KMER_ITERATIONS_TO_ATTEMPT)); + addResult(results, createGraph(reads, refHaplotype, kmerSize, givenHaplotypes, numIterations == MAX_KMER_ITERATIONS_TO_ATTEMPT)); kmerSize += KMER_SIZE_ITERATION_INCREASE; numIterations++; } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java index 6af39c0b0..6c49fe076 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java @@ -52,10 +52,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; +import org.broadinstitute.sting.gatk.walkers.genotyper.*; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; @@ -275,8 +272,8 @@ public class GenotypeAndValidate extends RodWalker samples; private enum GVstatus { @@ -336,13 +333,13 @@ public class GenotypeAndValidate extends RodWalker= 0) uac.MIN_BASE_QUALTY_SCORE = mbq; if (deletions >= 0) @@ -352,13 +349,13 @@ public class GenotypeAndValidate extends RodWalker= 0) uac.STANDARD_CONFIDENCE_FOR_EMITTING = emitConf; if (callConf >= 0) uac.STANDARD_CONFIDENCE_FOR_CALLING = callConf; - uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; - snpEngine = new UnifiedGenotyperEngine(getToolkit(), uac); + uac.GLmodel = GenotypeLikelihoodsCalculationModel.Name.SNP; + snpEngine = new UnifiedGenotypingEngine(getToolkit(), uac); // Adding the INDEL calling arguments for UG - UnifiedArgumentCollection uac_indel = new UnifiedArgumentCollection(uac); - uac_indel.GLmodel = GenotypeLikelihoodsCalculationModel.Model.INDEL; - indelEngine = new UnifiedGenotyperEngine(getToolkit(), uac_indel); + UnifiedArgumentCollection uac_indel = uac.clone(); + uac_indel.GLmodel = GenotypeLikelihoodsCalculationModel.Name.INDEL; + indelEngine = new UnifiedGenotypingEngine(getToolkit(), uac_indel); // make sure we have callConf set to the threshold set by the UAC so we can use it later. callConf = uac.STANDARD_CONFIDENCE_FOR_CALLING; diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CalculateGenotypePosteriors.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CalculateGenotypePosteriors.java index b8a0585a2..7a7addab3 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CalculateGenotypePosteriors.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CalculateGenotypePosteriors.java @@ -46,28 +46,24 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.sting.utils.variant.HomoSapiens; import org.broadinstitute.variant.variantcontext.VariantContext; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.sting.commandline.*; import org.broadinstitute.variant.vcf.*; + import java.util.*; /** @@ -174,7 +170,7 @@ public class CalculateGenotypePosteriors extends RodWalker { * */ @Argument(fullName="globalPrior",shortName="G",doc="The global Dirichlet prior parameters for the allele frequency",required=false) - public double globalPrior = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY; + public double globalPrior = HomoSapiens.SNP_HETEROZYGOSITY; /** * When a variant is not seen in a panel, whether to infer (and with what effective strength) that only reference diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFs.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFs.java index fcc187d04..a9cd6d101 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFs.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFs.java @@ -59,7 +59,7 @@ import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotypingEngine; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; @@ -140,7 +140,7 @@ public class GenotypeGVCFs extends RodWalker getDbsnpRodBinding() { return dbsnp.dbsnp; } // the genotyping engine - private UnifiedGenotyperEngine genotypingEngine; + private UnifiedGenotypingEngine genotypingEngine; // the annotation engine private VariantAnnotatorEngine annotationEngine; @@ -167,7 +167,7 @@ public class GenotypeGVCFs extends RodWalker variantCollection : variantCollections ) diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariants.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariants.java index 37b81ab9e..10a37d3db 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariants.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariants.java @@ -46,7 +46,8 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; -import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -54,20 +55,23 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.*; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.broadinstitute.variant.variantcontext.VariantContextUtils; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.variant.vcf.*; +import org.broadinstitute.variant.vcf.VCFHeader; +import org.broadinstitute.variant.vcf.VCFHeaderLine; -import java.util.*; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.Set; /** * Regenotypes the variants from a VCF. VCF records must contain PLs or GLs. @@ -107,17 +111,17 @@ public class RegenotypeVariants extends RodWalker implements T @Output(doc="File to which variants should be written") protected VariantContextWriter vcfWriter = null; - private UnifiedGenotyperEngine UG_engine = null; + private UnifiedGenotypingEngine UG_engine = null; public void initialize() { final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); - UAC.GLmodel = GenotypeLikelihoodsCalculationModel.Model.BOTH; - UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES; - UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; + UAC.GLmodel = GenotypeLikelihoodsCalculationModel.Name.BOTH; + UAC.outputMode = OutputMode.EMIT_ALL_SITES; + UAC.genotypingMode = GenotypingMode.GENOTYPE_GIVEN_ALLELES; String trackName = variantCollection.variants.getName(); Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName)); - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); + UG_engine = new UnifiedGenotypingEngine(getToolkit(), UAC, null, samples, null); final Set hInfo = new HashSet(); hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(trackName))); diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/gga/GenotypingGivenAllelesUtils.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/gga/GenotypingGivenAllelesUtils.java new file mode 100644 index 000000000..72df5d35b --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/gga/GenotypingGivenAllelesUtils.java @@ -0,0 +1,135 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.utils.gga; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; + +/** + * Compendium of utils to work in GENOTYPE_GIVEN_ALLELES mode. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class GenotypingGivenAllelesUtils { + + /** + * Composes the given allele variant-context providing information about the rods and reference location. + * @param tracker the meta data tracker. + * @param loc the query location. + * @param snpsOnly whether we only should consider SNP variation. + * @param logger where to output warnings. + * @param allelesBinding the target variation context binding containing the given alleles. + * @return never {@code null} + */ + public static VariantContext composeGivenAllelesVariantContextFromRod(final RefMetaDataTracker tracker, + final GenomeLoc loc, + final boolean snpsOnly, + final Logger logger, + final RodBinding allelesBinding) { + if ( tracker == null ) throw new IllegalArgumentException("the tracker cannot be null"); + if ( loc == null ) throw new IllegalArgumentException("the location cannot be null"); + if ( allelesBinding == null ) throw new IllegalArgumentException("the alleles binding cannot be null"); + + VariantContext vc = null; + + // search for usable record + for ( final VariantContext rodVc : tracker.getValues(allelesBinding, loc) ) { + if ( rodVc != null && ! rodVc.isFiltered() && (! snpsOnly || rodVc.isSNP() )) { + if ( vc == null ) + vc = rodVc; + else + if (logger != null) + logger.warn("Multiple valid VCF records detected in the alleles input file at site " + + loc + ", only considering the first record"); + } + } + + return vc; + } + + /** + * Create the list of artificial GGA-mode haplotypes by injecting each of the provided alternate alleles into the reference haplotype + * + * @param refHaplotype the reference haplotype + * @param givenHaplotypes the list of alternate alleles in VariantContexts + * @param activeRegionWindow the window containing the reference haplotype + * + * @return a non-null list of haplotypes + */ + public static List composeGivenHaplotypes(final Haplotype refHaplotype, final List givenHaplotypes, final GenomeLoc activeRegionWindow) { + if (refHaplotype == null) throw new IllegalArgumentException("the reference haplotype cannot be null"); + if (givenHaplotypes == null) throw new IllegalArgumentException("given haplotypes cannot be null"); + if (activeRegionWindow == null) throw new IllegalArgumentException("active region window cannot be null"); + if (activeRegionWindow.size() != refHaplotype.length()) throw new IllegalArgumentException("inconsistent reference haplotype and active region window"); + + final Set returnHaplotypes = new LinkedHashSet<>(); + final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef(); + + for( final VariantContext compVC : givenHaplotypes ) { + if (!GATKVariantContextUtils.overlapsRegion(compVC, activeRegionWindow)) + throw new IllegalArgumentException(" some variant provided does not overlap with active region window"); + for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { + final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()); + if( insertedRefHaplotype != null ) { // can be null if the requested allele can't be inserted into the haplotype + returnHaplotypes.add(insertedRefHaplotype); + } + } + } + + return new ArrayList<>(returnHaplotypes); + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/StandardCallerArgumentCollectionUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/StandardCallerArgumentCollectionUnitTest.java new file mode 100644 index 000000000..a09a36291 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/StandardCallerArgumentCollectionUnitTest.java @@ -0,0 +1,206 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import junit.framework.Assert; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerArgumentCollection; +import org.testng.SkipException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.lang.reflect.Field; +import java.lang.reflect.Modifier; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +/** + * Checks on Caller argument collection cloning. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class StandardCallerArgumentCollectionUnitTest { + + public final static List> COLLECTION_CLASSES; + + static { + COLLECTION_CLASSES = new ArrayList<>(); + COLLECTION_CLASSES.add(StandardCallerArgumentCollection.class); + COLLECTION_CLASSES.add(UnifiedArgumentCollection.class); + COLLECTION_CLASSES.add(HaplotypeCallerArgumentCollection.class); + } + + @Test(dataProvider="collectionClasses") + public void testParameterLessInitialization(final Class clazz) { + try { + clazz.newInstance(); + } catch (final Exception ex) { + Assert.fail(); + } + } + + @Test(dataProvider="collectionClassPairs") + public void testCloneTo(final Class fromClass, final Class toClass) { + final StandardCallerArgumentCollection fromObject; + try { + fromObject = randomArgumentCollection(fromClass); + } catch (final IllegalAccessException e) { + throw new SkipException("cannot create a random configuration"); + } catch (final InstantiationException e) { + throw new SkipException("cannot create a random configuration"); + } + + final StandardCallerArgumentCollection toObject = fromObject.cloneTo(toClass); + Assert.assertNotNull(toObject); + Assert.assertEquals(toClass,toObject.getClass()); + for (final Field field : fromClass.getFields()) + if (!field.getDeclaringClass().isAssignableFrom(toClass)) + continue; + else if (Modifier.isPrivate(field.getModifiers())) + continue; + else if (Modifier.isStatic(field.getModifiers())) + continue; + else if (Modifier.isFinal(field.getModifiers())) + continue; + else { + final Object fromValue, toValue; + try { + fromValue = field.get(fromObject); + toValue = field.get(toObject); + } catch (final Exception ex) { + continue; + } + + if (fromValue == null) + Assert.assertNull(toValue); + else if (toValue == null) + Assert.assertNull(fromValue); + else + Assert.assertEquals(fromValue,toValue); + } + } + + @DataProvider(name="collectionClasses") + public Object[][] collectionClasses() { + final Object[][] result = new Object[COLLECTION_CLASSES.size()][]; + for (int i = 0; i < COLLECTION_CLASSES.size(); i++) + result[i] = new Object[] { COLLECTION_CLASSES.get(i) }; + return result; + } + + @DataProvider(name="collectionClassPairs") + public Object[][] collectionClassPairs() { + final Object[][] collectionClasses = collectionClasses(); + final Object[][] result = new Object[collectionClasses.length * collectionClasses.length][]; + for (int i = 0; i < collectionClasses.length; i++) + for (int j = 0; j < collectionClasses.length; j++) + result[i * collectionClasses.length + j] = new Object[] { collectionClasses[i][0], collectionClasses[j][0]}; + return result; + } + + public T randomArgumentCollection(final Class clazz) throws IllegalAccessException, InstantiationException { + final T result = clazz.newInstance(); + final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + for (final Field field : clazz.getFields()) { + final int fieldModifiers = field.getModifiers(); + if (!Modifier.isPublic(fieldModifiers)) + continue; + final Class fieldType = mapWrappersToPrimitives(field.getType()); + final Object value; + if (fieldType.isPrimitive()) { + if (fieldType == Integer.TYPE) + value = rnd.nextInt(100) - 50; + else if (fieldType == Double.TYPE) + value = rnd.nextDouble() * 10; + else if (fieldType == Float.TYPE) + value = rnd.nextFloat() * 10; + else if (fieldType == Boolean.TYPE) + value = rnd.nextBoolean(); + else if (fieldType == Byte.TYPE) + value = (byte) rnd.nextInt(256); + else if (fieldType == Character.TYPE) + value = (char) rnd.nextInt(256); + else if (fieldType == Long.TYPE) + value = rnd.nextLong(); + else if (fieldType == Short.TYPE) + value = (short) rnd.nextLong(); + else + throw new IllegalStateException("unknown primitive type!!!!"); + } else if (fieldType == String.class) { + value = "" + rnd.nextLong(); + } else if (fieldType.isEnum()) { + value = fieldType.getEnumConstants()[rnd.nextInt(fieldType.getEnumConstants().length)]; + } else { // Cannot handle other types in a general way. + value = null; + } + field.set(result,value); + } + return result; + } + + private Class mapWrappersToPrimitives(Class type) { + if (type == Boolean.class) + return Boolean.TYPE; + else if (type == Integer.class) + return Integer.TYPE; + else if (type == Double.class) + return Double.TYPE; + else if (type == Float.class) + return Float.TYPE; + else if (type == Character.class) + return Character.TYPE; + else if (type == Short.class) + return Short.TYPE; + else if (type == Byte.class) + return Byte.TYPE; + else if (type == Long.class) + return Long.TYPE; + else + return type; + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java index 657cd9c0c..54b5637de 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java @@ -50,7 +50,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; // the imports for unit testing. -import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; @@ -62,7 +61,6 @@ import org.broadinstitute.variant.variantcontext.VariantContext; import org.broadinstitute.variant.variantcontext.VariantContextBuilder; import org.testng.Assert; import org.testng.annotations.BeforeClass; -import org.testng.annotations.BeforeMethod; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -70,18 +68,20 @@ import java.util.*; public class UnifiedGenotyperEngineUnitTest extends BaseTest { private final static double TOLERANCE = 1e-5; - private UnifiedGenotyperEngine ugEngine; + private UnifiedGenotypingEngine ugEngine; @BeforeClass public void setUp() throws Exception { final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setArguments(new GATKArgumentCollection()); + final UnifiedArgumentCollection args = new UnifiedArgumentCollection(); final Set fakeSamples = Collections.singleton("fake"); - ugEngine = new UnifiedGenotyperEngine(engine, fakeSamples, args); + + ugEngine = new UnifiedGenotypingEngine(engine, args, null, fakeSamples, null); } - private UnifiedGenotyperEngine getEngine() { + private UnifiedGenotypingEngine getEngine() { return ugEngine; } @@ -122,8 +122,8 @@ public class UnifiedGenotyperEngineUnitTest extends BaseTest { alleles.add(Allele.create(Utils.dupString('A', len + 1), false)); } final VariantContext vc = new VariantContextBuilder("test", "chr1", 1000, 1000, alleles).make(); - final boolean result = ugEngine.canVCbeGenotyped(vc); - Assert.assertTrue(result == (vc.getNAlleles()<= GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED)); + final boolean result = ugEngine.hasTooManyAlternativeAlleles(vc); + Assert.assertTrue(result == (vc.getNAlleles() > GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED)); } } diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index ecfda9d8a..f0d0c954a 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -155,6 +155,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("test no prior 1", spec1); } + @Test public void testUserPrior() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( @@ -175,11 +176,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("test all site PLs 1", spec1); } + // -------------------------------------------------------------------------------------------------------------- // // testing heterozygosity // // -------------------------------------------------------------------------------------------------------------- + @Test public void testHeterozyosity1() { testHeterozosity( 0.01, "6053106407e09a6aefb78395a0e22ec4" ); diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index 2bdf5078d..c52a36853 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -48,7 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotypingEngine; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; @@ -176,7 +176,7 @@ public class AFCalcUnitTest extends BaseTest { final int nPriorValues = 2*nSamples+1; final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors final double[] humanPriors = new double[nPriorValues]; - UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList()); + UnifiedGenotypingEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList()); for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) { for ( AFCalc model : calcs ) { @@ -588,7 +588,7 @@ public class AFCalcUnitTest extends BaseTest { final ArrayList inputPrior = new ArrayList(); inputPrior.add(1.0/3); inputPrior.add(1.0/3); - UnifiedGenotyperEngine.computeAlleleFrequencyPriors(2, noPriors, 0.0,inputPrior); + UnifiedGenotypingEngine.computeAlleleFrequencyPriors(2, noPriors, 0.0, inputPrior); GetGLsTest cfgFlatPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "flatPrior"); GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "noPrior"); diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index bde1e7ef5..514b1a927 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -67,12 +67,12 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "50323a284788c8220c9226037c7003b5"}); - tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "96fea2caf0a40df3feb268e8b14da670"}); - tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "19efc8020f31d1b68d80c50df0629e50"}); - tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "39bf5fe3911d0c646eefa8f79894f4df"}); - tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "d926d653500a970280ad7828d9ee2b84"}); - tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "83ddc16e4f0900429b2da30e582994aa"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "50323a284788c8220c9226037c7003b5"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "96fea2caf0a40df3feb268e8b14da670"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "19efc8020f31d1b68d80c50df0629e50"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "39bf5fe3911d0c646eefa8f79894f4df"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "d926d653500a970280ad7828d9ee2b84"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "83ddc16e4f0900429b2da30e582994aa"}); return tests.toArray(new Object[][]{}); } @@ -81,7 +81,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { * Example testng test using MyDataProvider */ @Test(dataProvider = "MyDataProvider") - public void testHCWithGVCF(String bam, HaplotypeCaller.ReferenceConfidenceMode mode, String intervals, String md5) { + public void testHCWithGVCF(String bam, ReferenceConfidenceMode mode, String intervals, String md5) { final String commandLine = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s %s -ERC %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", b37KGReference, bam, intervals, mode, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); final String name = "testHCWithGVCF bam=" + bam + " intervals= " + intervals + " gvcf= " + mode; diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java similarity index 94% rename from protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java index 57df96475..de6d5f8cd 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java @@ -74,9 +74,9 @@ import java.io.FileNotFoundException; import java.util.*; /** - * Unit tests for GenotypingEngine + * Unit tests for {@link HaplotypeCallerGenotypingEngine}. */ -public class GenotypingEngineUnitTest extends BaseTest { +public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest { private static ReferenceSequenceFile seq; private GenomeLocParser genomeLocParser; @@ -119,11 +119,11 @@ public class GenotypingEngineUnitTest extends BaseTest { eventAllelesForSample.add( Allele.create("C", false) ); eventAllelesForSample.add( Allele.create("C", false) ); - if(!compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))) { - logger.warn("calc alleles = " + GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes)); + if(!compareAlleleLists(eventAllelesForSample, HaplotypeCallerGenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))) { + logger.warn("calc alleles = " + HaplotypeCallerGenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes)); logger.warn("expected alleles = " + eventAllelesForSample); } - Assert.assertTrue(compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))); + Assert.assertTrue(compareAlleleLists(eventAllelesForSample, HaplotypeCallerGenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))); } @Test @@ -169,11 +169,11 @@ public class GenotypingEngineUnitTest extends BaseTest { eventAllelesForSample.add( Allele.create("A", true) ); eventAllelesForSample.add( Allele.create("T", false) ); - if(!compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))) { - logger.warn("calc alleles = " + GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes)); + if(!compareAlleleLists(eventAllelesForSample, HaplotypeCallerGenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))) { + logger.warn("calc alleles = " + HaplotypeCallerGenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes)); logger.warn("expected alleles = " + eventAllelesForSample); } - Assert.assertTrue(compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))); + Assert.assertTrue(compareAlleleLists(eventAllelesForSample, HaplotypeCallerGenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))); } private boolean compareAlleleLists(List l1, List l2) { @@ -204,7 +204,7 @@ public class GenotypingEngineUnitTest extends BaseTest { public Map calcAlignment() { final SWPairwiseAlignment alignment = new SWPairwiseAlignment(ref, hap); final Haplotype h = new Haplotype(hap, false, alignment.getAlignmentStart2wrt1(), alignment.getCigar()); - return GenotypingEngine.generateVCsFromAlignment( h, ref, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name"); + return HaplotypeCallerGenotypingEngine.generateVCsFromAlignment(h, ref, genomeLocParser.createGenomeLoc("4", 1, 1 + ref.length), "name"); } } diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 25df0703b..55aaaab5b 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -152,16 +152,16 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { } private boolean containsDuplicateRecord( final File vcf, final GenomeLocParser parser ) { - final List> VCs = new ArrayList<>(); + final List> VCs = new ArrayList<>(); try { for( final VariantContext vc : GATKVCFUtils.readVCF(vcf).getSecond() ) { - VCs.add(new Pair<>(parser.createGenomeLoc(vc), new GenotypingEngine.Event(vc))); + VCs.add(new Pair<>(parser.createGenomeLoc(vc), new HaplotypeCallerGenotypingEngine.Event(vc))); } } catch( IOException e ) { throw new IllegalStateException("Somehow the temporary VCF from the integration test could not be read."); } - final Set> VCsAsSet = new HashSet<>(VCs); + final Set> VCsAsSet = new HashSet<>(VCs); return VCsAsSet.size() != VCs.size(); // The set will remove duplicate Events. } diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 7c807cf36..97495a3f9 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -106,7 +106,7 @@ public class GenomeAnalysisEngine { /** * Accessor for sample metadata */ - private SampleDB sampleDB = null; + private SampleDB sampleDB = new SampleDB(); /** * Accessor for sharded reference-ordered data. diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/SampleDB.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/SampleDB.java index 40a28486e..0ca00dfdd 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/SampleDB.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/SampleDB.java @@ -119,7 +119,7 @@ public class SampleDB { } public Set getSamples() { - return new HashSet(samples.values()); + return new LinkedHashSet<>(samples.values()); } public Collection getSampleNames() { diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SampleUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SampleUtils.java index b52154c2b..e861af7b6 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SampleUtils.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SampleUtils.java @@ -101,7 +101,7 @@ public class SampleUtils { * @return the set of unique samples */ public static Set getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit, Collection rodNames) { - Set samples = new LinkedHashSet(); + Set samples = new LinkedHashSet<>(); for ( VCFHeader header : GATKVCFUtils.getVCFHeadersFromRods(toolkit, rodNames).values() ) samples.addAll(header.getGenotypeSamples()); diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index 6fc65bfb3..d72e1f67d 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -44,7 +44,8 @@ public class GATKVariantContextUtils { private static Logger logger = Logger.getLogger(GATKVariantContextUtils.class); - public static final int DEFAULT_PLOIDY = 2; + public static final int DEFAULT_PLOIDY = HomoSapiens.DEFAULT_PLOIDY; + public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. public final static List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); @@ -56,6 +57,34 @@ public class GATKVariantContextUtils { public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; public final static String MERGE_INTERSECTION = "Intersection"; + /** + * Checks whether a variant-context overlaps with a region. + * + *

+ * No event overlaps an unmapped region. + *

+ * + * @param variantContext variant-context to test the overlap with. + * @param region region to test the overlap with. + * + * @throws IllegalArgumentException if either region or event is {@code null}. + * + * @return {@code true} if there is an overlap between the event described and the active region provided. + */ + public static boolean overlapsRegion(final VariantContext variantContext, final GenomeLoc region) { + if (region == null) throw new IllegalArgumentException("the active region provided cannot be null"); + if (variantContext == null) throw new IllegalArgumentException("the variant context provided cannot be null"); + if (region.isUnmapped()) + return false; + if (variantContext.getEnd() < region.getStart()) + return false; + if (variantContext.getStart() > region.getStop()) + return false; + if (!variantContext.getChr().equals(region.getContig())) + return false; + return true; + } + public enum GenotypeMergeType { /** * Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD. diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/HomoSapiens.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/HomoSapiens.java new file mode 100644 index 000000000..816c06f35 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/HomoSapiens.java @@ -0,0 +1,51 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.variant; + +/** + * Homo sapiens genome constants. + * + *

NOTE: reference to this constants is a indication that your code is (human) species assumption dependant.

+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class HomoSapiens { + + /** + * Standard heterozygous rate for SNP variation. + */ + public static final double SNP_HETEROZYGOSITY = 1e-3; + + /** + * Standard heterozygous rate for INDEL variation. + */ + public static final double INDEL_HETEROZYGOSITY = 1.0/8000; + + /** + * Standard ploidy for autosomal chromosomes. + */ + public static final int DEFAULT_PLOIDY = 2; +} diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java index efc701a6d..f8b7124f5 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -30,12 +30,15 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.variant.variantcontext.*; import org.testng.Assert; import org.testng.annotations.BeforeSuite; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; +import java.io.IOException; import java.util.*; public class GATKVariantContextUtilsUnitTest extends BaseTest { @@ -47,8 +50,13 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { Allele Anoref; Allele GT; + private GenomeLocParser genomeLocParser; + + + + @BeforeSuite - public void setup() { + public void setup() throws IOException { // alleles Aref = Allele.create("A", true); Cref = Allele.create("C", true); @@ -61,6 +69,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { ATref = Allele.create("AT",true); Anoref = Allele.create("A",false); GT = Allele.create("GT",false); + genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(hg18Reference))); } private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError, int... pls) { @@ -1737,5 +1746,67 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { } return tests.toArray(new Object[][]{}); } + + @Test(dataProvider="overlapWithData") + public void testOverlapsWith(final VariantContext vc, final GenomeLoc genomeLoc) { + final boolean expected; + + if (genomeLoc.isUnmapped()) + expected = false; + else if (vc.getStart() > genomeLoc.getStop()) + expected = false; + else if (vc.getEnd() < genomeLoc.getStart()) + expected = false; + else if (!vc.getChr().equals(genomeLoc.getContig())) + expected = false; + else + expected = true; + + Assert.assertEquals(GATKVariantContextUtils.overlapsRegion(vc, genomeLoc), expected); + } + + @DataProvider(name="overlapWithData") + public Object[][] overlapWithData() { + final String[] chrs = new String[] { "chr1", "chr20" }; + final int[] eventSizes = new int[] { -10 , -1 , 0 , 1 , 10 }; // 0 == SNP. + final int[] eventStarts = new int[] { 10000000, 10000001, 10000005, 10000010, 10000009, 10000011, 20000000}; + + final int totalLocations = chrs.length * eventSizes.length * eventStarts.length + 1; + final int totalEvents = chrs.length * eventSizes.length * eventStarts.length; + final GenomeLoc[] locs = new GenomeLoc[totalLocations]; + final VariantContext[] events = new VariantContext[totalEvents]; + + int nextIndex = 0; + for (final String chr : chrs ) + for (final int size : eventSizes ) + for (final int starts : eventStarts ) { + locs[nextIndex] = genomeLocParser.createGenomeLoc(chr,starts,starts + Math.max(0,size)); + events[nextIndex++] = new VariantContextBuilder().source("test").loc(chr,starts,starts + Math.max(0,size)).alleles(Arrays.asList( + Allele.create(randomBases(size <= 0 ? 1 : size + 1,true),true),Allele.create(randomBases(size < 0 ? - size + 1 : 1,false),false))).make(); + } + + locs[nextIndex++] = GenomeLoc.UNMAPPED; + + final List result = new LinkedList<>(); + for (final GenomeLoc loc : locs) + for (final VariantContext event : events) + result.add(new Object[] { event , loc }); + + return result.toArray(new Object[result.size()][]); + } + + private byte[] randomBases(final int length, final boolean reference) { + final byte[] bases = new byte[length]; + bases[0] = (byte) (reference ? 'A' : 'C'); + for (int i = 1; i < bases.length; i++) + switch (GenomeAnalysisEngine.getRandomGenerator().nextInt(4)) { + case 0: bases[i] = 'A'; break; + case 1: bases[i] = 'C'; break; + case 2: bases[i] = 'G'; break; + default: + bases[i] = 'T'; + } + return bases; + } }